# 1. Import Libraries

In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, cross_val_score
from statsmodels.stats.outliers_influence import variance_inflation_factor

# 2. Load and Inspect Data

In [None]:
# Location of the raw bookings CSV. The original absolute path remains the default,
# so the notebook behaves exactly as before; set the BOOKING_DATA_CSV environment
# variable to point at the file when running on another machine.
DATA_PATH = os.environ.get(
    "BOOKING_DATA_CSV",
    r"H:\Hazoom\Videos\Courses\ML - Cellula\Week 1\Task\first inten project.csv",
)
df = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first rows of the loaded frame.
df.head()

In [None]:
# Column dtypes and non-null counts.
df.info()

In [None]:
# Summary statistics with pandas' default column selection.
df.describe()

In [None]:
# Summary statistics restricted to object-dtype columns.
df.describe(include= 'object')

In [None]:
# Number of missing values in each column.
df.isnull().sum()

# 3. Data Preprocessing

Column Name Cleanup

In [None]:
# Normalise the headers step by step: trim surrounding whitespace, lower-case,
# then turn inner spaces into underscores.
trimmed_names = df.columns.str.strip()
lowered_names = trimmed_names.str.lower()
df.columns = lowered_names.str.replace(" ", "_")

In [None]:
# Show the current column names.
df.columns

Drop Irrelevant Column 'booking_id'

In [None]:
# Remove the 'booking_id' column from the frame in place.
df.drop(columns=["booking_id"], inplace=True)

Sort the DataFrame by the 'date_of_reservation' column after cleaning its invalid value 2018-2-29 (2018 was not a leap year, so it is replaced with 2/28/2018)

In [None]:
# Swap the impossible date '2018-2-29' for '2/28/2018' before parsing.
reservation_dates = df['date_of_reservation'].replace('2018-2-29', '2/28/2018')
# errors='coerce' turns any value that still cannot be parsed into NaT.
df['date_of_reservation'] = pd.to_datetime(reservation_dates, errors='coerce')
# Order the rows chronologically, in place.
df.sort_values(by='date_of_reservation', inplace=True)

In [None]:
# Preview the first rows after sorting by reservation date.
df.head()

Duplicate Detection and Removal

In [None]:
# Number of rows that exactly repeat an earlier row.
df.duplicated().sum()

In [None]:
# Keep the first occurrence of every repeated row and discard the rest.
df = df.drop_duplicates(keep="first")

In [None]:
# Sanity check: recount duplicated rows after the removal step.
df.duplicated().sum()

# 4. Feature Engineering

Get DataFrame Info

In [None]:
# Column dtypes and non-null counts before encoding.
df.info()

First, we should convert:
- type_of_meal
- room_type
- market_segment_type
- booking_status

into numerical data

In [None]:
# Print the distinct raw labels of each column that is about to be encoded.
categorical_columns = ['type_of_meal', 'room_type', 'market_segment_type', 'booking_status']

for column_name in categorical_columns:
    print(df[column_name].unique())

In [None]:
# type_of_meal: strip the literal 'Meal Plan' prefix and map 'Not Selected' to 0,
# then cast to int. regex=False makes the patterns literal on every pandas version.
meal_plan = df['type_of_meal'].str.strip().str.replace('Meal Plan', "", regex=False)
meal_plan = meal_plan.str.strip().str.replace('Not Selected', '0', regex=False)
df['type_of_meal'] = meal_plan.astype(int)

# room_type: strip the literal 'Room_Type' prefix and cast the remainder to int.
room_number = df['room_type'].str.strip().str.replace('Room_Type', "", regex=False)
df['room_type'] = room_number.astype(int)

# market_segment_type: one-hot encode, dropping the first level to avoid a redundant
# column. dtype=int produces 0/1 integer columns directly instead of booleans.
df = pd.get_dummies(df, columns=['market_segment_type'], drop_first=True, dtype=int)

# booking_status: Canceled -> 0, Not_Canceled -> 1.
# Series.map turns every label missing from the mapping into NaN; fail loudly here
# rather than letting a NaN target reach the model.
status_codes = {'Canceled': 0, 'Not_Canceled': 1}
encoded_status = df['booking_status'].map(status_codes)
unmapped = encoded_status.isna()
if unmapped.any():
    unexpected = df.loc[unmapped, 'booking_status'].unique()
    raise ValueError(f"Unexpected booking_status values: {unexpected!r}")
df['booking_status'] = encoded_status.astype(int)

Adding New Features

In [None]:
# Party size and total length of stay
df['all_people'] = df['number_of_adults'].add(df['number_of_children'])
df['all_nights'] = df['number_of_weekend_nights'].add(df['number_of_week_nights'])

# 1 when the 'p-c' count is above zero, otherwise 0
previous_cancellations = df['p-c']
df['has_previous_cancelation'] = previous_cancellations.gt(0).astype(int)

# 'p-c' as a share of ('p-c' + 'p-not-c'). A zero denominator is swapped for NaN
# so the division yields NaN, which is then filled with 0.
previous_total = previous_cancellations + df['p-not-c']
safe_denominator = previous_total.replace(0, np.nan)
df['booking_ratio'] = previous_cancellations.div(safe_denominator).fillna(0)

# Remaining missing ratios (displayed as the cell output)
df['booking_ratio'].isna().sum()


In [None]:
# Column dtypes and non-null counts after adding the new features.
df.info()

Convert the one-hot encoded market-segment columns to int type

In [None]:
# Cast every market-segment indicator column to int.
segment_columns = [
    'market_segment_type_Complementary',
    'market_segment_type_Corporate',
    'market_segment_type_Offline',
    'market_segment_type_Online',
]
for segment_column in segment_columns:
    df[segment_column] = df[segment_column].astype(int)

# 5. Outliers

Box Plot

In [None]:
# Columns inspected for outliers (reused by the histogram cell).
numeric_columns = [
    'lead_time',
    'average_price',
    'number_of_children',
    'number_of_weekend_nights',
    'number_of_week_nights',
    'special_requests',
]

# One box plot per column on a 2 x 3 grid.
plt.figure(figsize=(15, 10))
for position, column_name in enumerate(numeric_columns, start=1):
    plt.subplot(2, 3, position)
    sns.boxplot(data=df, y=column_name)
    plt.title(f'Boxplot of {column_name}')

plt.tight_layout()

Histogram Plot

In [None]:
# One histogram (with KDE overlay) per column on a 2 x 3 grid.
plt.figure(figsize=(15, 10))
for position, column_name in enumerate(numeric_columns, start=1):
    plt.subplot(2, 3, position)
    sns.histplot(df[column_name], kde=True, bins=30, color='skyblue')
    plt.title(f'Distribution of {column_name}')
    plt.xlabel(column_name)
    plt.ylabel('Count')

plt.tight_layout()


Log Transform Lead Time 

In [None]:
# log1p computes log(1 + x), so a lead time of 0 stays defined (maps to 0).
df['lead_time_log'] = np.log1p(df['lead_time'])

Histogram Plot for The Log Transform of Lead Time

In [None]:
# Histogram and box plot of the log-transformed lead time, side by side.
fig, (hist_ax, box_ax) = plt.subplots(1, 2, figsize=(10, 5))

sns.histplot(df['lead_time_log'], kde=True, bins=30, color='skyblue', ax=hist_ax)
hist_ax.set_title('Distribution of lead_time_log')
hist_ax.set_xlabel('lead_time_log')
hist_ax.set_ylabel('Count')

sns.boxplot(data=df, y='lead_time_log', ax=box_ax)
box_ax.set_title('Box plot of lead_time_log')

# Same layout call as the other plotting cells; keeps the two panels from overlapping.
plt.tight_layout()

Check Skewness Before and After the Transform

In [None]:
# Skewness of the raw and the log-transformed lead time
skew_before = df['lead_time'].skew()
skew_after = df['lead_time_log'].skew()

print("Before log:")
print("Lead time skew:", skew_before)

print("\nAfter log:")
print("Lead time log skew:", skew_after)


# 6. Feature Selection - VIF Check

Check All The Columns

In [None]:
# Column dtypes and non-null counts before feature selection.
df.info()

In [None]:
# List every column currently in the frame.
df.columns

Final Selected Features

In [None]:
# Columns kept for the correlation / VIF checks.
# Note that the list contains 'booking_status' alongside the other columns.
selected_columns = [
    'type_of_meal',
    'car_parking_space',
    'room_type',
    'repeated',
    'average_price',
    'special_requests',
    'booking_status',
    'has_previous_cancelation',
    'booking_ratio',
    'market_segment_type_Complementary',
    'market_segment_type_Corporate',
    'market_segment_type_Offline',
    'market_segment_type_Online',
    'lead_time_log',
    'all_people',
    'all_nights',
]
vif_data = df[selected_columns]

Checking Correlation With Heatmap

In [None]:
# Pairwise correlations between the selected columns, drawn as an annotated heatmap.
corr_matrix = vif_data.corr(numeric_only=True)
plt.figure(figsize=(18, 10))
sns.heatmap(data=corr_matrix, cmap='coolwarm', annot=True)
plt.title("Correlation Matrix", fontsize=16)

VIF and Tolerance Calculations

In [None]:
# Result table with one row per column of vif_data.
vif_df = pd.DataFrame({'Feature': vif_data.columns})

vif_df

In [None]:
# Collinearity is a property of the predictors, so the target is left out of the check.
predictors = vif_data.drop(columns='booking_status').astype(float)

# variance_inflation_factor regresses each column on the remaining columns exactly as
# given and adds no intercept itself, so a constant column is appended to the design
# matrix. It goes last, which keeps positions 0..n-1 aligned with the predictors.
design_matrix = predictors.assign(const=1.0)

# Rebuild the result table from the predictor list so rows and VIF values line up.
vif_df = pd.DataFrame({'Feature': predictors.columns})
vif_df["VIF"] = [variance_inflation_factor(design_matrix.values, position)
                 for position in range(predictors.shape[1])]
# Tolerance is the reciprocal of the VIF.
vif_df["Tolerance"] = 1 / vif_df["VIF"]

vif_df.sort_values("VIF", ascending=False)

# 7. Logistic Regression Model

In [None]:
# Remove the raw columns that earlier cells combined or transformed into new
# features ('booking_ratio', 'all_people', 'all_nights', 'lead_time_log'),
# together with the reservation date.
superseded_columns = [
    'p-c',
    'p-not-c',
    'number_of_adults',
    'number_of_children',
    'number_of_weekend_nights',
    'number_of_week_nights',
    'lead_time',
    'date_of_reservation',
]
df.drop(columns=superseded_columns, inplace=True)

In [None]:
# Columns that remain after the drops above.
df.columns

In [None]:
# Features are every remaining column except the target.
X = df.drop(columns='booking_status')
y = df['booking_status']

# Hold out 20% of the rows for testing. stratify=y keeps the proportion of each
# booking_status class the same in the train and test parts, so the reported
# metrics are not skewed by an unlucky split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Fit the classifier; max_iter is raised to 1000 iterations.
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Evaluate on the held-out rows.
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

# 5-fold cross-validation over the full data set; report the mean score.
scores = cross_val_score(model, X, y, cv=5)
print("Cross-Validation Score:", scores.mean())