In [19]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression

from scipy import stats
from sklearn.decomposition import PCA

from imblearn.over_sampling import SMOTE

In [118]:
df = pd.read_csv('../Week (1)_2July25_12AM/first inten project.csv')

In [120]:
df.head()

Unnamed: 0,Booking_ID,number of adults,number of children,number of weekend nights,number of week nights,type of meal,car parking space,room type,lead time,market segment type,repeated,P-C,P-not-C,average price,special requests,date of reservation,booking status
0,INN00001,1,1,2,5,Meal Plan 1,0,Room_Type 1,224,Offline,0,0,0,88.0,0,10/2/2015,Not_Canceled
1,INN00002,1,0,1,3,Not Selected,0,Room_Type 1,5,Online,0,0,0,106.68,1,11/6/2018,Not_Canceled
2,INN00003,2,1,1,3,Meal Plan 1,0,Room_Type 1,1,Online,0,0,0,50.0,0,2/28/2018,Canceled
3,INN00004,1,0,0,2,Meal Plan 1,0,Room_Type 1,211,Online,0,0,0,100.0,1,5/20/2017,Canceled
4,INN00005,1,0,1,2,Not Selected,0,Room_Type 1,48,Online,0,0,0,77.0,0,4/11/2018,Canceled


In [122]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36285 entries, 0 to 36284
Data columns (total 17 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Booking_ID                36285 non-null  object 
 1   number of adults          36285 non-null  int64  
 2   number of children        36285 non-null  int64  
 3   number of weekend nights  36285 non-null  int64  
 4   number of week nights     36285 non-null  int64  
 5   type of meal              36285 non-null  object 
 6   car parking space         36285 non-null  int64  
 7   room type                 36285 non-null  object 
 8   lead time                 36285 non-null  int64  
 9   market segment type       36285 non-null  object 
 10  repeated                  36285 non-null  int64  
 11  P-C                       36285 non-null  int64  
 12  P-not-C                   36285 non-null  int64  
 13  average price             36285 non-null  float64
 14  specia

<h1>Missing Values</h1>

In [125]:
print(df.isnull().sum())

Booking_ID                  0
number of adults            0
number of children          0
number of weekend nights    0
number of week nights       0
type of meal                0
car parking space           0
room type                   0
lead time                   0
market segment type         0
repeated                    0
P-C                         0
P-not-C                     0
average price               0
special requests            0
date of reservation         0
booking status              0
dtype: int64


<h1>Duplicates</h1>

In [128]:
df.duplicated().sum()

0

In [130]:
df = df.drop_duplicates()

In [132]:
df.duplicated().sum()

0

<h1>Irrelevant Features</h1>

In [135]:
df.columns

Index(['Booking_ID', 'number of adults', 'number of children',
       'number of weekend nights', 'number of week nights', 'type of meal',
       'car parking space', 'room type', 'lead time', 'market segment type',
       'repeated', 'P-C', 'P-not-C', 'average price ', 'special requests',
       'date of reservation', 'booking status'],
      dtype='object')

In [137]:
df.drop(columns=['Booking_ID'], inplace=True)

<h1>Handle Outliers</h1>

In [140]:
numeric_cols = df.select_dtypes(include=np.number).columns

for col in numeric_cols:
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    lower = Q1 - 1.5 * IQR
    upper = Q3 + 1.5 * IQR
    df[col] = np.where(df[col] < lower, lower,
                np.where(df[col] > upper, upper, df[col]))


<h1>Encode Booking Status</h1>

In [143]:
df['booking status'] = df['booking status'].map({'Not_Canceled': 0, 'Canceled': 1})

In [145]:
df.head()

Unnamed: 0,number of adults,number of children,number of weekend nights,number of week nights,type of meal,car parking space,room type,lead time,market segment type,repeated,P-C,P-not-C,average price,special requests,date of reservation,booking status
0,2.0,0.0,2.0,5.0,Meal Plan 1,0.0,Room_Type 1,224.0,Offline,0.0,0.0,0.0,88.0,0.0,10/2/2015,0
1,2.0,0.0,1.0,3.0,Not Selected,0.0,Room_Type 1,5.0,Online,0.0,0.0,0.0,106.68,1.0,11/6/2018,0
2,2.0,0.0,1.0,3.0,Meal Plan 1,0.0,Room_Type 1,1.0,Online,0.0,0.0,0.0,50.0,0.0,2/28/2018,1
3,2.0,0.0,0.0,2.0,Meal Plan 1,0.0,Room_Type 1,211.0,Online,0.0,0.0,0.0,100.0,1.0,5/20/2017,1
4,2.0,0.0,1.0,2.0,Not Selected,0.0,Room_Type 1,48.0,Online,0.0,0.0,0.0,77.0,0.0,4/11/2018,1


<h1>Feature Engineering</h1>

In [151]:
df['date of reservation'] = pd.to_datetime(df['date of reservation'],errors='coerce')
df['reservation_month'] = df['date of reservation'].dt.month
df['reservation_day'] = df['date of reservation'].dt.day
df.drop(columns=['date of reservation'], inplace=True)

<h3>Multicolinearity</h3>

In [153]:
numeric_df = df.select_dtypes(include=['number'])

In [155]:
corr_matrix = numeric_df.corr()

In [157]:
high_corr = corr_matrix[(corr_matrix > 0.9) & (corr_matrix != 1.0)]

In [159]:
print(high_corr)

                          number of adults  number of children  \
number of adults                       NaN                 NaN   
number of children                     NaN                 NaN   
number of weekend nights               NaN                 NaN   
number of week nights                  NaN                 NaN   
car parking space                      NaN                 NaN   
lead time                              NaN                 NaN   
repeated                               NaN                 NaN   
P-C                                    NaN                 NaN   
P-not-C                                NaN                 NaN   
average price                          NaN                 NaN   
special requests                       NaN                 NaN   
booking status                         NaN                 NaN   
reservation_month                      NaN                 NaN   
reservation_day                        NaN                 NaN   

         

<h1>Encoding</h1>

In [162]:
categorical_cols = df.select_dtypes(include='object').columns
for col in categorical_cols:
    print(f"{col}: {df[col].unique()}")

type of meal: ['Meal Plan 1' 'Not Selected' 'Meal Plan 2' 'Meal Plan 3']
room type: ['Room_Type 1' 'Room_Type 4' 'Room_Type 2' 'Room_Type 6' 'Room_Type 5'
 'Room_Type 7' 'Room_Type 3']
market segment type: ['Offline' 'Online' 'Corporate' 'Aviation' 'Complementary']


In [164]:
df = pd.get_dummies(df, columns=['type of meal', 'room type', 'market segment type'], drop_first=True)

In [166]:
df.head()

Unnamed: 0,number of adults,number of children,number of weekend nights,number of week nights,car parking space,lead time,repeated,P-C,P-not-C,average price,...,room type_Room_Type 2,room type_Room_Type 3,room type_Room_Type 4,room type_Room_Type 5,room type_Room_Type 6,room type_Room_Type 7,market segment type_Complementary,market segment type_Corporate,market segment type_Offline,market segment type_Online
0,2.0,0.0,2.0,5.0,0.0,224.0,0.0,0.0,0.0,88.0,...,False,False,False,False,False,False,False,False,True,False
1,2.0,0.0,1.0,3.0,0.0,5.0,0.0,0.0,0.0,106.68,...,False,False,False,False,False,False,False,False,False,True
2,2.0,0.0,1.0,3.0,0.0,1.0,0.0,0.0,0.0,50.0,...,False,False,False,False,False,False,False,False,False,True
3,2.0,0.0,0.0,2.0,0.0,211.0,0.0,0.0,0.0,100.0,...,False,False,False,False,False,False,False,False,False,True
4,2.0,0.0,1.0,2.0,0.0,48.0,0.0,0.0,0.0,77.0,...,False,False,False,False,False,False,False,False,False,True


<h1>Scaling and Normalization</h1>

In [169]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(df.drop('booking status', axis=1))

In [171]:
df.head()

Unnamed: 0,number of adults,number of children,number of weekend nights,number of week nights,car parking space,lead time,repeated,P-C,P-not-C,average price,...,room type_Room_Type 2,room type_Room_Type 3,room type_Room_Type 4,room type_Room_Type 5,room type_Room_Type 6,room type_Room_Type 7,market segment type_Complementary,market segment type_Corporate,market segment type_Offline,market segment type_Online
0,2.0,0.0,2.0,5.0,0.0,224.0,0.0,0.0,0.0,88.0,...,False,False,False,False,False,False,False,False,True,False
1,2.0,0.0,1.0,3.0,0.0,5.0,0.0,0.0,0.0,106.68,...,False,False,False,False,False,False,False,False,False,True
2,2.0,0.0,1.0,3.0,0.0,1.0,0.0,0.0,0.0,50.0,...,False,False,False,False,False,False,False,False,False,True
3,2.0,0.0,0.0,2.0,0.0,211.0,0.0,0.0,0.0,100.0,...,False,False,False,False,False,False,False,False,False,True
4,2.0,0.0,1.0,2.0,0.0,48.0,0.0,0.0,0.0,77.0,...,False,False,False,False,False,False,False,False,False,True


<h1>Train Test Split</h1>

In [174]:
X = pd.DataFrame(X_scaled, columns=df.drop('booking status', axis=1).columns)
y = df['booking status']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

<h1>Handle Class Imbalance with SMOTE</h1>

In [177]:
smote = SMOTE()
X_train, y_train = smote.fit_resample(X_train, y_train)

ValueError: Input X contains NaN.
SMOTE does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [182]:
from sklearn.impute import SimpleImputer

# Create an imputer for numeric data
imputer = SimpleImputer(strategy='mean')  # or 'median'

# Fit on X_train and transform both X_train and X_test
X_train = imputer.fit_transform(X_train)
X_test = imputer.transform(X_test)

In [184]:
smote = SMOTE()
X_train, y_train = smote.fit_resample(X_train, y_train)

<h1>Modeling</h1>

<h2>LogisticRegression</h2>

In [187]:
model = LogisticRegression()
model.fit(X_train, y_train)

In [197]:
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))


Accuracy: 0.7796610169491526
[[3837 1018]
 [ 581 1821]]
              precision    recall  f1-score   support

           0       0.87      0.79      0.83      4855
           1       0.64      0.76      0.69      2402

    accuracy                           0.78      7257
   macro avg       0.75      0.77      0.76      7257
weighted avg       0.79      0.78      0.78      7257



<h2>RandomForest</h2>

In [200]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [202]:
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

In [204]:
rf_pred = rf_model.predict(X_test)

In [206]:
print("Random Forest Evaluation")
print("Accuracy:", accuracy_score(y_test, rf_pred))
print(confusion_matrix(y_test, rf_pred))
print(classification_report(y_test, rf_pred))

Random Forest Evaluation
Accuracy: 0.8949979330301777
[[4526  329]
 [ 433 1969]]
              precision    recall  f1-score   support

           0       0.91      0.93      0.92      4855
           1       0.86      0.82      0.84      2402

    accuracy                           0.89      7257
   macro avg       0.88      0.88      0.88      7257
weighted avg       0.89      0.89      0.89      7257



<h2>Decision Tree</h2>

In [209]:
from sklearn.tree import DecisionTreeClassifier

In [211]:
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train, y_train)

In [213]:
dt_pred = dt_model.predict(X_test)

In [215]:
print("Decision Tree Evaluation")
print("Accuracy:", accuracy_score(y_test, dt_pred))
print(confusion_matrix(y_test, dt_pred))
print(classification_report(y_test, dt_pred))

Decision Tree Evaluation
Accuracy: 0.858481466170594
[[4304  551]
 [ 476 1926]]
              precision    recall  f1-score   support

           0       0.90      0.89      0.89      4855
           1       0.78      0.80      0.79      2402

    accuracy                           0.86      7257
   macro avg       0.84      0.84      0.84      7257
weighted avg       0.86      0.86      0.86      7257



<h2>XGBoost</h2>

In [218]:
pip install xgboost

Collecting xgboost
  Downloading xgboost-3.0.2-py3-none-win_amd64.whl.metadata (2.1 kB)
Downloading xgboost-3.0.2-py3-none-win_amd64.whl (150.0 MB)
   ---------------------------------------- 0.0/150.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/150.0 MB ? eta -:--:--
   ---------------------------------------- 0.3/150.0 MB ? eta -:--:--
   ---------------------------------------- 0.5/150.0 MB 1.9 MB/s eta 0:01:21
   ---------------------------------------- 1.0/150.0 MB 1.5 MB/s eta 0:01:41
   ---------------------------------------- 1.3/150.0 MB 1.6 MB/s eta 0:01:36
   ---------------------------------------- 1.6/150.0 MB 1.4 MB/s eta 0:01:48
   ---------------------------------------- 1.8/150.0 MB 1.5 MB/s eta 0:01:36
    --------------------------------------- 2.6/150.0 MB 1.7 MB/s eta 0:01:27
    --------------------------------------- 2.9/150.0 MB 1.7 MB/s eta 0:01:26
    --------------------------------------- 3.1/150.0 MB 1.7 MB/s eta 0:01:26
    -----------

In [220]:
from xgboost import XGBClassifier

In [222]:
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
xgb_model.fit(X_train, y_train)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [224]:
xgb_pred = xgb_model.predict(X_test)

In [226]:
print("XGBoost Evaluation")
print("Accuracy:", accuracy_score(y_test, xgb_pred))
print(confusion_matrix(y_test, xgb_pred))
print(classification_report(y_test, xgb_pred))

XGBoost Evaluation
Accuracy: 0.8838362959900785
[[4457  398]
 [ 445 1957]]
              precision    recall  f1-score   support

           0       0.91      0.92      0.91      4855
           1       0.83      0.81      0.82      2402

    accuracy                           0.88      7257
   macro avg       0.87      0.87      0.87      7257
weighted avg       0.88      0.88      0.88      7257



<h1>Comaprisons</h1>

In [229]:
results = {
    "Logistic Regression": accuracy_score(y_test, y_pred),
    "Random Forest": accuracy_score(y_test, rf_pred),
    "Decision Tree": accuracy_score(y_test, dt_pred),
    "XGBoost": accuracy_score(y_test, xgb_pred)
}

print("Model Comparison (Accuracy):")
print(results)

Model Comparison (Accuracy):
{'Logistic Regression': 0.7796610169491526, 'Random Forest': 0.8949979330301777, 'Decision Tree': 0.858481466170594, 'XGBoost': 0.8838362959900785}
