## Model Training

#### Import Data and Required Packages

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Modelling
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from catboost import CatBoostClassifier
from xgboost import XGBClassifier

In [2]:
file_path = r'D:\MLOPS\ML-Approach-for-Predict-Cancellation-Prevent-Loss-with-MLflow\artifacts\data_ingestion\booking.csv'
df = pd.read_csv(file_path)

In [3]:
df.head()

Unnamed: 0,Booking_ID,number of adults,number of children,number of weekend nights,number of week nights,type of meal,car parking space,room type,lead time,market segment type,repeated,P-C,P-not-C,average price,special requests,date of reservation,booking status
0,INN00001,1,1,2,5,Meal Plan 1,0,Room_Type 1,224,Offline,0,0,0,88.0,0,10/2/2015,Not_Canceled
1,INN00002,1,0,1,3,Not Selected,0,Room_Type 1,5,Online,0,0,0,106.68,1,11/6/2018,Not_Canceled
2,INN00003,2,1,1,3,Meal Plan 1,0,Room_Type 1,1,Online,0,0,0,50.0,0,2/28/2018,Canceled
3,INN00004,1,0,0,2,Meal Plan 1,0,Room_Type 1,211,Online,0,0,0,100.0,1,5/20/2017,Canceled
4,INN00005,1,0,1,2,Not Selected,0,Room_Type 1,48,Online,0,0,0,77.0,0,4/11/2018,Canceled


### Data Preparation

In [4]:
df["booking status"] = df["booking status"].apply(lambda x : 1 if x=='Not_Canceled' else 0) 

In [214]:
df = df[~df["date of reservation"].str.contains("-")]

df["date of reservation"] = pd.to_datetime(df["date of reservation"])
df["month"] = df["date of reservation"].dt.month

In [215]:
df.drop(['Booking_ID','date of reservation'], axis=1, inplace=True)

In [216]:
X = df.drop('booking status',axis=1)
y = df['booking status']

In [217]:
X.head()

Unnamed: 0,number of adults,number of children,number of weekend nights,number of week nights,type of meal,car parking space,room type,lead time,market segment type,repeated,P-C,P-not-C,average price,special requests,month
0,1,1,2,5,Meal Plan 1,0,Room_Type 1,224,Offline,0,0,0,88.0,0,10
1,1,0,1,3,Not Selected,0,Room_Type 1,5,Online,0,0,0,106.68,1,11
2,2,1,1,3,Meal Plan 1,0,Room_Type 1,1,Online,0,0,0,50.0,0,2
3,1,0,0,2,Meal Plan 1,0,Room_Type 1,211,Online,0,0,0,100.0,1,5
4,1,0,1,2,Not Selected,0,Room_Type 1,48,Online,0,0,0,77.0,0,4


In [218]:
y

0        1
1        1
2        0
3        0
4        0
        ..
36280    0
36281    1
36282    1
36283    1
36284    1
Name: booking status, Length: 36248, dtype: int64

In [219]:
X.shape, y.shape

((36248, 15), (36248,))

In [220]:
num_feature = X.select_dtypes(exclude="object").columns 
cat_feature = X.select_dtypes(include="object").columns

In [221]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

numeric_transformer = StandardScaler()
oh_transformer = OneHotEncoder()

In [222]:
preprocessor = ColumnTransformer(
    [
        ("OneHotEncoder", oh_transformer, cat_feature),
         ("StandardScaler", numeric_transformer, num_feature),        
    ]
)

In [223]:
X = preprocessor.fit_transform(X)

In [224]:
X

array([[ 1.        ,  0.        ,  0.        , ..., -0.43999395,
        -0.78842765,  0.83828472],
       [ 0.        ,  0.        ,  0.        , ...,  0.09249069,
         0.48315969,  1.1643828 ],
       [ 1.        ,  0.        ,  0.        , ..., -1.52320683,
        -0.78842765, -1.77049987],
       ...,
       [ 1.        ,  0.        ,  0.        , ...,  0.0619897 ,
         0.48315969,  1.49048087],
       [ 1.        ,  0.        ,  0.        , ..., -0.1862941 ,
         0.48315969, -0.1400095 ],
       [ 1.        ,  0.        ,  0.        , ...,  0.8553006 ,
         3.02633438,  0.83828472]])

In [225]:
def evaluate_model(true, predicted):
    mae = mean_absolute_error(true, predicted)
    mse = mean_squared_error(true, predicted)
    rmse = np.sqrt(mean_squared_error(true, predicted))
    r2_square = r2_score(true, predicted)
    return mae, rmse, r2_square

In [226]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=42)
X_train.shape, X_test.shape

((28998, 28), (7250, 28))

In [227]:
models = {
    "K-Neighbors Classifier": KNeighborsClassifier(),
    "Decision Tree Classifier": DecisionTreeClassifier(),
    "Random Forest Classifier": RandomForestClassifier(), 
    "Ada_boost Classifier" :  AdaBoostClassifier(),
    "CatBoosting Classifier": CatBoostClassifier(verbose=False),
    "XGB Classifier" : XGBClassifier()
}
model_list = []
r2_list =[]

for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train, y_train) # Train model

    # Make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    
    # Evaluate Train and Test dataset
    model_train_mae , model_train_rmse, model_train_r2 = evaluate_model(y_train, y_train_pred)

    model_test_mae , model_test_rmse, model_test_r2 = evaluate_model(y_test, y_test_pred)

    
    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])
    
    print('Model performance for Training set')
    print("- Root Mean Squared Error: {:.4f}".format(model_train_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_train_mae))
    print("- R2 Score: {:.4f}".format(model_train_r2))

    print('----------------------------------')
    
    print('Model performance for Test set')
    print("- Root Mean Squared Error: {:.4f}".format(model_test_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_test_mae))
    print("- R2 Score: {:.4f}".format(model_test_r2))
    r2_list.append(model_test_r2)
    
    print('='*35)
    print('\n')

K-Neighbors Classifier
Model performance for Training set
- Root Mean Squared Error: 0.3192
- Mean Absolute Error: 0.1019
- R2 Score: 0.5377
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 0.3856
- Mean Absolute Error: 0.1487
- R2 Score: 0.3241


Decision Tree Classifier
Model performance for Training set
- Root Mean Squared Error: 0.0790
- Mean Absolute Error: 0.0062
- R2 Score: 0.9717
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 0.3762
- Mean Absolute Error: 0.1415
- R2 Score: 0.3567


Random Forest Classifier
Model performance for Training set
- Root Mean Squared Error: 0.0790
- Mean Absolute Error: 0.0062
- R2 Score: 0.9717
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 0.3236
- Mean Absolute Error: 0.1047
- R2 Score: 0.5241


Ada_boost Classifier
Model performance for Training set
- Root Mean Squared Error: 0.4301
- Mean Absolute Error: 0.1849


In [228]:
pd.DataFrame(list(zip(model_list, r2_list)), columns=['Model Name', 'R2_Score']).sort_values(by=["R2_Score"],ascending=False)

Unnamed: 0,Model Name,R2_Score
2,Random Forest Classifier,0.524111
5,XGB Classifier,0.491507
4,CatBoosting Classifier,0.480221
1,Decision Tree Classifier,0.356703
0,K-Neighbors Classifier,0.324099
3,Ada_boost Classifier,0.188668


In [234]:
rfc_model = RandomForestClassifier(n_estimators=100, random_state=42)
rfc_model = rfc_model.fit(X_train, y_train)
y_pred = rfc_model.predict(X_test)
score = r2_score(y_test, y_pred)*100
print(" Accuracy of the model is %.2f" %score)

 Accuracy of the model is 51.28


In [236]:
from sklearn.metrics import accuracy_score, classification_report
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
classification_report_str = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print('Classification Report:\n', classification_report_str)

Accuracy: 0.8928275862068965
Classification Report:
               precision    recall  f1-score   support

           0       0.86      0.80      0.83      2369
           1       0.91      0.94      0.92      4881

    accuracy                           0.89      7250
   macro avg       0.88      0.87      0.88      7250
weighted avg       0.89      0.89      0.89      7250

