### Importing the needed libraries

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn import preprocessing
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler #you can use minmax scaler too
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from imblearn.ensemble import BalancedBaggingClassifier
#Import other necessary model libraries, for this example, using Logistic Regression

### Importing the dataset

In [4]:
# Read the pre-encoded train, test and validation splits (in that order).
# header=0 makes pandas take the column names from the first CSV row.
train_data, test_data, validation_data = [
    pd.read_csv(csv_path, header=0)
    for csv_path in (
        r"../Data/Preprocessed-Datasets/train-data-encoded.csv",
        r"../Data/Preprocessed-Datasets/test-data-encoded.csv",
        r"../Data/Preprocessed-Datasets/validation-data-encoded.csv",
    )
]

In [5]:
# Remove the 'Reservation-id' column from all three splits (presumably a row
# identifier rather than a feature -- confirm against the dataset description).
# errors='ignore' keeps this cell idempotent: each frame is reassigned to its
# own name, so without it a second run of the cell would raise KeyError
# because the column is already gone.
train_data = train_data.drop(columns=['Reservation-id'], errors='ignore')
validation_data = validation_data.drop(columns=['Reservation-id'], errors='ignore')
test_data = test_data.drop(columns=['Reservation-id'], errors='ignore')

In [6]:
# Preview the first five rows of the validation split.
validation_data.head()

Unnamed: 0,Age,Adults,Children,Babies,Reservation_Status,Discount_Rate,Room_Rate,Expected_stay_days,Reservation_gap,Gender_F,...,Deposit_type_No Deposit,Deposit_type_Non-Refundable,Deposit_type_Refundable,Booking_channel_Agent,Booking_channel_Direct,Booking_channel_Online,Required_Car_Parking_No,Required_Car_Parking_Yes,Use_Promotion_No,Use_Promotion_Yes
0,56,2,2,0,No-Show,15,192,2,195,0,...,1,0,0,1,0,0,1,0,0,1
1,60,2,2,0,Canceled,0,187,4,175,0,...,1,0,0,0,0,1,0,1,1,0
2,58,3,1,0,Canceled,10,227,1,193,1,...,1,0,0,0,1,0,1,0,0,1
3,23,1,2,0,Check-In,25,189,2,103,1,...,0,0,1,0,1,0,1,0,0,1
4,47,1,1,0,Check-In,10,218,1,92,1,...,1,0,0,0,0,1,0,1,0,1


In [7]:
# Preview the first five rows of the validation split.
# NOTE(review): this repeats the preceding cell verbatim -- possibly another
# split (e.g. train_data or test_data) was meant to be inspected here; confirm.
validation_data.head()

Unnamed: 0,Age,Adults,Children,Babies,Reservation_Status,Discount_Rate,Room_Rate,Expected_stay_days,Reservation_gap,Gender_F,...,Deposit_type_No Deposit,Deposit_type_Non-Refundable,Deposit_type_Refundable,Booking_channel_Agent,Booking_channel_Direct,Booking_channel_Online,Required_Car_Parking_No,Required_Car_Parking_Yes,Use_Promotion_No,Use_Promotion_Yes
0,56,2,2,0,No-Show,15,192,2,195,0,...,1,0,0,1,0,0,1,0,0,1
1,60,2,2,0,Canceled,0,187,4,175,0,...,1,0,0,0,0,1,0,1,1,0
2,58,3,1,0,Canceled,10,227,1,193,1,...,1,0,0,0,1,0,1,0,0,1
3,23,1,2,0,Check-In,25,189,2,103,1,...,0,0,1,0,1,0,1,0,0,1
4,47,1,1,0,Check-In,10,218,1,92,1,...,1,0,0,0,0,1,0,1,0,1


### Separating the columns of categorical and quantitative data

In [8]:
# Quantitative (numeric) feature columns. The order of this list is the
# column order of every frame/array selected with it further down.
data_quan_cols = [
    'Age',
    'Discount_Rate',
    'Room_Rate',
    'Expected_stay_days',
    'Reservation_gap',
    'Adults',
    'Children',
    'Babies',
]

In [9]:
# Base names of the categorical variables. The loaded frames already hold
# these as prefixed one-hot columns (e.g. Gender_F, Deposit_type_Refundable).
# Spellings such as 'Visted_Previously' are kept exactly as written --
# presumably they mirror the dataset's own column names; TODO confirm.
data_cat_cols = [
    'Gender',
    'Ethnicity',
    'Educational_Level',
    'Income',
    'Country_region',
    'Hotel_Type',
    'Meal_Type',
    'Visted_Previously',
    'Previous_Cancellations',
    'Deposit_type',
    'Booking_channel',
    'Required_Car_Parking',
    'Use_Promotion',
]

### Scaling the quantitative variables

In [10]:
# Slice the quantitative columns out of each split so they can be scaled
# independently of the one-hot encoded columns.
train_data_quan, validation_data_quan, test_data_quan = (
    split[data_quan_cols] for split in (train_data, validation_data, test_data)
)

In [11]:
# Standardizer for the quantitative features. with_mean / with_std are spelled
# out at their default values: centre each column and scale to unit variance.
sc = StandardScaler(with_mean=True, with_std=True)

In [12]:
# Learn the scaling statistics from the training split only, then apply the
# same fitted scaler to every split so validation/test never influence it.
sc.fit(train_data_quan)
train_data_quan_scaled = sc.transform(train_data_quan)
validation_data_quan_scaled = sc.transform(validation_data_quan)
test_data_quan_scaled = sc.transform(test_data_quan)

In [13]:
# Display the scaled training features (NumPy abbreviates large arrays).
train_data_quan_scaled

array([[-0.25991554, -0.22266812,  0.97691768, ..., -0.28353117,
         0.35475481, -0.61317041],
       [ 0.32820944, -1.11506055,  0.22480302, ...,  0.56644415,
         1.73987514, -0.61317041],
       [-0.1292211 , -1.11506055, -1.27942629, ...,  0.56644415,
         1.73987514, -0.61317041],
       ...,
       [ 0.32820944, -0.66886434,  0.61225603, ..., -0.28353117,
        -1.03036552,  1.13107011],
       [ 1.37376495,  0.66972431, -0.41335487, ...,  0.56644415,
         0.35475481, -0.61317041],
       [ 0.26286222, -1.11506055,  0.79458685, ...,  0.56644415,
        -1.03036552, -0.61317041]])

In [14]:
# Check the container type returned by the scaler: the column labels of the
# input DataFrame are not carried over to a plain ndarray.
type(train_data_quan_scaled)

numpy.ndarray

### Separating the categorical variables

In [15]:
# Columns that are not one-hot features: every quantitative column plus the
# target label. A new list is built, so data_quan_cols itself is not modified.
cols_to_drop = [*data_quan_cols, "Reservation_Status"]
cols_to_drop

['Age',
 'Discount_Rate',
 'Room_Rate',
 'Expected_stay_days',
 'Reservation_gap',
 'Adults',
 'Children',
 'Babies',
 'Reservation_Status']

In [16]:
# Print the drop list on a single line (same content as the list itself).
print(cols_to_drop)

['Age', 'Discount_Rate', 'Room_Rate', 'Expected_stay_days', 'Reservation_gap', 'Adults', 'Children', 'Babies', 'Reservation_Status']


In [17]:
# Isolate the one-hot encoded (categorical) feature columns of each split.
# The training frame defines the reference column set and column order.
train_data_cat = train_data.drop(columns=cols_to_drop)

# Validation and test are selected by the training column list rather than by
# dropping columns independently. The frames are later converted to bare
# arrays and joined by position, so any difference in column set or order
# (for instance a label column present in one split but not another) would
# silently misalign features. reindex guarantees identical columns in
# identical order; a dummy column absent from a split is filled with 0, which
# is the correct one-hot value for a category that never occurs there.
validation_data_cat = validation_data.reindex(columns=train_data_cat.columns, fill_value=0)
test_data_cat = test_data.reindex(columns=train_data_cat.columns, fill_value=0)
train_data_cat.head()

Unnamed: 0,Gender_F,Gender_M,Ethnicity_African American,Ethnicity_Asian American,Ethnicity_Latino,Ethnicity_caucasian,Educational_Level_College,Educational_Level_Grad,Educational_Level_High-School,Educational_Level_Mid-School,...,Deposit_type_No Deposit,Deposit_type_Non-Refundable,Deposit_type_Refundable,Booking_channel_Agent,Booking_channel_Direct,Booking_channel_Online,Required_Car_Parking_No,Required_Car_Parking_Yes,Use_Promotion_No,Use_Promotion_Yes
0,1,0,0,0,1,0,0,1,0,0,...,1,0,0,0,0,1,0,1,0,1
1,1,0,0,0,1,0,0,0,0,1,...,0,0,1,0,0,1,0,1,1,0
2,1,0,0,0,0,1,0,1,0,0,...,1,0,0,0,0,1,0,1,1,0
3,0,1,1,0,0,0,1,0,0,0,...,0,0,1,1,0,0,0,1,0,1
4,1,0,0,0,1,0,0,0,1,0,...,1,0,0,0,1,0,1,0,0,1


In [18]:
# Convert the one-hot frames to plain NumPy arrays so they can be joined with
# the scaled quantitative arrays; then show the training array's shape.
train_data_cat_array, validation_data_cat_array, test_data_cat_array = (
    frame.to_numpy() for frame in (train_data_cat, validation_data_cat, test_data_cat)
)
train_data_cat_array.shape

(27499, 38)

### Joining the pre-processed data

In [19]:
# Build the final feature matrices: scaled quantitative columns first,
# one-hot columns after them, joined column-wise for each split.
train_data_processed = np.hstack([train_data_quan_scaled, train_data_cat_array])
validation_data_processed = np.hstack([validation_data_quan_scaled, validation_data_cat_array])
test_data_processed = np.hstack([test_data_quan_scaled, test_data_cat_array])

In [20]:
# Inspect the first processed training row: scaled quantitative values come
# first, followed by the one-hot indicator values.
train_data_processed[0]

array([-0.25991554, -0.22266812,  0.97691768, -0.84878606, -0.88293639,
       -0.28353117,  0.35475481, -0.61317041,  1.        ,  0.        ,
        0.        ,  0.        ,  1.        ,  0.        ,  0.        ,
        1.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        1.        ,  0.        ,  0.        ,  1.        ,  0.        ,
        0.        ,  0.        ,  1.        ,  0.        ,  1.        ,
        0.        ,  0.        ,  1.        ,  0.        ,  1.        ,
        0.        ,  1.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  1.        ,  0.        ,  1.        ,  0.        ,
        1.        ])

In [21]:
# Shape check: the column count should equal len(data_quan_cols).
test_data_quan_scaled.shape

(4318, 8)

In [22]:
# Shape check: the column count should match train_data_cat_array, otherwise
# the test features would not line up with the training features.
test_data_cat_array.shape

(4318, 38)

In [23]:
# Shape check: columns = quantitative columns + one-hot columns.
validation_data_processed.shape

(2749, 46)

### Encoding the labels

In [24]:
# Pull the target column out of the two labelled splits
# (no label is extracted for the test split in this cell).
train_y, validation_y = (
    labelled_split['Reservation_Status']
    for labelled_split in (train_data, validation_data)
)

In [25]:
# Encoder that maps the string class labels to integer codes.
le = preprocessing.LabelEncoder()

In [26]:
# Fit the label encoder on the training labels and reuse the same mapping for
# the validation labels.
# The labels are read from the source column instead of from train_y /
# validation_y: the original self-referential form (train_y = f(train_y)) made
# the cell non-idempotent -- a second run would refit the encoder on the
# already-encoded integers and le.classes_ would no longer hold the label names.
train_y = le.fit_transform(train_data['Reservation_Status'])
validation_y = le.transform(validation_data['Reservation_Status'])

In [27]:
# Shape check: one encoded label per validation row.
validation_y.shape

(2749,)

In [29]:
# Show the fitted classes; each label's position in this array is its integer code.
le.classes_

array(['Canceled', 'Check-In', 'No-Show'], dtype=object)

### Coding...