In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import MinMaxScaler,StandardScaler
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.impute import SimpleImputer

In [2]:
data = pd.read_csv("hotel_bookings.csv")

In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 119390 entries, 0 to 119389
Data columns (total 32 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   hotel                           119390 non-null  object 
 1   is_canceled                     119390 non-null  int64  
 2   lead_time                       119390 non-null  int64  
 3   arrival_date_year               119390 non-null  int64  
 4   arrival_date_month              119390 non-null  object 
 5   arrival_date_week_number        119390 non-null  int64  
 6   arrival_date_day_of_month       119390 non-null  int64  
 7   stays_in_weekend_nights         119390 non-null  int64  
 8   stays_in_week_nights            119390 non-null  int64  
 9   adults                          119390 non-null  int64  
 10  children                        119386 non-null  float64
 11  babies                          119390 non-null  int64  
 12  meal            

In [4]:
#we have already performed EDA on this
#in this dataset let us just focus on Data Preprocessing

In [5]:
data.isnull().sum()

hotel                                  0
is_canceled                            0
lead_time                              0
arrival_date_year                      0
arrival_date_month                     0
arrival_date_week_number               0
arrival_date_day_of_month              0
stays_in_weekend_nights                0
stays_in_week_nights                   0
adults                                 0
children                               4
babies                                 0
meal                                   0
country                              488
market_segment                         0
distribution_channel                   0
is_repeated_guest                      0
previous_cancellations                 0
previous_bookings_not_canceled         0
reserved_room_type                     0
assigned_room_type                     0
booking_changes                        0
deposit_type                           0
agent                              16340
company         

In [6]:
imputer = SimpleImputer(strategy="most_frequent")
X_imputer = imputer.fit_transform(data)

In [7]:
X_imputer

array([['Resort Hotel', 0, 342, ..., 0, 'Check-Out', '2015-07-01'],
       ['Resort Hotel', 0, 737, ..., 0, 'Check-Out', '2015-07-01'],
       ['Resort Hotel', 0, 7, ..., 0, 'Check-Out', '2015-07-02'],
       ...,
       ['City Hotel', 0, 34, ..., 4, 'Check-Out', '2017-09-07'],
       ['City Hotel', 0, 109, ..., 0, 'Check-Out', '2017-09-07'],
       ['City Hotel', 0, 205, ..., 2, 'Check-Out', '2017-09-07']],
      dtype=object)

In [8]:
numeric = []
category = []
for col in data.columns:
    if data[col].dtype == "O":
        category.append(col)
    else:
        numeric.append(col)

In [9]:
data[category]

Unnamed: 0,hotel,arrival_date_month,meal,country,market_segment,distribution_channel,reserved_room_type,assigned_room_type,deposit_type,customer_type,reservation_status,reservation_status_date
0,Resort Hotel,July,BB,PRT,Direct,Direct,C,C,No Deposit,Transient,Check-Out,2015-07-01
1,Resort Hotel,July,BB,PRT,Direct,Direct,C,C,No Deposit,Transient,Check-Out,2015-07-01
2,Resort Hotel,July,BB,GBR,Direct,Direct,A,C,No Deposit,Transient,Check-Out,2015-07-02
3,Resort Hotel,July,BB,GBR,Corporate,Corporate,A,A,No Deposit,Transient,Check-Out,2015-07-02
4,Resort Hotel,July,BB,GBR,Online TA,TA/TO,A,A,No Deposit,Transient,Check-Out,2015-07-03
...,...,...,...,...,...,...,...,...,...,...,...,...
119385,City Hotel,August,BB,BEL,Offline TA/TO,TA/TO,A,A,No Deposit,Transient,Check-Out,2017-09-06
119386,City Hotel,August,BB,FRA,Online TA,TA/TO,E,E,No Deposit,Transient,Check-Out,2017-09-07
119387,City Hotel,August,BB,DEU,Online TA,TA/TO,D,D,No Deposit,Transient,Check-Out,2017-09-07
119388,City Hotel,August,BB,GBR,Online TA,TA/TO,A,A,No Deposit,Transient,Check-Out,2017-09-07


In [10]:
#initialize the scale methods that is supported in Sklearn

numeric_feature_scale_std = StandardScaler()
numeric_feature_scale_std_minmax = MinMaxScaler()

In [11]:
#for encoding category data types we basically use LabelEncoder and OneHotEncoder
category_feature_encode_le = LabelEncoder()
category_feature_encode_one = OneHotEncoder()

In [12]:
data[numeric]

Unnamed: 0,is_canceled,lead_time,arrival_date_year,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,booking_changes,agent,company,days_in_waiting_list,adr,required_car_parking_spaces,total_of_special_requests
0,0,342,2015,27,1,0,0,2,0.0,0,0,0,0,3,,,0,0.00,0,0
1,0,737,2015,27,1,0,0,2,0.0,0,0,0,0,4,,,0,0.00,0,0
2,0,7,2015,27,1,0,1,1,0.0,0,0,0,0,0,,,0,75.00,0,0
3,0,13,2015,27,1,0,1,1,0.0,0,0,0,0,0,304.0,,0,75.00,0,0
4,0,14,2015,27,1,0,2,2,0.0,0,0,0,0,0,240.0,,0,98.00,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
119385,0,23,2017,35,30,2,5,2,0.0,0,0,0,0,0,394.0,,0,96.14,0,0
119386,0,102,2017,35,31,2,5,3,0.0,0,0,0,0,0,9.0,,0,225.43,0,2
119387,0,34,2017,35,31,2,5,2,0.0,0,0,0,0,0,9.0,,0,157.71,0,4
119388,0,109,2017,35,31,2,5,2,0.0,0,0,0,0,0,89.0,,0,104.40,0,0


In [13]:
data[numeric] = numeric_feature_scale_std.fit_transform(data[numeric])

In [14]:
data.sample(10)

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,...,deposit_type,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date
87338,City Hotel,-0.76704,-0.963961,-0.221286,April,-0.820662,0.022977,0.072502,-0.786207,0.247897,...,No Deposit,-0.719423,,-0.131924,Transient,-0.476122,-0.254873,0.540666,Check-Out,2016-04-18
3990,Resort Hotel,1.303712,0.065398,-0.221286,February,-1.48218,-0.432562,-0.92889,-0.262174,0.247897,...,Non Refund,-0.439574,,-0.131924,Transient,-0.629875,-0.254873,-0.720694,Canceled,2016-01-26
35092,Resort Hotel,-0.76704,-0.851667,1.192195,April,-0.894164,-0.774217,1.073895,-0.262174,-1.478447,...,No Deposit,0.986753,,-0.131924,Transient-Party,0.260586,-0.254873,0.540666,Check-Out,2017-04-13
43623,City Hotel,-0.76704,-0.41185,-1.634768,September,0.869883,0.592402,0.072502,-0.786207,0.247897,...,No Deposit,-0.602067,,-0.131924,Transient-Party,-0.728816,-0.254873,-0.720694,Check-Out,2015-09-23
21386,Resort Hotel,-0.76704,-0.907814,-0.221286,February,-1.335176,1.161827,-0.92889,-0.262174,0.247897,...,No Deposit,1.383958,,-0.131924,Transient,-0.016446,3.821932,-0.720694,Check-Out,2016-02-28
116396,City Hotel,-0.76704,2.114757,1.192195,July,0.134863,0.022977,1.073895,-0.786207,0.247897,...,No Deposit,-0.701368,,-0.131924,Transient,0.35359,3.821932,3.063386,Check-Out,2017-07-19
34291,Resort Hotel,-0.76704,-0.954603,1.192195,March,-1.188172,0.136862,-0.92889,-0.262174,0.247897,...,No Deposit,1.474232,,-0.131924,Transient,0.003342,-0.254873,-0.720694,Check-Out,2017-03-19
22335,Resort Hotel,-0.76704,-0.898456,-0.221286,March,-1.041168,0.706287,-0.92889,-0.786207,0.247897,...,No Deposit,1.383958,,-0.131924,Transient,-0.669452,-0.254873,0.540666,Check-Out,2016-03-23
86043,City Hotel,-0.76704,-0.701942,-0.221286,March,-1.041168,0.820172,-0.92889,0.261858,0.247897,...,No Deposit,0.463165,,-0.131924,Transient,-0.732773,-0.254873,-0.720694,Check-Out,2016-03-26
112806,City Hotel,-0.76704,1.150903,1.192195,May,-0.453152,1.047942,0.072502,0.261858,0.247897,...,No Deposit,-0.701368,,-0.131924,Transient,0.460445,-0.254873,-0.720694,Check-Out,2017-05-29


In [15]:
data[category]

Unnamed: 0,hotel,arrival_date_month,meal,country,market_segment,distribution_channel,reserved_room_type,assigned_room_type,deposit_type,customer_type,reservation_status,reservation_status_date
0,Resort Hotel,July,BB,PRT,Direct,Direct,C,C,No Deposit,Transient,Check-Out,2015-07-01
1,Resort Hotel,July,BB,PRT,Direct,Direct,C,C,No Deposit,Transient,Check-Out,2015-07-01
2,Resort Hotel,July,BB,GBR,Direct,Direct,A,C,No Deposit,Transient,Check-Out,2015-07-02
3,Resort Hotel,July,BB,GBR,Corporate,Corporate,A,A,No Deposit,Transient,Check-Out,2015-07-02
4,Resort Hotel,July,BB,GBR,Online TA,TA/TO,A,A,No Deposit,Transient,Check-Out,2015-07-03
...,...,...,...,...,...,...,...,...,...,...,...,...
119385,City Hotel,August,BB,BEL,Offline TA/TO,TA/TO,A,A,No Deposit,Transient,Check-Out,2017-09-06
119386,City Hotel,August,BB,FRA,Online TA,TA/TO,E,E,No Deposit,Transient,Check-Out,2017-09-07
119387,City Hotel,August,BB,DEU,Online TA,TA/TO,D,D,No Deposit,Transient,Check-Out,2017-09-07
119388,City Hotel,August,BB,GBR,Online TA,TA/TO,A,A,No Deposit,Transient,Check-Out,2017-09-07


In [16]:
for i in category:
    data[i] = category_feature_encode_le.fit_transform(data[i].astype(str))

In [17]:
data[category]

Unnamed: 0,hotel,arrival_date_month,meal,country,market_segment,distribution_channel,reserved_room_type,assigned_room_type,deposit_type,customer_type,reservation_status,reservation_status_date
0,1,5,0,135,3,1,2,2,0,2,1,121
1,1,5,0,135,3,1,2,2,0,2,1,121
2,1,5,0,59,3,1,0,2,0,2,1,122
3,1,5,0,59,2,0,0,0,0,2,1,122
4,1,5,0,59,6,3,0,0,0,2,1,123
...,...,...,...,...,...,...,...,...,...,...,...,...
119385,0,1,0,15,5,3,0,0,0,2,1,919
119386,0,1,0,56,6,3,4,4,0,2,1,920
119387,0,1,0,43,6,3,3,3,0,2,1,920
119388,0,1,0,59,6,3,0,0,0,2,1,920


In [18]:
data.sample(10)

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,...,deposit_type,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date
41559,0,1.303712,-0.870383,-1.634768,1,0.502373,0.250747,-0.92889,-0.262174,0.247897,...,1,,-1.073086,-0.131924,2,-0.530935,-0.254873,-0.720694,0,164
69384,0,1.303712,0.767233,1.192195,8,-0.37965,1.617366,-0.92889,-0.262174,0.247897,...,1,-0.611094,,-0.131924,2,0.557407,-0.254873,-0.720694,0,634
58830,0,1.303712,1.674941,-0.221286,10,1.163891,0.022977,1.073895,-1.31024,0.247897,...,1,-0.773587,,-0.131924,2,-0.115386,-0.254873,-0.720694,0,344
33636,1,-0.76704,-0.365061,1.192195,3,-1.408678,0.592402,1.073895,1.309924,0.247897,...,0,1.392986,,-0.131924,2,-1.217382,-0.254873,0.540666,1,729
48068,0,1.303712,-0.804878,-0.221286,7,-1.11467,-0.090908,-0.92889,1.309924,-1.478447,...,0,,,-0.131924,2,0.533661,-0.254873,-0.720694,0,366
52214,0,1.303712,2.114757,-0.221286,6,-0.306148,-1.571412,-0.92889,-0.262174,0.247897,...,1,-0.773587,,-0.131924,2,-0.78818,-0.254873,-0.720694,0,233
77230,0,1.303712,-0.804878,-1.634768,10,0.943385,-1.457527,1.073895,1.309924,-1.478447,...,0,-0.647204,,-0.131924,2,-0.468207,-0.254873,-0.720694,0,197
13439,1,1.303712,0.842096,1.192195,1,0.428871,-0.090908,-0.92889,1.309924,0.247897,...,0,1.383958,,-0.131924,2,1.151048,-0.254873,0.540666,0,723
103648,0,-0.76704,-0.131116,-0.221286,2,1.825408,0.934057,1.073895,0.261858,1.974242,...,0,-0.701368,,-0.131924,2,0.339738,-0.254873,0.540666,1,668
116806,0,-0.76704,-0.02818,1.192195,5,0.208365,0.934057,0.072502,-1.31024,1.974242,...,0,-0.701368,,-0.131924,2,2.021722,-0.254873,1.802026,1,876


In [19]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 119390 entries, 0 to 119389
Data columns (total 32 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   hotel                           119390 non-null  int64  
 1   is_canceled                     119390 non-null  float64
 2   lead_time                       119390 non-null  float64
 3   arrival_date_year               119390 non-null  float64
 4   arrival_date_month              119390 non-null  int64  
 5   arrival_date_week_number        119390 non-null  float64
 6   arrival_date_day_of_month       119390 non-null  float64
 7   stays_in_weekend_nights         119390 non-null  float64
 8   stays_in_week_nights            119390 non-null  float64
 9   adults                          119390 non-null  float64
 10  children                        119386 non-null  float64
 11  babies                          119390 non-null  float64
 12  meal            