# Importing Libraries

In [7]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Reading the data

In [8]:
filepath = "D:/Datasets/Hotel booking/hotel_bookings.csv"
df=pd.read_csv(filepath)
df.head()

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,...,deposit_type,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date
0,Resort Hotel,0,342,2015,July,27,1,0,0,2,...,No Deposit,,,0,Transient,0.0,0,0,Check-Out,7/1/2015
1,Resort Hotel,0,737,2015,July,27,1,0,0,2,...,No Deposit,,,0,Transient,0.0,0,0,Check-Out,7/1/2015
2,Resort Hotel,0,7,2015,July,27,1,0,1,1,...,No Deposit,,,0,Transient,75.0,0,0,Check-Out,7/2/2015
3,Resort Hotel,0,13,2015,July,27,1,0,1,1,...,No Deposit,304.0,,0,Transient,75.0,0,0,Check-Out,7/2/2015
4,Resort Hotel,0,14,2015,July,27,1,0,2,2,...,No Deposit,240.0,,0,Transient,98.0,0,1,Check-Out,7/3/2015


# Data cleaning

In [9]:
df.isnull().sum()

hotel                                  0
is_canceled                            0
lead_time                              0
arrival_date_year                      0
arrival_date_month                     0
arrival_date_week_number               0
arrival_date_day_of_month              0
stays_in_weekend_nights                0
stays_in_week_nights                   0
adults                                 0
children                               4
babies                                 0
meal                                   0
country                              488
market_segment                         0
distribution_channel                   0
is_repeated_guest                      0
previous_cancellations                 0
previous_bookings_not_canceled         0
reserved_room_type                     0
assigned_room_type                     0
booking_changes                        0
deposit_type                           0
agent                              16340
company         

*agent and company have a lot of NULL values. Dropping them since the values are just IDs*

In [10]:
df.drop(['agent','company'],axis=1,inplace=True)

Seems like country also has a lot of missing values. Consider filling them with the highest country

In [11]:
df['country'].value_counts()

PRT    48590
GBR    12129
FRA    10415
ESP     8568
DEU     7287
       ...  
DJI        1
BWA        1
HND        1
VGB        1
NAM        1
Name: country, Length: 177, dtype: int64

In [12]:
df['country'].fillna(df['country'].value_counts().index[0],inplace=True)

Missing countries are filled with PRT

In [13]:
df.isnull().sum()

hotel                             0
is_canceled                       0
lead_time                         0
arrival_date_year                 0
arrival_date_month                0
arrival_date_week_number          0
arrival_date_day_of_month         0
stays_in_weekend_nights           0
stays_in_week_nights              0
adults                            0
children                          4
babies                            0
meal                              0
country                           0
market_segment                    0
distribution_channel              0
is_repeated_guest                 0
previous_cancellations            0
previous_bookings_not_canceled    0
reserved_room_type                0
assigned_room_type                0
booking_changes                   0
deposit_type                      0
days_in_waiting_list              0
customer_type                     0
adr                               0
required_car_parking_spaces       0
total_of_special_requests   

In [14]:
df.fillna(0,inplace = True)

Everything else is filled with 0

In [15]:
df.isnull().sum()

hotel                             0
is_canceled                       0
lead_time                         0
arrival_date_year                 0
arrival_date_month                0
arrival_date_week_number          0
arrival_date_day_of_month         0
stays_in_weekend_nights           0
stays_in_week_nights              0
adults                            0
children                          0
babies                            0
meal                              0
country                           0
market_segment                    0
distribution_channel              0
is_repeated_guest                 0
previous_cancellations            0
previous_bookings_not_canceled    0
reserved_room_type                0
assigned_room_type                0
booking_changes                   0
deposit_type                      0
days_in_waiting_list              0
customer_type                     0
adr                               0
required_car_parking_spaces       0
total_of_special_requests   

# Irrelevant data

There is data where children, adult and babies are 0. If there is no one, how is the booking even possible??

Let's clean this data

In [16]:
filter_1 = (df['children']==0) & (df['adults']==0) & (df['babies']==0)
data = df[~filter_1]

In [17]:
data.head()

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,...,assigned_room_type,booking_changes,deposit_type,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date
0,Resort Hotel,0,342,2015,July,27,1,0,0,2,...,C,3,No Deposit,0,Transient,0.0,0,0,Check-Out,7/1/2015
1,Resort Hotel,0,737,2015,July,27,1,0,0,2,...,C,4,No Deposit,0,Transient,0.0,0,0,Check-Out,7/1/2015
2,Resort Hotel,0,7,2015,July,27,1,0,1,1,...,C,0,No Deposit,0,Transient,75.0,0,0,Check-Out,7/2/2015
3,Resort Hotel,0,13,2015,July,27,1,0,1,1,...,A,0,No Deposit,0,Transient,75.0,0,0,Check-Out,7/2/2015
4,Resort Hotel,0,14,2015,July,27,1,0,2,2,...,A,0,No Deposit,0,Transient,98.0,0,1,Check-Out,7/3/2015


# Spacial Analysis

## 1. Percentage of guests from each country

In [18]:
data[data['is_canceled']==0]['country'].value_counts()/len(data[data['is_canceled']==0])

PRT    0.285265
GBR    0.128888
FRA    0.112890
ESP    0.085094
DEU    0.080881
         ...   
BHR    0.000013
DJI    0.000013
MLI    0.000013
NPL    0.000013
FRO    0.000013
Name: country, Length: 165, dtype: float64

We can see portugal has highest percentage of 28% overall

In [19]:
country_wise_data = data[data['is_canceled']==0]['country'].value_counts().reset_index()
country_wise_data.columns = ['country','no_of_guests']
country_wise_data

Unnamed: 0,country,no_of_guests
0,PRT,21398
1,GBR,9668
2,FRA,8468
3,ESP,6383
4,DEU,6067
...,...,...
160,BHR,1
161,DJI,1
162,MLI,1
163,NPL,1


## 2. Busiest month

There are 2 types of hotels (Resort & City). Let's take 2 dataframes for each hotel and merge it with month as key.

In [20]:
data_resort=data[(data['hotel']=='Resort Hotel') & (data['is_canceled']==0)]
data_city = data[(data['hotel']=='City Hotel') & (data['is_canceled']==0)]

In [21]:
rush_resort=data_resort['arrival_date_month'].value_counts().reset_index()
rush_resort.columns=['month','no_of_guests']
rush_city=data_city['arrival_date_month'].value_counts().reset_index()
rush_city.columns=['month','no_of_guests']
final_rush=rush_resort.merge(rush_city,on='month')
final_rush.columns=['month','no_of_guests_in_resort','no_of_guests_city']
final_rush

Unnamed: 0,month,no_of_guests_in_resort,no_of_guests_city
0,August,3257,5367
1,July,3137,4770
2,October,2575,4326
3,March,2571,4049
4,April,2550,4010
5,May,2535,4568
6,February,2308,3051
7,September,2102,4283
8,June,2037,4358
9,December,2014,2377


*The data is not sorted by month which is confusing. Let us install a package which can be utilized to sort the dataframe by month*

In [22]:
!pip install sorted-months-weekdays
!pip install sort-dataframeby-monthorweek



In [23]:
import sort_dataframeby_monthorweek

ModuleNotFoundError: No module named 'sort_dataframeby_monthorweek'

In [None]:
final_rush=sd.Sort_Dataframeby_Month(final_rush,'month')

In [24]:
data.columns

Index(['hotel', 'is_canceled', 'lead_time', 'arrival_date_year',
       'arrival_date_month', 'arrival_date_week_number',
       'arrival_date_day_of_month', 'stays_in_weekend_nights',
       'stays_in_week_nights', 'adults', 'children', 'babies', 'meal',
       'country', 'market_segment', 'distribution_channel',
       'is_repeated_guest', 'previous_cancellations',
       'previous_bookings_not_canceled', 'reserved_room_type',
       'assigned_room_type', 'booking_changes', 'deposit_type',
       'days_in_waiting_list', 'customer_type', 'adr',
       'required_car_parking_spaces', 'total_of_special_requests',
       'reservation_status', 'reservation_status_date'],
      dtype='object')

# Reducing and Creating new features

## Drop adults, children, babies and create a new feature is_family and total_customers

In [25]:
def family(row):
    if (row['adults']>0) &  (row['children']>0 or row['babies']>0) :
        return 1
    else:
        return 0

In [26]:
data['is_family'] = data.apply(family, axis = 1) 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['is_family'] = data.apply(family, axis = 1)


In [27]:
data[data['is_family']==1]

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,...,booking_changes,deposit_type,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date,is_family
13,Resort Hotel,0,18,2015,July,27,1,0,4,2,...,1,No Deposit,0,Transient,154.77,0,1,Check-Out,7/5/2015,1
45,Resort Hotel,1,47,2015,July,27,2,2,5,2,...,0,No Deposit,0,Transient,153.00,0,0,Canceled,6/2/2015,1
55,Resort Hotel,0,1,2015,July,27,2,0,1,2,...,0,No Deposit,0,Transient,107.00,1,2,Check-Out,7/3/2015,1
65,Resort Hotel,0,10,2015,July,27,3,0,2,2,...,0,No Deposit,0,Transient,153.00,1,0,Check-Out,7/5/2015,1
87,Resort Hotel,1,79,2015,July,27,3,6,15,2,...,0,No Deposit,0,Transient,108.73,0,2,Canceled,4/15/2015,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
119270,City Hotel,0,0,2017,August,35,29,0,3,2,...,1,No Deposit,0,Transient,135.00,0,2,Check-Out,9/1/2017,1
119287,City Hotel,0,167,2017,August,35,27,2,4,2,...,1,No Deposit,0,Transient,177.75,0,0,Check-Out,9/2/2017,1
119293,City Hotel,0,243,2017,August,35,30,0,3,2,...,0,No Deposit,0,Transient,189.00,0,3,Check-Out,9/2/2017,1
119318,City Hotel,0,160,2017,August,35,29,0,5,2,...,0,No Deposit,0,Transient,153.90,0,0,Check-Out,9/3/2017,1


In [28]:
data['total_customer'] = data['adults'] + data['babies'] + data['children']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['total_customer'] = data['adults'] + data['babies'] + data['children']


## We could merge week nights and weekend nights and create total_nights

In [29]:
data['total_nights']=data['stays_in_week_nights'] + data['stays_in_weekend_nights']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['total_nights']=data['stays_in_week_nights'] + data['stays_in_weekend_nights']


In [30]:
data.head(3)

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,...,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date,is_family,total_customer,total_nights
0,Resort Hotel,0,342,2015,July,27,1,0,0,2,...,0,Transient,0.0,0,0,Check-Out,7/1/2015,0,2.0,0
1,Resort Hotel,0,737,2015,July,27,1,0,0,2,...,0,Transient,0.0,0,0,Check-Out,7/1/2015,0,2.0,0
2,Resort Hotel,0,7,2015,July,27,1,0,1,1,...,0,Transient,75.0,0,0,Check-Out,7/2/2015,0,1.0,1


In [31]:
data.columns

Index(['hotel', 'is_canceled', 'lead_time', 'arrival_date_year',
       'arrival_date_month', 'arrival_date_week_number',
       'arrival_date_day_of_month', 'stays_in_weekend_nights',
       'stays_in_week_nights', 'adults', 'children', 'babies', 'meal',
       'country', 'market_segment', 'distribution_channel',
       'is_repeated_guest', 'previous_cancellations',
       'previous_bookings_not_canceled', 'reserved_room_type',
       'assigned_room_type', 'booking_changes', 'deposit_type',
       'days_in_waiting_list', 'customer_type', 'adr',
       'required_car_parking_spaces', 'total_of_special_requests',
       'reservation_status', 'reservation_status_date', 'is_family',
       'total_customer', 'total_nights'],
      dtype='object')

## Deposit type can also be replaced with 0 & 1

In [32]:
data['deposit_type'].unique()

array(['No Deposit', 'Refundable', 'Non Refund'], dtype=object)

In [33]:
dict1={'No Deposit':0, 'Non Refund':1, 'Refundable': 0}

In [34]:
data['deposit_given']=data['deposit_type'].map(dict1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['deposit_given']=data['deposit_type'].map(dict1)


In [35]:
data.columns

Index(['hotel', 'is_canceled', 'lead_time', 'arrival_date_year',
       'arrival_date_month', 'arrival_date_week_number',
       'arrival_date_day_of_month', 'stays_in_weekend_nights',
       'stays_in_week_nights', 'adults', 'children', 'babies', 'meal',
       'country', 'market_segment', 'distribution_channel',
       'is_repeated_guest', 'previous_cancellations',
       'previous_bookings_not_canceled', 'reserved_room_type',
       'assigned_room_type', 'booking_changes', 'deposit_type',
       'days_in_waiting_list', 'customer_type', 'adr',
       'required_car_parking_spaces', 'total_of_special_requests',
       'reservation_status', 'reservation_status_date', 'is_family',
       'total_customer', 'total_nights', 'deposit_given'],
      dtype='object')

## Finally, dropping all the columns

In [36]:
data.drop(columns=['adults', 'children', 'babies', 'deposit_type'],axis=1,inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.drop(columns=['adults', 'children', 'babies', 'deposit_type'],axis=1,inplace=True)


In [37]:
data.head()

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,meal,...,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date,is_family,total_customer,total_nights,deposit_given
0,Resort Hotel,0,342,2015,July,27,1,0,0,BB,...,Transient,0.0,0,0,Check-Out,7/1/2015,0,2.0,0,0
1,Resort Hotel,0,737,2015,July,27,1,0,0,BB,...,Transient,0.0,0,0,Check-Out,7/1/2015,0,2.0,0,0
2,Resort Hotel,0,7,2015,July,27,1,0,1,BB,...,Transient,75.0,0,0,Check-Out,7/2/2015,0,1.0,1,0
3,Resort Hotel,0,13,2015,July,27,1,0,1,BB,...,Transient,75.0,0,0,Check-Out,7/2/2015,0,1.0,1,0
4,Resort Hotel,0,14,2015,July,27,1,0,2,BB,...,Transient,98.0,0,1,Check-Out,7/3/2015,0,2.0,2,0


# Categorical encoding

### Encode all the string or object types into categories using mean encoding technique

In [38]:
cate_features=[col for col in data.columns if data[col].dtype=='object']

In [39]:
num_features=[col for col in data.columns if data[col].dtype!='object']

In [40]:
num_features

['is_canceled',
 'lead_time',
 'arrival_date_year',
 'arrival_date_week_number',
 'arrival_date_day_of_month',
 'stays_in_weekend_nights',
 'stays_in_week_nights',
 'is_repeated_guest',
 'previous_cancellations',
 'previous_bookings_not_canceled',
 'booking_changes',
 'days_in_waiting_list',
 'adr',
 'required_car_parking_spaces',
 'total_of_special_requests',
 'is_family',
 'total_customer',
 'total_nights',
 'deposit_given']

In [41]:
cate_features

['hotel',
 'arrival_date_month',
 'meal',
 'country',
 'market_segment',
 'distribution_channel',
 'reserved_room_type',
 'assigned_room_type',
 'customer_type',
 'reservation_status',
 'reservation_status_date']

In [42]:
data_cat = data[cate_features]

In [43]:
data_cat['cancellation']=data['is_canceled']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_cat['cancellation']=data['is_canceled']


In [44]:
cols=data_cat.columns
cols=cols[0:-1]
cols

Index(['hotel', 'arrival_date_month', 'meal', 'country', 'market_segment',
       'distribution_channel', 'reserved_room_type', 'assigned_room_type',
       'customer_type', 'reservation_status', 'reservation_status_date'],
      dtype='object')

In [45]:
for col in cols:
    dict2=data_cat.groupby([col])['cancellation'].mean().to_dict()
    data_cat[col]=data_cat[col].map(dict2)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_cat[col]=data_cat[col].map(dict2)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_cat[col]=data_cat[col].map(dict2)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_cat[col]=data_cat[col].map(dict2)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_in

In [46]:
data_cat.head()

Unnamed: 0,hotel,arrival_date_month,meal,country,market_segment,distribution_channel,reserved_room_type,assigned_room_type,customer_type,reservation_status,reservation_status_date,cancellation
0,0.277674,0.374644,0.374106,0.562958,0.153712,0.174868,0.330827,0.188186,0.407864,0.0,0.8,0
1,0.277674,0.374644,0.374106,0.562958,0.153712,0.174868,0.330827,0.188186,0.407864,0.0,0.8,0
2,0.277674,0.374644,0.374106,0.20231,0.153712,0.174868,0.391567,0.188186,0.407864,0.0,0.985075,0
3,0.277674,0.374644,0.374106,0.20231,0.187618,0.220568,0.391567,0.445055,0.407864,0.0,0.985075,0
4,0.277674,0.374644,0.374106,0.20231,0.36759,0.410598,0.391567,0.445055,0.407864,0.0,0.522876,0


In [47]:
dataframe=pd.concat([data_cat,data[num_features]],axis=1)

## Handling outliers

In [48]:
def handle_outlier(col):
    dataframe[col]=np.log1p(dataframe[col])

In [49]:
handle_outlier('lead_time')

In [50]:
handle_outlier('adr')

  result = getattr(ufunc, method)(*inputs, **kwargs)


In [51]:
dataframe['adr'].isnull().sum()

1

# Select important features using Co-relation & Univariate analysis

In [52]:
corr=dataframe.corr()

In [53]:
corr

Unnamed: 0,hotel,arrival_date_month,meal,country,market_segment,distribution_channel,reserved_room_type,assigned_room_type,customer_type,reservation_status,...,previous_bookings_not_canceled,booking_changes,days_in_waiting_list,adr,required_car_parking_spaces,total_of_special_requests,is_family,total_customer,total_nights,deposit_given
hotel,1.0,0.051197,0.061782,-0.040609,0.102592,0.182266,0.198171,0.280884,-0.008378,0.137082,...,-0.004467,-0.076598,0.072725,0.154041,-0.218961,-0.043478,-0.058094,-0.038762,-0.249747,0.172415
arrival_date_month,0.051197,1.0,-0.010208,-0.052405,0.047902,0.077075,-0.038801,0.015456,-0.029636,0.069886,...,-0.021971,-0.011049,0.029164,0.228179,-0.02191,0.002763,0.020491,0.07909,0.045648,0.057361
meal,0.061782,-0.010208,1.0,0.022476,-0.026007,-0.025486,0.010035,0.013913,0.105767,0.050584,...,0.011109,-0.027189,-0.031562,-0.019908,0.003455,0.006875,0.001466,-0.003447,-0.055689,0.047597
country,-0.040609,-0.052405,0.022476,1.0,0.078982,-0.129774,0.153177,0.125415,-0.02932,0.357232,...,0.091795,-0.061513,0.067054,-0.16052,0.007613,-0.200822,-0.044139,-0.117351,-0.160759,0.395685
market_segment,0.102592,0.047902,-0.026007,0.078982,1.0,0.540116,0.182813,0.209272,-0.265606,0.267006,...,-0.117519,-0.057578,0.096385,-0.004009,-0.145726,-0.155463,-0.104584,-0.023874,-0.004546,0.397229
distribution_channel,0.182266,0.077075,-0.025486,-0.129774,0.540116,1.0,0.070381,0.11897,-0.023234,0.177167,...,-0.14265,-0.118461,0.048001,0.126516,-0.146562,0.075047,-0.026278,0.087776,0.084999,0.11516
reserved_room_type,0.198171,-0.038801,0.010035,0.153177,0.182813,0.070381,1.0,0.722724,-0.14339,0.072769,...,0.022533,-0.040134,0.068336,-0.185932,-0.093401,-0.161406,-0.133748,-0.246347,-0.188866,0.201923
assigned_room_type,0.280884,0.015456,0.013913,0.125415,0.209272,0.11897,0.722724,1.0,-0.084025,0.20157,...,-0.003205,-0.096277,0.067129,-0.038347,-0.131035,-0.135337,-0.179356,-0.229081,-0.103571,0.263383
customer_type,-0.008378,-0.029636,0.105767,-0.02932,-0.265606,-0.023234,-0.14339,-0.084025,1.0,0.136617,...,0.024716,-0.095486,-0.101833,0.117592,0.061786,0.137114,0.106618,0.124318,0.03772,0.121789
reservation_status,0.137082,0.069886,0.050584,0.357232,0.267006,0.177167,0.072769,0.20157,0.136617,1.0,...,-0.057365,-0.144832,0.054301,0.08166,-0.195701,-0.234877,-0.013226,0.044826,0.018554,0.481507


In [54]:
corr['is_canceled'].sort_values(ascending=False)

is_canceled                       1.000000
cancellation                      1.000000
reservation_status                1.000000
reservation_status_date           0.488307
deposit_given                     0.481507
country                           0.357232
lead_time                         0.320075
market_segment                    0.267006
assigned_room_type                0.201570
distribution_channel              0.177167
hotel                             0.137082
customer_type                     0.136617
previous_cancellations            0.110139
adr                               0.081660
reserved_room_type                0.072769
arrival_date_month                0.069886
days_in_waiting_list              0.054301
meal                              0.050584
total_customer                    0.044826
stays_in_week_nights              0.025542
total_nights                      0.018554
arrival_date_year                 0.016622
arrival_date_week_number          0.008315
stays_in_we

## We could probably drop the ones that are co-related too high and too low to avoid overfitting and underfitting respectively

In [55]:
corr['is_canceled'].sort_values(ascending=False).index

Index(['is_canceled', 'cancellation', 'reservation_status',
       'reservation_status_date', 'deposit_given', 'country', 'lead_time',
       'market_segment', 'assigned_room_type', 'distribution_channel', 'hotel',
       'customer_type', 'previous_cancellations', 'adr', 'reserved_room_type',
       'arrival_date_month', 'days_in_waiting_list', 'meal', 'total_customer',
       'stays_in_week_nights', 'total_nights', 'arrival_date_year',
       'arrival_date_week_number', 'stays_in_weekend_nights',
       'arrival_date_day_of_month', 'is_family',
       'previous_bookings_not_canceled', 'is_repeated_guest',
       'booking_changes', 'required_car_parking_spaces',
       'total_of_special_requests'],
      dtype='object')

In [56]:
features_to_drop=['reservation_status', 'reservation_status_date','arrival_date_year',
       'arrival_date_week_number', 'stays_in_weekend_nights',
       'arrival_date_day_of_month']
dataframe.drop(features_to_drop,axis=1,inplace=True)

In [57]:
dataframe.shape

(119210, 25)

In [59]:
dataframe.drop(['cancellation'], axis = 1, inplace = True)

In [60]:
dataframe.dropna(inplace=True)

### Separating dependent and independent features

In [61]:
x=dataframe.drop('is_canceled',axis=1)
y=dataframe['is_canceled']

In [62]:
from sklearn.linear_model import Lasso
from sklearn.feature_selection import SelectFromModel

In [63]:
feature_sel_model=SelectFromModel(Lasso(alpha=0.005))

In [64]:
feature_sel_model.fit(x,y)

In [65]:
feature_sel_model.get_support()

array([False, False, False,  True, False, False, False, False, False,
        True, False, False,  True,  True,  True,  True,  True,  True,
        True, False,  True,  True,  True])

In [66]:
cols=x.columns

In [67]:
selected_feature=cols[feature_sel_model.get_support()]

In [68]:
selected_feature

Index(['country', 'lead_time', 'previous_cancellations',
       'previous_bookings_not_canceled', 'booking_changes',
       'days_in_waiting_list', 'adr', 'required_car_parking_spaces',
       'total_of_special_requests', 'total_customer', 'total_nights',
       'deposit_given'],
      dtype='object')

In [69]:
x=x[selected_feature]

# Model Building

 ## Using Logistic Regression since this is a classification problem

In [71]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split( x, y, test_size=0.25)

In [72]:
from sklearn.linear_model import LogisticRegression
logreg=LogisticRegression()

In [73]:
logreg.fit(X_train,y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [74]:
pred=logreg.predict(X_test)

## Comparing the results using confusion matrix

In [75]:
from sklearn.metrics import confusion_matrix

In [76]:
confusion_matrix(y_test,pred)

array([[17372,  1371],
       [ 4627,  6433]], dtype=int64)

## Accuracy

In [77]:
from sklearn.metrics import accuracy_score

In [78]:
accuracy_score(y_test,pred)

0.798745092775895

## Applying cross-validation

In [79]:
from sklearn.model_selection import cross_val_score

In [80]:
score=cross_val_score(logreg,x,y,cv=10)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [81]:
score

array([0.78584011, 0.7428907 , 0.84305008, 0.74146464, 0.73097894,
       0.78936331, 0.73886419, 0.80605654, 0.79859072, 0.89941275])

# Finally, using multiple classification algorithms and comparing the results

In [82]:
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

In [83]:
models=[]

models.append(('LogisticRegression',LogisticRegression()))
models.append(('Naive_bayes',GaussianNB()))
models.append(('Random Forest',RandomForestClassifier()))
models.append(('Decision_tree',DecisionTreeClassifier()))
models.append(('KNN',KNeighborsClassifier()))

In [84]:
for name,model in models:
    print(name)
    model.fit(X_train,y_train)
    
    predictions=model.predict(X_test)
    
    from sklearn.metrics import confusion_matrix
    cm=confusion_matrix(predictions,y_test)
    print(cm)
    
    from sklearn.metrics import accuracy_score
    acc=accuracy_score(predictions,y_test)
    print(acc)
    print('\n')

LogisticRegression


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[[17372  4627]
 [ 1371  6433]]
0.798745092775895


Naive_bayes
[[ 6750   681]
 [11993 10379]]
0.5747407979062511


Random Forest
[[17144  2743]
 [ 1599  8317]]
0.8543099687950877


Decision_tree
[[15877  2652]
 [ 2866  8408]]
0.8148508539408784


KNN
[[16709  3365]
 [ 2034  7695]]
0.8188437405630306




# Hence, we can see that Random Forest gives the best results with accuracy over 85%

In [85]:
rfc=RandomForestClassifier()
rfc.fit(X_train,y_train)
predictions=rfc.predict(X_test)


In [86]:
score

array([0.78609177, 0.73727036, 0.78122641, 0.71244023, 0.72049325,
       0.77862595, 0.71705394, 0.77568996, 0.73206946, 0.7761745 ])

In [87]:
accuracy_score(y_test,predictions)

0.8543435224641814