# Feature selection 

## 1. Correlation analysis

In [43]:
import pandas as pd
import numpy as np
#import seaborn as sns
#import matplotlib.pyplot as plt
#import os

In [44]:
%pwd

'/Users/oliviawang/Documents/Hotel_booking_prediction_Python'

In [49]:
%store -r cat_num

In [50]:
cat_num.columns

Index(['hotel', 'arrival_date_month', 'children', 'meal', 'country',
       'market_segment', 'distribution_channel', 'reserved_room_type',
       'assigned_room_type', 'customer_type', 'reservation_status',
       'reservation_status_date', 'weekend_or_weekday', 'is_canceled',
       'lead_time', 'arrival_date_year', 'arrival_date_week_number',
       'arrival_date_day_of_month', 'stays_in_weekend_nights',
       'stays_in_week_nights', 'adults', 'babies', 'is_repeated_guest',
       'previous_cancellations', 'previous_bookings_not_canceled',
       'booking_changes', 'days_in_waiting_list', 'adr',
       'required_car_parking_spaces', 'total_of_special_requests',
       'deposit_given'],
      dtype='object')

To investigate the distribution of the features when 'is_canceled' == 0 and 'is_canceled' == 1. The smaller the overlap, the better.

In [51]:
# Correlation Study
corr = cat_num.corr()
corr['is_canceled'].sort_values(ascending = False)

reservation_status                1.000000
is_canceled                       1.000000
reservation_status_date           0.488307
deposit_given                     0.481507
country                           0.357232
lead_time                         0.320075
market_segment                    0.267006
assigned_room_type                0.201570
distribution_channel              0.177167
hotel                             0.137082
customer_type                     0.136617
previous_cancellations            0.110139
adr                               0.081660
weekend_or_weekday                0.073162
reserved_room_type                0.072769
arrival_date_month                0.069886
adults                            0.058182
days_in_waiting_list              0.054301
meal                              0.050584
children                          0.029749
stays_in_week_nights              0.025542
arrival_date_year                 0.016622
arrival_date_week_number          0.008315
stays_in_we

- 'reservation_status' has extremely high correlation with 'is_cancled'. 
- 'children', 'stays_in_week_nights', 'arrival_date_year', 'arrival_date_week_number', 'stays_in_weekend_nights', 'arrival_date_day_of_month' have very small correlations with 'is_canceled'

In [52]:
features_to_drop = ['reservation_status', 'children', 'stays_in_week_nights', 'arrival_date_year', 'arrival_date_week_number', 'stays_in_weekend_nights', 'arrival_date_day_of_month']

In [53]:
cat_num.drop(features_to_drop, axis = 1, inplace = True)

In [54]:
cat_num.shape

(119210, 24)

In [55]:
cat_num.columns

Index(['hotel', 'arrival_date_month', 'meal', 'country', 'market_segment',
       'distribution_channel', 'reserved_room_type', 'assigned_room_type',
       'customer_type', 'reservation_status_date', 'weekend_or_weekday',
       'is_canceled', 'lead_time', 'adults', 'babies', 'is_repeated_guest',
       'previous_cancellations', 'previous_bookings_not_canceled',
       'booking_changes', 'days_in_waiting_list', 'adr',
       'required_car_parking_spaces', 'total_of_special_requests',
       'deposit_given'],
      dtype='object')

## 2. Find significant features using Lasso

In [56]:
cat_num.isnull().sum() # need to drop the missing value

hotel                             0
arrival_date_month                0
meal                              0
country                           0
market_segment                    0
distribution_channel              0
reserved_room_type                0
assigned_room_type                0
customer_type                     0
reservation_status_date           0
weekend_or_weekday                0
is_canceled                       0
lead_time                         0
adults                            0
babies                            0
is_repeated_guest                 0
previous_cancellations            0
previous_bookings_not_canceled    0
booking_changes                   0
days_in_waiting_list              0
adr                               1
required_car_parking_spaces       0
total_of_special_requests         0
deposit_given                     0
dtype: int64

In [57]:
cat_num.dropna(inplace = True)

In [58]:
predictors = cat_num.drop('is_canceled', axis = 1)
response = cat_num['is_canceled']

In [59]:
from sklearn.linear_model import Lasso
from sklearn.feature_selection import SelectFromModel

In [60]:
lasso_mod = Lasso(alpha = 0.005) # arbitrary alpha value

In [63]:
feature_sel_mod = SelectFromModel(lasso_mod)
feature_sel_mod.fit(predictors, response)
feature_sel_mod.get_support() # False means the feature is not selected 

array([False, False, False,  True, False, False, False, False, False,
        True, False,  True,  True, False, False,  True,  True,  True,
        True,  True,  True,  True,  True])

In [64]:
xcols = predictors.columns

In [65]:
selected_features = xcols[feature_sel_mod.get_support()] 
selected_features

Index(['country', 'reservation_status_date', 'lead_time', 'adults',
       'previous_cancellations', 'previous_bookings_not_canceled',
       'booking_changes', 'days_in_waiting_list', 'adr',
       'required_car_parking_spaces', 'total_of_special_requests',
       'deposit_given'],
      dtype='object')

In [68]:
predictors = predictors[selected_features]
predictors.columns

Index(['country', 'reservation_status_date', 'lead_time', 'adults',
       'previous_cancellations', 'previous_bookings_not_canceled',
       'booking_changes', 'days_in_waiting_list', 'adr',
       'required_car_parking_spaces', 'total_of_special_requests',
       'deposit_given'],
      dtype='object')

In [69]:
%store predictors

Stored 'predictors' (DataFrame)


In [70]:
%store response

Stored 'response' (Series)


In [71]:
response.describe()

count    119209.000000
mean          0.370769
std           0.483013
min           0.000000
25%           0.000000
50%           0.000000
75%           1.000000
max           1.000000
Name: is_canceled, dtype: float64