In [20]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

# Pandas display settings: both options are set to None so wide frames
# (e.g. the 30-column df.head() below) are not cut off at a fixed column count.
pd.set_option('display.width', None)
pd.set_option('display.max_columns', None) 

%load_ext autoreload
%autoreload 2
import logging
# Surface INFO-level records in the notebook output (the get_df shape/time line is logged at INFO).
logging.basicConfig(level=logging.INFO)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [21]:
from hotelbooking.preprocessing import get_df

In [22]:
# Load the bookings data through the project's get_df helper; the path is relative to the notebook's directory.
df = get_df('../data/hotel_bookings.csv')

INFO:hotelbooking.utils:[get_df            ] shape=(85977, 30),  time=0:00:00.584227


In [23]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,hotel,lead_time,arrival_date_year,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,meal,country,market_segment,distribution_channel,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,reserved_room_type,booking_changes,deposit_type,agent,days_in_waiting_list,customer_type,adr,total_of_special_requests,arrival_date_month_sin,arrival_date_month_cos,arrival_date_week_number_sin,arrival_date_week_number_cos,arrival_date_day_of_month_sin,arrival_date_day_of_month_cos,show_up
0,Resort Hotel,342,2015,0,0,2,0.0,0,BB,PRT,Direct,Direct,0,0,0,C,3,No Deposit,,0,Transient,0.0,0,-0.5,-0.866025,-0.120537,-0.992709,0.201299,0.97953,1
1,Resort Hotel,737,2015,0,0,2,0.0,0,BB,PRT,Direct,Direct,0,0,0,C,4,No Deposit,,0,Transient,0.0,0,-0.5,-0.866025,-0.120537,-0.992709,0.201299,0.97953,1
2,Resort Hotel,7,2015,0,1,1,0.0,0,BB,GBR,Direct,Direct,0,0,0,A,0,No Deposit,,0,Transient,75.0,0,-0.5,-0.866025,-0.120537,-0.992709,0.201299,0.97953,1
3,Resort Hotel,13,2015,0,1,1,0.0,0,BB,GBR,Corporate,Corporate,0,0,0,A,0,No Deposit,304.0,0,Transient,75.0,0,-0.5,-0.866025,-0.120537,-0.992709,0.201299,0.97953,1
4,Resort Hotel,14,2015,0,2,2,0.0,0,BB,GBR,Online TA,TA/TO,0,0,0,A,0,No Deposit,240.0,0,Transient,98.0,1,-0.5,-0.866025,-0.120537,-0.992709,0.201299,0.97953,1


In [24]:
# Per-column dtypes and non-null counts (shows which columns contain missing values).
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 85977 entries, 0 to 119389
Data columns (total 30 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   hotel                           85977 non-null  object 
 1   lead_time                       85977 non-null  int64  
 2   arrival_date_year               85977 non-null  int64  
 3   stays_in_weekend_nights         85977 non-null  int64  
 4   stays_in_week_nights            85977 non-null  int64  
 5   adults                          85977 non-null  int64  
 6   children                        85973 non-null  float64
 7   babies                          85977 non-null  int64  
 8   meal                            85977 non-null  object 
 9   country                         85530 non-null  object 
 10  market_segment                  85977 non-null  object 
 11  distribution_channel            85977 non-null  object 
 12  is_repeated_guest              

In [54]:
# Determine n_features to minimize collisions
# 1: Find the number of unique combinations of values, u, for the categorical variables

# Get all combinations of categorical features for categorical vector size
def get_combinations(df, *cols):
    """Return the product of the distinct-value counts of the given columns.

    This is the number of value combinations the selected columns could take
    if every distinct value of each column were paired with every distinct
    value of the others.

    Parameters
    ----------
    df : mapping of column name -> object exposing ``nunique()``
        Typically a pandas DataFrame.
    *cols : hashable
        Names of the columns to include.

    Returns
    -------
    int
        Product of ``df[col].nunique()`` over ``cols``; ``1`` when no column
        is given (empty product).

    Notes
    -----
    The product is accumulated with plain Python integers, which have
    arbitrary precision. Multiplying through ``np.prod`` instead would use a
    fixed-width integer dtype and wrap around silently once the product
    exceeds that width.
    """
    total = 1
    for col in cols:
        # int(...) keeps the accumulator a Python int even if nunique()
        # hands back a NumPy scalar.
        total *= int(df[col].nunique())

    return total
    
# 2: Choose the smallest n such that C(n, m) is greater than u, where C(n, m) is the number of combinations, defined as n! / (m! x (n-m)!)
import math

def nCr(n, r):
    """Return the binomial coefficient C(n, r) = n! / (r! * (n - r)!).

    Parameters
    ----------
    n : int
        Size of the set; must be non-negative.
    r : int
        Number of elements chosen; must satisfy ``0 <= r <= n``.

    Returns
    -------
    int
        The exact number of r-element combinations of an n-element set.

    Raises
    ------
    ValueError
        Raised by ``math.factorial`` when ``n``, ``r`` or ``n - r`` is negative.
    """
    f = math.factorial
    # Integer floor division is exact here because r! * (n - r)! always
    # divides n!. Float division would lose precision above 2**53 and raise
    # OverflowError once n! no longer fits in a float.
    return f(n) // (f(r) * f(n - r))

In [58]:
# Object-dtype columns only; these are the columns whose distinct-value counts are multiplied by get_combinations.
# NOTE(review): this displays the whole frame — consider appending .head() to keep the output short.
df.select_dtypes('object')

Unnamed: 0,hotel,meal,country,market_segment,distribution_channel,reserved_room_type,deposit_type,agent,customer_type
0,Resort Hotel,BB,PRT,Direct,Direct,C,No Deposit,,Transient
1,Resort Hotel,BB,PRT,Direct,Direct,C,No Deposit,,Transient
2,Resort Hotel,BB,GBR,Direct,Direct,A,No Deposit,,Transient
3,Resort Hotel,BB,GBR,Corporate,Corporate,A,No Deposit,304,Transient
4,Resort Hotel,BB,GBR,Online TA,TA/TO,A,No Deposit,240,Transient
...,...,...,...,...,...,...,...,...,...
119385,City Hotel,BB,BEL,Offline TA/TO,TA/TO,A,No Deposit,394,Transient
119386,City Hotel,BB,FRA,Online TA,TA/TO,E,No Deposit,9,Transient
119387,City Hotel,BB,DEU,Online TA,TA/TO,D,No Deposit,9,Transient
119388,City Hotel,BB,GBR,Online TA,TA/TO,A,No Deposit,89,Transient


In [55]:
# u: product of the distinct-value counts of the nine object-dtype columns.
get_combinations(df, 
                 'hotel', 
                 'meal', 
                 'country', 
                 'market_segment', 
                 'distribution_channel', 
                 'reserved_room_type', 
                 'deposit_type', 
                 'agent',
                 'customer_type')

2829168000

In [69]:
# Compare C(50, 9) with the combination count u = 2829168000 obtained from get_combinations.
# Presumably 9 is the number of categorical columns and 50 the candidate n — TODO confirm.
print(nCr(50,9))
nCr(50,9) > 2829168000

2505433700


False

# Train test split

In [43]:
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from hotelbooking.preprocessing import get_df
from hotelbooking.models import IsolationForest
import pickle
from sklearn.impute import SimpleImputer


def split_data(df, target='show_up', test_size=0.3, random_state=123):
    """Split a frame into stratified train/test features and target.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the feature columns and the ``target`` column.
    target : str, default 'show_up'
        Name of the label column; it is removed from the features.
    test_size : float or int, default 0.3
        Forwarded to ``train_test_split``.
    random_state : int, default 123
        Seed forwarded to ``train_test_split`` so the split is repeatable.

    Returns
    -------
    list
        ``[X_train, X_test, y_train, y_test]`` as returned by
        ``train_test_split``.
    """
    X = df.drop(columns=target)
    y = df[target]

    # Stratify on the label so both partitions keep its class proportions.
    return train_test_split(
        X, y, test_size=test_size, stratify=y, random_state=random_state
    )

In [44]:
# Stratified split of df into train/test features and the show_up target (see split_data).
X_train, X_test, y_train, y_test = split_data(df)

In [45]:
# Preprocessing chain for the numeric columns: dtype selection -> correlation filter -> KNN imputation (5 neighbours) -> robust scaling.
# NOTE(review): make_pipeline, DTypeSelector, CorrFilterHighTotalCorrelation, KNNImputer and RobustScaler
# are not imported in any visible cell — confirm they are imported elsewhere, otherwise this fails on a fresh kernel.
numerical_pipeline = make_pipeline(
        DTypeSelector('number'),
        CorrFilterHighTotalCorrelation(),
        KNNImputer(n_neighbors=5),
        RobustScaler()
    )

In [46]:
 # Fit the numeric pipeline on the training features only and display the transformed array.
 numerical_pipeline.fit_transform(X_train)

array([[ 1.24778761,  1.        ,  0.        , ..., -0.23687361,
        -0.20651635, -0.62407575],
       [-0.40707965,  1.        , -0.5       , ...,  0.23687361,
         0.68896692,  0.        ],
       [-0.33628319,  1.        , -0.5       , ...,  0.54974753,
         0.27204731,  0.66974526],
       ...,
       [-0.34513274,  1.        ,  0.5       , ...,  0.64858242,
         0.20651635, -0.62407575],
       [-0.14159292,  1.        ,  0.        , ...,  0.08051767,
        -0.33478669, -0.56895987],
       [ 0.14159292,  0.        , -0.5       , ..., -0.44296135,
         0.68896692,  0.        ]])

In [47]:
# Preprocessing chain for the object-dtype columns: dtype selection -> most-frequent imputation -> hashing encoder.
# NOTE(review): make_pipeline, DTypeSelector and HashingEncoder are not imported in any visible cell —
# confirm they are imported elsewhere, otherwise this fails on a fresh kernel.
object_pipeline = make_pipeline(
        DTypeSelector('object'),
        SimpleImputer(strategy='most_frequent'),
        HashingEncoder()
    )

In [48]:
 # Fit the categorical pipeline on the training features only and display the encoded frame.
 # NOTE(review): the recorded output has a column named 7 holding raw numeric values next to col_0..col_7 —
 # presumably one input column passed through the encoder un-hashed; confirm this is intended.
 object_pipeline.fit_transform(X_train)

Unnamed: 0,col_0,col_1,col_2,col_3,col_4,col_5,col_6,col_7,7
0,2,3,2,0,0,0,0,1,9.0
1,1,1,2,2,0,1,0,1,9.0
2,2,1,2,0,0,1,1,1,146.0
3,2,1,2,0,0,1,1,1,420.0
4,3,2,1,0,0,2,0,0,9.0
...,...,...,...,...,...,...,...,...,...
60178,2,2,1,2,0,0,0,1,9.0
60179,2,1,2,0,0,2,0,1,9.0
60180,1,0,2,2,0,3,0,0,14.0
60181,2,1,1,2,0,1,0,1,9.0
