In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from genpipes import declare, compose

from sklearn.preprocessing import MinMaxScaler, Normalizer, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

In [2]:
index_column = 'Booking_ID'

int_columns = [
    'no_of_adults', 'no_of_previous_bookings_not_canceled','no_of_children','no_of_weekend_nights', 
    'no_of_week_nights', 'no_of_special_requests','no_of_previous_cancellations','lead_time',
]

category_columns = [
    'type_of_meal_plan','room_type_reserved', 'market_segment_type'
]

float_columns = [
    'avg_price_per_room'
]

boolean_columns = [
    'required_car_parking_space','repeated_guest','booking_status'
]

target_column = 'booking_status'

In [3]:
columns_dtype = {}

def set_dtype(_type,columns):
    for i in columns:
        columns_dtype[i] = _type
    
set_dtype(int,int_columns+['arrival_year', 'arrival_month', 'arrival_date'])
set_dtype(str,category_columns)
set_dtype(float,float_columns)
set_dtype(str,boolean_columns)

In [4]:
df = pd.read_csv('./Dados/Hotel Reservations.csv',dtype=columns_dtype)
df.head()

Unnamed: 0,Booking_ID,no_of_adults,no_of_children,no_of_weekend_nights,no_of_week_nights,type_of_meal_plan,required_car_parking_space,room_type_reserved,lead_time,arrival_year,arrival_month,arrival_date,market_segment_type,repeated_guest,no_of_previous_cancellations,no_of_previous_bookings_not_canceled,avg_price_per_room,no_of_special_requests,booking_status
0,INN00001,2,0,1,2,Meal Plan 1,0,Room_Type 1,224,2017,10,2,Offline,0,0,0,65.0,0,Not_Canceled
1,INN00002,2,0,2,3,Not Selected,0,Room_Type 1,5,2018,11,6,Online,0,0,0,106.68,1,Not_Canceled
2,INN00003,1,0,2,1,Meal Plan 1,0,Room_Type 1,1,2018,2,28,Online,0,0,0,60.0,0,Canceled
3,INN00004,2,0,0,2,Meal Plan 1,0,Room_Type 1,211,2018,5,20,Online,0,0,0,100.0,0,Canceled
4,INN00005,2,0,1,1,Not Selected,0,Room_Type 1,48,2018,4,11,Online,0,0,0,94.5,0,Canceled


# Data processing

In [5]:
@declare.generator(inputs=['./Dados/Hotel Reservations.csv'])
def get_dataset(path:str) -> pd.DataFrame:
    df = pd.read_csv(path, dtype=columns_dtype)
    return df

In [6]:
normalizer_scaler = Normalizer()
@declare.processor()
def data_normalization(stream, columns,fit=False):
    for df in stream:
        if fit:
            df[columns] = normalizer_scaler.fit_transform(df[columns])
        else:
            df[columns] = normalizer_scaler.transform(df[columns])
        yield df

In [7]:
standard_scaler = StandardScaler()
@declare.processor()
def data_standard(stream, columns, fit=False):
    for df in stream:
        if fit:
            df[columns] = standard_scaler.fit_transform(df[columns])
        else:
            df[columns] = standard_scaler.transform(df[columns])
        yield df


In [8]:
minmax_scaler = MinMaxScaler()
@declare.processor()
def data_minmax(stream, columns, fit=False):
    for df in stream:
        if fit:
            df[columns] = minmax_scaler.fit_transform(df[columns])
        else:
            df[columns] = minmax_scaler.transform(df[columns])
        yield df

In [9]:
@declare.processor()
def data_dummies(stream, columns):
    for df in stream:
        df = pd.get_dummies(df,columns=category_columns)
        df.columns = [i.replace(' ','_') for i in df.columns]
        yield df

# Models

In [10]:
y = df[target_column]
df = df.drop([index_column,target_column],axis=1)
X = df

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.33, random_state=42)

In [11]:
tree = DecisionTreeClassifier(random_state=42)
cost = tree.cost_complexity_pruning_path(X_train,y_train)

ccp_alphas = cost['ccp_alphas']

ValueError: could not convert string to float: 'Not Selected'

In [None]:
trees = []
for ccp_alpha in ccp_alphas:
    tree = DecisionTreeClassifier(random_state=42,ccp_alpha=ccp_alpha)
    tree.fit(X_train,y_train)
    trees.append(tree)

In [None]:
train_size = len(X_train.index)

knncs = []
for i in range(1,train+15):
    knnc = KNeighborsClassifier(n_neighbors=i)
    knnc.fit(X_train,y_train)
    knncs.append(knnc)

In [None]:
lr = LogisticRegression(random_state=42)

grid_param = {'penalty': ['l1','l2'], 'C': [0.001,0.01,0.1,1,10,100,1000]}
lr = GridSearchCV(lr,param_grid=grid_param)