In [2]:
import pandas as pd
import numpy as np
import sklearn as skl
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
import matplotlib.pyplot as plt



# Data Inspection
df = pd.read_csv("dataset.csv", sep=";")  
    
df.head(4)

df.columns

df.isnull().values.any() # No NaN Values in df

# sns.pairplot(df, hue='cancelada')
# plt.show()

# Use replace for dependent variable to get numerical values

df['cancelada'].replace(['yes','no'], [1,0], inplace=True)

print(df['cancelada'])


# Transfer from categorical data to numerical for independent variables

df_dummy = pd.get_dummies(df, columns=['mes_chegada','hotel_de_praia', 'regime', 'ja_cliente', 'tipo_deposito'])

df_dummy.columns





0        0
1        1
2        1
3        0
4        1
        ..
19995    0
19996    0
19997    0
19998    0
19999    1
Name: cancelada, Length: 20000, dtype: int64


Index(['id', 'antecedencia_da_reserva', 'dia_chegada', 'noites_da_estadia',
       'noites_fim_de_semana', 'adultos', 'criancas', 'bebes',
       'cancelamentos_anteriores', 'reservas_anteriores_nao_canceladas',
       'alteracoes_da_reserva', 'total_de_pedidos', 'cancelada',
       'mes_chegada_April', 'mes_chegada_August', 'mes_chegada_December',
       'mes_chegada_February', 'mes_chegada_January', 'mes_chegada_July',
       'mes_chegada_June', 'mes_chegada_March', 'mes_chegada_May',
       'mes_chegada_November', 'mes_chegada_October', 'mes_chegada_September',
       'hotel_de_praia_no', 'hotel_de_praia_yes',
       'regime_BB - Bed & Breakfast', 'regime_FB - Full Board',
       'regime_HB - Half Board', 'regime_SC - Self Catering', 'ja_cliente_no',
       'ja_cliente_yes', 'tipo_deposito_No Deposit',
       'tipo_deposito_Non Refund', 'tipo_deposito_Refundable'],
      dtype='object')

In [3]:

# Creating of subsets for model

# dependent variable is "cancelada_no" and "cancelada_yes" (Y)
# independent variables is every other line except for id (X)

#slicing data 
indexes1 = list(range(1,11))
indexes2 = list(range(13,36))
indexes3 = indexes1 + indexes2

# creating X and Y with iloc
Y = df_dummy.loc[:,'cancelada']
X = df_dummy.iloc[:,indexes3]

print(X.columns)



Xtrain, Xtest, Ytrain, Ytest = train_test_split(X,Y,test_size=0.25, random_state=1)


print(Xtrain.shape, Xtest.shape, Ytrain.shape, Ytest.shape)


Index(['antecedencia_da_reserva', 'dia_chegada', 'noites_da_estadia',
       'noites_fim_de_semana', 'adultos', 'criancas', 'bebes',
       'cancelamentos_anteriores', 'reservas_anteriores_nao_canceladas',
       'alteracoes_da_reserva', 'mes_chegada_April', 'mes_chegada_August',
       'mes_chegada_December', 'mes_chegada_February', 'mes_chegada_January',
       'mes_chegada_July', 'mes_chegada_June', 'mes_chegada_March',
       'mes_chegada_May', 'mes_chegada_November', 'mes_chegada_October',
       'mes_chegada_September', 'hotel_de_praia_no', 'hotel_de_praia_yes',
       'regime_BB - Bed & Breakfast', 'regime_FB - Full Board',
       'regime_HB - Half Board', 'regime_SC - Self Catering', 'ja_cliente_no',
       'ja_cliente_yes', 'tipo_deposito_No Deposit',
       'tipo_deposito_Non Refund', 'tipo_deposito_Refundable'],
      dtype='object')
(15000, 33) (5000, 33) (15000,) (5000,)


In [4]:
# Use SVC model fit and train the model
Model = SVC()
Model.fit(Xtrain, Ytrain) 
Model.score(Xtest, Ytest) 

# result: 66.52 accuracy for first time

KeyboardInterrupt: 

In [5]:
# Usage of Scaler to improve accuracy
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(Xtrain)
Xtrain_scaled = scaler.transform(Xtrain)

scaler = StandardScaler()
scaler.fit(Xtest)
Xtest_scaled = scaler.transform(Xtest)


# Test again with scaled data

# Model = SVC(gamma='scale', C=10, kernel='poly', degree=1,shrinking=True, cache_size= 2000)
# Model.fit(Xtrain_scaled, Ytrain) 
# Model.score(Xtest_scaled, Ytest) 

# Result: Improvement of 10 % to 76.52

# poly degree 1 --> 0.7606
# poly degree 2 --> 0.7634
# poly degree 3  --> 0.7658
# poly degree 4 --> 0.7604
# poly degree 5 --> 0.7628
# poly degree 6 --> 0.7608
# poly degree 7 --> 0.759



In [6]:

Model = SVC(kernel='poly', coef0=0.5, C=10)
Model.fit(Xtrain_scaled, Ytrain) 
Model.score(Xtest_scaled, Ytest) 

0.7658

In [8]:
# Further Improvement throught GridSearch

from sklearn.model_selection import GridSearchCV

#optimizied_values = {'C':[100,10,1,0.1], 'kernel':['linear', 'poly', 'rbf', 'sigmoid', 'precomputed'], 'gamma':['scale', 'auto',0.1], 'degree' : [3,8],'coef0' : [0.01,10,0.5],'shrinking':[True], 'cache_size': [2000] }
#optimizied_values = {'C': [0.1, 1, 10, 100, 1000], 'gamma': [1, 0.1, 0.01, 0.001, 0.0001], 'kernel': ['rbf'], 'shrinking':[True], 'cache_size': [2000]}

#optimizied_values = {'C':[100,10,1,0.1], 'kernel':['poly', 'sigmoid'], 'gamma':['scale', 'auto',0.1], 'degree' : [3,8],'coef0' : [0.01,10,0.5],'shrinking':[True], 'cache_size': [2000] }
optimizied_values = {'kernel':['linear', 'rbf'], 'cache_size': [50] }


improvedModel = GridSearchCV(Model,param_grid=optimizied_values,cv=2)
improvedModel.fit(X=Xtrain,y=Ytrain)
improvedModel.best_params_
# Model_best= improvedModel.best_estimator_
# Model_best.score(X=Xteste, Y=yteste)


use PCA or heat map 

{'kernel':['poly', 'sigmoid'], 'cache_size': [2000] }
        --->{'cache_size': 2000, 'kernel': 'poly'}
{'kernel':['poly', 'linear'], 'cache_size': [2000] }
        --->{'cache_size': 2000, 'kernel': 'linear'}    46 minutes


