# **Hotel Cancellation Prediction**



**1. Import**

In [1]:
import pandas as pd
import numpy as np

**2. Dataset**

In [2]:
data = pd.read_csv('booking.csv')

data.head()

Unnamed: 0,Booking_ID,number of adults,number of children,number of weekend nights,number of week nights,type of meal,car parking space,room type,lead time,market segment type,repeated,P-C,P-not-C,average price,special requests,date of reservation,booking status
0,INN00001,1,1,2,5,Meal Plan 1,0,Room_Type 1,224,Offline,0,0,0,88.0,0,10/2/2015,Not_Canceled
1,INN00002,1,0,1,3,Not Selected,0,Room_Type 1,5,Online,0,0,0,106.68,1,11/6/2018,Not_Canceled
2,INN00003,2,1,1,3,Meal Plan 1,0,Room_Type 1,1,Online,0,0,0,50.0,0,2/28/2018,Canceled
3,INN00004,1,0,0,2,Meal Plan 1,0,Room_Type 1,211,Online,0,0,0,100.0,1,5/20/2017,Canceled
4,INN00005,1,0,1,2,Not Selected,0,Room_Type 1,48,Online,0,0,0,77.0,0,4/11/2018,Canceled


**3. Drop Useless Data**

In [3]:
data.drop(['Booking_ID', 'P-not-C'], axis=1, inplace=True)


**4. Set X & Y**

In [4]:
# Convert 'date of reservation' to datetime
data['date of reservation'] = pd.to_datetime(data['date of reservation'],errors='coerce')

# Create new features based on 'date of reservation'
data['reservation_year'] = data['date of reservation'].dt.year
data['reservation_month'] = data['date of reservation'].dt.month
data['reservation_day'] = data['date of reservation'].dt.day
data['reservation_dayofweek'] = data['date of reservation'].dt.dayofweek

# Now drop the original 'date of reservation' column
data = data.drop(['date of reservation'], axis=1)

data.dropna(subset=['reservation_year', 'reservation_month', 'reservation_day', 'reservation_dayofweek'], inplace=True)

# Now you can create your dummy variables
X = data.drop(['booking status'], axis=1)
y = data['booking status']
X = pd.get_dummies(X)

**5. Train, Validation and Test Split**

In [5]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.1, random_state=42)

print(X_train.shape[0])
print(len(y_train))

29360
29360


**6. Standardize 'Average Price'**

In [6]:
from sklearn.preprocessing import StandardScaler

# Inicijalizacija StandardScaler-a
s = StandardScaler()

# Standardizacija 'average price' kolone na trening setu
s.fit(X_train[['average price']])

# Transformacija 'average price' kolone na trening i validacionom setu
x_train_std = s.transform(X_train[['average price']])
x_valid_std = s.transform(X_valid[['average price']])

# Kreiranje DataFrame-ova iz standardizovanih podataka
x_train_std = pd.DataFrame(x_train_std, columns=['average price'])
x_valid_std = pd.DataFrame(x_valid_std, columns=['average price'])

# Izbacivanje kolone 'average price' iz x_train_std i x_valid_std
X_train = X_train.drop(columns=['average price'])
X_valid = X_valid.drop(columns=['average price'])

# Resetovanje indeksa DataFrame-ova pre konkatenacije
X_train.reset_index(drop=True, inplace=True)
X_valid.reset_index(drop=True, inplace=True)
x_train_std.reset_index(drop=True, inplace=True)
x_valid_std.reset_index(drop=True, inplace=True)

# Konkatenacija standardizovanih podataka sa ostatkom podataka
X_train = pd.concat([x_train_std, X_train], axis=1)
X_valid = pd.concat([x_valid_std, X_valid], axis=1)

# Ispis prvih nekoliko redova konkateniranih DataFrame-ova
print(X_train.head())
print(X_valid.head())

   average price  number of adults  number of children  \
0      -0.805801                 2                   0   
1      -0.200849                 2                   0   
2      -1.033012                 1                   0   
3      -0.712076                 2                   0   
4      -0.386879                 2                   0   

   number of weekend nights  number of week nights  car parking space  \
0                         0                      3                  0   
1                         1                      1                  0   
2                         2                      0                  1   
3                         2                      0                  0   
4                         1                      2                  0   

   lead time  repeated  P-C  special requests  ...  room type_Room_Type 3  \
0        146         0    0                 0  ...                  False   
1         57         0    0                 0  ...        

**7. K-Nearest Neighbors**

In [31]:
# from sklearn.neighbors import KNeighborsClassifier
# from sklearn.model_selection import GridSearchCV

# # Definišemo listu parametara koje želimo da ispitamo
# parameters = {'n_neighbors':[3,5,7,9,11], 'metric':('hamming', 'euclidean', 'manhattan')}

# # Inicijalizujemo KNN klasifikator
# knn = KNeighborsClassifier()

# # Inicijalizujemo GridSearchCV sa KNN klasifikatorom i parametrima
# grid_search = GridSearchCV(knn, parameters, cv=5, scoring='accuracy', verbose=3)

# # Fitujemo GridSearchCV na podacima
# grid_search.fit(X_train, y_train)

# # Najbolji model i najbolji parametri
# best_knn_model = grid_search.best_estimator_
# best_params = grid_search.best_params_
# best_score = grid_search.best_score_

# print("Najbolji model:", best_knn_model)
# print("Najbolji parametri:", best_params)
# print("Najbolji rezultat:", best_score)


# # Najbolji model: KNeighborsClassifier(metric='hamming', n_neighbors=11)
# # Najbolji parametri: {'metric': 'hamming', 'n_neighbors': 11}
# # Najbolji rezultat: 0.8198569482288829

**8. Logistic Regression**

In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

# Definišemo listu parametara koje želimo da ispitamo
parameters = [{'penalty':['l1','l2']},
              {'C':[0.01, 0.1, 1, 10, 100, 1000]}]

# Inicijalizujemo logistički regresor
logistic_regression = LogisticRegression(max_iter=10000)

# Inicijalizujemo GridSearchCV sa logističkim regresorom i parametrima
grid_search = GridSearchCV(estimator = logistic_regression, param_grid = parameters, scoring = 'accuracy', cv = 5, verbose=0)

# Fitujemo GridSearchCV na podacima
grid_search.fit(X_train, y_train)

# Najbolji model i najbolji parametri
best_logistic_model = grid_search.best_estimator_
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print("Najbolji model:", best_logistic_model)
print("Najbolji parametri:", best_params)
print("Najbolji rezultat:", best_score)

# Najbolji model: LogisticRegression(C=10, max_iter=10000)
# Najbolji parametri: {'C': 10}
# Najbolji rezultat: 0.8013623978201634

# from sklearn.svm import SVC
# from sklearn.model_selection import GridSearchCV

# # Definisanje parametara za grid search sa linearnim kernelom
# parameters_linear = {'C': [0.001, 0.01, 0.1, 1, 10, 100],
#                      'kernel': ['linear'],
#                      'penalty': ['l1', 'l2']}

# # Definisanje parametara za grid search sa polinomijalnim kernelom
# parameters_poly = {'C': [0.001, 0.01, 0.1, 1, 10, 100],
#                    'kernel': ['poly'],
#                    'degree': [2, 3, 4],
#                    'gamma': ['scale', 'auto'],
#                    'coef0': [0.0, 1.0],
#                    'penalty': ['l1', 'l2']}

# # Definisanje parametara za grid search sa RBF kernelom
# parameters_rbf = {'C': [0.001, 0.01, 0.1, 1, 10, 100],
#                   'kernel': ['rbf'],
#                   'gamma': ['scale', 'auto'],
#                   'penalty': ['l1', 'l2']}

# # Inicijalizacija SVM klasifikatora sa linearnim kernelom
# svm_linear = SVC()

# # Inicijalizacija GridSearchCV sa SVM i parametrima za linearni kernel
# grid_search_linear = GridSearchCV(svm_linear, parameters_linear, cv=5, scoring='accuracy', verbose=3)

# # Fitovanje GridSearchCV na podacima za linearni kernel
# grid_search_linear.fit(X_train, y_train)

# # Najbolji model i najbolji parametri za linearni kernel
# best_svm_linear_model = grid_search_linear.best_estimator_
# best_params_linear = grid_search_linear.best_params_
# best_score_linear = grid_search_linear.best_score_

# print("Najbolji model za linearni kernel:", best_svm_linear_model)
# print("Najbolji parametri za linearni kernel:", best_params_linear)
# print("Najbolji rezultat za linearni kernel:", best_score_linear)

# # Inicijalizacija SVM klasifikatora sa polinomijalnim kernelom
# svm_poly = SVC()

# # Inicijalizacija GridSearchCV sa SVM i parametrima za polinomijalni kernel
# grid_search_poly = GridSearchCV(svm_poly, parameters_poly, cv=5, scoring='accuracy', verbose=3)

# # Fitovanje GridSearchCV na podacima za polinomijalni kernel
# grid_search_poly.fit(X_train, y_train)

# # Najbolji model i najbolji parametri za polinomijalni kernel
# best_svm_poly_model = grid_search_poly.best_estimator_
# best_params_poly = grid_search_poly.best_params_
# best_score_poly = grid_search_poly.best_score_

# print("Najbolji model za polinomijalni kernel:", best_svm_poly_model)
# print("Najbolji parametri za polinomijalni kernel:", best_params_poly)
# print("Najbolji rezultat za polinomijalni kernel:", best_score_poly)

# # Inicijalizacija SVM klasifikatora sa RBF kernelom
# svm_rbf = SVC()

# # Inicijalizacija GridSearchCV sa SVM i parametrima za RBF kernel
# grid_search_rbf = GridSearchCV(svm_rbf, parameters_rbf, cv=5, scoring='accuracy', verbose=3)

# # Fitovanje GridSearchCV na podacima za RBF kernel
# grid_search_rbf.fit(X_train, y_train)

# # Najbolji model i najbolji parametri za RBF kernel
# best_svm_rbf_model = grid_search_rbf.best_estimator_
# best_params_rbf = grid_search_rbf.best_params_
# best_score_rbf = grid_search_rbf.best_score_

# print("Najbolji model za RBF kernel:", best_svm_rbf_model)
# print("Najbolji parametri za RBF kernel:", best_params_rbf)
# print("Najbolji rezultat za RBF kernel:", best_score_rbf)


**9. Support Vector Machines**

In [None]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

# Definišemo listu parametara koje želimo da ispitamo
parameters = {'C': [0.1, 1, 10],
              'kernel': ['linear', 'rbf', 'poly'],
              'gamma': ['scale', 'auto']}

# Inicijalizujemo SVM klasifikator
svm_classifier = SVC()

# Inicijalizujemo GridSearchCV sa SVM klasifikatorom i parametrima
grid_search_svm = GridSearchCV(svm_classifier, parameters, cv=5, scoring='accuracy', verbose=3)

# Fitujemo GridSearchCV na podacima
grid_search_svm.fit(X_train, y_train)

# Najbolji model i najbolji parametri
best_svm_model = grid_search_svm.best_estimator_
best_params_svm = grid_search_svm.best_params_
best_score_svm = grid_search_svm.best_score_

print("Najbolji model SVM:", best_svm_model)
print("Najbolji parametri SVM:", best_params_svm)
print("Najbolji rezultat SVM:", best_score_svm)


Fitting 5 folds for each of 18 candidates, totalling 90 fits
[CV 1/5] END .C=0.1, gamma=scale, kernel=linear;, score=0.798 total time=  48.6s
[CV 2/5] END .C=0.1, gamma=scale, kernel=linear;, score=0.804 total time=  36.6s


**10. Test Best Solutions**

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Inicijalizacija KNN klasifikatora sa najboljim parametrima
best_knn_model = KNeighborsClassifier(n_neighbors=..., metric=...)

# Treniranje finalnog KNN modela na trening podacima
best_knn_model.fit(X_train, y_train)

# Evaluacija performansi finalnog KNN modela na test podacima
knn_accuracy = best_knn_model.score(X_test, y_test)
print("Tačnost finalnog KNN modela:", knn_accuracy)


In [None]:
from sklearn.linear_model import LogisticRegression

# Inicijalizacija logističkog regresora sa najboljim parametrima
best_logistic_model = LogisticRegression(penalty='...', C=..., solver='...', multi_class='...', max_iter=1000)

# Treniranje finalnog logističkog modela na trening podacima
best_logistic_model.fit(X_train, y_train)

# Evaluacija performansi finalnog logističkog modela na test podacima
logistic_accuracy = best_logistic_model.score(X_test, y_test)
print("Tačnost finalnog logističkog modela:", logistic_accuracy)


In [None]:
from sklearn.svm import SVC

# Inicijalizacija SVM klasifikatora sa najboljim parametrima
best_svm_model = SVC(C=..., kernel='...', gamma=..., decision_function_shape='...')

# Treniranje finalnog SVM modela na trening podacima
best_svm_model.fit(X_train, y_train)

# Evaluacija performansi finalnog SVM modela na test podacima
svm_accuracy = best_svm_model.score(X_test, y_test)
print("Tačnost finalnog SVM modela:", svm_accuracy)
