In [1]:
import pandas as pd
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
import numpy as np
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
from imblearn.over_sampling import RandomOverSampler
from sklearn.preprocessing import MinMaxScaler,StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import cross_val_score, GridSearchCV
from itertools import product
from queue import PriorityQueue
import xgboost as xgb
from sklearn.metrics import make_scorer, accuracy_score
from sklearn.model_selection import RandomizedSearchCV

In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [3]:
train

Unnamed: 0,id,hotel,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,...,assigned_room_type,booking_changes,deposit_type,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,reservation_status
0,0,Resort Hotel,312,2017,March,10,5,2,5,2,...,A,0,No Deposit,298.0,,0,Transient-Party,56.0,0,0
1,1,City Hotel,2,2015,December,51,18,0,2,2,...,D,1,No Deposit,9.0,,0,Transient,97.0,0,1
2,2,City Hotel,41,2016,March,14,31,0,3,2,...,A,0,No Deposit,9.0,,0,Transient,117.9,0,1
3,3,Resort Hotel,228,2016,August,36,29,2,5,2,...,D,0,No Deposit,175.0,,0,Transient,86.4,0,1
4,4,City Hotel,128,2017,May,19,13,0,1,3,...,A,0,No Deposit,9.0,,0,Transient,144.0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
94541,94541,City Hotel,26,2016,October,40,1,2,2,2,...,F,0,No Deposit,9.0,,0,Transient,294.0,0,0
94542,94542,City Hotel,269,2016,November,48,24,0,2,1,...,A,1,No Deposit,14.0,,0,Transient,93.0,0,0
94543,94543,City Hotel,302,2015,August,33,15,2,2,2,...,A,0,No Deposit,1.0,,0,Transient-Party,62.0,0,0
94544,94544,City Hotel,53,2017,June,25,19,1,3,2,...,D,0,No Deposit,42.0,,0,Transient,153.0,0,1


In [4]:
print("Columns in the dataset:")
print(train.dtypes)

Columns in the dataset:
id                                  int64
hotel                              object
lead_time                           int64
arrival_date_year                   int64
arrival_date_month                 object
arrival_date_week_number            int64
arrival_date_day_of_month           int64
stays_in_weekend_nights             int64
stays_in_week_nights                int64
adults                              int64
children                          float64
babies                              int64
meal                               object
country                            object
market_segment                     object
distribution_channel               object
is_repeated_guest                   int64
previous_cancellations              int64
previous_bookings_not_canceled      int64
reserved_room_type                 object
assigned_room_type                 object
booking_changes                     int64
deposit_type                       object
agent     

In [5]:
split = StratifiedShuffleSplit(n_splits=1,test_size = 0.6)
for train_indices,val_indices in split.split(train,train[["reservation_status"]]):
    trainset = train.loc[train_indices]
    valset = train.loc[val_indices]

In [6]:
numeric_columns = [col for col in train.select_dtypes(include=['int64', 'float64']).columns if train[col].nunique() > 2]

In [7]:
def clean_dataset(df):
    cleaned_df = df.copy()
    cleaned_df = cleaned_df.replace("", pd.NA)
    cols_to_check = [col for col in cleaned_df.columns if col != 'company' and col != 'agent']
    cleaned_df = cleaned_df.dropna(subset=cols_to_check)
    return cleaned_df
trainset = clean_dataset(trainset)

In [8]:
# def removeStrangeRow(df):
#     if 'company' in df.columns:
#         df = df.drop(columns=['company'])

In [9]:
class FeatureImputer(BaseEstimator, TransformerMixin):
    def __init__(self, strategy='most_frequent'):
        self.strategy = strategy

    def fit(self, X, y=None):
        self.imputers = {}
        numerical_columns = X.select_dtypes(include=['float64', 'int64']).columns
        categorical_columns = X.select_dtypes(exclude=['float64', 'int64']).columns
        for col in numerical_columns:
            imputer = SimpleImputer(missing_values=np.nan, strategy=self.strategy)
            imputer.fit(X[col].values.reshape(-1, 1))
            self.imputers[col] = imputer
        for col in categorical_columns:
            imputer = SimpleImputer(missing_values=np.nan, strategy=self.strategy)
            imputer.fit(X[col].values.reshape(-1, 1))
            self.imputers[col] = imputer
        return self
    def transform(self, X):
        X_copy = X.copy()
        for col, imputer in self.imputers.items():
            if col in X_copy.columns: 
                X_copy[col] = imputer.transform(X_copy[col].values.reshape(-1, 1)).ravel()
            else:
                pass
        return X_copy

In [10]:
class FeatureScaler(BaseEstimator, TransformerMixin):    
    def fit(self, X, y=None):
        self.numerical_cols = [col for col in X.columns if X[col].dtype != 'object' and X[col].nunique() > 2]
        self.sc = StandardScaler().fit(X[self.numerical_cols])
        return self
    
    def transform(self, X):
        X[self.numerical_cols] = self.sc.transform(X[self.numerical_cols])
        return X

In [11]:
class FeatureDropper(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        X = X.drop(["reserved_room_type","assigned_room_type","id","country"], axis=1, errors="ignore")
        return X


In [12]:
class FeatureEncoder(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        self.encoders = {}
        self.categories = {
            'arrival_date_month': [
                'January', 'February', 'March', 'April', 'May', 'June',
                'July', 'August', 'September', 'October', 'November', 'December'
            ],
            'hotel': ['Resort Hotel', 'City Hotel'],
            'meal': ["HB", "BB", "FB", "SC", "Undefined"],
            'market_segment': ["Groups", "Online TA", "Offline TA/TO", "Direct", "Aviation", "Corporate", "Complementary", "Undefined"],
            'distribution_channel': ['TA/TO', 'Direct', 'Corporate', 'GDS', 'Undefined'],
            'deposit_type': ["No Deposit", "Non Refund", "Refundable"],
            'customer_type': ['Transient-Party', 'Transient', 'Contract', 'Group']
        }
        for column, categories in self.categories.items():
            encoder = OneHotEncoder(categories=[categories], drop=None, sparse=False, handle_unknown='ignore')
            encoder.fit(X[[column]])
            self.encoders[column] = encoder
        return self
    
    def transform(self, X):
        X_encoded = X.copy()
        for column, encoder in self.encoders.items():
            matrix = encoder.transform(X[[column]])
            column_names = encoder.get_feature_names_out([column])
            for i in range(len(matrix.T)):
                X_encoded[column_names[i]] = matrix.T[i]
            X_encoded.drop(column, axis=1, inplace=True)
        return X_encoded

In [13]:
pipeline = Pipeline([("featureimputer",FeatureImputer()),("featurescaler",FeatureScaler()),("featuredropper",FeatureDropper()),("featureencoder",FeatureEncoder())])

In [14]:
trainset = pipeline.fit_transform(trainset)
valset = pipeline.transform(valset)
testset = pipeline.transform(test)



In [15]:
X_trainset = trainset.drop(columns=['reservation_status'])  
y_trainset = trainset['reservation_status'] 
X_valset = valset.drop(columns=['reservation_status'])  
y_valset = valset['reservation_status'] 

In [16]:
param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [3, 6, 9],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.7, 0.8, 1.0],
    'colsample_bytree': [0.7, 0.8, 0.9, 1.0]
}

In [17]:
# def evaluate_params(params, X, y):
#     rf = RandomForestClassifier(**params, random_state=42)
#     scores = cross_val_score(rf, X, y, cv=3, scoring=make_scorer(accuracy_score))
#     return np.mean(scores)

In [18]:
# def branch_and_bound(param_grid, X, y):
#     best_params = None
#     best_score = 0
#     queue = []
#     def branch(params, remaining_params):
#         if not remaining_params:
#             score = evaluate_params(params, X, y)
#             nonlocal best_score, best_params
#             print(f"Evaluating params: {params}, Score: {score}")
#             if score > best_score:
#                 best_score = score
#                 best_params = params
#             return
#         param, values = remaining_params[0]
#         for value in values:
#             new_params = params.copy()
#             new_params[param] = value
#             print(f"Branching with params: {new_params}, Remaining: {remaining_params[1:]}")
#             heapq.heappush(queue, (-score_estimate(new_params), new_params, remaining_params[1:]))
#     def score_estimate(params):
#         # Use a heuristic to estimate the best possible score for the current parameter set
#         return evaluate_params(params, X, y)
#     initial_params = {}
#     remaining_params = list(param_grid.items())
#     branch(initial_params, remaining_params)
#     while queue:
#         _, current_params, remaining_params = heapq.heappop(queue)
#         branch(current_params, remaining_params)
#     return best_params, best_score


In [19]:
# classifier = RandomForestClassifier(random_state=42) 
# classifier.fit(X_trainset, y_trainset)
# # best_params, best_score = branch_and_bound(param_grid, X_trainset, y_trainset)
# # best_rf = RandomForestClassifier(**best_params, random_state=42)
# # best_rf.fit(X_trainset, y_trainset)
# y_pred = classifier.predict(X_valset)
# # y_predParam = best_rf.predict(X_valset)
# accuracy = accuracy_score(y_valset, y_pred)
# print(f'Validation Accuracy: {accuracy:.4f}')
# # accuracyParam = accuracy_score(y_valset, y_predParam)
# # print(f'Validation Accuracy with Tuned Parameters: {accuracyParam:.4f}')
# # print(f'Best Parameters: {best_params}')


In [20]:
classifier = xgb.XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss')
classifier.fit(X_trainset, y_trainset)

y_pred = classifier.predict(X_valset)
accuracy = accuracy_score(y_valset, y_pred)
print(f'Validation Accuracy without Tuning: {accuracy:.4f}')

Validation Accuracy without Tuning: 0.8229


In [21]:
def evaluate_params(params, X, y):
    model = xgb.XGBClassifier(**params, random_state=42, use_label_encoder=False, eval_metric='logloss')
    scores = cross_val_score(model, X, y, cv=3, scoring='accuracy')
    return np.mean(scores)
def ucs(param_grid, X, y):
    best_params = {}
    best_score = 0
    queue = PriorityQueue()
    id_counter = 0  
    queue.put((0, id_counter, {}, list(param_grid.keys()))) 

    while not queue.empty():
        cost, _, current_params, remaining_keys = queue.get()
        if not remaining_keys:
            current_score = -cost  
            if current_score > best_score:
                best_score = current_score
                best_params = current_params
                print(f"New Best Params: {best_params} with Score: {best_score}")
        else:
            next_key = remaining_keys[0]
            for value in param_grid[next_key]:
                new_params = current_params.copy()
                new_params[next_key] = value
                new_cost = -evaluate_params(new_params, X, y)  
                id_counter += 1  
                queue.put((new_cost, id_counter, new_params, remaining_keys[1:]))
    return best_params
best_params_ucs = ucs(param_grid, X_trainset, y_trainset)
best_xgb_ucs = xgb.XGBClassifier(**best_params_ucs, random_state=42, use_label_encoder=False, eval_metric='logloss')
best_xgb_ucs.fit(X_trainset, y_trainset)
y_pred_ucs = best_xgb_ucs.predict(X_valset) 
accuracy_ucs = accuracy_score(y_valset, y_pred_ucs)
print(f'Final Validation Accuracy with Tuned Parameters (UCS): {accuracy_ucs:.4f}')
print(f'Best Parameters from UCS: {best_params_ucs}')

New Best Params: {'n_estimators': 100, 'max_depth': 9, 'learning_rate': 0.2, 'subsample': 1.0, 'colsample_bytree': 1.0} with Score: 0.8225391008789399
New Best Params: {'n_estimators': 150, 'max_depth': 9, 'learning_rate': 0.2, 'subsample': 1.0, 'colsample_bytree': 0.8} with Score: 0.8237605884383546
Final Validation Accuracy with Tuned Parameters (UCS): 0.8282
Best Parameters from UCS: {'n_estimators': 150, 'max_depth': 9, 'learning_rate': 0.2, 'subsample': 1.0, 'colsample_bytree': 0.8}


In [22]:
xgb_classifier = xgb.XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss')
random_search = RandomizedSearchCV(estimator=xgb_classifier, param_distributions=param_grid,
                                   n_iter=10,
                                   scoring='accuracy', n_jobs=-1, cv=3, verbose=2, random_state=42)
random_search.fit(X_trainset, y_trainset)
best_params_random = random_search.best_params_
best_xgb_random = random_search.best_estimator_
y_pred_random = best_xgb_random.predict(X_valset)
accuracy_random = accuracy_score(y_valset, y_pred_random)
print(f'Validation Accuracy with Tuned Parameters (RandomizedSearchCV): {accuracy_random:.4f}')
print(f'Best Parameters from RandomizedSearchCV: {best_params_random}')

Fitting 3 folds for each of 10 candidates, totalling 30 fits
Validation Accuracy with Tuned Parameters (RandomizedSearchCV): 0.8259
Best Parameters from RandomizedSearchCV: {'subsample': 0.7, 'n_estimators': 150, 'max_depth': 9, 'learning_rate': 0.1, 'colsample_bytree': 0.8}


In [23]:
xgb_classifier = xgb.XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss')
grid_search = GridSearchCV(estimator=xgb_classifier, param_grid=param_grid,
                           scoring='accuracy', n_jobs=-1, cv=3, verbose=2)
grid_search.fit(X_trainset, y_trainset)
best_params_grid = grid_search.best_params_
best_xgb_grid = grid_search.best_estimator_
y_pred_grid = best_xgb_grid.predict(X_valset)
accuracy_grid = accuracy_score(y_valset, y_pred_grid)
print(f'Validation Accuracy with Tuned Parameters (GridSearchCV): {accuracy_grid:.4f}')
print(f'Best Parameters from GridSearchCV: {best_params_grid}')

Fitting 3 folds for each of 324 candidates, totalling 972 fits
Validation Accuracy with Tuned Parameters (GridSearchCV): 0.8282
Best Parameters from GridSearchCV: {'colsample_bytree': 0.8, 'learning_rate': 0.2, 'max_depth': 9, 'n_estimators': 150, 'subsample': 1.0}


In [24]:
y_pred_ucs_test = best_xgb_ucs.predict(testset)

In [25]:
print(y_pred_ucs_test)

[1 1 0 ... 0 1 1]
