# Design

XGBoost

- Start from the previous model
- Tweak its hyperparameters

# Import Dependencies

In [416]:
#data processing
import pandas as pd
import numpy as np

#viz
import matplotlib.pyplot as plt

#model
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import minmax_scale, StandardScaler
import xgboost as xgb
from xgboost import XGBRegressor

#metrics
from sklearn.metrics import accuracy_score

#pipeline
from sklearn.pipeline import Pipeline

#directories
import os
from pathlib import Path

In [417]:
# Project root: defaults to the original author's folder; set the HACKATHON_DIR
# environment variable to run the notebook from another machine or checkout.
PROJECT_DIR = Path(os.environ.get('HACKATHON_DIR', r'C:\Users\chewr\Documents\ML Labs\Hackathon'))

# Labelled training data.
# NOTE(review): the file carries an 'Unnamed: 0' column (visible in the preview) —
# presumably an index saved by an earlier to_csv; confirm before treating it as a feature.
train = pd.read_csv(PROJECT_DIR / 'new data' / 'train.csv')
train.head()

Unnamed: 0,ID,Overall_Experience,Ease_of_Online_Booking,Online_Support,Online_Boarding,Onboard_Entertainment,Onboard_Service,Cleanliness,Onboard_Wifi_Service,Baggage_Handling,...,CheckIn_Service,Catering,Travel_Class,Customer_Type,Arrival_Time_Convenient,Gender,Age,Platform_Location,Type_Travel,Seat_Class
0,98800001,0,3,3.0,2,3,3,3,5,2,...,4.0,6,2,2,6,1,52.0,6,2,2
1,98800002,0,5,4.0,5,2,6,5,5,1,...,2.0,2,1,2,6,0,48.0,3,1,1
2,98800003,1,6,5.0,6,5,6,6,3,5,...,4.0,3,2,2,3,1,43.0,3,2,2
3,98800004,0,4,3.0,4,3,4,4,4,3,...,4.0,4,2,2,3,1,44.0,3,2,1
4,98800005,1,5,5.0,5,5,5,5,3,4,...,4.0,4,2,2,4,1,50.0,4,2,1


In [418]:
# (rows, columns) of the training frame as loaded, label column included.
train.shape

(94378, 22)

In [419]:
# Project root: defaults to the original author's folder; set the HACKATHON_DIR
# environment variable to run the notebook from another machine or checkout.
PROJECT_DIR = Path(os.environ.get('HACKATHON_DIR', r'C:\Users\chewr\Documents\ML Labs\Hackathon'))

# Unlabelled competition data: the preview shows no 'Overall_Experience' column here.
test = pd.read_csv(PROJECT_DIR / 'new data' / 'test.csv')
test.head()

Unnamed: 0,ID,Ease_of_Online_Booking,Online_Support,Online_Boarding,Onboard_Entertainment,Onboard_Service,Cleanliness,Onboard_Wifi_Service,Baggage_Handling,Legroom,...,CheckIn_Service,Catering,Travel_Class,Customer_Type,Arrival_Time_Convenient,Gender,Age,Platform_Location,Type_Travel,Seat_Class
0,99900001,6,4,2,6,6,6,3,5,6,...,4,4,2,2,4,1,36.0,4,2,2
1,99900002,4,3,4,2,6,6,4,4,4,...,3,2,2,1,5,1,21.0,4,2,1
2,99900003,3,5,6,6,3,3,6,2,3,...,4,6,2,2,6,0,60.0,6,2,1
3,99900004,2,5,2,4,4,6,2,5,3,...,5,4,1,2,6,1,29.0,6,1,2
4,99900005,6,5,6,6,5,6,6,5,4,...,5,6,2,1,1,0,18.0,3,2,1


# Split Train and Validation Data

In [420]:
# Separate the label from the remaining columns.
# NOTE(review): X still carries 'Unnamed: 0' and 'ID' (see the preview); both are
# handed to the model as features further down — confirm that is intended.
X = train.drop(columns=['Overall_Experience'])
y = train['Overall_Experience']
X.head()

Unnamed: 0,ID,Ease_of_Online_Booking,Online_Support,Online_Boarding,Onboard_Entertainment,Onboard_Service,Cleanliness,Onboard_Wifi_Service,Baggage_Handling,Legroom,...,CheckIn_Service,Catering,Travel_Class,Customer_Type,Arrival_Time_Convenient,Gender,Age,Platform_Location,Type_Travel,Seat_Class
0,98800001,3,3.0,2,3,3,3,5,2,4,...,4.0,6,2,2,6,1,52.0,6,2,2
1,98800002,5,4.0,5,2,6,5,5,1,3,...,2.0,2,1,2,6,0,48.0,3,1,1
2,98800003,6,5.0,6,5,6,6,3,5,6,...,4.0,3,2,2,3,1,43.0,3,2,2
3,98800004,4,3.0,4,3,4,4,4,3,4,...,4.0,4,2,2,3,1,44.0,3,2,1
4,98800005,5,5.0,5,5,5,5,3,4,5,...,4.0,4,2,2,4,1,50.0,4,2,1


In [421]:
# Hold out 20% of the rows for validation, stratified on the label.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Build Pipeline

In [422]:
# Standardise every column, then fit gradient-boosted trees on the scaled values.
# NOTE(review): the step is named 'classifier' but wraps XGBRegressor; its continuous
# output is converted to 0/1 labels with a 0.5 cut-off in a later cell.
pipe = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('classifier', XGBRegressor(
        n_estimators=7000,
        max_depth=5,
        learning_rate=0.2,
        colsample_bytree=0.2,
    )),
])


# Baseline Model

In [423]:

# Fit on the training split, then score the held-out rows (raw regression output).
y_hat = pipe.fit(X_train, y_train).predict(X_test)

In [424]:
# Binarise the scores: anything below 0.5 becomes 0, everything else becomes 1.
# Written as "not (< 0.5)" so every value that fails the comparison maps to 1.
validation = (~pd.Series(y_hat).lt(0.5)).astype('int64')

### Accuracy

In [425]:
# Pair each held-out row's ID with its thresholded prediction.
# Both series carry a fresh 0..n-1 index, so the column-wise concat lines them up row by row.
index = ['ID', 'Overall_Experience']
prediction_df = pd.concat(
    [X_test['ID'].reset_index(drop=True), validation],
    axis=1,
    keys=index,
)
prediction_df

Unnamed: 0,ID,Overall_Experience
0,98813976,0
1,98800739,1
2,98821878,0
3,98810293,0
4,98864261,0
...,...,...
18871,98875295,1
18872,98883334,0
18873,98878197,1
18874,98861362,1


In [426]:
# Share of held-out rows whose thresholded prediction equals the true label.
acc = accuracy_score(y_true=y_test, y_pred=validation)
print("Accuracy:", acc)

Accuracy: 0.9524793388429752


# Tune 

In [427]:
# NOTE(review): this grid search is commented out, so `grid` is never defined on a fresh run.
# As written, only colsample_bytree is varied; the other three parameters have one candidate each.
# param_grid = {
#     'classifier__max_depth': [4],
#     'classifier__learning_rate': [0.2],
#     'classifier__n_estimators': [7000],
#     'classifier__colsample_bytree':[0.2,0.25,0.3,0.35,0.5]
# }
# grid = GridSearchCV(pipe, param_grid, cv=5)
# grid.fit(X_train,y_train)

In [428]:
# best_hyperparameters = grid.best_params_
# best_estimator = grid.best_estimator_
# best_hyperparameters

In [429]:
# best_estimator

# Optimized Model

In [430]:
# Tuned variant of the pipeline: shallower trees, more of them, a slightly larger
# column sample per tree, and a fixed seed on the booster.
# NOTE(review): this rebinds `pipe`, replacing the baseline pipeline object.
pipe = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('classifier', XGBRegressor(
        n_estimators=10000,
        max_depth=4,
        learning_rate=0.2,
        colsample_bytree=0.25,
        random_state=69,
    )),
])

In [431]:
# Fresh 80/20 stratified split for the tuned model.
# NOTE(review): random_state is 69 here but 42 for the baseline split, so the two
# accuracy figures are computed on different hold-out rows.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=69,
)

In [432]:
# Train the tuned pipeline; the fitted pipeline is the cell's displayed value.
pipe.fit(X=X_train, y=y_train)

Pipeline(steps=[('scaler', StandardScaler()),
                ('classifier',
                 XGBRegressor(base_score=None, booster=None, callbacks=None,
                              colsample_bylevel=None, colsample_bynode=None,
                              colsample_bytree=0.25, early_stopping_rounds=None,
                              enable_categorical=False, eval_metric=None,
                              feature_types=None, gamma=None, gpu_id=None,
                              grow_policy=None, importance_type=None,
                              interaction_constraints=None, learning_rate=0.2,
                              max_bin=None, max_cat_threshold=None,
                              max_cat_to_onehot=None, max_delta_step=None,
                              max_depth=4, max_leaves=None,
                              min_child_weight=None, missing=nan,
                              monotone_constraints=None, n_estimators=10000,
                              n_jobs=None, n

In [433]:
# Score the held-out rows, then cut the raw output at 0.5 to obtain 0/1 labels.
y_hat = pipe.predict(X_test)
# "not (< 0.5)" so every value that fails the comparison maps to 1.
validation = (~pd.Series(y_hat).lt(0.5)).astype('int64')

### Accuracy

In [434]:
# Pair each held-out row's ID with its thresholded prediction from the tuned model.
# Both series carry a fresh 0..n-1 index, so the column-wise concat lines them up row by row.
index = ['ID', 'Overall_Experience']
prediction_df = pd.concat(
    [X_test['ID'].reset_index(drop=True), validation],
    axis=1,
    keys=index,
)
prediction_df

Unnamed: 0,ID,Overall_Experience
0,98815137,1
1,98859770,1
2,98867353,0
3,98873170,1
4,98873051,1
...,...,...
18871,98873403,0
18872,98838983,1
18873,98814345,0
18874,98856441,0


In [435]:
# Share of held-out rows whose thresholded prediction equals the true label (tuned model).
acc = accuracy_score(y_true=y_test, y_pred=validation)
print("Accuracy:", acc)

Accuracy: 0.953697817334181


# Submission

In [436]:
# Raw (un-thresholded) model output for the competition test set, as a one-column frame.
predictions = pd.DataFrame({'XGboost': pipe.predict(test)})
predictions

Unnamed: 0,XGboost
0,0.891816
1,0.841739
2,0.966186
3,0.052774
4,0.836474
...,...
35597,-0.097002
35598,1.098932
35599,0.545908
35600,0.792643


In [437]:
# Put the test IDs in front of the raw scores.
# NOTE(review): the cell overwrites `predictions`; running it a second time would add
# another ID column and the two-name column assignment below would then fail.
index = ['ID', 'XGboost']
predictions = pd.concat(
    objs=[test['ID'].reset_index(drop=True), predictions],
    axis='columns',
)
predictions.columns = index
predictions

Unnamed: 0,ID,XGboost
0,99900001,0.891816
1,99900002,0.841739
2,99900003,0.966186
3,99900004,0.052774
4,99900005,0.836474
...,...,...
35597,99935598,-0.097002
35598,99935599,1.098932
35599,99935600,0.545908
35600,99935601,0.792643


In [438]:
# Save the ID + raw score table before it is thresholded.
# Project root defaults to the original author's folder; set HACKATHON_DIR to override.
PROJECT_DIR = Path(os.environ.get('HACKATHON_DIR', r'C:\Users\chewr\Documents\ML Labs\Hackathon'))
scores_path = PROJECT_DIR / 'metadata' / 'xg.csv'
# to_csv does not create missing folders, so make sure the target directory exists.
scores_path.parent.mkdir(parents=True, exist_ok=True)
predictions.to_csv(scores_path, index=False)

In [439]:
# Collapse the raw scores to 0/1 labels: below 0.5 -> 0, everything else -> 1.
# NOTE(review): this rebinds `predictions` from the ID/score DataFrame to a Series of
# labels, so the cell cannot be re-run on its own without rebuilding that DataFrame.
predictions = (~pd.Series(predictions.XGboost).lt(0.5)).astype('int64')

In [440]:
# Assemble the submission table: test ID next to its predicted 0/1 label.
index_submit = ['ID', 'Overall_Experience']
submission = pd.concat(
    [test['ID'].reset_index(drop=True), predictions],
    axis=1,
)
submission.columns = index_submit
# Cast both columns to plain integers in one step.
submission = submission.astype({'ID': 'int', 'Overall_Experience': 'int'})
submission

Unnamed: 0,ID,Overall_Experience
0,99900001,1
1,99900002,1
2,99900003,1
3,99900004,0
4,99900005,1
...,...,...
35597,99935598,0
35598,99935599,1
35599,99935600,1
35600,99935601,1


In [441]:
# (rows, columns) of the submission table — a quick size check before writing it out.
submission.shape

(35602, 2)

In [442]:
# Write the final submission file.
# Project root defaults to the original author's folder; set HACKATHON_DIR to override.
PROJECT_DIR = Path(os.environ.get('HACKATHON_DIR', r'C:\Users\chewr\Documents\ML Labs\Hackathon'))
# Building the path from parts avoids mixing '\' and '/' separators in one literal.
submission_path = PROJECT_DIR / 'submissions' / '1-3.csv'
# to_csv does not create missing folders, so make sure the target directory exists.
submission_path.parent.mkdir(parents=True, exist_ok=True)
submission.to_csv(submission_path, index=False)