## Importing libraries

In [1]:
from utils import preprocess_titanic
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

---

## Importing data

In [2]:
y_train, X_train, X_test = preprocess_titanic()

In [3]:
y_train.head()

PassengerId
1    0
2    1
3    1
4    1
5    0
Name: Survived, dtype: int64

In [4]:
X_train.head()

Unnamed: 0_level_0,pclass,age,sibsp,parch,fare,is_male,embarked_cherbourg,embarked_queenstown,embarked_southampton
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,3.0,-0.530005,1.0,-0.473408,-0.502163,1.0,0.0,0.0,1.0
2,1.0,0.57143,1.0,-0.473408,0.786404,0.0,1.0,0.0,0.0
3,3.0,-0.254646,0.0,-0.473408,-0.48858,0.0,0.0,0.0,1.0
4,1.0,0.364911,1.0,-0.473408,0.420494,0.0,0.0,0.0,1.0
5,3.0,0.364911,0.0,-0.473408,-0.486064,1.0,0.0,0.0,1.0


In [5]:
X_test.head()

Unnamed: 0_level_0,pclass,age,sibsp,parch,fare,is_male,embarked_cherbourg,embarked_queenstown,embarked_southampton
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
892,3.0,0.330491,0.0,-0.473408,-0.490508,1.0,0.0,1.0,0.0
893,3.0,1.190988,1.0,-0.473408,-0.507194,0.0,0.0,0.0,1.0
894,2.0,2.223584,0.0,-0.473408,-0.453112,1.0,0.0,1.0,0.0
895,3.0,-0.185807,0.0,-0.473408,-0.473739,1.0,0.0,0.0,1.0
896,3.0,-0.530005,1.0,0.767199,-0.400792,0.0,0.0,0.0,1.0


---

## Importing models

In [6]:
from xgboost import XGBClassifier
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold

## 1) XGBoost

### Standard parameters

In [7]:
std_params = {'random_state': 0,
              'n_estimators': 1000,
               'max_depth': 6,
               'early_stopping_rounds': 10,
               'learning_rate': 0.05,
               'reg_lambda': 1,
               'reg_alpha': 0.5,
               'eval_set': [(X_train, y_train)],
               'predictor': 'gpu_predictor'}

### Creating and fitting model

In [8]:
from sklearn.metrics import accuracy_score, recall_score, fbeta_score

In [9]:
params = {'random_state': 0,
          'n_estimators': 1000,
          'max_depth': 10,
          'early_stopping_rounds': 10,
          'learning_rate': 0.01,
          'reg_lambda': 1,
          'reg_alpha': 0.2,
          'eval_set': [(X_train, y_train)],
          'predictor': 'gpu_predictor'}

def create_xgb_model(params):
    xgb = XGBClassifier(random_state = params['random_state'],
                       n_estimators = params['n_estimators'], 
                       max_depth = params['max_depth'], 
                       learning_rate = params['learning_rate'],
                       early_stopping_rounds = params['early_stopping_rounds'],
                       reg_lambda = params['reg_lambda'], 
                       reg_alpha = params['reg_alpha'],
                       predictor = params['predictor'])
    return xgb

xgb_model = create_xgb_model(params)
xgb_model.fit(X_train, y_train, eval_metric = ['logloss'], eval_set = params['eval_set'])



[0]	validation_0-logloss:0.68747
[1]	validation_0-logloss:0.68189
[2]	validation_0-logloss:0.67641
[3]	validation_0-logloss:0.67103
[4]	validation_0-logloss:0.66581
[5]	validation_0-logloss:0.66077
[6]	validation_0-logloss:0.65580
[7]	validation_0-logloss:0.65094
[8]	validation_0-logloss:0.64602
[9]	validation_0-logloss:0.64130
[10]	validation_0-logloss:0.63669
[11]	validation_0-logloss:0.63213
[12]	validation_0-logloss:0.62752
[13]	validation_0-logloss:0.62314
[14]	validation_0-logloss:0.61880
[15]	validation_0-logloss:0.61453
[16]	validation_0-logloss:0.61035
[17]	validation_0-logloss:0.60625
[18]	validation_0-logloss:0.60217
[19]	validation_0-logloss:0.59820
[20]	validation_0-logloss:0.59424
[21]	validation_0-logloss:0.59036
[22]	validation_0-logloss:0.58652
[23]	validation_0-logloss:0.58276
[24]	validation_0-logloss:0.57909
[25]	validation_0-logloss:0.57541
[26]	validation_0-logloss:0.57186
[27]	validation_0-logloss:0.56834
[28]	validation_0-logloss:0.56486
[29]	validation_0-loglos

[238]	validation_0-logloss:0.29328
[239]	validation_0-logloss:0.29280
[240]	validation_0-logloss:0.29233
[241]	validation_0-logloss:0.29188
[242]	validation_0-logloss:0.29140
[243]	validation_0-logloss:0.29097
[244]	validation_0-logloss:0.29049
[245]	validation_0-logloss:0.29005
[246]	validation_0-logloss:0.28963
[247]	validation_0-logloss:0.28915
[248]	validation_0-logloss:0.28872
[249]	validation_0-logloss:0.28827
[250]	validation_0-logloss:0.28786
[251]	validation_0-logloss:0.28744
[252]	validation_0-logloss:0.28700
[253]	validation_0-logloss:0.28657
[254]	validation_0-logloss:0.28618
[255]	validation_0-logloss:0.28578
[256]	validation_0-logloss:0.28535
[257]	validation_0-logloss:0.28497
[258]	validation_0-logloss:0.28456
[259]	validation_0-logloss:0.28419
[260]	validation_0-logloss:0.28378
[261]	validation_0-logloss:0.28341
[262]	validation_0-logloss:0.28304
[263]	validation_0-logloss:0.28264
[264]	validation_0-logloss:0.28227
[265]	validation_0-logloss:0.28190
[266]	validation_0-l

[473]	validation_0-logloss:0.22948
[474]	validation_0-logloss:0.22937
[475]	validation_0-logloss:0.22919
[476]	validation_0-logloss:0.22901
[477]	validation_0-logloss:0.22890
[478]	validation_0-logloss:0.22880
[479]	validation_0-logloss:0.22862
[480]	validation_0-logloss:0.22852
[481]	validation_0-logloss:0.22841
[482]	validation_0-logloss:0.22821
[483]	validation_0-logloss:0.22810
[484]	validation_0-logloss:0.22790
[485]	validation_0-logloss:0.22772
[486]	validation_0-logloss:0.22753
[487]	validation_0-logloss:0.22733
[488]	validation_0-logloss:0.22716
[489]	validation_0-logloss:0.22693
[490]	validation_0-logloss:0.22675
[491]	validation_0-logloss:0.22650
[492]	validation_0-logloss:0.22632
[493]	validation_0-logloss:0.22614
[494]	validation_0-logloss:0.22597
[495]	validation_0-logloss:0.22581
[496]	validation_0-logloss:0.22565
[497]	validation_0-logloss:0.22551
[498]	validation_0-logloss:0.22528
[499]	validation_0-logloss:0.22507
[500]	validation_0-logloss:0.22494
[501]	validation_0-l

[708]	validation_0-logloss:0.19468
[709]	validation_0-logloss:0.19452
[710]	validation_0-logloss:0.19443
[711]	validation_0-logloss:0.19429
[712]	validation_0-logloss:0.19412
[713]	validation_0-logloss:0.19398
[714]	validation_0-logloss:0.19381
[715]	validation_0-logloss:0.19373
[716]	validation_0-logloss:0.19356
[717]	validation_0-logloss:0.19339
[718]	validation_0-logloss:0.19321
[719]	validation_0-logloss:0.19310
[720]	validation_0-logloss:0.19295
[721]	validation_0-logloss:0.19278
[722]	validation_0-logloss:0.19264
[723]	validation_0-logloss:0.19247
[724]	validation_0-logloss:0.19235
[725]	validation_0-logloss:0.19219
[726]	validation_0-logloss:0.19211
[727]	validation_0-logloss:0.19196
[728]	validation_0-logloss:0.19185
[729]	validation_0-logloss:0.19174
[730]	validation_0-logloss:0.19159
[731]	validation_0-logloss:0.19142
[732]	validation_0-logloss:0.19132
[733]	validation_0-logloss:0.19118
[734]	validation_0-logloss:0.19101
[735]	validation_0-logloss:0.19085
[736]	validation_0-l

[943]	validation_0-logloss:0.17243
[944]	validation_0-logloss:0.17239
[945]	validation_0-logloss:0.17234
[946]	validation_0-logloss:0.17229
[947]	validation_0-logloss:0.17225
[948]	validation_0-logloss:0.17219
[949]	validation_0-logloss:0.17213
[950]	validation_0-logloss:0.17209
[951]	validation_0-logloss:0.17204
[952]	validation_0-logloss:0.17201
[953]	validation_0-logloss:0.17193
[954]	validation_0-logloss:0.17184
[955]	validation_0-logloss:0.17181
[956]	validation_0-logloss:0.17176
[957]	validation_0-logloss:0.17169
[958]	validation_0-logloss:0.17165
[959]	validation_0-logloss:0.17158
[960]	validation_0-logloss:0.17151
[961]	validation_0-logloss:0.17146
[962]	validation_0-logloss:0.17138
[963]	validation_0-logloss:0.17134
[964]	validation_0-logloss:0.17126
[965]	validation_0-logloss:0.17121
[966]	validation_0-logloss:0.17115
[967]	validation_0-logloss:0.17112
[968]	validation_0-logloss:0.17104
[969]	validation_0-logloss:0.17098
[970]	validation_0-logloss:0.17092
[971]	validation_0-l

#### Accuracy, recall and F2 Score in the training set:

In [10]:
y_pred_train = xgb_model.predict(X_train)
acc = accuracy_score(y_pred_train, y_train)
rec = recall_score(y_pred_train, y_train)
f2 = fbeta_score(y_pred_train, y_train, beta = 2)

print(f"Model accuracy: {acc}")
print(f"Model recall: {rec}")
print(f"Model F2 score: {f2}")

Model accuracy: 0.9483726150392817
Model recall: 0.9625
Model F2 score: 0.9494451294697905


#### Using KFold cross-validation method to analyze generatization capability

In [11]:
params = {'random_state': 0,
          'n_estimators': 1000,
          'max_depth': 10,
          'early_stopping_rounds': 10,
          'learning_rate': 0.01,
          'reg_lambda': 1,
          'reg_alpha': 0.2,
          'eval_set': [(X_train, y_train)],
          'predictor': 'gpu_predictor'}

In [12]:
import warnings
warnings.filterwarnings('ignore')

indexes = np.array(X_train.index)
rand = np.random.RandomState(params['random_state'])
rand.shuffle(indexes)
X_shuffled = X_train.loc[indexes]
y_shuffled = y_train.loc[indexes]

folds = 5
kf = KFold(n_splits = folds)
kfold_scores = []
kfold_acc = []
kfold_rec = []
kfold_f2 = []

for (train_indexes, val_indexes) in kf.split(X_train):
    X_train_kfold = X_shuffled.iloc[train_indexes].copy()
    y_train_kfold = y_shuffled.iloc[train_indexes].copy()
    X_val_kfold = X_shuffled.iloc[val_indexes].copy()
    y_val_kfold = y_shuffled.iloc[val_indexes].copy()
    
    model = create_xgb_model(params)
    model.fit(X_train_kfold, y_train_kfold, eval_metric = 'logloss', eval_set = params['eval_set'], verbose = 0)
    y_pred = model.predict(X_val_kfold)
    kfold_scores.append(mean_squared_error(y_val_kfold, y_pred))
    kfold_acc.append(accuracy_score(y_val_kfold, y_pred))
    kfold_rec.append(recall_score(y_val_kfold, y_pred))
    kfold_f2.append(fbeta_score(y_val_kfold, y_pred, beta = 2))
    
print(f'KFold final score: {sum(kfold_scores)/len(kfold_scores)}')
print(f'Kfold accuracy score: {sum(kfold_acc)/len(kfold_acc)}')
print(f'Kfold recall score: {sum(kfold_rec)/len(kfold_rec)}')
print(f'Kfold f2 score: {sum(kfold_f2)/len(kfold_f2)}')

KFold final score: 0.17287050404871004
Kfold accuracy score: 0.82712949595129
Kfold recall score: 0.7450448628643114
Kfold f2 score: 0.7536243759596462


Important:
**Kfold accuracy is an estimate of the competition score.** However, it's certainly overestimated or underestimated.

---

## Exporting result to submission

In [13]:
xgb_model = create_xgb_model(params)
xgb_model.fit(X_train, y_train, eval_metric = ['logloss'], eval_set = params['eval_set'])

y_pred = xgb_model.predict(X_test)
y_pred = pd.DataFrame(y_pred, index = X_test.index.rename('PassengerId'), columns = ['Survived'])

[0]	validation_0-logloss:0.68747
[1]	validation_0-logloss:0.68189
[2]	validation_0-logloss:0.67641
[3]	validation_0-logloss:0.67103
[4]	validation_0-logloss:0.66581
[5]	validation_0-logloss:0.66077
[6]	validation_0-logloss:0.65580
[7]	validation_0-logloss:0.65094
[8]	validation_0-logloss:0.64602
[9]	validation_0-logloss:0.64130
[10]	validation_0-logloss:0.63669
[11]	validation_0-logloss:0.63213
[12]	validation_0-logloss:0.62752
[13]	validation_0-logloss:0.62314
[14]	validation_0-logloss:0.61880
[15]	validation_0-logloss:0.61453
[16]	validation_0-logloss:0.61035
[17]	validation_0-logloss:0.60625
[18]	validation_0-logloss:0.60217
[19]	validation_0-logloss:0.59820
[20]	validation_0-logloss:0.59424
[21]	validation_0-logloss:0.59036
[22]	validation_0-logloss:0.58652
[23]	validation_0-logloss:0.58276
[24]	validation_0-logloss:0.57909
[25]	validation_0-logloss:0.57541
[26]	validation_0-logloss:0.57186
[27]	validation_0-logloss:0.56834
[28]	validation_0-logloss:0.56486
[29]	validation_0-loglos

[238]	validation_0-logloss:0.29328
[239]	validation_0-logloss:0.29280
[240]	validation_0-logloss:0.29233
[241]	validation_0-logloss:0.29188
[242]	validation_0-logloss:0.29140
[243]	validation_0-logloss:0.29097
[244]	validation_0-logloss:0.29049
[245]	validation_0-logloss:0.29005
[246]	validation_0-logloss:0.28963
[247]	validation_0-logloss:0.28915
[248]	validation_0-logloss:0.28872
[249]	validation_0-logloss:0.28827
[250]	validation_0-logloss:0.28786
[251]	validation_0-logloss:0.28744
[252]	validation_0-logloss:0.28700
[253]	validation_0-logloss:0.28657
[254]	validation_0-logloss:0.28618
[255]	validation_0-logloss:0.28578
[256]	validation_0-logloss:0.28535
[257]	validation_0-logloss:0.28497
[258]	validation_0-logloss:0.28456
[259]	validation_0-logloss:0.28419
[260]	validation_0-logloss:0.28378
[261]	validation_0-logloss:0.28341
[262]	validation_0-logloss:0.28304
[263]	validation_0-logloss:0.28264
[264]	validation_0-logloss:0.28227
[265]	validation_0-logloss:0.28190
[266]	validation_0-l

[473]	validation_0-logloss:0.22948
[474]	validation_0-logloss:0.22937
[475]	validation_0-logloss:0.22919
[476]	validation_0-logloss:0.22901
[477]	validation_0-logloss:0.22890
[478]	validation_0-logloss:0.22880
[479]	validation_0-logloss:0.22862
[480]	validation_0-logloss:0.22852
[481]	validation_0-logloss:0.22841
[482]	validation_0-logloss:0.22821
[483]	validation_0-logloss:0.22810
[484]	validation_0-logloss:0.22790
[485]	validation_0-logloss:0.22772
[486]	validation_0-logloss:0.22753
[487]	validation_0-logloss:0.22733
[488]	validation_0-logloss:0.22716
[489]	validation_0-logloss:0.22693
[490]	validation_0-logloss:0.22675
[491]	validation_0-logloss:0.22650
[492]	validation_0-logloss:0.22632
[493]	validation_0-logloss:0.22614
[494]	validation_0-logloss:0.22597
[495]	validation_0-logloss:0.22581
[496]	validation_0-logloss:0.22565
[497]	validation_0-logloss:0.22551
[498]	validation_0-logloss:0.22528
[499]	validation_0-logloss:0.22507
[500]	validation_0-logloss:0.22494
[501]	validation_0-l

[708]	validation_0-logloss:0.19468
[709]	validation_0-logloss:0.19452
[710]	validation_0-logloss:0.19443
[711]	validation_0-logloss:0.19429
[712]	validation_0-logloss:0.19412
[713]	validation_0-logloss:0.19398
[714]	validation_0-logloss:0.19381
[715]	validation_0-logloss:0.19373
[716]	validation_0-logloss:0.19356
[717]	validation_0-logloss:0.19339
[718]	validation_0-logloss:0.19321
[719]	validation_0-logloss:0.19310
[720]	validation_0-logloss:0.19295
[721]	validation_0-logloss:0.19278
[722]	validation_0-logloss:0.19264
[723]	validation_0-logloss:0.19247
[724]	validation_0-logloss:0.19235
[725]	validation_0-logloss:0.19219
[726]	validation_0-logloss:0.19211
[727]	validation_0-logloss:0.19196
[728]	validation_0-logloss:0.19185
[729]	validation_0-logloss:0.19174
[730]	validation_0-logloss:0.19159
[731]	validation_0-logloss:0.19142
[732]	validation_0-logloss:0.19132
[733]	validation_0-logloss:0.19118
[734]	validation_0-logloss:0.19101
[735]	validation_0-logloss:0.19085
[736]	validation_0-l

[943]	validation_0-logloss:0.17243
[944]	validation_0-logloss:0.17239
[945]	validation_0-logloss:0.17234
[946]	validation_0-logloss:0.17229
[947]	validation_0-logloss:0.17225
[948]	validation_0-logloss:0.17219
[949]	validation_0-logloss:0.17213
[950]	validation_0-logloss:0.17209
[951]	validation_0-logloss:0.17204
[952]	validation_0-logloss:0.17201
[953]	validation_0-logloss:0.17193
[954]	validation_0-logloss:0.17184
[955]	validation_0-logloss:0.17181
[956]	validation_0-logloss:0.17176
[957]	validation_0-logloss:0.17169
[958]	validation_0-logloss:0.17165
[959]	validation_0-logloss:0.17158
[960]	validation_0-logloss:0.17151
[961]	validation_0-logloss:0.17146
[962]	validation_0-logloss:0.17138
[963]	validation_0-logloss:0.17134
[964]	validation_0-logloss:0.17126
[965]	validation_0-logloss:0.17121
[966]	validation_0-logloss:0.17115
[967]	validation_0-logloss:0.17112
[968]	validation_0-logloss:0.17104
[969]	validation_0-logloss:0.17098
[970]	validation_0-logloss:0.17092
[971]	validation_0-l

In [14]:
y_pred.head()

Unnamed: 0_level_0,Survived
PassengerId,Unnamed: 1_level_1
892,0
893,0
894,0
895,0
896,1


In [15]:
y_pred.to_csv('xgb_predictions.csv', index = True)