## Importing libraries

In [46]:
from utils import preprocess_titanic
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

---

## Importing data

In [47]:
y_train, X_train, X_test = preprocess_titanic()

In [48]:
y_train.head()

PassengerId
1    0
2    1
3    1
4    1
5    0
Name: Survived, dtype: int64

In [49]:
X_train.head()

Unnamed: 0_level_0,pclass,age,sibsp,parch,fare,is_male,embarked_cherbourg,embarked_queenstown,embarked_southampton
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,3.0,-0.530005,1.0,-0.473408,-0.502163,1.0,0.0,0.0,1.0
2,1.0,0.57143,1.0,-0.473408,0.786404,0.0,1.0,0.0,0.0
3,3.0,-0.254646,0.0,-0.473408,-0.48858,0.0,0.0,0.0,1.0
4,1.0,0.364911,1.0,-0.473408,0.420494,0.0,0.0,0.0,1.0
5,3.0,0.364911,0.0,-0.473408,-0.486064,1.0,0.0,0.0,1.0


In [50]:
X_test.head()

Unnamed: 0_level_0,pclass,age,sibsp,parch,fare,is_male,embarked_cherbourg,embarked_queenstown,embarked_southampton
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
892,3.0,0.330491,0.0,-0.473408,-0.490508,1.0,0.0,1.0,0.0
893,3.0,1.190988,1.0,-0.473408,-0.507194,0.0,0.0,0.0,1.0
894,2.0,2.223584,0.0,-0.473408,-0.453112,1.0,0.0,1.0,0.0
895,3.0,-0.185807,0.0,-0.473408,-0.473739,1.0,0.0,0.0,1.0
896,3.0,-0.530005,1.0,0.767199,-0.400792,0.0,0.0,0.0,1.0


---

## Importing models

In [51]:
from xgboost import XGBClassifier
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold

## 1) XGBoost

### Standard parameters

In [52]:
std_params = {'random_state': 0,
              'n_estimators': 1000,
               'max_depth': 6,
               'early_stopping_rounds': 10,
               'learning_rate': 0.05,
               'reg_lambda': 1,
               'reg_alpha': 0.5,
               'eval_set': [(X_train, y_train)],
               'predictor': 'gpu_predictor'}

### Creating and fitting model

In [53]:
from sklearn.metrics import accuracy_score, recall_score, fbeta_score

In [54]:
params = std_params

def create_xgb_model(params):
    xgb = XGBClassifier(random_state = params['random_state'],
                       n_estimators = params['n_estimators'], 
                       max_depth = params['max_depth'], 
                       learning_rate = params['learning_rate'],
                       early_stopping_rounds = params['early_stopping_rounds'],
                       reg_lambda = params['reg_lambda'], 
                       reg_alpha = params['reg_alpha'],
                       predictor = params['predictor'])
    return xgb

xgb_model = create_xgb_model(params)
xgb_model.fit(X_train, y_train, eval_metric = ['logloss'], eval_set = params['eval_set'])

[0]	validation_0-logloss:0.66739
[1]	validation_0-logloss:0.64398
[2]	validation_0-logloss:0.62300
[3]	validation_0-logloss:0.60364
[4]	validation_0-logloss:0.58536
[5]	validation_0-logloss:0.56931
[6]	validation_0-logloss:0.55427
[7]	validation_0-logloss:0.54020
[8]	validation_0-logloss:0.52739
[9]	validation_0-logloss:0.51557
[10]	validation_0-logloss:0.50463
[11]	validation_0-logloss:0.49411
[12]	validation_0-logloss:0.48461
[13]	validation_0-logloss:0.47571
[14]	validation_0-logloss:0.46721
[15]	validation_0-logloss:0.45954
[16]	validation_0-logloss:0.45196
[17]	validation_0-logloss:0.44474
[18]	validation_0-logloss:0.43808
[19]	validation_0-logloss:0.43210
[20]	validation_0-logloss:0.42627
[21]	validation_0-logloss:0.42107
[22]	validation_0-logloss:0.41638
[23]	validation_0-logloss:0.41164
[24]	validation_0-logloss:0.40727
[25]	validation_0-logloss:0.40292
[26]	validation_0-logloss:0.39860
[27]	validation_0-logloss:0.39448
[28]	validation_0-logloss:0.39120
[29]	validation_0-loglos

[238]	validation_0-logloss:0.21414
[239]	validation_0-logloss:0.21367
[240]	validation_0-logloss:0.21345
[241]	validation_0-logloss:0.21325
[242]	validation_0-logloss:0.21281
[243]	validation_0-logloss:0.21239
[244]	validation_0-logloss:0.21194
[245]	validation_0-logloss:0.21172
[246]	validation_0-logloss:0.21139
[247]	validation_0-logloss:0.21090
[248]	validation_0-logloss:0.21060
[249]	validation_0-logloss:0.21018
[250]	validation_0-logloss:0.20993
[251]	validation_0-logloss:0.20951
[252]	validation_0-logloss:0.20937
[253]	validation_0-logloss:0.20896
[254]	validation_0-logloss:0.20884
[255]	validation_0-logloss:0.20856
[256]	validation_0-logloss:0.20834
[257]	validation_0-logloss:0.20819
[258]	validation_0-logloss:0.20784
[259]	validation_0-logloss:0.20764
[260]	validation_0-logloss:0.20745
[261]	validation_0-logloss:0.20719
[262]	validation_0-logloss:0.20699
[263]	validation_0-logloss:0.20667
[264]	validation_0-logloss:0.20649
[265]	validation_0-logloss:0.20632
[266]	validation_0-l

[473]	validation_0-logloss:0.16787
[474]	validation_0-logloss:0.16775
[475]	validation_0-logloss:0.16766
[476]	validation_0-logloss:0.16759
[477]	validation_0-logloss:0.16741
[478]	validation_0-logloss:0.16728
[479]	validation_0-logloss:0.16719
[480]	validation_0-logloss:0.16706
[481]	validation_0-logloss:0.16696
[482]	validation_0-logloss:0.16689
[483]	validation_0-logloss:0.16676
[484]	validation_0-logloss:0.16670
[485]	validation_0-logloss:0.16663
[486]	validation_0-logloss:0.16657
[487]	validation_0-logloss:0.16644
[488]	validation_0-logloss:0.16635
[489]	validation_0-logloss:0.16629
[490]	validation_0-logloss:0.16623
[491]	validation_0-logloss:0.16612
[492]	validation_0-logloss:0.16605
[493]	validation_0-logloss:0.16577
[494]	validation_0-logloss:0.16570
[495]	validation_0-logloss:0.16561
[496]	validation_0-logloss:0.16554
[497]	validation_0-logloss:0.16543
[498]	validation_0-logloss:0.16534
[499]	validation_0-logloss:0.16527
[500]	validation_0-logloss:0.16519
[501]	validation_0-l

[708]	validation_0-logloss:0.14634
[709]	validation_0-logloss:0.14627
[710]	validation_0-logloss:0.14621
[711]	validation_0-logloss:0.14614
[712]	validation_0-logloss:0.14599
[713]	validation_0-logloss:0.14591
[714]	validation_0-logloss:0.14586
[715]	validation_0-logloss:0.14580
[716]	validation_0-logloss:0.14574
[717]	validation_0-logloss:0.14559
[718]	validation_0-logloss:0.14546
[719]	validation_0-logloss:0.14539
[720]	validation_0-logloss:0.14526
[721]	validation_0-logloss:0.14519
[722]	validation_0-logloss:0.14506
[723]	validation_0-logloss:0.14494
[724]	validation_0-logloss:0.14484
[725]	validation_0-logloss:0.14478
[726]	validation_0-logloss:0.14466
[727]	validation_0-logloss:0.14455
[728]	validation_0-logloss:0.14450
[729]	validation_0-logloss:0.14444
[730]	validation_0-logloss:0.14438
[731]	validation_0-logloss:0.14428
[732]	validation_0-logloss:0.14421
[733]	validation_0-logloss:0.14411
[734]	validation_0-logloss:0.14405
[735]	validation_0-logloss:0.14401
[736]	validation_0-l

[943]	validation_0-logloss:0.12988
[944]	validation_0-logloss:0.12985
[945]	validation_0-logloss:0.12981
[946]	validation_0-logloss:0.12972
[947]	validation_0-logloss:0.12967
[948]	validation_0-logloss:0.12963
[949]	validation_0-logloss:0.12959
[950]	validation_0-logloss:0.12957
[951]	validation_0-logloss:0.12949
[952]	validation_0-logloss:0.12945
[953]	validation_0-logloss:0.12940
[954]	validation_0-logloss:0.12937
[955]	validation_0-logloss:0.12922
[956]	validation_0-logloss:0.12919
[957]	validation_0-logloss:0.12915
[958]	validation_0-logloss:0.12912
[959]	validation_0-logloss:0.12904
[960]	validation_0-logloss:0.12893
[961]	validation_0-logloss:0.12890
[962]	validation_0-logloss:0.12886
[963]	validation_0-logloss:0.12882
[964]	validation_0-logloss:0.12871
[965]	validation_0-logloss:0.12868
[966]	validation_0-logloss:0.12861
[967]	validation_0-logloss:0.12857
[968]	validation_0-logloss:0.12852
[969]	validation_0-logloss:0.12845
[970]	validation_0-logloss:0.12835
[971]	validation_0-l

#### Accuracy, recall and F2 Score in the training set:

In [55]:
y_pred_train = xgb_model.predict(X_train)
acc = accuracy_score(y_pred_train, y_train)
rec = recall_score(y_pred_train, y_train)
f2 = fbeta_score(y_pred_train, y_train, beta = 2)

print(f"Model accuracy: {acc}")
print(f"Model recall: {rec}")
print(f"Model F2 score: {f2}")

Model accuracy: 0.9663299663299664
Model recall: 0.9727272727272728
Model F2 score: 0.9657039711191335


#### Using KFold cross-validation method to analyze generatization capability

In [56]:
params = {'random_state': 0,
          'n_estimators': 1000,
          'max_depth': 10,
          'early_stopping_rounds': 10,
          'learning_rate': 0.01,
          'reg_lambda': 1,
          'reg_alpha': 0.2,
          'eval_set': [(X_train, y_train)],
          'predictor': 'gpu_predictor'}

In [57]:
import warnings
warnings.filterwarnings('ignore')

indexes = np.array(X_train.index)
rand = np.random.RandomState(params['random_state'])
rand.shuffle(indexes)
X_shuffled = X_train.loc[indexes]
y_shuffled = y_train.loc[indexes]

folds = 5
kf = KFold(n_splits = folds)
kfold_scores = []
kfold_acc = []
kfold_rec = []
kfold_f2 = []

for (train_indexes, val_indexes) in kf.split(X_train):
    X_train_kfold = X_shuffled.iloc[train_indexes].copy()
    y_train_kfold = y_shuffled.iloc[train_indexes].copy()
    X_val_kfold = X_shuffled.iloc[val_indexes].copy()
    y_val_kfold = y_shuffled.iloc[val_indexes].copy()
    
    model = create_xgb_model(params)
    model.fit(X_train_kfold, y_train_kfold, eval_metric = 'logloss', eval_set = params['eval_set'], verbose = 0)
    y_pred = model.predict(X_val_kfold)
    kfold_scores.append(mean_squared_error(y_val_kfold, y_pred))
    kfold_acc.append(accuracy_score(y_val_kfold, y_pred))
    kfold_rec.append(recall_score(y_val_kfold, y_pred))
    kfold_f2.append(fbeta_score(y_val_kfold, y_pred, beta = 2))
    
print(f'KFold final score: {sum(kfold_scores)/len(kfold_scores)}')
print(f'Kfold accuracy score: {sum(kfold_acc)/len(kfold_acc)}')
print(f'Kfold recall score: {sum(kfold_rec)/len(kfold_rec)}')
print(f'Kfold f2 score: {sum(kfold_f2)/len(kfold_f2)}')

KFold final score: 0.17287050404871004
Kfold accuracy score: 0.82712949595129
Kfold recall score: 0.7450448628643114
Kfold f2 score: 0.7536243759596462


---

## Exporting result to submission

In [58]:
y_pred = xgb_model.predict(X_test)
y_pred = pd.DataFrame(y_pred, index = X_test.index.rename('PassengerId'), columns = ['Survived'])

In [59]:
y_pred.head()

Unnamed: 0_level_0,Survived
PassengerId,Unnamed: 1_level_1
892,0
893,0
894,0
895,1
896,0


In [60]:
y_pred.to_csv('xgb_predictions.csv', index = True)