## Importing libraries

In [13]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import warnings
warnings.filterwarnings('ignore')

---

## Importing data

In [14]:
y_train_path = os.path.join('data', 'preprocessed_y_train.csv')
X_train_path = os.path.join('data', 'preprocessed_X_train.csv')
X_test_path = os.path.join('data', 'preprocessed_X_test.csv')

y_train = pd.read_csv(y_train_path, index_col = 'PassengerId')
X_train = pd.read_csv(X_train_path, index_col = 'PassengerId')
X_test = pd.read_csv(X_test_path, index_col = 'PassengerId')

In [15]:
y_train.head()

Unnamed: 0_level_0,Survived
PassengerId,Unnamed: 1_level_1
1,0
2,1
3,1
4,1
5,0


In [16]:
X_train.head()

Unnamed: 0_level_0,Pclass,Is_Male,Age_Bin,Fare_Bin,Title_Master,Title_Miss,Title_Mr,Title_Mrs,Title_Rare,Embark_Place_C,Embark_Place_Q,Embark_Place_S,Family_Size,Traveled_Alone
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1,0.826913,0.737281,0.143177,-1.395914,-0.216681,-0.511611,0.850054,-0.405612,-0.16269,-0.481772,-0.30739,0.615493,0.059127,-1.230954
2,-1.565228,-1.354813,0.143177,1.32566,-0.216681,-0.511611,-1.175075,2.462644,-0.16269,2.073341,-0.30739,-1.622891,0.059127,-1.230954
3,0.826913,-1.354813,0.143177,-0.488723,-0.216681,1.952417,-1.175075,-0.405612,-0.16269,-0.481772,-0.30739,0.615493,-0.56066,0.811467
4,-1.565228,-1.354813,0.143177,1.32566,-0.216681,-0.511611,-1.175075,2.462644,-0.16269,-0.481772,-0.30739,0.615493,0.059127,-1.230954
5,0.826913,0.737281,0.143177,-0.488723,-0.216681,-0.511611,0.850054,-0.405612,-0.16269,-0.481772,-0.30739,0.615493,-0.56066,0.811467


In [17]:
X_test.head()

Unnamed: 0_level_0,Pclass,Is_Male,Age_Bin,Fare_Bin,Title_Master,Title_Miss,Title_Mr,Title_Mrs,Title_Rare,Embark_Place_C,Embark_Place_Q,Embark_Place_S,Family_Size,Traveled_Alone
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
892,0.826913,0.737281,0.143177,-1.395914,-0.216681,-0.511611,0.850054,-0.405612,-0.16269,-0.481772,3.249548,-1.622891,-0.56066,0.811467
893,0.826913,-1.354813,1.444916,-1.395914,-0.216681,-0.511611,-1.175075,2.462644,-0.16269,-0.481772,-0.30739,0.615493,0.059127,-1.230954
894,-0.369158,0.737281,1.444916,-0.488723,-0.216681,-0.511611,0.850054,-0.405612,-0.16269,-0.481772,3.249548,-1.622891,-0.56066,0.811467
895,0.826913,0.737281,0.143177,-0.488723,-0.216681,-0.511611,0.850054,-0.405612,-0.16269,-0.481772,-0.30739,0.615493,-0.56066,0.811467
896,0.826913,-1.354813,0.143177,-0.488723,-0.216681,-0.511611,-1.175075,2.462644,-0.16269,-0.481772,-0.30739,0.615493,0.678913,-1.230954


---

## Importing models

In [18]:
from xgboost import XGBClassifier
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, recall_score, fbeta_score

---

## 1) XGBoost

### Creating and fitting model

In [19]:
xgb_params = {'random_state': 0,
          'n_estimators': 1000,
          'max_depth': 6,
          'early_stopping_rounds': 10,
          'learning_rate': 0.3,
          'reg_lambda': 1,
          'reg_alpha': 0.5,
          'predictor': 'cpu_predictor'}

In [20]:
def create_xgb_model(params):
    xgb = XGBClassifier(random_state = xgb_params['random_state'],
                       n_estimators = xgb_params['n_estimators'], 
                       max_depth = xgb_params['max_depth'], 
                       learning_rate = xgb_params['learning_rate'],
                       early_stopping_rounds = xgb_params['early_stopping_rounds'],
                       reg_lambda = xgb_params['reg_lambda'], 
                       reg_alpha = xgb_params['reg_alpha'],
                       predictor = xgb_params['predictor'])
    return xgb

### Using KFold cross-validation method to analyze generatization capability

In [21]:
xgb_results = []
eval_set = [[X_train, y_train]]


def kfold_scores_xgb(params, folds, results):
    indexes = np.array(X_train.index)
    rand = np.random.RandomState(params['random_state'])
    rand.shuffle(indexes)
    X_shuffled = X_train.loc[indexes].copy()
    y_shuffled = y_train.loc[indexes].copy()
    folds = folds
    kf = KFold(n_splits = folds)
    kfold_scores = []
    kfold_acc = []
    kfold_rec = []
    kfold_f2 = []

    for (train_indexes, val_indexes) in kf.split(X_train):
        X_train_kfold = X_shuffled.iloc[train_indexes].copy()
        y_train_kfold = y_shuffled.iloc[train_indexes].copy()
        X_val_kfold = X_shuffled.iloc[val_indexes].copy()
        y_val_kfold = y_shuffled.iloc[val_indexes].copy()

        model = create_xgb_model(params)
        model.fit(X_train_kfold, y_train_kfold, eval_metric = 'logloss', eval_set = eval_set, verbose = 0)
        y_pred = model.predict(X_val_kfold)
        kfold_scores.append(mean_squared_error(y_val_kfold, y_pred))
        kfold_acc.append(accuracy_score(y_val_kfold, y_pred))
        kfold_rec.append(recall_score(y_val_kfold, y_pred))
        kfold_f2.append(fbeta_score(y_val_kfold, y_pred, beta = 2))

    print(f'Last KFold final score: {sum(kfold_scores)/len(kfold_scores)}')
    print(f'Last Kfold accuracy score: {sum(kfold_acc)/len(kfold_acc)}')
    print(f'Last Kfold recall score: {sum(kfold_rec)/len(kfold_rec)}')
    print(f'Last Kfold f2 score: {sum(kfold_f2)/len(kfold_f2)}')
    
    results.append((params, {sum(kfold_acc)/len(kfold_acc)}))
    
    return results

In [22]:
xgb_results = kfold_scores_xgb(xgb_params, 5, xgb_results)

Last KFold final score: 0.18295147824995292
Last Kfold accuracy score: 0.8170485217500472
Last Kfold recall score: 0.7187765708080177
Last Kfold f2 score: 0.7302167470472407


In [23]:
xgb_results

[({'random_state': 0,
   'n_estimators': 1000,
   'max_depth': 6,
   'early_stopping_rounds': 10,
   'learning_rate': 0.3,
   'reg_lambda': 1,
   'reg_alpha': 0.5,
   'predictor': 'cpu_predictor'},
  {0.8170485217500472})]

Important:
**Kfold accuracy is an estimate of the competition score.** However, it's certainly overestimated or underestimated.

---

## Exporting result to submission

In [24]:
xgb_model = create_xgb_model(xgb_params)
xgb_model.fit(X_train, y_train, eval_metric = ['logloss'], eval_set = eval_set)

y_pred = xgb_model.predict(X_test)
y_pred = pd.DataFrame(y_pred, index = X_test.index.rename('PassengerId'), columns = ['Survived'])

y_pred_path = os.path.join('data', 'xgb_predictions.csv')
y_pred.to_csv(y_pred_path, index = True)

[0]	validation_0-logloss:0.56183
[1]	validation_0-logloss:0.49133
[2]	validation_0-logloss:0.44631
[3]	validation_0-logloss:0.41654
[4]	validation_0-logloss:0.39733
[5]	validation_0-logloss:0.38450
[6]	validation_0-logloss:0.37586
[7]	validation_0-logloss:0.36986
[8]	validation_0-logloss:0.36488
[9]	validation_0-logloss:0.36102
[10]	validation_0-logloss:0.35808
[11]	validation_0-logloss:0.35582
[12]	validation_0-logloss:0.35253
[13]	validation_0-logloss:0.34964
[14]	validation_0-logloss:0.34864
[15]	validation_0-logloss:0.34735
[16]	validation_0-logloss:0.34486
[17]	validation_0-logloss:0.34319
[18]	validation_0-logloss:0.34195
[19]	validation_0-logloss:0.34018
[20]	validation_0-logloss:0.33842
[21]	validation_0-logloss:0.33690
[22]	validation_0-logloss:0.33561
[23]	validation_0-logloss:0.33461
[24]	validation_0-logloss:0.33370
[25]	validation_0-logloss:0.33296
[26]	validation_0-logloss:0.33243
[27]	validation_0-logloss:0.33191
[28]	validation_0-logloss:0.33031
[29]	validation_0-loglos

---

## 2) Random Forest

In [25]:
from sklearn.ensemble import RandomForestClassifier

### Using KFold cross-validation method to analyze generatization capability

In [26]:
rf_params = {'random_state': 0,
          'n_estimators': 1000,
          'max_depth': 4,
          'max_features': 'sqrt',
          'min_samples_leaf': 2}

In [27]:
def create_rf_model(params):
    rf = RandomForestClassifier(random_state = rf_params['random_state'],
                                n_estimators = rf_params['n_estimators'], 
                                max_depth = rf_params['max_depth'],
                                max_features = rf_params['max_features'],
                                min_samples_leaf = rf_params['min_samples_leaf'])
    return rf

In [28]:
rf_results = []
def kfold_scores_rf(params, folds, results):
    indexes = np.array(X_train.index)
    rand = np.random.RandomState(params['random_state'])
    rand.shuffle(indexes)
    X_shuffled = X_train.loc[indexes].copy()
    y_shuffled = y_train.loc[indexes].copy()
    folds = folds
    kf = KFold(n_splits = folds)
    kfold_scores = []
    kfold_acc = []
    kfold_rec = []
    kfold_f2 = []

    for (train_indexes, val_indexes) in kf.split(X_train):
        X_train_kfold = X_shuffled.iloc[train_indexes].copy()
        y_train_kfold = y_shuffled.iloc[train_indexes].copy()
        X_val_kfold = X_shuffled.iloc[val_indexes].copy()
        y_val_kfold = y_shuffled.iloc[val_indexes].copy()

        model = create_rf_model(params)
        model.fit(X_train_kfold, y_train_kfold)
        y_pred = model.predict(X_val_kfold)
        kfold_scores.append(mean_squared_error(y_val_kfold, y_pred))
        kfold_acc.append(accuracy_score(y_val_kfold, y_pred))
        kfold_rec.append(recall_score(y_val_kfold, y_pred))
        kfold_f2.append(fbeta_score(y_val_kfold, y_pred, beta = 2))

    print(f'Last KFold final score: {sum(kfold_scores)/len(kfold_scores)}')
    print(f'Last Kfold accuracy score: {sum(kfold_acc)/len(kfold_acc)}')
    print(f'Last Kfold recall score: {sum(kfold_rec)/len(kfold_rec)}')
    print(f'Last Kfold f2 score: {sum(kfold_f2)/len(kfold_f2)}')
    
    results.append((params, {sum(kfold_acc)/len(kfold_acc)}))
    
    return results

In [29]:
rf_results = kfold_scores_rf(rf_params, 5, rf_results)

Last KFold final score: 0.1683447366769192
Last Kfold accuracy score: 0.8316552633230808
Last Kfold recall score: 0.7370775640372186
Last Kfold f2 score: 0.7490380751469796


Important:
**Kfold accuracy is an estimate of the competition score.** However, it's certainly overestimated or underestimated.

## Exporting result to submission

In [30]:
rf_model = create_rf_model(rf_params)
rf_model.fit(X_train, y_train)

y_pred = rf_model.predict(X_test)
y_pred = pd.DataFrame(y_pred, index = X_test.index.rename('PassengerId'), columns = ['Survived'])

y_pred_path = os.path.join('data', 'rf_predictions.csv')
y_pred.to_csv(y_pred_path, index = True)

---

## 3) Logistic Regression

In [31]:
from sklearn.linear_model import LogisticRegression

In [32]:
logreg_model = LogisticRegression()
logreg_model.fit(X_train, y_train)

In [33]:
y_pred = logreg_model.predict(X_test)
y_pred = pd.DataFrame(y_pred, index = X_test.index.rename('PassengerId'), columns = ['Survived'])

In [34]:
y_pred_path = os.path.join('data', 'logreg_predictions.csv')
y_pred.to_csv(y_pred_path, index = True)