In [1]:
import sys
# Put the parent directory on the import path. The relative '..' is resolved
# against the kernel's working directory, so this presumably expects the
# notebook to be run from its own folder, with the project packages imported
# below (logs, features, evals, models) living one level up — confirm.
sys.path.append('..')

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
from datetime import datetime, timezone, timedelta
# Take "now" in a fixed UTC+9 offset so the date stamp does not depend on the
# timezone of the machine running the kernel.
today = datetime.now(timezone(timedelta(hours=9)))
# Experiment version is the calendar date formatted as YYYYMMDD.
exp_version = today.strftime('%Y%m%d')

import os
# Publish the version through the process environment.
# NOTE(review): presumably read by project modules (logger / eval helpers);
# the consumer is not visible in this notebook — confirm.
os.environ['exp_version'] = exp_version

# Project-local logger factory, imported only after the environment variable is
# set. NOTE(review): whether logs.logger depends on that ordering is not
# visible here, so the statement order is left untouched.
from logs.logger import create_logger
create_logger(exp_version)

In [4]:
import numpy as np
import pandas as pd

In [5]:
from features.funcs import load_feather
from evals.funcs import get_pred_result, get_acc_and_logloss, print_conf_matrix
from models.xgb import ModelXGB

In [6]:
# `os` is already imported in the experiment-setup cell above; this repeat is
# harmless but redundant.
import os

# Sanity check: show which raw files are present in ../input/ before loading.
# os.listdir returns entries in arbitrary order, so the printed order may
# differ between machines.
input_files = os.listdir('../input/')
print(input_files)

['test.csv', 'train.csv', 'gender_submission.csv']


In [7]:
# Load the raw train / test tables from the input directory.
base_path = '../input/'
train_df = pd.read_csv(base_path + 'train.csv')
test_df = pd.read_csv(base_path + 'test.csv')

# Report the column inventory of each table, separated by one blank line.
print(f'Columns in the training data ({train_df.shape[1]}):')
print(train_df.columns, end='\n\n')
print(f'Columns in the test data ({test_df.shape[1]}):')
print(test_df.columns)

Columns in the training data (12):
Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

Columns in the test data (11):
Index(['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')


In [8]:
def load_data(train=None, test=None):
    """Split the raw tables into training features, training target and test features.

    Parameters
    ----------
    train : pandas.DataFrame, optional
        Labelled table containing a 'Survived' column. When omitted, the
        notebook-level ``train_df`` is used.
    test : pandas.DataFrame, optional
        Unlabelled table. When omitted, the notebook-level ``test_df`` is used.

    Returns
    -------
    tuple
        ``(train_x, train_y, test_x)`` where ``train_x`` is ``train`` without
        the 'Survived' column, ``train_y`` is the 'Survived' column and
        ``test_x`` holds the same data as ``test``.

    All three results are independent copies, so in-place edits made by later
    cells cannot leak back into the source frames and change what a re-run of
    this function returns.
    """
    if train is None:
        train = train_df
    if test is None:
        test = test_df

    # drop() already returns a new frame; the source table is left untouched.
    train_x = train.drop(columns=['Survived'])
    # Copy explicitly: plain column selection / assignment would hand back
    # objects that share state with the source frames.
    train_y = train['Survived'].copy()
    test_x = test.copy()
    return train_x, train_y, test_x

### Base model

In [9]:
# Label the runs that follow as the base model.
# NOTE(review): presumably picked up from the environment by the project's
# logging / evaluation helpers; the reader is not visible here — confirm.
os.environ['model_name'] = 'BASE MODEL'

In [10]:
train_x, train_y, test_x = load_data()

# Name, Cabin and Ticket are left out of the base feature set.
unused_columns = ['Name', 'Cabin', 'Ticket']
train_x = train_x.drop(unused_columns, axis=1)
test_x = test_x.drop(unused_columns, axis=1)

base_path = '../features/'

# (train file, test file, column) for each pre-computed feature, applied in
# order. NOTE(review): load_feather is a project helper; presumably it fills
# the named column from the feather file — confirm in features/funcs.
feature_files = [
    ('sex_train.feather', 'sex_test.feather', 'Sex'),
    ('embarked_train.feather', 'embarked_test.feather', 'Embarked'),
]
for train_file, test_file, column in feature_files:
    train_x = load_feather(train_x, base_path + train_file, column)
    test_x = load_feather(test_x, base_path + test_file, column)

In [11]:
# Fit XGBoost with the wrapper's default settings and collect predictions for
# the training rows and the test rows.
model = ModelXGB()
pred_train1, preds_test1 = get_pred_result(model, train_x, train_y, test_x)

[0]	train-logloss:0.54032	eval-logloss:0.58476
[1]	train-logloss:0.45127	eval-logloss:0.53676
[2]	train-logloss:0.39128	eval-logloss:0.50857
[3]	train-logloss:0.34878	eval-logloss:0.49415
[4]	train-logloss:0.31972	eval-logloss:0.49030
[5]	train-logloss:0.29561	eval-logloss:0.48866
[6]	train-logloss:0.27656	eval-logloss:0.49654
[7]	train-logloss:0.26086	eval-logloss:0.50617
[8]	train-logloss:0.24263	eval-logloss:0.50667
[9]	train-logloss:0.22808	eval-logloss:0.51274
[0]	train-logloss:0.54673	eval-logloss:0.58209
[1]	train-logloss:0.46331	eval-logloss:0.52585
[2]	train-logloss:0.40880	eval-logloss:0.49644
[3]	train-logloss:0.36922	eval-logloss:0.47830
[4]	train-logloss:0.33897	eval-logloss:0.46656
[5]	train-logloss:0.31576	eval-logloss:0.46058
[6]	train-logloss:0.29641	eval-logloss:0.45693
[7]	train-logloss:0.28004	eval-logloss:0.45260
[8]	train-logloss:0.26846	eval-logloss:0.44551
[9]	train-logloss:0.25781	eval-logloss:0.44293
[0]	train-logloss:0.55261	eval-logloss:0.56436
[1]	train-log

In [12]:
# Score the base model's predictions for the training rows against the true
# labels. Both helpers come from evals.funcs and print their results.
get_acc_and_logloss(pred_train1, train_y)
print_conf_matrix(pred_train1, train_y)

accuracy: 0.8181818182
logloss: 0.4377829715
[[492  57]
 [105 237]]


### Columns without Embarked

In [13]:
# Re-label the following runs as the variant trained without Embarked.
# NOTE(review): consumer of this environment variable is not visible here.
os.environ['model_name'] = 'MODEL WITHOUT EMBARKED'

In [14]:
train_x, train_y, test_x = load_data()

# Same exclusions as the base feature set, plus Embarked for this ablation.
excluded_columns = ['Name', 'Cabin', 'Ticket', 'Embarked']
train_x, test_x = (
    frame.drop(excluded_columns, axis=1) for frame in (train_x, test_x)
)

base_path = '../features/'

# Only the pre-computed Sex feature is loaded for this variant.
train_x = load_feather(train_x, base_path + 'sex_train.feather', 'Sex')
test_x = load_feather(test_x, base_path + 'sex_test.feather', 'Sex')

In [15]:
# Train the "without Embarked" variant with the wrapper's default settings.
# NOTE(review): the results are stored under pred_train1 / preds_test1, the
# same names the base model used, so the base predictions are overwritten
# from this point on.
model = ModelXGB()
pred_train1, preds_test1 = get_pred_result(model, train_x, train_y, test_x)

[0]	train-logloss:0.54220	eval-logloss:0.59055
[1]	train-logloss:0.45272	eval-logloss:0.54021
[2]	train-logloss:0.39321	eval-logloss:0.51694
[3]	train-logloss:0.35435	eval-logloss:0.50316
[4]	train-logloss:0.32421	eval-logloss:0.49580
[5]	train-logloss:0.30211	eval-logloss:0.49630
[6]	train-logloss:0.28496	eval-logloss:0.49979
[7]	train-logloss:0.27057	eval-logloss:0.50552
[8]	train-logloss:0.25138	eval-logloss:0.50684
[9]	train-logloss:0.23777	eval-logloss:0.51587
[0]	train-logloss:0.54673	eval-logloss:0.58209
[1]	train-logloss:0.46331	eval-logloss:0.52585
[2]	train-logloss:0.40891	eval-logloss:0.49481
[3]	train-logloss:0.36968	eval-logloss:0.47818
[4]	train-logloss:0.34233	eval-logloss:0.46233
[5]	train-logloss:0.31821	eval-logloss:0.46253
[6]	train-logloss:0.29912	eval-logloss:0.46465
[7]	train-logloss:0.28159	eval-logloss:0.46842
[8]	train-logloss:0.27014	eval-logloss:0.45922
[9]	train-logloss:0.25800	eval-logloss:0.46113
[0]	train-logloss:0.55764	eval-logloss:0.56199
[1]	train-log

In [16]:
# pred_train1 now refers to the "without Embarked" run (reassigned in the
# training cell just above), not to the base model.
get_acc_and_logloss(pred_train1, train_y)

accuracy: 0.8181818182
logloss: 0.4407730976


### Columns with AgeGroup

In [17]:
# Re-label the following runs as the variant that adds the AgeGroup feature.
# NOTE(review): consumer of this environment variable is not visible here.
os.environ['model_name'] = 'MODEL WITH AGEGROUP'

In [18]:
train_x, train_y, test_x = load_data()

# Name, Cabin and Ticket are left out, exactly as in the base feature set.
unused_columns = ['Name', 'Cabin', 'Ticket']
train_x = train_x.drop(unused_columns, axis=1)
test_x = test_x.drop(unused_columns, axis=1)

base_path = '../features/'

# (train file, test file, column) for each pre-computed feature, applied in
# order; AgeGroup is the addition relative to the base model.
# NOTE(review): load_feather is a project helper; presumably it fills the
# named column from the feather file — confirm in features/funcs.
feature_files = [
    ('sex_train.feather', 'sex_test.feather', 'Sex'),
    ('embarked_train.feather', 'embarked_test.feather', 'Embarked'),
    ('age_train.feather', 'age_test.feather', 'AgeGroup'),
]
for train_file, test_file, column in feature_files:
    train_x = load_feather(train_x, base_path + train_file, column)
    test_x = load_feather(test_x, base_path + test_file, column)

In [19]:
# Train on the AgeGroup feature set with the wrapper's default settings.
# NOTE(review): the training log and the accuracy / logloss recorded for this
# run are identical to the base model's, digit for digit. Either AgeGroup is
# not reaching the model or these outputs are stale — verify that 'AgeGroup'
# is present in train_x.columns and re-run.
model = ModelXGB()
pred_train, preds_test = get_pred_result(model, train_x, train_y, test_x)

[0]	train-logloss:0.54032	eval-logloss:0.58476
[1]	train-logloss:0.45127	eval-logloss:0.53676
[2]	train-logloss:0.39128	eval-logloss:0.50857
[3]	train-logloss:0.34878	eval-logloss:0.49415
[4]	train-logloss:0.31972	eval-logloss:0.49030
[5]	train-logloss:0.29561	eval-logloss:0.48866
[6]	train-logloss:0.27656	eval-logloss:0.49654
[7]	train-logloss:0.26086	eval-logloss:0.50617
[8]	train-logloss:0.24263	eval-logloss:0.50667
[9]	train-logloss:0.22808	eval-logloss:0.51274
[0]	train-logloss:0.54673	eval-logloss:0.58209
[1]	train-logloss:0.46331	eval-logloss:0.52585
[2]	train-logloss:0.40880	eval-logloss:0.49644
[3]	train-logloss:0.36922	eval-logloss:0.47830
[4]	train-logloss:0.33897	eval-logloss:0.46656
[5]	train-logloss:0.31576	eval-logloss:0.46058
[6]	train-logloss:0.29641	eval-logloss:0.45693
[7]	train-logloss:0.28004	eval-logloss:0.45260
[8]	train-logloss:0.26846	eval-logloss:0.44551
[9]	train-logloss:0.25781	eval-logloss:0.44293
[0]	train-logloss:0.55261	eval-logloss:0.56436
[1]	train-log

In [20]:
# Score the AgeGroup variant's predictions for the training rows.
get_acc_and_logloss(pred_train, train_y)

accuracy: 0.8181818182
logloss: 0.4377829715


#### Different params (num_round=15)

In [35]:
# Re-label the following runs as the AgeGroup variant with a changed number of
# boosting rounds.
# NOTE(review): consumer of this environment variable is not visible here.
os.environ['model_name'] = 'MODEL WITH AGEGROUP WITH NUM ROUND CHANGED'

In [36]:
# Same feature set as the AgeGroup run, but with num_round=15 and per-round
# logging switched off.
# NOTE(review): this cell builds no features of its own; it depends on
# train_x / train_y / test_x still holding the AgeGroup data from the cells
# above. The execution counters also jump here, so do a Restart & Run All
# before trusting the numbers.
model = ModelXGB(num_round=15, verbose_eval=False)
pred_train, preds_test = get_pred_result(model, train_x, train_y, test_x)

In [38]:
# Score the num_round=15 run on the training rows.
# NOTE(review): the output recorded for this cell also shows data size and
# correct-prediction count, which the earlier evaluation cells do not; the
# helper was presumably changed between runs, so earlier outputs may be stale.
get_acc_and_logloss(pred_train, train_y)

data size: 891
correct predictions: 725
accuracy: 0.8136924804
logloss: 0.4540425585
