In [1]:
# Reload imported modules automatically before each cell executes, so edits
# to the project's own .py helpers are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook, at high-DPI ("retina")
# resolution.
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import os
import numpy as np, pandas as pd
import matplotlib.pyplot as plt, seaborn as sns
from tqdm import tqdm, tqdm_notebook
from pathlib import Path
# pd.set_option('display.max_columns', 1000)
# pd.set_option('display.max_rows', 400)
# Apply seaborn's default plot theme.
sns.set()

def enter_project_root(marker='src'):
    """Step up one directory, unless the project root is already the cwd.

    Later cells use paths relative to the working directory (the ``data``
    folder) and import from the ``src`` package, so the notebook is expected
    to sit one level below the project root. A bare ``os.chdir('..')`` climbs
    one more level every time this cell is re-run, which silently breaks
    those cells; checking for ``marker`` first makes the cell idempotent.

    Parameters
    ----------
    marker : str
        Name of a directory whose presence in the current working directory
        means we are already at the project root.
    """
    if not Path(marker).is_dir():
        os.chdir('..')


enter_project_root()

In [2]:
# Data directory layout, relative to the current working directory.
DATA = Path('data')
RAW, SUBMISSIONS = (DATA.joinpath(sub) for sub in ('raw', 'submissions'))

In [3]:
# Load the train, test and sample-submission CSVs from the raw data folder.
# low_memory=False makes pandas infer each column's dtype from the whole file
# rather than chunk by chunk.
train, test, submission = (
    pd.read_csv(RAW.joinpath(csv_name), low_memory=False)
    for csv_name in ('train_jqd04QH.csv',
                     'test_GYi4Gz5.csv',
                     'sample_submission_sxfcbdx.csv')
)

In [4]:
# Column roles used throughout the notebook.
id_col, target_col = 'enrollee_id', 'target'

# Numeric features.
num_cols = ['city_development_index', 'training_hours']

# Categorical features.
# NOTE(review): 'relevent_experience' is kept exactly as spelled; presumably it
# matches the CSV header, so verify before "correcting" it.
cat_cols = [
    'city',
    'gender',
    'relevent_experience',
    'enrolled_university',
    'education_level',
    'major_discipline',
    'experience',
    'company_size',
    'company_type',
    'last_new_job',
]

In [6]:
from src.utils import preprocess

In [7]:
%%time
# Project helper; its return value is discarded and `train` is used directly by
# the cells that follow, so it presumably transforms both frames in place.
# TODO confirm against the helper's source.
preprocess(train, test, cat_cols, num_cols)

CPU times: user 108 ms, sys: 24 ms, total: 132 ms
Wall time: 127 ms


In [10]:
from lightgbm import LGBMClassifier
from sklearn.model_selection import StratifiedKFold

In [11]:
# Feature names in model-input order (every column except the id and target),
# and the positions of the categorical ones within that ordering.
columns = train.drop(columns=[id_col, target_col]).columns.tolist()
cat_indices = [pos for pos, name in enumerate(columns) if name in cat_cols]

In [12]:
# 5-fold stratified CV splitter, passed to every model evaluation in this
# notebook so that all models are scored on the same folds.
# `random_state` was removed: it has no effect unless shuffle=True, and
# scikit-learn >= 0.24 raises ValueError when it is set without shuffling.
# With shuffle=False the splits are deterministic, and identical to what the
# previous call produced on versions that accepted it.
kfolds = StratifiedKFold(n_splits=5, shuffle=False)

## LightGBM

In [182]:
from src.utils import eval_lgbm, print_results
import lightgbm as lgbm

In [201]:
%%time
# LGBMClassifier(n_estimators=1000, random_state=42, max_depth=5)
# NOTE(review): the line above is a leftover; eval_lgbm is given the `lgbm`
# module itself, so the hyper-parameters are presumably set inside the helper
# (confirm in src/utils.py).
# Cross-validates LightGBM on all features except id/target using `kfolds`;
# `cat_indices` gives the positions of the categorical columns. The two return
# values are summarised by print_results in the following cells.
trn_aucs, val_aucs = eval_lgbm(lgbm,
                    train.drop([id_col, target_col], axis=1), 
                    train[target_col], kfolds, cat_indices)



No. estimators: 22 | Train AUC: 72.46 | Val AUC: 68.34
No. estimators: 11 | Train AUC: 70.87 | Val AUC: 64.65
No. estimators: 13 | Train AUC: 70.94 | Val AUC: 65.98
No. estimators: 12 | Train AUC: 71.34 | Val AUC: 66.63
No. estimators: 18 | Train AUC: 71.49 | Val AUC: 68.46

CPU times: user 916 ms, sys: 4 ms, total: 920 ms
Wall time: 913 ms


In [202]:
# max_depth 5
# Summary of the eval_lgbm run directly above, printed as "train | val".
# The recorded figures equal mean +/- 2 * (population) std of the five fold AUCs.
print_results(trn_aucs, val_aucs)

71.42 +/- 1.14 | 66.81 +/- 2.89


In [124]:
# max_depth 10
# NOTE(review): stale cell. Its execution count (124) predates the eval_lgbm
# run shown above (201), and no call with this setting remains in the notebook,
# so the recorded output is not reproducible by running the cells in order.
print_results(trn_aucs, val_aucs)

73.33 +/- 4.22 | 66.78 +/- 2.82


In [26]:
# NOTE(review): stale cell (execution count 26) with no record of the settings
# that produced its output; re-running it just repeats the latest summary.
print_results(trn_aucs, val_aucs)

72.70 +/- 6.57 | 66.38 +/- 2.41


## Random Forest

In [35]:
from src.utils import eval_tree
from sklearn.ensemble import RandomForestClassifier

In [72]:
%%time
# Cross-validate a random forest (200 trees, depth capped at 7, seeded, using
# all CPU cores via n_jobs=-1) on every feature except id/target, with the
# shared `kfolds` splitter. The helper logs one train/val AUC line per fold.
trn_aucs, val_aucs = eval_tree(RandomForestClassifier(n_estimators=200, 
                        random_state=42, max_depth=7, n_jobs=-1),
                    train.drop([id_col, target_col], axis=1), train[target_col], kfolds)

Train AUC: 74.58 | Val AUC: 68.76
Train AUC: 75.43 | Val AUC: 64.34
Train AUC: 74.85 | Val AUC: 65.94
Train AUC: 74.81 | Val AUC: 67.64
Train AUC: 74.91 | Val AUC: 67.07

CPU times: user 13.3 s, sys: 576 ms, total: 13.9 s
Wall time: 3.3 s


In [54]:
# n_estimators 200 | max_depth 7
# Same configuration as the RandomForest call above; the recorded values agree
# with the mean of the five fold AUCs that run printed.
print_results(trn_aucs, val_aucs)

74.91 +/- 0.56 | 66.75 +/- 3.02


In [46]:
# n_estimators 200 | max_depth 6
# NOTE(review): stale cell. The output comes from an earlier run (execution
# count 46); the RandomForest call above now uses max_depth=7.
print_results(trn_aucs, val_aucs)

72.05 +/- 0.55 | 66.71 +/- 3.04


## Extra Trees

In [74]:
from sklearn.ensemble import ExtraTreesClassifier

In [95]:
%%time
# Cross-validate an extra-trees ensemble (400 trees, depth capped at 8, seeded,
# using all CPU cores via n_jobs=-1) on every feature except id/target, with
# the shared `kfolds` splitter. The helper logs one train/val AUC line per fold.
trn_aucs, val_aucs = eval_tree(ExtraTreesClassifier(n_estimators=400, 
                        random_state=42, max_depth=8, n_jobs=-1),
                    train.drop([id_col, target_col], axis=1), train[target_col], kfolds)

Train AUC: 75.66 | Val AUC: 67.67
Train AUC: 76.51 | Val AUC: 63.85
Train AUC: 75.89 | Val AUC: 65.28
Train AUC: 75.89 | Val AUC: 67.96
Train AUC: 76.21 | Val AUC: 66.82

CPU times: user 20 s, sys: 936 ms, total: 20.9 s
Wall time: 5.03 s


In [84]:
# n_estimators 200 | max_depth 8
# NOTE(review): stale cell. The ExtraTrees call above now uses n_estimators=400
# and its fold AUCs average about 76.03 (train), not the 75.97 recorded here,
# so this output is from an earlier 200-tree run (execution count 84 < 95).
print_results(trn_aucs, val_aucs)

75.97 +/- 0.60 | 66.34 +/- 2.96


## XGBoost

In [143]:
from src.utils import eval_xgb
import xgboost as xgb

In [166]:
%%time
# Cross-validate XGBoost on every feature except id/target with the shared
# `kfolds` splitter. The helper receives the `xgb` module itself, so the
# hyper-parameters are presumably set inside it (confirm in src/utils.py).
# It logs an estimator count and train/val AUC per fold.
trn_aucs, val_aucs = eval_xgb(xgb,
                    train.drop([id_col, target_col], axis=1),
                    train[target_col], kfolds)

No. estimators: 15 | Train AUC: 68.32 | Val AUC: 68.00
No. estimators: 38 | Train AUC: 72.18 | Val AUC: 64.42
No. estimators: 16 | Train AUC: 69.48 | Val AUC: 66.02
No. estimators: 43 | Train AUC: 72.26 | Val AUC: 66.56
No. estimators: 13 | Train AUC: 68.08 | Val AUC: 67.74

CPU times: user 1.75 s, sys: 0 ns, total: 1.75 s
Wall time: 1.75 s


In [167]:
# n_estimators 1000 | max_depth 3
# These settings do not appear in the eval_xgb call above, so presumably they
# are configured inside the helper (confirm). The recorded values agree with
# the mean of the five fold AUCs that run printed.
print_results(trn_aucs, val_aucs)

70.07 +/- 3.65 | 66.55 +/- 2.58


In [148]:
# n_estimators 1000 | max_depth 6
# NOTE(review): stale cell. Its execution count (148) predates the eval_xgb run
# shown above (166), so the output belongs to an earlier configuration.
print_results(trn_aucs, val_aucs)

73.44 +/- 4.55 | 66.10 +/- 2.47


## CatBoost

In [168]:
from src.utils import eval_catboost
from catboost import CatBoostClassifier

In [180]:
%%time
# Cross-validate CatBoost (up to 1000 iterations, depth 7, seeded, AUC as the
# evaluation metric) on every feature except id/target with the shared
# `kfolds` splitter. od_pval sets the threshold of CatBoost's overfitting
# detector; the per-fold log reports fewer than 1000 estimators.
# `cat_indices` gives the positions of the categorical columns.
trn_aucs, val_aucs = eval_catboost(CatBoostClassifier(iterations=1000, 
                        random_seed=42, eval_metric='AUC', od_pval=1e-3,
                        depth=7),
                    train.drop([id_col, target_col], axis=1), 
                    train[target_col], kfolds, cat_indices)

No. estimators: 489 | Train AUC: 71.69 | Val AUC: 68.96
No. estimators: 568 | Train AUC: 72.72 | Val AUC: 64.28
No. estimators: 366 | Train AUC: 71.03 | Val AUC: 66.64
No. estimators: 429 | Train AUC: 71.04 | Val AUC: 67.69
No. estimators: 409 | Train AUC: 71.93 | Val AUC: 68.16

CPU times: user 12min 17s, sys: 51.3 s, total: 13min 8s
Wall time: 1min 20s


In [181]:
# n_estimators 1000 | max_depth 7
# Corresponds to iterations=1000, depth=7 in the CatBoost call above; the
# recorded values agree with the mean of the five fold AUCs that run printed.
print_results(trn_aucs, val_aucs)

71.68 +/- 1.26 | 67.15 +/- 3.23


In [173]:
# n_estimators 1000 | max_depth 10
# NOTE(review): stale cell. Its execution count (173) predates the CatBoost run
# shown above (180), which uses depth=7; the output is from an earlier run.
print_results(trn_aucs, val_aucs)

73.98 +/- 2.70 | 67.11 +/- 3.27


In [171]:
# n_estimators 1000 | max_depth 6
# NOTE(review): stale cell. Its execution count (171) predates the CatBoost run
# shown above (180), which uses depth=7; the output is from an earlier run.
print_results(trn_aucs, val_aucs)

71.66 +/- 2.83 | 67.00 +/- 3.29


## Neural Net

In [209]:
from src.utils import eval_neuralnet
from src.neuralnet import StructuredNet

In [217]:
# Passed as the last argument to eval_neuralnet; presumably switches training
# to the GPU. NOTE(review): hard-coded True likely fails on a machine without
# CUDA. Confirm how the helper handles that case.
USE_CUDA = True

In [227]:
%%time
# Cross-validate the neural net on every feature except id/target with the
# shared `kfolds` splitter. Unlike the tree-model helpers, this one is given
# the categorical and numeric column *names* rather than positions. For each
# fold it logs train/val AUC per epoch followed by a "Best epoch" line.
trn_aucs, val_aucs = eval_neuralnet(
                train.drop([id_col, target_col], axis=1), 
                train[target_col], kfolds, cat_cols, num_cols, USE_CUDA)

Epoch: 1 | Train AUC: 67.05 | Val AUC: 66.30
Epoch: 2 | Train AUC: 68.04 | Val AUC: 65.89
Epoch: 3 | Train AUC: 68.51 | Val AUC: 67.39
Epoch: 4 | Train AUC: 69.80 | Val AUC: 66.85
Epoch: 5 | Train AUC: 70.71 | Val AUC: 67.44
Epoch: 6 | Train AUC: 71.76 | Val AUC: 66.89
Epoch: 7 | Train AUC: 70.96 | Val AUC: 66.66
Best epoch: 5 | Train AUC: 70.71 | Val AUC: 67.44

Epoch: 1 | Train AUC: 66.57 | Val AUC: 61.91
Epoch: 2 | Train AUC: 69.79 | Val AUC: 64.08
Epoch: 3 | Train AUC: 69.64 | Val AUC: 63.50
Epoch: 4 | Train AUC: 68.95 | Val AUC: 63.69
Epoch: 5 | Train AUC: 71.01 | Val AUC: 63.01
Epoch: 6 | Train AUC: 71.88 | Val AUC: 62.96
Epoch: 7 | Train AUC: 72.65 | Val AUC: 63.25
Best epoch: 2 | Train AUC: 69.79 | Val AUC: 64.08

Epoch: 1 | Train AUC: 65.11 | Val AUC: 59.24
Epoch: 2 | Train AUC: 68.55 | Val AUC: 63.00
Epoch: 3 | Train AUC: 69.71 | Val AUC: 63.99
Epoch: 4 | Train AUC: 70.23 | Val AUC: 64.15
Epoch: 5 | Train AUC: 70.37 | Val AUC: 63.82
Epoch: 6 | Train AUC: 71.72 | Val AUC: 63.9

In [228]:
# szs [1000,500]
# NOTE(review): presumably the network's hidden-layer sizes. They are not
# visible in the eval_neuralnet call, so confirm where they are configured.
print_results(trn_aucs, val_aucs)

70.30 +/- 1.83 | 65.86 +/- 3.01
