In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import os
import numpy as np, pandas as pd
import matplotlib.pyplot as plt, seaborn as sns
from tqdm import tqdm, tqdm_notebook
from pathlib import Path
# pd.set_option('display.max_columns', 1000)
# pd.set_option('display.max_rows', 400)
sns.set()

os.chdir('..')

In [2]:
DATA = Path('data')
RAW  = DATA/'raw'
SUBMISSIONS = DATA/'submissions'

In [3]:
train      = pd.read_csv(RAW/'train_jqd04QH.csv', low_memory=False)
test       = pd.read_csv(RAW/'test_GYi4Gz5.csv', low_memory=False)
submission = pd.read_csv(RAW/'sample_submission_sxfcbdx.csv', low_memory=False)

In [4]:
id_col = 'enrollee_id'
cat_cols = ['city', 'gender', 'relevent_experience', 'enrolled_university', 
            'education_level', 'major_discipline', 'experience', 'company_size', 
            'company_type', 'last_new_job']
num_cols = ['city_development_index', 'training_hours']
target_col = 'target'

In [6]:
from src.utils import preprocess

In [7]:
%%time
preprocess(train, test, cat_cols, num_cols)

CPU times: user 108 ms, sys: 24 ms, total: 132 ms
Wall time: 127 ms


In [10]:
from lightgbm import LGBMClassifier
from sklearn.model_selection import StratifiedKFold

In [11]:
columns = list(train.drop([id_col, target_col], axis=1))
cat_indices = [i for i, col in enumerate(columns) if col in cat_cols]

In [12]:
kfolds = StratifiedKFold(n_splits=5, random_state=42)

In [13]:
from src.utils import eval_gbm, print_results

In [31]:
%%time
trn_aucs, val_aucs = eval_gbm(LGBMClassifier(n_estimators=1000, 
                        random_state=42, max_depth=5),
                    train.drop([id_col, target_col], axis=1), train[target_col], kfolds)

No. estimators: 17 | Train AUC: 70.11 | Val AUC: 67.97
No. estimators: 15 | Train AUC: 70.72 | Val AUC: 64.58
No. estimators: 16 | Train AUC: 70.63 | Val AUC: 65.52
No. estimators: 42 | Train AUC: 74.77 | Val AUC: 66.78
No. estimators: 20 | Train AUC: 70.56 | Val AUC: 67.62

CPU times: user 6.86 s, sys: 32 ms, total: 6.9 s
Wall time: 669 ms


In [32]:
# max_depth 5
print_results(trn_aucs, val_aucs)

71.36 +/- 3.44 | 66.49 +/- 2.55


In [28]:
# max_depth 10
print_results(trn_aucs, val_aucs)

73.03 +/- 8.27 | 66.42 +/- 2.46


In [26]:
print_results(trn_aucs, val_aucs)

72.70 +/- 6.57 | 66.38 +/- 2.41


## Random Forest

In [35]:
from src.utils import eval_tree
from sklearn.ensemble import RandomForestClassifier

In [72]:
%%time
trn_aucs, val_aucs = eval_tree(RandomForestClassifier(n_estimators=200, 
                        random_state=42, max_depth=7, n_jobs=-1),
                    train.drop([id_col, target_col], axis=1), train[target_col], kfolds)

Train AUC: 74.58 | Val AUC: 68.76
Train AUC: 75.43 | Val AUC: 64.34
Train AUC: 74.85 | Val AUC: 65.94
Train AUC: 74.81 | Val AUC: 67.64
Train AUC: 74.91 | Val AUC: 67.07

CPU times: user 13.3 s, sys: 576 ms, total: 13.9 s
Wall time: 3.3 s


In [54]:
# n_estimators 200 | max_depth 7
print_results(trn_aucs, val_aucs)

74.91 +/- 0.56 | 66.75 +/- 3.02


In [46]:
# n_estimators 200 | max_depth 6
print_results(trn_aucs, val_aucs)

72.05 +/- 0.55 | 66.71 +/- 3.04


## Extra Trees

In [74]:
from sklearn.ensemble import ExtraTreesClassifier

In [95]:
%%time
trn_aucs, val_aucs = eval_tree(ExtraTreesClassifier(n_estimators=400, 
                        random_state=42, max_depth=8, n_jobs=-1),
                    train.drop([id_col, target_col], axis=1), train[target_col], kfolds)

Train AUC: 75.66 | Val AUC: 67.67
Train AUC: 76.51 | Val AUC: 63.85
Train AUC: 75.89 | Val AUC: 65.28
Train AUC: 75.89 | Val AUC: 67.96
Train AUC: 76.21 | Val AUC: 66.82

CPU times: user 20 s, sys: 936 ms, total: 20.9 s
Wall time: 5.03 s


In [84]:
# n_estimators 200 | max_depth 8
print_results(trn_aucs, val_aucs)

75.97 +/- 0.60 | 66.34 +/- 2.96
