In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import os
import numpy as np, pandas as pd
import matplotlib.pyplot as plt, seaborn as sns
from tqdm import tqdm, tqdm_notebook
from pathlib import Path
# pd.set_option('display.max_columns', 1000)
# pd.set_option('display.max_rows', 400)
sns.set()

os.chdir('..')

In [2]:
DATA = Path('data')
RAW  = DATA/'raw'
SUBMISSIONS = DATA/'submissions'

In [4]:
train      = pd.read_csv(RAW/'train_jqd04QH.csv', low_memory=False)
test       = pd.read_csv(RAW/'test_GYi4Gz5.csv', low_memory=False)
submission = pd.read_csv(RAW/'sample_submission_sxfcbdx.csv', low_memory=False)

In [5]:
id_col = 'enrollee_id'
cat_cols = ['city', 'gender', 'relevent_experience', 'enrolled_university', 
            'education_level', 'major_discipline', 'experience', 'company_size', 
            'company_type', 'last_new_job']
num_cols = ['city_development_index', 'training_hours']
target_col = 'target'

In [6]:
train.isnull().sum(axis=0)

enrollee_id                  0
city                         0
city_development_index       0
gender                    4098
relevent_experience          0
enrolled_university        342
education_level            457
major_discipline          2838
experience                  59
company_size              4779
company_type              5039
last_new_job               367
training_hours               0
target                       0
dtype: int64

In [7]:
train.shape

(18359, 14)

## Preprocessing

In [10]:
%%time
def get_dict(labels):
    return {label: idx for idx, label in enumerate(labels)}

labels = {
    'enrolled_university': get_dict(['no_enrollment', 'Part time course',
                                     'Full time course']),
    'education_level': get_dict(['Primary School', 'High School', 
                                 'Graduate', 'Masters', 'Phd']),
    'experience': get_dict(['<1'] + [str(x) for x in range(1,21)] + ['>20']),
    'company_size': get_dict(['<10', '10/49', '50-99', '100-500', '500-999', 
                              '1000-4999', '5000-9999', '10000+']),
    'last_new_job': get_dict([str(x) for x in range(1,5)] + ['>4', 'never'])
}

for col in labels:
    train[col] = train[col].map(labels[col])
    test[col] = test[col].map(labels[col])
    
from src.utils import to_cat_codes, apply_cats
to_cat_codes(train, [c for c in cat_cols if c not in labels])
apply_cats(test, train)
for col in cat_cols: 
    if col not in labels:
        train[col] = train[col].cat.codes
        test[col] = test[col].cat.codes

CPU times: user 72 ms, sys: 0 ns, total: 72 ms
Wall time: 71.3 ms


In [11]:
%%time
train.fillna(-1, inplace=True)
test.fillna(-1, inplace=True)

nan_cols = [c for c in cat_cols if \
            any(df[c].min() < 0 for df in [train, test])]

for c in nan_cols:
    train[c] = train[c] + 1
    test[c] = test[c] + 1

CPU times: user 32 ms, sys: 8 ms, total: 40 ms
Wall time: 115 ms


In [12]:
%%time
# Num features
for col in labels:
    train[f'{col}_num'] = train[col]
    test[f'{col}_num'] = test[col]
    num_cols.append(f'{col}_num')

CPU times: user 4 ms, sys: 0 ns, total: 4 ms
Wall time: 6.79 ms


In [15]:
%%time
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler().fit(pd.concat([train[num_cols], 
                                         test[num_cols]]))

def scale_features(df, scaler, num_cols):
    scaled = scaler.transform(df[num_cols])
    for i, col in enumerate(num_cols):
        df[col] = scaled[:,i]
        
scale_features(train, scaler, num_cols)
scale_features(test, scaler, num_cols)

CPU times: user 44 ms, sys: 28 ms, total: 72 ms
Wall time: 518 ms


## Validation

In [16]:
from catboost import CatBoostClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

In [18]:
columns = list(train.drop([id_col, target_col], axis=1))
cat_indices = [i for i, col in enumerate(columns) if col in cat_cols]

In [54]:
kfolds = StratifiedKFold(n_splits=5, random_state=42)

In [55]:
def eval_model(model, X, y, kfolds):
    trn_aucs, val_aucs = [], []
    for trn_idx, val_idx in kfolds.split(X, y):
        X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]
        X_trn, y_trn = X.iloc[trn_idx], y.iloc[trn_idx]
        model.fit(X_trn, y_trn, eval_set=[(X_val, y_val)],
                  use_best_model=True, cat_features=cat_indices, 
                  verbose=False)
        y_trn_pred = model.predict_proba(X_trn)[:,1]
        y_val_pred = model.predict_proba(X_val)[:,1]
        trn_aucs.append(roc_auc_score(y_trn, y_trn_pred))
        val_aucs.append(roc_auc_score(y_val, y_val_pred))
        print(f'No. estimators: {model.tree_count_} | '
              f'Train AUC: {100*trn_aucs[-1]:.2f} | '
              f'Val AUC: {100*val_aucs[-1]:.2f}')
    print()
    return trn_aucs, val_aucs

In [56]:
def print_results(trn_aucs, val_aucs):
    print(f'{100*np.mean(trn_aucs):.2f} +/- {200*np.std(trn_aucs):.2f} | '
          f'{100*np.mean(val_aucs):.2f} +/- {200*np.std(val_aucs):.2f}')

In [59]:
%%time
trn_aucs, val_aucs = eval_model(CatBoostClassifier(iterations=1000, od_pval=0.001, 
                                eval_metric='AUC', random_seed=42,
                                depth=10),
                     train.drop([id_col, target_col], axis=1), train[target_col], kfolds)

No. estimators: 341 | Train AUC: 72.24 | Val AUC: 68.90
No. estimators: 573 | Train AUC: 76.26 | Val AUC: 64.35
No. estimators: 462 | Train AUC: 73.63 | Val AUC: 66.23
No. estimators: 520 | Train AUC: 74.49 | Val AUC: 67.82
No. estimators: 370 | Train AUC: 73.26 | Val AUC: 68.23

CPU times: user 28min 45s, sys: 1min 21s, total: 30min 6s
Wall time: 2min 46s


In [60]:
# 5 folds, depth 10
print_results(trn_aucs, val_aucs)

73.98 +/- 2.70 | 67.11 +/- 3.27


In [53]:
# 10 folds
print_results(trn_aucs, val_aucs)

71.47 +/- 2.91 | 67.30 +/- 4.23


In [48]:
# 5 folds
print_results(trn_aucs, val_aucs)

71.66 +/- 2.83 | 67.00 +/- 3.29
