In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import os
import numpy as np, pandas as pd
import matplotlib.pyplot as plt, seaborn as sns
from tqdm import tqdm, tqdm_notebook
from pathlib import Path
# pd.set_option('display.max_columns', 1000)
# pd.set_option('display.max_rows', 400)
sns.set()

os.chdir('..')

In [53]:
DATA = Path('data')
RAW  = DATA/'raw'
SUBMISSIONS = DATA/'submissions'

In [7]:
train      = pd.read_csv(RAW/'train_jqd04QH.csv', low_memory=False)
test       = pd.read_csv(RAW/'test_GYi4Gz5.csv', low_memory=False)
submission = pd.read_csv(RAW/'sample_submission_sxfcbdx.csv', low_memory=False)

In [4]:
train.head()

Unnamed: 0,enrollee_id,city,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours,target
0,23798,city_149,0.689,Male,Has relevent experience,no_enrollment,Graduate,STEM,3,100-500,Pvt Ltd,1,106,0
1,29166,city_83,0.923,Male,Has relevent experience,no_enrollment,Graduate,STEM,14,<10,Funded Startup,1,69,0
2,46,city_16,0.91,,Has relevent experience,no_enrollment,Graduate,STEM,6,50-99,Public Sector,2,4,0
3,18527,city_64,0.666,Male,Has relevent experience,no_enrollment,Graduate,STEM,14,50-99,Pvt Ltd,1,26,0
4,21751,city_100,0.887,,No relevent experience,no_enrollment,Masters,STEM,8,,,2,88,1


In [8]:
test.head()

Unnamed: 0,enrollee_id,city,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours
0,16548,city_33,0.448,,No relevent experience,Full time course,Graduate,STEM,<1,1000-4999,Public Sector,,15
1,12036,city_28,0.939,Male,No relevent experience,no_enrollment,Graduate,STEM,>20,50-99,Pvt Ltd,1.0,94
2,11061,city_103,0.92,Male,No relevent experience,Full time course,Graduate,STEM,3,,,1.0,17
3,5032,city_104,0.924,Male,No relevent experience,no_enrollment,Phd,STEM,>20,50-99,Pvt Ltd,2.0,76
4,17599,city_77,0.83,Male,Has relevent experience,no_enrollment,Graduate,STEM,6,<10,Pvt Ltd,2.0,65


In [6]:
train.target.value_counts()

0    15934
1     2425
Name: target, dtype: int64

In [23]:
id_col = 'enrollee_id'
cat_cols = ['city', 'gender', 'relevent_experience', 'enrolled_university', 
            'education_level', 'major_discipline', 'experience', 'company_size', 
            'company_type', 'last_new_job']
num_cols = ['city_development_index', 'training_hours']
target_col = 'target'

In [13]:
from src.utils import to_cat_codes, apply_cats

In [14]:
%%time
to_cat_codes(train, cat_cols)
apply_cats(test, train)

CPU times: user 32.4 ms, sys: 30 µs, total: 32.4 ms
Wall time: 33 ms


In [19]:
for col in cat_cols:
    train[col] = train[col].cat.codes
    test[col] = test[col].cat.codes

In [35]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

In [24]:
kfolds = StratifiedKFold(n_splits=5, random_state=42)

In [36]:
def run_model(model, X_trn, y_trn, X_tst, kfolds):
    y_tests = []
    for trn_idx, val_idx in kfolds.split(X_trn, y_trn):
        model.fit(X_trn.iloc[trn_idx], y_trn.iloc[trn_idx], 
                  eval_set=[(X_trn.iloc[val_idx], y_trn.iloc[val_idx])],
                  early_stopping_rounds=100,
                  eval_metric='auc')
        y_trn_pred = model.predict_proba(X_trn.iloc[trn_idx])[:,1]
        y_val_pred = model.predict_proba(X_trn.iloc[val_idx])[:,1]
        print(f'Train AUC: {roc_auc_score(y_trn.iloc[trn_idx], y_trn_pred):.4f}')
        print(f'Val AUC: {roc_auc_score(y_trn.iloc[val_idx], y_val_pred):.4f}')        
        y_tests.append(model.predict_proba(X_tst)[:,1])
        print()
        
    return y_tests

In [33]:
from lightgbm import LGBMClassifier

In [37]:
y_tests = run_model(LGBMClassifier(n_estimators=1000),
                    train.drop([id_col, target_col], axis=1), train[target_col],
                    test.drop(id_col, axis=1), kfolds)

[1]	valid_0's auc: 0.657752
Training until validation scores don't improve for 100 rounds.
[2]	valid_0's auc: 0.656392
[3]	valid_0's auc: 0.669592
[4]	valid_0's auc: 0.671015
[5]	valid_0's auc: 0.669113
[6]	valid_0's auc: 0.668328
[7]	valid_0's auc: 0.669342
[8]	valid_0's auc: 0.673104
[9]	valid_0's auc: 0.675274
[10]	valid_0's auc: 0.674278
[11]	valid_0's auc: 0.675091
[12]	valid_0's auc: 0.674707
[13]	valid_0's auc: 0.674526
[14]	valid_0's auc: 0.673845
[15]	valid_0's auc: 0.672048
[16]	valid_0's auc: 0.670803
[17]	valid_0's auc: 0.670472
[18]	valid_0's auc: 0.670595
[19]	valid_0's auc: 0.670895
[20]	valid_0's auc: 0.670143
[21]	valid_0's auc: 0.67048
[22]	valid_0's auc: 0.669835
[23]	valid_0's auc: 0.669933
[24]	valid_0's auc: 0.67019
[25]	valid_0's auc: 0.670727
[26]	valid_0's auc: 0.670068
[27]	valid_0's auc: 0.669631
[28]	valid_0's auc: 0.670808
[29]	valid_0's auc: 0.669374
[30]	valid_0's auc: 0.670527
[31]	valid_0's auc: 0.669437
[32]	valid_0's auc: 0.669018
[33]	valid_0's auc: 

In [44]:
y_test = np.array(y_tests).sum(axis=0) / 5

In [46]:
test[target_col] = y_test

In [50]:
submission = pd.merge(submission[[id_col]], test[[id_col, target_col]], how='left', on=id_col)

In [51]:
submission.head()

Unnamed: 0,enrollee_id,target
0,16548,0.276542
1,12036,0.091457
2,11061,0.300464
3,5032,0.095307
4,17599,0.115762


In [52]:
submission[target_col].describe()

count    15021.000000
mean         0.132586
std          0.053070
min          0.069913
25%          0.095577
50%          0.107573
75%          0.156957
max          0.367429
Name: target, dtype: float64

In [54]:
submission.to_csv(SUBMISSIONS/'01-lgbm_baseline.csv', index=False)

## CatBoost

In [56]:
columns = list(train.drop([id_col, target_col], axis=1))
cat_indices = [i for i, col in enumerate(columns) if col in cat_cols]

In [62]:
def run_model(model, X_trn, y_trn, X_tst, kfolds):
    y_tests = []
    for trn_idx, val_idx in kfolds.split(X_trn, y_trn):
        model.fit(X_trn.iloc[trn_idx], y_trn.iloc[trn_idx], 
                  eval_set=[(X_trn.iloc[val_idx], y_trn.iloc[val_idx])],
                  use_best_model=True,
                  cat_features=cat_indices)
        y_trn_pred = model.predict_proba(X_trn.iloc[trn_idx])[:,1]
        y_val_pred = model.predict_proba(X_trn.iloc[val_idx])[:,1]
        print(f'Train AUC: {roc_auc_score(y_trn.iloc[trn_idx], y_trn_pred):.4f}')
        print(f'Val AUC: {roc_auc_score(y_trn.iloc[val_idx], y_val_pred):.4f}')        
        y_tests.append(model.predict_proba(X_tst)[:,1])
        print()
        
    return y_tests

In [60]:
from catboost import CatBoostClassifier

In [63]:
y_tests = run_model(CatBoostClassifier(od_pval=0.00001, eval_metric='AUC'),
                    train.drop([id_col, target_col], axis=1), train[target_col],
                    test.drop(id_col, axis=1), kfolds)

Learning rate set to 0.103462
0:	test: 0.5922307	best: 0.5922307 (0)	total: 127ms	remaining: 2m 6s
1:	test: 0.5922307	best: 0.5922307 (0)	total: 152ms	remaining: 1m 16s
2:	test: 0.5922307	best: 0.5922307 (0)	total: 173ms	remaining: 57.5s
3:	test: 0.6240749	best: 0.6240749 (3)	total: 249ms	remaining: 1m 2s
4:	test: 0.6219002	best: 0.6240749 (3)	total: 309ms	remaining: 1m 1s
5:	test: 0.6285461	best: 0.6285461 (5)	total: 332ms	remaining: 55s
6:	test: 0.6269727	best: 0.6285461 (5)	total: 350ms	remaining: 49.7s
7:	test: 0.6432094	best: 0.6432094 (7)	total: 400ms	remaining: 49.7s
8:	test: 0.6429690	best: 0.6432094 (7)	total: 455ms	remaining: 50.1s
9:	test: 0.6438676	best: 0.6438676 (9)	total: 529ms	remaining: 52.4s
10:	test: 0.6424172	best: 0.6438676 (9)	total: 555ms	remaining: 49.9s
11:	test: 0.6504236	best: 0.6504236 (11)	total: 588ms	remaining: 48.4s
12:	test: 0.6537050	best: 0.6537050 (12)	total: 609ms	remaining: 46.3s
13:	test: 0.6539696	best: 0.6539696 (13)	total: 626ms	remaining: 44.1

### LGBM
| Train | Val | Iterations
| ------------------- |
| 0.7050 | 0.6753 | 9
| 0.7156 | 0.6414 | 9
| 0.7442 | 0.6581 | 21
| 0.7332 | 0.6662 | 17
| 0.7201 | 0.6752 | 13

### CatBoost
| Train | Val | Iterations
| ------------------- |
| 0.7012 | 0.6843 | 118
| 0.7199 | 0.6415 | 229
| 0.7017 | 0.6632 | 126
| 0.7145 | 0.6740 | 228
| 0.7032 | 0.6793 | 113

In [64]:
test[target_col] = np.array(y_tests).sum(axis=0) / 5

In [65]:
submission = pd.read_csv(RAW/'sample_submission_sxfcbdx.csv', low_memory=False)
submission = pd.merge(submission[[id_col]], test[[id_col, target_col]], how='left', on=id_col)

In [66]:
submission.head()

Unnamed: 0,enrollee_id,target
0,16548,0.546757
1,12036,0.073884
2,11061,0.391194
3,5032,0.06913
4,17599,0.092532


In [67]:
submission[target_col].describe()

count    15021.000000
mean         0.130130
std          0.071805
min          0.053290
25%          0.079183
50%          0.101023
75%          0.158160
max          0.568715
Name: target, dtype: float64

In [68]:
submission.to_csv(SUBMISSIONS/'02-catboost_baseline.csv', index=False)