In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import os, math
import numpy as np, pandas as pd
import matplotlib.pyplot as plt, seaborn as sns
from pandas_summary import DataFrameSummary
from tqdm import tqdm, tqdm_notebook
from pathlib import Path
# Show wide/long frames in full when they are displayed.
pd.set_option('display.max_columns', 1000)
pd.set_option('display.max_rows', 400)
sns.set()

# Move two levels up so that the `src` package and the relative `data/` paths
# resolve. Guarded on the presence of `src` so that re-running this cell does
# not climb two more directories and break every later import and path.
if not Path('src').is_dir():
    os.chdir('../..')
from src import utils

In [2]:
# Data folders, all relative to the current working directory.
DATA = Path('data')
RAW = DATA.joinpath('raw')                  # source CSV files
INTERIM = DATA.joinpath('interim')
PROCESSED = DATA.joinpath('processed')      # feather dumps
SUBMISSIONS = DATA.joinpath('submissions')  # prediction CSVs

In [3]:
def _read_raw(filename):
    """Read one CSV from the RAW folder with pandas' low_memory mode switched off."""
    return pd.read_csv(RAW/filename, low_memory=False)

challenge  = _read_raw('Challenge_20180423.csv')
customer   = _read_raw('Customer.csv')
isin       = _read_raw('Isin.csv')
submission = _read_raw('sample_submission.csv')
trade      = _read_raw('Trade.csv')

In [4]:
from src.utils import make_val_set

In [5]:
%%time
# Labelled frame built from every trade whose TradeDateKey is 20180416 or later.
week_0416 = make_val_set(
    trade.loc[trade['TradeDateKey'] >= 20180416],
    challenge,
)

CPU times: user 31.1 s, sys: 136 ms, total: 31.3 s
Wall time: 31.3 s


In [6]:
# Class balance of the label column in the 0416 frame.
week_0416['CustomerInterest'].value_counts()

0.0    476311
1.0     17279
Name: CustomerInterest, dtype: int64

In [7]:
# 3.5 % of positive labels in val set
# The count is computed from the frame instead of being pasted from the
# previous cell's output, so the ratio stays correct if the data changes.
(week_0416.CustomerInterest == 1).sum() / len(week_0416)

0.03500678700946129

In [8]:
%%time
# Labelled frame built from trades with 20180409 <= TradeDateKey < 20180416.
week_0409 = make_val_set(
    trade.loc[(trade['TradeDateKey'] >= 20180409) & (trade['TradeDateKey'] < 20180416)],
    challenge,
)

CPU times: user 31.2 s, sys: 100 ms, total: 31.3 s
Wall time: 31.3 s


In [9]:
# Class balance of the label column in the 0409 frame.
week_0409['CustomerInterest'].value_counts()

0.0    475374
1.0     17532
Name: CustomerInterest, dtype: int64

In [10]:
# 3.55 % of positive labels in train set
# The count is computed from the frame instead of being pasted from the
# previous cell's output, so the ratio stays correct if the data changes.
(week_0409.CustomerInterest == 1).sum() / len(week_0409)

0.035568647977504836

In [11]:
# Columns handled as categoricals: they are cast to ordered categories and
# later replaced by their integer codes.
cat_cols = [
    'BuySell',
    'Sector',
    'Subsector',
    'Region_x',
    'Country',
    'TickerIdx',
    'Seniority',
    'Currency',
    'ActivityGroup',
    'Region_y',
    'Activity',
    'RiskCaptain',
    'Owner',
    'CompositeRating',
    'IndustrySector',
    'IndustrySubgroup',
    'MarketIssue',
    'CouponType',
]

In [12]:
from src.utils import add_datediffs

In [13]:
%%time
# Run add_datediffs on both weekly frames, earlier week first. Each call gets
# its own fresh slice of the trades with TradeDateKey above 20180000.
# NOTE(review): the return value is unused, so the helper presumably adds its
# columns to the frame in place -- confirm in src.utils.
for weekly_frame in (week_0409, week_0416):
    add_datediffs(weekly_frame, trade[trade.TradeDateKey > 20180000])

CPU times: user 1min 23s, sys: 316 ms, total: 1min 24s
Wall time: 1min 24s


In [14]:
# Persist both weekly frames (index reset first) as feather files under PROCESSED.
for weekly_frame, feather_name in ((week_0409, 'week_0409_val.feather'),
                                   (week_0416, 'week_0416_val.feather')):
    weekly_frame.reset_index().to_feather(PROCESSED/feather_name)

## Train/val/test

In [15]:
# The earlier week is the training frame, the later week is held out for validation.
train, val = week_0409, week_0416
# Test rows come from a previously saved feather file.
test = pd.read_feather(PROCESSED/'test_datediffs.feather')

In [16]:
from src.utils import preprocessing_pipeline
# Run the shared pipeline on each frame, in the order test -> val -> train,
# and rebind the three names to the returned frames.
test, val, train = [
    preprocessing_pipeline(frame, customer, isin, trade)
    for frame in (test, val, train)
]

In [17]:
# Key columns that are dropped before the frames are handed to a model.
id_cols = ['TradeDateKey', 'CustomerIdx', 'IsinIdx']
# Label column, and the row identifier used when building the submission.
target_col, pred_col = 'CustomerInterest', 'PredictionIdx'

In [18]:
# Remove the DateKey column from the test frame. This rebinds `test` rather
# than mutating it in place, and tolerates the column already being gone, so
# the cell can be re-run without raising KeyError.
test = test.drop(columns=['DateKey'], errors='ignore')

In [19]:
# (rows, columns) of the three frames, in train / val / test order.
tuple(frame.shape for frame in (train, val, test))

((492906, 29), (493590, 29), (484758, 30))

## Preprocessing

In [20]:
from src.utils import apply_cats
# Turn every categorical column of the test frame into an ordered category.
for cat_col in cat_cols:
    as_category = test[cat_col].astype('category')
    test[cat_col] = as_category.cat.as_ordered()
# Then hand train and val to apply_cats together with the test frame.
# NOTE(review): apply_cats presumably re-uses test's category definitions for
# the other frame -- helper not visible here, confirm in src.utils.
for frame in (train, val):
    apply_cats(frame, test)

In [21]:
# Replace each categorical column by its integer category codes in all three frames.
for frame in (train, val, test):
    for col in cat_cols:
        frame[col] = frame[col].cat.codes

## Model

In [22]:
import pprint
import time

# Shared pretty-printer (3-space indent) used by the model helpers to show metrics.
pp = pprint.PrettyPrinter(indent=3)

In [23]:
from sklearn.metrics import roc_auc_score

# globals: [cat_indices]
def fit_model(model, model_name, X_trn, y_trn, X_val, y_val):
    """Fit ``model`` in place, picking the ``fit`` arguments from ``model_name``.

    * ``'XGBClassifier'`` / ``'LGBMClassifier'``: the validation pair is passed
      as ``eval_set``, with ``eval_metric='auc'`` and
      ``early_stopping_rounds=30``.
    * ``'CatBoostClassifier'``: the validation pair is passed as ``eval_set``
      with ``use_best_model=True``; categorical features come from the
      module-level ``cat_indices``, which must exist before this branch runs.
    * any other name: plain ``model.fit(X_trn, y_trn)``; the validation data
      is ignored.

    Returns None.
    """
    # NOTE(review): recent LightGBM/XGBoost releases moved early stopping out
    # of fit(); confirm these keyword arguments against the installed versions.
    holdout = [(X_val, y_val)]

    if model_name == 'CatBoostClassifier':
        model.fit(X_trn, y_trn,
                  eval_set=holdout,
                  use_best_model=True,
                  cat_features=cat_indices)
        return

    if model_name in ('XGBClassifier', 'LGBMClassifier'):
        model.fit(X_trn, y_trn,
                  eval_set=holdout,
                  early_stopping_rounds=30,
                  eval_metric='auc')
        return

    model.fit(X_trn, y_trn)
        
def calculate_metrics(model, metrics, X_trn, y_trn, X_val, y_val):
    """Score ``model`` on the train and validation sets and record the results.

    ``metrics`` maps a metric name to ``{'trn': [...], 'val': [...]}``. For
    every metric name in it, one new score per split is added to the matching
    list (the dict is updated in place). Scores are computed from the second
    column of ``model.predict_proba``. Only ``'auc'`` (``roc_auc_score``) is
    known; any other metric name raises ``KeyError``.

    The updated ``metrics`` dict is pretty-printed, followed by a blank line.
    Returns None.
    """
    scorers = {'auc': roc_auc_score}
    splits = (('trn', X_trn, y_trn),
              ('val', X_val, y_val))

    for split_name, features, labels in splits:
        # Column 1 of predict_proba is taken as the score for this split.
        positive_proba = model.predict_proba(features)[:, 1]
        for metric_name in metrics:
            metrics[metric_name][split_name] += [scorers[metric_name](labels, positive_proba)]

    pp.pprint(metrics)
    print()

In [24]:
def run_model(model, X_train, y_train, X_val, y_val, X_test, 
              metric_names, results=None, dataset_desc='', params_desc=''):
    """Fit ``model``, score it on train/val, predict the test set and log the run.

    Parameters
    ----------
    model : estimator exposing ``fit`` and ``predict_proba``.
    X_train, y_train : data the model is fitted on.
    X_val, y_val : data passed to ``fit_model`` as the validation set and
        scored alongside the training data by ``calculate_metrics``.
    X_test : features to predict once the model is fitted.
    metric_names : iterable of metric names; each must be known to
        ``calculate_metrics``.
    results : DataFrame of earlier runs, or None to start a new table.
    dataset_desc, params_desc : free-text labels stored with the run.

    Returns
    -------
    tuple ``(y_test, metrics, results, model)`` where ``y_test`` is the second
    column of ``model.predict_proba(X_test)``, ``metrics`` maps each metric
    name to its ``'trn'``/``'val'`` score lists, ``results`` is a new
    DataFrame with one row added for this run (the one passed in is not
    modified), and ``model`` is the fitted estimator.
    """
    # type(...).__name__ gives the bare class name without parsing its repr.
    model_name = type(model).__name__
    print(model_name, '\n')
    metrics = {metric: {'trn': [], 'val': []} for metric in metric_names}
    start = time.time()

    fit_model(model, model_name, X_train, y_train, X_val, y_val)
    calculate_metrics(model, metrics, X_train, y_train, X_val, y_val)
    y_test = model.predict_proba(X_test)[:, 1]

    # The recorded time covers fitting, metric calculation and test prediction.
    end = time.time()
    means = {f'{d}_{m}_mean': np.mean(metrics[m][d]) for m in metrics
                                                     for d in metrics[m]}
    stds  = {f'{d}_{m}_std': np.std(metrics[m][d]) for m in metrics
                                                   for d in metrics[m]}
    metadata = {'model': model_name, 'dataset': dataset_desc,
                'params': params_desc, 'time': round(end - start, 2)}
    pp.pprint(means)

    # DataFrame.append was removed in pandas 2.0; build a one-row frame and
    # concatenate it instead, which works on old and new pandas alike.
    run_row = pd.DataFrame([{**metadata, **means, **stds}])
    if results is None:
        results = run_row
    else:
        results = pd.concat([results, run_row], ignore_index=True)
    return y_test, metrics, results, model

In [25]:
from lightgbm import LGBMClassifier
# Metric names handed to run_model. calculate_metrics looks each one up in its
# metric_function table, which currently only contains 'auc'.
metric_names = ['auc']

In [30]:
%%time
## previous week
# Roles are swapped here: the model is fitted on `val` (week_0416) and
# early-stopped / scored against `train` (week_0409).
# `results` is looked up with globals().get so the cell also runs on a fresh
# kernel, where no earlier cell has created the results table yet (None then
# makes run_model start a new table).
y_test_0416, metrics, results, model = run_model(
            LGBMClassifier(n_estimators=100),
            val.drop(id_cols + [target_col], axis=1),
            val[target_col],
            train.drop(id_cols + [target_col], axis=1),
            train[target_col],
            test.drop(id_cols + [target_col, pred_col], axis=1),
            metric_names, globals().get('results'), 
            params_desc='n_estimators=100',
            dataset_desc='week_datediffs_0416')

LGBMClassifier 

[1]	valid_0's auc: 0.710341
Training until validation scores don't improve for 30 rounds.
[2]	valid_0's auc: 0.713879
[3]	valid_0's auc: 0.715493
[4]	valid_0's auc: 0.715745
[5]	valid_0's auc: 0.716249
[6]	valid_0's auc: 0.717218
[7]	valid_0's auc: 0.716961
[8]	valid_0's auc: 0.717076
[9]	valid_0's auc: 0.717012
[10]	valid_0's auc: 0.718282
[11]	valid_0's auc: 0.722244
[12]	valid_0's auc: 0.722388
[13]	valid_0's auc: 0.723009
[14]	valid_0's auc: 0.723447
[15]	valid_0's auc: 0.723379
[16]	valid_0's auc: 0.723424
[17]	valid_0's auc: 0.723724
[18]	valid_0's auc: 0.724602
[19]	valid_0's auc: 0.724979
[20]	valid_0's auc: 0.724764
[21]	valid_0's auc: 0.725522
[22]	valid_0's auc: 0.72634
[23]	valid_0's auc: 0.726645
[24]	valid_0's auc: 0.726865
[25]	valid_0's auc: 0.727343
[26]	valid_0's auc: 0.727477
[27]	valid_0's auc: 0.72806
[28]	valid_0's auc: 0.728749
[29]	valid_0's auc: 0.728842
[30]	valid_0's auc: 0.729482
[31]	valid_0's auc: 0.729735
[32]	valid_0's auc: 0.729898
[33]

In [46]:
%%time
## previous week
# Same run as the cell above (fit on week_0416, validate on week_0409), kept
# from an earlier session. Updated to unpack all four values that run_model
# returns; the old three-name unpack raises ValueError. `results` is looked up
# with globals().get so a missing table starts a new one instead of raising
# NameError.
y_test_0416, metrics, results, model = run_model(
            LGBMClassifier(n_estimators=100),
            val.drop(id_cols + [target_col], axis=1),
            val[target_col],
            train.drop(id_cols + [target_col], axis=1),
            train[target_col],
            test.drop(id_cols + [target_col, pred_col], axis=1),
            metric_names, globals().get('results'), 
            params_desc='n_estimators=100',
            dataset_desc='week_datediffs_0416')

LGBMClassifier 

{'auc': {'trn': [0.9032930113614415], 'val': [0.8408082677203096]}}

{'trn_auc_mean': 0.9032930113614415, 'val_auc_mean': 0.8408082677203096}
CPU times: user 22.1 s, sys: 348 ms, total: 22.5 s
Wall time: 5.84 s


In [28]:
%%time
# Fit on `train` (week_0409), early-stop / score on `val` (week_0416), and
# start a fresh results table (results=None).
y_test, metrics, results, model = run_model(
    model=LGBMClassifier(n_estimators=1000),
    X_train=train.drop(columns=id_cols + [target_col]),
    y_train=train[target_col],
    X_val=val.drop(columns=id_cols + [target_col]),
    y_val=val[target_col],
    X_test=test.drop(columns=id_cols + [target_col, pred_col]),
    metric_names=metric_names,
    results=None,
    params_desc='n_estimators=1000',
    dataset_desc='week_datediffs',
)

LGBMClassifier 

[1]	valid_0's auc: 0.713723
Training until validation scores don't improve for 30 rounds.
[2]	valid_0's auc: 0.714893
[3]	valid_0's auc: 0.715392
[4]	valid_0's auc: 0.714957
[5]	valid_0's auc: 0.725616
[6]	valid_0's auc: 0.728212
[7]	valid_0's auc: 0.729469
[8]	valid_0's auc: 0.733307
[9]	valid_0's auc: 0.733677
[10]	valid_0's auc: 0.734072
[11]	valid_0's auc: 0.734414
[12]	valid_0's auc: 0.736255
[13]	valid_0's auc: 0.736838
[14]	valid_0's auc: 0.737086
[15]	valid_0's auc: 0.737444
[16]	valid_0's auc: 0.738003
[17]	valid_0's auc: 0.738066
[18]	valid_0's auc: 0.738219
[19]	valid_0's auc: 0.738565
[20]	valid_0's auc: 0.742887
[21]	valid_0's auc: 0.743422
[22]	valid_0's auc: 0.743716
[23]	valid_0's auc: 0.744063
[24]	valid_0's auc: 0.74488
[25]	valid_0's auc: 0.745247
[26]	valid_0's auc: 0.745402
[27]	valid_0's auc: 0.745709
[28]	valid_0's auc: 0.745854
[29]	valid_0's auc: 0.746797
[30]	valid_0's auc: 0.746894
[31]	valid_0's auc: 0.747145
[32]	valid_0's auc: 0.747049
[33

In [36]:
%%time
# Same run as the cell above (fit on week_0409, validate on week_0416), kept
# from an earlier session. Updated to unpack all four values that run_model
# returns; the old three-name unpack raises ValueError.
y_test, metrics, results, model = run_model(
            LGBMClassifier(n_estimators=1000),
            train.drop(id_cols + [target_col], axis=1),
            train[target_col],
            val.drop(id_cols + [target_col], axis=1),
            val[target_col],
            test.drop(id_cols + [target_col, pred_col], axis=1),
            metric_names, None, 
            params_desc='n_estimators=1000',
            dataset_desc='week_datediffs')

LGBMClassifier 

[1]	valid_0's auc: 0.818243
Training until validation scores don't improve for 30 rounds.
[2]	valid_0's auc: 0.818344
[3]	valid_0's auc: 0.818317
[4]	valid_0's auc: 0.820228
[5]	valid_0's auc: 0.820721
[6]	valid_0's auc: 0.820625
[7]	valid_0's auc: 0.83575
[8]	valid_0's auc: 0.835687
[9]	valid_0's auc: 0.837461
[10]	valid_0's auc: 0.83852
[11]	valid_0's auc: 0.838576
[12]	valid_0's auc: 0.838712
[13]	valid_0's auc: 0.839013
[14]	valid_0's auc: 0.839285
[15]	valid_0's auc: 0.839157
[16]	valid_0's auc: 0.841013
[17]	valid_0's auc: 0.841273
[18]	valid_0's auc: 0.841472
[19]	valid_0's auc: 0.841413
[20]	valid_0's auc: 0.841734
[21]	valid_0's auc: 0.841835
[22]	valid_0's auc: 0.841731
[23]	valid_0's auc: 0.841948
[24]	valid_0's auc: 0.842023
[25]	valid_0's auc: 0.842796
[26]	valid_0's auc: 0.843562
[27]	valid_0's auc: 0.844094
[28]	valid_0's auc: 0.844029
[29]	valid_0's auc: 0.844186
[30]	valid_0's auc: 0.844152
[31]	valid_0's auc: 0.845846
[32]	valid_0's auc: 0.846581
[33]

In [47]:
# all data
# NOTE(review): presumably the table from a run whose date-diff features were
# built from the full trade history rather than the 2018-only slice -- confirm.
results

Unnamed: 0,dataset,model,params,time,trn_auc_mean,trn_auc_std,val_auc_mean,val_auc_std
0,week_datediffs,LGBMClassifier,n_estimators=1000,5.76,0.871636,0.0,0.854498,0.0
1,week_datediffs_0416,LGBMClassifier,n_estimators=100,5.23,0.903293,0.0,0.840808,0.0


In [31]:
# 2018 data
# NOTE(review): presumably the table from the run that uses the
# TradeDateKey > 20180000 slice passed to add_datediffs -- confirm.
results

Unnamed: 0,dataset,model,params,time,trn_auc_mean,trn_auc_std,val_auc_mean,val_auc_std
0,week_datediffs,LGBMClassifier,n_estimators=1000,7.93,0.794849,0.0,0.754695,0.0
1,week_datediffs_0416,LGBMClassifier,n_estimators=100,7.07,0.81311,0.0,0.74207,0.0


In [32]:
# Keep the test predictions of the model fitted on `train` (week_0409) under a
# week-specific name, next to y_test_0416.
y_test_0409 = y_test

In [33]:
# First five test predictions of the model fitted on week_0416.
y_test_0416[:5]

array([0.05645612, 0.03291414, 0.03697125, 0.04721988, 0.03335506])

In [35]:
# First five test predictions of the model fitted on week_0409.
y_test_0409[:5]

array([0.04532538, 0.05524985, 0.0432417 , 0.04497969, 0.03397098])

In [36]:
# Distribution summary of the week_0409 model's test predictions.
pd.Series(y_test_0409).describe()

count    484758.000000
mean          0.030018
std           0.032947
min           0.002193
25%           0.011776
50%           0.019497
75%           0.042435
max           0.893118
dtype: float64

In [37]:
# Distribution summary of the week_0416 model's test predictions.
pd.Series(y_test_0416).describe()

count    484758.000000
mean          0.029301
std           0.030290
min           0.001679
25%           0.009946
50%           0.019143
75%           0.043250
max           0.794302
dtype: float64

In [41]:
# Sanity check: averaging the two prediction arrays over axis 0 must leave one
# value per test row.
np.mean([y_test_0409, y_test_0416], axis=0).shape

(484758,)

In [42]:
# Blend the two models: element-wise mean of their test predictions, stored as
# the target column of the test frame.
stacked_preds = np.vstack((y_test_0409, y_test_0416))
test[target_col] = stacked_preds.mean(axis=0)

## Submission

In [43]:
# Reload the sample submission and left-join the blended scores onto its
# PredictionIdx column, so the submission keeps the sample file's row order.
submission = pd.read_csv(RAW/'sample_submission.csv', low_memory=False)
prediction_ids = submission[['PredictionIdx']]
blended_scores = test[['PredictionIdx', target_col]]
submission = prediction_ids.merge(blended_scores, how='left', on='PredictionIdx')

In [44]:
# Distribution summary of the scores about to be submitted.
submission[target_col].describe()

count    484758.000000
mean          0.029660
std           0.029503
min           0.002250
25%           0.011114
50%           0.019534
75%           0.043570
max           0.815421
Name: CustomerInterest, dtype: float64

In [40]:
# Same summary as the cell above.
# NOTE(review): the execution count (In [40]) suggests the output below comes
# from an earlier run with different predictions -- confirm or remove the cell.
submission[target_col].describe()

count    484758.000000
mean          0.082337
std           0.103445
min           0.002977
25%           0.011680
50%           0.020786
75%           0.198452
max           0.824155
Name: CustomerInterest, dtype: float64

In [45]:
# First rows of the submission frame (PredictionIdx and blended score).
submission.head()

Unnamed: 0,PredictionIdx,CustomerInterest
0,a1e0d80784,0.050891
1,c2cc6cc2a8,0.044082
2,a8e94f6344,0.040106
3,758bae1e35,0.0461
4,02ab378ee8,0.033663


In [41]:
# Same preview as the cell above.
# NOTE(review): the execution count (In [41]) suggests the output below comes
# from an earlier run with different predictions -- confirm or remove the cell.
submission.head()

Unnamed: 0,PredictionIdx,CustomerInterest
0,a1e0d80784,0.151956
1,c2cc6cc2a8,0.047319
2,a8e94f6344,0.190352
3,758bae1e35,0.252158
4,02ab378ee8,0.211535


In [46]:
# Write the blended predictions to the submissions folder, without the index column.
submission_path = SUBMISSIONS/'lgbm_week_datediffs_0416.csv'
submission.to_csv(submission_path, index=False)