In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import os, math
import numpy as np, pandas as pd
import matplotlib.pyplot as plt, seaborn as sns
from pandas_summary import DataFrameSummary
from tqdm import tqdm, tqdm_notebook
from pathlib import Path
pd.set_option('display.max_columns', 1000)
pd.set_option('display.max_rows', 400)
sns.set()

# Move to the project root so that `src` (imported on the next line) and the
# relative `data/` paths resolve. The guard makes the cell safe to re-run:
# without it every execution climbs two more directories.
# Assumes the project root is the directory that contains `src/` -- TODO confirm.
if not Path('src').is_dir():
    os.chdir('../..')
from src import utils

In [55]:
# Data folders, all expressed relative to the current working directory.
DATA = Path('data')
RAW, INTERIM, PROCESSED, SUBMISSIONS = (
    DATA / stage for stage in ('raw', 'interim', 'processed', 'submissions')
)

In [3]:
# Read the five raw CSV tables, in the same order as before, and bind each one
# to its own name. low_memory=False is passed to every read.
challenge, customer, isin, submission, trade = (
    pd.read_csv(RAW / csv_name, low_memory=False)
    for csv_name in (
        'Challenge_20180423.csv',
        'Customer.csv',
        'Isin.csv',
        'sample_submission.csv',
        'Trade.csv',
    )
)

In [18]:
# Load the pre-built feather files for the three splits used below.
train, val, test = (
    pd.read_feather(PROCESSED / feather_name)
    for feather_name in (
        'thursday_datediffs.feather',
        'friday_datediffs.feather',
        'test_datediffs.feather',
    )
)

In [8]:
# Columns that are cast to ordered categoricals and then replaced by their
# integer codes before modelling.
cat_cols = [
    'BuySell',
    'Sector',
    'Subsector',
    'Region_x',
    'Country',
    'TickerIdx',
    'Seniority',
    'Currency',
    'ActivityGroup',
    'Region_y',
    'Activity',
    'RiskCaptain',
    'Owner',
    'CompositeRating',
    'IndustrySector',
    'IndustrySubgroup',
    'MarketIssue',
    'CouponType',
]

In [19]:
from src.utils import preprocessing_pipeline
# Run the shared pipeline over each split (test, then val, then train) and
# rebind the names to the returned frames.
test, val, train = (
    preprocessing_pipeline(frame, customer, isin, trade)
    for frame in (test, val, train)
)

In [29]:
# Column roles: the target, the submission key, and the identifier columns
# that are dropped from the feature matrices.
target_col, pred_col = 'CustomerInterest', 'PredictionIdx'
id_cols = ['TradeDateKey', 'CustomerIdx', 'IsinIdx']

In [28]:
# (rows, columns) of the train, validation and test frames, in that order.
tuple(frame.shape for frame in (train, val, test))

((486338, 29), (487172, 29), (484758, 30))

In [27]:
# Remove the 'index' column from val and the 'DateKey' column from test.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps the
# cell idempotent: a second run no longer raises KeyError for a column that
# has already been dropped.
val = val.drop(columns='index', errors='ignore')
test = test.drop(columns='DateKey', errors='ignore')

## Preprocessing

In [31]:
from src.utils import apply_cats
# Cast every categorical column of the test frame to an ordered pandas
# categorical, so its categories are fixed before the other splits are handled.
for col in cat_cols:
    test[col] = test[col].astype('category').cat.as_ordered()
# NOTE(review): apply_cats comes from src.utils and is not shown here.
# Presumably it re-casts the matching columns of its first argument using the
# categories of the second (test) frame -- confirm, and check how values that
# do not occur in test are encoded.
apply_cats(train, test)
apply_cats(val, test)

In [32]:
# Replace each categorical column by its integer category codes in all three
# frames. The columns must still be categorical when this runs.
for frame in (train, val, test):
    for col in cat_cols:
        frame[col] = frame[col].cat.codes

## Model

In [33]:
import time, pprint

# Shared pretty-printer used by calculate_metrics and run_model to print the
# metric dictionaries.
pp = pprint.PrettyPrinter(indent=3)

In [41]:
from sklearn.metrics import roc_auc_score

# globals: [cat_indices]
def fit_model(model, model_name, X_trn, y_trn, X_val, y_val, cat_features=None):
    """Fit ``model`` in place, picking the ``fit`` arguments from ``model_name``.

    Parameters
    ----------
    model : estimator exposing ``fit``.
    model_name : str
        Class name used to select the fitting recipe:

        * ``'XGBClassifier'`` / ``'LGBMClassifier'``: fit with the validation
          set as ``eval_set``, ``early_stopping_rounds=30`` and
          ``eval_metric='auc'``.
        * ``'CatBoostClassifier'``: fit with the validation set as
          ``eval_set``, ``use_best_model=True`` and ``cat_features``.
        * anything else: plain ``model.fit(X_trn, y_trn)``.
    X_trn, y_trn : training features and labels.
    X_val, y_val : validation features and labels (unused in the plain branch).
    cat_features : optional
        Categorical feature indices for the CatBoost branch. When omitted the
        notebook-level ``cat_indices`` global is used, as before; passing the
        value explicitly removes the dependency on hidden kernel state.

    Returns
    -------
    None. The estimator is fitted in place.

    Raises
    ------
    NameError
        In the CatBoost branch when ``cat_features`` is not given and no
        global ``cat_indices`` exists.
    """
    # NOTE(review): newer XGBoost/LightGBM releases may no longer accept
    # early_stopping_rounds / eval_metric in fit() -- confirm against the
    # versions pinned for this project.
    if model_name in ['XGBClassifier', 'LGBMClassifier']:
        model.fit(X_trn, y_trn,
                  eval_set=[(X_val, y_val)],
                  early_stopping_rounds=30,
                  eval_metric='auc')
    elif model_name == 'CatBoostClassifier':
        if cat_features is None:
            # Backward-compatible fallback on the notebook global.
            cat_features = cat_indices
        model.fit(X_trn, y_trn,
                  eval_set=[(X_val, y_val)],
                  use_best_model=True,
                  cat_features=cat_features)
    else:
        model.fit(X_trn, y_trn)
        
def calculate_metrics(model, metrics, X_trn, y_trn, X_val, y_val):
    """Score ``model`` on the train and validation splits and record the results.

    For each split the positive-class probability (column 1 of
    ``predict_proba``) is computed once. Every metric named in ``metrics`` is
    then evaluated on it and appended to ``metrics[name]['trn']`` or
    ``metrics[name]['val']``. ``metrics`` is mutated in place and
    pretty-printed through the notebook-level ``pp``; nothing is returned.

    Only ``'auc'`` (``roc_auc_score``) is known; any other metric name raises
    ``KeyError``.
    """
    scorers = {'auc': roc_auc_score}
    splits = (('trn', X_trn, y_trn), ('val', X_val, y_val))

    for split, features, labels in splits:
        proba = model.predict_proba(features)[:, 1]
        for name in metrics:
            metrics[name][split] += [scorers[name](labels, proba)]

    pp.pprint(metrics)
    print()

In [42]:
# globals: [pp, fit_model, calculate_metrics]
def run_model(model, X_train, y_train, X_val, y_val, X_test, 
              metric_names, results=None, dataset_desc='', params_desc=''):
    """Fit ``model``, score it on train/validation, and predict on the test set.

    Parameters
    ----------
    model : estimator with ``fit`` and ``predict_proba``.
    X_train, y_train, X_val, y_val : training and validation data, forwarded
        to ``fit_model`` and ``calculate_metrics``.
    X_test : features to predict on.
    metric_names : iterable of metric names understood by ``calculate_metrics``.
    results : pandas.DataFrame, optional
        Log of previous runs; one row is added for this run. ``None`` or an
        empty frame starts a new log.
    dataset_desc, params_desc : str
        Free-text labels stored in the ``dataset`` and ``params`` columns.

    Returns
    -------
    y_test : column 1 of ``model.predict_proba(X_test)``.
    metrics : dict ``{metric: {'trn': [...], 'val': [...]}}``.
    results : DataFrame with the new row appended. The row holds ``model``,
        ``dataset``, ``params``, ``time`` (seconds, rounded to 2 decimals) and
        the ``<split>_<metric>_mean`` / ``<split>_<metric>_std`` columns.
    """
    model_name = type(model).__name__
    print(model_name, '\n')
    metrics = {metric: {'trn': [], 'val': []} for metric in metric_names}
    start = time.time()

    fit_model(model, model_name, X_train, y_train, X_val, y_val)
    calculate_metrics(model, metrics, X_train, y_train, X_val, y_val)
    y_test = model.predict_proba(X_test)[:, 1]

    end = time.time()
    # Each score list has a single entry here, so the std columns are 0.
    means = {f'{d}_{m}_mean': np.mean(metrics[m][d]) for m in metrics
                                                     for d in metrics[m]}
    stds  = {f'{d}_{m}_std': np.std(metrics[m][d]) for m in metrics
                                                   for d in metrics[m]}
    metadata = {'model': model_name, 'dataset': dataset_desc,
                'params': params_desc, 'time': round(end - start, 2)}
    pp.pprint(means)

    # DataFrame.append was removed in pandas 2.0; build a one-row frame and
    # concatenate instead. An empty log is replaced outright rather than
    # concatenated with.
    row = pd.DataFrame([{**metadata, **means, **stds}])
    if results is None or results.empty:
        results = row
    else:
        results = pd.concat([results, row], ignore_index=True, sort=False)
    return y_test, metrics, results

In [43]:
from lightgbm import LGBMClassifier

In [44]:
# Metrics tracked by run_model; calculate_metrics only knows 'auc'.
metric_names = ['auc']

In [45]:
%%time
# Build the feature matrices first: identifiers and the target are removed
# everywhere, and the test frame also loses its submission key.
non_features = id_cols + [target_col]
X_train, X_val = train.drop(columns=non_features), val.drop(columns=non_features)
X_test = test.drop(columns=non_features + [pred_col])

# NOTE(review): dataset_desc says 'metadata-only' although the frames were read
# from the *_datediffs feather files -- confirm the label is still accurate.
y_test, metrics, results = run_model(
    LGBMClassifier(n_estimators=1000),
    X_train, train[target_col],
    X_val, val[target_col],
    X_test,
    metric_names,
    results=None,
    params_desc='default',
    dataset_desc='metadata-only',
)

LGBMClassifier 

[1]	valid_0's auc: 0.741553
Training until validation scores don't improve for 30 rounds.
[2]	valid_0's auc: 0.744723
[3]	valid_0's auc: 0.744732
[4]	valid_0's auc: 0.745221
[5]	valid_0's auc: 0.744875
[6]	valid_0's auc: 0.745217
[7]	valid_0's auc: 0.745576
[8]	valid_0's auc: 0.745646
[9]	valid_0's auc: 0.745685
[10]	valid_0's auc: 0.746635
[11]	valid_0's auc: 0.746615
[12]	valid_0's auc: 0.74715
[13]	valid_0's auc: 0.747248
[14]	valid_0's auc: 0.74723
[15]	valid_0's auc: 0.74724
[16]	valid_0's auc: 0.74967
[17]	valid_0's auc: 0.749886
[18]	valid_0's auc: 0.749753
[19]	valid_0's auc: 0.749695
[20]	valid_0's auc: 0.750647
[21]	valid_0's auc: 0.751103
[22]	valid_0's auc: 0.751521
[23]	valid_0's auc: 0.75182
[24]	valid_0's auc: 0.75224
[25]	valid_0's auc: 0.752271
[26]	valid_0's auc: 0.752359
[27]	valid_0's auc: 0.7531
[28]	valid_0's auc: 0.753542
[29]	valid_0's auc: 0.753873
[30]	valid_0's auc: 0.753999
[31]	valid_0's auc: 0.754725
[32]	valid_0's auc: 0.754648
[33]	valid

In [47]:
# Run log returned by run_model: labels, wall time and train/validation AUC.
results

Unnamed: 0,dataset,model,params,time,trn_auc_mean,trn_auc_std,val_auc_mean,val_auc_std
0,metadata-only,LGBMClassifier,default,7.18,0.898527,0.0,0.774603,0.0


In [49]:
# Store the predicted positive-class probabilities on the test frame so they
# can be joined to the submission file via PredictionIdx.
test[target_col] = y_test

## Submission

In [50]:
# Re-read the sample file so this cell does not depend on the copy loaded at
# the top, keep only its key column, and attach the predictions by key.
submission = pd.read_csv(RAW/'sample_submission.csv', low_memory=False)
# validate='one_to_one' makes pandas raise if PredictionIdx is duplicated on
# either side, instead of silently multiplying rows.
submission = pd.merge(submission[['PredictionIdx']], test[['PredictionIdx', target_col]],
                      how='left', on='PredictionIdx', validate='one_to_one')
# A left join leaves NaN where test has no matching key; fail here rather
# than write an incomplete submission.
assert submission[target_col].notna().all(), 'some PredictionIdx rows have no prediction'

In [52]:
# Sanity check on the predicted probabilities: count and value distribution.
submission[target_col].describe()

count    484758.000000
mean          0.004684
std           0.008822
min           0.000173
25%           0.000843
50%           0.002007
75%           0.006896
max           0.740771
Name: CustomerInterest, dtype: float64

In [53]:
# Preview the first rows of the file about to be written.
submission.head()

Unnamed: 0,PredictionIdx,CustomerInterest
0,a1e0d80784,0.007894
1,c2cc6cc2a8,0.0069
2,a8e94f6344,0.006347
3,758bae1e35,0.009346
4,02ab378ee8,0.004371


In [56]:
# Create the output folder on first use: to_csv does not create missing
# directories and would fail on a fresh checkout.
SUBMISSIONS.mkdir(parents=True, exist_ok=True)
submission.to_csv(SUBMISSIONS/'lgbm_datediffs.csv', index=False)