In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import os, math
import numpy as np, pandas as pd
import matplotlib.pyplot as plt, seaborn as sns
from pandas_summary import DataFrameSummary
from tqdm import tqdm, tqdm_notebook
from pathlib import Path
# Raise the pandas display limits (up to 1000 columns / 400 rows) so wide frames
# are shown without being elided in cell output.
pd.set_option('display.max_columns', 1000)
pd.set_option('display.max_rows', 400)
# Apply seaborn's default styling to subsequent matplotlib figures.
sns.set()

# Work from the project root so that `src` is importable and the relative
# `data/...` paths below resolve.  The move is guarded: once `src` is visible from
# the current directory we are already at the root, so re-running this cell must
# not climb another two levels.
if not Path('src').is_dir():
    os.chdir('../..')
from src import utils

In [2]:
# Data folder layout, relative to the current working directory:
# one sub-folder of `data/` per pipeline stage.
DATA = Path('data')
RAW, INTERIM, PROCESSED, SUBMISSIONS = (
    DATA / stage for stage in ('raw', 'interim', 'processed', 'submissions')
)

In [3]:
# Load the five raw CSV tables; files are read in the order listed and bound to
# the names on the left in that same order.
challenge, customer, isin, submission, trade = (
    pd.read_csv(RAW / csv_name, low_memory=False)
    for csv_name in (
        'Challenge_20180423.csv',
        'Customer.csv',
        'Isin.csv',
        'sample_submission.csv',
        'Trade.csv',
    )
)

In [8]:
# Pre-built weekly frames: `train` comes from the 0409 file, `val` from the 0416
# file, and `test` from the date-diff test file.
train, val, test = (
    pd.read_feather(PROCESSED / feather_name)
    for feather_name in (
        'week_0409_val.feather',
        'week_0416_val.feather',
        'test_datediffs.feather',
    )
)

In [7]:
# Preview the first rows of the training frame to check the loaded columns.
train.head()

Unnamed: 0,index,TradeDateKey,CustomerIdx,IsinIdx,BuySell,CustomerInterest,DaysSinceBuySell,DaysSinceTransaction,DaysSinceCustomerActivity,DaysSinceBondActivity
0,0,20180409,0,21856,Buy,0.0,282,282,27,19
1,1,20180409,0,21856,Sell,0.0,282,282,27,19
2,2,20180409,0,24944,Buy,0.0,282,28,27,19
3,3,20180409,0,24944,Sell,0.0,28,28,27,19
4,4,20180409,0,25992,Buy,0.0,41,41,27,13


In [9]:
# Preview the first rows of the test frame; unlike `train` it carries
# `PredictionIdx` and `DateKey`, and `CustomerInterest` is empty.
test.head()

Unnamed: 0,PredictionIdx,DateKey,CustomerIdx,IsinIdx,BuySell,CustomerInterest,TradeDateKey,DaysSinceBuySell,DaysSinceTransaction,DaysSinceCustomerActivity,DaysSinceBondActivity
0,a1e0d80784,20180423,1856,13323,Buy,,20180423,296,296,3,3
1,c2cc6cc2a8,20180423,1856,9230,Buy,,20180423,14,14,3,3
2,a8e94f6344,20180423,1780,9157,Buy,,20180423,296,296,3,296
3,758bae1e35,20180423,2129,9131,Buy,,20180423,296,296,3,11
4,02ab378ee8,20180423,1758,7151,Buy,,20180423,296,296,3,33


In [14]:
%%time
from src.utils import add_dayscount

# Run the project helper once per frame.  The trade table is filtered to
# TradeDateKey > 20180000 for every call (presumably "trades from 2018 onwards",
# given the YYYYMMDD-looking keys — TODO confirm), so each call receives its own
# freshly filtered frame.
for frame in (train, val, test):
    add_dayscount(frame, trade[trade.TradeDateKey > 20180000])

In [22]:
%%time
from src.utils import preprocessing_pipeline

# Pass each frame through the project pipeline together with the customer, isin
# and trade tables, then rebind the three names to the returned frames.
# Calls happen in train -> val -> test order.
train, val, test = (
    preprocessing_pipeline(frame, customer, isin, trade)
    for frame in (train, val, test)
)

CPU times: user 860 ms, sys: 180 ms, total: 1.04 s
Wall time: 1.09 s


In [23]:
%%time
# Persist the enriched frames.  `reset_index()` turns the current index into a
# regular column of the written copy; the in-memory frames are left unchanged.
for frame, feather_name in (
    (train, 'week_0409_diffscount.feather'),
    (val, 'week_0416_diffscount.feather'),
    (test, 'week_0423_diffscount.feather'),
):
    frame.reset_index().to_feather(PROCESSED / feather_name)

CPU times: user 3.46 s, sys: 1.22 s, total: 4.68 s
Wall time: 4.68 s


In [4]:
# Columns handled as categoricals: in the Preprocessing cell they are cast to
# pandas categories and then replaced by their integer codes.
cat_cols = [
    'BuySell',
    'Sector',
    'Subsector',
    'Region_x',
    'Country',
    'TickerIdx',
    'Seniority',
    'Currency',
    'ActivityGroup',
    'Region_y',
    'Activity',
    'RiskCaptain',
    'Owner',
    'CompositeRating',
    'IndustrySector',
    'IndustrySubgroup',
    'MarketIssue',
    'CouponType',
]

In [24]:
# Key columns: they identify a row and are dropped before fitting.
id_cols = ['TradeDateKey', 'CustomerIdx', 'IsinIdx']
# Label column, and the key used to join predictions onto the sample submission.
target_col, pred_col = 'CustomerInterest', 'PredictionIdx'

In [29]:
# Remove bookkeeping columns that must not reach the model: the stale `index`
# column stored in the train/val feather files, and `DateKey` in the test frame
# (it matches `TradeDateKey` in the rows previewed above).
# `errors='ignore'` keeps the cell re-runnable: without it a second execution
# raises KeyError because the columns are already gone.
train.drop('index', axis=1, errors='ignore', inplace=True)
val.drop('index', axis=1, errors='ignore', inplace=True)
test.drop('DateKey', axis=1, errors='ignore', inplace=True)

In [30]:
# (rows, columns) of each frame.  The test frame has one more column than
# train/val — presumably `PredictionIdx`, which only the test frame shows above.
train.shape, val.shape, test.shape

((492906, 33), (493590, 33), (484758, 34))

## Preprocessing

In [31]:
from src.utils import apply_cats

# 1) Cast every categorical column of the test frame to an ordered pandas category.
for name in cat_cols:
    test[name] = test[name].astype('category').cat.as_ordered()

# 2) Project helper, called with the test frame as second argument — presumably it
#    re-encodes train/val with the test frame's categories so that codes line up
#    across frames (TODO confirm against src.utils).
for frame in (train, val):
    apply_cats(frame, test)

# 3) Replace the categories with their integer codes in all three frames.
for frame in (train, val, test):
    for name in cat_cols:
        frame[name] = frame[name].cat.codes

## Model

In [32]:
import time, pprint
# Shared pretty-printer (3-space indent); calculate_metrics and run_model use it
# to print metric dictionaries.
pp = pprint.PrettyPrinter(indent=3)

In [33]:
from sklearn.metrics import roc_auc_score

def fit_model(model, model_name, X_trn, y_trn, X_val, y_val, cat_features=None):
    """Fit ``model`` in place, using the validation split where the library supports it.

    Parameters
    ----------
    model : estimator exposing ``fit``.
    model_name : str
        Class name of the estimator; selects which ``fit`` signature is used.
    X_trn, y_trn : training features / labels.
    X_val, y_val : validation features / labels, passed as ``eval_set`` to the
        XGBoost, LightGBM and CatBoost branches and ignored otherwise.
    cat_features : optional
        Categorical feature indices for CatBoost.  When omitted, the
        notebook-level global ``cat_indices`` is used, as before; a NameError is
        raised if that global has not been defined.

    Returns
    -------
    The same ``model`` object, fitted.
    """
    if model_name in ('XGBClassifier', 'LGBMClassifier'):
        # Early stopping on validation AUC after 30 rounds without improvement.
        # NOTE(review): newer XGBoost/LightGBM releases may no longer accept
        # `early_stopping_rounds` in fit() — confirm against the installed versions.
        model.fit(X_trn, y_trn,
                  eval_set=[(X_val, y_val)],
                  early_stopping_rounds=30,
                  eval_metric='auc')
    elif model_name == 'CatBoostClassifier':
        if cat_features is None:
            # Legacy behaviour: read the indices from the global namespace.
            cat_features = cat_indices
        model.fit(X_trn, y_trn,
                  eval_set=[(X_val, y_val)],
                  use_best_model=True,
                  cat_features=cat_features)
    else:
        # Any other estimator: plain fit, the validation split is not used.
        model.fit(X_trn, y_trn)
    return model
        
def calculate_metrics(model, metrics, X_trn, y_trn, X_val, y_val):
    """Score ``model`` on the train and validation splits and record the results.

    ``metrics`` maps a metric name to ``{'trn': [...], 'val': [...]}``; one score
    per split is appended in place for every metric name it contains.  Only
    ``'auc'`` (``roc_auc_score``) is known here.  The updated dictionary is
    pretty-printed, followed by an empty line.  Nothing is returned.
    """
    scorers = {'auc': roc_auc_score}
    splits = (('trn', X_trn, y_trn), ('val', X_val, y_val))

    for split_name, features, target in splits:
        # Second column of predict_proba is used as the score for every metric.
        proba = model.predict_proba(features)[:, 1]
        for metric_name in metrics:
            bucket = metrics[metric_name][split_name]
            bucket.append(scorers[metric_name](target, proba))

    pp.pprint(metrics)
    print()

In [34]:
def run_model(model, X_train, y_train, X_val, y_val, X_test,
              metric_names, results=None, dataset_desc='', params_desc=''):
    """Fit ``model``, score it on train/validation, predict the test set and log the run.

    Parameters
    ----------
    model : estimator exposing ``fit`` and ``predict_proba``.
    X_train, y_train : training features / labels.
    X_val, y_val : validation features / labels.
    X_test : features to predict.
    metric_names : iterable of metric names understood by ``calculate_metrics``.
    results : DataFrame or None
        Log of earlier runs; a new one-row log is started when None.
    dataset_desc, params_desc : str
        Free-text labels stored in the ``dataset`` / ``params`` columns of the log.

    Returns
    -------
    (y_test, metrics, results, model)
        ``y_test`` is the second column of ``predict_proba(X_test)``; ``metrics``
        is the per-split score dictionary; ``results`` is the log with one row
        added for this run; ``model`` is the fitted estimator.
    """
    # Class name only, e.g. 'LGBMClassifier'; fit_model dispatches on it.
    model_name = type(model).__name__
    print(model_name, '\n')

    metrics = {metric: {'trn': [], 'val': []} for metric in metric_names}
    start = time.time()

    fit_model(model, model_name, X_train, y_train, X_val, y_val)
    calculate_metrics(model, metrics, X_train, y_train, X_val, y_val)
    y_test = model.predict_proba(X_test)[:, 1]

    # The timing covers fitting, scoring and test-set prediction.
    elapsed = round(time.time() - start, 2)

    # Each score list holds one entry per run_model call, so the std columns are 0.0.
    means = {f'{d}_{m}_mean': np.mean(metrics[m][d]) for m in metrics
                                                     for d in metrics[m]}
    stds = {f'{d}_{m}_std': np.std(metrics[m][d]) for m in metrics
                                                  for d in metrics[m]}
    metadata = {'model': model_name, 'dataset': dataset_desc,
                'params': params_desc, 'time': elapsed}
    pp.pprint(means)

    # DataFrame.append was removed in pandas 2.0; build a one-row frame and concat.
    run_row = pd.DataFrame([{**metadata, **means, **stds}])
    if results is None or results.empty:
        results = run_row
    else:
        results = pd.concat([results, run_row], ignore_index=True)
    return y_test, metrics, results, model

In [35]:
from lightgbm import LGBMClassifier
# Metrics tracked for every run; calculate_metrics only knows 'auc'.
metric_names = ['auc']

In [36]:
%%time
# Run 1: fit on `train`, early-stop / validate on `val`, predict `test`.
# Starts a fresh results log (results=None).
y_test_0409, metrics, results, model = run_model(
    model=LGBMClassifier(n_estimators=1000),
    X_train=train.drop(id_cols + [target_col], axis=1),
    y_train=train[target_col],
    X_val=val.drop(id_cols + [target_col], axis=1),
    y_val=val[target_col],
    X_test=test.drop(id_cols + [target_col, pred_col], axis=1),
    metric_names=metric_names,
    results=None,
    dataset_desc='week_datediffs',
    params_desc='n_estimators=1000',
)

LGBMClassifier 

[1]	valid_0's auc: 0.726659
Training until validation scores don't improve for 30 rounds.
[2]	valid_0's auc: 0.730065
[3]	valid_0's auc: 0.73239
[4]	valid_0's auc: 0.745058
[5]	valid_0's auc: 0.746611
[6]	valid_0's auc: 0.747184
[7]	valid_0's auc: 0.748441
[8]	valid_0's auc: 0.748515
[9]	valid_0's auc: 0.749002
[10]	valid_0's auc: 0.750193
[11]	valid_0's auc: 0.750263
[12]	valid_0's auc: 0.750629
[13]	valid_0's auc: 0.750929
[14]	valid_0's auc: 0.751122
[15]	valid_0's auc: 0.751986
[16]	valid_0's auc: 0.752665
[17]	valid_0's auc: 0.752993
[18]	valid_0's auc: 0.753173
[19]	valid_0's auc: 0.753433
[20]	valid_0's auc: 0.753773
[21]	valid_0's auc: 0.754055
[22]	valid_0's auc: 0.755153
[23]	valid_0's auc: 0.755584
[24]	valid_0's auc: 0.755423
[25]	valid_0's auc: 0.757235
[26]	valid_0's auc: 0.75805
[27]	valid_0's auc: 0.758561
[28]	valid_0's auc: 0.758942
[29]	valid_0's auc: 0.758745
[30]	valid_0's auc: 0.759008
[31]	valid_0's auc: 0.759333
[32]	valid_0's auc: 0.759869
[33]

In [37]:
%%time
## previous week
# Run 2: the roles are swapped — fit on `val`, early-stop / validate on `train`,
# predict `test`.  The row for this run is added to the existing `results` log.
y_test_0416, metrics, results, model = run_model(
    model=LGBMClassifier(n_estimators=100),
    X_train=val.drop(id_cols + [target_col], axis=1),
    y_train=val[target_col],
    X_val=train.drop(id_cols + [target_col], axis=1),
    y_val=train[target_col],
    X_test=test.drop(id_cols + [target_col, pred_col], axis=1),
    metric_names=metric_names,
    results=results,
    dataset_desc='week_datediffs_0416',
    params_desc='n_estimators=100',
)

LGBMClassifier 

[1]	valid_0's auc: 0.719847
Training until validation scores don't improve for 30 rounds.
[2]	valid_0's auc: 0.727257
[3]	valid_0's auc: 0.727494
[4]	valid_0's auc: 0.733128
[5]	valid_0's auc: 0.732506
[6]	valid_0's auc: 0.733892
[7]	valid_0's auc: 0.733932
[8]	valid_0's auc: 0.734775
[9]	valid_0's auc: 0.734852
[10]	valid_0's auc: 0.735387
[11]	valid_0's auc: 0.735663
[12]	valid_0's auc: 0.736812
[13]	valid_0's auc: 0.737172
[14]	valid_0's auc: 0.737268
[15]	valid_0's auc: 0.738106
[16]	valid_0's auc: 0.740057
[17]	valid_0's auc: 0.739857
[18]	valid_0's auc: 0.739982
[19]	valid_0's auc: 0.741471
[20]	valid_0's auc: 0.742623
[21]	valid_0's auc: 0.743457
[22]	valid_0's auc: 0.744619
[23]	valid_0's auc: 0.744891
[24]	valid_0's auc: 0.745553
[25]	valid_0's auc: 0.745355
[26]	valid_0's auc: 0.745335
[27]	valid_0's auc: 0.745714
[28]	valid_0's auc: 0.74617
[29]	valid_0's auc: 0.74626
[30]	valid_0's auc: 0.746489
[31]	valid_0's auc: 0.746917
[32]	valid_0's auc: 0.747551
[33]

In [38]:
# Run log built by run_model: one row per run with timing and train/val AUC.
results

Unnamed: 0,dataset,model,params,time,trn_auc_mean,trn_auc_std,val_auc_mean,val_auc_std
0,week_datediffs,LGBMClassifier,n_estimators=1000,8.05,0.820482,0.0,0.768241,0.0
1,week_datediffs_0416,LGBMClassifier,n_estimators=100,6.97,0.829352,0.0,0.754512,0.0


In [39]:
# Summary statistics of the test-set scores from the run fitted on `train`.
pd.Series(y_test_0409).describe()

count    484758.000000
mean          0.030416
std           0.038004
min           0.002028
25%           0.010079
50%           0.018833
75%           0.041508
max           0.918418
dtype: float64

In [40]:
# Summary statistics of the test-set scores from the run fitted on `val`.
pd.Series(y_test_0416).describe()

count    484758.000000
mean          0.029725
std           0.033532
min           0.001999
25%           0.009525
50%           0.017800
75%           0.043501
max           0.906194
dtype: float64

In [41]:
# Blend: element-wise average of the two runs' test-set scores.
test[target_col] = np.mean([y_test_0409, y_test_0416], axis=0)

In [47]:
# NOTE(review): this replaces whatever is in the target column — including the
# averaged blend assigned in In [41] — with the scores of the second run only.
# Everything in the Submission section is built from this assignment.
test[target_col] = y_test_0416

## Submission

In [48]:
# Re-read the sample submission to get the required row order, keep only its key
# column, and attach the predictions with a left join on PredictionIdx.
submission = pd.merge(
    pd.read_csv(RAW/'sample_submission.csv', low_memory=False)[['PredictionIdx']],
    test[['PredictionIdx', target_col]],
    on='PredictionIdx',
    how='left',
)

In [49]:
# Sanity check after the left join: `count` only counts non-null values, so it
# should equal the number of submission rows; a lower count means some
# PredictionIdx found no match in `test`.
submission[target_col].describe()

count    484758.000000
mean          0.029725
std           0.033532
min           0.001999
25%           0.009525
50%           0.017800
75%           0.043501
max           0.906194
Name: CustomerInterest, dtype: float64

In [50]:
# Preview the final two-column layout (PredictionIdx, CustomerInterest).
submission.head()

Unnamed: 0,PredictionIdx,CustomerInterest
0,a1e0d80784,0.057804
1,c2cc6cc2a8,0.07604
2,a8e94f6344,0.043556
3,758bae1e35,0.043243
4,02ab378ee8,0.043728


In [51]:
# Create the output folder first: to_csv does not create missing directories and
# would fail on a fresh checkout.
SUBMISSIONS.mkdir(parents=True, exist_ok=True)
# Write the submission without the DataFrame index column.
submission.to_csv(SUBMISSIONS/'07-lgbm_week_datediffs_dayscount_0416.csv', index=False)