In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import os, math
import numpy as np, pandas as pd
import matplotlib.pyplot as plt, seaborn as sns
from pandas_summary import DataFrameSummary
from tqdm import tqdm, tqdm_notebook
from pathlib import Path
# Let wide / long frames render in full when they are inspected in a cell.
pd.set_option('display.max_columns', 1000)
pd.set_option('display.max_rows', 400)
sns.set()

# Move from the notebook's folder up to the project root so that `src` is
# importable and the relative `data/...` paths resolve. The guard keeps the
# cell idempotent: re-running it (or starting the kernel at the root) must
# not climb two more directories.
if not Path('src').is_dir():
    os.chdir('../..')
from src import utils

In [2]:
# Data folders, all relative to the current working directory.
DATA = Path('data')
RAW, INTERIM, PROCESSED, SUBMISSIONS = (
    DATA / stage for stage in ('raw', 'interim', 'processed', 'submissions')
)

In [3]:
def _load_raw(filename):
    """Read one CSV from the RAW folder with pandas' low_memory mode off."""
    return pd.read_csv(RAW/filename, low_memory=False)

# Raw input tables, each bound to the name the later cells use.
challenge  = _load_raw('Challenge_20180423.csv')
customer   = _load_raw('Customer.csv')
isin       = _load_raw('Isin.csv')
submission = _load_raw('sample_submission.csv')
trade      = _load_raw('Trade.csv')

In [18]:
from src.utils import make_val_set, add_datediffs, add_dayscount, preprocessing_pipeline

In [6]:
# Week boundaries as YYYYMMDD integers, seven days apart. Consecutive pairs
# delimit the training weeks; the final entry matches the date in the
# challenge file name.
week_labels = [
    20180226,
    20180305,
    20180312,
    20180319,
    20180326,
    20180402,
    20180409,
    20180416,
    20180423,
]

In [8]:
%%time
weeks = [make_val_set(trade[trade.TradeDateKey.apply(lambda x: w1<=x<w2)], 
                    challenge) for w1, w2 in zip(week_labels[:-1], week_labels[1:])]

CPU times: user 4min 39s, sys: 1.24 s, total: 4min 40s
Wall time: 4min 40s


In [16]:
# Class balance of the CustomerInterest label in every weekly frame.
for weekly_frame in weeks:
    label_counts = weekly_frame['CustomerInterest'].value_counts()
    print(label_counts)

0.0    663675
1.0     19353
Name: CustomerInterest, dtype: int64
0.0    472091
1.0     16685
Name: CustomerInterest, dtype: int64
0.0    471135
1.0     18231
Name: CustomerInterest, dtype: int64
0.0    488248
1.0     15498
Name: CustomerInterest, dtype: int64
0.0    673034
1.0     16790
Name: CustomerInterest, dtype: int64
0.0    486290
1.0     14552
Name: CustomerInterest, dtype: int64
0.0    475374
1.0     17532
Name: CustomerInterest, dtype: int64
0.0    476311
1.0     17279
Name: CustomerInterest, dtype: int64


In [20]:
%%time
for w in weeks:
    add_datediffs(w, trade[trade.TradeDateKey > 20180000])
    add_dayscount(w, trade[trade.TradeDateKey > 20180000])

CPU times: user 9min 54s, sys: 3.8 s, total: 9min 58s
Wall time: 10min 2s


In [30]:
%%time
for i, w in enumerate(weeks):
    weeks[i] = preprocessing_pipeline(w, customer, isin, trade)

CPU times: user 2.02 s, sys: 716 ms, total: 2.74 s
Wall time: 3.51 s


In [26]:
# Load the already-featurised frame for the test week (20180423).
test = pd.read_feather(PROCESSED/'week_0423_diffscount.feather')
# errors='ignore': the export cell rewrites this very file from the trimmed
# frame, so on a later run 'DateKey' is no longer present and a strict drop
# would raise KeyError.
test = test.drop(['DateKey', 'index'], axis=1, errors='ignore')
# `weeks` holds len(week_labels) - 1 training frames before the test frame is
# added. Append only once; on a re-run replace the last slot instead of
# growing the list with a duplicate test frame.
if len(weeks) < len(week_labels):
    weeks.append(test)
else:
    weeks[-1] = test

In [38]:
%%time
for w, name in zip(weeks, week_labels):
    w.reset_index().to_feather(PROCESSED/f'week_{name % 10000:04}_diffscount.feather')

CPU times: user 11.6 s, sys: 4.14 s, total: 15.7 s
Wall time: 19.6 s


## Preprocessing

In [39]:
# Columns that get converted to pandas categories and then replaced by their
# integer category codes before modelling.
cat_cols = [
    'BuySell',
    'Sector', 'Subsector',
    'Region_x', 'Country',
    'TickerIdx', 'Seniority', 'Currency',
    'ActivityGroup', 'Region_y', 'Activity',
    'RiskCaptain', 'Owner', 'CompositeRating',
    'IndustrySector', 'IndustrySubgroup',
    'MarketIssue', 'CouponType',
]

In [40]:
# Label column the classifier is trained on.
target_col = 'CustomerInterest'
# Row identifier used to join predictions onto the sample submission; the
# training loop treats a frame carrying it as the test frame.
pred_col = 'PredictionIdx'
# Key columns (date, customer, ISIN) removed from the feature matrix.
id_cols = ['TradeDateKey', 'CustomerIdx', 'IsinIdx']

In [46]:
from src.utils import apply_cats

# The last frame (the test week) defines the category vocabulary.
reference = weeks[-1]
for name in cat_cols:
    as_category = reference[name].astype('category')
    reference[name] = as_category.cat.as_ordered()

# apply_cats (src.utils) is given each earlier frame together with the
# reference -- presumably to reuse the reference's categories; confirm there.
for frame in weeks[:-1]:
    apply_cats(frame, reference)

# Swap every categorical column for its integer category codes.
for frame in weeks:
    for name in cat_cols:
        frame[name] = frame[name].cat.codes

## Model

In [42]:
from src.utils import run_model
from lightgbm import LGBMClassifier
# Metric names handed to run_model in the training loop; the results table
# reports them as trn_auc_mean / val_auc_mean.
metric_names = ['auc']

In [43]:
# Sanity check of the rolling scheme: each frame is paired with the following
# one as validation, and the last frame is always the test set. Prints the
# distinct TradeDateKey values of the three frames per pairing.
for train, val in zip(weeks, weeks[1:]):
    test = weeks[-1]
    date_keys = [frame['TradeDateKey'].unique() for frame in (train, val, test)]
    print(*date_keys)

[20180226] [20180305] [20180423]
[20180305] [20180312] [20180423]
[20180312] [20180319] [20180423]
[20180319] [20180326] [20180423]
[20180326] [20180402] [20180423]
[20180402] [20180409] [20180423]
[20180409] [20180416] [20180423]
[20180416] [20180423] [20180423]


In [47]:
%%time
results = None
output = []
for i, w in enumerate(weeks[1:]):
    train, val, test = weeks[i], w, weeks[-1]
    X_train, y_train = train.drop(id_cols + [target_col], axis=1), \
                       train[target_col]
    if pred_col in val.columns: # when test acts as validation
        X_val, y_val = None, None
    else:
        X_val, y_val = val.drop(id_cols + [target_col], axis=1), \
                       val[target_col]
    X_test = test.drop(id_cols + [target_col, pred_col], axis=1)
        
    y_test, _, results, model = run_model(
            LGBMClassifier(n_estimators=120),
            X_train, y_train, X_val, y_val, X_test,
            metric_names, results, 
            params_desc='n_estimators=120',
            dataset_desc=f'{week_labels[i]}_diffcounts', 
            early_stopping=True)
    output.append([y_test, model])

LGBMClassifier 

[1]	valid_0's auc: 0.588856
Training until validation scores don't improve for 30 rounds.
[2]	valid_0's auc: 0.589327
[3]	valid_0's auc: 0.591072
[4]	valid_0's auc: 0.596908
[5]	valid_0's auc: 0.598766
[6]	valid_0's auc: 0.606072
[7]	valid_0's auc: 0.610939
[8]	valid_0's auc: 0.61618
[9]	valid_0's auc: 0.619665
[10]	valid_0's auc: 0.619252
[11]	valid_0's auc: 0.617324
[12]	valid_0's auc: 0.616155
[13]	valid_0's auc: 0.617634
[14]	valid_0's auc: 0.617795
[15]	valid_0's auc: 0.618791
[16]	valid_0's auc: 0.618599
[17]	valid_0's auc: 0.620581
[18]	valid_0's auc: 0.621722
[19]	valid_0's auc: 0.624371
[20]	valid_0's auc: 0.624872
[21]	valid_0's auc: 0.626597
[22]	valid_0's auc: 0.626039
[23]	valid_0's auc: 0.628256
[24]	valid_0's auc: 0.628355
[25]	valid_0's auc: 0.628738
[26]	valid_0's auc: 0.628607
[27]	valid_0's auc: 0.630683
[28]	valid_0's auc: 0.631619
[29]	valid_0's auc: 0.631028
[30]	valid_0's auc: 0.631737
[31]	valid_0's auc: 0.633127
[32]	valid_0's auc: 0.633541
[33

In [55]:
# Sanity check: eight training weeks plus the appended test frame.
len(weeks)

9

In [48]:
# One row per weekly model, as accumulated by run_model. The last row has
# val_auc_mean 0.0 -- presumably because that pairing passes no validation
# data (X_val is None when the test frame acts as validation).
results

Unnamed: 0,dataset,model,params,time,trn_auc_mean,val_auc_mean
0,20180226_diffcounts,LGBMClassifier,n_estimators=120,9.38,0.79324,0.649368
1,20180305_diffcounts,LGBMClassifier,n_estimators=120,6.13,0.747396,0.686385
2,20180312_diffcounts,LGBMClassifier,n_estimators=120,7.39,0.77542,0.721112
3,20180319_diffcounts,LGBMClassifier,n_estimators=120,6.34,0.76432,0.677241
4,20180326_diffcounts,LGBMClassifier,n_estimators=120,8.73,0.827969,0.700111
5,20180402_diffcounts,LGBMClassifier,n_estimators=120,6.22,0.790773,0.729939
6,20180409_diffcounts,LGBMClassifier,n_estimators=120,8.17,0.820482,0.768241
7,20180416_diffcounts,LGBMClassifier,n_estimators=120,4.36,0.840449,0.0


In [49]:
# Peek at the first 5 test predictions of every model (all trained on 2018
# weeks); one row per model.
per_model_preds = np.array([entry[0] for entry in output])
per_model_preds[:, :5]

array([[0.05339676, 0.16143834, 0.02214048, 0.02385522, 0.01883884],
       [0.0365234 , 0.10768938, 0.03010835, 0.06302508, 0.02698126],
       [0.03948132, 0.0761436 , 0.0339043 , 0.11284432, 0.02213062],
       [0.03558743, 0.08358533, 0.02921065, 0.03367553, 0.0285687 ],
       [0.02698953, 0.08557128, 0.01712705, 0.02971009, 0.01640177],
       [0.05482374, 0.10795018, 0.04167132, 0.05552353, 0.04909736],
       [0.06399546, 0.11869492, 0.02254699, 0.04318857, 0.02586204],
       [0.06198428, 0.07739853, 0.0432419 , 0.03850887, 0.03969475]])

In [50]:
# Blend the weekly models: unweighted mean of their test predictions,
# written into the test frame's target column.
stacked_preds = np.array([entry[0] for entry in output])
test[target_col] = stacked_preds.mean(axis=0)

## Submission

In [51]:
# Re-read the template on every run so the cell does not depend on an
# earlier, possibly already merged, `submission` frame.
submission = pd.read_csv(RAW/'sample_submission.csv', low_memory=False)
n_expected = len(submission)
# Left join keeps the template's row order and attaches the blended score.
submission = pd.merge(submission[['PredictionIdx']], test[['PredictionIdx', target_col]],
                      how='left', on='PredictionIdx')
# A left join fails silently: ids missing from `test` become NaN scores and
# repeated ids in `test` multiply rows. Stop here rather than write a broken
# submission file.
assert len(submission) == n_expected, \
    f'merge changed the row count: {len(submission)} != {n_expected}'
assert submission[target_col].notna().all(), \
    'some PredictionIdx values have no prediction'

In [52]:
# Distribution check on the blended scores before the file is written.
submission[target_col].describe()

count    484758.000000
mean          0.030839
std           0.031409
min           0.003790
25%           0.013553
50%           0.024419
75%           0.038912
max           0.857536
Name: CustomerInterest, dtype: float64

In [53]:
# Eyeball the first rows: one PredictionIdx and one score per row.
submission.head()

Unnamed: 0,PredictionIdx,CustomerInterest
0,a1e0d80784,0.046598
1,c2cc6cc2a8,0.102309
2,a8e94f6344,0.029994
3,758bae1e35,0.050041
4,02ab378ee8,0.028447


In [54]:
# to_csv does not create missing folders, so make sure the submissions
# directory exists (e.g. on a fresh checkout) before writing.
SUBMISSIONS.mkdir(parents=True, exist_ok=True)
submission.to_csv(SUBMISSIONS/'08-lgbm_8weeks_diffscount_0226-0416.csv', index=False)