In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import os, math
import numpy as np, pandas as pd
import matplotlib.pyplot as plt, seaborn as sns
from pandas_summary import DataFrameSummary
from tqdm import tqdm, tqdm_notebook
from pathlib import Path
pd.set_option('display.max_columns', 1000)
pd.set_option('display.max_rows', 400)
sns.set()

os.chdir('../..')
from src import utils

In [2]:
DATA        = Path('data')
RAW         = DATA/'raw'
INTERIM     = DATA/'interim'
PROCESSED   = DATA/'processed'
SUBMISSIONS = DATA/'submissions'

In [3]:
challenge  = pd.read_csv(RAW/'Challenge_20180423.csv', low_memory=False)
customer   = pd.read_csv(RAW/'Customer.csv', low_memory=False)
isin       = pd.read_csv(RAW/'Isin.csv', low_memory=False)
submission = pd.read_csv(RAW/'sample_submission.csv', low_memory=False)
trade      = pd.read_csv(RAW/'Trade.csv', low_memory=False)

In [11]:
from src.utils import make_val_set, add_datediffs

In [115]:
%%time
weeks = [make_val_set(trade[trade.TradeDateKey == d], challenge) \
                     for d in range(20180416, 20180421)]

CPU times: user 2min 27s, sys: 320 ms, total: 2min 28s
Wall time: 2min 28s


In [116]:
for w in weeks:
    print(w.CustomerInterest.value_counts())

0.0    482122
1.0      3744
Name: CustomerInterest, dtype: int64
0.0    482916
1.0      4100
Name: CustomerInterest, dtype: int64
0.0    483195
1.0      4285
Name: CustomerInterest, dtype: int64
0.0    482345
1.0      3993
Name: CustomerInterest, dtype: int64
0.0    483985
1.0      3187
Name: CustomerInterest, dtype: int64


In [117]:
%%time
for w in weeks:
    add_datediffs(w, trade[trade.TradeDateKey > 20180000])

CPU times: user 3min 18s, sys: 468 ms, total: 3min 18s
Wall time: 3min 18s


In [118]:
for w, name in zip(weeks, range(20180416, 20180421)):
    w.to_feather(PROCESSED/f'val_datediffs_{name}.feather')

In [119]:
from src.utils import preprocessing_pipeline

In [120]:
weeks = []
for name in range(20180416, 20180421):
    weeks.append(pd.read_feather(PROCESSED/f'val_datediffs_{name}.feather'))
test = pd.read_feather(PROCESSED/'test_datediffs.feather')

In [121]:
%%time
for i, w in enumerate(weeks):
    weeks[i] = preprocessing_pipeline(w, customer, isin, trade)
test = preprocessing_pipeline(test, customer, isin, trade)

CPU times: user 1.57 s, sys: 0 ns, total: 1.57 s
Wall time: 1.57 s


In [122]:
cat_cols = ['BuySell', 'Sector', 'Subsector', 'Region_x', 'Country', 
            'Seniority', 'Currency', 'ActivityGroup', 
            'Region_y', 'Activity', 'RiskCaptain', 'Owner', 'CompositeRating', 
            'IndustrySector', 'MarketIssue', 'CouponType']

In [123]:
id_cols = ['TradeDateKey', 'CustomerIdx', 'IsinIdx']
target_col = 'CustomerInterest'
pred_col = 'PredictionIdx'

In [124]:
test.drop('DateKey', axis=1, inplace=True)

In [125]:
weeks.append(test)

In [126]:
for w in weeks:
    w.drop(['IndustrySubgroup', 'TickerIdx'], axis=1, inplace=True)

In [127]:
for w in weeks:
    print(w.shape)

(485866, 27)
(487016, 27)
(487480, 27)
(486338, 27)
(487172, 27)
(484758, 28)


## Preprocessing

In [128]:
from src.utils import apply_cats
for col in cat_cols:
    # test set
    weeks[-1][col] = weeks[-1][col].astype('category').cat.as_ordered()
for w in weeks[:-1]:
    apply_cats(w, weeks[-1])

for w in weeks:
    for col in cat_cols:
        w[col] = w[col].cat.codes

## Model

In [129]:
from src.utils import run_model
from lightgbm import LGBMClassifier
metric_names = ['auc']

In [130]:
for i, w in enumerate(weeks[1:]):
    train, val, test = weeks[i], w, weeks[-1]
    print(train['TradeDateKey'].unique(), 
          val['TradeDateKey'].unique(), 
          test['TradeDateKey'].unique())

[20180416] [20180417] [20180423]
[20180417] [20180418] [20180423]
[20180418] [20180419] [20180423]
[20180419] [20180420] [20180423]
[20180420] [20180423] [20180423]


In [131]:
%%time
names = list(range(20180416, 20180421))
results = None
output = []
for i, w in enumerate(weeks[1:]):
    train, val, test = weeks[i], w, weeks[-1]
    X_train, y_train = train.drop(id_cols + [target_col], axis=1), \
                       train[target_col]
    if pred_col in val.columns: # when test acts as validation
        X_val, y_val = None, None
    else:
        X_val, y_val = val.drop(id_cols + [target_col], axis=1), \
                       val[target_col]
    X_test = test.drop(id_cols + [target_col, pred_col], axis=1)
        
    y_test, _, results, model = run_model(
            LGBMClassifier(n_estimators=120),
            X_train, y_train, X_val, y_val, X_test,
            metric_names, results, 
            params_desc='n_estimators=150',
            dataset_desc=f'{names[i]}_datediffs', 
            early_stopping=True)
    output.append([y_test, model])

LGBMClassifier 

[1]	valid_0's auc: 0.718918
Training until validation scores don't improve for 30 rounds.
[2]	valid_0's auc: 0.718993
[3]	valid_0's auc: 0.719268
[4]	valid_0's auc: 0.719207
[5]	valid_0's auc: 0.719213
[6]	valid_0's auc: 0.719228
[7]	valid_0's auc: 0.719235
[8]	valid_0's auc: 0.722083
[9]	valid_0's auc: 0.722033
[10]	valid_0's auc: 0.722123
[11]	valid_0's auc: 0.722246
[12]	valid_0's auc: 0.722066
[13]	valid_0's auc: 0.722033
[14]	valid_0's auc: 0.722711
[15]	valid_0's auc: 0.722855
[16]	valid_0's auc: 0.72289
[17]	valid_0's auc: 0.723014
[18]	valid_0's auc: 0.723523
[19]	valid_0's auc: 0.724547
[20]	valid_0's auc: 0.726379
[21]	valid_0's auc: 0.726874
[22]	valid_0's auc: 0.727143
[23]	valid_0's auc: 0.72764
[24]	valid_0's auc: 0.734725
[25]	valid_0's auc: 0.744568
[26]	valid_0's auc: 0.744925
[27]	valid_0's auc: 0.752086
[28]	valid_0's auc: 0.752056
[29]	valid_0's auc: 0.752243
[30]	valid_0's auc: 0.752277
[31]	valid_0's auc: 0.752514
[32]	valid_0's auc: 0.752711
[33]

In [132]:
# with 2018 data
results

Unnamed: 0,dataset,model,params,time,trn_auc_mean,val_auc_mean
0,20180416_datediffs,LGBMClassifier,n_estimators=150,4.15,0.819803,0.763507
1,20180417_datediffs,LGBMClassifier,n_estimators=150,5.35,0.853344,0.789459
2,20180418_datediffs,LGBMClassifier,n_estimators=150,4.02,0.835253,0.766799
3,20180419_datediffs,LGBMClassifier,n_estimators=150,6.45,0.89042,0.776527
4,20180420_datediffs,LGBMClassifier,n_estimators=150,2.94,0.903803,0.0


In [92]:
# with all data
results

Unnamed: 0,dataset,model,params,time,trn_auc_mean,val_auc_mean
0,20180416_datediffs,LGBMClassifier,n_estimators=150,6.7,0.926833,0.869495
1,20180417_datediffs,LGBMClassifier,n_estimators=150,5.27,0.91054,0.889957
2,20180418_datediffs,LGBMClassifier,n_estimators=150,4.87,0.9091,0.875706
3,20180419_datediffs,LGBMClassifier,n_estimators=150,6.27,0.93998,0.881754
4,20180420_datediffs,LGBMClassifier,n_estimators=150,3.38,0.959137,0.0


In [133]:
# first 5 predictions (2018 data)
np.array([x[0] for x in output])[:,:5]

array([[0.01357491, 0.01017713, 0.01043005, 0.01662145, 0.01284638],
       [0.01140843, 0.0058839 , 0.00649024, 0.01097864, 0.00800061],
       [0.07190851, 0.00664296, 0.01037058, 0.01469428, 0.00908813],
       [0.00712391, 0.00597376, 0.00494475, 0.00411091, 0.00273574],
       [0.01074893, 0.00063901, 0.00325018, 0.0095723 , 0.00240637]])

In [93]:
# first 5 predictions (all data)
np.array([x[0] for x in output])[:,:5]

array([[0.0836018 , 0.00824288, 0.10999873, 0.11227282, 0.1121087 ],
       [0.12736906, 0.00593054, 0.11235133, 0.17877195, 0.11556503],
       [0.15078699, 0.00591629, 0.1151076 , 0.19216083, 0.16283516],
       [0.13264925, 0.0099546 , 0.27889786, 0.32566423, 0.21829754],
       [0.16748621, 0.00122203, 0.32202829, 0.1657942 , 0.27001574]])

In [134]:
y_test = np.mean([x[0] for x in output], axis=0)

In [135]:
y_test[:5]

array([0.02295294, 0.00586335, 0.00709716, 0.01119552, 0.00701545])

In [136]:
test[target_col] = y_test

# Submission

In [137]:
submission = pd.read_csv(RAW/'sample_submission.csv', low_memory=False)
submission = pd.merge(submission[['PredictionIdx']], test[['PredictionIdx', target_col]], how='left', on='PredictionIdx')

In [138]:
submission[target_col].describe()

count    484758.000000
mean          0.005740
std           0.007380
min           0.001169
25%           0.001930
50%           0.002799
75%           0.008413
max           0.287018
Name: CustomerInterest, dtype: float64

In [139]:
submission.head()

Unnamed: 0,PredictionIdx,CustomerInterest
0,a1e0d80784,0.022953
1,c2cc6cc2a8,0.005863
2,a8e94f6344,0.007097
3,758bae1e35,0.011196
4,02ab378ee8,0.007015


In [140]:
submission.to_csv(SUBMISSIONS/'lgbm_days_datediffs_0416-0420.csv', index=False)