In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import os, math
import numpy as np, pandas as pd
import matplotlib.pyplot as plt, seaborn as sns
from pandas_summary import DataFrameSummary
from tqdm import tqdm, tqdm_notebook
from pathlib import Path
# Widen pandas' notebook display limits so wide/long frames are shown in full.
pd.options.display.max_columns = 1000
pd.options.display.max_rows = 400
sns.set()

# Move two levels up so that the `src` package imported on the next line is
# found (presumably the project root; the relative `data/` paths below are
# resolved from there as well).
# Guarded on the presence of `src` so that re-running this cell does not climb
# two more directories each time.
if not Path('src').is_dir():
    os.chdir('../..')
from src import utils

In [2]:
# Data directory layout, relative to the current working directory.
DATA = Path('data')
RAW, INTERIM, PROCESSED, SUBMISSIONS = (
    DATA/subdir for subdir in ('raw', 'interim', 'processed', 'submissions')
)

In [3]:
def _read_raw(filename):
    """Read one CSV from the RAW directory with low_memory disabled."""
    return pd.read_csv(RAW/filename, low_memory=False)

# Raw input tables, one frame per CSV file.
challenge = _read_raw('Challenge_20180423.csv')
customer = _read_raw('Customer.csv')
isin = _read_raw('Isin.csv')
market = _read_raw('Market.csv')
macro = _read_raw('MarketData_Macro.csv')
submission = _read_raw('sample_submission.csv')
trade = _read_raw('Trade.csv')

In [4]:
from src.utils import make_val_set, add_datediffs, add_dayscount, preprocessing_pipeline

In [116]:
# Week boundary labels, integers that appear to be dates in YYYYMMDD form.
# Consecutive pairs (w1, w2) are used below as half-open windows
# w1 <= TradeDateKey < w2 when building `weeks`; the labels are also used to
# name the saved feather files and the per-run dataset descriptions.
# NOTE(review): the labels are 7 days apart except 20180117 -> 20180122
# (5 days) -- confirm this shift is intended.
week_labels = [ 20171108, 20171115, 20171122, 20171129, 20171206,
                20171213, 20171220, 20171227, 20180103, 20180110,
                20180117, 20180122, 20180129, 20180205, 20180212, 
                20180219, 20180226, 20180305, 20180312, 20180319, 
                20180326, 20180402, 20180409, 20180416, 20180423]

In [117]:
%%time
# Build one frame per week: the trades whose TradeDateKey falls in the
# half-open window [w1, w2) are passed to make_val_set together with
# `challenge`.
# The window is selected with a vectorised boolean mask instead of a per-row
# Python lambda in .apply; the selected rows are the same.
trade_dates = trade.TradeDateKey
weeks = []
for w1, w2 in zip(week_labels[:-1], week_labels[1:]):
    in_window = (trade_dates >= w1) & (trade_dates < w2)
    weeks.append(make_val_set(trade[in_window], challenge))

CPU times: user 13min 9s, sys: 2.94 s, total: 13min 12s
Wall time: 13min 12s


In [118]:
%%time
# The trade history handed to both helpers is the same for every week, so the
# filter is evaluated once instead of twice per iteration.
# Assumes the helpers do not modify the trade frame they receive -- TODO confirm.
# NOTE(review): the 2017 weeks also receive only trades with
# TradeDateKey > 20180000 -- confirm that this is intended.
trade_2018 = trade[trade.TradeDateKey > 20180000]
for w in weeks:
    # Return values are ignored; the helpers are called for their effect on `w`.
    add_datediffs(w, trade_2018)
    add_dayscount(w, trade_2018)

CPU times: user 29min 22s, sys: 4.12 s, total: 29min 27s
Wall time: 29min 26s


In [129]:
%%time
# Replace every weekly frame, in place in the list, with the output of the
# preprocessing pipeline.
weeks[:] = [preprocessing_pipeline(week, customer, isin, trade) for week in weeks]

CPU times: user 6.33 s, sys: 1.12 s, total: 7.44 s
Wall time: 7.45 s


In [130]:
# Load the previously saved test frame (week 0423).
# NOTE(review): the save cell further down writes this same file name for the
# last entry of `weeks` (which is this frame), so a run from scratch needs the
# file to exist already -- confirm how it is first produced.
test = pd.read_feather(PROCESSED/'week_0423_diffscount.feather')
#test.drop(['Yield', 'ZSpread', 'Price'], axis=1, inplace=True)

In [131]:
# Show the first row of the loaded test frame.
test.head(1)

Unnamed: 0,index,PredictionIdx,CustomerIdx,IsinIdx,BuySell,CustomerInterest,TradeDateKey,DaysSinceBuySell,DaysSinceTransaction,DaysSinceCustomerActivity,DaysSinceBondActivity,DaysCountBuySell,DaysCountTransaction,DaysCountCustomerActivity,DaysCountBondActivity,Sector,Subsector,Region_x,Country,TickerIdx,ActualMaturityDateKey,IssueDateKey,Seniority,Currency,ActivityGroup,Region_y,Activity,RiskCaptain,Owner,CompositeRating,IndustrySector,IndustrySubgroup,MarketIssue,IssuedAmount,CouponType
0,0,a1e0d80784,1856,13323,Buy,,20180423,296,296,3,3,0,0,6291,34,Asset Managers & Hedge Funds,Independent Asset Manager,Americas,USA,2740,20210315,20130314,SEN,USD,FLOW G10,AMERICAS,CDS AND HY,CDS AND HY,US TMT CDS,B-,"Consumer, Non-cyclic",Printing-Commercial,Global,450000000.0,FIXED


In [134]:
# Drop the `index` column stored in the feather file.
# errors='ignore' makes the cell safe to re-run: the original in-place drop
# raised KeyError once the column was already gone.
test = test.drop(['index'], axis=1, errors='ignore')

In [135]:
# Column layout of the first training week, for comparison with the test frame.
weeks[0].columns

Index(['TradeDateKey', 'CustomerIdx', 'IsinIdx', 'BuySell', 'CustomerInterest',
       'DaysSinceBuySell', 'DaysSinceTransaction', 'DaysSinceCustomerActivity',
       'DaysSinceBondActivity', 'DaysCountBuySell', 'DaysCountTransaction',
       'DaysCountCustomerActivity', 'DaysCountBondActivity', 'Sector',
       'Subsector', 'Region_x', 'Country', 'TickerIdx',
       'ActualMaturityDateKey', 'IssueDateKey', 'Seniority', 'Currency',
       'ActivityGroup', 'Region_y', 'Activity', 'RiskCaptain', 'Owner',
       'CompositeRating', 'IndustrySector', 'IndustrySubgroup', 'MarketIssue',
       'IssuedAmount', 'CouponType'],
      dtype='object')

In [136]:
# Column layout of the test frame; unlike the training weeks it carries a
# PredictionIdx column and has TradeDateKey in a different position.
test.columns

Index(['PredictionIdx', 'CustomerIdx', 'IsinIdx', 'BuySell',
       'CustomerInterest', 'TradeDateKey', 'DaysSinceBuySell',
       'DaysSinceTransaction', 'DaysSinceCustomerActivity',
       'DaysSinceBondActivity', 'DaysCountBuySell', 'DaysCountTransaction',
       'DaysCountCustomerActivity', 'DaysCountBondActivity', 'Sector',
       'Subsector', 'Region_x', 'Country', 'TickerIdx',
       'ActualMaturityDateKey', 'IssueDateKey', 'Seniority', 'Currency',
       'ActivityGroup', 'Region_y', 'Activity', 'RiskCaptain', 'Owner',
       'CompositeRating', 'IndustrySector', 'IndustrySubgroup', 'MarketIssue',
       'IssuedAmount', 'CouponType'],
      dtype='object')

In [137]:
# Put the test frame at the end of `weeks`.
# Only the test frame has a PredictionIdx column, so if the last element
# already has one this cell has been run before: replace it instead of
# appending a duplicate.
if weeks and 'PredictionIdx' in weeks[-1].columns:
    weeks[-1] = test
else:
    weeks.append(test)

In [138]:
%%time
# Persist each frame as week_<MMDD>_diffscount.feather, where MMDD is the last
# four digits of the matching week label.
for name, frame in zip(week_labels, weeks):
    target_file = PROCESSED/f'week_{name % 10000:04}_diffscount.feather'
    frame.reset_index().to_feather(target_file)

CPU times: user 3min 28s, sys: 9.72 s, total: 3min 37s
Wall time: 3min 36s


# Preprocessing

In [141]:
# Columns treated as categorical: cast to ordered categories on the test frame
# and replaced by their integer category codes in every frame below.
cat_cols = ['BuySell', 'Sector', 'Subsector', 'Region_x', 'Country', 
            'TickerIdx', 'Seniority', 'Currency', 'ActivityGroup', 
            'Region_y', 'Activity', 'RiskCaptain', 'Owner', 'CompositeRating', 
            'IndustrySector', 'IndustrySubgroup', 'MarketIssue', 'CouponType']

In [142]:
# Identifier columns, dropped from the feature matrices before fitting.
id_cols = ['TradeDateKey', 'CustomerIdx', 'IsinIdx']
# Label column: the training target, later overwritten on the test frame with
# the blended predictions.
target_col = 'CustomerInterest'
# Row identifier present only on the test frame; used as the merge key for the
# submission file.
pred_col = 'PredictionIdx'

In [143]:
from src.utils import apply_cats
# Cast the categorical columns of the last frame in `weeks` (the test frame)
# to ordered pandas categoricals.
test_frame = weeks[-1]
for col in cat_cols:
    as_category = test_frame[col].astype('category')
    test_frame[col] = as_category.cat.as_ordered()

In [144]:
# Run apply_cats on every training week, passing the test frame (last element
# of `weeks`) as the second argument.
reference = weeks[-1]
for frame in weeks[:-1]:
    apply_cats(frame, reference)

In [145]:
# Replace each categorical column by its integer category codes.
# Columns that already hold integers are skipped so the cell can be re-run:
# the original raised AttributeError on a second run because `.cat` no longer
# exists once the codes have been written back.
for w in weeks:
    for col in cat_cols:
        if w[col].dtype.kind in 'iu':
            continue  # already integer-coded
        w[col] = w[col].cat.codes

In [146]:
# Inspect the first training week after the categorical columns were replaced
# by integer codes.
weeks[0].head()

Unnamed: 0,TradeDateKey,CustomerIdx,IsinIdx,BuySell,CustomerInterest,DaysSinceBuySell,DaysSinceTransaction,DaysSinceCustomerActivity,DaysSinceBondActivity,DaysCountBuySell,DaysCountTransaction,DaysCountCustomerActivity,DaysCountBondActivity,Sector,Subsector,Region_x,Country,TickerIdx,ActualMaturityDateKey,IssueDateKey,Seniority,Currency,ActivityGroup,Region_y,Activity,RiskCaptain,Owner,CompositeRating,IndustrySector,IndustrySubgroup,MarketIssue,IssuedAmount,CouponType
0,20171108,0,21856,0,0.0,130,130,130,130,0,0,0,0,0,21,1,30,1555,20190114,20140114,4,15,1,1,1,1,0,7,6,228,5,600000000.0,0
1,20171108,0,21856,1,0.0,130,130,130,130,0,0,0,0,0,21,1,30,1555,20190114,20140114,4,15,1,1,1,1,0,7,6,228,5,600000000.0,0
2,20171108,0,24944,0,0.0,130,130,130,130,0,0,0,0,0,21,1,30,1089,20230817,20160817,4,15,1,1,1,1,0,23,6,210,5,590000000.0,0
3,20171108,0,24944,1,0.0,130,130,130,130,0,0,0,0,0,21,1,30,1089,20230817,20160817,4,15,1,1,1,1,0,23,6,210,5,590000000.0,0
4,20171108,0,25992,0,0.0,130,130,130,130,0,0,0,0,0,21,1,30,601,20200601,20170601,4,15,1,1,1,1,0,10,1,140,5,500000000.0,0


# Train/Test/Val

In [147]:
from src.utils import run_model
from lightgbm import LGBMClassifier
# Metric names handed to run_model for every weekly fit below.
metric_names = ['auc']

In [148]:
# Sanity check of the rolling split: each week trains, the following week
# validates, and the last frame is always the test set.
test = weeks[-1]
for train, val in zip(weeks[:-1], weeks[1:]):
    split_dates = [frame['TradeDateKey'].unique() for frame in (train, val, test)]
    print(*split_dates)

[20171108] [20171115] [20180423]
[20171115] [20171122] [20180423]
[20171122] [20171129] [20180423]
[20171129] [20171206] [20180423]
[20171206] [20171213] [20180423]
[20171213] [20171220] [20180423]
[20171220] [20171227] [20180423]
[20171227] [20180103] [20180423]
[20180103] [20180110] [20180423]
[20180110] [20180117] [20180423]
[20180117] [20180122] [20180423]
[20180122] [20180129] [20180423]
[20180129] [20180205] [20180423]
[20180205] [20180212] [20180423]
[20180212] [20180219] [20180423]
[20180219] [20180226] [20180423]
[20180226] [20180305] [20180423]
[20180305] [20180312] [20180423]
[20180312] [20180319] [20180423]
[20180319] [20180326] [20180423]
[20180326] [20180402] [20180423]
[20180402] [20180409] [20180423]
[20180409] [20180416] [20180423]
[20180416] [20180423] [20180423]


In [149]:
%%time
# Fit one LightGBM model per week: week i trains, week i+1 validates, and every
# model predicts the same test frame. The per-run metrics accumulate in
# `results`; `output` collects [test predictions, fitted model] per run.
results = None
output = []
test = weeks[-1]
n_estimators = 120
non_feature_cols = id_cols + [target_col]
# The test features are the same for every run, so build them once instead of
# dropping the columns again in each iteration.
# Assumes run_model does not modify X_test in place -- TODO confirm.
X_test = test.drop(non_feature_cols + [pred_col], axis=1)
for i, (train, val) in enumerate(zip(weeks[:-1], weeks[1:])):
    X_train, y_train = train.drop(non_feature_cols, axis=1), train[target_col]
    if pred_col in val.columns: # when test acts as validation
        X_val, y_val = None, None
    else:
        X_val, y_val = val.drop(non_feature_cols, axis=1), val[target_col]

    y_test, _, results, model = run_model(
            LGBMClassifier(n_estimators=n_estimators),
            X_train, y_train, X_val, y_val, X_test,
            metric_names, results,
            params_desc=f'n_estimators={n_estimators}',
            dataset_desc=f'{week_labels[i]}_diffcounts',
            early_stopping=True)
    output.append([y_test, model])

LGBMClassifier 

[1]	valid_0's auc: 0.569806
Training until validation scores don't improve for 30 rounds.
[2]	valid_0's auc: 0.592498
[3]	valid_0's auc: 0.58022
[4]	valid_0's auc: 0.582751
[5]	valid_0's auc: 0.590749
[6]	valid_0's auc: 0.588521
[7]	valid_0's auc: 0.590185
[8]	valid_0's auc: 0.591118
[9]	valid_0's auc: 0.594498
[10]	valid_0's auc: 0.595219
[11]	valid_0's auc: 0.595238
[12]	valid_0's auc: 0.598705
[13]	valid_0's auc: 0.596135
[14]	valid_0's auc: 0.596276
[15]	valid_0's auc: 0.59826
[16]	valid_0's auc: 0.601323
[17]	valid_0's auc: 0.600614
[18]	valid_0's auc: 0.601222
[19]	valid_0's auc: 0.60452
[20]	valid_0's auc: 0.601569
[21]	valid_0's auc: 0.602129
[22]	valid_0's auc: 0.603724
[23]	valid_0's auc: 0.604426
[24]	valid_0's auc: 0.606522
[25]	valid_0's auc: 0.608279
[26]	valid_0's auc: 0.609551
[27]	valid_0's auc: 0.610268
[28]	valid_0's auc: 0.609828
[29]	valid_0's auc: 0.611219
[30]	valid_0's auc: 0.611484
[31]	valid_0's auc: 0.612942
[32]	valid_0's auc: 0.613369
[33]	

In [150]:
# Metrics table returned by run_model, one row per weekly fit.
results

Unnamed: 0,dataset,model,params,time,trn_auc_mean,val_auc_mean
0,20171108_diffcounts,LGBMClassifier,n_estimators=120,9.52,0.748724,0.631026
1,20171115_diffcounts,LGBMClassifier,n_estimators=120,9.78,0.734186,0.627338
2,20171122_diffcounts,LGBMClassifier,n_estimators=120,11.01,0.746524,0.582733
3,20171129_diffcounts,LGBMClassifier,n_estimators=120,11.99,0.748436,0.637865
4,20171206_diffcounts,LGBMClassifier,n_estimators=120,8.69,0.746272,0.639137
5,20171213_diffcounts,LGBMClassifier,n_estimators=120,9.59,0.740567,0.652372
6,20171220_diffcounts,LGBMClassifier,n_estimators=120,11.02,0.767265,0.645033
7,20171227_diffcounts,LGBMClassifier,n_estimators=120,11.88,0.840738,0.631174
8,20180103_diffcounts,LGBMClassifier,n_estimators=120,9.4,0.768477,0.603876
9,20180110_diffcounts,LGBMClassifier,n_estimators=120,6.89,0.712174,0.613164


In [163]:
# Blend the per-week test predictions into a weighted average in which later
# runs weigh more. Every `n_weeks` runs the step `acum` grows by `dif` and is
# added to the running weight `mul`, giving 1.5, 3.5, 6.0, 9.0, 12.5, ...
# The loop runs over `output` itself (the original took its length from
# `results` while indexing `output`), and the unused `jump` is gone.
test[target_col] = 0
total = 0
n_weeks = 5
dif = 0.5
acum = 1
mul = 0
for ix, (y_pred, _) in enumerate(output):
    if ix % n_weeks == 0:
        acum += dif
        mul = mul + acum
    print(mul)
    test[target_col] += mul * y_pred
    total += mul
# Normalise by the sum of the weights.
test[target_col] /= total

1.5
1.5
1.5
1.5
1.5
3.5
3.5
3.5
3.5
3.5
6.0
6.0
6.0
6.0
6.0
9.0
9.0
9.0
9.0
9.0
12.5
12.5
12.5
12.5


In [164]:
# Sum of the blending weights used as the normaliser above.
total

150.0

# Submission

In [165]:
# Attach the blended predictions to the PredictionIdx values of the sample
# submission; the left join keeps the sample file's rows.
sample_ids = pd.read_csv(RAW/'sample_submission.csv', low_memory=False)[['PredictionIdx']]
predictions = test[['PredictionIdx', target_col]]
submission = sample_ids.merge(predictions, how='left', on='PredictionIdx')

In [167]:
# Write the submission file without the DataFrame index.
submission_path = SUBMISSIONS/'12-lgbm_8weeks_diffscount_20171213-0416_with_weights.csv'
submission.to_csv(submission_path, index=False)