In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import os, math
import numpy as np, pandas as pd
import matplotlib.pyplot as plt, seaborn as sns
from pandas_summary import DataFrameSummary
from tqdm import tqdm, tqdm_notebook
from pathlib import Path
# Widen pandas' display limits so wide/long frames are shown in full,
# then apply seaborn's default plot styling.
for option_name, option_value in {'display.max_columns': 1000,
                                  'display.max_rows': 400}.items():
    pd.set_option(option_name, option_value)
sns.set()

# Work from two directories above the notebook so that the `src` import below
# and the relative `data/...` paths resolve. The resulting directory is
# remembered on first execution so that re-running this cell returns to the
# same place instead of climbing two more levels each time.
if 'PROJECT_ROOT' not in globals():
    os.chdir('../..')
    PROJECT_ROOT = Path.cwd()
else:
    os.chdir(PROJECT_ROOT)
from src import utils

In [2]:
# Data directory layout, relative to the current working directory.
DATA = Path('data')
RAW, INTERIM, PROCESSED, SUBMISSIONS = (
    DATA / subdir for subdir in ('raw', 'interim', 'processed', 'submissions')
)

In [131]:
# Eleven consecutive weekly date keys (YYYYMMDD as int), 7 days apart,
# from 2018-02-12 through 2018-04-23.
week_labels = [
    int(day.strftime('%Y%m%d'))
    for day in pd.date_range('2018-02-12', periods=11, freq='7D')
]

In [134]:
%%time
# One frame per week label; the file name uses the MMDD part of the label
# (label % 10000, zero-padded to four digits).
weeks = [
    pd.read_feather(PROCESSED/f'week_{name % 10000:04}_diffscount.feather')
    for name in week_labels
]

CPU times: user 3.41 s, sys: 2.28 s, total: 5.7 s
Wall time: 6.63 s


In [135]:
# Remove the 'index' column stored in each weekly file (presumably a
# reset_index artifact from when the feather files were written -- TODO confirm).
# errors='ignore' keeps the cell re-runnable: on a second execution the column
# is already gone and a plain drop would raise KeyError.
for frame in weeks:
    frame.drop(columns='index', inplace=True, errors='ignore')

In [173]:
# Columns of the first weekly frame (label 20180212).
weeks[0].columns

Index(['Activity', 'ActivityGroup', 'ActualMaturityDateKey', 'BuySell',
       'CompositeRating', 'Country', 'CouponType', 'Currency', 'CustomerIdx',
       'CustomerInterest', 'DaysCountBondActivity', 'DaysCountBondBuySell',
       'DaysCountBuySell', 'DaysCountCustomerActivity',
       'DaysCountCustomerBuySell', 'DaysCountTransaction',
       'DaysSinceBondActivity', 'DaysSinceBondBuySell', 'DaysSinceBuySell',
       'DaysSinceCustomerActivity', 'DaysSinceCustomerBuySell',
       'DaysSinceTransaction', 'IndustrySector', 'IndustrySubgroup', 'IsinIdx',
       'IssueDateKey', 'IssuedAmount', 'MarketIssue', 'Owner', 'Region_x',
       'Region_y', 'RiskCaptain', 'Sector', 'Seniority', 'Subsector',
       'TickerIdx', 'TradeDateKey'],
      dtype='object')

In [174]:
# Columns of the last weekly frame (label 20180423), for comparison with the
# first week's columns.
weeks[-1].columns

Index(['PredictionIdx', 'Activity', 'ActivityGroup', 'ActualMaturityDateKey',
       'BuySell', 'CompositeRating', 'Country', 'CouponType', 'Currency',
       'CustomerIdx', 'CustomerInterest', 'DaysCountBondActivity',
       'DaysCountBuySell', 'DaysCountCustomerActivity', 'DaysCountTransaction',
       'DaysSinceBondActivity', 'DaysSinceBuySell',
       'DaysSinceCustomerActivity', 'DaysSinceTransaction', 'IndustrySector',
       'IndustrySubgroup', 'IsinIdx', 'IssueDateKey', 'IssuedAmount',
       'MarketIssue', 'Owner', 'Region_x', 'Region_y', 'RiskCaptain', 'Sector',
       'Seniority', 'Subsector', 'TickerIdx', 'TradeDateKey'],
      dtype='object')

# Preprocessing

In [138]:
# Columns treated as categorical features.
# (An earlier variant of this list left out 'Subsector' and 'Country'.)
cat_cols = [
    'BuySell', 'Sector', 'Subsector', 'Region_x', 'Country', 'TickerIdx',
    'Seniority', 'Currency', 'ActivityGroup', 'Region_y', 'Activity',
    'RiskCaptain', 'Owner', 'CompositeRating', 'IndustrySector',
    'IndustrySubgroup', 'MarketIssue', 'CouponType',
]

# Identifier columns (dropped before fitting), the label column, and the
# row id column that only the prediction week carries.
id_cols = ['TradeDateKey', 'CustomerIdx', 'IsinIdx']
target_col, pred_col = 'CustomerInterest', 'PredictionIdx'

In [172]:
# Columns of the first week that are not categorical, identifier, target or
# prediction-id columns.
accounted_for = set(cat_cols + id_cols + [target_col, pred_col])
[name for name in weeks[0].columns if name not in accounted_for]

['ActualMaturityDateKey',
 'DaysCountBondActivity',
 'DaysCountBondBuySell',
 'DaysCountBuySell',
 'DaysCountCustomerActivity',
 'DaysCountCustomerBuySell',
 'DaysCountTransaction',
 'DaysSinceBondActivity',
 'DaysSinceBondBuySell',
 'DaysSinceBuySell',
 'DaysSinceCustomerActivity',
 'DaysSinceCustomerBuySell',
 'DaysSinceTransaction',
 'IssueDateKey',
 'IssuedAmount']

In [140]:
# Report per-week null counts and collect every column that has nulls in at
# least one week.
nan_cols = set()
for label, frame in zip(week_labels, weeks):
    # Vectorised per-column null count (replaces a Python-level sum per column).
    null_counts = frame.isnull().sum()
    null_counts = null_counts[null_counts > 0]
    nan_cols.update(null_counts.index)
    print(label)
    print(null_counts)
    print()
# The target is left out so that it is never filled with a sentinel value;
# sorting gives a stable order across runs.
nan_cols = sorted(nan_cols - {target_col})

20180212
Subsector           15468
IndustrySector         38
IndustrySubgroup       38
MarketIssue            38
dtype: int64

20180219
Subsector           16966
IndustrySector         40
IndustrySubgroup       40
MarketIssue            40
dtype: int64

20180226
Subsector           30816
IndustrySector         48
IndustrySubgroup       48
MarketIssue            48
dtype: int64

20180305
Subsector           15002
IndustrySector         40
IndustrySubgroup       40
MarketIssue            40
dtype: int64

20180312
Subsector           15086
IndustrySector         38
IndustrySubgroup       38
MarketIssue            38
dtype: int64

20180319
Subsector           15054
IndustrySector         44
IndustrySubgroup       44
MarketIssue            44
dtype: int64

20180326
Subsector           34726
IndustrySector         92
IndustrySubgroup       92
MarketIssue            92
dtype: int64

20180402
Subsector           15406
IndustrySector         38
IndustrySubgroup       38
MarketIssue            3

In [141]:
# Replace nulls in the affected columns with the sentinel -999, in place.
sentinel_by_column = dict.fromkeys(nan_cols, -999)
for frame in weeks:
    frame.fillna(sentinel_by_column, inplace=True)

In [142]:
# Re-check null counts after filling: every week should print an empty Series,
# except for the target column in weeks where it was null to begin with.
for label, frame in zip(week_labels, weeks):
    null_counts = frame.isnull().sum()  # vectorised per-column null count
    print(label)
    print(null_counts[null_counts > 0])
    print()

20180212
Series([], dtype: int64)

20180219
Series([], dtype: int64)

20180226
Series([], dtype: int64)

20180305
Series([], dtype: int64)

20180312
Series([], dtype: int64)

20180319
Series([], dtype: int64)

20180326
Series([], dtype: int64)

20180402
Series([], dtype: int64)

20180409
Series([], dtype: int64)

20180416
Series([], dtype: int64)

20180423
CustomerInterest    484758
dtype: int64



In [187]:
# Drop the four Bond/Customer BuySell count and recency features from every
# week except the last one, so that all frames end up with the same feature
# columns. errors='ignore' keeps the cell re-runnable (a second plain drop
# would raise KeyError because the columns are already gone).
buysell_pair_features = ['DaysCountBondBuySell', 'DaysCountCustomerBuySell',
                         'DaysSinceBondBuySell', 'DaysSinceCustomerBuySell']
for frame in weeks[:-1]:
    frame.drop(columns=buysell_pair_features, inplace=True, errors='ignore')

In [188]:
# Columns of the first weekly frame after the drop above.
weeks[0].columns

Index(['Activity', 'ActivityGroup', 'ActualMaturityDateKey', 'BuySell',
       'CompositeRating', 'Country', 'CouponType', 'Currency', 'CustomerIdx',
       'CustomerInterest', 'DaysCountBondActivity', 'DaysCountBuySell',
       'DaysCountCustomerActivity', 'DaysCountTransaction',
       'DaysSinceBondActivity', 'DaysSinceBuySell',
       'DaysSinceCustomerActivity', 'DaysSinceTransaction', 'IndustrySector',
       'IndustrySubgroup', 'IsinIdx', 'IssueDateKey', 'IssuedAmount',
       'MarketIssue', 'Owner', 'Region_x', 'Region_y', 'RiskCaptain', 'Sector',
       'Seniority', 'Subsector', 'TickerIdx', 'TradeDateKey'],
      dtype='object')

In [189]:
# Columns of the last weekly frame, to compare against the first week's.
weeks[-1].columns

Index(['Activity', 'ActivityGroup', 'ActualMaturityDateKey', 'BuySell',
       'CompositeRating', 'Country', 'CouponType', 'Currency', 'CustomerIdx',
       'CustomerInterest', 'DaysCountBondActivity', 'DaysCountBuySell',
       'DaysCountCustomerActivity', 'DaysCountTransaction',
       'DaysSinceBondActivity', 'DaysSinceBuySell',
       'DaysSinceCustomerActivity', 'DaysSinceTransaction', 'IndustrySector',
       'IndustrySubgroup', 'IsinIdx', 'IssueDateKey', 'IssuedAmount',
       'MarketIssue', 'Owner', 'Region_x', 'Region_y', 'RiskCaptain', 'Sector',
       'Seniority', 'Subsector', 'TickerIdx', 'TradeDateKey', 'PredictionIdx'],
      dtype='object')

In [151]:
# Number of columns in the first weekly frame.
len(weeks[0].columns)

37

In [158]:
# Put every frame's columns in the same (alphabetical) order so that feature
# positions line up across weeks. DataFrame.reindex_axis was removed in
# pandas 1.0; reindex(columns=...) is the supported equivalent.
for position, frame in enumerate(weeks):
    weeks[position] = frame.reindex(columns=sorted(frame.columns))

In [177]:
# Move 'PredictionIdx' to the end of the last week's columns, keeping the other
# columns in their current order. Uses reindex(columns=...) because
# DataFrame.reindex_axis was removed in pandas 1.0.
weeks[-1] = weeks[-1].reindex(
    columns=list(weeks[-1].columns.drop('PredictionIdx')) + ['PredictionIdx'])

# Model

In [190]:
# Feature column names (first week, minus identifiers and target) and the
# positions of the categorical ones within that list.
columns = weeks[0].columns.drop(id_cols + [target_col]).tolist()
cat_indices = []
for position, name in enumerate(columns):
    if name in cat_cols:
        cat_indices.append(position)

In [191]:
# Positions within `columns` of the features listed in `cat_cols`.
cat_indices

[0, 1, 3, 4, 5, 6, 7, 16, 17, 20, 21, 22, 23, 24, 25, 26, 27, 28]

In [192]:
from src.utils import run_model
from catboost import CatBoostClassifier
# Metric names handed to run_model; presumably they select the metrics it
# records in `results` -- verify against src.utils.run_model.
metric_names = ['auc']

In [193]:
# Preview the folds: each week trains, the following week validates, and the
# last week is always the test set. Prints the date keys found in each part.
test = weeks[-1]
for train, val in zip(weeks[:-1], weeks[1:]):
    fold_dates = [part['TradeDateKey'].unique() for part in (train, val, test)]
    print(*fold_dates)

[20180212] [20180219] [20180423]
[20180219] [20180226] [20180423]
[20180226] [20180305] [20180423]
[20180305] [20180312] [20180423]
[20180312] [20180319] [20180423]
[20180319] [20180326] [20180423]
[20180326] [20180402] [20180423]
[20180402] [20180409] [20180423]
[20180409] [20180416] [20180423]
[20180416] [20180423] [20180423]


In [None]:
%%time
# Fit one CatBoost model per training week. Week i trains, week i+1 validates,
# and the last week is always scored as the test set. `results` is threaded
# through run_model from fold to fold; `output` keeps each fold's test
# predictions together with the fitted model.
results = None
output = []
non_feature_cols = id_cols + [target_col]
for i, (train, val) in enumerate(zip(weeks[:-1], weeks[1:])):
    test = weeks[-1]
    X_train = train.drop(non_feature_cols, axis=1)
    y_train = train[target_col]

    # when test acts as validation there is nothing to validate against
    val_is_test = pred_col in val.columns
    X_val = None if val_is_test else val.drop(non_feature_cols, axis=1)
    y_val = None if val_is_test else val[target_col]

    X_test = test.drop(non_feature_cols + [pred_col], axis=1)

    y_test, _, results, model = run_model(
        CatBoostClassifier(od_pval=0.00001, eval_metric='AUC'),
        X_train, y_train, X_val, y_val, X_test,
        metric_names, results,
        params_desc='od_pval=1e-5',
        dataset_desc=f'{week_labels[i]}_diffcounts',
        early_stopping=True, cat_indices=cat_indices)
    output.append([y_test, model])

CatBoostClassifier 

0:	learn: 0.5999203	test: 0.5683831	best: 0.5683831 (0)	total: 833ms	remaining: 13m 52s
1:	learn: 0.6106146	test: 0.5743372	best: 0.5743372 (1)	total: 1.67s	remaining: 13m 50s
2:	learn: 0.6139547	test: 0.5805022	best: 0.5805022 (2)	total: 2.46s	remaining: 13m 37s
3:	learn: 0.6147621	test: 0.5818443	best: 0.5818443 (3)	total: 3.27s	remaining: 13m 34s
4:	learn: 0.6355340	test: 0.6061649	best: 0.6061649 (4)	total: 4.06s	remaining: 13m 28s
5:	learn: 0.6356506	test: 0.6077318	best: 0.6077318 (5)	total: 4.89s	remaining: 13m 30s
6:	learn: 0.6355582	test: 0.6089816	best: 0.6089816 (6)	total: 5.7s	remaining: 13m 28s
7:	learn: 0.6358693	test: 0.6086872	best: 0.6089816 (6)	total: 6.49s	remaining: 13m 24s
8:	learn: 0.6375530	test: 0.6075275	best: 0.6089816 (6)	total: 7.33s	remaining: 13m 27s
9:	learn: 0.6377391	test: 0.6076373	best: 0.6089816 (6)	total: 8.22s	remaining: 13m 33s
10:	learn: 0.6384986	test: 0.6085215	best: 0.6089816 (6)	total: 9.03s	remaining: 13m 31s
11:	learn: 

In [196]:
# Per-fold summary accumulated by the training loop.
results

Unnamed: 0,dataset,model,params,time,trn_auc_mean,val_auc_mean
0,20180212_diffcounts,CatBoostClassifier,od_pval=1e-5,962.52,0.751289,0.658326
1,20180219_diffcounts,CatBoostClassifier,od_pval=1e-5,1179.65,0.672365,0.669665
2,20180226_diffcounts,CatBoostClassifier,od_pval=1e-5,1173.77,0.791985,0.645228
3,20180305_diffcounts,CatBoostClassifier,od_pval=1e-5,924.42,0.765476,0.683658
4,20180312_diffcounts,CatBoostClassifier,od_pval=1e-5,937.27,0.76623,0.72624
5,20180319_diffcounts,CatBoostClassifier,od_pval=1e-5,912.17,0.774635,0.685237
6,20180326_diffcounts,CatBoostClassifier,od_pval=1e-5,1184.3,0.815067,0.703431
7,20180402_diffcounts,CatBoostClassifier,od_pval=1e-5,889.93,0.775005,0.738437
8,20180409_diffcounts,CatBoostClassifier,od_pval=1e-5,933.95,0.812538,0.767293
9,20180416_diffcounts,CatBoostClassifier,od_pval=1e-5,591.88,0.810783,0.0


In [171]:
# NOTE(review): duplicate display of `results` with no saved output.
results

In [100]:
# NOTE(review): the saved output below lists 8 folds starting at 20180226 and
# carries a lower execution count, so it appears to come from an earlier run
# rather than from the training loop as written.
results

Unnamed: 0,dataset,model,params,time,trn_auc_mean,val_auc_mean
0,20180226_diffcounts,CatBoostClassifier,od_pval=1e-5,1168.45,0.795663,0.647767
1,20180305_diffcounts,CatBoostClassifier,od_pval=1e-5,923.58,0.773155,0.687191
2,20180312_diffcounts,CatBoostClassifier,od_pval=1e-5,937.19,0.767932,0.728959
3,20180319_diffcounts,CatBoostClassifier,od_pval=1e-5,817.72,0.768425,0.687791
4,20180326_diffcounts,CatBoostClassifier,od_pval=1e-5,1184.29,0.815729,0.704293
5,20180402_diffcounts,CatBoostClassifier,od_pval=1e-5,512.46,0.767704,0.734942
6,20180409_diffcounts,CatBoostClassifier,od_pval=1e-5,931.29,0.814709,0.767811
7,20180416_diffcounts,CatBoostClassifier,od_pval=1e-5,590.13,0.823429,0.0


In [18]:
# NOTE(review): the saved output below lists 8 folds starting at 20180226 and
# carries a lower execution count, so it appears to come from an earlier run
# rather than from the training loop as written.
results

Unnamed: 0,dataset,model,params,time,trn_auc_mean,val_auc_mean
0,20180226_diffcounts,CatBoostClassifier,od_pval=1e-5,1154.53,0.79684,0.653252
1,20180305_diffcounts,CatBoostClassifier,od_pval=1e-5,904.81,0.748335,0.691596
2,20180312_diffcounts,CatBoostClassifier,od_pval=1e-5,924.38,0.774108,0.731921
3,20180319_diffcounts,CatBoostClassifier,od_pval=1e-5,1034.8,0.7671,0.684033
4,20180326_diffcounts,CatBoostClassifier,od_pval=1e-5,1173.65,0.820162,0.707924
5,20180402_diffcounts,CatBoostClassifier,od_pval=1e-5,632.02,0.763564,0.742936
6,20180409_diffcounts,CatBoostClassifier,od_pval=1e-5,924.63,0.815345,0.770817
7,20180416_diffcounts,CatBoostClassifier,od_pval=1e-5,593.38,0.824783,0.0


In [197]:
# Ensemble by plain averaging: take each fold's test predictions (first item of
# every `output` entry) and store their element-wise mean as the target.
fold_predictions = np.asarray([fold[0] for fold in output])
test[target_col] = fold_predictions.mean(axis=0)

# Submission

In [198]:
# Left-join the averaged predictions onto the sample submission's
# PredictionIdx column, keeping the sample's rows and row order.
sample_submission = pd.read_csv(RAW/'sample_submission.csv', low_memory=False)
submission = sample_submission[['PredictionIdx']].merge(
    test[['PredictionIdx', target_col]], how='left', on='PredictionIdx')

In [200]:
# Write the submission file without the pandas row index.
submission_path = SUBMISSIONS/'05-catboost_8weeks_diffscount_0212-0416.csv'
submission.to_csv(submission_path, index=False)