In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import os, math
import numpy as np, pandas as pd
import matplotlib.pyplot as plt, seaborn as sns
from pandas_summary import DataFrameSummary
from tqdm import tqdm, tqdm_notebook
from pathlib import Path
pd.set_option('display.max_columns', 1000)
pd.set_option('display.max_rows', 400)
sns.set()

os.chdir('../..')
from src import utils

In [2]:
DATA        = Path('data')
RAW         = DATA/'raw'
INTERIM     = DATA/'interim'
PROCESSED   = DATA/'processed'
SUBMISSIONS = DATA/'submissions'

In [3]:
week_labels = [20180226, 20180305, 20180312, 20180319, 
               20180326, 20180402, 20180409, 20180416, 20180423]

In [4]:
%%time
weeks = []
for name in week_labels:
    weeks.append(pd.read_feather(PROCESSED/f'week_{name % 10000:04}_diffscount.feather'))

CPU times: user 5.14 s, sys: 3.2 s, total: 8.34 s
Wall time: 15.2 s


In [9]:
for w in weeks:
    w.drop('index', inplace=True, axis=1)

## Preprocessing

In [38]:
[x for x in weeks[0].columns if x not in cat_cols + id_cols + [target_col, pred_col]]

['DaysSinceBuySell',
 'DaysSinceTransaction',
 'DaysSinceCustomerActivity',
 'DaysSinceBondActivity',
 'DaysCountBuySell',
 'DaysCountTransaction',
 'DaysCountCustomerActivity',
 'DaysCountBondActivity',
 'ActualMaturityDateKey',
 'IssueDateKey',
 'IssuedAmount']

In [39]:
nan_cols = []
for n, w in zip(week_labels, weeks):
    print(n)
    x = w.apply(lambda x: sum(x.isnull()), axis=0)
    nan_cols.extend(x[x>0].index)
    print(x[x>0])
    print()
nan_cols = [x for x in set(nan_cols) if x != target_col]

20180226
Subsector           30816
IndustrySector         48
IndustrySubgroup       48
MarketIssue            48
dtype: int64

20180305
Subsector           15002
IndustrySector         40
IndustrySubgroup       40
MarketIssue            40
dtype: int64

20180312
Subsector           15086
IndustrySector         38
IndustrySubgroup       38
MarketIssue            38
dtype: int64

20180319
Subsector           15054
IndustrySector         44
IndustrySubgroup       44
MarketIssue            44
dtype: int64

20180326
Subsector           34726
IndustrySector         92
IndustrySubgroup       92
MarketIssue            92
dtype: int64

20180402
Subsector           15406
IndustrySector         38
IndustrySubgroup       38
MarketIssue            38
dtype: int64

20180409
Subsector           15180
IndustrySector         38
IndustrySubgroup       38
MarketIssue            42
dtype: int64

20180416
Subsector           15258
IndustrySector         38
IndustrySubgroup       38
MarketIssue            3

In [46]:
# fill nulls
for w in weeks:
    w.fillna({x: -999 for x in nan_cols}, inplace=True)

In [48]:
for n, w in zip(week_labels, weeks):
    print(n)
    x = w.apply(lambda x: sum(x.isnull()), axis=0)
    print(x[x>0])
    print()

20180226
Series([], dtype: int64)

20180305
Series([], dtype: int64)

20180312
Series([], dtype: int64)

20180319
Series([], dtype: int64)

20180326
Series([], dtype: int64)

20180402
Series([], dtype: int64)

20180409
Series([], dtype: int64)

20180416
Series([], dtype: int64)

20180423
CustomerInterest    484758
dtype: int64



## Model

In [12]:
cat_cols = ['BuySell', 'Sector', 'Subsector', 'Region_x', 'Country', 
            'TickerIdx', 'Seniority', 'Currency', 'ActivityGroup', 
            'Region_y', 'Activity', 'RiskCaptain', 'Owner', 'CompositeRating', 
            'IndustrySector', 'IndustrySubgroup', 'MarketIssue', 'CouponType']

In [13]:
id_cols = ['TradeDateKey', 'CustomerIdx', 'IsinIdx']
target_col = 'CustomerInterest'
pred_col = 'PredictionIdx'

In [15]:
columns = list(weeks[0].drop(id_cols + [target_col], axis=1))
cat_indices = [i for i, col in enumerate(columns) if col in cat_cols]

In [17]:
cat_indices

[0, 9, 10, 11, 12, 13, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 28]

In [26]:
from src.utils import run_model
from catboost import CatBoostClassifier
metric_names = ['auc']

In [20]:
for i, w in enumerate(weeks[1:]):
    train, val, test = weeks[i], w, weeks[-1]
    print(train['TradeDateKey'].unique(), 
          val['TradeDateKey'].unique(), 
          test['TradeDateKey'].unique())

[20180226] [20180305] [20180423]
[20180305] [20180312] [20180423]
[20180312] [20180319] [20180423]
[20180319] [20180326] [20180423]
[20180326] [20180402] [20180423]
[20180402] [20180409] [20180423]
[20180409] [20180416] [20180423]
[20180416] [20180423] [20180423]


In [51]:
%%time
results = None
output = []
for i, w in enumerate(weeks[1:]):
    train, val, test = weeks[i], w, weeks[-1]
    X_train, y_train = train.drop(id_cols + [target_col], axis=1), \
                       train[target_col]
    if pred_col in val.columns: # when test acts as validation
        X_val, y_val = None, None
    else:
        X_val, y_val = val.drop(id_cols + [target_col], axis=1), \
                       val[target_col]
    X_test = test.drop(id_cols + [target_col, pred_col], axis=1)
        
    y_test, _, results, model = run_model(
            CatBoostClassifier(od_pval=0.00001, eval_metric='AUC'),
            X_train, y_train, X_val, y_val, X_test,
            metric_names, results, 
            params_desc='od_pval=1e-5',
            dataset_desc=f'{week_labels[i]}_diffcounts', 
            early_stopping=True, cat_indices=cat_indices)
    output.append([y_test, model])

CatBoostClassifier 

0:	learn: 0.6370119	test: 0.5581310	best: 0.5581310 (0)	total: 1.15s	remaining: 19m 4s
1:	learn: 0.6424314	test: 0.5601913	best: 0.5601913 (1)	total: 2.1s	remaining: 17m 27s
2:	learn: 0.6615317	test: 0.5629745	best: 0.5629745 (2)	total: 3.07s	remaining: 17m 1s
3:	learn: 0.6601926	test: 0.5623774	best: 0.5629745 (2)	total: 4.1s	remaining: 17m 1s
4:	learn: 0.6626626	test: 0.5624073	best: 0.5629745 (2)	total: 5.09s	remaining: 16m 53s
5:	learn: 0.6618720	test: 0.5628994	best: 0.5629745 (2)	total: 6.1s	remaining: 16m 51s
6:	learn: 0.6632794	test: 0.5637429	best: 0.5637429 (6)	total: 7.11s	remaining: 16m 49s
7:	learn: 0.6595463	test: 0.5646248	best: 0.5646248 (7)	total: 8.09s	remaining: 16m 43s
8:	learn: 0.6637554	test: 0.5692895	best: 0.5692895 (8)	total: 9.09s	remaining: 16m 40s
9:	learn: 0.6661600	test: 0.5691652	best: 0.5692895 (8)	total: 10.1s	remaining: 16m 40s
10:	learn: 0.6812327	test: 0.5734976	best: 0.5734976 (10)	total: 11.1s	remaining: 16m 38s
11:	learn: 0.68

ValueError: could not convert string to float: 'Buy'

In [57]:
for i, w in enumerate(weeks[-1:]):
    train, val, test = weeks[i], w, weeks[-1]
    X_train, y_train = train.drop(id_cols + [target_col], axis=1), \
                       train[target_col]
    if pred_col in val.columns: # when test acts as validation
        X_val, y_val = None, None
    else:
        X_val, y_val = val.drop(id_cols + [target_col], axis=1), \
                       val[target_col]
    X_test = test.drop(id_cols + [target_col, pred_col], axis=1)
        
    y_test, _, results, model = run_model(
            CatBoostClassifier(od_pval=0.00001, eval_metric='AUC'),
            X_train, y_train, X_val, y_val, X_test,
            metric_names, results, 
            params_desc='od_pval=1e-5',
            dataset_desc=f'{week_labels[i]}_diffcounts', 
            early_stopping=True, cat_indices=cat_indices)
    output.append([y_test, model])

CatBoostClassifier 

0:	learn: 0.6367070	total: 780ms	remaining: 12m 59s
1:	learn: 0.6606471	total: 1.5s	remaining: 12m 28s
2:	learn: 0.6588329	total: 2.17s	remaining: 12m
3:	learn: 0.6655395	total: 2.85s	remaining: 11m 50s
4:	learn: 0.6653922	total: 3.54s	remaining: 11m 45s
5:	learn: 0.6625308	total: 4.21s	remaining: 11m 37s
6:	learn: 0.6596722	total: 4.9s	remaining: 11m 35s
7:	learn: 0.6621128	total: 5.58s	remaining: 11m 32s
8:	learn: 0.6598857	total: 6.32s	remaining: 11m 35s
9:	learn: 0.6569446	total: 7.03s	remaining: 11m 35s
10:	learn: 0.6682786	total: 7.72s	remaining: 11m 34s
11:	learn: 0.6690220	total: 8.42s	remaining: 11m 33s
12:	learn: 0.6679639	total: 9.12s	remaining: 11m 32s
13:	learn: 0.6719188	total: 9.82s	remaining: 11m 31s
14:	learn: 0.6733838	total: 10.5s	remaining: 11m 31s
15:	learn: 0.6743626	total: 11.2s	remaining: 11m 31s
16:	learn: 0.6788436	total: 11.9s	remaining: 11m 30s
17:	learn: 0.6806013	total: 12.7s	remaining: 11m 30s
18:	learn: 0.6830983	total: 13.4s	remaini

In [58]:
results

Unnamed: 0,dataset,model,params,time,trn_auc_mean,val_auc_mean
0,20180226_diffcounts,CatBoostClassifier,od_pval=1e-5,1159.2,0.78606,0.646456
1,20180305_diffcounts,CatBoostClassifier,od_pval=1e-5,920.99,0.772017,0.68438
2,20180312_diffcounts,CatBoostClassifier,od_pval=1e-5,929.12,0.761127,0.727533
3,20180319_diffcounts,CatBoostClassifier,od_pval=1e-5,1069.33,0.775732,0.68579
4,20180326_diffcounts,CatBoostClassifier,od_pval=1e-5,1173.55,0.811456,0.701295
5,20180402_diffcounts,CatBoostClassifier,od_pval=1e-5,688.09,0.753033,0.739146
6,20180409_diffcounts,CatBoostClassifier,od_pval=1e-5,920.79,0.805718,0.766587
7,20180226_diffcounts,CatBoostClassifier,od_pval=1e-5,834.33,0.79368,0.0


In [59]:
# first 5 predictions (2018 data)
np.array([x[0] for x in output])[:,:5]

array([[0.05088555, 0.15661361, 0.02097313, 0.02611844, 0.01816255],
       [0.03824832, 0.09879819, 0.02696571, 0.03006561, 0.02249509],
       [0.08689345, 0.08256685, 0.02508119, 0.05338311, 0.02737974],
       [0.03553228, 0.0671148 , 0.03105588, 0.04107846, 0.02363133],
       [0.04040708, 0.10460788, 0.01893039, 0.02184942, 0.01468147],
       [0.06208593, 0.10760823, 0.05204742, 0.03543031, 0.0553108 ],
       [0.06495149, 0.11511678, 0.02301069, 0.04159508, 0.02848047],
       [0.04784267, 0.136755  , 0.01997929, 0.03374224, 0.01739769]])

In [60]:
test[target_col] = np.mean([x[0] for x in output], axis=0)

## Submission

In [61]:
submission = pd.read_csv(RAW/'sample_submission.csv', low_memory=False)
submission = pd.merge(submission[['PredictionIdx']], test[['PredictionIdx', target_col]], 
                      how='left', on='PredictionIdx')

In [62]:
submission[target_col].describe()

count    484758.000000
mean          0.031072
std           0.034669
min           0.002335
25%           0.014604
50%           0.024023
75%           0.036462
max           0.949734
Name: CustomerInterest, dtype: float64

In [63]:
submission.head()

Unnamed: 0,PredictionIdx,CustomerInterest
0,a1e0d80784,0.053356
1,c2cc6cc2a8,0.108648
2,a8e94f6344,0.027255
3,758bae1e35,0.035408
4,02ab378ee8,0.025942


In [64]:
submission.to_csv(SUBMISSIONS/'08-catboost_8weeks_diffscount_0226-0416.csv', index=False)