In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import os, math
import numpy as np, pandas as pd
import matplotlib.pyplot as plt, seaborn as sns
from tqdm import tqdm, tqdm_notebook
from pathlib import Path
pd.set_option('display.max_columns', 1000)
pd.set_option('display.max_rows', 400)
sns.set()

# Move to the project root so that `src` is importable and the relative
# data paths used below resolve. The guard makes the cell re-runnable:
# an unconditional relative chdir would climb two more directories on
# every execution of this cell.
if not os.path.isdir('src'):
    os.chdir('../..')
from src import utils

In [2]:
# Data directory layout, relative to the current working directory.
DATA = Path('data')
RAW, INTERIM, PROCESSED, SUBMISSIONS = (
    DATA / subdir for subdir in ('raw', 'interim', 'processed', 'submissions')
)

In [3]:
from src.utils import get_weeks
# Keep the labels from index 96 onward; per the printed output this is
# 25 weekly labels, 20171106 through 20180423.
# Earlier variant, kept for reference, started at index 52 instead:
# week_labels = get_weeks(day_from=20160104, num_weeks=121)[52:]
week_labels = get_weeks(day_from=20160104, num_weeks=121)[96:]

In [4]:
# Show the selected labels: all but the last are loaded as training weeks,
# the last one is loaded as the test week.
print(week_labels)

[20171106, 20171113, 20171120, 20171127, 20171204, 20171211, 20171218, 20171225, 20180101, 20180108, 20180115, 20180122, 20180129, 20180205, 20180212, 20180219, 20180226, 20180305, 20180312, 20180319, 20180326, 20180402, 20180409, 20180416, 20180423]


In [5]:
%%time
# Read every training week (all labels except the last, which is loaded
# separately as the test week) and concatenate them in a single call.
# Calling pd.concat inside the loop re-copies every row accumulated so far
# on each iteration.
week_frames = [
    pd.read_feather(PROCESSED/f'SVD_17-18_72f/week_{name}_SVD_diffscount.feather')
    for name in week_labels[:-1]
]
# pd.concat raises on an empty list; fall back to the empty frame that the
# incremental loop would have produced in that case.
weeks = pd.concat(week_frames) if week_frames else pd.DataFrame()
del week_frames  # drop the per-week frames; `weeks` holds the combined data

CPU times: user 3min 36s, sys: 1min 49s, total: 5min 26s
Wall time: 5min 45s


In [6]:
# The frame for the last week label is the test set; the training data
# covers week_labels[:-1].
test = pd.read_feather(PROCESSED/f'SVD_17-18_72f/week_{week_labels[-1]}_SVD_diffscount.feather')

In [8]:
# Row count of the concatenated training frame, and the number of week
# labels (training weeks plus the one test week).
len(weeks), len(week_labels)

(13079656, 25)

## Fill NaN

In [7]:
# Columns whose missing values are replaced with the '-999' string sentinel
# before training; all four are also listed in cat_cols.
nan_cols = ['Subsector', 'IndustrySector', 'IndustrySubgroup', 'MarketIssue']

In [9]:
# %%time
# # fill nulls
# for w in weeks:
#     w.fillna({x: '-999' for x in nan_cols}, inplace=True)

In [10]:
# Alias, not a copy: `train` and `weeks` are the same DataFrame object, so
# the in-place fillna/drop applied to `train` later also change `weeks`.
train = weeks

In [11]:
%%time
# Replace missing values in the nan_cols columns with the string sentinel
# '-999', mutating both frames in place (train first, then test).
sentinel_by_col = dict.fromkeys(nan_cols, '-999')
for frame in (train, test):
    frame.fillna(sentinel_by_col, inplace=True)

CPU times: user 9.92 s, sys: 4.25 s, total: 14.2 s
Wall time: 14.1 s


## Model

In [12]:
# Feature columns treated as categorical. They are converted to positional
# indices (cat_indices) and passed to CatBoost via cat_features.
cat_cols = ['BuySell', 'Sector', 'Subsector', 'Region_x', 'Country', 
            'TickerIdx', 'Seniority', 'Currency', 'ActivityGroup', 
            'Region_y', 'Activity', 'RiskCaptain', 'Owner', 
            'IndustrySector', 'IndustrySubgroup', 'MarketIssue', 'CouponType']

In [13]:
# Identifier columns: removed from the frames before fitting/predicting.
id_cols = ['TradeDateKey', 'CustomerIdx', 'IsinIdx']
# Label column; on the test frame it is later overwritten with predictions.
target_col = 'CustomerInterest'
# Row key of the test frame, used to join predictions onto the sample submission.
pred_col = 'PredictionIdx'

In [14]:
# columns = list(weeks[0].drop(id_cols + [target_col], axis=1))
# cat_indices = [i for i, col in enumerate(columns) if col in cat_cols]

In [15]:
# Feature columns in frame order: everything except the ids and the target.
# Derived from the column index alone, so the training frame is not copied
# just to list its column names, and the cell still works after the id and
# target columns have been dropped from `train` in place.
non_feature_cols = set(id_cols + [target_col])
columns = [col for col in train.columns if col not in non_feature_cols]
# Positions (within `columns`) of the categorical features, as expected by
# CatBoost's cat_features argument.
cat_indices = [i for i, col in enumerate(columns) if col in cat_cols]

In [16]:
# Inspect the positional indices that will be passed as cat_features.
cat_indices

[0, 45, 46, 47, 48, 49, 52, 53, 54, 55, 56, 57, 58, 60, 61, 62, 64]

In [17]:
from src.utils import run_model, alert
from catboost import CatBoostClassifier
# Metric names forwarded to the project helper run_model.
metric_names = ['auc']

In [18]:
# %%time
# test = weeks[-1]
# weeks = pd.concat(weeks[:-1])
# train = weeks

In [19]:
%%time
# Split the label from the features. The drop is still done in place on
# `train`; the guard makes the cell re-runnable, because a second run would
# otherwise raise KeyError on the columns that were already removed.
if target_col in train.columns:
    y_train = train[target_col]
    train.drop(columns=id_cols + [target_col], inplace=True)
# X_train is another name for the same object as `train`, not a copy.
X_train = train

CPU times: user 4.05 s, sys: 1.72 s, total: 5.77 s
Wall time: 5.73 s


In [None]:
%%time
# Fit CatBoost for 1000 iterations on the full training frame.
# No eval set is passed, so the AUC printed per iteration ("learn:") is
# measured on the training data only.
# The log shows roughly 16 s per iteration, i.e. several hours in total.
model = CatBoostClassifier(iterations=1000, eval_metric='AUC')
model.fit(X_train, y_train, cat_features=cat_indices)

0:	learn: 0.6654384	total: 16.1s	remaining: 4h 27m 34s
1:	learn: 0.7207441	total: 31.8s	remaining: 4h 24m 50s
2:	learn: 0.7228649	total: 47.4s	remaining: 4h 22m 16s
3:	learn: 0.7186887	total: 1m 3s	remaining: 4h 24m 21s
4:	learn: 0.7214737	total: 1m 19s	remaining: 4h 22m 20s
5:	learn: 0.7271417	total: 1m 34s	remaining: 4h 21m 21s
6:	learn: 0.7272383	total: 1m 50s	remaining: 4h 20m 35s
7:	learn: 0.7368965	total: 2m 5s	remaining: 4h 19m 49s
8:	learn: 0.7373523	total: 2m 20s	remaining: 4h 18m 36s
9:	learn: 0.7442938	total: 2m 37s	remaining: 4h 19m 28s
10:	learn: 0.7436214	total: 2m 52s	remaining: 4h 18m 22s
11:	learn: 0.7434305	total: 3m 7s	remaining: 4h 17m 48s
12:	learn: 0.7451440	total: 3m 23s	remaining: 4h 17m 27s
13:	learn: 0.7458669	total: 3m 39s	remaining: 4h 17m 8s
14:	learn: 0.7456499	total: 3m 54s	remaining: 4h 16m 43s
15:	learn: 0.7484055	total: 4m 9s	remaining: 4h 15m 59s
16:	learn: 0.7507903	total: 4m 25s	remaining: 4h 15m 37s
17:	learn: 0.7499462	total: 4m 40s	remaining: 4h 

In [None]:
# NOTE(review): X_train was bound with `X_train = train`, so this removes
# only the name; the DataFrame stays referenced by `train`. Later cells
# that read X_train will raise NameError if run after this one.
del X_train

In [None]:
# Scratch cell, not used by the pipeline (presumably a kernel
# responsiveness check) — candidate for removal.
a = 2
a

In [1]:
# Scratch cell, not used by the pipeline; its execution count of 1 indicates
# it was run in a fresh kernel — candidate for removal.
a = 2
a

2

In [None]:
# Build the test feature matrix by removing ids, the target and the
# submission key from the test frame.
X_test = test.drop(columns=id_cols + [target_col, pred_col])
# Compare against `columns` (the ordered training feature names) rather than
# X_train.columns: X_train is deleted by an earlier cell, and the column
# order must match because categorical features are passed by position.
assert columns == list(X_test.columns), 'train/test feature columns differ'

In [None]:
# Take column 1 of predict_proba — presumably the positive-class probability
# for this binary target; confirm the class order via model.classes_.
y_test = model.predict_proba(X_test)[:,1]

In [None]:
%%time
# Alternative path: fit and predict through the project helper run_model
# instead of calling fit / predict_proba directly, with the same CatBoost
# settings (iterations=1000, eval_metric='AUC').
# NOTE(review): this cell reads X_train, which the `del X_train` cell
# removes — confirm the intended execution order.
X_test = test.drop(id_cols + [target_col, pred_col], axis=1)
assert list(X_train.columns) == list(X_test.columns)
# The positional None arguments presumably stand for validation data and
# other optional inputs — see src.utils.run_model for the exact signature.
# NOTE(review): dataset_desc says 'NMF_diffcounts' while the frames are read
# from *_SVD_diffscount.feather files — confirm the label.
y_test, _, train_res, model = run_model(
            CatBoostClassifier(iterations=1000, eval_metric='AUC'),
            X_train, y_train, None, None, X_test,
            metric_names, None, 
            params_desc='iterations=1000',
            dataset_desc='NMF_diffcounts', 
            early_stopping_rounds=None, cat_indices=cat_indices)

CatBoostClassifier 

0:	learn: 0.6654384	total: 16.3s	remaining: 4h 30m 45s
1:	learn: 0.7202211	total: 33.1s	remaining: 4h 35m 35s
2:	learn: 0.7222674	total: 48.7s	remaining: 4h 29m 57s
3:	learn: 0.7191410	total: 1m 4s	remaining: 4h 27m 18s
4:	learn: 0.7228564	total: 1m 20s	remaining: 4h 26m 37s
5:	learn: 0.7286108	total: 1m 36s	remaining: 4h 25m 11s
6:	learn: 0.7288837	total: 1m 52s	remaining: 4h 26m 1s
7:	learn: 0.7390810	total: 2m 9s	remaining: 4h 27m 20s
8:	learn: 0.7391342	total: 2m 25s	remaining: 4h 26m 47s
9:	learn: 0.7399110	total: 2m 40s	remaining: 4h 25m 18s
10:	learn: 0.7396333	total: 2m 56s	remaining: 4h 24m 58s
11:	learn: 0.7390016	total: 3m 12s	remaining: 4h 23m 57s
12:	learn: 0.7417968	total: 3m 28s	remaining: 4h 23m 39s
13:	learn: 0.7414737	total: 3m 45s	remaining: 4h 24m 38s
14:	learn: 0.7453840	total: 4m 1s	remaining: 4h 24m 23s
15:	learn: 0.7447459	total: 4m 18s	remaining: 4h 24m 29s
16:	learn: 0.7481705	total: 4m 34s	remaining: 4h 24m 49s
17:	learn: 0.7471192	total:

In [None]:
# Write the predictions into the target column of the test frame so they
# can be joined onto the sample submission by PredictionIdx.
test[target_col] = y_test

## Submission

In [None]:
# Start from the sample submission so the output keeps its rows and order,
# then left-join the predictions on the prediction key. Rows with no
# matching prediction end up with NaN in the target column.
# The key is taken from pred_col rather than repeating the literal name,
# in line with how the column is referenced elsewhere in the notebook.
submission = pd.read_csv(RAW/'sample_submission.csv', low_memory=False)
submission = pd.merge(
    submission[[pred_col]],
    test[[pred_col, target_col]],
    how='left',
    on=pred_col,
)

In [None]:
# Summary of the predicted values; a count below the number of rows would
# indicate sample-submission rows left without a prediction by the left join.
submission[target_col].describe()

In [None]:
# Preview the first rows of the submission frame.
submission.head()

In [None]:
# Write the submission without the index column.
# NOTE(review): the file name says 'lgbm', but the model fitted in this
# notebook is a CatBoostClassifier — confirm the name is intended.
submission.to_csv(SUBMISSIONS/'RA02-20-lgbm_2017-2018_MF_diffscounts.csv', index=False)

## Feature importance