In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import os, math
import numpy as np, pandas as pd
import matplotlib.pyplot as plt, seaborn as sns
from pandas_summary import DataFrameSummary
from tqdm import tqdm, tqdm_notebook
from pathlib import Path
# Raise pandas' display limits (up to 1000 columns / 400 rows) for cell output.
pd.set_option('display.max_columns', 1000)
pd.set_option('display.max_rows', 400)
# Switch on seaborn's default plot styling.
sns.set()

# Move two directories up before importing `src` and before the relative
# data paths are built (presumably this lands on the project root — confirm).
# NOTE(review): the chdir is relative, so running this cell a second time in
# the same kernel moves up another two levels.
os.chdir('../..')
from src import utils

In [2]:
# Data directory layout, expressed relative to the current working directory.
DATA = Path('data')
RAW, INTERIM, PROCESSED, SUBMISSIONS = (
    DATA / subdir
    for subdir in ('raw', 'interim', 'processed', 'submissions')
)

In [5]:
from surprise import dump
# dump.load returns a pair; the first element is discarded and the second is
# kept as the fitted model used for all NMF features below.
# NOTE(review): Surprise dumps are pickle-based per the library docs (confirm),
# so only load files produced by a trusted run of this project.
_, nmf = dump.load(PROCESSED/'Extra_IsinBuySell_NMF.dump')

In [6]:
from src.utils import get_weeks
# Ask for 121 week labels starting at 20160104 and drop the first 104 of them.
# Presumably this leaves the 17 weeks from 20180101 onward — TODO confirm
# against get_weeks.
week_labels = get_weeks(day_from=20160104, num_weeks=121)[104:]

In [7]:
%%time
# One DataFrame per week label, kept in the same order as week_labels.
weeks = [
    pd.read_feather(PROCESSED/f'all_weeks/week_{name}_diffscount.feather')
    for name in week_labels
]

CPU times: user 844 ms, sys: 436 ms, total: 1.28 s
Wall time: 1.28 s


## NMF features

In [9]:
# Lookup tables from raw ids to the trainset's inner ids, used below to index
# the model's bias and factor arrays.
# NOTE(review): these are underscore-prefixed (private) trainset attributes and
# may change between library versions.
uid = nmf.trainset._raw2inner_id_users
iid = nmf.trainset._raw2inner_id_items

In [11]:
%%time
from tqdm._tqdm_notebook import tqdm_notebook

# Register tqdm's pandas integration on the notebook progress-bar class.
tqdm_notebook.pandas()

for w in weeks:
    # Items are addressed in the model by "<IsinIdx>-<BuySell>" strings;
    # build those keys once per week and reuse them for every feature.
    item_keys = [f'{iIdx}-{bIdx}' for iIdx, bIdx in zip(w.IsinIdx, w.BuySell)]
    customer_ids = list(w.CustomerIdx)

    # Per-customer and per-item bias terms, looked up through the inner ids.
    w['NMF_CustomerBias'] = [nmf.bu[uid[cIdx]] for cIdx in customer_ids]
    w['NMF_IsinBuySellBias'] = [nmf.bi[iid[key]] for key in item_keys]

    # Estimated rating from the model for each (customer, item) row.
    w['NMF_Recommend'] = [
        nmf.predict(cIdx, key).est
        for cIdx, key in zip(customer_ids, item_keys)
    ]

CPU times: user 52.3 s, sys: 620 ms, total: 52.9 s
Wall time: 52.8 s


In [14]:
%%time
for _label, w in zip(week_labels, weeks):
    # Same "<IsinIdx>-<BuySell>" item keys as used for the bias features.
    item_keys = [f'{iIdx}-{bIdx}' for iIdx, bIdx in zip(w.IsinIdx, w.BuySell)]

    # One factor vector per row, stacked into 2-D arrays (rows x factors).
    customer_factors = np.array([nmf.pu[uid[cIdx]] for cIdx in w.CustomerIdx])
    isin_factors = np.array([nmf.qi[iid[key]] for key in item_keys])

    # Spread each factor dimension into its own zero-padded, numbered column.
    for prefix, factors in (('NMF_CustomerFactor', customer_factors),
                            ('NMF_IsinBuySellFactor', isin_factors)):
        for k in range(factors.shape[1]):
            w[f'{prefix}{k:02}'] = factors[:, k]

CPU times: user 15.5 s, sys: 1.41 s, total: 17 s
Wall time: 16.9 s


## Extra Features

In [16]:
%%time
for w in weeks:
    # TradeDateKey is an integer date of the form YYYYMMDD (e.g. 20180312).
    # Dropping the two day digits and keeping the next two yields the month;
    # the previous `// 10**4 % 10**2` kept the last two digits of the *year*
    # (18 for every 2018 row) instead.
    # Integer arithmetic on the whole column replaces the per-row apply().
    w['Month'] = w.TradeDateKey // 10**2 % 10**2
    w['Day'] = w.TradeDateKey % 10**2

CPU times: user 4.27 s, sys: 156 ms, total: 4.43 s
Wall time: 4.42 s


In [18]:
# Raw reference tables; low_memory=False makes pandas parse each file in one pass.
customer, isin, trade = (
    pd.read_csv(RAW/fname, low_memory=False)
    for fname in ('Customer.csv', 'Isin.csv', 'Trade.csv')
)

In [19]:
%%time
from src.utils import preprocessing_pipeline

# Replace every weekly frame, in place in the list, with the pipeline's result
# computed from that frame plus the customer / isin / trade tables.
for idx in range(len(weeks)):
    weeks[idx] = preprocessing_pipeline(weeks[idx], customer, isin, trade)

CPU times: user 7.07 s, sys: 1.06 s, total: 8.13 s
Wall time: 8.12 s


## Preprocessing

In [21]:
from functools import cmp_to_key
from src.utils import composite_rating_cmp

# Distinct CompositeRating values found in the Isin table, sorted in descending
# order under the project's rating comparator.
ratings = sorted(
    isin.CompositeRating.value_counts().index.tolist(),
    key=cmp_to_key(composite_rating_cmp),
    reverse=True,
)
# Map each rating to its position in that ordering (0 = first after sorting).
rank = dict(zip(ratings, range(len(ratings))))

In [22]:
%%time
# Swap each rating label for its integer rank; a label missing from `rank`
# raises KeyError.
# NOTE(review): the column is overwritten in place, so re-running this cell
# would look up the already-encoded integers in `rank`.
for w in weeks:
    w['CompositeRating'] = w.CompositeRating.apply(rank.__getitem__)

CPU times: user 2.4 s, sys: 228 ms, total: 2.63 s
Wall time: 2.63 s


In [23]:
# Columns that are converted to categorical codes before modelling.
cat_cols = [
    'BuySell',
    'Sector',
    'Subsector',
    'Region_x',
    'Country',
    'TickerIdx',
    'Seniority',
    'Currency',
    'ActivityGroup',
    'Region_y',
    'Activity',
    'RiskCaptain',
    'Owner',
    'IndustrySector',
    'IndustrySubgroup',
    'MarketIssue',
    'CouponType',
]

In [24]:
# Identifier columns that are dropped from the feature matrix.
id_cols = [
    'TradeDateKey',
    'CustomerIdx',
    'IsinIdx',
]
# Label column, and the column whose presence marks a frame as the test week.
target_col, pred_col = 'CustomerInterest', 'PredictionIdx'

In [25]:
%%time
from src.utils import apply_cats

# The last week acts as the reference frame: its categorical columns are
# converted to ordered categoricals first.
reference_week = weeks[-1]
for col in cat_cols:
    as_category = reference_week[col].astype('category')
    reference_week[col] = as_category.cat.as_ordered()

# Every earlier week is passed through apply_cats together with the reference
# (presumably to share its category sets — confirm in src.utils).
for w in weeks[:-1]:
    apply_cats(w, reference_week)

# Finally replace each categorical column by its integer codes in every week.
for col in cat_cols:
    for w in weeks:
        w[col] = w[col].cat.codes

CPU times: user 6.27 s, sys: 220 ms, total: 6.49 s
Wall time: 6.48 s


## Model

In [26]:
from src.utils import run_model
from lightgbm import LGBMClassifier
# Metric names passed to run_model for every training run below.
metric_names = ['auc']

In [27]:
# All weekly frames except the last one, stacked into a single DataFrame.
all_2018 = pd.concat(weeks[:-1])

In [28]:
# Sanity check: list the distinct week keys present in the stacked frame.
all_2018.TradeDateKey.unique()

array([20180101, 20180108, 20180115, 20180122, 20180129, 20180205,
       20180212, 20180219, 20180226, 20180305, 20180312, 20180319,
       20180326, 20180402, 20180409, 20180416])

In [33]:
# Preview the expanding-window splits for the last eight weeks: everything
# before a week is training data, the week itself is the validation set.
# NOTE(review): the recorded output has 7 rows while this loop runs 8 times,
# so the saved output looks stale relative to the code.
for offset in range(-8, 0):
    train, val = pd.concat(weeks[:offset]), weeks[offset]
    train_dates = train['TradeDateKey']
    print(train_dates.min(), train_dates.max(), val['TradeDateKey'].unique())

20180101 20180305 [20180312]
20180101 20180312 [20180319]
20180101 20180319 [20180326]
20180101 20180326 [20180402]
20180101 20180402 [20180409]
20180101 20180409 [20180416]
20180101 20180416 [20180423]


In [None]:
%%time
# Expanding-window training: for every week after the first, fit on all
# earlier weeks and validate on that week. One fitted model per step is
# collected in `output`; `results` is threaded through run_model across steps.
#
# Fixes relative to the previous version of this cell:
#   * it iterated over `data_2018`, which no cell in this notebook defines;
#     the weekly frames live in `weeks` (assumed to be the same list — confirm);
#   * dataset_desc interpolated a whole DataFrame into the description string;
#     it now uses the week label at the same index (the last training week);
#   * params_desc said n_estimators=120 while the model was built with 400;
#     both now come from one constant.
N_ESTIMATORS = 400

results = None
output = []
for i, w in enumerate(weeks[1:]):
    # weeks[1:] is offset by one, so weeks[:i+1] is every week before `w`.
    train, val = pd.concat(weeks[:i+1]), w
    X_train = train.drop(id_cols + [target_col], axis=1)
    y_train = train[target_col]
    if pred_col in val.columns: # when test acts as validation
        # The frame carrying pred_col is not used as a validation set.
        X_val, y_val = None, None
    else:
        X_val = val.drop(id_cols + [target_col], axis=1)
        y_val = val[target_col]

    _, _, results, model = run_model(
            LGBMClassifier(n_estimators=N_ESTIMATORS),
            X_train, y_train, X_val, y_val, None,
            metric_names, results,
            params_desc=f'n_estimators={N_ESTIMATORS}',
            dataset_desc=f'{week_labels[i]}_diffcounts',
            early_stopping_rounds=30)
    output.append(model)