In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import os, math
import numpy as np, pandas as pd
import matplotlib.pyplot as plt, seaborn as sns
from pandas_summary import DataFrameSummary
from tqdm import tqdm, tqdm_notebook
from pathlib import Path
# Raise pandas' display limits so wide / long frames are shown without truncation.
pd.set_option('display.max_columns', 1000)
pd.set_option('display.max_rows', 400)
# Apply seaborn's default plot styling.
sns.set()

# Move the working directory two levels up so the relative 'data' and 'src'
# paths used below resolve.
# NOTE(review): the path is relative, so running this cell a second time in the
# same kernel moves two MORE levels up; restart the kernel before re-running.
os.chdir('../..')
# Imported only after the chdir above; presumably `src` is found relative to
# the new working directory -- TODO confirm.
from src import utils

In [2]:
# Data directory layout, relative to the current working directory.
DATA = Path('data')
RAW, INTERIM, PROCESSED, SUBMISSIONS = (
    DATA / stage for stage in ('raw', 'interim', 'processed', 'submissions')
)

In [3]:
# Read every raw CSV in one pass; the file names and the target names are
# listed in the same order. low_memory=False makes pandas parse each file in
# a single chunk.
challenge, customer, isin, market, macro, submission, trade = (
    pd.read_csv(RAW/csv_name, low_memory=False)
    for csv_name in (
        'Challenge_20180423.csv',
        'Customer.csv',
        'Isin.csv',
        'Market.csv',
        'MarketData_Macro.csv',
        'sample_submission.csv',
        'Trade.csv',
    )
)

In [4]:
from src.utils import make_val_set, add_datediffs, add_dayscount, preprocessing_pipeline

In [5]:
# Window boundaries as YYYYMMDD integers, in ascending order. Consecutive
# entries are used pairwise as half-open [start, end) windows when the weekly
# sets are built.
# NOTE(review): every step is 7 days except 20180117 -> 20180122, which is
# 5 days -- confirm this shift is intentional.
week_labels = [
    # 2017
    20171108, 20171115, 20171122, 20171129,
    20171206, 20171213, 20171220, 20171227,
    # 2018
    20180103, 20180110, 20180117,
    20180122, 20180129,
    20180205, 20180212, 20180219, 20180226,
    20180305, 20180312, 20180319, 20180326,
    20180402, 20180409, 20180416, 20180423,
]

In [6]:
%%time
weeks = [make_val_set(trade[trade.TradeDateKey.apply(lambda x: w1<=x<w2)], 
                    challenge) for w1, w2 in zip(week_labels[:-1], week_labels[1:])]

CPU times: user 14min 48s, sys: 4.2 s, total: 14min 52s
Wall time: 14min 52s


In [7]:
%%time
for w in weeks:
    add_datediffs(w, trade[trade.TradeDateKey > 20180000])
    add_dayscount(w, trade[trade.TradeDateKey > 20180000])

CPU times: user 31min 54s, sys: 7.08 s, total: 32min 1s
Wall time: 32min 1s


In [8]:
%%time
for i, w in enumerate(weeks):
    weeks[i] = preprocessing_pipeline(w, customer, isin, trade)  

CPU times: user 5.58 s, sys: 1.82 s, total: 7.41 s
Wall time: 7.41 s


In [9]:
# Load the already-processed test frame from the processed-data folder
# instead of rebuilding it in this notebook.
test = pd.read_feather(PROCESSED.joinpath('week_0423_diffscount.feather'))

In [10]:
test.head()

Unnamed: 0,PredictionIdx,CustomerIdx,IsinIdx,BuySell,CustomerInterest,TradeDateKey,DaysSinceBuySell,DaysSinceTransaction,DaysSinceCustomerActivity,DaysSinceBondActivity,DaysCountBuySell,DaysCountTransaction,DaysCountCustomerActivity,DaysCountBondActivity,Sector,Subsector,Region_x,Country,TickerIdx,ActualMaturityDateKey,IssueDateKey,Seniority,Currency,ActivityGroup,Region_y,Activity,RiskCaptain,Owner,CompositeRating,IndustrySector,IndustrySubgroup,MarketIssue,IssuedAmount,CouponType
0,a1e0d80784,1856,13323,Buy,,20180423,296,296,3,3,0,0,6291,34,Asset Managers & Hedge Funds,Independent Asset Manager,Americas,USA,2740,20210315,20130314,SEN,USD,FLOW G10,AMERICAS,CDS AND HY,CDS AND HY,US TMT CDS,B-,"Consumer, Non-cyclic",Printing-Commercial,Global,450000000.0,FIXED
1,c2cc6cc2a8,1856,9230,Buy,,20180423,14,14,3,3,3,4,6291,12,Asset Managers & Hedge Funds,Independent Asset Manager,Americas,USA,1446,20240215,20131210,SEN,USD,FLOW G10,AMERICAS,CDS AND HY,CDS AND HY,US HY FIN AUTOS,B,Financial,Multi-line Insurance,US domestic,400000000.0,FIXED
2,a8e94f6344,1780,9157,Buy,,20180423,296,296,3,296,0,0,2783,0,Asset Managers & Hedge Funds,Asset Mgr owned by Bank/Insur.,Americas,USA,1387,20360815,20060815,SEN,USD,FLOW G10,AMERICAS,HG CASH,HG CASH FIN,US FIN SHORT CASH,A+,Industrial,Diversified Manufact Op,Domestic mtn,300000000.0,FLOATING
3,758bae1e35,2129,9131,Buy,,20180423,296,296,3,11,0,0,340,43,Asset Owners,Insurance,Americas,USA,1387,20180501,20080421,SEN,USD,FLOW G10,AMERICAS,HG CASH,HG CASH FIN,US FIN SHORT CASH,A+,Industrial,Diversified Manufact Op,Global,4000000000.0,FIXED
4,02ab378ee8,1758,7151,Buy,,20180423,296,296,3,33,0,0,1239,1,Asset Managers & Hedge Funds,Asset Mgr owned by Bank/Insur.,Americas,USA,1290,20181115,20081118,SEN,USD,FLOW G10,AMERICAS,HG CASH,HG CASH NONFIN,US ENERGY CASH,BBB+,Utilities,Electric-Integrated,US domestic,300000000.0,FIXED


In [None]:
# Display the columns of the first weekly frame (compare with the test frame).
weeks[0].columns

In [None]:
# Display the columns of the test frame.
test.columns

In [None]:
# Put the test frame at the end of `weeks` so that weeks[-1] is the test set.
# The guard keeps the cell idempotent: re-running it must not append the same
# frame twice. Identity (`is`) is used on purpose, because `test in weeks`
# would compare DataFrames with `==`, which has no single truth value.
if not any(week_df is test for week_df in weeks):
    weeks.append(test)

## Preprocessing

In [None]:
from functools import cmp_to_key
from src.utils import composite_rating_cmp

# Reload the ISIN table from disk and collect its distinct CompositeRating
# labels (value_counts leaves out missing values by default).
isin = pd.read_csv(RAW/'Isin.csv', low_memory=False)
ratings = isin.CompositeRating.value_counts().index.tolist()
# Order the labels with the project comparator, in descending comparator order.
ratings.sort(key=cmp_to_key(composite_rating_cmp), reverse=True)
# Lookup table: rating label -> position in that ordering.
rank = dict(zip(ratings, range(len(ratings))))

In [None]:
%%time
for w in weeks:
    w['CompositeRating']  = w.CompositeRating.apply(lambda x: rank[x])

In [None]:
# Columns that are converted to categorical codes before modelling.
cat_cols = [
    'BuySell',
    'Sector',
    'Subsector',
    'Region_x',
    'Country',
    'TickerIdx',
    'Seniority',
    'Currency',
    'ActivityGroup',
    'Region_y',
    'Activity',
    'RiskCaptain',
    'Owner',
    'IndustrySector',
    'IndustrySubgroup',
    'MarketIssue',
    'CouponType',
]

In [None]:
# Column roles: the label column, the row identifier that only the test frame
# needs dropped, and the key columns that are excluded from the features.
target_col, pred_col = 'CustomerInterest', 'PredictionIdx'
id_cols = ['TradeDateKey', 'CustomerIdx', 'IsinIdx']

In [None]:
%%time
from src.utils import apply_cats
for col in cat_cols:
    weeks[-1][col] = weeks[-1][col].astype('category').cat.as_ordered()
for w in weeks[:-1]:
    apply_cats(w, weeks[-1])
for w in weeks:
    for col in cat_cols:
        w[col] = w[col].cat.codes    

## Model

In [None]:
from src.utils import run_model
from lightgbm import LGBMClassifier
# Metric names handed to run_model in the training cell.
metric_names = ['auc']

In [None]:
# Sanity check of the rolling split: each frame is the training set and the
# following frame is the validation set; the last frame is always the test set.
# Prints the distinct TradeDateKey values of each of the three.
for train, val in zip(weeks, weeks[1:]):
    test = weeks[-1]
    print(train['TradeDateKey'].unique(),
          val['TradeDateKey'].unique(),
          test['TradeDateKey'].unique())

In [None]:
%%time
results = None
output = []
for i, w in enumerate(weeks[1:]):
    train, val, test = weeks[i], w, weeks[-1]
    X_train, y_train = train.drop(id_cols + [target_col], axis=1), \
                       train[target_col]
    if pred_col in val.columns: # when test acts as validation
        X_val, y_val = None, None
    else:
        X_val, y_val = val.drop(id_cols + [target_col], axis=1), \
                       val[target_col]
    X_test = test.drop(id_cols + [target_col, pred_col], axis=1)
        
    y_test, _, results, model = run_model(
            LGBMClassifier(n_estimators=120),
            X_train, y_train, X_val, y_val, X_test,
            metric_names, results, 
            params_desc='n_estimators=120',
            dataset_desc=f'{week_labels[i]}_diffcounts', 
            early_stopping=True)
    output.append([y_test, model])

In [None]:
# Display the results object returned by the last run_model call.
results

In [None]:
# Blend the per-model test outputs into one weighted average that is written
# to test[target_col].
# Weight schedule: at every n_weeks-th model, `acum` grows by `dif` and is
# added to `mul`; `mul` is the weight applied to the current model. With the
# values below the weights are 1.5 for models 0-4, 3.5 for 5-9, 6.0 for 10-14,
# 9.0 for 15-19 and 12.5 from 20 on, so models appended later to `output`
# count more.
test[target_col] = 0
total = 0          # running sum of the weights, used to normalise at the end
n_weeks = 5        # number of consecutive models that share one weight
jump = 1.25        # NOTE(review): not used anywhere in this cell
dif = 0.5          # growth of the increment `acum` per group
acum = 1
mul = 0
# NOTE(review): the loop is bounded by len(results) but indexes `output`;
# confirm that both hold exactly one entry per trained model.
for ix in range(len(results)):
    if ix % n_weeks == 0:
        acum += dif
        mul = mul + acum
    print(mul)
    test[target_col] += (mul) * output[ix][0]
    total += mul
# Divide by the sum of the weights to turn the weighted sum into an average.
test[target_col] /= total

## Submission

In [None]:
# Keep only the PredictionIdx column of the sample submission (and its row
# order), then left-join the blended prediction from `test` onto it.
submission = (
    pd.read_csv(RAW/'sample_submission.csv', low_memory=False)[['PredictionIdx']]
    .merge(test[['PredictionIdx', target_col]], how='left', on='PredictionIdx')
)

In [None]:
# Write the submission file without the DataFrame index.
# NOTE(review): the file name mentions 8 weeks starting at 20171213, while
# week_labels in this notebook starts at 20171108 -- confirm the name is current.
submission.to_csv(SUBMISSIONS/'20-lgbm_8weeks_diffscount_20171213-0416_with_weights.csv', index=False)