In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import os, math
import numpy as np, pandas as pd
import matplotlib.pyplot as plt, seaborn as sns
from pandas_summary import DataFrameSummary
from tqdm import tqdm, tqdm_notebook
from pathlib import Path
pd.set_option('display.max_columns', 1000)
pd.set_option('display.max_rows', 400)
sns.set()

os.chdir('../..')
from src import utils

In [2]:
# Project data folders, all relative to the working directory selected by the
# os.chdir('../..') call in the setup cell.
DATA        = Path('data')
RAW         = DATA/'raw'          # input CSVs read by this notebook
INTERIM     = DATA/'interim'      # not referenced by the visible cells of this notebook
PROCESSED   = DATA/'processed'    # weekly feather files are read from / written to here
SUBMISSIONS = DATA/'submissions'  # final submission CSV is written here

In [3]:
def _read_raw(filename):
    """Load one CSV from the RAW folder, parsing the whole file in a single pass."""
    return pd.read_csv(RAW/filename, low_memory=False)


# Raw competition tables, loaded in the same order as before.
challenge  = _read_raw('Challenge_20180423.csv')
customer   = _read_raw('Customer.csv')
isin       = _read_raw('Isin.csv')
market     = _read_raw('Market.csv')
macro      = _read_raw('MarketData_Macro.csv')
submission = _read_raw('sample_submission.csv')
trade      = _read_raw('Trade.csv')

In [4]:
from src.utils import make_val_set, add_datediffs, add_dayscount, preprocessing_pipeline

In [8]:
# Week boundaries as YYYYMMDD integers in ascending order. Consecutive pairs
# (w1, w2) are used as half-open TradeDateKey ranges [w1, w2) when the weekly
# validation sets are built, and the labels are reused to name the exported
# feather files and the rows of the results table.
# NOTE(review): every step is 7 calendar days except 20180117 -> 20180122,
# which is 5 days, so that bucket is shorter than the others - confirm this is
# intended. The result tables pasted further down are labelled
# 20180226..20180416, so they appear to come from a run with different labels.
week_labels = [ 20171213, 20171220, 20171227, 20180103, 20180110,
                20180117, 20180122, 20180129, 20180205, 20180212]

In [9]:
%%time
# Build one validation set per consecutive pair of week boundaries. Each set
# is made from the trades whose TradeDateKey lies in the half-open range
# [w1, w2). The range test is a vectorized comparison rather than a per-row
# Python lambda; rows with a missing TradeDateKey are excluded either way.
weeks = [
    make_val_set(
        trade[(trade.TradeDateKey >= w1) & (trade.TradeDateKey < w2)],
        challenge,
    )
    for w1, w2 in zip(week_labels[:-1], week_labels[1:])
]

CPU times: user 5min 24s, sys: 1.1 s, total: 5min 25s
Wall time: 5min 25s


In [10]:
%%time
# Run both feature helpers from src.utils on every weekly frame. The 2018
# trade subset is sliced again for each call, so every helper invocation
# receives its own freshly built frame. The loop variable keeps the name `w`
# because a later cell displays `w.head()`.
for w in weeks:
    for add_feature in (add_datediffs, add_dayscount):
        add_feature(w, trade[trade.TradeDateKey > 20180000])

CPU times: user 11min 43s, sys: 2.67 s, total: 11min 45s
Wall time: 11min 45s


In [11]:
def fill_variabilities(df, time_days=7):
    """
        Add trailing-variance columns for Yield, ZSpread and Price, bond by bond.

        `df` is modified in place: the columns 'Yield_var', 'ZSpread_var' and
        'Price_var' are created (initialised to 0.0) and then filled, one
        IsinIdx at a time, by `fill_variabilites_per_index`.

        df        : DataFrame with 'IsinIdx', 'Yield', 'ZSpread' and 'Price' columns.
        time_days : window parameter forwarded to `fill_variabilites_per_index`.

        Returns None.
    """
    for var_column in ('Yield_var', 'ZSpread_var', 'Price_var'):
        df[var_column] = 0.0

    # A single groupby pass replaces one full-frame boolean scan per distinct
    # IsinIdx. sort=False keeps the groups in order of first appearance, and
    # rows whose IsinIdx is missing form no group (they keep 0.0), just as an
    # equality test against NaN never selected them.
    for _, rows in df.groupby('IsinIdx', sort=False):
        # reset_index() keeps the original row labels in a column, which the
        # per-index helper uses to write its results back into `df`.
        fill_variabilites_per_index(df, rows.reset_index(), time_days)

In [12]:
def fill_variabilites_per_index(df, temp, time_days=7):
    """
        Write trailing variances of Yield, ZSpread and Price for one group of rows.

        For every row label `ix` of `temp` greater than 1, the sample variance
        of the rows labelled max(0, ix - time_days - 1) .. ix - 1 (both ends
        included, i.e. up to time_days + 1 preceding rows) is stored in `df`
        at the row whose label is found in temp's 'index' column. Rows with
        ix <= 1 are left untouched.

        df        : DataFrame updated in place; must already contain the
                    'Yield_var', 'ZSpread_var' and 'Price_var' columns.
        temp      : rows of one group after reset_index(), so that its labels
                    are 0..n-1 and its 'index' column holds the labels of `df`.
        time_days : window parameter (see above).

        Returns None. (The spelling of the function name is kept for callers.)
    """
    column_pairs = (('Yield', 'Yield_var'),
                    ('ZSpread', 'ZSpread_var'),
                    ('Price', 'Price_var'))
    for ix in temp.index:
        if ix <= 1:
            continue
        start = max(ix - time_days - 1, 0)
        # Slice the window once and reuse it for the three columns.
        window = temp.loc[start:ix - 1]
        # .at reads the label with its original dtype (no row-wise upcast) and
        # replaces DataFrame.set_value, which no longer exists in pandas >= 1.0.
        target = temp.at[ix, 'index']
        for source, destination in column_pairs:
            df.at[target, destination] = window[source].var()

In [14]:
w.head()

Unnamed: 0,TradeDateKey,CustomerIdx,IsinIdx,BuySell,CustomerInterest,DaysSinceBuySell,DaysSinceTransaction,DaysSinceCustomerActivity,DaysSinceBondActivity,DaysCountBuySell,DaysCountTransaction,DaysCountCustomerActivity,DaysCountBondActivity
0,20180205,0,21856,Buy,0.0,219,219,4,6,0,0,5,4
1,20180205,0,21856,Sell,0.0,219,219,4,6,0,0,5,4
2,20180205,0,24944,Buy,0.0,219,219,4,21,0,0,5,3
3,20180205,0,24944,Sell,0.0,219,219,4,21,0,0,5,3
4,20180205,0,25992,Buy,0.0,219,219,4,3,0,0,5,15


In [None]:
# %%time
# fill_variabilities(market, time_days=7)

In [None]:
# # To simplify process
# market.to_feather(PROCESSED/'market_with_var_7.feather')

In [15]:
def calculate_SMA(df, period, start=0, column='Price'):
    """
        Simple moving average used as the seed of the first EMA.

        Sums `column` over the row labels start .. start + period - 1 (both
        ends included) and divides that total by `period`.
    """
    last = start + period - 1
    window_total = df.loc[start:last, column].sum()
    return window_total / period

In [16]:
def calculate_EMA(prev_EMA, price, multiplier):
    """
        One EMA update step for time t.

        Moves the previous EMA towards `price` by the fraction `multiplier`
        of the gap between them and returns the result.
    """
    step = multiplier * (price - prev_EMA)
    return prev_EMA + step

In [17]:
def fill_EMA(df, period=20, name_column='EMA_Price_Short_term', column='Price'):
    """
        Exponential moving averages (EMAs) reduce the lag by applying more weight to recent prices.

        Adds `name_column` to `df` in place. Rows labelled below period - 1
        stay NaN, row period - 1 receives the simple moving average of the
        first `period` rows, and every later row receives the EMA update of
        the previous value with multiplier 2 / (period + 1).

        Missing values of `column` met after the seed row are imputed in
        place with the mean of the neighbouring rows that exist and are not
        NaN. When no neighbour is usable (for example a trailing run of
        NaNs) the previous EMA is carried forward instead of turning every
        later value into NaN.

        df          : DataFrame whose labels are consecutive integers from 0.
        period      : number of rows of the EMA span.
        name_column : name of the column that receives the EMA.
        column      : name of the column the EMA is computed from.

        Returns None.
    """
    first_SMA = calculate_SMA(df, period, column=column)
    multiplier = 2.0 / (period + 1)
    df[name_column] = np.nan
    prev_EMA = first_SMA
    for ix in df.index:
        if ix < period - 1:
            continue
        if ix == period - 1:
            # .at replaces DataFrame.set_value, which was removed in pandas 1.0.
            df.at[ix, name_column] = first_SMA
            prev_EMA = first_SMA
            continue

        price = df.at[ix, column]
        if np.isnan(price):
            # The last row has no successor, so only look at labels that exist.
            neighbours = [df.at[j, column] for j in (ix - 1, ix + 1) if j in df.index]
            known = [value for value in neighbours if not np.isnan(value)]
            if known:
                price = sum(known) / len(known)
                df.at[ix, column] = price
        if not np.isnan(price):
            prev_EMA = calculate_EMA(prev_EMA, price, multiplier)
        df.at[ix, name_column] = prev_EMA
        

In [None]:
# market_temp = market.copy()
# fill_EMA(market_temp, period=20, name_column='EMA_20')

In [None]:
# fill_EMA(market_temp, period=150, name_column='EMA_150')

In [None]:
# market_temp.head()

In [None]:
%%time
# Replace every weekly frame, slot by slot, with the output of the project's
# preprocessing pipeline (which also receives the customer, isin and trade tables).
for position in range(len(weeks)):
    weeks[position] = preprocessing_pipeline(weeks[position], customer, isin, trade)

In [None]:
test = pd.read_feather(PROCESSED/'week_0423_diffscount.feather')
#test.drop(['Yield', 'ZSpread', 'Price'], axis=1, inplace=True)

In [None]:
test.head(1)

In [None]:
def fill_test_variabilities(test, market, time_days=7):
    """
        Add 'Yield_var', 'ZSpread_var' and 'Price_var' columns to `test` in place.

        Every row of `test` with a non-missing IsinIdx receives the sample
        variance of the corresponding column over the last `time_days` rows
        of `market`; rows with a missing IsinIdx keep 0.0.

        NOTE(review): the variance is taken over the tail of the whole
        `market` frame, not over the rows of each bond, so all rows get the
        same three values. That reproduces the previous behaviour - confirm
        whether a per-IsinIdx window was intended.

        test      : DataFrame with an 'IsinIdx' column.
        market    : DataFrame with 'Yield', 'ZSpread' and 'Price' columns.
        time_days : number of trailing `market` rows used for the variance.

        Returns None.
    """
    # The three variances do not depend on the row, so compute them once
    # instead of once per row of `test`.
    recent = market.tail(time_days)
    has_isin = test.IsinIdx.notna()
    for source in ('Yield', 'ZSpread', 'Price'):
        var_column = f'{source}_var'
        test[var_column] = 0.0
        test.loc[has_isin, var_column] = recent[source].var()

In [None]:
# %%time
# fill_test_variabilities(test, market, time_days=7)

In [None]:
# Remove the 'index' column carried by the feather file. errors='ignore'
# keeps the cell re-runnable: a second run (or a file saved without that
# column) is a no-op instead of a KeyError.
test = test.drop(columns=['index'], errors='ignore')

In [None]:
weeks[0].columns

In [None]:
test.columns

In [None]:
weeks.append(test)

In [None]:
%%time
# Persist every frame as a feather file named after the month/day part
# (last four digits) of the label at the same position in week_labels.
# zip stops at the shorter of the two sequences.
for w, name in zip(weeks, week_labels):
    month_day = name % 10000
    target_path = PROCESSED/f'week_{month_day:04}_diffscount.feather'
    w.reset_index().to_feather(target_path)

In [None]:
len(weeks)

In [None]:
# %%time
# weeks_aux = []
# for w, name in zip(weeks, week_labels):
#     weeks_aux.append(pd.read_feather(PROCESSED/f'week_{name % 10000:04}_diffscount.feather'))

# Preprocessing

In [None]:
cat_cols = ['BuySell', 'Sector', 'Subsector', 'Region_x', 'Country', 
            'TickerIdx', 'Seniority', 'Currency', 'ActivityGroup', 
            'Region_y', 'Activity', 'RiskCaptain', 'Owner', 'CompositeRating', 
            'IndustrySector', 'IndustrySubgroup', 'MarketIssue', 'CouponType']

In [None]:
id_cols = ['TradeDateKey', 'CustomerIdx', 'IsinIdx']
target_col = 'CustomerInterest'
pred_col = 'PredictionIdx'

In [None]:
from src.utils import apply_cats
# Turn every categorical column of the last frame in `weeks` into an ordered
# pandas categorical.
reference_week = weeks[-1]
for col in cat_cols:
    as_category = reference_week[col].astype('category')
    reference_week[col] = as_category.cat.as_ordered()

In [None]:
# Call the project helper apply_cats on every frame except the last one,
# passing the last frame as second argument.
# NOTE(review): presumably this re-encodes the categorical columns of `w`
# with the categories of weeks[-1] - confirm in src.utils.
for w in weeks[:-1]:
    apply_cats(w, weeks[-1])

In [None]:
# Replace each categorical column by its integer category codes, in place.
# Not re-runnable: after one execution the columns are no longer categorical,
# so the `.cat` accessor raises on a second run.
for w in weeks:
    for col in cat_cols:
        w[col] = w[col].cat.codes

# Train/Test/Val

In [None]:
from src.utils import run_model
from lightgbm import LGBMClassifier
metric_names = ['auc']

In [None]:
# Show which TradeDateKey values land in the train / validation / test frame
# of every walk-forward step (train = weeks[i], validation = weeks[i + 1],
# test = last frame).
for i, w in enumerate(weeks[1:]):
    train, val, test = weeks[i], w, weeks[-1]
    print(*(frame['TradeDateKey'].unique() for frame in (train, val, test)))

In [None]:
%%time
# Walk-forward training: one LightGBM model per pair of consecutive frames.
# `results` is handed to run_model and replaced by its return value on every
# iteration; `output` collects [test predictions, model] per step.
results = None
output = []
for i, w in enumerate(weeks[1:]):
    # Train on weeks[i], validate on the following frame, always predict the last frame.
    train, val, test = weeks[i], w, weeks[-1]
    X_train, y_train = train.drop(id_cols + [target_col], axis=1), \
                       train[target_col]
    # On the last step `val` is the test frame itself (it carries pred_col),
    # so no validation data is passed.
    if pred_col in val.columns: # when test acts as validation
        X_val, y_val = None, None
    else:
        X_val, y_val = val.drop(id_cols + [target_col], axis=1), \
                       val[target_col]
    X_test = test.drop(id_cols + [target_col, pred_col], axis=1)
        
    # NOTE(review): the meaning of run_model's four return values is inferred
    # from how they are used here (test predictions, ignored value, results
    # table, fitted model) - confirm in src.utils.
    # dataset_desc is built from week_labels[i], the label at the position of
    # the training frame.
    y_test, _, results, model = run_model(
            LGBMClassifier(n_estimators=120),
            X_train, y_train, X_val, y_val, X_test,
            metric_names, results, 
            params_desc='n_estimators=120',
            dataset_desc=f'{week_labels[i]}_diffcounts', 
            early_stopping=True)
    output.append([y_test, model])

In [None]:
results

In [56]:
results

Unnamed: 0,dataset,model,params,time,trn_auc_mean,val_auc_mean
0,20180226_diffcounts,LGBMClassifier,n_estimators=120,7.78,0.801433,0.656208
1,20180305_diffcounts,LGBMClassifier,n_estimators=120,5.52,0.763234,0.695151
2,20180312_diffcounts,LGBMClassifier,n_estimators=120,5.24,0.753722,0.727934
3,20180319_diffcounts,LGBMClassifier,n_estimators=120,8.19,0.814878,0.675515
4,20180326_diffcounts,LGBMClassifier,n_estimators=120,7.35,0.812643,0.706836
5,20180402_diffcounts,LGBMClassifier,n_estimators=120,5.81,0.799783,0.739328
6,20180409_diffcounts,LGBMClassifier,n_estimators=120,6.84,0.827979,0.771319
7,20180416_diffcounts,LGBMClassifier,n_estimators=120,3.6,0.844436,0.0


In [40]:
results

Unnamed: 0,dataset,model,params,time,trn_auc_mean,val_auc_mean
0,20180226_diffcounts,LGBMClassifier,n_estimators=120,7.69,0.792226,0.655294
1,20180305_diffcounts,LGBMClassifier,n_estimators=120,5.12,0.749205,0.69517
2,20180312_diffcounts,LGBMClassifier,n_estimators=120,6.71,0.779543,0.7289
3,20180319_diffcounts,LGBMClassifier,n_estimators=120,8.2,0.80946,0.683462
4,20180326_diffcounts,LGBMClassifier,n_estimators=120,7.18,0.809129,0.705596
5,20180402_diffcounts,LGBMClassifier,n_estimators=120,5.23,0.78741,0.738378
6,20180409_diffcounts,LGBMClassifier,n_estimators=120,6.74,0.818286,0.772272
7,20180416_diffcounts,LGBMClassifier,n_estimators=120,3.52,0.841613,0.0


In [148]:
results

Unnamed: 0,dataset,model,params,time,trn_auc_mean,val_auc_mean
0,20180226_diffcounts,LGBMClassifier,n_estimators=120,8.04,0.813655,0.655689
1,20180305_diffcounts,LGBMClassifier,n_estimators=120,5.16,0.757246,0.693547
2,20180312_diffcounts,LGBMClassifier,n_estimators=120,6.54,0.785328,0.729307
3,20180319_diffcounts,LGBMClassifier,n_estimators=120,6.18,0.775792,0.678395
4,20180326_diffcounts,LGBMClassifier,n_estimators=120,7.74,0.830289,0.705754
5,20180402_diffcounts,LGBMClassifier,n_estimators=120,4.97,0.783423,0.736578
6,20180409_diffcounts,LGBMClassifier,n_estimators=120,6.74,0.830874,0.772098
7,20180416_diffcounts,LGBMClassifier,n_estimators=120,3.49,0.841493,0.0


In [139]:
results

Unnamed: 0,dataset,model,params,time,trn_auc_mean,val_auc_mean
0,20180226_diffcounts,LGBMClassifier,n_estimators=120,6.84,0.780921,0.656573
1,20180305_diffcounts,LGBMClassifier,n_estimators=120,4.51,0.735274,0.693338
2,20180312_diffcounts,LGBMClassifier,n_estimators=120,6.87,0.781854,0.728517
3,20180319_diffcounts,LGBMClassifier,n_estimators=120,8.31,0.812702,0.683438
4,20180326_diffcounts,LGBMClassifier,n_estimators=120,8.09,0.824718,0.704264
5,20180402_diffcounts,LGBMClassifier,n_estimators=120,5.6,0.79026,0.736316
6,20180409_diffcounts,LGBMClassifier,n_estimators=120,6.38,0.813713,0.771199
7,20180416_diffcounts,LGBMClassifier,n_estimators=120,3.27,0.84307,0.0


In [105]:
results

Unnamed: 0,dataset,model,params,time,trn_auc_mean,val_auc_mean
0,20180226_diffcounts,LGBMClassifier,n_estimators=120,11.94,0.795231,0.654387
1,20180305_diffcounts,LGBMClassifier,n_estimators=120,7.43,0.749922,0.694827
2,20180312_diffcounts,LGBMClassifier,n_estimators=120,10.32,0.79454,0.729287
3,20180319_diffcounts,LGBMClassifier,n_estimators=120,11.64,0.789302,0.680462
4,20180326_diffcounts,LGBMClassifier,n_estimators=120,12.1,0.820079,0.706285
5,20180402_diffcounts,LGBMClassifier,n_estimators=120,4.33,0.73576,0.731898
6,20180409_diffcounts,LGBMClassifier,n_estimators=120,11.22,0.825966,0.770442
7,20180416_diffcounts,LGBMClassifier,n_estimators=120,6.62,0.841095,0.0


In [29]:
results

Unnamed: 0,dataset,model,params,time,trn_auc_mean,val_auc_mean
0,20180226_diffcounts,LGBMClassifier,n_estimators=120,12.62,0.800578,0.654793
1,20180305_diffcounts,LGBMClassifier,n_estimators=120,9.34,0.764028,0.694719
2,20180312_diffcounts,LGBMClassifier,n_estimators=120,9.86,0.770449,0.728249
3,20180319_diffcounts,LGBMClassifier,n_estimators=120,12.95,0.804084,0.676341
4,20180326_diffcounts,LGBMClassifier,n_estimators=120,10.54,0.806556,0.705754
5,20180402_diffcounts,LGBMClassifier,n_estimators=120,8.53,0.7811,0.739115
6,20180409_diffcounts,LGBMClassifier,n_estimators=120,10.44,0.832328,0.771817
7,20180416_diffcounts,LGBMClassifier,n_estimators=120,5.76,0.843448,0.0


In [42]:
# Blend the models: element-wise mean of the test predictions of every step.
per_model_predictions = [predictions for predictions, _ in output]
test[target_col] = np.mean(per_model_predictions, axis=0)

# Submission

In [43]:
# Start from the PredictionIdx column of the sample submission so its row
# order is kept, then left-join the averaged predictions onto it.
prediction_ids = pd.read_csv(RAW/'sample_submission.csv', low_memory=False)[['PredictionIdx']]
submission = prediction_ids.merge(
    test[['PredictionIdx', target_col]],
    how='left',
    on='PredictionIdx',
)

In [44]:
# Create the submissions folder on first use so the write cannot fail on a
# missing directory at the very end of the run.
SUBMISSIONS.mkdir(parents=True, exist_ok=True)
submission.to_csv(SUBMISSIONS/'06-lgbm_8weeks_diffscount_0212-0416.csv', index=False)