In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import os, math
import numpy as np, pandas as pd
import matplotlib.pyplot as plt, seaborn as sns
from pandas_summary import DataFrameSummary
from tqdm import tqdm, tqdm_notebook
from pathlib import Path
pd.set_option('display.max_columns', 1000)
pd.set_option('display.max_rows', 400)
sns.set()

os.chdir('../..')
from src import utils

In [2]:
DATA        = Path('data')
RAW         = DATA/'raw'
INTERIM     = DATA/'interim'
PROCESSED   = DATA/'processed'
SUBMISSIONS = DATA/'submissions'

# Data

In [3]:
challenge  = pd.read_csv(RAW/'Challenge_20180423.csv', low_memory=False)
customer   = pd.read_csv(RAW/'Customer.csv', low_memory=False)
isin       = pd.read_csv(RAW/'Isin.csv', low_memory=False)
market     = pd.read_csv(RAW/'Market.csv', low_memory=False)
macro      = pd.read_csv(RAW/'MarketData_Macro.csv', low_memory=False)
submission = pd.read_csv(RAW/'sample_submission.csv', low_memory=False)
trade      = pd.read_csv(RAW/'Trade.csv', low_memory=False)

In [4]:
train = pd.merge(trade, customer, how='left', on='CustomerIdx')
train = pd.merge(train, isin, how='left', on='IsinIdx')

In [5]:
test = pd.merge(challenge, customer, how='left', on='CustomerIdx')
test = pd.merge(test, isin, how='left', on='IsinIdx')

In [6]:
# same date column name
train['DateKey'] = train['TradeDateKey']
train.drop('TradeDateKey', axis=1, inplace=True)

In [7]:
# drop extra columns 
extra_cols = ['Price', 'TradeStatus', 'NotionalEUR']
train.drop(extra_cols, axis=1, inplace=True)

In [8]:
# keep same column order
cols = list(train.columns)
test = test[cols + ['PredictionIdx']].copy()

In [9]:
# keep same column order
cols = list(train.columns)
test = test[cols + ['PredictionIdx']].copy()

In [11]:
# Heuristic for num columns
id_cols = ['CustomerIdx', 'IsinIdx']
target_col = 'CustomerInterest'
num_cols = [c for c in train._get_numeric_data().columns if c not in id_cols + [target_col, 'TickerIdx']]
cat_cols = [c for c in train.columns if c not in id_cols + num_cols + [target_col]]

In [12]:
train.drop(id_cols + [target_col], axis=1).columns

Index(['BuySell', 'Sector', 'Subsector', 'Region_x', 'Country', 'TickerIdx',
       'ActualMaturityDateKey', 'IssueDateKey', 'Seniority', 'Currency',
       'ActivityGroup', 'Region_y', 'Activity', 'RiskCaptain', 'Owner',
       'CompositeRating', 'IndustrySector', 'IndustrySubgroup', 'MarketIssue',
       'IssuedAmount', 'CouponType', 'DateKey'],
      dtype='object')

In [13]:
train.columns

Index(['CustomerIdx', 'IsinIdx', 'BuySell', 'CustomerInterest', 'Sector',
       'Subsector', 'Region_x', 'Country', 'TickerIdx',
       'ActualMaturityDateKey', 'IssueDateKey', 'Seniority', 'Currency',
       'ActivityGroup', 'Region_y', 'Activity', 'RiskCaptain', 'Owner',
       'CompositeRating', 'IndustrySector', 'IndustrySubgroup', 'MarketIssue',
       'IssuedAmount', 'CouponType', 'DateKey'],
      dtype='object')

# Preprocessing

In [14]:
from src.utils import apply_cats
for col in cat_cols:
    train[col] = train[col].astype('category').cat.as_ordered()
apply_cats(test, train)

In [15]:
for col in cat_cols:
    train[col] = train[col].cat.codes
    test[col] = test[col].cat.codes

In [21]:
id_cols = ['Country', 'Subsector']

In [23]:
train.drop(id_cols, axis=1)
test.drop(id_cols, axis=1).columns

Index(['CustomerIdx', 'IsinIdx', 'BuySell', 'CustomerInterest', 'Sector',
       'Region_x', 'TickerIdx', 'ActualMaturityDateKey', 'IssueDateKey',
       'Seniority', 'Currency', 'ActivityGroup', 'Region_y', 'Activity',
       'RiskCaptain', 'Owner', 'CompositeRating', 'IndustrySector',
       'IndustrySubgroup', 'MarketIssue', 'IssuedAmount', 'CouponType',
       'DateKey', 'PredictionIdx'],
      dtype='object')

# Cross validation

In [28]:
import time, pprint
from sklearn.model_selection import KFold

kfolds = KFold(n_splits=5, shuffle=True, random_state=42)
pp = pprint.PrettyPrinter(indent=3)

In [29]:
from sklearn.metrics import roc_auc_score

# globals: [cat_indices]
def fit_model(model, model_name, X_trn, y_trn, X_val, y_val):
    if model_name in ['XGBClassifier', 'LGBMClassifier']:
        model.fit(X_trn, y_trn, 
                  eval_set=[(X_val, y_val)],
                  early_stopping_rounds=30,
                  eval_metric='auc')
    elif model_name == 'CatBoostClassifier':
        model.fit(X_trn, y_trn, 
                  eval_set=[(X_val, y_val)],
                  use_best_model=True,
                  cat_features=cat_indices)
    else:
        model.fit(X_trn, y_trn)
        
def calculate_metrics(model, metrics, X_trn, y_trn, X_val, y_val):
    metric_function = {'auc': roc_auc_score}
    dset = {'trn': {'X': X_trn, 'y': y_trn},
            'val': {'X': X_val, 'y': y_val}}
    
    for d in dset:
        y_pred = model.predict(dset[d]['X'])
        for m in metrics:
            metrics[m][d] += [metric_function[m](dset[d]['y'], y_pred)]
                
    pp.pprint(metrics)
    print()

In [30]:
# globals: [kfolds]
def run_model(model, X, y, X_test, metric_names, results=None, dataset_desc='', params_desc=''):
    model_name = str(model.__class__).split('.')[-1].replace('>','').replace("'",'')
    print(model_name, '\n')
    if results is None: results = pd.DataFrame()
    metrics = {metric: {'trn': [], 'val': []} for metric in metric_names}
    y_test = np.zeros((len(X_test)))
    start = time.time()
    for trn_idx, val_idx in kfolds.split(X, y):
        print('Training fold')
        fit_model(model, model_name, 
                          X.iloc[trn_idx], y.iloc[trn_idx],
                          X.iloc[val_idx], y.iloc[val_idx])
        calculate_metrics(model, metrics, 
                          X.iloc[trn_idx], y.iloc[trn_idx],
                          X.iloc[val_idx], y.iloc[val_idx])
        y_test += model.predict(X_test)
    y_test /= kfolds.n_splits
    end = time.time()
    means = {f'{d}_{m}_mean': np.mean(metrics[m][d]) for m in metrics \
                                                     for d in metrics[m]}
    stds  = {f'{d}_{m}_std': np.std(metrics[m][d]) for m in metrics \
                                                     for d in metrics[m]}
    metadata = {'model': model_name, 'dataset': dataset_desc,
                'params': params_desc, 'time': round(end - start, 2)}
    pp.pprint(means)
    results = results.append(pd.Series({**metadata, **means, **stds}), ignore_index=True)
    return y_test, metrics, results

In [41]:
train.columns

Index(['CustomerIdx', 'IsinIdx', 'BuySell', 'CustomerInterest', 'Sector',
       'Subsector', 'Region_x', 'Country', 'TickerIdx',
       'ActualMaturityDateKey', 'IssueDateKey', 'Seniority', 'Currency',
       'ActivityGroup', 'Region_y', 'Activity', 'RiskCaptain', 'Owner',
       'CompositeRating', 'IndustrySector', 'IndustrySubgroup', 'MarketIssue',
       'IssuedAmount', 'CouponType', 'DateKey'],
      dtype='object')

In [27]:
from lightgbm import LGBMClassifier

In [31]:
metric_names = ['auc']

In [32]:
%%time
y_test, metrics, results = run_model(
                            LGBMClassifier(),
                            train.drop(id_cols + [target_col], axis=1),
                            train[target_col],
                            test.drop(id_cols + [target_col, 'PredictionIdx'], axis=1),
                            metric_names, None, 
                            params_desc='default',
                            dataset_desc='metadata-only')

LGBMClassifier 

Training fold
[1]	valid_0's auc: 0.842871
Training until validation scores don't improve for 30 rounds.
[2]	valid_0's auc: 0.870019
[3]	valid_0's auc: 0.877994
[4]	valid_0's auc: 0.885615
[5]	valid_0's auc: 0.886209
[6]	valid_0's auc: 0.894305
[7]	valid_0's auc: 0.906362
[8]	valid_0's auc: 0.916003
[9]	valid_0's auc: 0.917865
[10]	valid_0's auc: 0.927489
[11]	valid_0's auc: 0.933568
[12]	valid_0's auc: 0.934782
[13]	valid_0's auc: 0.94946
[14]	valid_0's auc: 0.950466
[15]	valid_0's auc: 0.958871
[16]	valid_0's auc: 0.962884
[17]	valid_0's auc: 0.966402
[18]	valid_0's auc: 0.968304
[19]	valid_0's auc: 0.969127
[20]	valid_0's auc: 0.970282
[21]	valid_0's auc: 0.970643
[22]	valid_0's auc: 0.971466
[23]	valid_0's auc: 0.973282
[24]	valid_0's auc: 0.973782
[25]	valid_0's auc: 0.974765
[26]	valid_0's auc: 0.975588
[27]	valid_0's auc: 0.976117
[28]	valid_0's auc: 0.977239
[29]	valid_0's auc: 0.978055
[30]	valid_0's auc: 0.979088
[31]	valid_0's auc: 0.97969
[32]	valid_0's auc:

In [33]:
results

Unnamed: 0,dataset,model,params,time,trn_auc_mean,trn_auc_std,val_auc_mean,val_auc_std
0,metadata-only,LGBMClassifier,default,189.22,0.964472,0.001113,0.964441,0.001138


In [34]:
test[target_col] = y_test

# Submission

In [35]:
submission = pd.read_csv(RAW/'sample_submission.csv', low_memory=False)
submission = pd.merge(submission[['PredictionIdx']], test[['PredictionIdx', target_col]], how='left', on='PredictionIdx')

In [36]:
submission[target_col].value_counts().sort_index()

0.0      5412
0.2       544
0.4       498
0.6       703
0.8      1813
1.0    475788
Name: CustomerInterest, dtype: int64

In [38]:
submission.head()

Unnamed: 0,PredictionIdx,CustomerInterest
0,a1e0d80784,1.0
1,c2cc6cc2a8,1.0
2,a8e94f6344,1.0
3,758bae1e35,1.0
4,02ab378ee8,1.0


In [40]:
submission.to_csv('lgbm_metadata.csv', index=False)