In [1]:
# Reload edited project modules automatically, so changes under src/ are picked up live.
%load_ext autoreload
%autoreload 2
# Render matplotlib figures inline, using the 'retina' figure format.
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import os, math
import numpy as np, pandas as pd
import matplotlib.pyplot as plt, seaborn as sns
from pandas_summary import DataFrameSummary
from tqdm import tqdm, tqdm_notebook
from pathlib import Path
# Raise pandas' display limits so wide / long frames are not truncated in outputs.
pd.set_option('display.max_columns', 1000)
pd.set_option('display.max_rows', 400)
sns.set()

# Move two levels up (presumably to the project root) so `src` is importable and the
# relative `data/...` paths resolve. The guard keeps the cell idempotent: once `src`
# is visible from the working directory, re-running does not climb any further.
if not Path('src').is_dir():
    os.chdir('../..')
from src import utils

In [7]:
# Data directory layout, relative to the current working directory.
DATA = Path('data')
RAW, INTERIM, PROCESSED, SUBMISSIONS = (
    DATA / sub for sub in ('raw', 'interim', 'processed', 'submissions')
)

In [3]:
def _read_raw(filename):
    """Read one CSV file from the RAW directory with `low_memory=False`."""
    return pd.read_csv(RAW/filename, low_memory=False)

# One frame per raw file; the variable names are used by the cells below.
challenge = _read_raw('Challenge_20180423.csv')
customer = _read_raw('Customer.csv')
isin = _read_raw('Isin.csv')
market = _read_raw('Market.csv')
macro = _read_raw('MarketData_Macro.csv')
submission = _read_raw('sample_submission.csv')
trade = _read_raw('Trade.csv')

## 50% baseline

In [5]:
# Constant baseline: start from the sample submission and predict 0.5 for every row.
submission = (
    pd.read_csv(RAW/'sample_submission.csv', low_memory=False)
      .assign(CustomerInterest=0.5)
)

In [6]:
# Preview the first rows to confirm CustomerInterest holds the constant 0.5.
submission.head()

Unnamed: 0,PredictionIdx,CustomerInterest
0,a1e0d80784,0.5
1,c2cc6cc2a8,0.5
2,a8e94f6344,0.5
3,758bae1e35,0.5
4,02ab378ee8,0.5


In [8]:
# Write the 0.5 baseline into the submissions directory, without the row index.
submission.to_csv(SUBMISSIONS/'baseline50.csv', index=False)

## 0% baseline

In [93]:
# Constant baseline: predict 0 for every row of the sample submission and save it.
submission = (
    pd.read_csv(RAW/'sample_submission.csv', low_memory=False)
      .assign(CustomerInterest=0)
)
submission.to_csv(SUBMISSIONS/'baseline0.csv', index=False)

## 100% baseline

In [94]:
# Constant baseline: predict 1 for every row of the sample submission and save it.
submission = (
    pd.read_csv(RAW/'sample_submission.csv', low_memory=False)
      .assign(CustomerInterest=1)
)
submission.to_csv(SUBMISSIONS/'baseline100.csv', index=False)

## Metadata only

In [96]:
# Attach customer and ISIN metadata to every trade row (left joins keep all trades).
# validate='many_to_one' makes pandas raise a MergeError if a key is duplicated in the
# lookup table, instead of silently multiplying trade rows.
train = pd.merge(trade, customer, how='left', on='CustomerIdx', validate='many_to_one')
train = pd.merge(train, isin, how='left', on='IsinIdx', validate='many_to_one')

In [97]:
# Attach customer and ISIN metadata to every challenge row (left joins keep all rows).
# validate='many_to_one' makes pandas raise a MergeError if a key is duplicated in the
# lookup table; extra rows here would break the one-row-per-PredictionIdx submission.
test = pd.merge(challenge, customer, how='left', on='CustomerIdx', validate='many_to_one')
test = pd.merge(test, isin, how='left', on='IsinIdx', validate='many_to_one')

In [98]:
# Sanity check: size of the merged training frame and a sample of its rows.
print(train.shape)
train.head(3)

(6762021, 28)


Unnamed: 0,TradeDateKey,CustomerIdx,IsinIdx,BuySell,NotionalEUR,Price,TradeStatus,CustomerInterest,Sector,Subsector,Region_x,Country,TickerIdx,ActualMaturityDateKey,IssueDateKey,Seniority,Currency,ActivityGroup,Region_y,Activity,RiskCaptain,Owner,CompositeRating,IndustrySector,IndustrySubgroup,MarketIssue,IssuedAmount,CouponType
0,20161207,2789,8478,Sell,653168.0,0.0,Unknown,1.0,Asset Managers & Hedge Funds,Hedge Fund,Americas,USA,1173,20250201,20150127,SEN,USD,FLOW G10,AMERICAS,CDS AND HY,CDS AND HY,US HY PHARMA,CCC+,"Consumer, Non-cyclic",Medical-Drugs,Priv placement,1200000000.0,STEP CPN
1,20170329,2574,14562,Buy,1656487.0,0.0,Unknown,1.0,Asset Managers & Hedge Funds,Independent Asset Manager,Americas,USA,3216,20260615,20161014,SEN,USD,FLOW G10,AMERICAS,CDS AND HY,CDS AND HY,US HY PHARMA,BB-,"Consumer, Cyclical",Home Furnishings,Global,600000000.0,FIXED
2,20170418,2574,4747,Buy,939673.0,0.0,Unknown,1.0,Asset Managers & Hedge Funds,Independent Asset Manager,Americas,USA,152,20190127,20140127,SEN,USD,FLOW G10,AMERICAS,CDS AND HY,CDS AND HY,US HY FIN AUTOS,BB,Financial,Finance-Auto Loans,Global,750000000.0,FIXED


In [99]:
# Sanity check: size of the merged test frame and a sample of its rows.
print(test.shape)
test.head(3)

(484758, 26)


Unnamed: 0,PredictionIdx,DateKey,CustomerIdx,IsinIdx,BuySell,CustomerInterest,Sector,Subsector,Region_x,Country,TickerIdx,ActualMaturityDateKey,IssueDateKey,Seniority,Currency,ActivityGroup,Region_y,Activity,RiskCaptain,Owner,CompositeRating,IndustrySector,IndustrySubgroup,MarketIssue,IssuedAmount,CouponType
0,a1e0d80784,20180423,1856,13323,Buy,,Asset Managers & Hedge Funds,Independent Asset Manager,Americas,USA,2740,20210315,20130314,SEN,USD,FLOW G10,AMERICAS,CDS AND HY,CDS AND HY,US TMT CDS,B-,"Consumer, Non-cyclic",Printing-Commercial,Global,450000000.0,FIXED
1,c2cc6cc2a8,20180423,1856,9230,Buy,,Asset Managers & Hedge Funds,Independent Asset Manager,Americas,USA,1446,20240215,20131210,SEN,USD,FLOW G10,AMERICAS,CDS AND HY,CDS AND HY,US HY FIN AUTOS,B,Financial,Multi-line Insurance,US domestic,400000000.0,FIXED
2,a8e94f6344,20180423,1780,9157,Buy,,Asset Managers & Hedge Funds,Asset Mgr owned by Bank/Insur.,Americas,USA,1387,20360815,20060815,SEN,USD,FLOW G10,AMERICAS,HG CASH,HG CASH FIN,US FIN SHORT CASH,A+,Industrial,Diversified Manufact Op,Domestic mtn,300000000.0,FLOATING


In [100]:
# Use the same date column name as test (`DateKey`).
# pop + assign keeps the original outcome (DateKey ends up as the last column), and
# the guard makes the cell safe to re-run once TradeDateKey is already gone.
if 'TradeDateKey' in train.columns:
    train['DateKey'] = train.pop('TradeDateKey')

In [101]:
# Drop the trade-only columns that are not present in test.
# errors='ignore' keeps the cell idempotent: re-running after the columns are gone
# is a no-op instead of a KeyError.
extra_cols = ['Price', 'TradeStatus', 'NotionalEUR']
train = train.drop(columns=extra_cols, errors='ignore')

In [102]:
# test is expected to have exactly one column more than train: PredictionIdx.
train.shape, test.shape

((6762021, 25), (484758, 26))

In [103]:
# Column names present in only one frame, as a pair: (train - test, test - train).
set(train.columns).difference(test.columns), set(test.columns).difference(train.columns)

(set(), {'PredictionIdx'})

In [104]:
# Give test the same column order as train, with PredictionIdx kept as the last column.
cols = train.columns.tolist()
test = test.loc[:, cols + ['PredictionIdx']].copy()

In [105]:
# Spot-check train after the rename / drop steps.
train.head(3)

Unnamed: 0,CustomerIdx,IsinIdx,BuySell,CustomerInterest,Sector,Subsector,Region_x,Country,TickerIdx,ActualMaturityDateKey,IssueDateKey,Seniority,Currency,ActivityGroup,Region_y,Activity,RiskCaptain,Owner,CompositeRating,IndustrySector,IndustrySubgroup,MarketIssue,IssuedAmount,CouponType,DateKey
0,2789,8478,Sell,1.0,Asset Managers & Hedge Funds,Hedge Fund,Americas,USA,1173,20250201,20150127,SEN,USD,FLOW G10,AMERICAS,CDS AND HY,CDS AND HY,US HY PHARMA,CCC+,"Consumer, Non-cyclic",Medical-Drugs,Priv placement,1200000000.0,STEP CPN,20161207
1,2574,14562,Buy,1.0,Asset Managers & Hedge Funds,Independent Asset Manager,Americas,USA,3216,20260615,20161014,SEN,USD,FLOW G10,AMERICAS,CDS AND HY,CDS AND HY,US HY PHARMA,BB-,"Consumer, Cyclical",Home Furnishings,Global,600000000.0,FIXED,20170329
2,2574,4747,Buy,1.0,Asset Managers & Hedge Funds,Independent Asset Manager,Americas,USA,152,20190127,20140127,SEN,USD,FLOW G10,AMERICAS,CDS AND HY,CDS AND HY,US HY FIN AUTOS,BB,Financial,Finance-Auto Loans,Global,750000000.0,FIXED,20170418


In [106]:
# Spot-check test after re-ordering its columns to match train.
test.head(3)

Unnamed: 0,CustomerIdx,IsinIdx,BuySell,CustomerInterest,Sector,Subsector,Region_x,Country,TickerIdx,ActualMaturityDateKey,IssueDateKey,Seniority,Currency,ActivityGroup,Region_y,Activity,RiskCaptain,Owner,CompositeRating,IndustrySector,IndustrySubgroup,MarketIssue,IssuedAmount,CouponType,DateKey,PredictionIdx
0,1856,13323,Buy,,Asset Managers & Hedge Funds,Independent Asset Manager,Americas,USA,2740,20210315,20130314,SEN,USD,FLOW G10,AMERICAS,CDS AND HY,CDS AND HY,US TMT CDS,B-,"Consumer, Non-cyclic",Printing-Commercial,Global,450000000.0,FIXED,20180423,a1e0d80784
1,1856,9230,Buy,,Asset Managers & Hedge Funds,Independent Asset Manager,Americas,USA,1446,20240215,20131210,SEN,USD,FLOW G10,AMERICAS,CDS AND HY,CDS AND HY,US HY FIN AUTOS,B,Financial,Multi-line Insurance,US domestic,400000000.0,FIXED,20180423,c2cc6cc2a8
2,1780,9157,Buy,,Asset Managers & Hedge Funds,Asset Mgr owned by Bank/Insur.,Americas,USA,1387,20360815,20060815,SEN,USD,FLOW G10,AMERICAS,HG CASH,HG CASH FIN,US FIN SHORT CASH,A+,Industrial,Diversified Manufact Op,Domestic mtn,300000000.0,FLOATING,20180423,a8e94f6344


In [108]:
# Heuristic split of the feature columns:
#   * id_cols and target_col are neither numeric nor categorical features,
#   * num_cols: columns with a numeric dtype, except TickerIdx (treated as categorical),
#   * cat_cols: every remaining column.
# select_dtypes is the public pandas API for the dtype filter (previously the private
# DataFrame._get_numeric_data()).
id_cols = ['CustomerIdx', 'IsinIdx']
target_col = 'CustomerInterest'
num_cols = [c for c in train.select_dtypes(include='number').columns
            if c not in id_cols + [target_col, 'TickerIdx']]
cat_cols = [c for c in train.columns if c not in id_cols + num_cols + [target_col]]

In [109]:
# Columns the heuristic classified as numeric features.
num_cols

['ActualMaturityDateKey', 'IssueDateKey', 'IssuedAmount', 'DateKey']

In [110]:
# Columns the heuristic classified as categorical features.
print(cat_cols)

['BuySell', 'Sector', 'Subsector', 'Region_x', 'Country', 'TickerIdx', 'Seniority', 'Currency', 'ActivityGroup', 'Region_y', 'Activity', 'RiskCaptain', 'Owner', 'CompositeRating', 'IndustrySector', 'IndustrySubgroup', 'MarketIssue', 'CouponType']


In [111]:
# Number of distinct values per categorical column, printed as "<count> <tab> <name>".
cardinalities = {col: train[col].nunique() for col in cat_cols}
for col, n_unique in cardinalities.items():
    print(n_unique, '\t', col)

2 	 BuySell
5 	 Sector
41 	 Subsector
3 	 Region_x
99 	 Country
3543 	 TickerIdx
9 	 Seniority
23 	 Currency
3 	 ActivityGroup
8 	 Region_y
16 	 Activity
38 	 RiskCaptain
105 	 Owner
29 	 CompositeRating
13 	 IndustrySector
338 	 IndustrySubgroup
14 	 MarketIssue
6 	 CouponType


In [112]:
# Class balance of the target in train.
train[target_col].value_counts()

0.0    4550092
1.0    2211929
Name: CustomerInterest, dtype: int64

In [113]:
# Frame sizes right before preprocessing.
train.shape, test.shape

((6762021, 25), (484758, 26))

In [114]:
# Columns left once the ids and the target are removed, i.e. the model's features.
train.drop(id_cols + [target_col], axis=1).columns

Index(['BuySell', 'Sector', 'Subsector', 'Region_x', 'Country', 'TickerIdx',
       'ActualMaturityDateKey', 'IssueDateKey', 'Seniority', 'Currency',
       'ActivityGroup', 'Region_y', 'Activity', 'RiskCaptain', 'Owner',
       'CompositeRating', 'IndustrySector', 'IndustrySubgroup', 'MarketIssue',
       'IssuedAmount', 'CouponType', 'DateKey'],
      dtype='object')

## Preprocessing

In [115]:
from src.utils import apply_cats

# Convert every categorical feature of train to an ordered pandas categorical.
for name in cat_cols:
    as_category = train[name].astype('category')
    train[name] = as_category.cat.as_ordered()
# presumably re-encodes test's columns with train's categories — confirm in src.utils
apply_cats(test, train)

In [116]:
# Replace each categorical column by its integer category codes, in both frames.
for frame in (train, test):
    for col in cat_cols:
        frame[col] = frame[col].cat.codes

## Cross validation

In [117]:
import time, pprint
from sklearn.model_selection import KFold

# Shared by the cross-validation functions below: a pretty-printer for the metric
# dictionaries and a seeded, shuffled 5-fold splitter.
# NOTE(review): shuffled folds mix rows of all DateKey values between training and
# validation — confirm this is intended given that test uses a single, later date.
pp = pprint.PrettyPrinter(indent=3)
kfolds = KFold(shuffle=True, n_splits=5, random_state=42)

In [118]:
from sklearn.metrics import roc_auc_score

# globals: [cat_indices] (optional fallback for `cat_features`)
def fit_model(model, model_name, X_trn, y_trn, X_val, y_val,
              cat_features=None, early_stopping_rounds=30):
    """Fit `model` in place, using library-specific `fit` arguments.

    Parameters
    ----------
    model : object exposing `fit`.
    model_name : str
        Class name used to choose the call: 'XGBClassifier' / 'LGBMClassifier',
        'CatBoostClassifier', or anything else for a plain `fit(X_trn, y_trn)`.
    X_trn, y_trn : training features / target.
    X_val, y_val : validation data, passed as `eval_set` to the boosting libraries.
    cat_features : optional
        Categorical feature indices forwarded to CatBoost. When omitted, the
        notebook-level `cat_indices` is used if it exists, otherwise None.
    early_stopping_rounds : int, default 30
        Patience forwarded to XGBoost / LightGBM.

    Returns
    -------
    None; the model is fitted as a side effect.
    """
    if model_name in ('XGBClassifier', 'LGBMClassifier'):
        # NOTE(review): newer XGBoost / LightGBM releases may no longer accept these
        # keywords in fit(); confirm against the installed versions.
        model.fit(X_trn, y_trn,
                  eval_set=[(X_val, y_val)],
                  early_stopping_rounds=early_stopping_rounds,
                  eval_metric='auc')
    elif model_name == 'CatBoostClassifier':
        if cat_features is None:
            # Backward compatible with the old implicit global, but without raising
            # a NameError when `cat_indices` was never defined.
            cat_features = globals().get('cat_indices')
        model.fit(X_trn, y_trn,
                  eval_set=[(X_val, y_val)],
                  use_best_model=True,
                  cat_features=cat_features)
    else:
        model.fit(X_trn, y_trn)
        
def calculate_metrics(model, metrics, X_trn, y_trn, X_val, y_val):
    """Score `model` on the training and validation data and record the results.

    `metrics` maps a metric name to {'trn': [...], 'val': [...]}; one new score is
    added to each of those lists in place. Scores are computed from the second
    column of `model.predict_proba`. Only 'auc' (roc_auc_score) is known here.
    The updated dictionary is pretty-printed, followed by an empty line.
    """
    scorers = {'auc': roc_auc_score}
    splits = (('trn', X_trn, y_trn), ('val', X_val, y_val))

    for split_name, features, target in splits:
        positive_proba = model.predict_proba(features)[:, 1]
        for metric_name in metrics:
            score = scorers[metric_name](target, positive_proba)
            metrics[metric_name][split_name] += [score]

    pp.pprint(metrics)
    print()

In [119]:
# globals: [kfolds]
def run_model(model, X, y, X_test, metric_names, results=None, dataset_desc='', params_desc=''):
    """Cross-validate `model` and average its test-set probabilities over the folds.

    For every split produced by the global `kfolds`, the same model instance is
    fitted with `fit_model`, scored on the training and validation parts with
    `calculate_metrics`, and its `predict_proba(X_test)[:, 1]` is accumulated.

    Parameters
    ----------
    model : object exposing `fit` and `predict_proba`.
    X, y : training features / target, sliced positionally with `.iloc`.
    X_test : features to predict.
    metric_names : iterable of metric names understood by `calculate_metrics`.
    results : DataFrame or None
        Summary rows of earlier runs; a new row is added to a new frame.
    dataset_desc, params_desc : str
        Free-text labels stored in the summary row.

    Returns
    -------
    (y_test, metrics, results)
        Fold-averaged test probabilities, the per-fold metric lists, and the
        summary frame including this run.
    """
    # type(...).__name__ gives the class name directly instead of parsing it out of
    # the class' string representation.
    model_name = type(model).__name__
    print(model_name, '\n')
    metrics = {metric: {'trn': [], 'val': []} for metric in metric_names}
    y_test = np.zeros(len(X_test))
    start = time.time()
    for trn_idx, val_idx in kfolds.split(X, y):
        print('Training fold')
        # Materialise each slice once; it is needed for both fitting and scoring.
        X_trn, y_trn = X.iloc[trn_idx], y.iloc[trn_idx]
        X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]
        fit_model(model, model_name, X_trn, y_trn, X_val, y_val)
        calculate_metrics(model, metrics, X_trn, y_trn, X_val, y_val)
        y_test += model.predict_proba(X_test)[:, 1]
    y_test /= kfolds.n_splits
    end = time.time()
    means = {f'{d}_{m}_mean': np.mean(metrics[m][d]) for m in metrics
                                                     for d in metrics[m]}
    stds  = {f'{d}_{m}_std': np.std(metrics[m][d]) for m in metrics
                                                     for d in metrics[m]}
    metadata = {'model': model_name, 'dataset': dataset_desc,
                'params': params_desc, 'time': round(end - start, 2)}
    pp.pprint(means)
    # DataFrame.append was removed in pandas 2.0; build a one-row frame and concat.
    # An empty / missing `results` is skipped so it cannot influence the dtypes.
    row = pd.DataFrame([{**metadata, **means, **stds}])
    frames = [row] if results is None or results.empty else [results, row]
    results = pd.concat(frames, ignore_index=True)
    return y_test, metrics, results

In [120]:
from lightgbm import LGBMClassifier

In [121]:
# Metrics tracked per fold; each name must be a key known to calculate_metrics ('auc').
metric_names = ['auc']

In [123]:
%%time
y_test, metrics, results = run_model(
                            LGBMClassifier(),
                            train.drop(id_cols + [target_col], axis=1),
                            train[target_col],
                            test.drop(id_cols + [target_col, 'PredictionIdx'], axis=1),
                            metric_names, None, 
                            params_desc='default',
                            dataset_desc='metadata-only')

LGBMClassifier 

Training fold
[1]	valid_0's auc: 0.850361
Training until validation scores don't improve for 30 rounds.
[2]	valid_0's auc: 0.869478
[3]	valid_0's auc: 0.872868
[4]	valid_0's auc: 0.87765
[5]	valid_0's auc: 0.891636
[6]	valid_0's auc: 0.896225
[7]	valid_0's auc: 0.898164
[8]	valid_0's auc: 0.907492
[9]	valid_0's auc: 0.909081
[10]	valid_0's auc: 0.914509
[11]	valid_0's auc: 0.922819
[12]	valid_0's auc: 0.929132
[13]	valid_0's auc: 0.929643
[14]	valid_0's auc: 0.945129
[15]	valid_0's auc: 0.95476
[16]	valid_0's auc: 0.95848
[17]	valid_0's auc: 0.959579
[18]	valid_0's auc: 0.962591
[19]	valid_0's auc: 0.964839
[20]	valid_0's auc: 0.965932
[21]	valid_0's auc: 0.967053
[22]	valid_0's auc: 0.96842
[23]	valid_0's auc: 0.969173
[24]	valid_0's auc: 0.970089
[25]	valid_0's auc: 0.971384
[26]	valid_0's auc: 0.971915
[27]	valid_0's auc: 0.972695
[28]	valid_0's auc: 0.973429
[29]	valid_0's auc: 0.974465
[30]	valid_0's auc: 0.975193
[31]	valid_0's auc: 0.975932
[32]	valid_0's auc: 0

In [124]:
# Cross-validation summary returned by run_model for the run above.
results

Unnamed: 0,dataset,model,params,time,trn_auc_mean,trn_auc_std,val_auc_mean,val_auc_std
0,metadata-only,LGBMClassifier,default,194.37,0.987695,0.000502,0.987667,0.000504


In [75]:
# NOTE(review): this cell's execution count (75) is older than the run above and its
# saved output shows different scores — it looks like a stale leftover; re-run or remove.
results

Unnamed: 0,dataset,model,params,time,trn_auc_mean,trn_auc_std,val_auc_mean,val_auc_std
0,metadata-only,LGBMClassifier,default,190.01,0.956597,0.001176,0.956548,0.001211


In [125]:
# Store the fold-averaged predicted probabilities as test's target column.
test[target_col] = y_test

## Submission

In [126]:
# Keep the sample submission's PredictionIdx rows and order, and look up each row's
# prediction from test.
submission = (
    pd.read_csv(RAW/'sample_submission.csv', low_memory=False)[['PredictionIdx']]
      .merge(test[['PredictionIdx', target_col]], how='left', on='PredictionIdx')
)

In [127]:
# Summary statistics of the predicted values written to the submission.
submission[target_col].describe()

count    484758.000000
mean          0.855839
std           0.136516
min           0.133469
25%           0.799480
50%           0.908899
75%           0.954256
max           0.996476
Name: CustomerInterest, dtype: float64

In [89]:
# NOTE(review): execution count 89 is older than the neighbouring cells, and the saved
# output (only multiples of 0.2) contradicts the describe() output above — it looks
# stale; re-run or remove.
submission[target_col].value_counts().sort_index()

0.0      5994
0.2       789
0.4      1085
0.6      1122
0.8      2562
1.0    473206
Name: CustomerInterest, dtype: int64

In [128]:
# Preview the first rows of the final submission.
submission.head()

Unnamed: 0,PredictionIdx,CustomerInterest
0,a1e0d80784,0.530181
1,c2cc6cc2a8,0.572911
2,a8e94f6344,0.653898
3,758bae1e35,0.835174
4,02ab378ee8,0.635883


In [129]:
# Write the LightGBM metadata-only probabilities into the submissions directory,
# without the row index.
submission.to_csv(SUBMISSIONS/'lgbm_metadata_proba.csv', index=False)