In [1]:
import numpy as np
import pandas as pd
import os
from sklearn.linear_model import LogisticRegression
from scipy.special import expit, logit
from sklearn.metrics import roc_auc_score

In [2]:
# Directory (relative to the notebook) holding the validation frame and every
# base model's prediction files; later cells prepend it to file names.
path1 = '../valid-train_data/'

# Validation frame; read_pickle must only ever be pointed at trusted files.
validation_path = path1 + 'validation.pkl.gz'
valid_df = pd.read_pickle(validation_path)

In [3]:
# Clipping bounds applied to probabilities before taking the logit, so that
# exact 0 or 1 predictions do not turn into -inf / +inf.
almost_zero = 1.0e-10
almost_one = 1.0 - almost_zero

In [19]:
# Registry of base models fed to the stacker: short key -> human-readable
# description. The keys are used to look up entries in `cvfiles` and
# `subfiles` and become the column names of the stacking matrices, so they
# must match those dicts exactly (including the padding spaces).
base_models = {
    'lgb1 ': "Python LGBM based on Pranav Pandya's R version",
    'nn1  ': "Neural Network based on Alexander Kireev's",
    'wbftl': "anttip's Wordbatch FM-FTRL",
    'medium rf': 'random forest medium depth',
    # Key was 'deep rf' (with a space), which raised KeyError because
    # `cvfiles` and `subfiles` both spell it 'deep_rf'.
    'deep_rf': 'random forest deep depth',
    'xgboost': 'xgboost'
    }

In [20]:
# Per-model prediction files for the validation set. Entries are looked up
# with the keys of `base_models`, so the keys here have to match them.
_cv_filenames = {
    'lgb1 ': 'lgbm_r_to_python_withcv_150upsample_valid.csv',
    'nn1  ': 'NNET_valid.csv',
    'wbftl': 'wordbatch_fm_ftrl_valid.csv',
    'medium rf': 'morefeat_rf_medium_valid.csv',
    'deep_rf': 'morefeat_rf_deep_valid.csv',
    'xgboost': 'xgb_valid.csv',
}
cvfiles = {model: path1 + filename for model, filename in _cv_filenames.items()}

In [None]:
# NOTE: the NNET prediction file has `ip` as its 2nd column.

In [21]:
# Per-model prediction files for the test set (submission format). Entries
# are looked up with the keys of `base_models`, so the keys have to match.
_sub_filenames = {
    'lgb1 ': 'lgbm_r_to_python_nocv_150upsample_sub.csv',
    'nn1  ': 'NNET_sub.csv',
    # NOTE(review): this is the only entry without a '.csv' extension --
    # confirm the file on disk really is named this way.
    'wbftl': 'wordbatch_fm_ftrl_sub',
    'medium rf': 'morefeat_rf_medium_sub.csv',
    'deep_rf': 'morefeat_rf_deep_sub.csv',
    'xgboost': 'xgb_sub.csv',
}
subfiles = {model: path1 + filename for model, filename in _sub_filenames.items()}

In [None]:
# Training matrix for the stacker: one column per base model holding the
# logit of its validation-set prediction. Probabilities are clipped first so
# that exact 0/1 values do not become -inf/+inf.
cvdata = pd.DataFrame( {
    m: pd.read_csv(cvfiles[m])['is_attributed'].clip(almost_zero, almost_one).apply(logit)
    for m in base_models
    } )
X_train = np.array(cvdata)

# Labels come from the validation frame loaded at the top of the notebook.
# (This used to re-read the pickle via `VAL_FILE`, a name that is never
# defined, so the cell failed with NameError on a fresh kernel.)
y_train = valid_df['is_attributed']

# NOTE(review): assumes every prediction file lists rows in the same order as
# `valid_df` -- confirm. At minimum the row counts must agree.
assert len(X_train) == len(y_train), (
    "validation predictions (%d rows) and labels (%d rows) differ in length"
    % (len(X_train), len(y_train))
)

In [None]:
# Pairwise correlation between the columns of `cvdata` (the base models'
# logit-scale validation predictions).
model_correlations = cvdata.corr()
model_correlations

In [None]:
# Second-level model: logistic regression fitted on the logit-transformed
# base-model predictions. `fit` returns the estimator itself, so the
# construction and the fit can be chained.
stack_model = LogisticRegression().fit(X_train, y_train)

# Display the learned coefficient for each base model.
stack_model.coef_

In [None]:
# Coefficients rescaled to sum to 1, as a rough "blend weight" per model.
# NOTE(review): only meaningful when the coefficient sum is not close to zero.
weights = stack_model.coef_ / stack_model.coef_.sum()

# Individual validation AUC of every base model (expit undoes the logit).
scores = [roc_auc_score(y_train, expit(cvdata[c])) for c in cvdata.columns]

# Row labels are derived from `cvdata.columns` -- the same order that
# `scores` and the columns of X_train (hence `weights`) follow -- rather than
# from `base_models.values()`, so each label is guaranteed to sit next to its
# own score/weight. Passing a list also avoids handing pandas a raw
# dict_values view as an index.
pd.DataFrame(
    data={'score': scores, 'weight': weights.reshape(-1)},
    index=[base_models[c] for c in cvdata.columns],
)

In [None]:
# AUC of the stacker on the same rows it was fitted on (an in-sample score).
stacker_auc = roc_auc_score(y_train, stack_model.predict_proba(X_train)[:, 1])
print('Stacker score: ', stacker_auc)

In [None]:
# Load every base model's test-set predictions. Only `click_id` and the
# prediction are kept: any extra column in a submission file (e.g. an `ip`
# column) would otherwise survive the merge and be fed to the stacker as if
# it were a model prediction. The prediction column is renamed to the model
# key so the merged frame has one column per model.
subs = {
    m: pd.read_csv(subfiles[m])[['click_id', 'is_attributed']]
         .rename(columns={'is_attributed': m})
    for m in base_models
}

# The first model's file defines the click_id order of the final submission.
first_model = list(base_models.keys())[0]
final_sub = pd.DataFrame()
final_sub['click_id'] = subs[first_model]['click_id']

In [None]:
# Join all submissions into one frame keyed by click_id, starting from the
# first model's frame. Merging on the key (rather than concatenating
# columns) keeps rows aligned even if files list clicks in different orders.
df = subs[first_model]
for model_name, model_sub in subs.items():
    if model_name == first_model:
        continue
    df = df.merge(model_sub, on='click_id')
df.head()

In [None]:
# Test matrix for the stacker. Columns are selected explicitly in the same
# order as the training matrix (`cvdata.columns`) instead of "everything
# except click_id", so a stray extra column or a different column order in
# the merged frame cannot silently misalign features with the fitted
# coefficients. Same clip + logit transform as for training.
X_test = np.array(
    df[list(cvdata.columns)].clip(almost_zero, almost_one).apply(logit)
)

# Take click_id from the merged frame the predictions were computed on, so
# ids and predictions are aligned row-for-row by construction.
final_sub = df[['click_id']].copy()
final_sub['is_attributed'] = stack_model.predict_proba(X_test)[:, 1]
final_sub.head(10)

In [None]:
# Write the stacked submission without the index column, with floats
# formatted to 9 decimal places.
final_sub.to_csv(
    "sub_stacked.csv",
    index=False,
    float_format='%.9f',
)