In [5]:
import os
import time
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.metrics import log_loss
from scipy.optimize import minimize

from models_utils_skf import *

In [6]:
def load_oof(mode = 'train'):
    src = '/media/w/1c392724-ecf3-4615-8f3c-79368ec36380/DS Projects/Kaggle/Quora/scripts/models/OOF_preds2/'
    oof_preds = pd.DataFrame()
    files = sorted([x for x in os.listdir(src + '{}/'.format(mode)) if '.pkl' in x or '.csv' in x
                   and 'stack' not in x])
    print('\n', 'Loading OOF preds:', files, '\n', 'Numer of files to load:', len(files), '\n')
    for i in files:
        if 'pkl'in i:
            df_preds = pd.read_pickle('{}/{}/{}'.format(src, mode, i))
            if 'id' in df_preds.columns:
                df_preds.drop(['id'], axis = 1, inplace = True)
            if 'test_id' in df_preds.columns:
                df_preds.drop(['test_id'], axis = 1, inplace = True)
        if '.csv'in i:
            df_preds = pd.read_csv('{}/{}/{}'.format(src, mode, i))
            if 'id' in df_preds.columns:
                df_preds.drop(['id'], axis = 1, inplace = True)
            if 'test_id' in df_preds.columns:
                df_preds.drop(['test_id'], axis = 1, inplace = True)
        if '0.18' in i and mode == 'test':
            print(i, 'applying transformation')
            df_preds.is_duplicate = df_preds.is_duplicate.apply(transform)
        oof_preds = pd.concat([oof_preds, df_preds], axis = 1)
    return oof_preds

def transform(x):
    a = 0.165 / 0.369191399096
    b =  (1 - 0.165) / (1 - 0.369191399096)
    xt = a * x / (a * x + b * (1 - x))
    return xt

def inv_pred_transform(preds):
    a = 0.165 / 0.369191399096
    b = (1 - 0.165) / (1 - 0.369191399096)
    return b * preds / (b * preds + a * (1 - preds))

def testOOF_transform(X_test2, inverse = True):
    X_test = X_test2.copy()
    for i in range(X_test.shape[1]):
        if inverse:
            X_test.iloc[:, i] = X_test.iloc[:, i].apply(inv_pred_transform)
        else:
            X_test.iloc[:, i] = X_test.iloc[:, i].apply(transform)
    return X_test

def predict_test_lgbm(test_preds, model_name, transform_preds = True):
    print('Predicting on test set with LightGBM.')
    sub_src = '/media/w/1c392724-ecf3-4615-8f3c-79368ec36380/DS Projects/Kaggle/Quora/submissions/'
    sample_sub = pd.read_csv(sub_src + 'sample_submission.csv')
    sample_sub['is_duplicate'] = test_preds
    if transform_preds:
        sample_sub.is_duplicate = sample_sub.is_duplicate.apply(transform)
        sample_sub.to_csv(sub_src + '{}_transformed.csv'.format(model_name), index = False)
    else:
        sample_sub.to_csv(sub_src + '{}.csv'.format(model_name), index = False)
    return

In [7]:
def weight_stacking(X, y):
    skf = StratifiedKFold(n_splits = 10, random_state = 111, shuffle = True)
    if isinstance(X, pd.core.frame.DataFrame):
        X = X.values
    if isinstance(y, pd.core.frame.DataFrame):
        y = y.is_duplicate.values
    if isinstance(y, pd.core.frame.Series):
        y = y.values
    print('Running sciPy minimize function to find stacking weights.')
    
    def minimize_logloss(weights):
        final_prediction = 0
        for weight, prediction in zip(weights, predictions):
                final_prediction += weight*prediction
        return log_loss(y_tr, final_prediction)
    
    scores = []
    weights = []
    i = 1
    for tr_index, val_index in skf.split(X, y):
        X_tr, X_val = X[tr_index], X[val_index]
        y_tr, y_val = y[tr_index], y[val_index]
        
        predictions = []
        for i in range(X_tr.shape[1]):
            predictions.append(np.array(X_tr[:, i]))
        starting_values = np.random.uniform(size = X_tr.shape[1])
        cons = ({'type':'eq','fun':lambda w: 1-sum(w)})
        bounds = [(0,1)] * len(predictions)
        
        res = minimize(minimize_logloss, 
           starting_values, 
           method = 'L-BFGS-B', 
           bounds = bounds, 
           constraints = cons,
           options={'maxiter': 10000})
        scores.append(res['fun'])
        weights.append(res['x'])
        print('Score for fold {} :'.format(i), res['fun'])
        i += 1
    print('Mean logloss for model in 10-folds SKF:', np.array(scores).mean(axis = 0), '\n')
    return scores, weights

In [8]:
X_train = load_oof()
X_test = load_oof(mode = 'test')
y_train = pd.read_pickle('y_train.pkl')



 Loading OOF preds: ['train_preds_866cols_xgbparams1.pkl', 'train_preds_866cols_xgbparams2.pkl', 'train_preds_866cols_xgbparams3.pkl', 'train_preds_866cols_xgbparams4.pkl', 'train_preds_lgb_0.1807_20170603_1943.csv', 'train_preds_newNetworks_currentBest.pkl', 'train_preds_xgb_0.1800_20170601_1113.csv', 'train_preds_xgb_0.1808_20170602_0246.csv', 'train_preds_xgb_0.1812_20170603_0506.csv', 'train_preds_xgb_0.1813_20170530_0249.csv'] 
 Numer of files to load: 10 


 Loading OOF preds: ['test_preds_866cols_xgbparams1.csv', 'test_preds_866cols_xgbparams2.csv', 'test_preds_866cols_xgbparams3.csv', 'test_preds_866cols_xgbparams4.csv', 'test_preds_lgb_0.1807_20170603_1943.csv', 'test_preds_newNetworks_currentBest.csv', 'test_preds_xgb_0.1800_20170601_1113.csv', 'test_preds_xgb_0.1808_20170602_0246.csv', 'test_preds_xgb_0.1812_20170603_0506.csv', 'test_preds_xgb_0.1813_20170530_0249.csv'] 
 Numer of files to load: 10 

test_preds_lgb_0.1807_20170603_1943.csv applying transformation
test_preds

In [10]:
X_test

Unnamed: 0,is_duplicate,is_duplicate.1,is_duplicate.2,is_duplicate.3,is_duplicate.4,is_duplicate.5,is_duplicate.6,is_duplicate.7,is_duplicate.8,is_duplicate.9
0,0.000129,1.911653e-04,0.000122,0.000141,1.545842e-04,0.000165,0.000171,1.457426e-04,1.333449e-04,0.000211
1,0.052304,5.754049e-02,0.050996,0.057502,6.940157e-02,0.052037,0.057498,6.634863e-02,6.084073e-02,0.127176
2,0.099921,1.115923e-01,0.088400,0.091953,9.478759e-02,0.094231,0.095461,9.520093e-02,8.808312e-02,0.130371
3,0.000016,1.140786e-05,0.000017,0.000016,1.917794e-05,0.000016,0.000021,2.067181e-05,9.140462e-06,0.000029
4,0.004827,5.110578e-03,0.006035,0.004970,5.039550e-03,0.004776,0.005952,4.978772e-03,5.656539e-03,0.008326
5,0.000095,9.505450e-05,0.000109,0.000089,7.394592e-05,0.000094,0.000102,6.632345e-05,4.150730e-05,0.000157
6,0.997075,9.957646e-01,0.997880,0.997002,9.981463e-01,0.996705,0.996638,9.983015e-01,9.983101e-01,0.998324
7,0.122253,1.110091e-01,0.142846,0.139284,2.167587e-01,0.130072,0.138057,1.973689e-01,1.967920e-01,0.174692
8,0.268248,2.371590e-01,0.305436,0.247487,1.556029e-01,0.248620,0.267425,1.547635e-01,1.581062e-01,0.235332
9,0.000632,7.314834e-04,0.000554,0.000633,5.466894e-04,0.000542,0.000623,4.847422e-04,3.806334e-04,0.001111


In [9]:
X_train

Unnamed: 0,866cols_xgbparams1_prob,866cols_xgbparams2_prob,866cols_xgbparams3_prob,866cols_xgbparams4_prob,is_duplicate,is_duplicate.1,is_duplicate.2,is_duplicate.3,is_duplicate.4,is_duplicate.5
0,0.038358,0.100842,0.042751,0.052017,0.044802,0.053304,0.045111,0.031322,0.033692,0.046143
1,0.039052,0.036801,0.026893,0.047948,0.055348,0.034319,0.046815,0.046572,0.029156,0.029782
2,0.241795,0.283495,0.232339,0.259854,0.194025,0.279996,0.230845,0.234490,0.212141,0.153272
3,0.000048,0.000062,0.000029,0.000060,0.000008,0.000059,0.000048,0.000011,0.000004,0.000049
4,0.000013,0.000021,0.000013,0.000017,0.000007,0.000029,0.000015,0.000004,0.000008,0.000017
5,0.192057,0.344998,0.188420,0.284563,0.349059,0.281890,0.230937,0.356465,0.325100,0.240997
6,0.000005,0.000005,0.000009,0.000006,0.000003,0.000010,0.000009,0.000008,0.000004,0.000007
7,0.823113,0.811390,0.820624,0.825700,0.784744,0.861165,0.831392,0.770500,0.804362,0.845639
8,0.026277,0.019517,0.033145,0.023274,0.063773,0.013592,0.026633,0.046321,0.049923,0.049936
9,0.301068,0.319692,0.278782,0.336262,0.385652,0.376321,0.309016,0.410406,0.315750,0.337729


In [8]:
sc, we = weight_stacking(X_train, y_train)

Running sciPy minimize function to find stacking weights.
Score for fold 9 : 0.179408373599
Score for fold 9 : 0.17847285955
Score for fold 9 : 0.178980887451
Score for fold 9 : 0.179360588937
Score for fold 9 : 0.17864670343
Score for fold 9 : 0.178888789574
Score for fold 9 : 0.178912083278
Score for fold 9 : 0.179339164637
Score for fold 9 : 0.178812460143
Score for fold 9 : 0.1789309382
Mean logloss for model in 10-folds SKF: 0.17897528488 



In [31]:
we2 = np.array(we)
we2 = we2.mean(axis = 0)

In [35]:
X_test2 = X_test.copy()
for i in range(len(we2)):
    X_test2.iloc[:, i] = we2[i] * X_test.iloc[:, i]
X_test2.sum(axis = 1)

0          0.000286
1          0.112052
2          0.160693
3          0.000032
4          0.010372
5          0.000156
6          0.998078
7          0.252615
8          0.321901
9          0.001094
10         0.023824
11         0.000014
12         0.000005
13         0.012212
14         0.034574
15         0.002934
16         0.000009
17         0.756140
18         0.206711
19         0.163314
20         0.000006
21         0.003573
22         0.000772
23         0.016576
24         0.000051
25         0.086410
26         0.000009
27         0.006720
28         0.037284
29         0.000033
             ...   
2345766    0.000003
2345767    0.000024
2345768    0.005238
2345769    0.000003
2345770    0.261233
2345771    0.004034
2345772    0.309077
2345773    0.000005
2345774    0.000095
2345775    0.055289
2345776    0.091607
2345777    0.002230
2345778    0.000040
2345779    0.000746
2345780    0.000449
2345781    0.157923
2345782    0.315332
2345783    0.000006
2345784    0.351294


In [38]:
predict_test_lgbm(X_test2.sum(axis = 1), '10bestGBM_weighted', False)

Predicting on test set with LightGBM.
