In [None]:
import os
import time
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.metrics import log_loss
from scipy.optimize import minimize

from models_utils_skf import *

In [None]:
def load_oof(mode = 'train', src = None):
    """Load all out-of-fold prediction files for `mode` into one wide DataFrame.

    Every file in `<src>/<mode>/` whose name contains '.pkl' or '.csv' and
    does not contain 'stack' is read, its 'id' / 'test_id' columns (if any)
    are dropped, and the remaining columns are concatenated side by side in
    sorted file-name order.

    Parameters
    ----------
    mode : str
        Name of the sub-directory of `src` to read (the notebook uses
        'train' and 'test').
    src : str, optional
        Root directory holding the per-mode sub-directories. Defaults to the
        original hard-coded OOF_preds location.

    Returns
    -------
    pandas.DataFrame
        One block of columns per loaded file; an empty DataFrame when no file
        matches.
    """
    if src is None:
        src = '/media/w/1c392724-ecf3-4615-8f3c-79368ec36380/DS Projects/Kaggle/Quora/scripts/models/OOF_preds/'
    mode_dir = os.path.join(src, mode)

    # Parenthesised on purpose: `and` binds tighter than `or`, so without the
    # parentheses the 'stack' exclusion would only apply to '.csv' files.
    files = sorted(
        x for x in os.listdir(mode_dir)
        if ('.pkl' in x or '.csv' in x) and 'stack' not in x
    )
    print('\n', 'Loading OOF preds:', files, '\n', 'Numer of files to load:', len(files), '\n')

    frames = []
    for file_name in files:
        path = os.path.join(mode_dir, file_name)
        if '.csv' in file_name:
            df_preds = pd.read_csv(path)
        else:
            # NOTE: unpickling executes arbitrary code; only point `src` at
            # files produced by trusted pipelines.
            df_preds = pd.read_pickle(path)
        id_cols = [c for c in ('id', 'test_id') if c in df_preds.columns]
        frames.append(df_preds.drop(id_cols, axis = 1))

    if not frames:
        return pd.DataFrame()
    # Single concat instead of growing the frame file by file.
    return pd.concat(frames, axis = 1)

def transform(x):
    """Rescale a probability-like value `x` with two fixed class-ratio factors.

    Computes a*x / (a*x + b*(1 - x)) with a = 0.165 / 0.369191399096 and
    b = (1 - 0.165) / (1 - 0.369191399096). Works on anything supporting
    arithmetic (floats, numpy arrays, pandas Series).

    NOTE(review): 0.369... and 0.165 look like the positive-class rates of
    the training data and of the target distribution — confirm.
    """
    pos_factor = 0.165 / 0.369191399096
    neg_factor = (1 - 0.165) / (1 - 0.369191399096)
    pos_part = pos_factor * x
    neg_part = neg_factor * (1 - x)
    return pos_part / (pos_part + neg_part)

def inv_pred_transform(preds):
    """Rescale `preds` with the two fixed class-ratio factors swapped.

    Computes b*p / (b*p + a*(1 - p)) with a = 0.165 / 0.369191399096 and
    b = (1 - 0.165) / (1 - 0.369191399096), i.e. the same formula as a
    rescaling by a and b but with their roles exchanged. Works on anything
    supporting arithmetic (floats, numpy arrays, pandas Series).
    """
    pos_factor = 0.165 / 0.369191399096
    neg_factor = (1 - 0.165) / (1 - 0.369191399096)
    scaled_preds = neg_factor * preds
    scaled_rest = pos_factor * (1 - preds)
    return scaled_preds / (scaled_preds + scaled_rest)

def testOOF_transform(X_test2, inverse = True):
    """Return a rescaled copy of every column of `X_test2`.

    Parameters
    ----------
    X_test2 : pandas.DataFrame
        Frame of predictions; it is not modified.
    inverse : bool
        When True each column goes through `inv_pred_transform`, otherwise
        through `transform`.

    Returns
    -------
    pandas.DataFrame
        New frame with the same index and columns as `X_test2`.
    """
    rescale = inv_pred_transform if inverse else transform
    # Both rescalers are plain arithmetic, so each column is handed over as a
    # whole Series (vectorised) instead of calling the function per element.
    return X_test2.apply(rescale)

def predict_test_lgbm(test_preds, model_name, transform_preds = True):
    """Write `test_preds` to a submission CSV next to the sample submission.

    The sample submission is read from the hard-coded submissions directory,
    its 'is_duplicate' column is replaced by `test_preds`, and the result is
    saved as '<model_name>_transformed.csv' (after passing each value through
    `transform`) or as '<model_name>.csv' when `transform_preds` is False.
    Returns None.
    """
    print('Predicting on test set with LightGBM.')
    sub_src = '/media/w/1c392724-ecf3-4615-8f3c-79368ec36380/DS Projects/Kaggle/Quora/submissions/'
    submission = pd.read_csv(sub_src + 'sample_submission.csv')
    submission['is_duplicate'] = test_preds

    if not transform_preds:
        # Raw predictions are written unchanged.
        submission.to_csv(sub_src + '{}.csv'.format(model_name), index = False)
        return

    submission['is_duplicate'] = submission['is_duplicate'].apply(transform)
    submission.to_csv(sub_src + '{}_transformed.csv'.format(model_name), index = False)

In [None]:
def weight_stacking(X, y):
    """Fit blending weights for the columns of `X` on each of 10 stratified folds.

    For every fold, weights in [0, 1] that sum to 1 are searched so that the
    weighted sum of the columns of the fold's training part minimises log
    loss against the matching labels.

    Parameters
    ----------
    X : pandas.DataFrame or numpy.ndarray
        2-D array-like with one column per model's predictions.
    y : pandas.DataFrame, pandas.Series or numpy.ndarray
        Binary labels; a DataFrame must carry them in an 'is_duplicate' column.

    Returns
    -------
    scores : list of float
        Final objective value (log loss on the training part) for each fold.
    weights : list of numpy.ndarray
        Optimised weight vector for each fold.

    Notes
    -----
    The reported scores are measured on the same rows the weights were fitted
    on; the held-out part of each fold is not evaluated.
    """
    seed = 111
    # NOTE(review): StratifiedKFold is not among the explicit imports visible
    # in this notebook; presumably it comes from `from models_utils_skf import *`.
    skf = StratifiedKFold(n_splits = 10, random_state = seed, shuffle = True)
    if isinstance(X, pd.DataFrame):
        X = X.values
    if isinstance(y, pd.DataFrame):
        y = y.is_duplicate.values
    if isinstance(y, pd.Series):
        y = y.values
    print('Running sciPy minimize function to find stacking weights.')

    def blend_logloss(candidate_weights, fold_preds, fold_labels):
        # Weighted sum of the model columns, scored against the fold labels.
        # Data arrives through `args` rather than through closure variables.
        return log_loss(fold_labels, fold_preds.dot(candidate_weights))

    n_models = X.shape[1]
    cons = {'type': 'eq', 'fun': lambda w: 1 - np.sum(w)}
    bounds = [(0, 1)] * n_models
    # Seeded generator so the random starting points are reproducible.
    rng = np.random.RandomState(seed)

    scores = []
    weights = []
    for fold, (tr_index, _) in enumerate(skf.split(X, y), start = 1):
        X_tr, y_tr = X[tr_index], y[tr_index]
        starting_values = rng.uniform(size = n_models)

        # SLSQP honours both the bounds and the sum-to-one equality
        # constraint; L-BFGS-B accepts bounds only and ignores `constraints`.
        res = minimize(blend_logloss,
                       starting_values,
                       args = (X_tr, y_tr),
                       method = 'SLSQP',
                       bounds = bounds,
                       constraints = cons,
                       options = {'maxiter': 10000})
        scores.append(res['fun'])
        weights.append(res['x'])
        print('Score for fold {} :'.format(fold), res['fun'])
    print('Mean logloss for model in 10-folds SKF:', np.array(scores).mean(axis = 0), '\n')
    return scores, weights

In [None]:
# Load the out-of-fold predictions for both splits plus the training labels.
# Only the first four prediction columns of the training frame are kept.
# NOTE(review): X_test is left with all of its columns — confirm that is intended.
X_train = load_oof(mode = 'train').iloc[:, :4]
X_test = load_oof(mode = 'test')
y_train = pd.read_pickle('y_train.pkl')

In [None]:
# Run the per-fold weight search: `sc` receives the returned fold scores and
# `we` the returned per-fold weight vectors.
sc, we = weight_stacking(X_train, y_train)