In [1]:
import os
import time
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.metrics import log_loss

from models_utils_skf import *

In [2]:
def load_oof(mode = 'train'):
    src = '/media/w/1c392724-ecf3-4615-8f3c-79368ec36380/DS Projects/Kaggle/Quora/scripts/models/OOF_preds/'
    oof_preds = pd.DataFrame()
    files = sorted([x for x in os.listdir(src + '{}/'.format(mode)) if '.pkl' in x or '.csv' in x
                   and 'stack' not in x])
    print('\n', 'Loading OOF preds:', files, '\n', 'Numer of files to load:', len(files), '\n')
    for i in files:
        if 'pkl'in i:
            df_preds = pd.read_pickle('{}/{}/{}'.format(src, mode, i))
            if 'id' in df_preds.columns:
                df_preds.drop(['id'], axis = 1, inplace = True)
            if 'test_id' in df_preds.columns:
                df_preds.drop(['test_id'], axis = 1, inplace = True)
        if '.csv'in i:
            df_preds = pd.read_csv('{}/{}/{}'.format(src, mode, i))
            if 'id' in df_preds.columns:
                df_preds.drop(['id'], axis = 1, inplace = True)
            if 'test_id' in df_preds.columns:
                df_preds.drop(['test_id'], axis = 1, inplace = True)
        oof_preds = pd.concat([oof_preds, df_preds], axis = 1)
    return oof_preds

def transform(x):
    a = 0.165 / 0.369191399096
    b =  (1 - 0.165) / (1 - 0.369191399096)
    xt = a * x / (a * x + b * (1 - x))
    return xt

def inv_pred_transform(preds):
    a = 0.165 / 0.369191399096
    b = (1 - 0.165) / (1 - 0.369191399096)
    return b * preds / (b * preds + a * (1 - preds))

def testOOF_transform(X_test2, inverse = True):
    X_test = X_test2.copy()
    for i in range(X_test.shape[1]):
        if inverse:
            X_test.iloc[:, i] = X_test.iloc[:, i].apply(inv_pred_transform)
        else:
            X_test.iloc[:, i] = X_test.iloc[:, i].apply(transform)
    return X_test

def predict_test_lgbm(test_preds, model_name, transform_preds = True):
    print('Predicting on test set with LightGBM.')
    sub_src = '/media/w/1c392724-ecf3-4615-8f3c-79368ec36380/DS Projects/Kaggle/Quora/submissions/'
    sample_sub = pd.read_csv(sub_src + 'sample_submission.csv')
    sample_sub['is_duplicate'] = test_preds
    if transform_preds:
        sample_sub.is_duplicate = sample_sub.is_duplicate.apply(transform)
        sample_sub.to_csv(sub_src + '{}_transformed.csv'.format(model_name), index = False)
    else:
        sample_sub.to_csv(sub_src + '{}.csv'.format(model_name), index = False)
    return

In [3]:
def oversample_anokas(X, y):
    pos_train = X[y == 1]
    neg_train = X[y == 0]
    p = 0.165
    scale = ((len(pos_train) / (len(pos_train) + len(neg_train))) / p) - 1
    while scale > 1:
        neg_train = np.vstack([neg_train, neg_train])
        scale -=1
    neg_train = np.vstack([neg_train, neg_train[:int(scale * len(neg_train))]])
    print("Mean target rate : ", len(pos_train) / (len(pos_train) + len(neg_train)))
    X = np.vstack([pos_train, neg_train])
    y = (np.zeros(len(pos_train)) + 1).tolist() + np.zeros(len(neg_train)).tolist()
    del pos_train, neg_train
    return X, y

def oversample_2nd(X, y):
    X_dup = X[y == 1]
    X_non_dup = X[y == 0]
    X = np.vstack([X_non_dup, X_dup, X_non_dup])
    y = np.array([0] * X_non_dup.shape[0] + [1] * X_dup.shape[0] + [0] * X_non_dup.shape[0])
    del X_dup
    del X_non_dup
    print("Mean target rate : ", y.mean())
    return X, y


def lgb_foldrun_test_oversample(X, y, X_test, params, name, save = True):
    skf = StratifiedKFold(n_splits = 10, random_state = 111, shuffle = True)
    if isinstance(X, pd.core.frame.DataFrame):
        X = X.values
    if isinstance(y, pd.core.frame.DataFrame):
        y = y.is_duplicate.values
    if isinstance(y, pd.core.frame.Series):
        y = y.values
    print('Running LGBM model with parameters:', params)
        
    i = 0
    losses = []
    losses2 = []
    oof_train = np.zeros((X.shape[0]))
    oof_test = np.zeros((10, 2345796))
    os.makedirs('saved_models/LGBM/SKF/{}'.format(name), exist_ok = True)
    for tr_index, val_index in skf.split(X, y):
        X_tr, X_val = X[tr_index], X[val_index]
        y_tr, y_val = y[tr_index], y[val_index]
        t = time.time()
        
        X_tr, y_tr = oversample_anokas(X_tr, y_tr)
        X_val, y_val = oversample_anokas(X_val, y_val)
        
        lgb_train = lgb.Dataset(X_tr, y_tr)
        lgb_val = lgb.Dataset(X_val, y_val)
        print('\n', 'Start training on fold: {}'.format(i))
        gbm = lgb.train(params, lgb_train, num_boost_round = 100000, valid_sets = lgb_val,
                        early_stopping_rounds = 200, verbose_eval = 100)
        print('Start predicting...')
        val_pred = gbm.predict(X_val, num_iteration=gbm.best_iteration)
        val_pred_transform = np.apply_along_axis(transform, 0, val_pred)
        oof_train[val_index] = val_pred
        score = log_loss(y_val, val_pred)
        score2 = log_loss(y_val, val_pred_transform)
        losses.append(score)
        losses2.append(score2)
        if X_test is not None:
            test_preds = gbm.predict(X_test, num_iteration=gbm.best_iteration)
            oof_test[i, :] = test_preds
        print('Final score for fold {} :'.format(i), score, 'Transformed score:', score2, '\n',
              'Time it took to train and predict on fold:', time.time() - t, '\n')
        gbm.save_model('saved_models/LGBM/SKF/{}/LGBM_10SKF_loss{:.5f}_fold{}.txt'.format(name, score, i))
        i += 1
    print('Mean logloss for model in 10-folds SKF:', np.array(losses).mean(axis = 0), '\n',
         'Mean logloss for transformed predictions:', np.array(losses2).mean(axis = 0), '\n')
    oof_train = pd.DataFrame(oof_train)
    oof_train.columns = ['{}_prob'.format(name)]
    oof_test = oof_test.mean(axis = 0)
    oof_test = pd.DataFrame(oof_test)
    oof_test.columns = ['{}_prob'.format(name)]
    if save:
        oof_train.to_pickle('OOF_preds/train/train_preds_{}.pkl'.format(name))
        oof_test.to_pickle('OOF_preds/test/test_preds_{}.pkl'.format(name))
    return oof_train, oof_test

In [4]:
lgb_params = {
    'task' : 'train',
    'boosting_type' : 'gbdt',
    'objective' : 'binary',
    'metric' : {'binary_logloss'},
    'learning_rate' : 0.03,
    'feature_fraction' : 0.51,
    'bagging_fraction': 0.9,
    'bagging_freq': 100,
    'num_leaves' : 255,
    'max_depth': 4,
    'min_data_in_leaf': 23,
    'subsample': 0.8,
    'colsample_bytree': 0.41,
    'silent': 1,
    'random_state': 1337,
    'verbose': 1,
    'nthread': 4,
}

xgb_params = {
    'seed': 1337,
    'colsample_bytree': 0.42,
    'silent': 1,
    'subsample': 0.85,
    'eta': 0.02,
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'max_depth': 4,
    'min_child_weight': 20,
    'nthread': 4,
    }

X_train = load_oof()
X_test = load_oof(mode = 'test')
y_train = pd.read_pickle('y_train.pkl')


 Loading OOF preds: ['train_preds_866cols_xgbparams1.pkl', 'train_preds_866cols_xgbparams2.pkl', 'train_preds_866cols_xgbparams3.pkl', 'train_preds_866cols_xgbparams4.pkl', 'train_preds_AttentionClean_preds.pkl', 'train_preds_AttentionNER_preds.pkl', 'train_preds_lgb_0.1807_20170603_1943.csv', 'train_preds_lgb_0.1842_20170527_1246.csv', 'train_preds_newNetworks_currentBest.pkl', 'train_preds_xgb_0.1800_20170601_1113.csv', 'train_preds_xgb_0.1808_20170602_0246.csv', 'train_preds_xgb_0.1812_20170603_0506.csv', 'train_preds_xgb_0.1813_20170530_0249.csv'] 
 Numer of files to load: 13 


 Loading OOF preds: ['test_preds_866cols_xgbparams1.csv', 'test_preds_866cols_xgbparams2.csv', 'test_preds_866cols_xgbparams3.csv', 'test_preds_866cols_xgbparams4.csv', 'test_preds_AttentionClean_preds_transformed.pkl', 'test_preds_AttentionNER_preds_transformed.pkl', 'test_preds_lgb_0.1807_20170603_1943.csv', 'test_preds_lgb_0.1842_20170527_1246.csv', 'test_preds_newNetworks_currentBest.csv', 'test_preds_

* stacking_TrainValidOversample_testInverse: 
    * Mean logloss for model in 10-folds SKF: 0.135621201826 
    * Mean logloss for transformed predictions: 0.15744757582 
    
    
    
* stacking_ValidOversample:
    * Mean logloss for model in 10-folds SKF: 0.153901737106 
    * Mean logloss for transformed predictions: 0.13636932305 

In [5]:
savename = 'stacking_TrainValidOversample_testInverse'

X_test = testOOF_transform(X_test, inverse = True)
oof_train, oof_test = lgb_foldrun_test_oversample(X_train, y_train, X_test, lgb_params, savename, False)
predict_test_lgbm(oof_test, savename, transform_preds = False)

Running LGBM model with parameters: {'max_depth': 4, 'bagging_fraction': 0.9, 'silent': 1, 'metric': {'binary_logloss'}, 'verbose': 1, 'random_state': 1337, 'min_data_in_leaf': 23, 'learning_rate': 0.03, 'objective': 'binary', 'subsample': 0.8, 'nthread': 4, 'num_leaves': 255, 'colsample_bytree': 0.41, 'feature_fraction': 0.51, 'boosting_type': 'gbdt', 'bagging_freq': 100, 'task': 'train'}
Mean target rate :  0.19124359014512396
Mean target rate :  0.19124429867267975

 Start training on fold: 0
Train until valid scores didn't improve in 200 rounds.
[100]	valid_0's binary_logloss: 0.149598
[200]	valid_0's binary_logloss: 0.134213
[300]	valid_0's binary_logloss: 0.133602
[400]	valid_0's binary_logloss: 0.133491
[500]	valid_0's binary_logloss: 0.133417
[600]	valid_0's binary_logloss: 0.133345
[700]	valid_0's binary_logloss: 0.133313
[800]	valid_0's binary_logloss: 0.133312
[900]	valid_0's binary_logloss: 0.133293
[1000]	valid_0's binary_logloss: 0.133278
[1100]	valid_0's binary_logloss: 

In [None]:
src = '/media/w/1c392724-ecf3-4615-8f3c-79368ec36380/DS Projects/Kaggle/Quora/submissions/'
s1 = pd.read_csv(src + 'stacking_TrainValidOversample_testInverse.csv')
s2 = pd.read_csv(src + 'stacking_ValidOversample.csv')
s3 = pd.read_csv(src + 'test_preds_mod_xgbstack1_0.1754_20170604_1555.csv')