In [1]:
import nltk
import difflib
import time
import gc
import itertools
import multiprocessing
import pandas as pd
import numpy as np
import xgboost as xgb
import lightgbm as lgb

import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split, StratifiedKFold

from models_utils_fe import *
from models_utils_skf import *

In [2]:
src = '/media/w/1c392724-ecf3-4615-8f3c-79368ec36380/DS Projects/Kaggle/Quora/scripts/features/'
feats_src = '/media/w/1c392724-ecf3-4615-8f3c-79368ec36380/DS Projects/Kaggle/Quora/data/features/uncleaned/'

X_train = pd.read_pickle('Xtrain_866BestColsDropped.pkl')
X_train = X_train.astype('float32')
print(X_train.shape)

xgb_feats = pd.read_csv(feats_src + '/the_1owl/owl_train.csv')
y_train = xgb_feats[['is_duplicate']]

del xgb_feats
gc.collect()

(404290, 866)


21

In [3]:
def xgb_foldrun_ooftr(X, y, params, name, start_fold, save = True):
    skf = StratifiedKFold(n_splits = 10, random_state = 111, shuffle = True)
    if isinstance(X, pd.core.frame.DataFrame):
        X = X.values
    if isinstance(y, pd.core.frame.DataFrame):
        y = y.is_duplicate.values
    if isinstance(y, pd.core.frame.Series):
        y = y.values
    print('Running XGB model with parameters:', params)
    
    j = 0
    losses = []
    train_splits = []
    val_splits = []
    for tr_index, val_index in skf.split(X, y):
        train_splits.append(tr_index)
        val_splits.append(val_index)
        
    oof_train = np.zeros((X.shape[0]))
    os.makedirs('saved_models/XGB/SKF/{}'.format(name), exist_ok = True)
    for i in range(start_fold, 10):
        X_tr, X_val = X[train_splits[i]], X[val_splits[i]]
        y_tr, y_val = y[train_splits[i]], y[val_splits[i]]
        t = time.time()
        
        dtrain = xgb.DMatrix(X_tr, label = y_tr)
        dval = xgb.DMatrix(X_val, label = y_val)
        watchlist = [(dtrain, 'train'), (dval, 'valid')]
        print('Start training on fold: {}'.format(i))
        gbm = xgb.train(params, dtrain, 10000, watchlist, 
                        early_stopping_rounds = 200, verbose_eval = 100)
        print('Start predicting...')
        val_pred = gbm.predict(xgb.DMatrix(X_val), ntree_limit=gbm.best_ntree_limit)
        oof_train[val_index] = val_pred
        score = log_loss(y_val, val_pred)
        losses.append(score)
        print('Final score for fold {} :'.format(i), score, '\n',
              'Time it took to train and predict on fold:', time.time() - t, '\n')
        gbm.save_model('saved_models/XGB/SKF/{}/XGB_10SKF_loss{:.5f}_fold{}.txt'.format(name, score, i))
        j += 1
    print('Mean logloss for model in 10-folds SKF:', np.array(losses).mean(axis = 0))
    oof_train = pd.DataFrame(oof_train)
    oof_train.columns = ['{}_prob'.format(name)]
    if save:
        oof_train.to_pickle('OOF_preds/train/train_preds_{}.pkl'.format(name))
    return oof_train

In [9]:
def xgb_foldrun_ooftr(X, y, params, name, src, save = True):
    skf = StratifiedKFold(n_splits = 10, random_state = 111, shuffle = True)
    if isinstance(X, pd.core.frame.DataFrame):
        X = X.values
    if isinstance(y, pd.core.frame.DataFrame):
        y = y.is_duplicate.values
    if isinstance(y, pd.core.frame.Series):
        y = y.values
    print('Running XGB model with parameters:', params)
    
    j = 0
    losses = []
    train_splits = []
    val_splits = []
    for tr_index, val_index in skf.split(X, y):
        train_splits.append(tr_index)
        val_splits.append(val_index)
        
    oof_train = np.zeros((X.shape[0]))
    models = sorted([x for x in os.listdir(src) if 'txt' in x])
    for i in range(0, 10):
        X_tr, X_val = X[train_splits[i]], X[val_splits[i]]
        y_tr, y_val = y[train_splits[i]], y[val_splits[i]]
        t = time.time()
        
        dtrain = xgb.DMatrix(X_tr, label = y_tr)
        dval = xgb.DMatrix(X_val, label = y_val)
        watchlist = [(dtrain, 'train'), (dval, 'valid')]
        print('Start training on fold: {}'.format(i))
        gbm = xgb.Booster(model_file = src + models[i])
        
        print('Start predicting...')
        val_pred = gbm.predict(xgb.DMatrix(X_val))
        oof_train[val_splits[i]] = val_pred
        score = log_loss(y_val, val_pred)
        losses.append(score)
        print('Final score for fold {} :'.format(i), score, '\n',
              'Time it took to train and predict on fold:', time.time() - t, '\n')
    print('Mean logloss for model in 10-folds SKF:', np.array(losses).mean(axis = 0))
    oof_train = pd.DataFrame(oof_train)
    oof_train.columns = ['{}_prob'.format(name)]
    if save:
        oof_train.to_pickle('OOF_preds/train/train_preds_{}.pkl'.format(name))
    return oof_train

def predict_test_xgb_fold(src, X_test):
    print('Predicting on test set with XGBoost.')
    fold_preds = np.zeros((10, 2345796))
    models = sorted([x for x in os.listdir(src) if 'txt' in x])
    #X_test = xgb.DMatrix(X_test)
    for i in tqdm(range(len(models))):
        gbm = xgb.Booster(model_file = src + models[i])
        test_preds = gbm.predict(X_test)
        test_preds = np.apply_along_axis(transform, 0, test_preds)
        fold_preds[i, :] = test_preds
    fold_preds = fold_preds.mean(axis = 0)
    sub_src = '/media/w/1c392724-ecf3-4615-8f3c-79368ec36380/DS Projects/Kaggle/Quora/submissions/'
    sample_sub = pd.read_csv(sub_src + 'sample_submission.csv')
    sample_sub['is_duplicate'] = fold_preds
    #sample_sub.is_duplicate = sample_sub.is_duplicate.apply(transform)
    sample_sub.to_csv(sub_src + '{}.csv'.format(src.split('/')[-2]), index = False)
    return

In [12]:
xgb_params2 = {
    'seed': 1337,
    'colsample_bytree': 0.43,
    'silent': 1,
    'subsample': 0.88,
    'eta': 0.02,
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'max_depth': 5,
    'min_child_weight': 30,
    'nthread': 4,
    'tree_method': 'hist',
    }


src1 = '/media/w/1c392724-ecf3-4615-8f3c-79368ec36380/DS Projects/Kaggle/Quora/scripts/models/saved_models/XGB/SKF/866cols_xgbparams2_copy/'
oof_train2 = xgb_foldrun_ooftr(X_train, y_train, xgb_params2, '866cols_xgbparams2', src1, save = True)

Running XGB model with parameters: {'nthread': 4, 'seed': 1337, 'max_depth': 5, 'eval_metric': 'logloss', 'objective': 'binary:logistic', 'colsample_bytree': 0.43, 'subsample': 0.88, 'silent': 1, 'eta': 0.02, 'tree_method': 'hist', 'min_child_weight': 30}
Start training on fold: 0
Start predicting...
Final score for fold 0 : 0.176475939391 
 Time it took to train and predict on fold: 5.173024654388428 

Start training on fold: 1
Start predicting...
Final score for fold 1 : 0.184859639179 
 Time it took to train and predict on fold: 8.121746063232422 

Start training on fold: 2
Start predicting...
Final score for fold 2 : 0.180010402118 
 Time it took to train and predict on fold: 6.373630046844482 

Start training on fold: 3
Start predicting...
Final score for fold 3 : 0.177431577293 
 Time it took to train and predict on fold: 7.068759202957153 

Start training on fold: 4
Start predicting...
Final score for fold 4 : 0.182519865556 
 Time it took to train and predict on fold: 5.9042797

In [None]:
xgb_params1 = {
    'seed': 1337,
    'colsample_bytree': 0.46,
    'silent': 1,
    'subsample': 0.89,
    'eta': 0.02,
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'max_depth': 8,
    'min_child_weight': 21,
    'nthread': 4,
    'tree_method': 'hist',
    }

xgb_params2 = {
    'seed': 1337,
    'colsample_bytree': 0.43,
    'silent': 1,
    'subsample': 0.88,
    'eta': 0.02,
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'max_depth': 5,
    'min_child_weight': 30,
    'nthread': 4,
    'tree_method': 'hist',
    }

xgb_params3 = {
    'seed': 1337,
    'colsample_bytree': 0.38,
    'silent': 1,
    'subsample': 0.87,
    'eta': 0.02,
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'max_depth': 10,
    'min_child_weight': 16,
    'nthread': 4,
    'tree_method': 'hist',
    }

xgb_params4 = {
    'seed': 1337,
    'colsample_bytree': 0.46,
    'silent': 1,
    'subsample': 0.88,
    'eta': 0.02,
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'max_depth': 7,
    'min_child_weight': 23,
    'nthread': 4,
    'tree_method': 'hist',
    }


#oof_train3 = xgb_foldrun_ooftr(X_train, y_train, xgb_params3, '866cols_xgbparams3')
#oof_train4 = xgb_foldrun_ooftr(X_train, y_train, xgb_params4, '866cols_xgbparams4')
oof_train2 = xgb_foldrun_ooftr(X_train, y_train, xgb_params2, '866cols_xgbparams2', 6)

* 866cols_xgbparams1: Mean logloss for model in 10-folds SKF: 0.179874240765
* 866cols_xgbparams3: Mean logloss for model in 10-folds SKF: 0.180440278734
* 866cols_xgbparams4: Mean logloss for model in 10-folds SKF: 0.179872296735
* 866cols_xgbparams2: Mean logloss for model in 10-folds SKF: 0.180740034549