In [1]:
import nltk
import difflib
import time
import gc
import itertools
import multiprocessing
import pandas as pd
import numpy as np
import xgboost as xgb
import lightgbm as lgb

import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split, StratifiedKFold

from models_utils_fe import *
from models_utils_skf import *

In [None]:
# Machine-specific absolute locations of the project folders.
# NOTE(review): these only resolve on the original author's disk; point them
# at your own checkout before running.
src = '/media/w/1c392724-ecf3-4615-8f3c-79368ec36380/DS Projects/Kaggle/Quora/scripts/features/'
feats_src = '/media/w/1c392724-ecf3-4615-8f3c-79368ec36380/DS Projects/Kaggle/Quora/data/features/uncleaned/'
oof_src = '/media/w/1c392724-ecf3-4615-8f3c-79368ec36380/DS Projects/Kaggle/Quora/scripts/models/OOF_preds/train/'

# Feature matrix, with the content of the MLP pickle appended column-wise
# (presumably the MLP's out-of-fold train predictions -- confirm).
X_train = pd.read_pickle('Xtrain_866BestColsDropped.pkl')
mlp = pd.read_pickle(oof_src + 'train_preds_MLP_1sttry.pkl')
X_train = pd.concat([X_train, mlp], axis = 1)
X_train = X_train.astype('float32')
print(X_train.shape)

# Only the label column is needed from the feature CSV, so parse just that
# column instead of loading the whole file and deleting it afterwards.
# (`feats_src` already ends with '/', hence no leading slash here.)
y_train = pd.read_csv(feats_src + 'the_1owl/owl_train.csv', usecols = ['is_duplicate'])

# pd.concat(axis=1) aligns on the index: mismatching indices would silently
# add NaN-padded rows, so fail fast if features and labels no longer line up.
assert len(X_train) == len(y_train), \
    'X_train has {} rows but y_train has {}'.format(len(X_train), len(y_train))

gc.collect()

In [None]:
def xgb_foldrun_ooftr(X, y, params, name, save = True):
    """Train XGBoost with 10-fold stratified CV and return out-of-fold predictions.

    For every fold a booster is trained for up to 10000 rounds with early
    stopping (200 rounds) on the held-out part, the held-out rows are
    predicted, and the fold model is saved under
    ``saved_models/XGB/SKF/<name>/``.

    Parameters
    ----------
    X : pandas.DataFrame or numpy.ndarray
        Feature matrix; a DataFrame is converted to its ``.values``.
    y : pandas.DataFrame, pandas.Series or numpy.ndarray
        Binary labels. A DataFrame must contain an ``is_duplicate`` column.
    params : dict
        Parameters passed straight to ``xgb.train``.
    name : str
        Run identifier used in the output paths and in the result column name.
    save : bool, default True
        If true, pickle the result to ``OOF_preds/train/train_preds_<name>.pkl``.

    Returns
    -------
    pandas.DataFrame
        One column, ``<name>_prob``, holding the out-of-fold prediction for
        every row of ``X``.
    """
    # Imported locally: the notebook's import cell never imports `os`
    # explicitly, so the function should not depend on it leaking in from
    # one of the star imports.
    import os

    skf = StratifiedKFold(n_splits = 10, random_state = 111, shuffle = True)
    if isinstance(X, pd.DataFrame):
        X = X.values
    if isinstance(y, pd.DataFrame):
        y = y.is_duplicate.values
    elif isinstance(y, pd.Series):
        y = y.values
    print('Running XGB model with parameters:', params)

    # Create every output directory up front so that a missing folder is
    # reported before training starts, not after all folds have finished.
    os.makedirs('saved_models/XGB/SKF/{}'.format(name), exist_ok = True)
    if save:
        os.makedirs('OOF_preds/train', exist_ok = True)

    losses = []
    oof_train = np.zeros((X.shape[0]))
    for i, (tr_index, val_index) in enumerate(skf.split(X, y)):
        X_tr, X_val = X[tr_index], X[val_index]
        y_tr, y_val = y[tr_index], y[val_index]
        t = time.time()

        dtrain = xgb.DMatrix(X_tr, label = y_tr)
        dval = xgb.DMatrix(X_val, label = y_val)
        watchlist = [(dtrain, 'train'), (dval, 'valid')]
        print('Start training on fold: {}'.format(i))
        gbm = xgb.train(params, dtrain, 10000, watchlist,
                        early_stopping_rounds = 200, verbose_eval = 100)
        print('Start predicting...')
        # Reuse the validation DMatrix instead of building a second copy.
        # NOTE(review): newer xgboost releases replace `ntree_limit` /
        # `best_ntree_limit` with `iteration_range` / `best_iteration`;
        # confirm against the installed version.
        val_pred = gbm.predict(dval, ntree_limit=gbm.best_ntree_limit)
        oof_train[val_index] = val_pred
        score = log_loss(y_val, val_pred)
        losses.append(score)
        print('Final score for fold {} :'.format(i), score, '\n',
              'Time it took to train and predict on fold:', time.time() - t, '\n')
        gbm.save_model('saved_models/XGB/SKF/{}/XGB_10SKF_loss{:.5f}_fold{}.txt'.format(name, score, i))
    print('Mean logloss for model in 10-folds SKF:', np.array(losses).mean(axis = 0))
    oof_train = pd.DataFrame(oof_train)
    oof_train.columns = ['{}_prob'.format(name)]
    if save:
        oof_train.to_pickle('OOF_preds/train/train_preds_{}.pkl'.format(name))
    return oof_train

In [None]:
def _xgb_param_set(colsample_bytree, subsample, max_depth, min_child_weight):
    """Return an XGBoost parameter dict for one tuned configuration.

    Only the four arguments differ between the configurations defined in
    this cell; every other setting is fixed here.
    """
    return {
        'seed': 1337,
        'colsample_bytree': colsample_bytree,
        'silent': 1,
        'subsample': subsample,
        'eta': 0.02,
        'objective': 'binary:logistic',
        'eval_metric': 'logloss',
        'max_depth': max_depth,
        'min_child_weight': min_child_weight,
        'nthread': 4,
        'tree_method': 'hist',
    }


# Deeper trees with a smaller minimum child weight.
xgb_params1 = _xgb_param_set(
    colsample_bytree = 0.46,
    subsample = 0.89,
    max_depth = 8,
    min_child_weight = 21,
)

# Shallower trees with a larger minimum child weight.
xgb_params2 = _xgb_param_set(
    colsample_bytree = 0.43,
    subsample = 0.88,
    max_depth = 5,
    min_child_weight = 30,
)

* 866cols_xgbparams1: mean log loss over the 10 stratified folds (SKF) = 0.179874240765

In [None]:
# Run the 10-fold stratified CV with xgb_params1. With the default save=True
# the out-of-fold predictions are also pickled under OOF_preds/train/.
oof_train1 = xgb_foldrun_ooftr(X_train, y_train, xgb_params1, '866cols_xgbparams1')

In [None]:
# Same CV run with the second parameter set (xgb_params2); the result is a
# one-column DataFrame named '866cols_xgbparams2_prob'.
oof_train2 = xgb_foldrun_ooftr(X_train, y_train, xgb_params2, '866cols_xgbparams2')

In [None]:
# Settings common to both configurations in this cell. The four `None`
# placeholders are overridden per configuration below; keeping them in the
# template preserves the key order of the resulting dicts.
_xgb_template = {
    'seed': 1337,
    'colsample_bytree': None,
    'silent': 1,
    'subsample': None,
    'eta': 0.02,
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'max_depth': None,
    'min_child_weight': None,
    'nthread': 4,
    'tree_method': 'hist',
}

# Deepest trees of the configurations in this cell.
xgb_params3 = {
    **_xgb_template,
    'colsample_bytree': 0.38,
    'subsample': 0.87,
    'max_depth': 10,
    'min_child_weight': 16,
}

# Mid-depth trees with a larger minimum child weight.
xgb_params4 = {
    **_xgb_template,
    'colsample_bytree': 0.46,
    'subsample': 0.88,
    'max_depth': 7,
    'min_child_weight': 23,
}


# Two more 10-fold CV runs, one per parameter set defined in this cell. Each
# call trains all folds before the next one starts and, with the default
# save=True, pickles its out-of-fold predictions under OOF_preds/train/.
oof_train3 = xgb_foldrun_ooftr(X_train, y_train, xgb_params3, '866cols_xgbparams3')
oof_train4 = xgb_foldrun_ooftr(X_train, y_train, xgb_params4, '866cols_xgbparams4')

In [None]:
# Load one previously saved fold model.
# NOTE(review): this file comes from a different run ('FredFeatsGRU'), so its
# feature set may not match the current X_train columns -- confirm.
gbm = xgb.Booster(model_file = 'saved_models/XGB/XGB_10SKF_FredFeatsGRU_loss0.17917_fold1.txt')

# Translate positional feature keys ('f0', 'f1', ...) into X_train column
# names. The names are taken from the DataFrame directly; building a DMatrix
# of the full training set only to read its feature names is wasteful.
mapper = {'f{0}'.format(i): str(col) for i, col in enumerate(X_train.columns)}
# Keep the raw key for any feature the mapper does not know, instead of
# aborting with a KeyError.
importance = {mapper.get(k, k): v for k, v in gbm.get_fscore().items()}
importance = sorted(importance.items(), key=lambda x:x[1], reverse=True)[:20]

df_importance = pd.DataFrame(importance, columns=['feature', 'fscore'])
# Normalised over the 20 retained features only, so the bars sum to 1.
df_importance['fscore'] = df_importance['fscore'] / df_importance['fscore'].sum()

# Draw on a single explicit axes; the previous plt.figure() + bare
# df_importance.plot() produced an empty figure and a stray line plot.
fig, ax = plt.subplots(figsize=(10, 18))
df_importance.plot(kind='barh', x='feature', y='fscore', legend=False, ax=ax)
ax.set_title('XGBoost Feature Importance')
ax.set_xlabel('relative importance')
plt.show()