In [1]:
import nltk
import difflib
import time
import gc
import itertools
import multiprocessing
import pandas as pd
import numpy as np
import xgboost as xgb
import lightgbm as lgb

import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split, StratifiedKFold

from models_utils_fe import *
from models_utils_skf import *

In [2]:
src = '/media/w/1c392724-ecf3-4615-8f3c-79368ec36380/DS Projects/Kaggle/Quora/scripts/features/'
feats_src = '/media/w/1c392724-ecf3-4615-8f3c-79368ec36380/DS Projects/Kaggle/Quora/data/features/uncleaned/'
att_src = '/media/w/1c392724-ecf3-4615-8f3c-79368ec36380/DS Projects/Kaggle/Quora/scripts/models/v_spacy-models/decomposable_attention/checks/'

X_train = pd.read_pickle('Xtrain_825colsCurrentBest.pkl')
xgb_feats = pd.read_csv(feats_src + '/the_1owl/owl_train.csv')
y_train = xgb_feats[['is_duplicate']]

#del xgb_feats
#gc.collect()

best_ner = pd.read_pickle('Xtrain_NER_15bestFeats.pkl')
best_lemmat = pd.read_pickle('Xtrain_lemmatClean_15bestFeats.pkl')

X_train = pd.concat([X_train, best_ner, best_lemmat], axis = 1)
X_train = X_train.astype('float32')

In [3]:
lgb_params = {
    'task' : 'train',
    'boosting_type' : 'gbdt',
    'objective' : 'binary',
    'metric' : {'binary_logloss'},
    'learning_rate' : 0.03,
    'feature_fraction' : 0.51,
    'bagging_fraction': 0.9,
    'bagging_freq': 100,
    'num_leaves' : 255,
    'max_depth': 8,
    'min_data_in_leaf': 23,
    'subsample': 0.8,
    'colsample_bytree': 0.41,
    'silent': 1,
    'random_state': 1337,
    'verbose': 1,
    'nthread': 9,
}

xgb_params = {
    'seed': 1337,
    'colsample_bytree': 0.42,
    'silent': 1,
    'subsample': 0.85,
    'eta': 0.02,
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'max_depth': 8,
    'min_child_weight': 20,
    'nthread': 4,
    'tree_method': 'hist',
    }

* X_train = pd.concat([X_train, best_ner, best_lemmat], axis = 1)
```python
[100]	valid_0's binary_logloss: 0.212996
[200]	valid_0's binary_logloss: 0.190133
[300]	valid_0's binary_logloss: 0.186119
[400]	valid_0's binary_logloss: 0.18423
[500]	valid_0's binary_logloss: 0.183256
[600]	valid_0's binary_logloss: 0.18258
[700]	valid_0's binary_logloss: 0.182047
[800]	valid_0's binary_logloss: 0.181516
[900]	valid_0's binary_logloss: 0.181176
[1000]	valid_0's binary_logloss: 0.181028
[1100]	valid_0's binary_logloss: 0.180709
[1200]	valid_0's binary_logloss: 0.180682
[1300]	valid_0's binary_logloss: 0.180531
[1400]	valid_0's binary_logloss: 0.180594
[1500]	valid_0's binary_logloss: 0.180671
[1600]	valid_0's binary_logloss: 0.180772
Early stopping, best iteration is:
[1430]	valid_0's binary_logloss: 0.180485
Start predicting...
Final score for fold 1 : 0.180486617074 
```

In [4]:
lgb_foldrun(X_train, y_train, lgb_params, 'BestGRU_experiments')

Running LGBM model with parameters: {'random_state': 1337, 'min_data_in_leaf': 23, 'metric': {'binary_logloss'}, 'task': 'train', 'verbose': 1, 'colsample_bytree': 0.41, 'learning_rate': 0.03, 'bagging_freq': 100, 'silent': 1, 'subsample': 0.8, 'nthread': 9, 'boosting_type': 'gbdt', 'bagging_fraction': 0.9, 'max_depth': 8, 'num_leaves': 255, 'objective': 'binary', 'feature_fraction': 0.51}
Start training on fold: 1
Train until valid scores didn't improve in 200 rounds.
[100]	valid_0's binary_logloss: 0.212996
[200]	valid_0's binary_logloss: 0.190133
[300]	valid_0's binary_logloss: 0.186119
[400]	valid_0's binary_logloss: 0.18423
[500]	valid_0's binary_logloss: 0.183256
[600]	valid_0's binary_logloss: 0.18258
[700]	valid_0's binary_logloss: 0.182047
[800]	valid_0's binary_logloss: 0.181516
[900]	valid_0's binary_logloss: 0.181176
[1000]	valid_0's binary_logloss: 0.181028
[1100]	valid_0's binary_logloss: 0.180709
[1200]	valid_0's binary_logloss: 0.180682
[1300]	valid_0's binary_logloss: 

KeyboardInterrupt: 

In [None]:
gbm = xgb.Booster(model_file = 'saved_models/XGB/XGB_10SKF_FredFeatsGRU_loss0.17917_fold1.txt')
dtrain = xgb.DMatrix(X_train, label = y_train)

mapper = {'f{0}'.format(i): v for i, v in enumerate(dtrain.feature_names)}
importance = {mapper[k]: v for k, v in gbm.get_fscore().items()}
importance = sorted(importance.items(), key=lambda x:x[1], reverse=True)[:20]

df_importance = pd.DataFrame(importance, columns=['feature', 'fscore'])
df_importance['fscore'] = df_importance['fscore'] / df_importance['fscore'].sum()

plt.figure()
df_importance.plot()
df_importance.plot(kind='barh', x='feature', y='fscore', legend=False, figsize=(10, 18))
plt.title('XGBoost Feature Importance')
plt.xlabel('relative importance')