In [None]:
import nltk
import difflib
import time
import gc
import itertools
import multiprocessing
import pandas as pd
import numpy as np
import xgboost as xgb
import lightgbm as lgb

import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split

from models_utils_fe import *
from models_utils_gbm import *

In [None]:
# Directories holding the precomputed feature files.
# NOTE(review): these are absolute paths on one specific machine; consider moving
# them to a single configurable base directory.
src = '/media/w/1c392724-ecf3-4615-8f3c-79368ec36380/DS Projects/Kaggle/Quora/scripts/features/'
feats_src = '/media/w/1c392724-ecf3-4615-8f3c-79368ec36380/DS Projects/Kaggle/Quora/data/features/uncleaned/'
trans_src = '/media/w/1c392724-ecf3-4615-8f3c-79368ec36380/DS Projects/Kaggle/Quora/data/features/lemmatized_fullclean/transformations/'


def _read_float32_csv(path):
    """Read the CSV at `path` and cast every column to float32."""
    return pd.read_csv(path).astype('float32')


def _load_question_pair(path, tag):
    """Load the .npy file at `path` once and return its [0] and [1] entries as frames.

    Columns are named '<i>_<tag>_q1' for entry 0 and '<i>_<tag>_q2' for entry 1.
    The file is read from disk a single time instead of once per entry.
    """
    stacked = np.load(path)
    frames = []
    for position, side in enumerate(('q1', 'q2')):
        frame = pd.DataFrame(stacked[position])
        frame.columns = ['{}_{}_{}'.format(i, tag, side) for i in range(frame.shape[1])]
        frames.append(frame)
    return frames


# Positive infinity in the WMD features is replaced by the finite value 1000.
wmd = _read_float32_csv(src + 'train_WMD_cleaned_stemmed.csv').replace(np.inf, 1000)
skip_thought = _read_float32_csv(src + 'train_skipthoughts_Alex_distances.csv')
compression = _read_float32_csv(src + 'train_LZMAcompression_distance.csv')
edit = _read_float32_csv(src + 'train_EDITdistance.csv')
moments = _read_float32_csv(src + 'train_doc2vec_moments.csv')
networks_NER = _read_float32_csv(src + 'train_networkfeats_NER.csv')

# This file is only read for the target column; it is deleted again below.
xgb_feats = pd.read_csv(feats_src + '/the_1owl/owl_train.csv')
y_train = xgb_feats[['is_duplicate']]

lsaq1, lsaq2 = _load_question_pair(trans_src + 'train_lsa50_CV1gram.npy', 'lsaCV1')
svdq1, svdq2 = _load_question_pair(trans_src + 'train_svd50_CV1gram.npy', 'svdCV1')

# Append every extra feature block column-wise to the pickled base matrix.
X_train = pd.read_pickle('Xtrain_500bestCols.pkl')
X_train = pd.concat([X_train, wmd, skip_thought, compression, edit, moments, networks_NER,
                     lsaq1, lsaq2, svdq1, svdq2], axis = 1)

# Release the intermediate frames; only X_train and y_train are kept.
del xgb_feats, wmd, skip_thought, compression, edit, moments, networks_NER, lsaq1, lsaq2, svdq1, svdq2
gc.collect()

In [None]:
# Columns that the engineered features below are derived from.
best_cols = [
    'min_pagerank_sp_network_weighted',
    'norm_wmd',
    'word_match',
    '1wl_tfidf_l2_euclidean',
    'm_vstack_svd_q1_q1_euclidean',
    '1wl_tfidf_cosine',
    'sk_bi_skew_q2vec',
    'm_q1_q2_tf_svd0',
    'sk_bi_skew_q1vec',
    'skew_q2vec',
    'trigram_tfidf_cosine',
    'sk_uni_skew_q2vec',
    'sk_bi_canberra_distance',
    'question1_3',
    'sk_uni_skew_q1vec',
    'sk_uni_kur_q2vec',
    'min_eigenvector_centrality_np_network_weighted',
    'avg_world_len2',
    'z_word_match',
    'sk_uni_kur_q1vec',
    'skew_doc2vec_pretrained_lemmat']

# Set to True to min-max scale the engineered features before appending them.
rescale = False

# bin_numerical, group_featbyfeat, feature_combinations and drop_duplicate_cols are
# not defined in this notebook (presumably they come from the star imports).
X_bin = bin_numerical(X_train, best_cols, 0.1)
X_grouped = group_featbyfeat(X_train, best_cols, 'mean')
X_grouped2 = group_featbyfeat(X_train, best_cols, 'sum')
# Only the first five columns are passed to feature_combinations.
X_combinations = feature_combinations(X_train, best_cols[:5])

X_additional = pd.concat([X_bin, X_grouped, X_grouped2, X_combinations], axis = 1)
X_additional = drop_duplicate_cols(X_additional)
# Sentinels: +inf becomes 999 first, then NaN becomes -999.
X_additional = X_additional.replace(np.inf, 999).replace(np.nan, -999)
if rescale:
    # Imported here so the branch works even if the star imports do not expose it.
    from sklearn.preprocessing import MinMaxScaler
    # Keep the original index and column labels so the axis=1 concat below
    # aligns row-for-row with X_train.
    X_additional = pd.DataFrame(MinMaxScaler().fit_transform(X_additional),
                                columns = X_additional.columns,
                                index = X_additional.index)

X_train = pd.concat([X_train, X_additional], axis = 1)
print('Final training data shape:', X_train.shape)

del X_bin, X_grouped, X_grouped2, X_combinations, X_additional
gc.collect()

In [None]:
# Flag choosing which training routine to launch. It is deliberately NOT named
# `xgb`: that name is the alias of the imported xgboost module, and rebinding it
# to a bool would break the later `xgb.Booster(...)` / `xgb.DMatrix(...)` calls.
use_xgb = True

# run_xgb / run_lgb are not defined in this notebook (presumably they come from
# the star imports) and are called without arguments.
if use_xgb:
    run_xgb()
else:
    run_lgb()

In [None]:
# Display the 'feature' column of the importance table.
# NOTE(review): `df_importance` is only assigned in cells further down this
# notebook, so running the notebook top-to-bottom on a fresh kernel raises
# NameError here.
df_importance.feature

In [None]:
# Load a saved booster and chart its 20 highest split-count features.
gbm = xgb.Booster(model_file = 'saved_models/XGB/XGB_500cols_furtherExperiments.txt')
dtrain = xgb.DMatrix(X_train, label = y_train)

# Translate positional keys ('f0', 'f1', ...) to the DMatrix feature names.
# Keys that are not positional (e.g. the booster already reports real feature
# names) are passed through unchanged instead of raising KeyError.
mapper = {'f{0}'.format(i): v for i, v in enumerate(dtrain.feature_names)}
importance = {mapper.get(k, k): v for k, v in gbm.get_fscore().items()}
importance = sorted(importance.items(), key=lambda x: x[1], reverse=True)[:20]

df_importance = pd.DataFrame(importance, columns=['feature', 'fscore'])
# Normalised over the retained top 20 only, so the plotted bars sum to 1.
df_importance['fscore'] = df_importance['fscore'] / df_importance['fscore'].sum()

# Single bar chart on explicit axes (no stray empty figure or line plot).
ax = df_importance.plot(kind='barh', x='feature', y='fscore', legend=False, figsize=(10, 18))
ax.set_title('XGBoost Feature Importance')
ax.set_xlabel('relative importance');

In [None]:
# Load a saved booster and chart its 20 highest split-count features.
# This cell assigns gbm, dtrain and df_importance, so after it runs those names
# refer to this model file.
gbm = xgb.Booster(model_file = 'saved_models/XGB/XGB_500cols_experiments.txt')
dtrain = xgb.DMatrix(X_train, label = y_train)

# Translate positional keys ('f0', 'f1', ...) to the DMatrix feature names.
# Keys that are not positional (e.g. the booster already reports real feature
# names) are passed through unchanged instead of raising KeyError.
mapper = {'f{0}'.format(i): v for i, v in enumerate(dtrain.feature_names)}
importance = {mapper.get(k, k): v for k, v in gbm.get_fscore().items()}
importance = sorted(importance.items(), key=lambda x: x[1], reverse=True)[:20]

df_importance = pd.DataFrame(importance, columns=['feature', 'fscore'])
# Normalised over the retained top 20 only, so the plotted bars sum to 1.
df_importance['fscore'] = df_importance['fscore'] / df_importance['fscore'].sum()

# Single bar chart on explicit axes (no stray empty figure or line plot).
ax = df_importance.plot(kind='barh', x='feature', y='fscore', legend=False, figsize=(10, 18))
ax.set_title('XGBoost Feature Importance')
ax.set_xlabel('relative importance');

In [None]:
# Take the feature names from the importance table, subset X_train to those
# columns, and pickle the list of names.
# NOTE(review): the cells that build `df_importance` in this notebook truncate it
# to 20 rows, while the output file name says 500 features — confirm which is
# intended.
retain_cols = df_importance['feature']
# Column subset of the training matrix, in the order given by retain_cols.
X_train2 = X_train.loc[:, retain_cols]
# Persist the selected column names (a pandas Series) to the working directory.
retain_cols.to_pickle('Colnames_best500features.pkl')