In [None]:
import pandas as pd
import numpy as np
import nltk
import multiprocessing
import difflib
import time
import gc
import xgboost as xgb
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set_style("dark")
import category_encoders as ce
import warnings
warnings.filterwarnings('ignore')

from collections import Counter
from sklearn.metrics import log_loss
from scipy.optimize import minimize
from textstat.textstat import textstat
from sklearn.cross_validation import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from scipy.spatial.distance import cosine, correlation, canberra, chebyshev, minkowski, jaccard, euclidean

from models_utils_xgb import *

In [None]:
def get_train():
    """Assemble the training feature matrix and labels from pre-computed files.

    Reads the NER question encodings (``.npy``) and several feature CSVs from
    hard-coded locations, concatenates them column-wise, removes duplicated
    columns via ``drop_duplicate_cols`` and casts everything to ``float32``.

    Returns
    -------
    X : pandas.DataFrame
        ``question1_*`` / ``question2_*`` encoding columns followed by the
        hand-crafted feature columns, all ``float32``.
    y_train : pandas.Series
        The ``is_duplicate`` column of ``owl_train.csv``.

    Notes
    -----
    Earlier revisions also loaded ``train_q1_transformed.npy`` /
    ``train_q2_transformed.npy`` and ``SRK_grams_features_train.csv``; the
    arrays were immediately overwritten by the NER encodings and the SRK frame
    was never used, so those reads have been removed.
    """
    feats_src = '/media/w/1c392724-ecf3-4615-8f3c-79368ec36380/DS Projects/Kaggle/Quora/data/features/uncleaned/'
    feats_src2 = '/media/w/1c392724-ecf3-4615-8f3c-79368ec36380/DS Projects/Kaggle/Quora/data/features/NER/'

    # Per-question NER encodings; these are the only array features used.
    keras_q1 = np.load(feats_src2 + 'q1train_NER_128len.npy')
    keras_q2 = np.load(feats_src2 + 'q2train_NER_128len.npy')

    xgb_feats = pd.read_csv(feats_src + '/the_1owl/owl_train.csv')
    abhishek_feats = pd.read_csv(feats_src + 'abhishek/train_features.csv',
                                 encoding = 'ISO-8859-1').iloc[:, 2:]
    text_feats = pd.read_csv(feats_src + 'other_features/text_features_train.csv',
                             encoding = 'ISO-8859-1')
    img_feats = pd.read_csv(feats_src + 'other_features/img_features_train.csv')

    mephisto_feats = pd.read_csv('../../data/features/lemmatized_fullclean/train_mephistopeheles_features.csv').iloc[:, 6:]
    turkewitz_feats = pd.read_csv('../../data/features/lemmatized_fullclean/train_turkewitz_features_fullcleanSTEMMED.csv')
    # .copy() so the derived columns below are written to an independent frame
    # rather than to a slice of the frame that was read from disk.
    turkewitz_feats = turkewitz_feats[['q1_freq', 'q2_freq']].copy()
    turkewitz_feats['freq_sum'] = turkewitz_feats.q1_freq + turkewitz_feats.q2_freq
    turkewitz_feats['freq_diff'] = turkewitz_feats.q1_freq - turkewitz_feats.q2_freq
    turkewitz_feats['freq_mult'] = turkewitz_feats.q1_freq * turkewitz_feats.q2_freq
    # NOTE(review): yields inf/NaN wherever q2_freq is 0 - confirm that cannot occur.
    turkewitz_feats['freq_div'] = turkewitz_feats.q1_freq / turkewitz_feats.q2_freq

    # Order matters: the four z_* columns are removed first, then the label is
    # taken, then the first 8 remaining columns are discarded by position.
    xgb_feats = xgb_feats.drop(['z_len1', 'z_len2', 'z_word_len1', 'z_word_len2'], axis = 1)
    y_train = xgb_feats['is_duplicate']
    xgb_feats = xgb_feats.iloc[:, 8:]

    df = pd.concat([xgb_feats, abhishek_feats, text_feats, img_feats,
                    turkewitz_feats, mephisto_feats], axis = 1)
    # Release the per-source frames before building the final matrix.
    del xgb_feats, abhishek_feats, text_feats, img_feats, turkewitz_feats, mephisto_feats
    gc.collect()

    df = drop_duplicate_cols(df)
    keras_q1 = pd.DataFrame(keras_q1)
    keras_q2 = pd.DataFrame(keras_q2)
    keras_q1.columns = ['question1_{}'.format(i) for i in range(keras_q1.shape[1])]
    keras_q2.columns = ['question2_{}'.format(i) for i in range(keras_q2.shape[1])]
    X = pd.concat([keras_q1, keras_q2, df], axis = 1)

    # Columns at positions 300 and 301 are renamed by position.
    # NOTE(review): presumably to disambiguate clashing column names; the
    # indices depend on the encoding width and on which duplicate columns were
    # dropped above - confirm they still point at the intended columns.
    colnames_list = X.columns.tolist()
    colnames_list[300] = 'len_char_q1_other'
    colnames_list[301] = 'len_char_q2_other'
    X.columns = colnames_list
    X = X.astype('float32')
    print('Training data shape:', X.shape)
    return X, y_train

def labelcount_encode(df2, cols, reverse=False):
    """Encode categorical columns by the frequency rank of each category.

    Parameters
    ----------
    df2 : pandas.DataFrame
        Input frame; it is not modified.
    cols : iterable
        Names of the columns to encode.
    reverse : bool, default False
        With the default, the most frequent category is encoded as 0, the next
        one as 1, and so on. With ``reverse=True`` the numbering is flipped so
        the most frequent category receives the largest code.

    Returns
    -------
    pandas.DataFrame
        One column per entry in ``cols`` holding the rank codes. Values that
        ``value_counts()`` does not report (e.g. NaN) map to NaN.
    """
    new_df = pd.DataFrame()
    for cat_feature in cols:
        # value_counts() orders categories from most to least frequent, so the
        # position in its index is the frequency rank.
        ranked_categories = df2[cat_feature].value_counts().index.tolist()
        n_categories = len(ranked_categories)
        if reverse:
            codes = range(n_categories - 1, -1, -1)
        else:
            codes = range(n_categories)
        labelcount_dict = dict(zip(ranked_categories, codes))
        new_df[cat_feature] = df2[cat_feature].map(labelcount_dict)
    return new_df

def count_encode(df2, cols):
    """Replace each value in the given columns by its number of occurrences.

    Parameters
    ----------
    df2 : pandas.DataFrame
        Input frame; it is not modified.
    cols : iterable
        Names of the columns to encode.

    Returns
    -------
    pandas.DataFrame
        One column per entry in ``cols`` where every value has been replaced by
        how many times it occurs in that column. Values not reported by
        ``value_counts()`` (e.g. NaN) become NaN.
    """
    new_df = pd.DataFrame()
    for col in cols:
        counts = df2[col].value_counts()
        # A direct lookup via map() avoids the per-key scanning that
        # Series.replace performs with a large mapping, and cannot confuse a
        # freshly written count with an original value that is still to be
        # replaced.
        new_df[col] = df2[col].astype('object').map(counts)
    return new_df

def bin_numerical(df2, cols, step):
    """Discretise numerical columns into fixed-width bins.

    For each column the bin edges are ``np.arange(0, max(column), step)`` and
    every value is replaced by the index returned by
    ``np.digitize(..., right=True)``: values ``<= 0`` fall into bin 0 and
    values above the last edge fall into bin ``len(edges)``.

    Parameters
    ----------
    df2 : pandas.DataFrame
        Input frame; it is not modified.
    cols : iterable
        Names of the numerical columns to bin.
    step : number
        Width of each bin.

    Returns
    -------
    pandas.DataFrame
        Integer bin indices, one column per entry in ``cols``, carrying the
        same index as ``df2`` so it can be aligned or concatenated with the
        input frame.
    """
    # Start from the input's index: assigning raw ndarrays into an index-less
    # frame would silently reset the result to a RangeIndex.
    new_df = pd.DataFrame(index=df2.index)
    for col in cols:
        values = df2[col]
        feature_range = np.arange(0, np.max(values), step)
        new_df[col] = np.digitize(values, feature_range, right=True)
    return new_df

def drop_duplicate_cols(df, n_rows=10000):
    """Drop columns whose values duplicate those of an earlier column.

    Duplicates are detected on the first ``n_rows`` rows only, to keep the
    transpose cheap. Columns that agree on that sample but differ further down
    are therefore also treated as duplicates.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is modified in place.
    n_rows : int, default 10000
        Number of leading rows used for the duplicate check.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the duplicated columns removed.
    """
    sample = df.iloc[:n_rows, :]
    # Transposing turns columns into rows so drop_duplicates() can compare
    # them; the first occurrence of each distinct column is kept.
    kept_cols = sample.T.drop_duplicates().T.columns
    duplicate_cols = sorted(set(df.columns).difference(kept_cols))
    print('Dropping duplicate columns:', duplicate_cols)
    df.drop(duplicate_cols, axis = 1, inplace = True)
    print('Final shape:', df.shape)
    del sample
    gc.collect()
    return df

In [None]:
# Feature matrix restricted to the previously selected columns.
X_train = pd.read_pickle('Xtrain_500bestCols.pkl')

# The labels are taken from the original feature CSV.
feats_src = '/media/w/1c392724-ecf3-4615-8f3c-79368ec36380/DS Projects/Kaggle/Quora/data/features/uncleaned/'
xgb_feats = pd.read_csv(feats_src + '/the_1owl/owl_train.csv')
y_train = xgb_feats['is_duplicate']


* Final score (validation log loss): 0.212184284436 for Xtrain_500bestCols.pkl

In [None]:
# Booster hyper-parameters passed to xgb.train.
params = {
    'seed': 1337,
    'colsample_bytree': 0.48,
    'silent': 1,
    'subsample': 0.74,
    'eta': 0.05,
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'max_depth': 12,
    'min_child_weight': 20,
    'nthread': 8,
    'tree_method': 'hist',
    }

t = time.time()
# Stratified 80/20 hold-out split; the 20% part drives early stopping and scoring.
X_tr, X_val, y_tr, y_val = train_test_split(X_train, y_train, stratify = y_train,
                                            test_size = 0.2, random_state = 111)
dtrain = xgb.DMatrix(X_tr, label = y_tr)
dval = xgb.DMatrix(X_val, label = y_val)
watchlist = [(dtrain, 'train'), (dval, 'valid')]

print('Start training...')
# The round cap is effectively unlimited; training ends once the validation
# metric has not improved for 100 rounds.
gbm = xgb.train(params, dtrain, 100000, watchlist, 
                early_stopping_rounds = 100, verbose_eval = 100)

print('Start predicting...')
# Reuse the validation DMatrix built above instead of constructing a second
# one from X_val; labels stored in a DMatrix do not affect predict().
val_pred = gbm.predict(dval, ntree_limit=gbm.best_ntree_limit)
# Note: this score is computed on the same split that selected the number of
# trees, so it is not an independent estimate.
score = log_loss(y_val, val_pred)
print('Final score:', score, '\n', 'Time it took to train and predict:', time.time() - t)
gbm.save_model('saved_models/XGB/XGB_new_NetworkFeats_experiments.txt')

In [None]:
# Reload the trained booster from disk so this cell does not require
# re-running the training cell.
gbm = xgb.Booster(model_file = 'saved_models/XGB/XGB_new_NetworkFeats_experiments.txt')

# A booster may report features either by their real names or by the
# positional placeholders 'f0', 'f1', ...; translate placeholders back to
# column names and pass real names through unchanged.
# The names come from X_train (the frame the training matrices were split
# from), so the DMatrix objects of the training cell are not needed here.
feature_names = [str(col) for col in X_train.columns]
mapper = {'f{0}'.format(i): name for i, name in enumerate(feature_names)}
importance = {mapper.get(k, k): v for k, v in gbm.get_fscore().items()}
# Keep the 500 features with the highest split counts.
importance = sorted(importance.items(), key=lambda x:x[1], reverse=True)[:500]

df_importance = pd.DataFrame(importance, columns=['feature', 'fscore'])
# Normalise so the retained scores sum to 1.
df_importance['fscore'] = df_importance['fscore'] / df_importance['fscore'].sum()

# Line plot of the relative importance in rank order.
df_importance.plot()
# Horizontal bar chart, labelled on its own axes.
ax = df_importance.plot(kind='barh', x='feature', y='fscore', legend=False, figsize=(10, 18))
ax.set_title('XGBoost Feature Importance')
ax.set_xlabel('relative importance')



In [None]:
# Names of the top-ranked features, in importance order.
retain_cols = df_importance['feature']

# Persist the selection so it can be reused without recomputing importances.
retain_cols.to_pickle('Colnames_best500features.pkl')

# Training matrix restricted to the selected columns.
X_train2 = X_train.loc[:, retain_cols]