In [None]:
import pandas as pd
import numpy as np
import nltk
import multiprocessing
import difflib
import time
import gc
import os
import xgboost as xgb
import warnings
warnings.filterwarnings('ignore')

from collections import Counter
from sklearn.metrics import log_loss
from scipy.optimize import minimize
from sklearn.cross_validation import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from scipy.spatial.distance import cosine, correlation, canberra, chebyshev, minkowski, jaccard, euclidean

from xgb_utils import *

In [None]:
def get_train():
    """Assemble the training feature matrix and labels from pre-computed files.

    Loads the two NER sequence arrays and several feature CSVs, derives four
    frequency-interaction columns from the turkewitz file, drops every column
    that `T.drop_duplicates()` removes on the first 1000 rows, and stacks all
    parts column-wise.

    Returns:
        tuple: ``(X, y_train)``. ``X`` is a float32 ``numpy.ndarray`` whose
        columns are ``keras_q1``, ``keras_q2`` and the tabular features, in
        that order. ``y_train`` is the ``is_duplicate`` column of the
        turkewitz feature file.
    """
    feats_src = '/media/w/1c392724-ecf3-4615-8f3c-79368ec36380/DS Projects/Kaggle/Quora/scripts/features/NER_features/'
    keras_q1 = np.load(feats_src + 'q1train_NER_128len.npy')
    keras_q2 = np.load(feats_src + 'q2train_NER_128len.npy')
    # The positional iloc slices below skip leading columns of each file;
    # presumably ids / raw text -- TODO confirm against the CSV headers.
    abhishek_feats = pd.read_csv(feats_src + 'train_abhishek_features.csv',
                              encoding = 'ISO-8859-1').iloc[:, 2:]
    whq_feats = pd.read_csv(feats_src + 'train_whq_with_jaccard_feats.csv')
    eda_feats = pd.read_csv(feats_src + 'train_eda_features.csv').iloc[:, -1]
    srk_feats = pd.read_csv(feats_src + 'train_SRKgrams_features.csv')
    # NOTE(review): this file is read from a different (relative) directory
    # than the other feature files -- confirm that is intended.
    mephisto_feats = pd.read_csv('../../data/features/spacylemmat_fullclean/train_mephistopeheles_features.csv').iloc[:, 6:]
    the1owl_feats = pd.read_csv(feats_src + 'train_the1owl_features.csv').iloc[:, 9:]
    img_feats = pd.read_csv(feats_src + 'img_features_train.csv')

    turkewitz_feats = pd.read_csv(feats_src + 'train_turkewitz_features.csv')
    y_train = turkewitz_feats['is_duplicate']

    # Take an explicit copy so the derived columns are written to an
    # independent frame rather than to a slice of the frame read from disk.
    turkewitz_feats = turkewitz_feats[['q1_freq', 'q2_freq']].copy()
    turkewitz_feats['freq_sum'] = turkewitz_feats.q1_freq + turkewitz_feats.q2_freq
    turkewitz_feats['freq_diff'] = turkewitz_feats.q1_freq - turkewitz_feats.q2_freq
    turkewitz_feats['freq_mult'] = turkewitz_feats.q1_freq * turkewitz_feats.q2_freq
    # A zero q2_freq would yield inf/NaN here; presumably frequencies are
    # always >= 1 -- TODO confirm.
    turkewitz_feats['freq_div'] = turkewitz_feats.q1_freq / turkewitz_feats.q2_freq

    df = pd.concat([abhishek_feats, whq_feats, eda_feats, srk_feats, mephisto_feats, the1owl_feats,
                    img_feats, turkewitz_feats], axis = 1)
    # Detect duplicated columns on a 1000-row sample only, to keep the
    # transpose cheap; columns that merely agree on the sample are dropped too.
    dfc = df.iloc[0:1000,:]
    dfc = dfc.T.drop_duplicates().T
    duplicate_cols = sorted(list(set(df.columns).difference(set(dfc.columns))))
    print('Dropping duplicate columns:', duplicate_cols)
    df.drop(duplicate_cols, axis = 1, inplace = True)
    print('Final shape:', df.shape)

    X = np.concatenate([keras_q1, keras_q2, df.values], axis = 1)
    X = X.astype('float32')
    print('Training data shape:', X.shape)
    return X, y_train


def predict_test(X_test, model_name):
    """Score a test matrix with a saved booster and write a submission CSV.

    Args:
        X_test: feature matrix passed to ``xgb.DMatrix``.
        model_name: base name (without extension) of the model file under
            ``saved_models/XGB/``; also used as the submission file name.

    Returns:
        None. The submission is written to ``<sub_src>/<model_name>.csv``.
    """
    print('Predicting on test set.')
    model_path = 'saved_models/XGB/{}.txt'.format(model_name)
    booster = xgb.Booster(model_file = model_path)
    dtest = xgb.DMatrix(X_test)
    test_preds = booster.predict(dtest)

    sub_src = '/media/w/1c392724-ecf3-4615-8f3c-79368ec36380/DS Projects/Kaggle/Quora/submissions/'
    submission = pd.read_csv(sub_src + 'sample_submission.csv')
    submission['is_duplicate'] = test_preds
    # `transform` is not defined in this file; presumably it is provided by
    # the star import from xgb_utils -- TODO confirm.
    submission['is_duplicate'] = submission['is_duplicate'].apply(transform)
    out_path = sub_src + '{}.csv'.format(model_name)
    submission.to_csv(out_path, index = False)
    return None

def labelcount_encode(df2, cols):
    """Encode each column by the frequency rank of its values.

    For every column in ``cols`` the distinct values are ordered as returned
    by ``value_counts()`` (most frequent first) and replaced by their position
    in that ordering, so the most frequent value becomes 0, the next 1, etc.

    Args:
        df2: source DataFrame; it is only read, never modified.
        cols: iterable of column names to encode.

    Returns:
        pandas.DataFrame with one encoded column per entry of ``cols``,
        sharing the index of ``df2``. Values absent from ``value_counts()``
        (missing values) map to NaN.
    """
    new_df = pd.DataFrame()
    for cat_feature in cols:
        column = df2[cat_feature]
        ordered_values = column.value_counts().index
        # Position in the value_counts ordering is the label.
        labelcount_dict = {value: rank for rank, value in enumerate(ordered_values)}
        new_df[cat_feature] = column.map(labelcount_dict)
    return new_df

def count_encode(df2, cols):
    """Replace each value by the number of times it occurs in its column.

    Args:
        df2: source DataFrame; it is only read, never modified.
        cols: iterable of column names to encode.

    Returns:
        pandas.DataFrame with one column per entry of ``cols`` holding the
        occurrence count of each original value, sharing the index of
        ``df2``. Missing values are not counted by ``value_counts()`` and
        therefore come out as NaN.
    """
    new_df = pd.DataFrame()
    for col in cols:
        column = df2[col]
        counts = column.value_counts()
        # A direct lookup via map is linear in the column length and cannot
        # confuse an original value with a count that happens to be equal to
        # another original value (which value->value replace has to guard
        # against).
        new_df[col] = column.map(counts)
    return new_df

def bin_numerical(df2, cols, step):
    """Bin numerical columns into fixed-width intervals and return bin codes.

    Each column is cut into intervals of width ``step`` starting at 0 and the
    occupied bins are numbered densely in ascending order (0, 1, 2, ...).

    The previous implementation factorized the raw column and thereby threw
    the binning away; it also left the column maximum and the value 0 outside
    every bin. Both are fixed here: the binned values are what gets
    factorized, and the edges cover the closed range [0, max].

    Args:
        df2: source DataFrame; it is only read, never modified.
        cols: iterable of numerical column names to bin.
        step: bin width (expected to be positive).

    Returns:
        pandas.DataFrame with one integer-coded column per entry of ``cols``,
        sharing the index of ``df2``. Values that fall outside every bin
        (negative or missing values) get the code -1.
    """
    new_df = pd.DataFrame()
    for col in cols:
        values = df2[col]
        # Extend the edges one step past the maximum so the largest value
        # falls inside the last (right-closed) interval.
        edges = np.arange(0, np.max(values) + step, step)
        if len(edges) < 2:
            # Degenerate range (max <= 0): fall back to a single [0, step] bin.
            edges = np.array([0, step])
        bin_index = pd.cut(values, edges, right=True, include_lowest=True, labels=False)
        # Dense, order-preserving codes; NaN (out-of-range) becomes -1.
        codes = pd.factorize(bin_index, sort = True)[0]
        new_df[col] = pd.Series(codes, index=values.index)
    return new_df

In [None]:
def train_xgb(cv = False):
    """Train an XGBoost model on the data returned by ``get_train()``.

    Args:
        cv: when true, run 5-fold stratified ``xgb.cv`` and return its
            history; otherwise fit on a stratified 80/20 split with early
            stopping on the validation part.

    Returns:
        The ``xgb.cv`` result when ``cv`` is true, else the trained booster.
    """
    started = time.time()
    booster_params = dict(
        seed = 1337,
        colsample_bytree = 0.48,
        silent = 1,
        subsample = 0.74,
        eta = 0.05,
        objective = 'binary:logistic',
        eval_metric = 'logloss',
        max_depth = 12,
        min_child_weight = 20,
        nthread = 8,
        tree_method = 'hist',
        # 'updater': 'grow_gpu' was left disabled in the original settings.
    )

    features, labels = get_train()

    if cv:
        full_matrix = xgb.DMatrix(features, labels)
        cv_history = xgb.cv(booster_params, full_matrix, num_boost_round = 100000,
                            nfold = 5, stratified = True,
                            early_stopping_rounds = 350, verbose_eval = 250,
                            seed = 1337)
        del features, labels
        gc.collect()
        print('Time it took to train in CV manner:', time.time() - started)
        return cv_history

    # Hold-out training path.
    split = train_test_split(features, labels, stratify = labels,
                             test_size = 0.2, random_state = 111)
    feats_fit, feats_holdout, labels_fit, labels_holdout = split
    del split, features, labels
    gc.collect()
    fit_matrix = xgb.DMatrix(feats_fit, label = labels_fit)
    holdout_matrix = xgb.DMatrix(feats_holdout, label = labels_holdout)
    eval_sets = [(fit_matrix, 'train'), (holdout_matrix, 'valid')]

    print('Start training...')
    booster = xgb.train(booster_params, fit_matrix, 100000, eval_sets,
                        early_stopping_rounds = 350, verbose_eval = 100)

    print('Start predicting...')
    holdout_pred = booster.predict(xgb.DMatrix(feats_holdout),
                                   ntree_limit=booster.best_ntree_limit)
    score = log_loss(labels_holdout, holdout_pred)
    print('Final score:', score, '\n', 'Time it took to train and predict:', time.time() - started)

    del feats_fit, feats_holdout, labels_fit, labels_holdout
    gc.collect()
    return booster
    

def run_xgb(model_name, train = True, test = False, cv = False, X_test = None):
    """Run cross-validation, or train (and optionally predict with) a model.

    Args:
        model_name: base name used for the saved model file and, when
            predicting, for the submission file.
        train: when true (and ``cv`` is false) train and save a booster.
        test: when true, also predict on the test set after training.
        cv: when true, only run cross-validation and return its history.
        X_test: test feature matrix used when ``test`` is true. If omitted,
            ``get_test()`` is called; that function is not defined in this
            file and is assumed to be available in the notebook namespace
            (a later cell calls it) -- TODO confirm.

    Returns:
        The cross-validation history when ``cv`` is true, the trained booster
        when ``train`` is true, otherwise None.
    """
    if cv:
        return train_xgb(True)
    if train:
        gbm = train_xgb()
        gbm.save_model('saved_models/XGB/{}.txt'.format(model_name))
        if test:
            if X_test is None:
                X_test = get_test()
            # predict_test takes (X_test, model_name); the earlier call passed
            # only the model name and raised TypeError.
            predict_test(X_test, model_name)
        return gbm
    return None

In [None]:
# Build the training matrix and labels at notebook level; the next cell
# rebinds `X_train` to a wider array.
X_train, y_train = get_train()

In [None]:
# Append count / label-count encodings of the question frequencies to X_train.
# NOTE(review): this relative path differs from the absolute `feats_src` used
# inside get_train() -- confirm both resolve to the same file.
turkewitz_feats = pd.read_csv('../features/NER_features/train_turkewitz_features.csv')

train_lc = labelcount_encode(turkewitz_feats, ['q1_freq', 'q2_freq'])
train_c = count_encode(turkewitz_feats, ['q1_freq', 'q2_freq'])

# Scale the count-encoded columns by their column maximum.
train_c.q1_freq = train_c.q1_freq / np.max(train_c.q1_freq)
train_c.q2_freq = train_c.q2_freq / np.max(train_c.q2_freq)

# NOTE: this rebinds X_train, so the cell is not idempotent -- re-running it
# without re-running the get_train() cell appends the four columns again.
X_train = np.concatenate([X_train, train_c, train_lc], axis = 1)

In [None]:
# Train on the augmented X_train from the previous cell. The settings match
# the params dict inside train_xgb(), but early stopping uses 150 rounds here
# (350 in train_xgb) and the booster is saved under a fixed name.
params = {
    'seed': 1337,
    'colsample_bytree': 0.48,
    'silent': 1,
    'subsample': 0.74,
    'eta': 0.05,
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'max_depth': 12,
    'min_child_weight': 20,
    'nthread': 8,
    'tree_method': 'hist',
    }

t = time.time()
# Stratified 80/20 hold-out split with a fixed random_state.
X_tr, X_val, y_tr, y_val = train_test_split(X_train, y_train, stratify = y_train,
                                            test_size = 0.2, random_state = 111)
dtrain = xgb.DMatrix(X_tr, label = y_tr)
dval = xgb.DMatrix(X_val, label = y_val)
watchlist = [(dtrain, 'train'), (dval, 'valid')]

print('Start training...')
# Up to 100000 rounds; training stops early based on the watchlist.
gbm = xgb.train(params, dtrain, 100000, watchlist, 
                early_stopping_rounds = 150, verbose_eval = 100)

print('Start predicting...')
# Score the hold-out part using only the trees up to the best iteration.
val_pred = gbm.predict(xgb.DMatrix(X_val), ntree_limit=gbm.best_ntree_limit)
score = log_loss(y_val, val_pred)
print('Final score:', score, '\n', 'Time it took to train and predict:', time.time() - t)
gbm.save_model('saved_models/XGB/XGB_NERexperiment.txt')

In [None]:
# Build the test matrix, append the encoded turkewitz columns and predict.
# NOTE(review): get_test() is not defined in this file; presumably it comes
# from the star import of xgb_utils or from a removed cell -- confirm.
X_test = get_test()

turkewitz_feats = pd.read_csv('../../data/features/lemmat_spacy_features/test_turkewitz_features.csv')

# The `train_*` names below hold encodings computed on the TEST file.
train_lc2 = labelcount_encode(turkewitz_feats, ['q1_hash', 'q2_hash'])
train_lc3 = labelcount_encode(turkewitz_feats, ['q1_freq', 'q2_freq'])
train_c = count_encode(turkewitz_feats, ['q1_freq', 'q2_freq'])

# Scale the count-encoded and hash label-count columns by their column maximum.
train_c.q1_freq = train_c.q1_freq / np.max(train_c.q1_freq)
train_c.q2_freq = train_c.q2_freq / np.max(train_c.q2_freq)

train_lc2.q1_hash = train_lc2.q1_hash / np.max(train_lc2.q1_hash)
train_lc2.q2_hash = train_lc2.q2_hash / np.max(train_lc2.q2_hash)

# NOTE(review): six columns are appended here (2 + 2 + 2), in a different
# order than the four appended to X_train above, and the model loaded below is
# 'XGB_firstBO_magicexperiments', not the 'XGB_NERexperiment' saved by the
# training cell -- confirm the loaded model was trained on a matching layout.
X_test = np.concatenate([X_test, train_c, train_lc2, train_lc3], axis = 1)

predict_test(X_test, 'XGB_firstBO_magicexperiments')