In [1]:
import pandas as pd
import numpy as np
import nltk
from collections import Counter
from sklearn.metrics import log_loss
from scipy.optimize import minimize
import multiprocessing
import difflib
import time
import gc

import xgboost as xgb
from sklearn.cross_validation import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from pandas.core.common import array_equivalent

from xgb_utils import *



In [None]:
def get_train():
    """Assemble the training feature matrix and label vector.

    Reads the per-author feature CSVs, concatenates them column-wise into a
    left and a right frame, left-merges the two on ``id``, fills missing
    values with -999, drops columns that are exact duplicates of an earlier
    column within the first 1000 rows, and finally appends the q1/q2 arrays
    loaded from ``.npy`` files.

    Returns
    -------
    X : numpy.ndarray
        2-D array: the tabular features followed by the columns of q1 and q2.
    y : pandas.Series
        The ``is_duplicate_y`` column of the merged frame. It is taken after
        the fill, so rows without a match in the merge carry -999.
    """
    abhishek_feats = pd.read_csv('../../data/features/abhishek/train_features.csv',
                      encoding = 'ISO-8859-1').iloc[:, 2:]
    text_feats = pd.read_csv('../../data/features/spacylemmat_fullclean/train_whq_with_jaccard_feats.csv')
    eda_feats = pd.read_csv('../../data/features/spacylemmat_fullclean/train_eda_features.csv')
    mephisto_feats = pd.read_csv('../../data/features/spacylemmat_fullclean/train_mephistopeheles_features.csv')
    turkewitz_feats = pd.read_csv('../../data/features/spacylemmat_fullclean/train_turkewitz_features.csv')
    srk_feats = pd.read_csv('../../data/features/spacylemmat_fullclean/train_SRKgrams_features.csv')
    turkewitz_feats = turkewitz_feats[['q1_freq', 'q2_freq']]

    # NOTE(review): get_test() loads its q1/q2 arrays from
    # ../../data/transformed/keras_tokenizer/ -- confirm both sets use the
    # same encoding and width, otherwise train and test columns will not align.
    # Presumably these are the encoded question sequences -- confirm.
    q1 = np.load('../features/q1train_spacylemmat_fullclean_170len_treetrunc.npy')
    q2 = np.load('../features/q2train_spacylemmat_fullclean_170len_treetrunc.npy')

    df = pd.concat([mephisto_feats, abhishek_feats, turkewitz_feats], axis = 1)
    df2 = pd.concat([eda_feats, text_feats, srk_feats], axis = 1)
    df = df.merge(df2, on = 'id', how = 'left')
    print('Original shape:', df.shape)
    df = df.fillna(-999)

    y = df['is_duplicate_y']

    # Duplicate detection is done on the first 1000 rows only: transposing the
    # full frame would be far more expensive.
    dfc = df.iloc[0:1000, :].T.drop_duplicates().T
    duplicate_cols = sorted(set(df.columns).difference(dfc.columns))
    print('Dropping duplicate columns:', duplicate_cols)
    df = df.drop(duplicate_cols, axis = 1)
    print('Final shape:', df.shape)

    df = df.drop(['is_duplicate_x'], axis = 1)
    # Build X with non-inplace drops: an inplace drop on an iloc slice
    # triggers SettingWithCopyWarning.
    X = df.iloc[:, 6:].drop(['question1_y', 'question2_y'], axis = 1)
    # The label is normally removed above as a duplicate of is_duplicate_x,
    # but that relies on the 1000-row heuristic. Drop it explicitly so the
    # target can never leak into the features.
    X = X.drop(['is_duplicate_y'], axis = 1, errors = 'ignore')
    X = np.concatenate([X.values, q1, q2], axis = 1)
    print('Train data loaded.', '\n', 'Training data shape:', X.shape)
    return X, y

In [None]:
def get_test():
    """Assemble the test feature matrix.

    Reads the per-author test feature CSVs, concatenates them column-wise
    into a left and a right frame, left-merges the two on ``test_id``, fills
    missing values with -999 (the same encoding get_train() applies), drops
    columns that are exact duplicates of an earlier column within the first
    1000 rows, and appends the q1/q2 arrays loaded from ``.npy`` files.

    Returns
    -------
    numpy.ndarray
        2-D array: the tabular features followed by the columns of q1 and q2.
    """
    abhishek_feats = pd.read_csv('../../data/features/abhishek/test_features.csv',
                      encoding = 'ISO-8859-1').iloc[:, 2:]
    text_feats = pd.read_csv('../../data/features/spacylemmat_fullclean/test_whq_with_jaccard_feats.csv')
    eda_feats = pd.read_csv('../../data/features/spacylemmat_fullclean/test_eda_features.csv')
    mephisto_feats = pd.read_csv('../../data/features/spacylemmat_fullclean/test_mephistopeheles_features.csv')
    turkewitz_feats = pd.read_csv('../../data/features/spacylemmat_fullclean/test_turkewitz_features.csv')
    srk_feats = pd.read_csv('../../data/features/spacylemmat_fullclean/test_SRKgrams_features.csv')
    turkewitz_feats = turkewitz_feats[['q1_freq', 'q2_freq']]

    # Presumably the encoded question sequences -- confirm they match the
    # encoding/width of the arrays used by get_train().
    q1 = np.load('../../data/transformed/keras_tokenizer/test_q1_transformed.npy')
    q2 = np.load('../../data/transformed/keras_tokenizer/test_q2_transformed.npy')

    df = pd.concat([mephisto_feats, abhishek_feats, turkewitz_feats], axis = 1)
    df2 = pd.concat([eda_feats, text_feats, srk_feats], axis = 1)
    df = df.merge(df2, on = 'test_id', how = 'left')
    print('Original shape:', df.shape)
    # Encode missing values exactly as get_train() does. Without this the
    # model sees NaN at inference where it saw -999 during training.
    df = df.fillna(-999)

    # Duplicate detection is done on the first 1000 rows only: transposing the
    # full frame would be far more expensive.
    dfc = df.iloc[0:1000, :].T.drop_duplicates().T
    duplicate_cols = sorted(set(df.columns).difference(dfc.columns))
    print('Dropping duplicate columns:', duplicate_cols)
    df = df.drop(duplicate_cols, axis = 1)
    print('Final shape:', df.shape)

    # Non-inplace drop: an inplace drop on an iloc slice triggers
    # SettingWithCopyWarning.
    X = df.iloc[:, 4:].drop(['question1_y', 'question2_y'], axis = 1)
    X = np.concatenate([X.values, q1, q2], axis = 1)
    print('Test data loaded.', '\n', 'Test data shape:', X.shape)
    return X


def predict_test(model_name):
    """Score the test set with a saved booster and write a submission CSV.

    Loads the test features via get_test(), restores the booster stored at
    ``saved_models/XGB/<model_name>.txt``, predicts, passes every prediction
    through ``transform`` and writes ``<model_name>.csv`` next to the sample
    submission file.

    Parameters
    ----------
    model_name : str
        Base name shared by the saved model file and the output CSV.
    """
    print('Predicting on test set.')
    features = get_test()
    booster = xgb.Booster(model_file = 'saved_models/XGB/{}.txt'.format(model_name))
    raw_preds = booster.predict(xgb.DMatrix(features))

    sub_src = '/media/w/1c392724-ecf3-4615-8f3c-79368ec36380/DS Projects/Kaggle/Quora/submissions/'
    submission = pd.read_csv(sub_src + 'sample_submission.csv')
    submission['is_duplicate'] = raw_preds
    # `transform` is not defined in this file; presumably it comes from the
    # xgb_utils star import -- confirm.
    submission['is_duplicate'] = submission['is_duplicate'].apply(transform)
    submission.to_csv(sub_src + '{}.csv'.format(model_name), index = False)

In [None]:
def train_xgb(cv = False):
    """Train the XGBoost model on the data returned by get_train().

    Parameters
    ----------
    cv : bool, default False
        When True, run stratified 5-fold ``xgb.cv`` and return its result.
        When False, hold out a stratified 20% validation split, train with
        early stopping, print the validation log-loss and return the booster.

    Returns
    -------
    The ``xgb.cv`` result when ``cv`` is True, otherwise the trained booster.
    """
    started = time.time()

    # Shared by both modes: round cap, early-stopping patience, log interval.
    max_rounds = 100000
    patience = 350
    log_every = 250

    params = dict(
        seed = 1337,
        colsample_bytree = 0.48,
        silent = 1,
        subsample = 0.74,
        eta = 0.05,
        objective = 'binary:logistic',
        eval_metric = 'logloss',
        max_depth = 12,
        min_child_weight = 20,
        nthread = 6,
        tree_method = 'hist',
        # GPU variant, currently disabled:
        #   updater = 'grow_gpu_hist',
        #   gpu_id = 0,
    )

    features, labels = get_train()
    features = features.astype('float32')

    if cv:
        full_matrix = xgb.DMatrix(features, labels)
        hist = xgb.cv(params, full_matrix,
                      num_boost_round = max_rounds,
                      nfold = 5,
                      stratified = True,
                      early_stopping_rounds = patience,
                      verbose_eval = log_every,
                      seed = 1337)
        # Release the raw arrays once xgboost is done with them.
        del features, labels
        gc.collect()
        print('Time it took to train in CV manner:', time.time() - started)
        return hist

    # Single hold-out run.
    X_tr, X_val, y_tr, y_val = train_test_split(features, labels,
                                                stratify = labels,
                                                test_size = 0.2,
                                                random_state = 111)
    del features, labels
    gc.collect()

    dtrain = xgb.DMatrix(X_tr, label = y_tr)
    dval = xgb.DMatrix(X_val, label = y_val)
    # Early stopping is evaluated on the last watchlist entry ('valid').
    watchlist = [(dtrain, 'train'), (dval, 'valid')]

    print('Start training...')
    gbm = xgb.train(params, dtrain, max_rounds, watchlist,
                    early_stopping_rounds = patience,
                    verbose_eval = log_every)

    print('Start predicting...')
    # Score with the best iteration only, not the extra rounds trained
    # while waiting for early stopping to trigger.
    val_pred = gbm.predict(xgb.DMatrix(X_val), ntree_limit = gbm.best_ntree_limit)
    score = log_loss(y_val, val_pred)
    print('Final score:', score, '\n', 'Time it took to train and predict:', time.time() - started)

    del X_tr, X_val, y_tr, y_val
    gc.collect()
    return gbm
    

def run_xgb(model_name, train = True, test = False, cv = False):
    """Entry point for cross-validating, training and/or test-set prediction.

    Parameters
    ----------
    model_name : str
        Base name of the model file under ``saved_models/XGB/``.
    train : bool, default True
        Train on the hold-out split and save the booster as
        ``saved_models/XGB/<model_name>.txt``.
    test : bool, default False
        Run predict_test(model_name). With ``train = False`` this scores the
        test set from the already-saved model file (previously the flag was
        silently ignored in that case).
    cv : bool, default False
        Run cross-validation only; ``train`` and ``test`` are ignored.

    Returns
    -------
    The CV result when ``cv`` is True, the trained booster when ``train`` is
    True, otherwise None.
    """
    if cv:
        return train_xgb(True)

    gbm = None
    if train:
        gbm = train_xgb()
        gbm.save_model('saved_models/XGB/{}.txt'.format(model_name))
    if test:
        # predict_test reloads the model from disk, so it works whether or
        # not training happened in this call.
        predict_test(model_name)
    return gbm

In [None]:
# Train on the hold-out split, save the booster under this name, then write
# a test-set submission with the same name.
gbm = run_xgb('XGB_spacy_lemmat_combinedFeats_origEncoding', train = True, test = True)

In [None]:
# Score the test set with a previously saved model.
# NOTE(review): this name differs from the one trained in the cell above, so
# saved_models/XGB/XGB_spacy_lemmat_combinedFeats.txt must already exist from
# an earlier run -- confirm, or the notebook will not survive Restart & Run All.
predict_test('XGB_spacy_lemmat_combinedFeats')