In [1]:
import pandas as pd
import numpy as np
import nltk
import multiprocessing
import difflib
import time
import gc
import xgboost as xgb
import warnings
warnings.filterwarnings('ignore')

from collections import Counter
from sklearn.metrics import log_loss
from scipy.optimize import minimize
from sklearn.cross_validation import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from scipy.spatial.distance import cosine, correlation, canberra, chebyshev, minkowski, jaccard, euclidean

from xgb_utils import *

In [2]:
def get_test():
    feats_src = '/media/w/1c392724-ecf3-4615-8f3c-79368ec36380/DS Projects/Kaggle/Quora/data/features/uncleaned_data/'
    feats_src2 = '/media/w/1c392724-ecf3-4615-8f3c-79368ec36380/DS Projects/Kaggle/Quora/scripts/features/NER_features/'
    
    keras_q1 = np.load(feats_src2 + 'q1test_NER_128len.npy')
    keras_q2 = np.load(feats_src2 + 'q2test_NER_128len.npy')
    xgb_feats = pd.read_csv(feats_src + '/the_1owl/owl_test.csv')
    abhishek_feats = pd.read_csv(feats_src + 'abhishek/test_features.csv',
                              encoding = 'ISO-8859-1').iloc[:, 2:]
    text_feats = pd.read_csv(feats_src + 'other_features/text_features_test.csv',
                            encoding = 'ISO-8859-1')
    img_feats = pd.read_csv(feats_src + 'other_features/img_features_test.csv')
    srk_feats = pd.read_csv(feats_src + 'srk/SRK_grams_features_test.csv')
    
    mephisto_feats = pd.read_csv('../../data/features/spacylemmat_fullclean/test_mephistopeheles_features.csv').iloc[:, 6:]
    turkewitz_feats = pd.read_csv('../../data/features/spacylemmat_fullclean/test_turkewitz_features.csv')
    turkewitz_feats = turkewitz_feats[['q1_freq', 'q2_freq']]
    turkewitz_feats['freq_sum'] = turkewitz_feats.q1_freq + turkewitz_feats.q2_freq
    turkewitz_feats['freq_diff'] = turkewitz_feats.q1_freq - turkewitz_feats.q2_freq
    turkewitz_feats['freq_mult'] = turkewitz_feats.q1_freq * turkewitz_feats.q2_freq
    turkewitz_feats['freq_div'] = turkewitz_feats.q1_freq / turkewitz_feats.q2_freq

    xgb_feats.drop(['z_len1', 'z_len2', 'z_word_len1', 'z_word_len2'], axis = 1, inplace = True)
    xgb_feats = xgb_feats.iloc[:, 5:]
    
    df = pd.concat([xgb_feats, abhishek_feats, text_feats, img_feats, 
                               turkewitz_feats, mephisto_feats], axis = 1)
    df = pd.DataFrame(df)
    dfc = df.iloc[0:1000,:]
    dfc = dfc.T.drop_duplicates().T
    duplicate_cols = sorted(list(set(df.columns).difference(set(dfc.columns))))
    print('Dropping duplicate columns:', duplicate_cols)
    df.drop(duplicate_cols, axis = 1, inplace = True)
    print('Final shape:', df.shape)
    
    X = np.concatenate([keras_q1, keras_q2, df.values], axis = 1)
    X = X.astype('float32')
    print('Test data shape:', X.shape)
    return X

def predict_test(X_test, model_name):
    print('Predicting on test set.')
    gbm = xgb.Booster(model_file = 'saved_models/XGB/{}.txt'.format(model_name))
    test_preds = gbm.predict(xgb.DMatrix(X_test))

    sub_src = '/media/w/1c392724-ecf3-4615-8f3c-79368ec36380/DS Projects/Kaggle/Quora/submissions/'
    sample_sub = pd.read_csv(sub_src + 'sample_submission.csv')
    sample_sub['is_duplicate'] = test_preds
    sample_sub.is_duplicate = sample_sub.is_duplicate.apply(transform)
    sample_sub.to_csv(sub_src + '{}.csv'.format(model_name), index = False)
    return


def get_transformations_features(transformations_src, mode = 'train'):
    print('Adding features based on data transformations.')
    lsa10tr_3grams_q1 = np.load(transformations_src + '{}_lsa10_3grams.npy'.format(mode))[0]
    lsa10tr_3grams_q2 = np.load(transformations_src + '{}_lsa10_3grams.npy'.format(mode))[1]
    
    transforms_feats = pd.DataFrame()
    transforms_feats['cosine'] = [cosine(x, y) for (x,y) in zip(lsa10tr_3grams_q1, lsa10tr_3grams_q2)]
    transforms_feats['correlation'] = [correlation(x, y) for (x,y) in zip(lsa10tr_3grams_q1, lsa10tr_3grams_q2)]
    transforms_feats['jaccard'] = [jaccard(x, y) for (x,y) in zip(lsa10tr_3grams_q1, lsa10tr_3grams_q2)]
    transforms_feats['euclidean'] = [euclidean(x, y) for (x,y) in zip(lsa10tr_3grams_q1, lsa10tr_3grams_q2)]
    transforms_feats['minkowski'] = [minkowski(x, y, 3) for (x,y) in zip(lsa10tr_3grams_q1, lsa10tr_3grams_q2)]
    return transforms_feats

def get_doc2vec_features(doc2vec_src, mode = 'train'):
    print('Adding features based on Doc2Vec distances.')
    doc2vec_pre_q1 = np.load(doc2vec_src + '{}_q1_doc2vec_vectors_pretrained.npy'.format(mode))
    doc2vec_pre_q2 = np.load(doc2vec_src + '{}_q2_doc2vec_vectors_pretrained.npy'.format(mode))
    doc2vec_quora_q1 = np.load(doc2vec_src + '{}_q1_doc2vec_vectors_trainquora.npy'.format(mode))
    doc2vec_quora_q2 = np.load(doc2vec_src + '{}_q2_doc2vec_vectors_trainquora.npy'.format(mode))
    
    d2v_feats_pretrained = pd.DataFrame()
    d2v_feats_pretrained['cosine'] = [cosine(x, y) for (x,y) in zip(doc2vec_pre_q1, doc2vec_pre_q2)]
    d2v_feats_pretrained['correlation'] = [correlation(x, y) for (x,y) in zip(doc2vec_pre_q1, doc2vec_pre_q2)]
    d2v_feats_pretrained['jaccard'] = [jaccard(x, y) for (x,y) in zip(doc2vec_pre_q1, doc2vec_pre_q2)]
    d2v_feats_pretrained['euclidean'] = [euclidean(x, y) for (x,y) in zip(doc2vec_pre_q1, doc2vec_pre_q2)]
    d2v_feats_pretrained['minkowski'] = [minkowski(x, y, 3) for (x,y) in zip(doc2vec_pre_q1, doc2vec_pre_q2)]
    
    d2v_feats_quora = pd.DataFrame()
    d2v_feats_quora['cosine'] = [cosine(x, y) for (x,y) in zip(doc2vec_quora_q1, doc2vec_quora_q2)]
    d2v_feats_quora['correlation'] = [correlation(x, y) for (x,y) in zip(doc2vec_quora_q1, doc2vec_quora_q2)]
    d2v_feats_quora['jaccard'] = [jaccard(x, y) for (x,y) in zip(doc2vec_quora_q1, doc2vec_quora_q2)]
    d2v_feats_quora['euclidean'] = [euclidean(x, y) for (x,y) in zip(doc2vec_quora_q1, doc2vec_quora_q2)]
    d2v_feats_quora['minkowski'] = [minkowski(x, y, 3) for (x,y) in zip(doc2vec_quora_q1, doc2vec_quora_q2)]
    return d2v_feats_pretrained, d2v_feats_quora

def labelcount_encode(df2, cols):
    df = df2.copy()
    categorical_features = cols
    new_df = pd.DataFrame()
    for cat_feature in categorical_features:
        cat_feature_value_counts = df[cat_feature].value_counts()
        value_counts_list = cat_feature_value_counts.index.tolist()
        value_counts_range_rev = list(reversed(range(len(cat_feature_value_counts)))) # for ascending ordering
        value_counts_range = list(range(len(cat_feature_value_counts))) # for descending ordering
        labelcount_dict = dict(zip(value_counts_list, value_counts_range))
        new_df[cat_feature] = df[cat_feature].map(labelcount_dict)
    return new_df

def count_encode(df2, cols):
    df = df2.copy()
    categorical_features = cols
    new_df = pd.DataFrame()
    for i in categorical_features:
        new_df[i] = df[i].astype('object').replace(df[i].value_counts())
    return new_df

def bin_numerical(df2, cols, step):
    df = df2.copy()
    numerical_features = cols
    new_df = pd.DataFrame()
    for i in numerical_features:
        feature_range = np.arange(0, np.max(df[i]), step)
        new_df[i] = np.digitize(df[i], feature_range, right=True)
    return new_df

In [7]:
def get_new_feats():
    
    print('Creating additional new features.')
    src = '/media/w/1c392724-ecf3-4615-8f3c-79368ec36380/DS Projects/Kaggle/Quora/scripts/features/'
    test_orig =  pd.read_csv(src + 'df_test_lemmatfullcleanSTEMMED.csv').iloc[:, 4:]
    turkewitz_feats = pd.read_csv('/media/w/1c392724-ecf3-4615-8f3c-79368ec36380/DS Projects/Kaggle/Quora/data/features/spacylemmat_fullclean/test_turkewitz_features.csv')
    turkewitz_feats = turkewitz_feats[['q1_freq', 'q2_freq']]
    
    """
    dflc = labelcount_encode(test_orig, ['question1', 'question2'])
    lc_cols = ['q1_lc', 'q2_lc']
    dflc.columns = lc_cols
    dflc_bin = bin_numerical(dflc, lc_cols, 5000)
    dflc_bin.columns = ['q1_lc_bin', 'q2_lc_bin']
    dflc['q1_lc'] = dflc['q1_lc'] / np.max(dflc['q1_lc'])
    dflc['q2_lc'] = dflc['q2_lc'] / np.max(dflc['q2_lc'])
    dflc_full = pd.concat([dflc, dflc_bin], axis = 1)
    """
    dflc_full = pd.read_csv('dflc_test.csv')
    tf_dflc = pd.concat([dflc_full, turkewitz_feats], axis = 1)
    
    tf_dflc = turkewitz_feats
    tf_dflc['q1gr1'] = tf_dflc.groupby(['q1_freq'])['q1_lc_bin'].transform('mean')
    tf_dflc['q2gr2'] = tf_dflc.groupby(['q2_freq'])['q2_lc_bin'].transform('mean')
    tf_dflc['q1gr1'] = tf_dflc['q1gr1'] / np.max(tf_dflc['q1gr1'])
    tf_dflc['q2gr2'] = tf_dflc['q2gr2'] / np.max(tf_dflc['q2gr2'])
    
    ff1 = turkewitz_feats.groupby(['q2_freq'])['q1_freq'].transform('sum')
    ff2 = turkewitz_feats.groupby(['q1_freq'])['q2_freq'].transform('sum')
    ff1 = ff1 / np.max(ff1)
    ff2 = ff2 / np.max(ff2)
    ff1m = turkewitz_feats.groupby(['q2_freq'])['q1_freq'].transform('mean')
    ff2m = turkewitz_feats.groupby(['q1_freq'])['q2_freq'].transform('mean')
    ff1m = ff1m / np.max(ff1m)
    ff2m = ff2m / np.max(ff2m)
    gr_feats = pd.DataFrame()
    gr_feats['ff1'] = ff1
    gr_feats['ff2'] = ff2
    gr_feats['ff1m'] = ff1m
    gr_feats['ff2m'] = ff2m

    train_lc3 = labelcount_encode(turkewitz_feats, ['q1_freq', 'q2_freq'])
    train_c = count_encode(turkewitz_feats, ['q1_freq', 'q2_freq'])
    train_c.q1_freq = train_c.q1_freq / np.max(train_c.q1_freq)
    train_c.q2_freq = train_c.q2_freq / np.max(train_c.q2_freq)

    new_feats = np.concatenate([train_c, train_lc3, gr_feats], axis = 1)
    del train_c, train_lc3, gr_feats
    gc.collect()
    return new_feats

In [8]:
#X_test = get_test()
new_feats = get_new_feats()
d2v_pre = np.load('test_doc2vec_pretrained_distances.npy')
d2v_quora = np.load('test_doc2vec_quoratrain_distances.npy')
transforms = np.load('test_transformations_distances.npy')
X_test = np.concatenate([X_test, d2v_pre, d2v_quora, transforms, new_feats], axis = 1)
X_test = X_test.astype('float32')
del d2v_pre, d2v_quora, transforms, new_feats
gc.collect()

predict_test(X_test, 'XGB_turkewitz_Doc2Vec2_LSA_GroupedFeats_noLC')

Creating additional new features.
Predicting on test set.
