In [None]:
import pandas as pd
import numpy as np
import nltk
import multiprocessing
import difflib
import time
import gc
import xgboost as xgb
import warnings
warnings.filterwarnings('ignore')

from collections import Counter
from sklearn.metrics import log_loss
from scipy.optimize import minimize
from sklearn.cross_validation import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from scipy.spatial.distance import cosine, correlation, canberra, chebyshev, minkowski, jaccard, euclidean

from models_utils_xgb import *

In [None]:
def get_test():
    """Assemble the base test-set feature matrix from pre-computed feature files.

    Reads the two ``*_NER_128len.npy`` arrays (one per question) and several
    per-row feature CSVs from hard-coded locations, derives four interaction
    features from the ``q1_freq`` / ``q2_freq`` columns, concatenates all of
    it column-wise and casts the result to ``float32``.

    Returns
    -------
    pandas.DataFrame
        Columns are laid out as: ``question1_<i>`` columns, ``question2_<i>``
        columns, then the tabular features after ``drop_duplicate_cols``.

    Notes
    -----
    The frames are concatenated with ``axis = 1``, so they are aligned on
    their (default) row index; all inputs are assumed to describe the same
    rows in the same order -- TODO confirm against the feature scripts.
    """
    feats_src = '/media/w/1c392724-ecf3-4615-8f3c-79368ec36380/DS Projects/Kaggle/Quora/data/features/uncleaned/'
    feats_src2 = '/media/w/1c392724-ecf3-4615-8f3c-79368ec36380/DS Projects/Kaggle/Quora/data/features/NER/'

    keras_q1 = np.load(feats_src2 + 'q1test_NER_128len.npy')
    keras_q2 = np.load(feats_src2 + 'q2test_NER_128len.npy')
    xgb_feats = pd.read_csv(feats_src + '/the_1owl/owl_test.csv')
    # The first two columns of the abhishek file are skipped.
    abhishek_feats = pd.read_csv(feats_src + 'abhishek/test_features.csv',
                              encoding = 'ISO-8859-1').iloc[:, 2:]
    text_feats = pd.read_csv(feats_src + 'other_features/text_features_test.csv',
                            encoding = 'ISO-8859-1')
    img_feats = pd.read_csv(feats_src + 'other_features/img_features_test.csv')
    # SRK_grams_features_test.csv is deliberately not read: it never entered
    # the concatenation below, so loading it only cost I/O and memory.

    mephisto_feats = pd.read_csv('../../data/features/lemmatized_fullclean/test_mephistopeheles_features.csv').iloc[:, 6:]
    turkewitz_feats = pd.read_csv('../../data/features/lemmatized_fullclean/test_turkewitz_features_fullcleanSTEMMED.csv')
    # .copy() so the derived columns are written to an independent frame and
    # not to a slice of the frame that was just read.
    turkewitz_feats = turkewitz_feats[['q1_freq', 'q2_freq']].copy()
    turkewitz_feats['freq_sum'] = turkewitz_feats.q1_freq + turkewitz_feats.q2_freq
    turkewitz_feats['freq_diff'] = turkewitz_feats.q1_freq - turkewitz_feats.q2_freq
    turkewitz_feats['freq_mult'] = turkewitz_feats.q1_freq * turkewitz_feats.q2_freq
    # NOTE(review): rows with q2_freq == 0 produce inf/NaN here; kept as-is so
    # the features match whatever the saved model was trained on.
    turkewitz_feats['freq_div'] = turkewitz_feats.q1_freq / turkewitz_feats.q2_freq

    xgb_feats.drop(['z_len1', 'z_len2', 'z_word_len1', 'z_word_len2'], axis = 1, inplace = True)
    # Skip the first five remaining columns of the owl feature file.
    xgb_feats = xgb_feats.iloc[:, 5:]

    df = pd.concat([xgb_feats, abhishek_feats, text_feats, img_feats,
                               turkewitz_feats, mephisto_feats], axis = 1)
    del xgb_feats, abhishek_feats, text_feats, img_feats, turkewitz_feats, mephisto_feats
    gc.collect()

    df = drop_duplicate_cols(df)
    keras_q1 = pd.DataFrame(keras_q1)
    keras_q2 = pd.DataFrame(keras_q2)
    keras_q1.columns = ['question1_{}'.format(i) for i in range(keras_q1.shape[1])]
    keras_q2.columns = ['question2_{}'.format(i) for i in range(keras_q2.shape[1])]
    X = pd.concat([keras_q1, keras_q2, df], axis = 1)

    # Positional rename of two columns. The indices depend on the exact column
    # order produced above; presumably this disambiguates names that collide
    # with another feature file -- TODO confirm.
    colnames_list = X.columns.tolist()
    colnames_list[300] = 'len_char_q1_other'
    colnames_list[301] = 'len_char_q2_other'
    X.columns = colnames_list
    print('Test data shape:', X.shape)
    X = X.astype('float32')
    return X

def predict_test(X_test, model_name):
    """Score the test features with a saved XGBoost model and write a submission.

    Parameters
    ----------
    X_test : object accepted by ``xgb.DMatrix``
        Feature matrix to score.
    model_name : str
        Base name used for both the model file
        (``saved_models/XGB/<model_name>.txt``) and the submission CSV written
        next to ``sample_submission.csv``.

    Returns
    -------
    None

    Notes
    -----
    Each prediction is passed through ``transform`` before being written.
    ``transform`` is not defined in the visible cells; presumably it comes
    from the ``models_utils_xgb`` star import -- TODO confirm.
    """
    print('Predicting on test set.')

    # Load the booster and predict on the wrapped feature matrix.
    booster = xgb.Booster(model_file = 'saved_models/XGB/{}.txt'.format(model_name))
    dtest = xgb.DMatrix(X_test)
    raw_preds = booster.predict(dtest)

    # Fill the sample submission with the predictions, then post-process them.
    submissions_dir = '/media/w/1c392724-ecf3-4615-8f3c-79368ec36380/DS Projects/Kaggle/Quora/submissions/'
    submission = pd.read_csv(submissions_dir + 'sample_submission.csv')
    submission['is_duplicate'] = raw_preds
    submission['is_duplicate'] = submission['is_duplicate'].apply(transform)
    submission.to_csv(submissions_dir + '{}.csv'.format(model_name), index = False)


def get_transformations_features(transformations_src, mode = 'train'):
    """Compute row-wise distances between the two halves of an LSA array file.

    Parameters
    ----------
    transformations_src : str
        Path prefix (including the trailing separator) of the directory that
        holds ``<mode>_lsa10_3grams.npy``.
    mode : str, default 'train'
        File-name prefix selecting which array to load.

    Returns
    -------
    pandas.DataFrame
        One row per vector pair with the columns ``cosine``, ``correlation``,
        ``jaccard``, ``euclidean`` and ``minkowski`` (order 3), in that order.

    Notes
    -----
    Index 0 of the loaded array is treated as the question-1 vectors and
    index 1 as the question-2 vectors; pairs are formed positionally.
    """
    print('Adding features based on data transformations.')
    # Read the array once and split it, instead of hitting the disk once per
    # question side.
    lsa_pair = np.load(transformations_src + '{}_lsa10_3grams.npy'.format(mode))
    lsa10tr_3grams_q1 = lsa_pair[0]
    lsa10tr_3grams_q2 = lsa_pair[1]

    # (column name, distance function) in the output column order.
    metrics = [
        ('cosine', cosine),
        ('correlation', correlation),
        ('jaccard', jaccard),
        ('euclidean', euclidean),
        ('minkowski', lambda x, y: minkowski(x, y, 3)),
    ]

    transforms_feats = pd.DataFrame()
    for col_name, distance in metrics:
        transforms_feats[col_name] = [distance(x, y) for (x, y)
                                      in zip(lsa10tr_3grams_q1, lsa10tr_3grams_q2)]
    return transforms_feats

def get_doc2vec_features(doc2vec_src, mode = 'train'):
    """Build distance features from two sets of Doc2Vec vector files.

    Parameters
    ----------
    doc2vec_src : str
        Path prefix (including the trailing separator) of the directory with
        the ``<mode>_q{1,2}_doc2vec_vectors_{pretrained,trainquora}.npy`` files.
    mode : str, default 'train'
        File-name prefix selecting which arrays to load.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        Distances for the ``pretrained`` vectors first, then for the
        ``trainquora`` vectors. Each frame has the columns ``cosine``,
        ``correlation``, ``jaccard``, ``euclidean`` and ``minkowski``
        (order 3), one row per positional q1/q2 vector pair.
    """
    print('Adding features based on Doc2Vec distances.')

    def distance_table(vectors_q1, vectors_q2):
        # One column per metric, evaluated pair by pair in positional order.
        table = pd.DataFrame()
        table['cosine'] = [cosine(a, b) for a, b in zip(vectors_q1, vectors_q2)]
        table['correlation'] = [correlation(a, b) for a, b in zip(vectors_q1, vectors_q2)]
        table['jaccard'] = [jaccard(a, b) for a, b in zip(vectors_q1, vectors_q2)]
        table['euclidean'] = [euclidean(a, b) for a, b in zip(vectors_q1, vectors_q2)]
        table['minkowski'] = [minkowski(a, b, 3) for a, b in zip(vectors_q1, vectors_q2)]
        return table

    # All four arrays are loaded up front, before any distance is computed.
    pretrained_q1 = np.load(doc2vec_src + '{}_q1_doc2vec_vectors_pretrained.npy'.format(mode))
    pretrained_q2 = np.load(doc2vec_src + '{}_q2_doc2vec_vectors_pretrained.npy'.format(mode))
    quora_q1 = np.load(doc2vec_src + '{}_q1_doc2vec_vectors_trainquora.npy'.format(mode))
    quora_q2 = np.load(doc2vec_src + '{}_q2_doc2vec_vectors_trainquora.npy'.format(mode))

    d2v_feats_pretrained = distance_table(pretrained_q1, pretrained_q2)
    d2v_feats_quora = distance_table(quora_q1, quora_q2)
    return d2v_feats_pretrained, d2v_feats_quora

def labelcount_encode(df2, cols):
    """Encode columns by the frequency rank of their values.

    For every column in ``cols`` each distinct value is replaced by its
    position in that column's ``value_counts()`` ordering: the most frequent
    value becomes 0, the next 1, and so on.

    Parameters
    ----------
    df2 : pandas.DataFrame
        Source frame; it is only read, never modified.
    cols : iterable of str
        Names of the columns to encode.

    Returns
    -------
    pandas.DataFrame
        A new frame holding only the encoded ``cols``. Values that do not
        appear in ``value_counts()`` (e.g. missing values, which it skips by
        default) come out as NaN.
    """
    new_df = pd.DataFrame()
    for cat_feature in cols:
        # value_counts() lists values from most to least frequent, so the
        # position in that list is the rank we want.
        ranked_values = df2[cat_feature].value_counts().index.tolist()
        rank_of = {value: rank for rank, value in enumerate(ranked_values)}
        new_df[cat_feature] = df2[cat_feature].map(rank_of)
    return new_df

def count_encode(df2, cols):
    """Replace each value by the number of times it occurs in its column.

    Parameters
    ----------
    df2 : pandas.DataFrame
        Source frame; it is only read, never modified.
    cols : iterable of str
        Names of the columns to encode.

    Returns
    -------
    pandas.DataFrame
        A new frame holding only ``cols``, each value replaced by its
        occurrence count. Values that ``value_counts()`` does not report
        (missing values, by default) come out as NaN.
    """
    new_df = pd.DataFrame()
    for col in cols:
        # A direct index lookup per value: avoids the per-key scan of
        # Series.replace and yields a numeric column rather than relying on
        # replace's object-dtype down-casting.
        new_df[col] = df2[col].map(df2[col].value_counts())
    return new_df

def bin_numerical(df2, cols, step):
    """Bucket numeric columns into fixed-width bins.

    For each column the bin edges are ``np.arange(0, max(column), step)`` and
    every value is replaced by its ``np.digitize(..., right=True)`` bin index.

    Parameters
    ----------
    df2 : pandas.DataFrame
        Source frame; a copy is taken, so it is not modified.
    cols : iterable of str
        Names of the numeric columns to bin.
    step : number
        Spacing between consecutive bin edges.

    Returns
    -------
    pandas.DataFrame
        A new frame holding only the binned ``cols``.
    """
    source = df2.copy()
    binned = pd.DataFrame()
    for column in cols:
        values = source[column]
        # Edges start at 0 and stop before the column maximum.
        edges = np.arange(0, np.max(values), step)
        binned[column] = np.digitize(values, edges, right=True)
    return binned

def drop_duplicate_cols(df):
    """Drop columns whose content duplicates an earlier column, in place.

    Duplicates are detected on the first 10000 rows only: the sample is
    transposed, de-duplicated row-wise and transposed back. Any column name
    of ``df`` that no longer appears in the de-duplicated sample is dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean; it is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after the drop.
    """
    head_sample = df.iloc[0:10000, :]
    unique_sample = head_sample.T.drop_duplicates().T

    surviving_names = set(unique_sample.columns)
    duplicate_cols = sorted(name for name in set(df.columns)
                            if name not in surviving_names)

    print('Dropping duplicate columns:', duplicate_cols)
    df.drop(duplicate_cols, axis = 1, inplace = True)
    print('Final shape:', df.shape)

    # Release the temporary frames before returning.
    del head_sample, unique_sample
    gc.collect()
    return df

In [None]:
def get_new_feats():
    """Build the additional frequency / network grouping features for the test set.

    Reads the ``q1_freq`` / ``q2_freq`` columns plus the network and textacy
    feature CSVs from hard-coded locations and returns, column-wise:

    * the count-encoded frequencies scaled by their maximum
      (``q1_freq_normalized``, ``q2_freq_normalized``),
    * max-scaled group sums / means of one frequency by the other
      (``ff1``, ``ff2``, ``ff1m``, ``ff2m``),
    * the network and textacy features as loaded,
    * group means between the frequency columns and the ``q*_degrees`` /
      ``q*_cluster`` columns, in both directions.

    The raw ``q1_freq`` / ``q2_freq`` columns are dropped before returning.

    Returns
    -------
    pandas.DataFrame
    """
    print('Creating additional grouping features.')
    turkewitz_feats = pd.read_csv('/media/w/1c392724-ecf3-4615-8f3c-79368ec36380/DS Projects/Kaggle/Quora/data/features/lemmatized_fullclean/test_turkewitz_features_fullcleanSTEMMED.csv')
    turkewitz_feats = turkewitz_feats[['q1_freq', 'q2_freq']]

    # (output column, grouping column, aggregated column, aggregation);
    # each result is scaled by its own maximum.
    cross_freq_specs = [
        ('ff1', 'q2_freq', 'q1_freq', 'sum'),
        ('ff2', 'q1_freq', 'q2_freq', 'sum'),
        ('ff1m', 'q2_freq', 'q1_freq', 'mean'),
        ('ff2m', 'q1_freq', 'q2_freq', 'mean'),
    ]
    gr_feats = pd.DataFrame()
    for out_name, group_key, target, aggregation in cross_freq_specs:
        aggregated = turkewitz_feats.groupby([group_key])[target].transform(aggregation)
        gr_feats[out_name] = aggregated / np.max(aggregated)

    # Occurrence counts of each frequency value, scaled into (0, 1].
    test_c = count_encode(turkewitz_feats, ['q1_freq', 'q2_freq'])
    for freq_col in ('q1_freq', 'q2_freq'):
        test_c[freq_col] = test_c[freq_col] / np.max(test_c[freq_col])
    test_c = test_c.rename(columns = {'q1_freq': 'q1_freq_normalized', 'q2_freq': 'q2_freq_normalized'})

    src = '/media/w/1c392724-ecf3-4615-8f3c-79368ec36380/DS Projects/Kaggle/Quora/scripts/features/'
    network_feats =  pd.read_csv(src + 'test_networkfeats_fullclean.csv')
    textacy1_feats = pd.read_csv(src + 'test_textacy_similarity_feats.csv')

    new_feats = pd.concat([test_c, gr_feats, network_feats, textacy1_feats, turkewitz_feats], axis = 1)

    # (output column, grouping column, averaged column). The order of this
    # table is the order in which the columns are appended.
    group_mean_specs = [
        ('q1_deg_by_freq', 'q1_freq', 'q1_degrees'),
        ('q2_deg_by_freq', 'q2_freq', 'q2_degrees'),
        ('q1_deg_by_freq2', 'q1_freq', 'q2_degrees'),
        ('q2_deg_by_freq1', 'q2_freq', 'q1_degrees'),
        ('q1_clust_by_freq', 'q1_freq', 'q1_cluster'),
        ('q2_clust_by_freq', 'q2_freq', 'q2_cluster'),
        ('q1_clust_by_freq2', 'q1_freq', 'q2_cluster'),
        ('q2_clust_by_freq1', 'q2_freq', 'q1_cluster'),
        ('q1_deg_by_freq_inv', 'q1_degrees', 'q1_freq'),
        ('q2_deg_by_freq_inv', 'q2_degrees', 'q2_freq'),
        ('q1_deg_by_freq2_inv', 'q2_degrees', 'q1_freq'),
        ('q2_deg_by_freq1_inv', 'q1_degrees', 'q2_freq'),
        ('q1_clust_by_freq_inv', 'q1_cluster', 'q1_freq'),
        ('q2_clust_by_freq_inv', 'q2_cluster', 'q2_freq'),
        ('q1_clust_by_freq2_inv', 'q2_cluster', 'q1_freq'),
        ('q2_clust_by_freq1_inv', 'q1_cluster', 'q2_freq'),
    ]
    for out_name, group_key, target in group_mean_specs:
        new_feats[out_name] = new_feats.groupby([group_key])[target].transform('mean')

    # The raw frequency columns were only needed as grouping keys / targets.
    new_feats.drop(turkewitz_feats.columns.tolist(), axis = 1, inplace = True)

    del test_c, gr_feats, network_feats, textacy1_feats, turkewitz_feats
    gc.collect()
    return new_feats

In [None]:
# Assemble the full test feature matrix: base features + grouping features +
# pre-computed network / distance feature files, then prune and persist it.
src = '/media/w/1c392724-ecf3-4615-8f3c-79368ec36380/DS Projects/Kaggle/Quora/scripts/features/'

X_test = get_test()
new_feats = get_new_feats()

networks_tony = pd.read_pickle(src + 'test_fullnetworkfeatsTony.pkl')
networks_weighted = pd.read_pickle(src + 'test_networkfeats_weighted.pkl')

# Suffix these network columns with '_tony'. new_feats keeps columns named
# q1_degrees / q1_cluster / ... from its own network file, so this presumably
# avoids duplicate column names after the concat below -- TODO confirm.
col_dict = {
    'q1_cluster': 'q1_cluster_tony',
    'q1_degrees': 'q1_degrees_tony',
    'q1_squared_cluster': 'q1_squared_cluster_tony',
    'q1_triangles_cluster': 'q1_triangles_cluster_tony',
    'q2_cluster': 'q2_cluster_tony',
    'q2_degrees': 'q2_degrees_tony',
    'q2_squared_cluster': 'q2_squared_cluster_tony',
    'q2_triangles_cluster': 'q2_triangles_cluster_tony'
}

networks_tony.rename(columns = col_dict, inplace = True)

# Pre-computed distance features (CountVectorizer 1-gram SVD/LSA, 50 dims).
cv_svd50_dist = pd.read_csv(src + 'test_SVD_CV1gram_50dim.csv')
cv_lsa50_dist = pd.read_csv(src + 'test_LSA_CV1gram_50dim.csv')

# Pre-computed distance features (TF-IDF 3-gram words SVD/LSA, 50 dims).
tfidf_svd50_dist = pd.read_csv(src + 'test_SVD_TFIDF_3grams_words_50dim.csv')
tfidf_lsa50_dist = pd.read_csv(src + 'test_LSA_TFIDF_3grams_words_50dim.csv')

# Pre-computed Doc2Vec and SVD/LSA distance features.
d2v_pre = pd.read_csv(src + 'test_doc2vec_pretrained_distances.csv')
d2v_quora = pd.read_csv(src + 'test_doc2vec_quoratrain_distances.csv')
transforms = pd.read_csv(src + 'test_SVDLSA_CV1gram_distances.csv')

# Column-wise concat: frames are aligned on their row index, so every input is
# assumed to describe the same rows in the same order -- TODO confirm.
X_test = pd.concat([X_test, new_feats, networks_tony, networks_weighted,
                     cv_svd50_dist, cv_lsa50_dist, tfidf_svd50_dist, tfidf_lsa50_dist,
                     d2v_pre, d2v_quora, transforms], axis = 1)

# Columns removed from the final matrix. How this list was chosen is not
# visible in this notebook.
cols_to_drop = ['counts_max_network_weighted', 'counts_min_network_weighted', 'diff_counts_network_weighted', 'diff_degrees_network_weighted', 'diff_triangles_cluster_network_weighted', 'exactly_same', 'jaccard_distance_test_LSA_TFIDF_3grams_words_50dim', 'max_degrees_network_weighted', 'max_triangles_cluster_network_weighted', 'min_degrees_network_weighted', 'min_triangles_cluster_network_weighted', 'mult_counts_network_weighted', 'q1_counts_network_weighted', 'q1_degrees_network_weighted', 'q1_triangles_cluster_network_weighted', 'q2_counts_network_weighted', 'q2_degrees_network_weighted', 'q2_triangles_cluster_network_weighted', 'question1_100', 'question1_101', 'question1_102', 'question1_103', 'question1_104', 'question1_105', 'question1_106', 'question1_107', 'question1_108', 'question1_109', 'question1_110', 'question1_111', 'question1_112', 'question1_113', 'question1_114', 'question1_115', 'question1_116', 'question1_117', 'question1_118', 'question1_119', 'question1_120', 'question1_121', 'question1_122', 'question1_123', 'question1_124', 'question1_125', 'question1_126', 'question1_127', 'question1_68', 'question1_69', 'question1_70', 'question1_71', 'question1_72', 'question1_73', 'question1_74', 'question1_75', 'question1_76', 'question1_77', 'question1_78', 'question1_79', 'question1_80', 'question1_81', 'question1_82', 'question1_83', 'question1_84', 'question1_85', 'question1_86', 'question1_87', 'question1_88', 'question1_89', 'question1_90', 'question1_91', 'question1_92', 'question1_93', 'question1_94', 'question1_95', 'question1_96', 'question1_97', 'question1_98', 'question1_99', 'question2_103', 'question2_104', 'question2_106', 'question2_108', 'question2_109', 'question2_110', 'question2_112', 'question2_113', 'question2_114', 'question2_115', 'question2_116', 'question2_118', 'question2_119', 'question2_120', 'question2_121', 'question2_122', 'question2_123', 'question2_126', 'question2_127', 'question2_81', 'question2_83', 'question2_87', 
'question2_88', 'question2_89', 'question2_96', 'sum_counts_network_weighted', 'sum_degrees_network_weighted', 'sum_triangles_cluster_network_weighted']
X_test.drop(cols_to_drop, axis = 1, inplace = True)
X_test = X_test.astype('float32')
print('Final shape:', X_test.shape)

# Persist the assembled matrix to the current working directory.
X_test.to_pickle('Xtest_916cols.pkl')

# Free the intermediate frames; X_test itself stays in the namespace.
del new_feats, networks_tony, networks_weighted, cv_svd50_dist, cv_lsa50_dist, \
tfidf_svd50_dist, tfidf_lsa50_dist, d2v_pre, d2v_quora, transforms
gc.collect()

In [None]:
# NOTE(review): this loads 'Xtest_500bestCols.pkl', not the 'Xtest_916cols.pkl'
# written by the preceding cell; none of the cells shown here create this file,
# so it must already exist in the working directory.
X_test = pd.read_pickle('Xtest_500bestCols.pkl')
# Score the loaded features with the saved model of the same name and write
# the submission CSV (see predict_test).
predict_test(X_test, 'XGB_new_NetworkFeats_experiments_500feats')