In [1]:
import pandas as pd
import numpy as np
import nltk
import multiprocessing
import difflib
import time
import gc
import xgboost as xgb
import lightgbm as lgb
import warnings
warnings.filterwarnings('ignore')

from collections import Counter
from sklearn.metrics import log_loss
from scipy.optimize import minimize
from sklearn.cross_validation import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from scipy.spatial.distance import cosine, correlation, canberra, chebyshev, minkowski, jaccard, euclidean

from models_utils_gbm import *
from models_utils_fe import *

In [2]:
def get_test():
    """Assemble the test-set feature matrix from precomputed feature files.

    Loads the two question arrays stored as ``.npy`` files plus several
    feature CSVs, derives four interaction columns from ``q1_freq`` and
    ``q2_freq``, concatenates everything column-wise and casts the result
    to float32.

    Returns:
        pandas.DataFrame: columns ``question1_<i>`` and ``question2_<i>``
        (one per column of the respective ``.npy`` array) followed by the
        de-duplicated feature columns, all as float32.

    Notes:
        All inputs are read from hard-coded absolute/relative paths, so the
        function only works on a machine with that directory layout.
    """
    feats_src = '/media/w/1c392724-ecf3-4615-8f3c-79368ec36380/DS Projects/Kaggle/Quora/data/features/uncleaned/'
    feats_src2 = '/media/w/1c392724-ecf3-4615-8f3c-79368ec36380/DS Projects/Kaggle/Quora/data/features/NER/'

    keras_q1 = np.load(feats_src2 + 'q1test_NER_128len.npy')
    keras_q2 = np.load(feats_src2 + 'q2test_NER_128len.npy')
    xgb_feats = pd.read_csv(feats_src + '/the_1owl/owl_test.csv')
    abhishek_feats = pd.read_csv(feats_src + 'abhishek/test_features.csv',
                                 encoding = 'ISO-8859-1').iloc[:, 2:]
    text_feats = pd.read_csv(feats_src + 'other_features/text_features_test.csv',
                             encoding = 'ISO-8859-1')
    img_feats = pd.read_csv(feats_src + 'other_features/img_features_test.csv')
    # srk/SRK_grams_features_test.csv is deliberately not read here: it is not
    # one of the frames concatenated below, so loading it only cost time and memory.

    mephisto_feats = pd.read_csv('../../data/features/lemmatized_fullclean/test_mephistopeheles_features.csv').iloc[:, 6:]
    turkewitz_feats = pd.read_csv('../../data/features/lemmatized_fullclean/test_turkewitz_features_fullcleanSTEMMED.csv')
    # Take an explicit copy so the derived columns are added to an independent
    # frame instead of to a slice of the frame that was read from disk.
    turkewitz_feats = turkewitz_feats[['q1_freq', 'q2_freq']].copy()
    turkewitz_feats['freq_sum'] = turkewitz_feats.q1_freq + turkewitz_feats.q2_freq
    turkewitz_feats['freq_diff'] = turkewitz_feats.q1_freq - turkewitz_feats.q2_freq
    turkewitz_feats['freq_mult'] = turkewitz_feats.q1_freq * turkewitz_feats.q2_freq
    # NOTE(review): yields inf/NaN wherever q2_freq is 0; left as-is so the
    # feature values are computed exactly as before.
    turkewitz_feats['freq_div'] = turkewitz_feats.q1_freq / turkewitz_feats.q2_freq

    # Remove the four z_* length columns first, then skip the leading five columns.
    xgb_feats = xgb_feats.drop(['z_len1', 'z_len2', 'z_word_len1', 'z_word_len2'],
                               axis = 1).iloc[:, 5:]

    # The concatenation order matters: columns are renamed by position further down.
    df = pd.concat([xgb_feats, abhishek_feats, text_feats, img_feats,
                    turkewitz_feats, mephisto_feats], axis = 1)
    del xgb_feats, abhishek_feats, text_feats, img_feats, turkewitz_feats, mephisto_feats
    gc.collect()

    # drop_duplicate_cols is not defined in this notebook; presumably it comes
    # from one of the models_utils_* star imports -- TODO confirm.
    df = drop_duplicate_cols(df)
    keras_q1 = pd.DataFrame(keras_q1)
    keras_q2 = pd.DataFrame(keras_q2)
    keras_q1.columns = ['question1_{}'.format(i) for i in range(keras_q1.shape[1])]
    keras_q2.columns = ['question2_{}'.format(i) for i in range(keras_q2.shape[1])]
    X = pd.concat([keras_q1, keras_q2, df], axis = 1)

    # Positional rename of columns 300 and 301.
    # NOTE(review): these indices depend on the widths of the arrays and the
    # concat order above -- confirm they still point at the intended columns.
    colnames_list = X.columns.tolist()
    colnames_list[300] = 'len_char_q1_other'
    colnames_list[301] = 'len_char_q2_other'
    X.columns = colnames_list
    print('Test data shape:', X.shape)
    X = X.astype('float32')
    return X

def _write_submission(test_preds, model_name):
    """Write ``test_preds`` into a copy of the sample submission file.

    Reads ``sample_submission.csv`` from the submissions directory, stores the
    predictions in its ``is_duplicate`` column, applies ``transform`` to every
    value and saves the result as ``<model_name>.csv`` in the same directory.

    Args:
        test_preds: predictions, one per row of the sample submission.
        model_name: base name (without extension) of the output CSV.
    """
    sub_src = '/media/w/1c392724-ecf3-4615-8f3c-79368ec36380/DS Projects/Kaggle/Quora/submissions/'
    sample_sub = pd.read_csv(sub_src + 'sample_submission.csv')
    sample_sub['is_duplicate'] = test_preds
    # `transform` is not defined in this notebook; presumably it is provided by
    # one of the models_utils_* star imports -- TODO confirm what it does.
    sample_sub.is_duplicate = sample_sub.is_duplicate.apply(transform)
    sample_sub.to_csv(sub_src + '{}.csv'.format(model_name), index = False)


def predict_test_xgb(X_test, model_name):
    """Predict on the test set with a saved XGBoost model and write a submission.

    Args:
        X_test: test features; wrapped in an ``xgb.DMatrix`` before prediction.
        model_name: name of the model file under ``saved_models/XGB/`` (without
            the ``.txt`` suffix); also used as the submission file name.

    Returns:
        None. The submission CSV is written as a side effect.
    """
    print('Predicting on test set with XGBoost.')
    gbm = xgb.Booster(model_file = 'saved_models/XGB/{}.txt'.format(model_name))
    test_preds = gbm.predict(xgb.DMatrix(X_test))
    _write_submission(test_preds, model_name)


def predict_test_lgbm(X_test, model_name):
    """Predict on the test set with a saved LightGBM model and write a submission.

    Args:
        X_test: test features, passed directly to ``Booster.predict``.
        model_name: name of the model file under ``saved_models/LGBM/`` (without
            the ``.txt`` suffix); also used as the submission file name.

    Returns:
        None. The submission CSV is written as a side effect.
    """
    print('Predicting on test set with LightGBM.')
    gbm = lgb.Booster(model_file = 'saved_models/LGBM/{}.txt'.format(model_name))
    test_preds = gbm.predict(X_test)
    _write_submission(test_preds, model_name)

In [None]:
# Load the additional precomputed test features and append them column-wise
# to the cached base matrix. Every frame is cast to float32 before the concat.
src = '/media/w/1c392724-ecf3-4615-8f3c-79368ec36380/DS Projects/Kaggle/Quora/scripts/features/'
feats_src = '/media/w/1c392724-ecf3-4615-8f3c-79368ec36380/DS Projects/Kaggle/Quora/data/features/uncleaned/'
trans_src = '/media/w/1c392724-ecf3-4615-8f3c-79368ec36380/DS Projects/Kaggle/Quora/data/features/lemmatized_fullclean/transformations/'

wmd = pd.read_csv(src + 'test_WMD_cleaned_stemmed.csv').astype('float32')
# Only +inf is capped at 1000; any other non-finite values are left untouched.
wmd = wmd.replace(np.inf, 1000)

skip_thought = pd.read_csv(src + 'test_skipthoughts_Alex_distances.csv').astype('float32')
compression = pd.read_csv(src + 'test_LZMAcompression_distance.csv').astype('float32')
edit = pd.read_csv(src + 'test_EDITdistance.csv').astype('float32')
moments = pd.read_csv(src + 'test_doc2vec_moments.csv').astype('float32')
networks_NER = pd.read_csv(src + 'test_networkfeats_NER.csv').astype('float32')

# Each .npy file is indexed with [0] and [1] for the q1 and q2 halves; read it
# from disk once instead of once per half.
lsa_pair = np.load(trans_src + 'test_lsa50_CV1gram.npy')
lsaq1 = pd.DataFrame(lsa_pair[0])
lsaq1.columns = ['{}_lsaCV1_q1'.format(i) for i in range(lsaq1.shape[1])]
lsaq2 = pd.DataFrame(lsa_pair[1])
lsaq2.columns = ['{}_lsaCV1_q2'.format(i) for i in range(lsaq2.shape[1])]
lsaq1 = lsaq1.astype('float32')
lsaq2 = lsaq2.astype('float32')
del lsa_pair

svd_pair = np.load(trans_src + 'test_svd50_CV1gram.npy')
svdq1 = pd.DataFrame(svd_pair[0])
svdq1.columns = ['{}_svdCV1_q1'.format(i) for i in range(svdq1.shape[1])]
svdq2 = pd.DataFrame(svd_pair[1])
svdq2.columns = ['{}_svdCV1_q2'.format(i) for i in range(svdq2.shape[1])]
svdq1 = svdq1.astype('float32')
svdq2 = svdq2.astype('float32')
del svd_pair

# NMF features are currently disabled.
#nmfq1 = pd.DataFrame(pd.read_pickle(src + 'te_nmf.pkl'))
#nmfq1.columns = ['{}_nmf'.format(i) for i in range(nmfq1.shape[1])]
#nmfq1 = nmfq1.astype('float32')

X_test = pd.read_pickle('Xtest_500bestCols.pkl')
X_test = pd.concat([X_test, wmd, skip_thought, compression, edit, moments, networks_NER,
                    lsaq1, lsaq2, svdq1, svdq2], axis = 1)

# Free the per-source frames now that they have been copied into X_test.
del wmd, skip_thought, compression, edit, moments, networks_NER, \
    lsaq1, lsaq2, svdq1, svdq2
gc.collect()

In [None]:
# Derive extra features from a fixed list of columns and append them to X_test.
# bin_numerical, group_featbyfeat and feature_combinations are not defined in
# this notebook; presumably they come from the models_utils_fe star import --
# TODO confirm their exact semantics there.
best_cols = [
    'min_pagerank_sp_network_weighted',
    'norm_wmd',
    'word_match',
    '1wl_tfidf_l2_euclidean',
    'm_vstack_svd_q1_q1_euclidean',
    '1wl_tfidf_cosine',
    'sk_bi_skew_q2vec',
    'm_q1_q2_tf_svd0',
    'sk_bi_skew_q1vec',
    'skew_q2vec',
    'trigram_tfidf_cosine',
    'sk_uni_skew_q2vec',
    'sk_bi_canberra_distance',
    'question1_3',
    'sk_uni_skew_q1vec',
    'sk_uni_kur_q2vec',
    'min_eigenvector_centrality_np_network_weighted',
    'avg_world_len2',
    'z_word_match',
    'sk_uni_kur_q1vec',
    'skew_doc2vec_pretested_lemmat']

# Set to True to min-max scale the derived features before appending them.
rescale = False
X_bin = bin_numerical(X_test, best_cols, 0.1)
X_grouped = group_featbyfeat(X_test, best_cols, 'mean')
X_grouped2 = group_featbyfeat(X_test, best_cols, 'sum')
# Combinations are built from the first five columns of the list only.
X_combinations = feature_combinations(X_test, best_cols[:5])

X_additional = pd.concat([X_bin, X_grouped, X_grouped2, X_combinations], axis = 1)
#X_additional = drop_duplicate_cols(X_additional)
# Replace +inf with 999 first, then NaN with -999 (sentinel values).
X_additional = X_additional.replace(np.inf, 999).replace(np.nan, -999)
if rescale:
    # Imported here so this branch does not depend on a star import providing it.
    from sklearn.preprocessing import MinMaxScaler
    # Keep the original index and column labels so the axis=1 concat below
    # aligns row-for-row with X_test.
    # NOTE(review): the scaler is fitted on the test data itself -- confirm this
    # matches how the training features were scaled.
    X_additional = pd.DataFrame(MinMaxScaler().fit_transform(X_additional),
                                columns = X_additional.columns,
                                index = X_additional.index)

X_test = pd.concat([X_test, X_additional], axis = 1)
X_test = X_test.astype('float32')
print('Final test data shape:', X_test.shape)

# Free the intermediate frames now that they are part of X_test.
del X_bin, X_grouped, X_grouped2, X_combinations, X_additional
gc.collect()

In [3]:
# Locations of the precomputed feature files on the local data drive.
src = '/media/w/1c392724-ecf3-4615-8f3c-79368ec36380/DS Projects/Kaggle/Quora/scripts/features/'
feats_src = '/media/w/1c392724-ecf3-4615-8f3c-79368ec36380/DS Projects/Kaggle/Quora/data/features/uncleaned/'
trans_src = '/media/w/1c392724-ecf3-4615-8f3c-79368ec36380/DS Projects/Kaggle/Quora/data/features/lemmatized_fullclean/transformations/'

# Read the cached feature matrix (bz2-compressed pickle) and the interaction
# features, join them column-wise and cast the result to float32 in one step.
# The interaction frame is never bound to a name, so nothing needs deleting.
X_test = pd.concat(
    [
        pd.read_pickle('Xtest_814colsBest.pkl', compression = 'bz2'),
        pd.read_pickle(src + 'test_tony_interaction_gru.pkl'),
    ],
    axis = 1,
).astype('float32')

gc.collect()

0

In [4]:
# Load the pickled column selection, subset X_test to those labels and cache
# the subset on disk.
# NOTE(review): the contents of the pickle are not visible here; judging by the
# file name it presumably holds ~400 feature names -- TODO confirm.
best_cols = pd.read_pickle('Cols_currentBest_400feats.pkl')
X_test2 = X_test.loc[:, best_cols]
X_test2.to_pickle('Xtest_currentBest_400feats.pkl')

In [4]:
# Score the test matrix with the saved XGBoost model; the submission CSV is
# written under the same name as the model.
# NOTE(review): this passes the full X_test, not the X_test2 column subset --
# confirm it matches the columns the model was trained on.
predict_test_xgb(X_test, 'XGB_10SKF_FredFeatsGRUandDecompAttention_loss0.17354_fold1')

Predicting on test set with XGBoost.


In [4]:
# Score the test matrix with the saved LightGBM model; the submission CSV is
# written under the same name as the model.
# NOTE(review): this passes the full X_test, not the X_test2 column subset --
# confirm it matches the columns the model was trained on.
predict_test_lgbm(X_test, 'LGBM_10SKF_FredFeatsGRUandDecompAttention_loss0.17440_fold1')

Predicting on test set with LightGBM.
