In [1]:
import pandas as pd
import numpy as np
import nltk
import multiprocessing
import difflib
import time
import gc
import xgboost as xgb
import lightgbm as lgb
import warnings
import os
warnings.filterwarnings('ignore')

from tqdm import tqdm
from collections import Counter
from sklearn.metrics import log_loss
from scipy.optimize import minimize
from sklearn.cross_validation import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from scipy.spatial.distance import cosine, correlation, canberra, chebyshev, minkowski, jaccard, euclidean

from models_utils_gbm import *
from models_utils_fe import *

In [5]:
def predict_test_xgb(X_test, model_name, sub_src = None):
    """Score the test set with one saved XGBoost model and write a submission CSV.

    Parameters
    ----------
    X_test : xgb.DMatrix or anything ``xgb.DMatrix`` can be built from
        Test features. An existing ``DMatrix`` is used as-is; anything else
        is wrapped, as the original implementation did.
    model_name : str
        Basename (without extension) of the model file stored under
        ``saved_models/XGB/``; also used as the submission file name.
    sub_src : str, optional
        Directory that contains ``sample_submission.csv`` and receives the
        generated submission. Defaults to the directory this notebook has
        always used.

    Returns
    -------
    None
        The result is the CSV written to ``sub_src``.
    """
    if sub_src is None:
        sub_src = '/media/w/1c392724-ecf3-4615-8f3c-79368ec36380/DS Projects/Kaggle/Quora/submissions/'

    print('Predicting on test set with XGBoost.')
    gbm = xgb.Booster(model_file = 'saved_models/XGB/{}.txt'.format(model_name))

    # Accept an already-built DMatrix too, so the matrix prepared for
    # predict_test_xgb_fold can be reused here without a second conversion.
    if not isinstance(X_test, xgb.DMatrix):
        X_test = xgb.DMatrix(X_test)
    test_preds = gbm.predict(X_test)

    sample_sub = pd.read_csv(os.path.join(sub_src, 'sample_submission.csv'))
    sample_sub['is_duplicate'] = test_preds
    # `transform` is not defined in this notebook; presumably it comes from one
    # of the star imports (models_utils_*) - TODO confirm. Applied per element.
    sample_sub['is_duplicate'] = sample_sub['is_duplicate'].apply(transform)
    sample_sub.to_csv(os.path.join(sub_src, '{}.csv'.format(model_name)), index = False)
    return

def predict_test_lgbm(X_test, model_name, sub_src = None):
    """Score the test set with one saved LightGBM model and write a submission CSV.

    Parameters
    ----------
    X_test : feature matrix accepted by ``lgb.Booster.predict``
        Test features, passed to the booster unchanged.
    model_name : str
        Basename (without extension) of the model file stored under
        ``saved_models/LGBM/``; also used as the submission file name.
    sub_src : str, optional
        Directory that contains ``sample_submission.csv`` and receives the
        generated submission. Defaults to the directory this notebook has
        always used.

    Returns
    -------
    None
        The result is the CSV written to ``sub_src``.
    """
    if sub_src is None:
        sub_src = '/media/w/1c392724-ecf3-4615-8f3c-79368ec36380/DS Projects/Kaggle/Quora/submissions/'

    print('Predicting on test set with LightGBM.')
    gbm = lgb.Booster(model_file = 'saved_models/LGBM/{}.txt'.format(model_name))
    test_preds = gbm.predict(X_test)

    sample_sub = pd.read_csv(os.path.join(sub_src, 'sample_submission.csv'))
    sample_sub['is_duplicate'] = test_preds
    # `transform` is not defined in this notebook; presumably it comes from one
    # of the star imports (models_utils_*) - TODO confirm. Applied per element.
    sample_sub['is_duplicate'] = sample_sub['is_duplicate'].apply(transform)
    sample_sub.to_csv(os.path.join(sub_src, '{}.csv'.format(model_name)), index = False)
    return


def predict_test_xgb_fold(src, X_test, n_folds = 10, sub_src = None):
    """Average the test predictions of several saved XGBoost fold models.

    The first ``n_folds`` model files (sorted by name) found in ``src`` are
    each used to predict on ``X_test``; every prediction vector is passed
    through ``transform`` and the per-row mean over folds is written as a
    submission CSV named after the ``src`` directory.

    Parameters
    ----------
    src : str
        Directory holding the fold models (files whose name contains 'txt').
        A trailing slash is optional.
    X_test : xgb.DMatrix or anything ``xgb.DMatrix`` can be built from
        Test features. A ready-made ``DMatrix`` is used as-is, so it can be
        built once and shared across several calls.
    n_folds : int, optional
        Number of fold models to average. Defaults to 10, the value that was
        previously hard-coded.
    sub_src : str, optional
        Directory that contains ``sample_submission.csv`` and receives the
        generated submission. Defaults to the directory this notebook has
        always used.

    Returns
    -------
    None
        The result is the CSV written to ``sub_src``.

    Raises
    ------
    IndexError
        If ``src`` contains fewer than ``n_folds`` model files. Raised before
        any prediction is made instead of part-way through the loop.
    """
    if sub_src is None:
        sub_src = '/media/w/1c392724-ecf3-4615-8f3c-79368ec36380/DS Projects/Kaggle/Quora/submissions/'

    print('Predicting on test set with XGBoost.')
    if not isinstance(X_test, xgb.DMatrix):
        X_test = xgb.DMatrix(X_test)

    models = sorted([x for x in os.listdir(src) if 'txt' in x])
    if len(models) < n_folds:
        # Fail fast: each fold takes minutes on the full test set, so do not
        # discover a missing model only after several folds were predicted.
        raise IndexError('Expected at least {} model files in {}, found {}.'.format(n_folds, src, len(models)))

    # One row per fold; the width comes from the data rather than a constant.
    fold_preds = np.zeros((n_folds, X_test.num_row()))
    for i in tqdm(range(0, n_folds)):
        gbm = xgb.Booster(model_file = os.path.join(src, models[i]))
        test_preds = gbm.predict(X_test)
        # On a 1-D array apply_along_axis hands the whole vector to `transform`
        # in one call, so `transform` must accept an array here. It is not
        # defined in this notebook (presumably from models_utils_*) - TODO confirm.
        test_preds = np.apply_along_axis(transform, 0, test_preds)
        fold_preds[i, :] = test_preds
    fold_preds = fold_preds.mean(axis = 0)

    sample_sub = pd.read_csv(os.path.join(sub_src, 'sample_submission.csv'))
    sample_sub['is_duplicate'] = fold_preds
    # Name the submission after the model directory, with or without a
    # trailing slash on `src`.
    sub_name = os.path.basename(os.path.normpath(src))
    sample_sub.to_csv(os.path.join(sub_src, '{}.csv'.format(sub_name)), index = False)
    return

In [None]:
# Directories with precomputed feature files (feats_src is defined but not
# used in this cell).
src = '/media/w/1c392724-ecf3-4615-8f3c-79368ec36380/DS Projects/Kaggle/Quora/scripts/features/'
feats_src = '/media/w/1c392724-ecf3-4615-8f3c-79368ec36380/DS Projects/Kaggle/Quora/data/features/uncleaned/'

# Base test matrix, widened column-wise with the interaction features.
X_test = pd.read_pickle('Xtest_814colsBest.pkl', compression = 'bz2')
test_interactions = pd.read_pickle(src + 'test_tony_interaction_gru.pkl')
X_test = pd.concat([X_test, test_interactions], axis = 1)

# Extra feature subset; every column is renamed to '<name>_lemmat<position>'.
new_set = pd.read_pickle('test_NewSubset_BestAbhishek.pkl')
new_set.columns = [
    '{}_lemmat{}'.format(name, pos)
    for pos, name in enumerate(new_set.columns)
]

# Network features are assigned column by column: a column whose name already
# exists in X_test is overwritten, any other is appended.
new_networks = pd.read_pickle('test_networkfeats_weighted_untransformed_30.05.pkl')
for col in new_networks.columns:
    X_test[col] = new_networks[col]

X_test = pd.concat([X_test, new_set], axis = 1)
X_test = X_test.astype('float32')

# Columns removed from the final matrix.
to_drop = [
    'diff_closeness_centrality_network_weighted',
    'diff_cluster_network_weighted',
    'diff_neighbors_network_weighted',
    'diff_squared_cluster_network_weighted',
    'max_cluster',
    'max_cluster_network_weighted',
    'max_neighbors',
    'max_squared_cluster',
    'max_squared_cluster_network_weighted',
    'max_triangles_cluster',
    'mean_cluster',
    'mean_neighbors',
    'mean_squared_cluster',
    'min_closeness_centrality_network_weighted',
    'min_cluster',
    'min_cluster_network_weighted',
    'min_degree_centrality',
    'min_degrees',
    'min_neighbors',
    'min_neighbors_network_weighted',
    'min_squared_cluster',
    'min_squared_cluster_network_weighted',
    'min_triangles_cluster',
    'q1_closeness_centrality_by_q1_closeness_centrality',
    'q1_closeness_centrality_by_q1_eigenvector_centrality_np',
    'q1_cluster_tony',
    'q1_pagerank_sp_by_q1_eigenvector_centrality_np',
    'q1_squared_cluster',
    'q1_squared_cluster_by_q1_squared_cluster',
    'q2_closeness_centrality_by_q2_closeness_centrality',
    'q2_cluster',
    'q2_cluster_by_q2_cluster',
    'q2_cluster_by_q2_eigenvector_centrality_np',
    'q2_pagerank_sp_by_q2_eigenvector_centrality_np',
    'q2_squared_cluster_tony',
    'q2_triangles_cluster',
    'sum_cluster_network_weighted',
]
X_test = X_test.drop(to_drop, axis = 1)
print(X_test.shape)

# Persist the assembled matrix so the prediction cells can reload it directly.
X_test.to_pickle('Xtest_866BestColsDropped.pkl')

# Release the intermediate frames before the memory-heavy prediction step.
del test_interactions, new_set, new_networks
gc.collect()

In [3]:
# Root directory holding one sub-directory of saved fold models per XGBoost
# configuration.
src = '/media/w/1c392724-ecf3-4615-8f3c-79368ec36380/DS Projects/Kaggle/Quora/scripts/models/saved_models/XGB/SKF/'

# Load the assembled test matrix and wrap it in a DMatrix once, so the same
# object can be passed to every predict_test_xgb_fold call.
X_test = xgb.DMatrix(pd.read_pickle('Xtest_866BestColsDropped.pkl'))

# Model directories; each keeps its trailing slash.
xgb1 = os.path.join(src, '866cols_xgbparams1/')
xgb2 = os.path.join(src, '866cols_xgbparams2/')
xgb3 = os.path.join(src, '866cols_xgbparams3/')
xgb4 = os.path.join(src, '866cols_xgbparams4/')

xgbnn = os.path.join(src, 'newNetworks_currentBest/')


In [6]:
# Average the fold models stored under `xgbnn` on the shared test DMatrix; the
# submission CSV is named after that directory. Returns None.
predict_test_xgb_fold(xgbnn, X_test)


  0%|          | 0/10 [00:00<?, ?it/s]

Predicting on test set with XGBoost.


[A
100%|██████████| 10/10 [16:28<00:00, 97.66s/it]


In [None]:
# Produce one fold-averaged submission per XGBoost parameter set, in order.
for fold_dir in (xgb1, xgb2, xgb3, xgb4):
    predict_test_xgb_fold(fold_dir, X_test)