In [1]:
import pandas as pd
import numpy as np
import nltk
import multiprocessing
import difflib
import time
import gc
import category_encoders as ce
import itertools

import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from scipy import sparse
from fastFM import als, sgd
from vowpalwabbit.sklearn_vw import VWClassifier, VWRegressor
from collections import Counter
from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import PCA, TruncatedSVD, LatentDirichletAllocation

from models_utils_fe import *

In [2]:
def tf_idf(df_full, dftr):
    """Build a sparse TF-IDF feature matrix for question pairs.

    The vectorizer vocabulary and IDF weights are fitted on one document per
    row of ``df_full`` (question1 and question2 joined by a space). Each
    question of ``dftr`` is then transformed separately and the two matrices
    are stacked side by side.

    Parameters
    ----------
    df_full : DataFrame with ``question1`` and ``question2`` columns used to
        fit the vectorizer.
    dftr : DataFrame with ``question1`` and ``question2`` columns to transform.

    Returns
    -------
    scipy sparse matrix with one row per row of ``dftr``; the first half of
    the columns holds the question1 weights, the second half question2.
    """
    def _as_text(series):
        # Missing questions become empty documents; TfidfVectorizer rejects NaN.
        return series.fillna('').astype(str)

    tf = TfidfVectorizer(stop_words = 'english', min_df = 1, max_df = 0.999)
    # The explicit space keeps the last word of question1 from fusing with the
    # first word of question2 into a token that exists in neither question.
    tf.fit(_as_text(df_full.question1) + ' ' + _as_text(df_full.question2))
    q1_tfidf = tf.transform(_as_text(dftr.question1).values)
    q2_tfidf = tf.transform(_as_text(dftr.question2).values)
    tr_tfidf = sparse.hstack([q1_tfidf, q2_tfidf])
    print('Final shape:', tr_tfidf.shape)
    return tr_tfidf

In [3]:
src = '/media/w/1c392724-ecf3-4615-8f3c-79368ec36380/DS Projects/Kaggle/Quora/scripts/features/'
feats_src = '/media/w/1c392724-ecf3-4615-8f3c-79368ec36380/DS Projects/Kaggle/Quora/data/features/uncleaned/'

# Precomputed feature tables, all cast to float32. The order of this list is
# the order in which their columns are appended to X_train.
float_feature_files = [
    'train_WMD_cleaned_stemmed.csv',
    'train_skipthoughts_Alex_distances.csv',
    'train_LZMAcompression_distance.csv',
    'train_EDITdistance.csv',
    'train_doc2vec_moments.csv',
    'train_networkfeats_NER.csv',
]
extra_feats = [pd.read_csv(src + fname).astype('float32') for fname in float_feature_files]
# Only the WMD table (first entry) has its infinite values capped, at 1000.
extra_feats[0] = extra_feats[0].replace(np.inf, 1000)

# Only the label column is needed from this file, so avoid parsing the rest.
y_train = pd.read_csv(feats_src + '/the_1owl/owl_train.csv', usecols = ['is_duplicate'])

# NOTE(review): read_pickle executes arbitrary code from the file; only load
# pickles produced by this project.
X_train = pd.concat([pd.read_pickle('Xtrain_500bestCols.pkl')] + extra_feats, axis = 1)

# Release the per-file frames now that they have been copied into X_train.
del extra_feats
gc.collect()

# Columns used as the basis for the engineered (binned / grouped / combined)
# features built below.
best_cols = [
    'min_pagerank_sp_network_weighted',
    'norm_wmd',
    'word_match',
    '1wl_tfidf_l2_euclidean',
    'm_vstack_svd_q1_q1_euclidean',
    '1wl_tfidf_cosine',
    'sk_bi_skew_q2vec',
    'm_q1_q2_tf_svd0',
    'sk_bi_skew_q1vec',
    'skew_q2vec',
    'trigram_tfidf_cosine',
    'sk_uni_skew_q2vec',
    'sk_bi_canberra_distance',
    'question1_3',
    'sk_uni_skew_q1vec',
    'sk_uni_kur_q2vec',
    'min_eigenvector_centrality_np_network_weighted',
    'avg_world_len2',
    'z_word_match',
    'sk_uni_kur_q1vec',
    'skew_doc2vec_pretrained_lemmat']

# Set to True to min-max scale the engineered features before appending them.
rescale = False

# bin_numerical, group_featbyfeat, feature_combinations and drop_duplicate_cols
# are not defined in this notebook; presumably they come from the
# `models_utils_fe` star import -- TODO confirm.
X_bin = bin_numerical(X_train, best_cols, 0.1)
X_grouped = group_featbyfeat(X_train, best_cols, 'mean')
X_grouped2 = group_featbyfeat(X_train, best_cols, 'sum')
# Combinations are built from the first five columns of best_cols only.
X_combinations = feature_combinations(X_train, best_cols[:5])


X_additional = pd.concat([X_bin, X_grouped, X_grouped2, X_combinations], axis = 1)
X_additional = drop_duplicate_cols(X_additional)
if rescale:
    # Imported here because the notebook never imports MinMaxScaler explicitly.
    from sklearn.preprocessing import MinMaxScaler
    # Keep the column names and the row index so the concat below, which aligns
    # on the index, still matches the rows of X_train.
    X_additional = pd.DataFrame(MinMaxScaler().fit_transform(X_additional),
                                columns = X_additional.columns,
                                index = X_additional.index)

X_train = pd.concat([X_train, X_additional], axis = 1)
print('Final training data shape:', X_train.shape)

# Make the feature matrix finite: NaN -> -999, +inf -> 999, -inf -> -999.
# Negative infinity is mapped as well so that no infinite value reaches the
# sparse matrix; it shares the -999 sentinel used for missing values.
X_train = X_train.astype('float32')
X_train = X_train.fillna(-999)
X_train = X_train.replace([np.inf, -np.inf], [999, -999])

# Stratified 80/20 split. The fixed random_state keeps this split reproducible.
labels = y_train.is_duplicate.values
X_tr, X_val, y_tr, y_val = train_test_split(sparse.csr_matrix(X_train), labels,
                                            stratify = labels,
                                            test_size = 0.2, random_state = 111)

# The engineered-feature frames are already part of X_train; free them.
del X_bin, X_grouped, X_grouped2, X_combinations, X_additional
gc.collect()



# Question text for the TF-IDF features: the train and test CSVs are both
# loaded, keeping only the two question columns.
src = '/media/w/1c392724-ecf3-4615-8f3c-79368ec36380/DS Projects/Kaggle/Quora/scripts/features/'
text_cols = ['question1', 'question2']
dftr2, dfte2 = (
    pd.read_csv(src + fname)[text_cols]
    for fname in ('df_train_lemmatfullcleanSTEMMED.csv', 'df_test_lemmatfullcleanSTEMMED.csv')
)

# The vectorizer is fitted on train + test text, then applied to train only.
df_full = pd.concat((dftr2, dfte2))
X_traintf = tf_idf(df_full, dftr2)

# Same labels, test_size and random_state as the split of the dense features.
labels = y_train.is_duplicate.values
X_trtf, X_valtf, y_trtf, y_valtf = train_test_split(
    X_traintf, labels,
    stratify = labels,
    test_size = 0.2,
    random_state = 111,
)

Groupings of ['min_pagerank_sp_network_weighted', 'norm_wmd', 'word_match', '1wl_tfidf_l2_euclidean', 'm_vstack_svd_q1_q1_euclidean', '1wl_tfidf_cosine', 'sk_bi_skew_q2vec', 'm_q1_q2_tf_svd0', 'sk_bi_skew_q1vec', 'skew_q2vec', 'trigram_tfidf_cosine', 'sk_uni_skew_q2vec', 'sk_bi_canberra_distance', 'question1_3', 'sk_uni_skew_q1vec', 'sk_uni_kur_q2vec', 'min_eigenvector_centrality_np_network_weighted', 'avg_world_len2', 'z_word_match', 'sk_uni_kur_q1vec', 'skew_doc2vec_pretrained_lemmat'] columns done.
Groupings of ['min_pagerank_sp_network_weighted', 'norm_wmd', 'word_match', '1wl_tfidf_l2_euclidean', 'm_vstack_svd_q1_q1_euclidean', '1wl_tfidf_cosine', 'sk_bi_skew_q2vec', 'm_q1_q2_tf_svd0', 'sk_bi_skew_q1vec', 'skew_q2vec', 'trigram_tfidf_cosine', 'sk_uni_skew_q2vec', 'sk_bi_canberra_distance', 'question1_3', 'sk_uni_skew_q1vec', 'sk_uni_kur_q2vec', 'min_eigenvector_centrality_np_network_weighted', 'avg_world_len2', 'z_word_match', 'sk_uni_kur_q1vec', 'skew_doc2vec_pretrained_lemmat'] 

In [5]:
# Persist the train / validation labels of the split (np.save appends '.npy').
np.save(file = 'y_tr', arr = y_tr)
np.save(file = 'y_val', arr = y_val)

In [None]:
# XGB Feats
# Fit Vowpal Wabbit on the dense feature split, report validation metrics and
# save the train / validation predictions (the log loss is part of the file name).

t = time.time()
vw = VWRegressor()
vw.fit(X_tr, y_tr)

# Predict on the training split only, so the saved train predictions line up
# with y_tr and contain no validation rows (as in the TF-IDF and combined cells).
train_pred = vw.predict(X_tr)
val_pred = vw.predict(X_val)

score = vw.score(X_val, y_val)
# A regressor's outputs are not guaranteed to lie in [0, 1]; clip them for the
# metric only. The saved predictions stay unclipped.
loss = log_loss(y_val, np.clip(np.asarray(val_pred, dtype = np.float64), 1e-15, 1 - 1e-15))
print('VW score:', score, 'Logloss score:', loss, '\n',
      'Time it took to train and predict:', time.time() - t)

np.save('model_predictions/train_VW_xgbfeats_{}'.format(loss), train_pred)
np.save('model_predictions/val_VW_xgbfeats_{}'.format(loss), val_pred)

In [None]:
# TF-IDF Feats
# Fit Vowpal Wabbit on the TF-IDF split, report validation metrics and save
# the train / validation predictions (the log loss is part of the file name).

t = time.time()
vw = VWRegressor()
vw.fit(X_trtf, y_trtf)

train_pred = vw.predict(X_trtf)
val_pred = vw.predict(X_valtf)

score = vw.score(X_valtf, y_valtf)
# A regressor's outputs are not guaranteed to lie in [0, 1]; clip them for the
# metric only. The saved predictions stay unclipped.
loss = log_loss(y_valtf, np.clip(np.asarray(val_pred, dtype = np.float64), 1e-15, 1 - 1e-15))
print('VW score:', score, 'Logloss score:', loss, '\n',
      'Time it took to train and predict:', time.time() - t)

np.save('model_predictions/train_VW_tfidffeats_{}'.format(loss), train_pred)
np.save('model_predictions/val_VW_tfidffeats_{}'.format(loss), val_pred)

In [None]:
# Combined feats
# Fit Vowpal Wabbit on the dense features and the TF-IDF features side by side.
# Both splits were made with the same labels, test_size and random_state, so
# their rows are expected to correspond.

X_trc = sparse.hstack([X_tr, X_trtf])
X_valc = sparse.hstack([X_val, X_valtf])

t = time.time()
vw = VWRegressor()
vw.fit(X_trc, y_tr)

train_pred = vw.predict(X_trc)
val_pred = vw.predict(X_valc)

score = vw.score(X_valc, y_val)
# A regressor's outputs are not guaranteed to lie in [0, 1]; clip them for the
# metric only. The saved predictions stay unclipped.
loss = log_loss(y_val, np.clip(np.asarray(val_pred, dtype = np.float64), 1e-15, 1 - 1e-15))
print('VW score:', score, 'Logloss score:', loss, '\n',
      'Time it took to train and predict:', time.time() - t)

np.save('model_predictions/train_VW_combinedfeats_{}'.format(loss), train_pred)
np.save('model_predictions/val_VW_combinedfeats_{}'.format(loss), val_pred)