In [None]:
import pandas as pd
import numpy as np
import nltk
import multiprocessing
import difflib
import time
import gc
import category_encoders as ce
import itertools

import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from scipy import sparse
from fastFM import als, sgd
from vowpalwabbit.sklearn_vw import VWClassifier, VWRegressor
from collections import Counter
from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import PCA, TruncatedSVD, LatentDirichletAllocation


In [None]:
def tf_idf(df_full, dftr):
    """Fit a TF-IDF vocabulary on all question pairs and vectorize the pairs in `dftr`.

    Parameters
    ----------
    df_full : DataFrame with `question1` and `question2` columns; every row
        contributes one document (the two questions joined by a space) to the
        corpus the vectorizer is fitted on.
    dftr : DataFrame with `question1` and `question2` columns to transform.

    Returns
    -------
    scipy sparse matrix in CSR format with one row per row of `dftr`; the
    TF-IDF columns of question1 come first, followed by those of question2.
    """
    def _as_text(column):
        # Missing questions become empty strings; the vectorizer rejects NaN documents.
        return column.fillna('').astype(str)

    tf = TfidfVectorizer(stop_words = 'english', min_df = 1, max_df = 0.999)
    # Join with a space: plain concatenation glued the last word of question1
    # to the first word of question2, polluting the vocabulary with fused tokens.
    tf.fit(_as_text(df_full.question1) + ' ' + _as_text(df_full.question2))
    q1_tfidf = tf.transform(_as_text(dftr.question1).values)
    q2_tfidf = tf.transform(_as_text(dftr.question2).values)
    # Ask for CSR explicitly so the result can be row-indexed (e.g. by fold indices).
    tr_tfidf = sparse.hstack([q1_tfidf, q2_tfidf], format = 'csr')
    print('Final shape:', tr_tfidf.shape)
    return tr_tfidf

def model_foldrun(model, X, y, name, X_test = None):
    """Cross-validate `model` with 10 stratified folds and pickle its predictions.

    For every fold the model is re-fitted on the training part, scored with
    log loss on the held-out part, and the held-out predictions are stored in
    an out-of-fold vector. When `X_test` is given, the model fitted in that
    fold also predicts the test set and the result is pickled per fold.

    Parameters
    ----------
    model : estimator exposing `fit(X, y)` and `predict(X)`; predictions must
        hold one value per row.
    X : DataFrame or row-indexable array of training features.
    y : labels as an array, a Series, or a DataFrame with an `is_duplicate` column.
    name : str used in the output file names and column labels.
    X_test : optional DataFrame or array of test features.

    Files written (relative to the current working directory)
    ---------------------------------------------------------
    test_{name}preds_fold{i} : test predictions of the model fitted in fold i
        (only when `X_test` is not None).
    train_{name}preds : out-of-fold predictions for every training row.

    Returns
    -------
    None
    """
    skf = StratifiedKFold(n_splits = 10, random_state = 111, shuffle = True)
    if isinstance(X, pd.DataFrame):
        X = X.values
    if isinstance(y, pd.DataFrame):
        y = y['is_duplicate'].values
    if isinstance(y, pd.Series):
        y = y.values
    if isinstance(X_test, pd.DataFrame):
        X_test = X_test.values
    print('Running model:', model)
    pred_columns = ['{}_feat1'.format(name)]
    losses = []
    oof_train = np.zeros((X.shape[0],))
    for i, (tr_index, val_index) in enumerate(skf.split(X, y), start = 1):
        X_tr, X_val = X[tr_index], X[val_index]
        y_tr, y_val = y[tr_index], y[val_index]
        t = time.time()
        model.fit(X_tr, y_tr)
        # Flatten so that (n, 1)-shaped outputs fit the 1-D out-of-fold vector.
        val_pred = np.asarray(model.predict(X_val)).ravel()
        score = log_loss(y_val, val_pred)
        losses.append(score)
        oof_train[val_index] = val_pred
        print('Final score for fold {} :'.format(i), score, '\n',
              'Time it took to train and predict on fold:', time.time() - t, '\n')
        if X_test is not None:
            print('Predicting test set.')
            test_preds = np.asarray(model.predict(X_test)).ravel()
            test_preds = pd.DataFrame(test_preds, columns = pred_columns)
            test_preds.to_pickle('test_{}preds_fold{}'.format(name, i))
            del test_preds
            gc.collect()
    print('Mean log loss over {} folds:'.format(len(losses)), np.mean(losses))
    # Persist the predictions of all folds, not just the last validation split.
    # Rows are in the positional order of `X`.
    oof_preds = pd.DataFrame(oof_train, columns = pred_columns)
    oof_preds.to_pickle('train_{}preds'.format(name))
    return
        

In [None]:
# NOTE: absolute, machine-specific locations. Point these at your own copy of
# the project directories before running the notebook.
src = '/media/w/1c392724-ecf3-4615-8f3c-79368ec36380/DS Projects/Kaggle/Quora/scripts/features/'
feats_src = '/media/w/1c392724-ecf3-4615-8f3c-79368ec36380/DS Projects/Kaggle/Quora/data/features/uncleaned/'
trans_src = '/media/w/1c392724-ecf3-4615-8f3c-79368ec36380/DS Projects/Kaggle/Quora/data/features/lemmatized_fullclean/transformations/'

# Base feature matrix, the frame that carries the `is_duplicate` label, and the
# interaction features. Unpickling runs code from the file: only load pickles
# you produced yourself.
X_train = pd.read_pickle('Xtrain_814colsBest.pkl', compression = 'bz2')
xgb_feats = pd.read_csv(feats_src + '/the_1owl/owl_train.csv')
train_interactions = pd.read_pickle(src + 'train_tony_interaction_gru.pkl')

X_train = pd.concat([X_train, train_interactions], axis = 1)
X_train = X_train.astype('float32')
y_train = xgb_feats[['is_duplicate']]
# Swap non-finite values for sentinels. Negative infinity is mapped as well;
# previously only +inf was replaced and -inf stayed in the matrix.
X_train = X_train.fillna(-999).replace([np.inf, -np.inf], [999, -999])
assert len(X_train) == len(y_train), 'feature rows and labels are not aligned'


# TF-IDF features: the vocabulary is fitted on train + test questions, but only
# the training questions are transformed.
dftr2 = pd.read_csv(src + 'df_train_lemmatfullcleanSTEMMED.csv')[['question1', 'question2']]
dfte2 = pd.read_csv(src + 'df_test_lemmatfullcleanSTEMMED.csv')[['question1', 'question2']]
df_full = pd.concat((dftr2, dfte2))
X_traintf = tf_idf(df_full, dftr2)

# Free the large intermediates; the kernel keeps them alive otherwise.
del xgb_feats, df_full, dftr2, dfte2
gc.collect()

In [None]:
vw = VWRegressor()

# No test-set feature matrix is built anywhere in this notebook, so passing an
# `X_test` variable here raised NameError on a fresh kernel. Run cross-validation
# only; once test features are loaded, pass them as the fifth argument to also
# write per-fold test predictions.
model_foldrun(vw, X_train, y_train, 'VWRegressor1stRun')