In [None]:
import gc
import os
import time
import warnings

# Silence warnings before the third-party imports so import-time warnings stay suppressed.
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
from scipy import sparse

from fastFM import als, sgd
from vowpalwabbit.sklearn_vw import VWClassifier, VWRegressor
from sklearn.linear_model import LogisticRegression, Ridge
from sklearn.ensemble import AdaBoostClassifier, ExtraTreesClassifier, GradientBoostingClassifier, RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import log_loss
from sklearn.model_selection import StratifiedKFold

In [None]:
def model_sklearn_foldrun(model, X, y, name, X_test = None, n_splits = 10):
    """Run stratified K-fold CV and pickle two-column out-of-fold predictions.

    For every fold the model is re-fitted on the training part, scored with
    log loss on the held-out part, and (when ``X_test`` is given) used to
    predict the test set. Held-out predictions fill the train frame; test
    predictions are averaged over the folds.

    Parameters
    ----------
    model : estimator exposing ``fit`` and ``predict_proba`` / ``predict``.
    X : training features; a DataFrame is converted to its ``.values``.
    y : labels; a DataFrame contributes its ``is_duplicate`` column, a Series
        its ``.values``.
    name : tag used in the output file names and column names. If it contains
        ``'sklearn'`` or ``'Class'`` the model is queried with
        ``predict_proba``, otherwise with ``predict``.
    X_test : optional test features. When omitted, no test file is written.
    n_splits : number of stratified folds (default 10).

    Side effects
    ------------
    Writes ``OOF_preds/train/train_preds_<name>.pkl`` and, if ``X_test`` is
    given, ``OOF_preds/test/test_preds_<name>.pkl``. Both frames carry the
    columns ``<name>_prob0`` and ``<name>_prob1``. Returns ``None``.
    """
    skf = StratifiedKFold(n_splits = n_splits, random_state = 111, shuffle = True)
    if isinstance(X, pd.DataFrame):
        X = X.values
    if isinstance(X_test, pd.DataFrame):
        # The model is fitted on a bare array, so predict on one as well.
        X_test = X_test.values
    if isinstance(y, pd.DataFrame):
        y = y.is_duplicate.values
    if isinstance(y, pd.Series):
        y = y.values
    print('Running model:', model)

    train_path = 'OOF_preds/train/train_preds_{}.pkl'.format(name)
    test_path = 'OOF_preds/test/test_preds_{}.pkl'.format(name)
    # Create the output folders up front so a missing directory fails (or is
    # fixed) before any training time is spent.
    os.makedirs(os.path.dirname(train_path), exist_ok = True)
    if X_test is not None:
        os.makedirs(os.path.dirname(test_path), exist_ok = True)

    use_proba = 'sklearn' in name or 'Class' in name
    columns = ['{}_prob0'.format(name), '{}_prob1'.format(name)]

    def predict_two_columns(data):
        preds = model.predict_proba(data) if use_proba else model.predict(data)
        preds = np.asarray(preds)
        if preds.ndim == 1:
            # A 1-D prediction is read as P(class 1), the same way log_loss
            # interprets it; expand so it fits the two-column layout.
            preds = np.column_stack((1.0 - preds, preds))
        return preds

    losses = []
    # Buffers are sized from the data instead of fixed row counts.
    oof_train = np.zeros((X.shape[0], 2))
    # A running sum needs one test-sized buffer instead of one per fold.
    test_sum = np.zeros((X_test.shape[0], 2)) if X_test is not None else None

    for i, (tr_index, val_index) in enumerate(skf.split(X, y)):
        X_tr, X_val = X[tr_index], X[val_index]
        y_tr, y_val = y[tr_index], y[val_index]
        t = time.time()
        model.fit(X_tr, y_tr)
        val_pred = predict_two_columns(X_val)
        score = log_loss(y_val, val_pred)
        losses.append(score)
        oof_train[val_index, :] = val_pred
        if X_test is not None:
            print('Predicting test set.')
            test_sum += predict_two_columns(X_test)
        print('Final score for fold {} :'.format(i), score, '\n',
              'Time it took to train and predict on fold:', time.time() - t, '\n')

    print('Mean log loss over {} folds:'.format(len(losses)), np.mean(losses))

    # Name the columns before pickling so the train file matches the test file.
    oof_train = pd.DataFrame(oof_train, columns = columns)
    oof_train.to_pickle(train_path)
    if X_test is None:
        print(oof_train.shape)
        return
    oof_test = pd.DataFrame(test_sum / n_splits, columns = columns)
    oof_test.to_pickle(test_path)
    print(oof_train.shape, oof_test.shape)
    return

def model_foldrun(model, X, y, name, X_test = None, n_splits = 10):
    """Run stratified K-fold CV and pickle single-column out-of-fold predictions.

    For every fold the model is re-fitted on the training part, its
    ``predict`` output on the held-out part is scored with log loss, and
    (when ``X_test`` is given) it predicts the test set. Held-out
    predictions fill the train frame; test predictions are averaged over
    the folds.

    Parameters
    ----------
    model : estimator exposing ``fit`` and ``predict``.
    X : training features; a DataFrame is converted to its ``.values``.
    y : labels; a DataFrame contributes its ``is_duplicate`` column, a Series
        its ``.values``.
    name : tag used in the output file names and in the ``<name>_prob`` column.
    X_test : optional test features. When omitted, no test file is written.
    n_splits : number of stratified folds (default 10).

    Side effects
    ------------
    Writes ``OOF_preds/train/train_preds_<name>.pkl`` and, if ``X_test`` is
    given, ``OOF_preds/test/test_preds_<name>.pkl``. Returns ``None``.
    """
    skf = StratifiedKFold(n_splits = n_splits, random_state = 111, shuffle = True)
    if isinstance(X, pd.DataFrame):
        X = X.values
    if isinstance(X_test, pd.DataFrame):
        # The model is fitted on a bare array, so predict on one as well.
        X_test = X_test.values
    if isinstance(y, pd.DataFrame):
        y = y.is_duplicate.values
    if isinstance(y, pd.Series):
        y = y.values
    print('Running model:', model)

    train_path = 'OOF_preds/train/train_preds_{}.pkl'.format(name)
    test_path = 'OOF_preds/test/test_preds_{}.pkl'.format(name)
    # Create the output folders up front so a missing directory fails (or is
    # fixed) before any training time is spent.
    os.makedirs(os.path.dirname(train_path), exist_ok = True)
    if X_test is not None:
        os.makedirs(os.path.dirname(test_path), exist_ok = True)

    columns = ['{}_prob'.format(name)]
    losses = []
    # Buffers are sized from the data instead of fixed row counts.
    oof_train = np.zeros(X.shape[0])
    # A running sum needs one test-sized buffer instead of one per fold.
    test_sum = np.zeros(X_test.shape[0]) if X_test is not None else None

    for i, (tr_index, val_index) in enumerate(skf.split(X, y)):
        X_tr, X_val = X[tr_index], X[val_index]
        y_tr, y_val = y[tr_index], y[val_index]
        t = time.time()
        model.fit(X_tr, y_tr)
        val_pred = model.predict(X_val)
        score = log_loss(y_val, val_pred)
        losses.append(score)
        oof_train[val_index] = val_pred
        if X_test is not None:
            print('Predicting test set.')
            test_sum += model.predict(X_test)
        print('Final score for fold {} :'.format(i), score, '\n',
              'Time it took to train and predict on fold:', time.time() - t, '\n')

    print('Mean log loss over {} folds:'.format(len(losses)), np.mean(losses))

    oof_train = pd.DataFrame(oof_train, columns = columns)
    oof_train.to_pickle(train_path)
    if X_test is None:
        print(oof_train.shape)
        return
    oof_test = pd.DataFrame(test_sum / n_splits, columns = columns)
    oof_test.to_pickle(test_path)
    print(oof_train.shape, oof_test.shape)
    return


In [None]:
# Absolute, machine-specific directories; none of them is referenced by the
# loading code in this cell.
src = '/media/w/1c392724-ecf3-4615-8f3c-79368ec36380/DS Projects/Kaggle/Quora/scripts/features/'
feats_src = '/media/w/1c392724-ecf3-4615-8f3c-79368ec36380/DS Projects/Kaggle/Quora/data/features/uncleaned/'
trans_src = '/media/w/1c392724-ecf3-4615-8f3c-79368ec36380/DS Projects/Kaggle/Quora/data/features/lemmatized_fullclean/transformations/'

# Switch between the two feature sets. Only one pair of variables is defined:
# X_train / X_test when False, X_traintf / X_testtf when True.
tfidf = False
if not tfidf:
    # Loaded from the working directory and cast to float32 in one step.
    X_train = pd.read_pickle('Xtrain_825cols_othermodels.pkl').astype('float32')
    X_test = pd.read_pickle('Xtest_825cols_othermodels.pkl').astype('float32')
else:
    X_traintf = pd.read_pickle('Xtrain_TFIDFstem_othermodels.pkl')
    X_testtf = pd.read_pickle('Xtest_TFIDFstem_othermodels.pkl')

# Labels are loaded regardless of the feature set chosen above.
y_train = pd.read_pickle('y_train.pkl')

In [None]:
# Linear / online learners, all with library-default settings.
vw = VWRegressor()
lr = LogisticRegression()
ridge = Ridge()

# Neighbour- and tree-based classifiers.
knn = KNeighborsClassifier(n_neighbors = 10, n_jobs = 8)
# Seeded like the other ensembles below so repeated runs give the same model.
ada = AdaBoostClassifier(random_state=111)
gb = GradientBoostingClassifier(n_estimators=200, subsample=0.85, min_samples_split=10, min_samples_leaf=5, 
                                min_weight_fraction_leaf=0.0, max_depth=6, random_state=111)
et = ExtraTreesClassifier(n_estimators=400, min_samples_split=6, min_samples_leaf=10, 
                                min_weight_fraction_leaf=0.0, max_depth=8, random_state=111, n_jobs=9)
rf = RandomForestClassifier(n_estimators=450, min_samples_split=4, min_samples_leaf=8, 
                                min_weight_fraction_leaf=0.0, max_depth=9, random_state=111, n_jobs=9)


In [None]:
# X_traintf / X_testtf are only loaded when `tfidf` is True. Skip these runs
# otherwise so a top-to-bottom execution does not stop on a NameError.
if tfidf:
    model_sklearn_foldrun(lr, X_traintf, y_train, 'LogRegClassifier_TFIDF', X_testtf)
    model_foldrun(vw, X_traintf, y_train, 'VWRegressor_TFIDF', X_testtf)
else:
    print('tfidf is False: skipping the TF-IDF model runs.')

In [None]:
# X_train / X_test are only loaded when `tfidf` is False. Skip these runs
# otherwise so a top-to-bottom execution does not stop on a NameError.
# The et / rf / gb runs stay disabled, exactly as they were.
if not tfidf:
    #model_sklearn_foldrun(et, X_train, y_train, 'ExtraTreesClassifier_xgbfeats', X_test)
    #model_sklearn_foldrun(rf, X_train, y_train, 'RandomForestClassifier_xgbfeats', X_test)
    #model_sklearn_foldrun(gb, X_train, y_train, 'GradientBoostingClassifier_xgbfeats', X_test)
    model_sklearn_foldrun(ada, X_train, y_train, 'AdaBoostClassifier_xgbfeats', X_test)
    model_sklearn_foldrun(knn, X_train, y_train, 'KNNClassifier10nn_xgbfeats', X_test)
else:
    print('tfidf is True: skipping the dense-feature model runs.')