In [1]:
import numpy as np
import pandas as pd

import itertools
import time
from datetime import datetime

from sklearn import preprocessing
from sklearn.cross_validation import KFold
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.cluster import KMeans
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import make_scorer, mean_absolute_error

from scipy.stats import skew, boxcox
from scipy.sparse import csr_matrix, hstack, spmatrix





In [2]:
def encode(charcode):
    """Turn a letter code into an integer, reading it as base-26 digits.

    Each character of ``str(charcode)`` contributes ``ord(ch) - ord('A') + 1``
    (so 'A' is 1 and 'Z' is 26), with the leftmost character being the most
    significant digit: 'A' -> 1, 'Z' -> 26, 'AA' -> 27.  An empty string
    yields 0.
    """
    value = 0
    for letter in str(charcode):
        # Horner form: shift the digits seen so far, then add the new one.
        value = value * 26 + (ord(letter) - ord('A') + 1)
    return value

# Not read by any cell visible in this notebook export.
shift = 200

# Categorical columns whose pairwise combinations are added as extra
# features when nn_load(interactions=True) is called.
COMB_FEATURE = [
    'cat80', 'cat87', 'cat57', 'cat12', 'cat79',
    'cat10', 'cat7', 'cat89', 'cat2', 'cat72',
    'cat81', 'cat11', 'cat1', 'cat13', 'cat9',
    'cat3', 'cat16', 'cat90', 'cat23', 'cat36',
    'cat73', 'cat103', 'cat40', 'cat28', 'cat111',
    'cat6', 'cat76', 'cat50', 'cat5', 'cat4',
    'cat14', 'cat38', 'cat24', 'cat82', 'cat25',
]

In [3]:
def output(name_train, name_test, files_train, files_test):
    """Write stacked train/test prediction columns to CSV files.

    Parameters
    ----------
    name_train, name_test : str
        File names (without the ``.csv`` extension) for the two outputs.
    files_train, files_test : sequence of 2-D arrays
        Arrays that are concatenated column-wise (``axis=1``) into one table
        per data set.

    Both tables are written to ``Good_preds/Sklearn/<name>.csv`` without the
    index column.  The directory is created when it does not exist, so a
    long training run is not lost to a missing folder.  Returns None.
    """
    import os  # local import: the notebook's import cell does not provide it

    out_dir = 'Good_preds/Sklearn'

    # Build both tables before touching the disk so a shape error in either
    # input leaves no partial output behind.
    train_preds = pd.DataFrame(np.concatenate(files_train, axis = 1))
    test_preds = pd.DataFrame(np.concatenate(files_test, axis = 1))

    # DataFrame.to_csv does not create missing parent directories.
    if not os.path.isdir(out_dir):
        os.makedirs(out_dir)

    train_preds.to_csv('{}/{}.csv'.format(out_dir, name_train), index = False)
    test_preds.to_csv('{}/{}.csv'.format(out_dir, name_test), index = False)


In [4]:
# Identifier column; nn_load() reads it and drops it from both frames.
ID = 'id'
# Target column; nn_load() drops it from the training frame.
TARGET = 'loss'
# Number of cross-validation folds (assigned again, to the same value, in a later cell).
NFOLDS = 5
# NOTE(review): overwritten with 2016 in a later cell and not read by any code visible here.
SEED = 669
# Passed to pd.read_csv(nrows=...) by nn_load().
NROWS = None
# Directory that nn_load() expects to contain train.csv and test.csv.
DATA_DIR = "../.."

def nn_load(interactions = False):
    """Read the train/test CSVs and build sparse design matrices.

    The following steps run on the row-wise concatenation of train and test:

    1. Optionally add one column per pair of COMB_FEATURE columns: the two
       string columns are concatenated (no separator) and factorized.
    2. Re-encode the original columns whose name contains 'cat' with encode().
    3. One-hot encode every column whose name contains 'cat' (first level
       dropped); this includes the interaction columns from step 1.
    4. Scale every column whose name contains 'cont' with RobustScaler,
       fitted on train and test together.

    Progress messages and shapes are printed along the way.

    Parameters
    ----------
    interactions : bool
        When true, add the 2-way COMB_FEATURE combinations (step 1).

    Returns
    -------
    x_train, x_test : scipy.sparse CSR matrices
        The stacked features split back at the train/test boundary.
    y_train : numpy.ndarray
        Values of the TARGET column from the training file.
    """
    train = pd.read_csv("{0}/train.csv".format(DATA_DIR), nrows=NROWS)
    test = pd.read_csv("{0}/test.csv".format(DATA_DIR), nrows=NROWS)

    y_train = train[TARGET].values

    train = train.drop([ID, TARGET], axis=1)
    test = test.drop([ID], axis=1)

    print("{},{}".format(train.shape, test.shape))

    ntrain = train.shape[0]
    train_test = pd.concat((train, test)).reset_index(drop=True)

    # Captured before interactions are added: only these original columns
    # go through encode(); interaction columns are already integer codes.
    base_cats = [feat for feat in train.columns if 'cat' in feat]

    if interactions:
        print('2-way feature combinations.')
        for first, second in itertools.combinations(COMB_FEATURE, 2):
            feat = first + "_" + second
            combined = train_test[first] + train_test[second]
            train_test[feat] = pd.factorize(combined, sort=True)[0]
        print("2-way interactions done.")
    print(train_test.shape)

    for feat in base_cats:
        train_test[feat] = train_test[feat].apply(encode)
    print("\nFeatures for NN encoded")

    features = train_test.columns
    cats = [feat for feat in features if 'cat' in feat]

    sparse_data = []
    for feature in cats:
        dummy = pd.get_dummies(train_test[feature].astype("category"), drop_first = True)
        sparse_data.append(csr_matrix(dummy))

    nums = [feat for feat in features if "cont" in feat]
    scaler = RobustScaler()
    sparse_data.append(csr_matrix(scaler.fit_transform(train_test[nums])))

    # Release the dense frame before assembling the wide sparse matrix.
    del train_test

    x_traintest = hstack(sparse_data, format = "csr")
    x_train = x_traintest[:ntrain, :]
    x_test = x_traintest[ntrain:, :]

    print(x_train.shape)
    print(x_test.shape)

    del x_traintest, sparse_data

    # The trailing space and blank lines reproduce the original statement's output.
    print("\nNN dataset loaded. \n")

    return x_train, x_test, y_train

In [5]:
# Build the feature matrices with the 2-way categorical interactions enabled.
X1, X_te1, y1 = nn_load(interactions = True)

(188318, 130),(125546, 130)
2-way feature combinations.
2-way interactions done.
(313864, 725)

Features for NN encoded
(188318, 6303)
(125546, 6303)

NN dataset loaded. 



In [7]:
SEED = 2016
NFOLDS = 5

# Row counts of the training and test matrices.
ntrain, ntest = X1.shape[0], X_te1.shape[0]

# Shuffled K-fold splitter over the training rows (legacy
# sklearn.cross_validation API: constructed from the row count and iterated directly).
kf = KFold(ntrain, n_folds = NFOLDS, shuffle = True, random_state = 111)

# Single 80/20 hold-out split used for the quick model checks below.
X_tr1, X_val1, y_tr1, y_val1 = train_test_split(
    X1, y1,
    test_size = 0.2,
    random_state = 111,
)

#### With interactions:

VW: 1308.11978541

MCMC: 1206

In [None]:
from vowpalwabbit.sklearn_vw import VWRegressor, VW

# Fit Vowpal Wabbit on the training part of the hold-out split and report
# its mean absolute error on the validation part.
# NOTE(review): `l` looks like VW's learning-rate option; 2000 is unusually large — confirm it is intended.
model = VWRegressor(l = 2000, random_seed = 2016)
model.fit(X_tr1, y_tr1)

preds_vw = model.predict(X_val1)
# Call form prints the same text as the old statement and also parses on Python 3.
print(mean_absolute_error(y_val1, preds_vw))

In [11]:
from fastFM import sgd, als, mcmc
from scipy import sparse

# Factorization-machine regressors: one fitted by MCMC, one by ALS.
# Both share iteration count, init scale, rank and seed; ALS also gets an
# L2 penalty on the linear weights.
fm_mcmc = mcmc.FMRegression(
    n_iter = 200,
    init_stdev = 0.1,
    rank = 8,
    random_state = 2016,
)
fm_als = als.FMRegression(
    n_iter = 200,
    init_stdev = 0.1,
    rank = 8,
    random_state = 2016,
    l2_reg_w = 0.05,
)

In [12]:
# Fit the ALS factorization machine on the training part of the hold-out
# split and report its mean absolute error on the validation part.
fm_als.fit(X_tr1, y_tr1)
preds_als = fm_als.predict(X_val1)

# Call form prints the same text as the old statement and also parses on Python 3.
print(mean_absolute_error(y_val1, preds_als))

1588.06178287


In [None]:
# The MCMC model fits and predicts in one call (training data plus the
# matrix to predict); report mean absolute error on the validation part.
preds_mcmc2 = fm_mcmc.fit_predict(X_tr1, y_tr1, X_val1)
# Call form prints the same text as the old statement and also parses on Python 3.
print(mean_absolute_error(y_val1, preds_mcmc2))

In [16]:
def oof_VW(clf, X, y, X_test = None, folds = None):
    """Out-of-fold train predictions and fold-averaged test predictions.

    For each ``(train_index, test_index)`` pair, ``clf`` is refit on the
    training rows. The held-out rows are predicted once, and that single
    prediction is both stored and scored. The test matrix is then predicted
    with the same fitted model.

    Parameters
    ----------
    clf : estimator exposing ``fit(X, y)`` and ``predict(X)``.
    X : row-indexable feature matrix (the notebook passes a scipy CSR matrix).
    y : 1-D numpy array of targets aligned with the rows of ``X``.
    X_test : matrix predicted in every fold; defaults to the notebook-level
        ``X_te1``.
    folds : iterable of ``(train_index, test_index)`` pairs; defaults to the
        notebook-level ``kf``.

    Returns
    -------
    (oof_train, oof_test) : arrays of shape ``(n_train, 1)`` and
        ``(n_test, 1)``; ``oof_test`` is the mean of the per-fold test
        predictions.
    """
    if X_test is None:
        X_test = X_te1
    # Materialise the splits once so the number of folds is known up front
    # and always matches the rows allocated below.
    folds = list(kf if folds is None else folds)

    t = time.time()
    oof_train = np.zeros((X.shape[0],))
    oof_test_skf = np.empty((len(folds), X_test.shape[0]))

    for i, (train_index, test_index) in enumerate(folds):
        y_te = y[test_index]

        clf.fit(X[train_index], y[train_index])

        # Predict the held-out rows once and reuse the result.
        preds = clf.predict(X[test_index])
        oof_train[test_index] = preds
        oof_test_skf[i, :] = clf.predict(X_test)

        # Single-argument print() behaves the same on Python 2 and 3; the
        # double space matches the output of the original print statements.
        print("Done for fold:  {}".format(i + 1))
        print("Fold MAE:  {}".format(mean_absolute_error(y_te, preds)))

    oof_test = oof_test_skf.mean(axis=0)

    print("Full training took:  {}".format(time.time() - t))
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)

def oof_FM(clf, X, y, X_test = None, folds = None):
    """Out-of-fold predictions for models that fit and predict in one call.

    For each ``(train_index, test_index)`` pair the passed-in ``clf`` is used
    (previously the argument was ignored in favour of the global
    ``fm_mcmc``). ``clf.fit_predict(x_train, y_train, x_to_predict)`` is
    called twice: once for the test matrix and once for the held-out rows.

    Parameters
    ----------
    clf : estimator exposing ``fit_predict(X_train, y_train, X_predict)``.
    X : row-indexable feature matrix (the notebook passes a scipy CSR matrix).
    y : 1-D numpy array of targets aligned with the rows of ``X``.
    X_test : matrix predicted in every fold; defaults to the notebook-level
        ``X_te1``.
    folds : iterable of ``(train_index, test_index)`` pairs; defaults to the
        notebook-level ``kf``.

    Returns
    -------
    (oof_train, oof_test) : arrays of shape ``(n_train, 1)`` and
        ``(n_test, 1)``; ``oof_test`` is the mean of the per-fold test
        predictions.
    """
    if X_test is None:
        X_test = X_te1
    # Materialise the splits once so the number of folds is known up front
    # and always matches the rows allocated below.
    folds = list(kf if folds is None else folds)

    t = time.time()
    oof_train = np.zeros((X.shape[0],))
    oof_test_skf = np.empty((len(folds), X_test.shape[0]))

    for i, (train_index, test_index) in enumerate(folds):
        x_tr = X[train_index]
        y_tr = y[train_index]

        # NOTE(review): presumably each fit_predict call runs a full fit, so
        # every fold is trained twice; predicting a vertically stacked
        # [X_test; held-out] matrix in one call may halve the run time —
        # confirm the predictions are unchanged before switching.
        oof_test_skf[i, :] = clf.fit_predict(x_tr, y_tr, X_test)
        oof_train[test_index] = clf.fit_predict(x_tr, y_tr, X[test_index])

        # Single-argument print() behaves the same on Python 2 and 3; the
        # double space matches the output of the original print statement.
        print("Done for fold:  {}".format(i + 1))

    oof_test = oof_test_skf.mean(axis=0)

    print("Full training took:  {}".format(time.time() - t))
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)

In [None]:
# Out-of-fold VW predictions, written to CSV through output().
vw_tr, vw_te = oof_VW(model, X1, y1)
tr, te = [vw_tr], [vw_te]

output(
    'train1_VW_interactions_7.12',
    'test1_VW_interactions_7.12',
    tr,
    te,
)

In [17]:
# Out-of-fold factorization-machine (MCMC) predictions, written to CSV through output().
fm_tr, fm_te = oof_FM(fm_mcmc, X1, y1)
tr, te = [fm_tr], [fm_te]

output(
    'train1_FFM-MCMC_interactions_7.12',
    'test1_FFM-MCMC_interactions_7.12',
    tr,
    te,
)

Done for fold:  1
Done for fold:  2
Done for fold:  3
Done for fold:  4
Done for fold:  5
Full training took:  16561.388062
