In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.cross_validation import cross_val_score, StratifiedKFold
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import LogisticRegression
from fastFM import mcmc
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
import xgboost as xgb
from tqdm import *
from libtelepot import sendMessage
import gc
from sklearn.metrics import roc_auc_score

In [2]:
# Read the gzipped train/test CSVs, using the 'ID' column as the row index.
train, test = (
    pd.read_csv(csv_path, index_col='ID')
    for csv_path in ('data/train.csv.gz', 'data/test.csv.gz')
)

In [3]:
# Columns holding a single distinct value in train carry no signal;
# remove them from both frames so train and test stay aligned.
dlist = [col for col in train.columns if len(train[col].unique()) == 1]
for frame in (train, test):
    frame.drop(dlist, axis=1, inplace=True)

In [4]:
# Remove columns that are element-wise identical (in train) to an earlier
# column, keeping the first occurrence. Each column is compared only against
# the columns already kept, so every pair is checked at most once, and the
# duplicates are dropped from both frames in a single call instead of one
# in-place drop per duplicate.
kept_cols = []
dup_cols = []
for col in train.columns:
    col_values = train[col]
    # A column containing NaN never compares equal (NaN != NaN), so such
    # columns are always kept -- same as a plain `==` comparison would give.
    if any(np.all(col_values == train[kept]) for kept in kept_cols):
        dup_cols.append(col)
    else:
        kept_cols.append(col)
train.drop(dup_cols, axis=1, inplace=True)
test.drop(dup_cols, axis=1, inplace=True)

In [5]:
# Pull the label column out of the training frame (removing it in place)
# and keep it as an independent numpy array.
target = train.pop('TARGET').values.copy()

In [6]:
# Clamp every test feature to the [min, max] range observed in train.
for col in train.columns:
    lower, upper = train[col].min(), train[col].max()
    too_low = test[col] < lower
    test.loc[too_low, col] = lower
    too_high = test[col] > upper
    test.loc[too_high, col] = upper

In [7]:
# 10 stratified folds over the label array, reused by every model cell below.
# shuffle=False is the default and is spelled out here because it means
# random_state has no effect on the fold assignment.
skf = StratifiedKFold(target, n_folds=10, shuffle=False, random_state=42)

In [None]:
# Stacking matrices: one row per sample, one column per base model (7 slots).
train_clfs, test_clfs = (
    np.zeros((frame.shape[0], 7)) for frame in (train, test)
)

In [None]:
# RandomForest base model.
# For each CV fold, fit `n_seeds` forests (one per random_state), average
# their class-1 probabilities, and store:
#   - the out-of-fold average in train_clfs[:, 0]
#   - the test-set average (further averaged across folds) in test_clfs[:, 0]
n_seeds = 10  # forests fitted per fold
# One column per fold; sized from skf so the final mean stays correct if the
# number of folds is changed where skf is built.
tst_preds = np.zeros((test.shape[0], len(skf)))
for fold, (train_index, test_index) in enumerate(skf, 1):
    X_train, X_test = train.iloc[train_index].copy(), train.iloc[test_index].copy()
    y_train, y_test = target[train_index], target[test_index]

    sendMessage('Going through fold {:}'.format(fold))
    rfres = np.zeros((X_test.shape[0], n_seeds))
    rfres_test = np.zeros((test.shape[0], n_seeds))
    for st in range(n_seeds):
        rf = RandomForestClassifier(n_estimators=5000, max_depth=50, max_features=50,
                                    random_state=st, n_jobs=-1)
        rf.fit(X_train, y_train)
        rfres[:, st] = rf.predict_proba(X_test)[:, 1]
        rfres_test[:, st] = rf.predict_proba(test)[:, 1]
        # Single-line literal: a backslash continuation inside the string
        # would embed the next line's indentation in the message.
        sendMessage('Finished fitting RandomForest {:}, roc = {:.7f}'.format(
            st, roc_auc_score(y_test, rfres[:, st])))
        # Release the fitted forest before building the next one.
        del rf
        gc.collect()
    fold_pred = rfres.mean(axis=1)
    train_clfs[test_index, 0] = fold_pred
    tst_preds[:, fold - 1] = rfres_test.mean(axis=1)
    sendMessage('Fold ROC = {:.7f}'.format(roc_auc_score(y_test, fold_pred)))
test_clfs[:, 0] = tst_preds.mean(axis=1)

In [None]:
# ExtraTrees base model.
# For each CV fold, fit `n_seeds` ExtraTrees ensembles (one per random_state),
# average their class-1 probabilities and store the out-of-fold average in
# train_clfs[:, 1].
# NOTE(review): unlike the RandomForest cell, no test-set predictions are
# produced here, so test_clfs[:, 1] is not filled by this cell -- confirm it
# is populated elsewhere.
n_seeds = 10  # ensembles fitted per fold
for fold, (train_index, test_index) in enumerate(skf, 1):
    X_train, X_test = train.iloc[train_index].copy(), train.iloc[test_index].copy()
    y_train, y_test = target[train_index], target[test_index]

    sendMessage('Going through fold {:}'.format(fold))
    etres = np.zeros((X_test.shape[0], n_seeds))
    for st in range(n_seeds):
        et = ExtraTreesClassifier(n_estimators=5000, max_depth=50, max_features=50,
                                  random_state=st, n_jobs=-1)
        et.fit(X_train, y_train)
        etres[:, st] = et.predict_proba(X_test)[:, 1]
        # Message names the model actually fitted in this cell, and is kept
        # on one line so no indentation leaks into the text.
        sendMessage('Finished fitting ExtraTrees {:}, roc = {:.7f}'.format(
            st, roc_auc_score(y_test, etres[:, st])))
        # Release the fitted ensemble before building the next one.
        del et
        gc.collect()
    fold_pred = etres.mean(axis=1)
    train_clfs[test_index, 1] = fold_pred
    sendMessage('Fold ROC = {:.7f}'.format(roc_auc_score(y_test, fold_pred)))

In [None]:
# First XGBoost base model.
# For each CV fold, train `n_seeds` boosters that differ only in 'seed',
# average their predictions and store the out-of-fold average in
# train_clfs[:, 2].
# NOTE(review): no test-set predictions are produced in this cell, so
# test_clfs[:, 2] is not filled here -- confirm it is populated elsewhere.
params = {'objective': 'binary:logistic',
          'eval_metric': 'auc',
          'eta': 0.0202048,
          'max_depth': 5,
          'subsample': 0.6815,
          'colsample_bytree': 0.701,
          'silent': 1,
          'seed': 0,
          # The XGBoost thread-count key is 'nthread' (singular).
          'nthread': 12
         }

# Number of seeds bagged per fold. The prediction buffer is sized from the
# same value as the loop; a fixed 10-column buffer with a 50-iteration loop
# raises IndexError on the 11th seed.
n_seeds = 50
for fold, (train_index, test_index) in enumerate(skf, 1):
    X_train, X_test = train.iloc[train_index].copy(), train.iloc[test_index].copy()
    y_train, y_test = target[train_index], target[test_index]

    sendMessage('Going through fold {:}'.format(fold))
    xgores = np.zeros((X_test.shape[0], n_seeds))
    # The DMatrix objects are built once per fold and shared by every seed.
    dtrain = xgb.DMatrix(X_train, y_train)
    dtest = xgb.DMatrix(X_test)
    for st in range(n_seeds):
        params['seed'] = st
        fxgb = xgb.train(params, dtrain, num_boost_round=560, verbose_eval=False)
        xgores[:, st] = fxgb.predict(dtest)
        # Kept on one line so no indentation leaks into the message text.
        sendMessage('Finished fitting XGB1 {:}, roc = {:.7f}'.format(
            st, roc_auc_score(y_test, xgores[:, st])))
        # Release the booster before training the next one.
        del fxgb
        gc.collect()
    fold_pred = xgores.mean(axis=1)
    train_clfs[test_index, 2] = fold_pred

    sendMessage('Fold ROC = {:.7f}'.format(roc_auc_score(y_test, fold_pred)))

In [None]:
# Second XGBoost base model: same hyper-parameters as the first one, but the
# DMatrix objects are built with missing=0, so zeros are treated as missing.
# For each CV fold, train `n_seeds` boosters that differ only in 'seed',
# average their predictions and store the out-of-fold average in
# train_clfs[:, 3].
# NOTE(review): no test-set predictions are produced in this cell, so
# test_clfs[:, 3] is not filled here -- confirm it is populated elsewhere.
params = {'objective': 'binary:logistic',
          'eval_metric': 'auc',
          'eta': 0.0202048,
          'max_depth': 5,
          'subsample': 0.6815,
          'colsample_bytree': 0.701,
          'silent': 1,
          'seed': 0,
          # The XGBoost thread-count key is 'nthread' (singular).
          'nthread': 12
         }

# Number of seeds bagged per fold. The prediction buffer is sized from the
# same value as the loop; a fixed 10-column buffer with a 50-iteration loop
# raises IndexError on the 11th seed.
n_seeds = 50
for fold, (train_index, test_index) in enumerate(skf, 1):
    X_train, X_test = train.iloc[train_index].copy(), train.iloc[test_index].copy()
    y_train, y_test = target[train_index], target[test_index]

    sendMessage('Going through fold {:}'.format(fold))
    xgtres = np.zeros((X_test.shape[0], n_seeds))
    # The DMatrix objects are built once per fold and shared by every seed.
    dtrain = xgb.DMatrix(X_train, y_train, missing=0)
    dtest = xgb.DMatrix(X_test, missing=0)
    for st in range(n_seeds):
        params['seed'] = st
        sxgb = xgb.train(params, dtrain, num_boost_round=560, verbose_eval=False)
        xgtres[:, st] = sxgb.predict(dtest)
        # Kept on one line so no indentation leaks into the message text.
        sendMessage('Finished fitting XGB2 {:}, roc = {:.7f}'.format(
            st, roc_auc_score(y_test, xgtres[:, st])))
        # Release the booster before training the next one.
        del sxgb
        gc.collect()
    fold_pred = xgtres.mean(axis=1)
    train_clfs[test_index, 3] = fold_pred

    sendMessage('Fold ROC = {:.7f}'.format(roc_auc_score(y_test, fold_pred)))