In [1]:
import teleloggingbot
import pprint
import random
from tqdm import tqdm

import numpy as np
import pandas as pd
from scipy.stats import gmean

from sklearn.metrics import log_loss, make_scorer
from sklearn.model_selection import StratifiedKFold, KFold, train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.preprocessing import StandardScaler, MinMaxScaler

import xgboost as xgb
import lightgbm as lgb

import keras
from keras import backend as K
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from keras.optimizers import Adam, SGD, RMSprop, Nadam
from keras.layers.normalization import BatchNormalization
from keras.layers.advanced_activations import LeakyReLU, PReLU
from keras.callbacks import ReduceLROnPlateau, EarlyStopping

Using Theano backend.


In [2]:
# Feature tables for train and test are read with identical options.
dfx_train, dfx_test = (
    pd.read_csv(csv_path, index_col=None)
    for csv_path in ('./x_train_my.csv', './x_test_my.csv')
)

# The label file has no header row; name its single column explicitly.
dfy_train = pd.read_csv('./y_train.csv', header=None, index_col=None)
dfy_train.columns = ['class']

In [3]:
# Plain numpy views of the frames; labels are flattened to a 1-D vector.
xtrain, xtest = dfx_train.values, dfx_test.values
ytrain = dfy_train.values.ravel()

In [4]:
def getKerasMLP():
    """Build and compile a fresh one-hidden-layer Keras binary classifier.

    The input width is taken from the notebook-level ``xtrain`` array, so
    that array must already exist when this is called.

    Returns:
        A compiled ``Sequential`` model: Dense(128) -> ReLU -> Dropout(0.2)
        -> Dense(1, sigmoid), using binary cross-entropy loss and Adam.
    """
    n_features = xtrain.shape[1]
    layers = [
        Dense(128, input_dim=n_features, init='he_normal'),
        Activation('relu'),
        Dropout(0.2),
        Dense(1, activation='sigmoid', init='he_normal'),
    ]
    model = Sequential(layers)
    model.compile(loss='binary_crossentropy', optimizer=Adam(decay=1e-6))
    return model

In [5]:
# Settings for the scikit-learn MLP; kept in a dict so the model can be re-created later.
sklearn_mlp_param = dict(
    hidden_layer_sizes=(32, 32),
    max_iter=200,
    batch_size=64,
)
sklearn_mlp = MLPClassifier(**sklearn_mlp_param)

In [6]:
# Settings for the logistic-regression base model; kept in a dict so it can be re-created later.
logreg_param = dict(C=0.225)
logreg = LogisticRegression(**logreg_param)

In [7]:
# Hyper-parameters for the first-level LightGBM classifier.
lgb_param = dict(
    subsample=0.55,
    subsample_freq=1,
    boosting_type='gbdt',
    colsample_bytree=0.75,
    reg_lambda=6.05,
    learning_rate=0.012,
    max_bin=15,
    max_depth=6,
    n_estimators=500,
    num_leaves=15,
    nthread=8,
    objective='binary',
)
lgb_clf = lgb.LGBMClassifier(**lgb_param)

In [8]:
# Hyper-parameters for the first-level XGBoost classifier.
xgb_param = dict(
    colsample_bylevel=0.85,
    colsample_bytree=0.55,
    gamma=3.3,
    learning_rate=0.008,
    max_delta_step=8.9,
    max_depth=4,
    min_child_weight=7.0,
    n_estimators=920,
    nthread=8,
    objective='binary:logistic',
    reg_lambda=9.95,
    scale_pos_weight=1.0,
    silent=1,
    subsample=0.55,
)
xgb_clf = xgb.XGBClassifier(**xgb_param)

In [9]:
%%time
clfs = [getKerasMLP(), lgb_clf, xgb_clf, sklearn_mlp, logreg]
n_folds = 6
kf = StratifiedKFold(n_splits=n_folds, shuffle=True)
preds_train = np.zeros((len(dfy_train),len(clfs)), dtype=np.float32)

for fold_i, (train_index, test_index) in enumerate(kf.split(xtrain, ytrain)):
    X_train, X_test = xtrain[train_index], xtrain[test_index]
    y_train, y_test = ytrain[train_index], ytrain[test_index]
    print('Fold #{}/{}'.format(fold_i+1, n_folds))
    for clf_i, clf in enumerate(clfs):
        if clf.__class__.__name__ != 'Sequential':
            clf.fit(X_train, y_train)
            prediction = clf.predict_proba(X_test)
        else:
            clf.fit(X_train, y_train, nb_epoch=35+random.randint(-5,5),
                    batch_size=512, validation_split=0, verbose=0)
            prediction = clf.predict_proba(X_test, verbose=0)
        if (prediction.shape[1] > 1):
            prediction = prediction[:,1]
        preds_train[test_index, clf_i] = prediction.ravel()
        print('{:15} \t logloss: {:.8}'.format(clf.__class__.__name__,
                                               log_loss(y_test, prediction)))
    print('='*45)

Fold #1/6
Sequential      	 logloss: 0.39167531
LGBMClassifier  	 logloss: 0.38958564
XGBClassifier   	 logloss: 0.38996541
MLPClassifier   	 logloss: 0.39523748
LogisticRegression 	 logloss: 0.39401907
Fold #2/6
Sequential      	 logloss: 0.38519678
LGBMClassifier  	 logloss: 0.38462679
XGBClassifier   	 logloss: 0.38459124
MLPClassifier   	 logloss: 0.38460793
LogisticRegression 	 logloss: 0.3891311
Fold #3/6
Sequential      	 logloss: 0.38563557
LGBMClassifier  	 logloss: 0.38375618
XGBClassifier   	 logloss: 0.38384324
MLPClassifier   	 logloss: 0.39050369
LogisticRegression 	 logloss: 0.3895984
Fold #4/6
Sequential      	 logloss: 0.35830899
LGBMClassifier  	 logloss: 0.3618075
XGBClassifier   	 logloss: 0.36257992
MLPClassifier   	 logloss: 0.36528482
LogisticRegression 	 logloss: 0.37002163
Fold #5/6
Sequential      	 logloss: 0.36618264
LGBMClassifier  	 logloss: 0.36863851
XGBClassifier   	 logloss: 0.36772608
MLPClassifier   	 logloss: 0.37382327
LogisticRegression 	 logloss:

In [10]:
# One column per base model, labelled with the model's class name.
base_model_names = [clf.__class__.__name__ for clf in clfs]
preds_train_df = pd.DataFrame(preds_train, index=None, columns=base_model_names)

In [11]:
# Row-wise geometric mean of the base-model probabilities, added as an extra column.
preds_train_df = preds_train_df.assign(Gmean=gmean(preds_train, axis=1))

In [12]:
# Peek at the first rows of the out-of-fold prediction table.
preds_train_df.head(n=5)

Unnamed: 0,Sequential,LGBMClassifier,XGBClassifier,MLPClassifier,LogisticRegression,Gmean
0,0.335032,0.361801,0.348334,0.342524,0.310631,0.339233
1,0.542141,0.665235,0.657173,0.561377,0.545661,0.591819
2,0.048808,0.05424,0.05238,0.075336,0.052062,0.05586
3,0.16508,0.222551,0.212316,0.162091,0.176611,0.186113
4,0.046927,0.045581,0.049623,0.060695,0.053448,0.050979


In [13]:
# Column names selected from dfx_train / dfx_test and appended to the
# first-level predictions when the second-level feature matrix is built.
# The order matters: it determines the column order of the stacked matrix.
# NOTE(review): presumably the highest-importance engineered features — confirm.
topfeat = [
    'nDaysPlayd*maxLvl',
    'nDaysPlayd*avgNTurns',
    'nAttemptedLevels_/_totalNAttempts',
    'nDaysPlayd*nAttempts',
    'diff_NofDaysPlayed_BoostersFraction',
    'totNumAttempts_/_nBoostersUsed',
    'maxLvl_/_BoostersUsed',
    'nAttempts*maxLvl',
    'avgNTurns_/_NDaysActuallyPlayed',
    'BoostersUsed*totScore'
]

In [14]:
# Parameters for the second-level LightGBM model (native `lgb.train` API).
sndLvlLgbParam = dict(
    bagging_fraction=0.75,
    bagging_freq=1,
    boosting='gbdt',
    feature_fraction=0.5,
    lambda_l2=0.9,
    learning_rate=0.164,
    max_bin=255,
    max_depth=3,
    metric=('binary_logloss',),
    num_iterations=27,
    num_leaves=255,
    num_threads=4,
    objective='binary',
    verbose=0,
)

In [15]:
# Parameters for the second-level XGBoost model.
sndLvlXgbParam = dict(
    colsample_bylevel=0.65,
    colsample_bytree=0.55,
    gamma=8.35,
    learning_rate=0.008,
    max_delta_step=5.15,
    max_depth=3,
    min_child_weight=6.0,
    n_estimators=1088,
    nthread=8,
    objective='binary:logistic',
    reg_lambda=0.5,
    silent=1,
    subsample=0.5,
)

In [16]:
# Second-level training matrix: base-model predictions (plus Gmean) side by side
# with the selected raw features.
stacked_train_df = pd.concat([preds_train_df, dfx_train[topfeat]], axis=1)
fst_lvl_xtrain = stacked_train_df.values

In [17]:
## CV for 2nd lvl model
# 5-fold stratified log-loss estimate for the second-level LightGBM model.
# To evaluate the XGBoost alternative instead, fit
# xgb.XGBClassifier(**sndLvlXgbParam) on (X_train, y_train) and score
# clf.predict_proba(X_test)[:,1].
kf = StratifiedKFold(n_splits=5, random_state=228, shuffle=True)
scores = []
for train_index, test_index in kf.split(fst_lvl_xtrain, ytrain):
    X_train, y_train = fst_lvl_xtrain[train_index], ytrain[train_index]
    X_test, y_test = fst_lvl_xtrain[test_index], ytrain[test_index]

    lgb_matrix = lgb.Dataset(X_train, y_train)
    n_rounds = sndLvlLgbParam['num_iterations']
    clf = lgb.train(sndLvlLgbParam, lgb_matrix, num_boost_round=n_rounds)

    fold_loss = log_loss(y_test, clf.predict(X_test))
    scores.append(fold_loss)
    print(fold_loss)
print(np.mean(scores), np.std(scores))

0.375885031686
0.37323123042
0.385030960823
0.382938726113
0.379013455994
0.379219881007 0.00435031549845


In [18]:
# Full second-level training set in LightGBM's native container.
lgbTrainData = lgb.Dataset(fst_lvl_xtrain, label=ytrain)

In [19]:
# Full second-level training set in XGBoost's native container.
xgbTrainData = xgb.DMatrix(fst_lvl_xtrain, label=ytrain)

In [20]:
%%time
sklearn_mlp = MLPClassifier(**sklearn_mlp_param)
logreg = LogisticRegression(**logreg_param)
lgb_clf = lgb.LGBMClassifier(**lgb_param)
xgb_clf = xgb.XGBClassifier(**xgb_param)

clfs = [getKerasMLP(), lgb_clf, xgb_clf, sklearn_mlp, logreg]
preds_test = np.zeros((len(xtest), len(clfs)), dtype=np.float64)
for clf_i, clf in enumerate(clfs):
    if clf.__class__.__name__ != 'Sequential':
        clf.fit(xtrain, ytrain)
        prediction = clf.predict_proba(xtest)
    else:
        clf.fit(xtrain, ytrain, nb_epoch=16, batch_size=512, verbose=0)
        prediction = clf.predict_proba(xtest, verbose=0)
    if (prediction.shape[1] > 1):
        prediction = prediction[:,1]
    preds_test[:,clf_i] = prediction.ravel()

preds_test_df = pd.DataFrame(preds_test, columns=[clf.__class__.__name__ for clf in clfs], index=None)
preds_test_df['Gmean'] = gmean(preds_test, axis=1)
fst_lvl_xtest = pd.concat([preds_test_df, dfx_test[topfeat]], axis=1).values

In [21]:
# Second-level test matrix in XGBoost's native container (no labels).
xgbTestData = xgb.DMatrix(data=fst_lvl_xtest)

In [None]:
# Seed bagging of the second-level models: train one XGBoost and one LightGBM
# booster per seed, each with a slightly jittered number of boosting rounds,
# and collect their test-set probabilities for later averaging.
predictions = []
for i in tqdm(range(500)):
    seed = i + 1
    np.random.seed(seed)
    random.seed(seed)

    # Work on per-iteration copies so the base parameter dicts stay untouched.
    # The round count is taken out of each dict and passed explicitly through
    # `num_boost_round`, so the jittered value is the only one in play.
    xgbTmpParam = sndLvlXgbParam.copy()
    xgbTmpParam['seed'] = seed
    xgbRounds = xgbTmpParam.pop('n_estimators') + random.randint(-5,10)

    lgbTmpParam = sndLvlLgbParam.copy()
    lgbTmpParam['seed'] = seed
    lgbRounds = lgbTmpParam.pop('num_iterations') + random.randint(-5,10)

    # Train with the seeded copies. Passing the base dicts here would ignore
    # the per-iteration seed and leave the XGBoost round count fixed, making
    # the bagging pointless.
    sndLvlXgb = xgb.train(xgbTmpParam, xgbTrainData,
                          num_boost_round=xgbRounds)
    sndLvlLgb = lgb.train(lgbTmpParam, lgbTrainData,
                          num_boost_round=lgbRounds)

    predictions.append(sndLvlXgb.predict(xgbTestData))
    predictions.append(sndLvlLgb.predict(fst_lvl_xtest))

  3%|▎         | 16/500 [01:15<38:12,  4.74s/it]

In [None]:
# Blend all bagged second-level predictions with a per-row geometric mean
# (across models) and write the result as a header-less, index-less CSV.
blended_proba = gmean(predictions, axis=0)
pd.DataFrame(blended_proba).to_csv('ans.csv', header=False, index=False)