In [1]:
import pandas as pd
import numpy as np
import nltk
from collections import Counter
from sklearn.metrics import log_loss
from scipy.optimize import minimize
import multiprocessing
import difflib
import time
import gc

import xgboost as xgb
from sklearn.cross_validation import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

from bayes_opt import BayesianOptimization



In [2]:
def get_train():
    """Assemble the training feature matrix and labels from files on disk.

    Reads the two Keras-tokenised question arrays (.npy) and the owl,
    abhishek, text and image feature CSVs from ``../../data``. The label is
    the ``is_duplicate`` column of the owl feature file.

    The SRK n-gram feature CSV used to be read here as well, but its contents
    were never used, so that read has been removed.

    Returns
    -------
    (X_train2, y_train)
        X_train2 : pandas.DataFrame whose feature columns are float32, with
            the label appended as a final ``is_duplicate`` column.
        y_train : pandas.Series holding the ``is_duplicate`` labels.

    Side effects: prints ``LEAK FOUND`` once for every feature column that is
    element-wise identical to the label, then prints the resulting shape.
    """
    keras_q1 = np.load('../../data/transformed/keras_tokenizer/train_q1_transformed.npy')
    keras_q2 = np.load('../../data/transformed/keras_tokenizer/train_q2_transformed.npy')
    xgb_feats = pd.read_csv('../../data/features/the_1owl/owl_train.csv')
    # The first two columns of the abhishek file are skipped.
    abhishek_feats = pd.read_csv('../../data/features/abhishek/train_features.csv',
                                 encoding = 'ISO-8859-1').iloc[:, 2:]
    text_feats = pd.read_csv('../../data/features/other_features/text_features_train.csv',
                             encoding = 'ISO-8859-1')
    img_feats = pd.read_csv('../../data/features/other_features/img_features_train.csv')

    # Grab the label before the positional slice below discards it.
    y_train = xgb_feats['is_duplicate']
    # Remove the z_* length columns first, then keep everything from the
    # ninth remaining column onwards (non-mutating equivalent of the former
    # in-place drop followed by iloc).
    xgb_feats = xgb_feats.drop(['z_len1', 'z_len2', 'z_word_len1', 'z_word_len2'],
                               axis = 1).iloc[:, 8:]

    X_train2 = np.concatenate([keras_q1, keras_q2, xgb_feats, abhishek_feats, text_feats, img_feats], axis = 1)

    # Sanity check: a feature column identical to the label means the target
    # has leaked into the features.
    labels = y_train.values
    for i in range(X_train2.shape[1]):
        if (X_train2[:, i] == labels).all():
            print('LEAK FOUND')

    X_train2 = pd.DataFrame(X_train2.astype('float32'))
    X_train2['is_duplicate'] = y_train
    print('Training data shape:', X_train2.shape)
    return X_train2, y_train


def xgb_bo(max_depth, min_child_weight, subsample, colsample_bytree):
    """Objective function for the Bayesian hyper-parameter search.

    Trains an XGBoost binary classifier with the supplied hyper-parameters
    and scores it on the hold-out split. The data is taken from the
    notebook-level variables ``X_tr``, ``y_tr``, ``X_val`` and ``y_val``,
    which must exist before this function is called.

    Parameters
    ----------
    max_depth, min_child_weight : float
        Truncated to int with ``int()`` before being passed to XGBoost.
    subsample, colsample_bytree : float
        Clamped to the interval [0, 1].

    Returns
    -------
    float
        The negated validation log loss, so that a maximising optimiser
        minimises the log loss.
    """
    t = time.time()
    params = {
    'seed': 1337,
    'colsample_bytree': max(min(colsample_bytree, 1), 0),
    'silent': 1,
    'subsample': max(min(subsample, 1), 0),
    'eta': 0.05,
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'max_depth': int(max_depth),
    'min_child_weight': int(min_child_weight),
    'nthread': 6,
    'tree_method': 'hist',
    }

    dtrain = xgb.DMatrix(X_tr, label = y_tr)
    dval = xgb.DMatrix(X_val, label = y_val)
    # The validation set is listed last; the captured training log reports
    # that 'valid-logloss' is the metric used for early stopping.
    watchlist = [(dtrain, 'train'), (dval, 'valid')]

    print('Start training...')
    gbm = xgb.train(params, dtrain, 100000, watchlist,
                    early_stopping_rounds = 350, verbose_eval = 250)

    print('Start predicting...')
    # Reuse the validation DMatrix built above instead of constructing a
    # second one from X_val just for prediction; only the best-iteration
    # trees are used.
    val_pred = gbm.predict(dval, ntree_limit=gbm.best_ntree_limit)
    score = log_loss(y_val, val_pred)
    print('Final score:', score, '\n', 'Time it took to train and predict:', time.time() - t)
    return -score


In [None]:
# get_train() returns the feature frame with the label appended as an
# 'is_duplicate' column; cast everything to float32 and strip the label
# so that only features remain.
X_train, y_train = get_train()
X_train = X_train.astype('float32').drop(['is_duplicate'], axis = 1)

# Stratified 80/20 hold-out split with a fixed seed.
X_tr, X_val, y_tr, y_val = train_test_split(
    X_train, y_train,
    test_size = 0.2,
    stratify = y_train,
    random_state = 111,
)

Training data shape: (404290, 247)


In [None]:
# Evaluation budget: initial points first, then optimisation iterations
# (the captured log shows an "Initialization" phase followed by a
# "Bayesian Optimization" phase).
init_points = 15
num_iter = 15

# Search space: (lower, upper) bounds for each argument of xgb_bo.
xgbBO = BayesianOptimization(xgb_bo, {
        'max_depth': (4, 12),
        'min_child_weight': (1, 25),
        'subsample': (0.4, 0.75),
        'colsample_bytree': (0.4, 0.75),
    })

xgbBO.maximize(init_points=init_points, n_iter=num_iter)

# xgb_bo returns the negated log loss, so the reported maximum is negative.
print('XGB: %f' % xgbBO.res['max']['max_val'])

[31mInitialization[0m
[94m---------------------------------------------------------------------------------------------------[0m
 Step |   Time |      Value |   colsample_bytree |   max_depth |   min_child_weight |   subsample | 
Start training...
[0]	train-logloss:0.672324	valid-logloss:0.672803
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 350 rounds.
[250]	train-logloss:0.289992	valid-logloss:0.363347
[500]	train-logloss:0.223964	valid-logloss:0.35077
[750]	train-logloss:0.176753	valid-logloss:0.343918
[1000]	train-logloss:0.141413	valid-logloss:0.340287
[1250]	train-logloss:0.114206	valid-logloss:0.338127
[1500]	train-logloss:0.093812	valid-logloss:0.337303
[1750]	train-logloss:0.078693	valid-logloss:0.3369
[2000]	train-logloss:0.066433	valid-logloss:0.33745
Stopping. Best iteration:
[1698]	train-logloss:0.081401	valid-logloss:0.336859

Start predicting...
Final score: 0.336858593989 


Start training...
[0]	train-logloss:0.67327	valid-logloss:0.673571
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 350 rounds.
[250]	train-logloss:0.329898	valid-logloss:0.373982
[500]	train-logloss:0.275744	valid-logloss:0.359144
[750]	train-logloss:0.234146	valid-logloss:0.351376
[1000]	train-logloss:0.199199	valid-logloss:0.346394
[1250]	train-logloss:0.170214	valid-logloss:0.342428
[1500]	train-logloss:0.145339	valid-logloss:0.339934
[1750]	train-logloss:0.125027	valid-logloss:0.338111
[2000]	train-logloss:0.107382	valid-logloss:0.336746
[2250]	train-logloss:0.092548	valid-logloss:0.336055
[2500]	train-logloss:0.080048	valid-logloss:0.33571
[2750]	train-logloss:0.069324	valid-logloss:0.335841
Stopping. Best iteration:
[2540]	train-logloss:0.078243	valid-logloss:0.335659

Start predicting...
Final score: 0.335658883116 
 Time it took to train and predict: 1020.3898370265961
    6 | 17m00s | 

[8750]	train-logloss:0.157777	valid-logloss:0.34467
[9000]	train-logloss:0.154278	valid-logloss:0.344568
[9250]	train-logloss:0.150957	valid-logloss:0.34434
[9500]	train-logloss:0.147701	valid-logloss:0.34438
Stopping. Best iteration:
[9267]	train-logloss:0.150724	valid-logloss:0.344291

Start predicting...
Final score: 0.344290954783 
 Time it took to train and predict: 1295.967110157013
    9 | 21m36s |   -0.34429 |             0.5179 |      5.3554 |             7.0315 |      0.4040 | 
Start training...
[0]	train-logloss:0.675017	valid-logloss:0.675122
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 350 rounds.
[250]	train-logloss:0.403168	valid-logloss:0.407183
[500]	train-logloss:0.384672	valid-logloss:0.392316
[750]	train-logloss:0.372984	valid-logloss:0.38441
[1000]	train-logloss:0.36396	valid-logloss:0.37919
[1250]	train-logloss:0.35611	valid-logloss:0.375198
[1500]	train-logloss:0.348947

[8000]	train-logloss:0.162919	valid-logloss:0.337345
[8250]	train-logloss:0.159191	valid-logloss:0.336997
[8500]	train-logloss:0.155472	valid-logloss:0.336866
[8750]	train-logloss:0.151879	valid-logloss:0.33672
[9000]	train-logloss:0.148492	valid-logloss:0.336463
[9250]	train-logloss:0.145093	valid-logloss:0.336347
[9500]	train-logloss:0.141874	valid-logloss:0.336285
[9750]	train-logloss:0.138738	valid-logloss:0.336124
[10000]	train-logloss:0.135659	valid-logloss:0.336059
[10250]	train-logloss:0.132605	valid-logloss:0.335884
[10500]	train-logloss:0.129621	valid-logloss:0.335888
[10750]	train-logloss:0.126784	valid-logloss:0.335817
[11000]	train-logloss:0.12398	valid-logloss:0.335792
[11250]	train-logloss:0.121208	valid-logloss:0.335722
[11500]	train-logloss:0.118518	valid-logloss:0.335733
Stopping. Best iteration:
[11389]	train-logloss:0.119674	valid-logloss:0.335698

Start predicting...
Final score: 0.335698200999 
 Time it took to train and predict: 1951.2794160842896
   13 | 32m31s 

  " state: %s" % convergence_dict)


[31mBayesian Optimization[0m
[94m---------------------------------------------------------------------------------------------------[0m
 Step |   Time |      Value |   colsample_bytree |   max_depth |   min_child_weight |   subsample | 
Start training...
[0]	train-logloss:0.675071	valid-logloss:0.67517
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 350 rounds.
[250]	train-logloss:0.403818	valid-logloss:0.407847
[500]	train-logloss:0.384535	valid-logloss:0.392532
[750]	train-logloss:0.372566	valid-logloss:0.384713
[1000]	train-logloss:0.363091	valid-logloss:0.379344
[1250]	train-logloss:0.355176	valid-logloss:0.37515
[1500]	train-logloss:0.348233	valid-logloss:0.372077
[1750]	train-logloss:0.341604	valid-logloss:0.369016
[2000]	train-logloss:0.335543	valid-logloss:0.366682
[2250]	train-logloss:0.32976	valid-logloss:0.364612
[2500]	train-logloss:0.324138	valid-logloss:0.362548
[2750]	train-lo

  " state: %s" % convergence_dict)


   17 | 18m20s |   -0.33746 |             0.4005 |     11.6876 |             1.0625 |      0.7393 | 
Start training...
[0]	train-logloss:0.672365	valid-logloss:0.672891
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 350 rounds.
[250]	train-logloss:0.293138	valid-logloss:0.361749
[500]	train-logloss:0.234872	valid-logloss:0.348993
[750]	train-logloss:0.191359	valid-logloss:0.34222
[1000]	train-logloss:0.156979	valid-logloss:0.337747
[1250]	train-logloss:0.130305	valid-logloss:0.334873
[1500]	train-logloss:0.109137	valid-logloss:0.333138
[1750]	train-logloss:0.093177	valid-logloss:0.332366
[2000]	train-logloss:0.080131	valid-logloss:0.331876
[2250]	train-logloss:0.069866	valid-logloss:0.331754
[2500]	train-logloss:0.061154	valid-logloss:0.332163
Stopping. Best iteration:
[2225]	train-logloss:0.070829	valid-logloss:0.331668

Start predicting...
Final score: 0.331667988405 
 Time it took to train a

  " state: %s" % convergence_dict)


   21 | 17m03s |   -0.33224 |             0.4008 |     11.8933 |            22.6118 |      0.7489 | 
Start training...
[0]	train-logloss:0.672006	valid-logloss:0.672709
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 350 rounds.
[250]	train-logloss:0.268912	valid-logloss:0.359363
[500]	train-logloss:0.197751	valid-logloss:0.346211
