In [3]:
__author__ = 'Tilii: https://kaggle.com/tilii7' 

import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
from datetime import datetime
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from xgboost import XGBClassifier


In [4]:
def timer(start_time=None):
    """Start or stop a wall-clock stopwatch.

    Called with a falsy ``start_time`` (the default), returns the current
    ``datetime`` so the caller can keep it as the start mark.  Called with a
    start mark, prints the time elapsed since then as hours / minutes /
    seconds and returns ``None``.
    """
    if start_time:
        elapsed = (datetime.now() - start_time).total_seconds()
        hours, remainder = divmod(elapsed, 3600)
        minutes, seconds = divmod(remainder, 60)
        print('\n Time taken: %i hours %i minutes and %s seconds.' % (hours, minutes, round(seconds, 2)))
        return None
    # No start mark supplied: hand back "now" to be passed in on the second call.
    return datetime.now()


# Read both CSVs first, then derive the modelling inputs from them.
train_df = pd.read_csv(
    '../_dataset/input/train.csv',
    dtype={'id': np.int32, 'target': np.int8},
)
test_df = pd.read_csv('../_dataset/input/test.csv', dtype={'id': np.int32})

# Labels as a NumPy array; features are every remaining column except the id.
Y = train_df['target'].values
X = train_df.drop(['target', 'id'], axis='columns')
test = test_df.drop(['id'], axis='columns')

In [7]:
# Display the feature frame (the training data without the 'target' and 'id' columns).
X

Unnamed: 0,ps_ind_01,ps_ind_02_cat,ps_ind_03,ps_ind_04_cat,ps_ind_05_cat,ps_ind_06_bin,ps_ind_07_bin,ps_ind_08_bin,ps_ind_09_bin,ps_ind_10_bin,...,ps_calc_11,ps_calc_12,ps_calc_13,ps_calc_14,ps_calc_15_bin,ps_calc_16_bin,ps_calc_17_bin,ps_calc_18_bin,ps_calc_19_bin,ps_calc_20_bin
0,2,2,5,1,0,0,1,0,0,0,...,9,1,5,8,0,1,1,0,0,1
1,1,1,7,0,0,0,0,1,0,0,...,3,1,1,9,0,1,1,0,1,0
2,5,4,9,1,0,0,0,1,0,0,...,4,2,7,7,0,1,1,0,1,0
3,0,1,2,0,0,1,0,0,0,0,...,2,2,4,9,0,0,0,0,0,0
4,0,2,0,1,0,1,0,0,0,0,...,3,1,1,3,0,0,0,1,1,0
5,5,1,4,0,0,0,0,0,1,0,...,4,2,0,9,0,1,0,1,1,1
6,2,1,3,1,0,0,1,0,0,0,...,3,0,0,10,0,1,0,0,1,0
7,5,1,4,0,0,1,0,0,0,0,...,7,1,3,6,1,0,1,0,1,0
8,5,1,3,1,0,0,0,1,0,0,...,4,2,1,5,0,1,0,0,0,1
9,1,1,2,0,0,0,1,0,0,0,...,3,5,0,6,0,1,0,0,1,0


In [9]:
# (rows, columns) of the feature frame.
X.shape

(595212, 57)

In [11]:
# Print the shape of the label array, then show the array itself as the cell output.
print(Y.shape)
Y

(595212,)


array([0, 0, 0, ..., 0, 0, 0], dtype=int8)

In [5]:
# Candidate values per XGBoost hyperparameter; the searches below draw from these lists.
params = dict(
    min_child_weight=[1, 5, 10],
    gamma=[0.5, 1, 1.5, 2, 5],
    subsample=[0.6, 0.8, 1.0],
    colsample_bytree=[0.6, 0.8, 1.0],
    max_depth=[3, 4, 5],
)

In [14]:
# Scratch check: print i when it lies in the half-open interval [13, 25).
i = 14
if 13 <= i < 25:
    print(i)

14


In [6]:
# Base estimator handed to the hyperparameter searches; the values listed in
# `params` are varied by the search, the settings here stay fixed.
xgb = XGBClassifier(
    objective='binary:logistic',
    learning_rate=0.02,
    n_estimators=600,
    nthread=1,
    silent=True,
)

In [12]:
# Search budget: number of stratified CV folds and number of sampled parameter
# combinations.  StratifiedKFold requires n_splits >= 2 (a value of 1 raises
# ValueError before any model is fitted); 3 folds x 5 candidates is the
# configuration shown in this cell's recorded "Fitting 3 folds for each of 5
# candidates" log.
folds = 3
param_comb = 5

# Shuffled, seeded stratified folds so every run evaluates on the same splits.
skf = StratifiedKFold(n_splits=folds,
                      shuffle=True,
                      random_state=1001)

# Pass the splitter itself rather than skf.split(X, Y): the splits are the same,
# but a generator is consumed by the first fit() and cannot be reused.
random_search = RandomizedSearchCV(xgb,
                                   param_distributions=params,
                                   n_iter=param_comb,
                                   scoring='roc_auc',
                                   n_jobs=4,
                                   cv=skf,
                                   verbose=3,
                                   random_state=1001)

# Here we go
start_time = timer(None)  # timing starts from this point for "start_time" variable
random_search.fit(X, Y)
timer(start_time)  # timing ends here for "start_time" variable

Fitting 3 folds for each of 5 candidates, totalling 15 fits
[CV] subsample=1.0, min_child_weight=5, max_depth=3, gamma=5, colsample_bytree=1.0 
[CV] subsample=1.0, min_child_weight=5, max_depth=3, gamma=5, colsample_bytree=1.0 
[CV] subsample=1.0, min_child_weight=5, max_depth=3, gamma=5, colsample_bytree=1.0 
[CV] subsample=0.6, min_child_weight=1, max_depth=5, gamma=1.5, colsample_bytree=0.8 
[CV]  subsample=1.0, min_child_weight=5, max_depth=3, gamma=5, colsample_bytree=1.0, score=0.6397857982244877, total=23.3min
[CV] subsample=0.6, min_child_weight=1, max_depth=5, gamma=1.5, colsample_bytree=0.8 
[CV]  subsample=1.0, min_child_weight=5, max_depth=3, gamma=5, colsample_bytree=1.0, score=0.6368150503037859, total=23.4min
[CV] subsample=0.6, min_child_weight=1, max_depth=5, gamma=1.5, colsample_bytree=0.8 
[CV]  subsample=1.0, min_child_weight=5, max_depth=3, gamma=5, colsample_bytree=1.0, score=0.6342709925172288, total=23.4min
[CV] subsample=0.8, min_child_weight=5, max_depth=5, ga

KeyboardInterrupt: 

In [None]:
# Report the randomized search: each heading is followed by its value on the next line.
print('\n All results:', random_search.cv_results_, sep='\n')
print('\n Best estimator:', random_search.best_estimator_, sep='\n')
# The search is scored with ROC AUC; 2 * AUC - 1 is reported as the normalized gini.
print('\n Best normalized gini score for %d-fold search with %d parameter combinations:' % (folds, param_comb),
      random_search.best_score_ * 2 - 1,
      sep='\n')
print('\n Best hyperparameters:', random_search.best_params_, sep='\n')

# Persist the full per-candidate CV table.
results = pd.DataFrame(random_search.cv_results_)
results.to_csv('xgb-random-grid-search-results-01.csv', index=False)

In [None]:
# Score the test set with the searched model and write the submission file;
# column 1 of predict_proba is what gets written as 'target'.
y_test = random_search.predict_proba(test)
results_df = test_df[['id']].assign(target=y_test[:, 1])
results_df.to_csv('submission-random-grid-search-xgb-porto-01.csv', index=False)

In [None]:
# Exhaustive search over every combination in `params`, with the same estimator,
# scorer and stratified folds as the randomized search.  The splitter is passed
# directly (instead of the one-shot generator skf.split(X, Y)) so the search can
# be re-fitted.
grid = GridSearchCV(estimator=xgb, param_grid=params, scoring='roc_auc', n_jobs=4, cv=skf, verbose=3)
grid.fit(X, Y)
print('\n All results:')
print(grid.cv_results_)
print('\n Best estimator:')
print(grid.best_estimator_)
# The printed value is 2 * AUC - 1 (normalized gini), not the raw ROC AUC held in
# best_score_, so the heading names the metric actually shown.
print('\n Best normalized gini score:')
print(grid.best_score_ * 2 - 1)
print('\n Best parameters:')
print(grid.best_params_)

# Persist the full per-candidate CV table.
results = pd.DataFrame(grid.cv_results_)
results.to_csv('xgb-grid-search-results-01.csv', index=False)

# Score the test set with the best estimator; column 1 of predict_proba is
# written as 'target'.
y_test = grid.best_estimator_.predict_proba(test)
results_df = pd.DataFrame(data={'id': test_df['id'], 'target': y_test[:, 1]})
results_df.to_csv('submission-grid-search-xgb-porto-01.csv', index=False)

In [2]:
! ls

Graph_test.ipynb                      local_function.py
Infocom05_clustering_and_diameter.pdf startFunction.py
RNN_KERAS_2.ipynb                     start_model.py
[34m__pycache__[m[m                           test.ipynb
[34mbackup[m[m                                test2.ipynb
create_LSTM_model.py                  time_unit_analysis_lstm.ipynb
[34mevaluate_result[m[m                       time_unit_analysis_xgboost.ipynb
[34mevalueate_result_[m[m                     userFunction.py
link_aws_key.py                       window_size_analysis_lstm.ipynb
local_function.ipynb                  window_size_analysis_xgboost.ipynb


In [4]:
import pickle

# Dataset selectors: these four values are joined (in this order) into the name of
# the pickle file that Load_Dataset_X / Load_Dataset_y open.
idx_time_unit = 10
idx_window_size = 10
idx_gap = 1
idx_margin_rate = 0.1

def _load_pickled_dataset(dir_path, prefix, time_unit, window_size, gap, margin_rate):
    """Unpickle ``<prefix>_<time_unit>_<window_size>_<gap>_<margin_rate>.pickle`` from ``dir_path``."""
    import os  # local import keeps this helper self-contained

    key_name = "_".join(str(part) for part in (prefix, time_unit, window_size, gap, margin_rate))
    # os.path.join accepts dir_path with or without a trailing separator.
    file_path = os.path.join(dir_path, key_name + ".pickle")
    # NOTE(security): pickle.load can execute arbitrary code; only open trusted files.
    with open(file_path, 'rb') as handle:
        return pickle.load(handle)


def Load_Dataset_X(dir_path, time_unit, window_size, gap, margin_rate):
    """Load the pickled object stored under the ``X_...`` key for one configuration.

    Parameters
    ----------
    dir_path : str
        Directory containing the pickle files (trailing separator optional).
    time_unit, window_size, gap, margin_rate
        Values converted with ``str()`` and joined by ``_`` to form the file name
        ``X_<time_unit>_<window_size>_<gap>_<margin_rate>.pickle``.

    Returns
    -------
    object
        Whatever object was pickled in that file.
    """
    return _load_pickled_dataset(dir_path, "X", time_unit, window_size, gap, margin_rate)


def Load_Dataset_y(dir_path, time_unit, window_size, gap, margin_rate):
    """Load the pickled object stored under the ``y_...`` key for one configuration.

    Takes the same arguments as ``Load_Dataset_X``; the file name is
    ``y_<time_unit>_<window_size>_<gap>_<margin_rate>.pickle``.

    Returns
    -------
    object
        Whatever object was pickled in that file.
    """
    return _load_pickled_dataset(dir_path, "y", time_unit, window_size, gap, margin_rate)


pickle_load_dir_path = '../_dataset/RNN_coin/'
# Slice end applied to both X and y below.
# NOTE(review): with -1 the slice [:dataset_scale] drops the final sample rather than
# keeping everything; confirm that is intended (None would keep all samples).
dataset_scale = -1

# Key names of the X / y pickles for the selected configuration.
key_suffix = "_".join(str(part) for part in (idx_time_unit, idx_window_size, idx_gap, idx_margin_rate))
key_name_X = "X_" + key_suffix
key_name_y = "y_" + key_suffix

# Take element 0 of the loaded X pickle and element 1 of the loaded y pickle,
# each truncated to dataset_scale.
X = Load_Dataset_X(pickle_load_dir_path, idx_time_unit, idx_window_size, idx_gap, idx_margin_rate)[0][:dataset_scale]
y = Load_Dataset_y(pickle_load_dir_path, idx_time_unit, idx_window_size, idx_gap, idx_margin_rate)[1][:dataset_scale]

# One label vector per coin symbol: columns 1..7 of y, in the order listed.
# Column 0 of y is not used here.
y_single = {
    symbol: y[:, column]
    for column, symbol in enumerate(
        ['BTC', 'ETH', 'XRP', 'BCH', 'LTC', 'DASH', 'ETC'], start=1)
}

In [11]:
# Number of samples left in X after the [:dataset_scale] slice.
len(X)

52549

In [7]:
# Show the per-coin label arrays.
y_single

{'BTC': array([0, 0, 0, ..., 1, 1, 0]),
 'ETH': array([0, 0, 0, ..., 0, 1, 0]),
 'XRP': array([0, 0, 1, ..., 1, 0, 0]),
 'BCH': array([1, 0, 0, ..., 1, 0, 0]),
 'LTC': array([0, 1, 1, ..., 0, 0, 1]),
 'DASH': array([1, 0, 0, ..., 0, 1, 0]),
 'ETC': array([0, 0, 0, ..., 0, 1, 0])}

In [10]:
# Length of the BTC label vector, for comparison with len(X).
len(y_single['BTC'])

52549

In [13]:
# Print the BTC label vector.
print(y_single['BTC'])

[0 0 0 ... 1 1 0]


In [14]:
# Print the loaded X array.
print(X)

[[[[1.00000000e+00 1.00000000e+00 1.00000000e+00 1.00000000e+00]
   [1.00000000e+00 1.00000000e+00 1.00000000e+00 1.00000000e+00]
   [1.00000000e+00 1.00000000e+00 1.00000000e+00 1.00000000e+00]
   ...
   [1.00000000e+00 1.00000000e+00 1.00000000e+00 1.00000000e+00]
   [1.00000000e+00 1.00000000e+00 1.00000000e+00 1.00000000e+00]
   [1.00000000e+00 1.00000000e+00 1.00000000e+00 1.00000000e+00]]

  [[9.91535206e-01 9.96344748e-01 9.91342824e-01 1.39001852e+02]
   [9.91919969e-01 9.93459023e-01 9.89803771e-01 1.47394685e+02]
   [9.93266641e-01 9.94036168e-01 9.90958061e-01 7.57722642e+01]
   ...
   [9.98653328e-01 9.99422855e-01 9.95959985e-01 6.90474800e+01]
   [1.00000000e+00 1.00019238e+00 9.95959985e-01 1.24198021e+02]
   [1.00000000e+00 1.00019238e+00 9.96729511e-01 3.61616734e+01]]

  [[9.95767693e-01 1.00176346e+00 9.93298848e-01 4.23569067e+03]
   [9.94239360e-01 9.99764872e-01 9.92946156e-01 3.55378612e+03]
   [9.96237950e-01 9.98824359e-01 9.94121796e-01 1.81472572e+03]
   ...
