In [1]:
import os
os.chdir('/kaggle/input/myfunctions')
from functions import *
from myutils import *
import pandas as pd
import numpy as np
import pickle
from functions import *
os.chdir('/kaggle/input/mydatasets2')
from sklearn.model_selection import train_test_split
import xgboost as xgb
from tqdm import tqdm

In [2]:
time_steps = 30 # Time steps to consider
df_path = '/kaggle/input/mydatasets2/training_set1.pkl'

In [3]:
data, labels = preprocess_data_via_close_values(df_path, time_steps=time_steps, shuffle=True, split=False)
print(data.shape)
print(labels.shape)

Columns of each dataframe Index(['Open', 'High', 'Low', 'Close', 'Volume'], dtype='object')
Shape of each array in the list (2171, 30)
Label Classes ['decrease' 'increase' 'no big change']
Data available for each label - increase, decrease and no big change 2095651 2125382 120967
Combined array shape -  (4342000, 30)
Combined labels shape -  (4342000,)
Shuffled data shape -  (4342000, 30)
(4342000, 30)
(4342000,)


In [4]:
test_size=0.2
random_state=42
x_train, x_valid, y_train, y_valid = train_test_split(data, labels, test_size=test_size, random_state=random_state)
num_bagging_splits = 100

In [4]:
models_dict = {}
params = {'objective':'multi:softmax', 'eval_metric':'merror', 'num_class':'3', 'eta':'0.1', 'max_depth':'5'}
num_boost_rounds = 100
scaling = 2.5
for i in tqdm(range(num_bagging_splits)):
    indices = np.arange(0, len(x_train))
    np.random.shuffle(indices)
    slice_val = int(len(indices)//scaling)
    indices_selected = indices[:slice_val]
    x_tr = x_train[indices_selected]
    y_tr = y_train[indices_selected]
    xg_train = xgb.DMatrix(x_tr, label=y_tr)
    models_dict[i] = xgb.train(params, xg_train, num_boost_round=num_boost_rounds)

100%|██████████| 100/100 [1:12:27<00:00, 43.48s/it]


In [6]:
def xg_multi_predict(x, y, models_dict):
    xg_valid = xgb.DMatrix(x, label=y)
    y_pred = np.zeros_like(y, dtype=np.float32)
    # y_pred = models_dict[0].predict(xg_valid)
    for xg_model in models_dict.values():
        y_pred += xg_model.predict(xg_valid)
    y_pred = y_pred/len(models_dict)
    return y_pred

In [8]:
models_dict = load_file_from_pickle('/kaggle/input/models-xg-bagging-100-30/models_xg_bagging_100_30.pkl')

In [9]:
metric_keys = ("accuracy", "precision", "specificity", "recall", "f1score")
y_train_pred = xg_multi_predict(x = x_train, y = y_train, models_dict = models_dict)
y_train_pred = np.round(y_train_pred)

In [10]:
train_metrics = dict(zip(metric_keys, metric_calculations(y_train_pred, y_train, set_="Train set")))

For Train set
  Accuracy: 40.65361584523261,
  Precision: 50.26614938209241,
  Recall: 39.56780492027279,
  Specificity: 42.2551960494898,
  F1score: 44.27994310813852


In [11]:
y_pred = xg_multi_predict(x = x_valid, y = y_valid, models_dict = models_dict)
y_pred = np.round(y_pred)

In [12]:
valid_metrics = dict(zip(metric_keys, metric_calculations(y_pred, y_valid, set_="Validation set")))

For Validation set
  Accuracy: 40.640142791340395,
  Precision: 50.317707207652504,
  Recall: 39.5889752555231,
  Specificity: 42.19462276767854,
  F1score: 44.313201443603816


In [18]:
df_path_test = "/kaggle/input/mydatasets2/training_set11.pkl"
x_test, y_test = preprocess_data_via_close_values(df_path_test, time_steps=30, shuffle=True, split=False)

Columns of each dataframe Index(['Open', 'High', 'Low', 'Close', 'Volume'], dtype='object')
Shape of each array in the list (858, 30)
Label Classes ['decrease' 'increase' 'no big change']
Data available for each label - increase, decrease and no big change 1027514 1019405 52607
Combined array shape -  (2099526, 30)
Combined labels shape -  (2099526,)
Shuffled data shape -  (2099526, 30)


In [19]:
y_pred_test = xg_multi_predict(x = x_test, y = y_test, models_dict = models_dict)
y_pred_test = np.round(y_pred_test)

In [20]:
test_metrics = dict(zip(metric_keys, metric_calculations(y_pred_test, y_test, set_="Test set")))

For Test set
  Accuracy: 35.54178419319408,
  Precision: 49.89981863937217,
  Recall: 35.365007199689764,
  Specificity: 35.861109328405554,
  F1score: 41.39356242339627


In [21]:
output_keys = ["models", "train_metrics", "validation_metrics", "test metrics"]
output_values = [models_dict, train_metrics, valid_metrics, test_metrics]

In [22]:
output_file_path = f"/kaggle/working/models_xg_bagging_{num_bagging_splits}_{time_steps}.pkl"
write_file_to_pickle(output_file_path, output_values)