In [9]:
# Mount Google Drive at /content/drive so files stored under MyDrive are reachable.
from google.colab import drive
drive.mount('/content/drive')
import os
# Change into the project folder before the import below; presumably this is
# where the `functions` module lives — TODO confirm.
os.chdir('/content/drive/MyDrive/StockForcasting/Stock-Forcast')
# NOTE(review): wildcard import. Names used in later cells without a visible
# definition (e.g. models_predict, metric_calculations) presumably come from
# here — confirm against the `functions` module.
from functions import *

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
!ls

functions.ipynb  __pycache__  Stock-Forcast


In [10]:
import pandas as pd
import numpy as np
import pickle
# Switch the working directory to the data folder; presumably so that the
# relative pickle paths used in later cells resolve against it — TODO confirm.
os.chdir('/content/drive/MyDrive/dataForStockForcast')
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier

In [11]:
# Notebook-level configuration values.
# NOTE(review): the visible preprocessing calls pass the literal num_features=10
# and do not forward min_change, so neither constant is referenced by the
# cells shown in this notebook — confirm whether they should be.
num_features = 10
min_change = 1e-4

In [12]:
def xg_models_predict(model_list, x_values):
  """Predict with a single model, or average the predictions of several.

  Args:
    model_list: either one object exposing ``predict(x_values)`` or a list
      of such objects (in this notebook, boosters returned by
      ``xgboost.train``).
    x_values: input handed unchanged to every ``predict`` call (in this
      notebook, an ``xgboost.DMatrix``).

  Returns:
    For a list, the element-wise mean of all models' predictions as a float
    array. For a single model, that model's prediction unchanged.

  Raises:
    ValueError: if ``model_list`` is an empty list (there is nothing to
      average).

  NOTE(review): the notebook's params use objective 'multi:softmax'. If
  ``predict`` returns class indices for that objective, averaging more than
  one model yields fractional values that are no longer class labels —
  confirm this is intended before ensembling.
  """
  if not isinstance(model_list, list):
    return model_list.predict(x_values)
  if not model_list:
    raise ValueError("model_list must contain at least one model")
  # Iterate over the argument itself. The previous version looped over the
  # notebook-global `models_list`, ignoring the list that was passed in.
  total = None
  for mdl in model_list:
    current = np.asarray(mdl.predict(x_values), dtype=float)
    # Let the first prediction define the output shape instead of querying
    # x_values for its row count.
    total = current if total is None else total + current
  return total / len(model_list)

In [13]:
# Relative path to the training pickle.
pickle_file_path = 'training_set.pkl'
# Unpack the four arrays returned for the train/validation split.
# NOTE(review): num_features is hard-coded to 10 instead of using the
# `num_features` constant, and `min_change` is not forwarded, so the callee's
# default applies — confirm this is intended.
x_train, x_valid, y_train, y_valid = preprocess_data_via_close_values(pickle_file_path, num_features=10)

Columns of each dataframe Index(['Open', 'High', 'Low', 'Close', 'Volume'], dtype='object')
Shape of each array in the list (2191, 10)
Label Classes ['decrease' 'increase' 'no big change']
Data available for each label - increase, decrease and no big change 2118035 2141921 122044
Combined array shape -  (4382000, 10)
Combined labels shape -  (4382000,)
Shuffled data shape -  (4382000, 10)


In [14]:
import xgboost

# Only the training matrix is given labels; the validation matrix is built
# from features alone.
xg_train = xgboost.DMatrix(x_train, label=y_train)
xg_valid = xgboost.DMatrix(x_valid)

# Booster settings. Values are kept as strings, exactly as originally passed.
params = dict(
  objective='multi:softmax',
  eval_metric='merror',
  num_class='3',
  device='cuda',
  eta='0.1',
  max_depth='10',
)

In [16]:
# Collect the trained boosters in a list.
models_list = []
# NOTE(review): range(1) trains a single model. Every iteration would reuse
# the same params and training matrix, so raising the count without varying
# something per iteration may just repeat the same training — confirm.
for i in range(1):
  mdl = xgboost.train(params, xg_train, num_boost_round=120)
  models_list.append(mdl)



In [17]:
# `gc` is not imported anywhere in the visible notebook, so import it here
# rather than relying on the name leaking in from elsewhere.
import gc
# Run a full garbage-collection pass; the returned count is shown as the cell output.
gc.collect()

55

In [18]:
# Predict on the training matrix.
# NOTE(review): this calls `models_predict`, which is not defined in the
# visible notebook (presumably from `functions`), rather than the
# `xg_models_predict` helper defined in this notebook — confirm intended.
train_preds = models_predict(models_list, xg_train)

In [19]:
# Evaluate the training predictions; the five returned values are discarded.
# The report shown below is presumably printed inside metric_calculations.
_, _, _, _, _ = metric_calculations(train_preds, y_train, set_ = "Train Set")

For Train Set
  Accuracy: 52.79706754906436,
  Precision: 52.18847674060483,
  Recall: 59.63367957181478,
  Specificity: 46.04524370696992,
  F1score: 55.663224568755325


In [20]:
# Predict on the validation matrix.
# NOTE(review): uses `models_predict` (not defined in the visible notebook)
# instead of the local `xg_models_predict` helper — confirm intended.
valid_preds = models_predict(models_list, xg_valid)

In [21]:
# Evaluate the validation predictions; the five returned values are discarded.
_, _, _, _, _ = metric_calculations(valid_preds, y_valid, set_ = "Validation Set")

For Validation Set
  Accuracy: 49.66556366955728,
  Precision: 49.48231616180742,
  Recall: 56.3431355234397,
  Specificity: 43.054978440495084,
  F1score: 52.690327347916686


In [31]:
def preprocess_data_via_close_values(pickle_file_path, num_features=32, min_change=1e-5, shuffle=True, split=True, test_size=0.2, random_state=42):
  """Load a pickle of dataframes and turn it into feature/label arrays.

  The heavy lifting is delegated to helpers defined outside this cell
  (``extract_df_list1``, ``get_array_label_list1``, ``encode_label_array1``).
  This function concatenates their per-dataframe outputs, prints summary
  information, and optionally shuffles and/or splits the result.

  Args:
    pickle_file_path: path handed to ``extract_df_list1``.
    num_features: forwarded to ``get_array_label_list1``.
    min_change: forwarded to ``get_array_label_list1``; presumably the
      threshold separating 'no big change' from the other labels — TODO
      confirm against the helper.
    shuffle: when True, rows and labels are permuted together using NumPy's
      global random state (seed it with ``np.random.seed`` for repeatable
      order).
    split: when True, the data is split with ``train_test_split``.
    test_size: forwarded to ``train_test_split``.
    random_state: forwarded to ``train_test_split``.

  Returns:
    ``(x_train, x_test, y_train, y_test)`` when ``split`` is True, otherwise
    ``(data, encoded_labels)``.
  """
  dataframes_list = extract_df_list1(pickle_file_path)
  print("Columns of each dataframe", dataframes_list[0].columns)
  # All values are pre-normalized with respect to their columns
  array_list, labels_list = get_array_label_list1(dataframes_list, num_features, min_change)
  del dataframes_list
  print("Shape of each array in the list", array_list[0].shape)
  data_array = np.concatenate(array_list, axis=0)
  label_array = np.concatenate(labels_list, axis=0)
  # Drop the per-dataframe pieces as soon as the combined arrays exist.
  del array_list, labels_list
  print("Label Classes", np.unique(label_array))
  # np.count_nonzero counts in C; the builtin sum() walked the array element
  # by element in Python.
  print(
    "Data available for each label - increase, decrease and no big change",
    np.count_nonzero(label_array == 'increase'),
    np.count_nonzero(label_array == 'decrease'),
    np.count_nonzero(label_array == 'no big change'),
  )
  print("Combined array shape - ", data_array.shape)
  print("Combined labels shape - ", label_array.shape)
  label_encoded = encode_label_array1(label_array)
  del label_array
  if shuffle:
    # A single index permutation keeps rows and labels aligned. Rebinding the
    # names lets the unshuffled copies be freed.
    inds = np.random.permutation(data_array.shape[0])
    data_array = data_array[inds, :]
    label_encoded = label_encoded[inds]
    print("Shuffled data shape - ", data_array.shape)
  else:
    print("Data Array shape - ", data_array.shape)
  if split:
    # One split path for both cases. The unshuffled branch used to reference
    # data_shuff/label_shuff, which only exist when shuffle=True.
    x_train, x_test, y_train, y_test = train_test_split(data_array, label_encoded, test_size=test_size, random_state=random_state)
    return x_train, x_test, y_train, y_test
  return data_array, label_encoded

In [32]:
# Relative path to the held-out test pickle (reuses the `pickle_file_path` name).
pickle_file_path = 'testing_set1.pkl'
# With shuffle=False and split=False the call returns the unshuffled
# (data, encoded labels) pair. num_features is again the literal 10.
x_test, y_test = preprocess_data_via_close_values(pickle_file_path, num_features=10, shuffle=False, split=False)

Columns of each dataframe Index(['Open', 'High', 'Low', 'Close', 'Volume'], dtype='object')
Shape of each array in the list (878, 10)
Label Classes ['decrease' 'increase' 'no big change']
Data available for each label - increase, decrease and no big change 1050569 1044439 53458
Combined array shape -  (2148466, 10)
Combined labels shape -  (2148466,)
Data Array shape -  (2148466, 10)


In [33]:
# Wrap the test features in a DMatrix; no labels are attached.
xg_test = xgboost.DMatrix(x_test)

In [34]:
# Predict on the test matrix.
# NOTE(review): uses `models_predict` (not defined in the visible notebook)
# instead of the local `xg_models_predict` helper — confirm intended.
test_preds = models_predict(models_list, xg_test)

In [35]:
# Evaluate the test predictions; the five returned values are discarded.
_, _, _, _, _ = metric_calculations(test_preds, y_test, set_ = "Test Set")

For Test Set
  Accuracy: 49.16279801495579,
  Precision: 49.309503470327414,
  Recall: 59.80626068791791,
  Specificity: 38.51945423690631,
  F1score: 54.052996680871466
