In [2]:
from google.colab import drive
drive.mount('/content/drive')
import os
os.chdir('/content/drive/MyDrive/StockForecasting')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
!ls

Mining-1.ipynb	Mining-2.ipynb	Mining3.ipynb  testing_set1.pkl  testing_set2.pkl  training_set.pkl


In [4]:
import pandas as pd
import numpy as np
import pickle

In [60]:
with open('training_set.pkl', 'rb') as file:
  training_file = pickle.load(file)

In [61]:
type(training_file)

list

In [62]:
len(training_file)

2000

In [63]:
type(training_file[0])

pandas.core.frame.DataFrame

In [64]:
df_samp = training_file[0].copy()

In [65]:
df_samp.shape

(2202, 5)

In [66]:
df_samp

Unnamed: 0,Open,High,Low,Close,Volume
0,0.672106,0.679902,0.669265,0.671966,0.336899
1,0.672106,0.674597,0.670546,0.672106,0.208023
2,0.672106,0.678621,0.671827,0.676282,0.142405
3,0.676338,0.677340,0.672384,0.673498,0.146400
4,0.673442,0.676004,0.672997,0.675892,0.116960
...,...,...,...,...,...
2197,0.680849,0.683633,0.672997,0.674055,0.351048
2198,0.674389,0.674890,0.671326,0.674055,0.143069
2199,0.674110,0.676227,0.672440,0.676004,0.312048
2200,0.675725,0.675837,0.668820,0.669321,0.279230


In [67]:
df_samp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2202 entries, 0 to 2201
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Open    2202 non-null   float64
 1   High    2202 non-null   float64
 2   Low     2202 non-null   float64
 3   Close   2202 non-null   float64
 4   Volume  2202 non-null   float64
dtypes: float64(5)
memory usage: 86.1 KB


In [92]:
array_list = []
labels_list = []
num_features = 50
k = num_features+1
min_change = 1e-3
for df in training_file:
  df = df['Close'].values
  arr = np.zeros((len(df)-k, k-1))
  labels = np.full((len(df)-k, ), '', dtype=object)
  for i in range(len(df)-k):
    arr[i] = df[i:i+k-1]
    if np.abs(df[i+k]-df[i+k-1]) < min_change:
      labels[i] = 'no big change'
    elif df[i+k] > df[i+k-1]:
      labels[i] = 'increase'
    elif df[i+k] < df[i+k-1]:
      labels[i] = 'decrease'
  array_list.append(arr)
  labels_list.append(labels)

In [69]:
labels_list[0][0]

'increase'

In [70]:
array_list[0].shape

(2151, 50)

In [71]:
data_array = np.concatenate(array_list, axis = 0)
label_array = np.concatenate(labels_list, axis = 0)
# del(array_list)
# del(labels_list)
# del(training_file)

In [72]:
sum((label_array=='increase')*1), sum((label_array=='decrease')*1), sum((label_array=='no big change')*1)

(1594960, 1631397, 1075643)

In [19]:
data_array.shape

(4302000, 50)

In [20]:
label_array.shape

(4302000,)

In [22]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
label_encoded = le.fit_transform(label_array)

In [23]:
le.classes_

array(['decrease', 'increase', 'no big change'], dtype=object)

In [24]:
inds = list(range(data_array.shape[0]))

In [25]:
np.random.seed(42)
np.random.shuffle(inds)

In [26]:
data_shuff = data_array[inds, :]
label_shuff = label_encoded[inds]

In [27]:
data_shuff.shape

(4302000, 50)

In [28]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression

x_train, x_test, y_train, y_test = train_test_split(data_shuff, label_shuff, test_size=0.2, random_state=42)

In [29]:
lin_mdl = LogisticRegression(max_iter=2000)
lin_mdl.fit(x_train, y_train)

In [35]:
train_preds = lin_mdl.predict(x_train)

In [36]:
train_preds

array([1, 0, 0, ..., 0, 0, 0])

In [74]:
accu = sum((train_preds==y_train)*1)/len(y_train)*100
TP = sum((train_preds[train_preds==1] == y_train[train_preds==1])*1)
FP = sum((train_preds[train_preds==1] != y_train[train_preds==1])*1)
TN = sum((train_preds[train_preds!=1] == y_train[train_preds!=1])*1)
FN = sum((train_preds[train_preds!=1] != y_train[train_preds!=1])*1)
prec = TP/(TP + FP)*100
recall = TP/(TP+FN)*100
specificity = TN/(TN+FP)*100
F1score = 2*prec*recall/(prec+recall)
print("For Train Set")
print(f"""\
Accuracy: {accu},
Precision: {prec},
Recall: {recall},
Specificity: {specificity},
F1score: {F1score}""")

For Train Set
Accuracy: 38.84199209669921, 
Precision: 37.97380102450806, 
Recall: 28.17815696486591, 
Specificity: 50.34631848978923, 
F1score: 32.35071971695997


In [106]:
df_train_scores = pd.DataFrame({
    'Model': np.array(['Logistic Regression']),
    'Num of Features': np.array([num_features]),
    'min change' : np.array([min_change]),
    'increase count':np.array([sum((label_array=='increase')*1)]),
    'decrease count':np.array([sum((label_array=='decrease')*1)]),
    'no big change count':np.array([sum((label_array=='no big change')*1)]),
    'Accuracy' : np.array([accu]),
    'Precision' : np.array([prec]),
    'Recall' : np.array([recall]),
    'Specificity' : np.array([specificity]),
    'F1score' : np.array([F1score])
})

In [107]:
df_train_scores.columns

Index(['Model', 'Num of Features', 'min change', 'increase count',
       'decrease count', 'no big change count', 'Accuracy', 'Precision',
       'Recall', 'Specificity', 'F1score'],
      dtype='object')

In [108]:
df_train_scores.to_csv('train_scores_close_price.csv')

In [52]:
test_preds = lin_mdl.predict(x_test)

In [56]:
accu = sum((test_preds==y_test)*1)/len(y_test)*100
TP = sum((test_preds[test_preds==1] == y_test[test_preds==1])*1)
FP = sum((test_preds[test_preds==1] != y_test[test_preds==1])*1)
TN = sum((test_preds[test_preds!=1] == y_test[test_preds!=1])*1)
FN = sum((test_preds[test_preds!=1] != y_test[test_preds!=1])*1)
prec = TP/(TP + FP)*100
recall = TP/(TP+FN)*100
specificty = TN/(TN+FP)*100
F1score = 2*prec*recall/(prec+recall)
print("For Test Set")
print(f"""\
Accuracy: {accu},
Precision: {prec},
Recall: {recall},
Specificity: {specificty},
F1score: {F1score}""")

For Test Set
Accuracy: 38.87145513714552, 
Precision: 37.83695750520252, 
Recall: 28.17387242528498, 
Specificity: 50.346897674821975, 
F1score: 32.29814304213618
