In [107]:
from google.colab import drive
drive.mount('/content/drive')
import os
os.chdir('/content/drive/MyDrive/StockForecasting')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [67]:
!ls

Mining-1.ipynb	Mining3Logisticwith50features.ipynb  testing_set2.pkl  train_scores_close_price.csv
Mining-2.ipynb	testing_set1.pkl		     training_set.pkl


In [3]:
import pandas as pd
import numpy as np
import pickle

In [4]:
with open('training_set.pkl', 'rb') as file:
  training_file = pickle.load(file)

In [48]:
array_list = []
labels_list = []
num_features = 10
k = num_features+1
min_change = 1e-3
for df in training_file:
  df = df['Close'].values
  arr = np.zeros((len(df)-k, k-1))
  labels = np.full((len(df)-k, ), '', dtype=object)
  for i in range(len(df)-k):
    arr[i] = df[i:i+k-1]
    if np.abs(df[i+k]-df[i+k-1]) < min_change:
      labels[i] = 'no big change'
    elif df[i+k] > df[i+k-1]:
      labels[i] = 'increase'
    elif df[i+k] < df[i+k-1]:
      labels[i] = 'decrease'
  array_list.append(arr)
  labels_list.append(labels)

In [16]:
labels_list[0][0]

'no big change'

In [17]:
array_list[0].shape

(2191, 10)

In [19]:
data_array = np.concatenate(array_list, axis = 0)
label_array = np.concatenate(labels_list, axis = 0)

In [40]:
sum((label_array=='increase')*1), sum((label_array=='decrease')*1), sum((label_array=='no big change')*1)

(1629427, 1657098, 1095475)

In [25]:
data_array.shape

(4382000, 10)

In [26]:
label_array.shape

(4382000,)

In [27]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
label_encoded = le.fit_transform(label_array)

In [28]:
le.classes_

array(['decrease', 'increase', 'no big change'], dtype=object)

In [29]:
inds = list(range(data_array.shape[0]))

In [30]:
np.random.seed(42)
np.random.shuffle(inds)

In [31]:
data_shuff = data_array[inds, :]
label_shuff = label_encoded[inds]

In [32]:
data_shuff.shape

(4382000, 10)

In [33]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression

x_train, x_test, y_train, y_test = train_test_split(data_shuff, label_shuff, test_size=0.2, random_state=42)

In [34]:
lin_mdl = LogisticRegression(max_iter=2000)
lin_mdl.fit(x_train, y_train)

In [35]:
train_preds = lin_mdl.predict(x_train)

In [36]:
train_preds

array([0, 1, 0, ..., 0, 0, 0])

In [41]:
accu = sum((train_preds==y_train)*1)/len(y_train)*100
TP = sum((train_preds[train_preds==1] == y_train[train_preds==1])*1)
FP = sum((train_preds[train_preds==1] != y_train[train_preds==1])*1)
TN = sum((train_preds[train_preds!=1] == y_train[train_preds!=1])*1)
FN = sum((train_preds[train_preds!=1] != y_train[train_preds!=1])*1)
prec = TP/(TP + FP)*100
recall = TP/(TP+FN)*100
specificity = TN/(TN+FP)*100
F1score = 2*prec*recall/(prec+recall)
print("For Train Set")
print(f"""\
Accuracy: {accu},
Precision: {prec},
Recall: {recall},
Specificity: {specificity},
F1score: {F1score}""")

For Train Set
Accuracy: 37.79952076677316,
Precision: 34.121530065868484,
Recall: 19.43298000412666,
Specificity: 58.2413389678187,
F1score: 24.762919523064884


In [96]:
df_train_scores = pd.DataFrame({
    'Model': np.array(['Logistic Regression']),
    'Num of Features': np.array([num_features]),
    'min change' : np.array([min_change]),
    'increase count':np.array([sum((label_array=='increase')*1)]),
    'decrease count':np.array([sum((label_array=='decrease')*1)]),
    'no big change count':np.array([sum((label_array=='no big change')*1)]),
    'Accuracy' : np.array([accu]),
    'Precision' : np.array([prec]),
    'Recall' : np.array([recall]),
    'Specificity' : np.array([specificity]),
    'F1score' : np.array([F1score])
})

In [114]:
df_train_csv = pd.read_csv('train_scores_close_price.csv')

In [113]:
df_train_scores = pd.concat([df_train_scores, df_train_csv.iloc[:, 1:]], axis=0)
df_train_scores.to_csv('train_scores_close_price.csv', index=True)

In [115]:
df_train_csv

Unnamed: 0.1,Unnamed: 0,Model,Num of Features,min change,increase count,decrease count,no big change count,Accuracy,Precision,Recall,Specificity,F1score
0,0,Logistic Regression,10,0.001,1629427,1657098,1095475,37.799521,34.12153,19.43298,58.241339,24.76292
1,0,Logistic Regression,50,0.001,1594960,1631397,1075643,38.841992,37.973801,28.178157,50.346318,32.35072


In [38]:
test_preds = lin_mdl.predict(x_test)

In [39]:
accu = sum((test_preds==y_test)*1)/len(y_test)*100
TP = sum((test_preds[test_preds==1] == y_test[test_preds==1])*1)
FP = sum((test_preds[test_preds==1] != y_test[test_preds==1])*1)
TN = sum((test_preds[test_preds!=1] == y_test[test_preds!=1])*1)
FN = sum((test_preds[test_preds!=1] != y_test[test_preds!=1])*1)
prec = TP/(TP + FP)*100
recall = TP/(TP+FN)*100
specificty = TN/(TN+FP)*100
F1score = 2*prec*recall/(prec+recall)
print("For Test Set")
print(f"""\
Accuracy: {accu},
Precision: {prec},
Recall: {recall},
Specificity: {specificty},
F1score: {F1score}""")

For Test Set
Accuracy: 37.853149246919216,
Precision: 34.21582152575057,
Recall: 19.441166344390336,
Specificity: 58.36302194263606,
F1score: 24.79436525590945
