In [1]:
# Dependencies
import numpy as np
import pandas as pd
import yfinance as yf
from collections import Counter
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.over_sampling import RandomOverSampler
from imblearn.metrics import classification_report_imbalanced

In [2]:
# Load the data
data = pd.read_csv('Resources/CMS_Clean.csv')
data.drop(['Volume', 'Close'], axis=1, inplace=True)
data.set_index('Date', inplace=True)
data

Unnamed: 0_level_0,Low,Open,High,Day Result
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2021-01-04,59.360001,60.639999,60.799999,0
2021-01-05,58.389999,59.709999,59.849998,0
2021-01-06,58.310001,58.529999,59.830002,1
2021-01-07,57.939999,59.880001,59.990002,0
2021-01-08,57.099998,57.889999,58.040001,0
...,...,...,...,...
2022-05-03,67.389999,67.980003,69.169998,0
2022-05-04,67.480003,67.529999,69.000000,1
2022-05-05,66.989998,68.209999,68.519997,0
2022-05-06,66.919998,67.199997,68.010002,1


In [3]:
# Split data into training and testing sets
X = data.drop(['Day Result'], axis=1)
y = data['Day Result'].values

y[:5]

array([0, 0, 1, 0, 0], dtype=int64)

In [4]:
X.describe()

Unnamed: 0,Low,Open,High
count,340.0,340.0,340.0
mean,61.820382,62.370941,62.922515
std,3.781135,3.797462,3.798931
min,53.189999,54.049999,54.790001
25%,59.642501,60.155,60.5825
50%,61.710001,62.220001,62.890001
75%,63.65,64.242498,64.75
max,72.57,72.959999,73.760002


In [5]:
Counter(y)

Counter({0: 169, 1: 171})

In [6]:
# Build the logistic regression model
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

Counter(y_train)

Counter({1: 134, 0: 121})

In [7]:
# Scale and normalize the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.fit_transform(X_test)

X_test_scaled[:5]

array([[-0.14858179, -0.11648318, -0.27801591],
       [ 0.51972811,  0.40310697,  0.5740686 ],
       [-0.45828493, -0.34420465, -0.52804786],
       [ 0.78379084,  0.7655365 ,  0.84384041],
       [ 1.35429844,  1.19211447,  1.33074692]])

In [8]:
# Naive Random Oversampling
ros = RandomOverSampler(random_state=0)
X_res, y_res = ros.fit_resample(X_train_scaled, y_train)

Counter(y_res)

Counter({1: 134, 0: 134})

In [9]:
# Train the logistic regression model using resampled data
logreg = LogisticRegression(solver='lbfgs', random_state=1)
logreg.fit(X_res, y_res)
y_pred = logreg.predict(X_test)
balanced_accuracy_score(y_test, y_pred)
results = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
results



Unnamed: 0,Actual,Predicted
0,0,0
1,1,0
2,0,0
3,1,0
4,1,0
...,...,...
80,1,0
81,0,0
82,0,0
83,1,0


In [10]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

array([[48,  0],
       [37,  0]], dtype=int64)

In [11]:
# Print the classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.56      1.00      0.00      0.72      0.00      0.00        48
          1       0.00      0.00      1.00      0.00      0.00      0.00        37

avg / total       0.32      0.56      0.44      0.41      0.00      0.00        85



  _warn_prf(average, modifier, msg_start, len(result))


In [12]:
# Pull the data from Yahoo Finance for the last 3 days
test_data = yf.download('CMS', '2022-05-10', '2022-05-13')
test_data.head()

[*********************100%***********************]  1 of 1 completed


Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2022-05-10,69.150002,70.199997,67.660004,68.279999,68.279999,1880100
2022-05-11,68.209999,69.720001,68.150002,68.580002,68.580002,1345400
2022-05-12,68.599998,69.050003,67.709999,68.580002,68.580002,2882600


In [14]:
# Format our data
cleaned_test_data = test_data.drop(['Volume', 'Close', 'Adj Close'], axis=1)

cleaned_scaled_test_data = scaler.fit_transform(cleaned_test_data)

test_pred = logreg.predict(cleaned_scaled_test_data)

In [15]:
# Print results
test_pred

array([0, 0, 1], dtype=int64)