In [44]:
# Dependencies
import numpy as np
import pandas as pd
import yfinance as yf
from collections import Counter
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.over_sampling import RandomOverSampler
from imblearn.metrics import classification_report_imbalanced

In [45]:
# Load the data
data = pd.read_csv('Resources/CMS_Clean.csv')
data.drop(['Volume', 'Close'], axis=1, inplace=True)
data.set_index('Date', inplace=True)
data

Unnamed: 0_level_0,Low,Open,High,Day Result
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2021-01-04,59.360001,60.639999,60.799999,0
2021-01-05,58.389999,59.709999,59.849998,0
2021-01-06,58.310001,58.529999,59.830002,1
2021-01-07,57.939999,59.880001,59.990002,0
2021-01-08,57.099998,57.889999,58.040001,0
...,...,...,...,...
2022-05-03,67.389999,67.980003,69.169998,0
2022-05-04,67.480003,67.529999,69.000000,1
2022-05-05,66.989998,68.209999,68.519997,0
2022-05-06,66.919998,67.199997,68.010002,1


In [46]:
# Encode the data
le = LabelEncoder()
data['Open'] = le.fit_transform(data['Open'])
data['High'] = le.fit_transform(data['High'])
data['Low'] = le.fit_transform(data['Low'])

# Show the data
data.head()



Unnamed: 0_level_0,Low,Open,High,Day Result
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2021-01-04,68,100,88,0
2021-01-05,50,65,57,0
2021-01-06,49,46,56,1
2021-01-07,42,71,60,0
2021-01-08,31,36,29,0


In [47]:
# Split data into training and testing sets
X = data.drop(['Day Result'], axis=1)
y = data['Day Result'].values

y[:5]

array([0, 0, 1, 0, 0], dtype=int64)

In [48]:
X.describe()

Unnamed: 0,Low,Open,High
count,340.0,340.0,340.0
mean,141.520588,146.811765,152.102941
std,79.339509,82.895854,86.196342
min,0.0,0.0,0.0
25%,75.75,78.75,79.75
50%,143.5,146.5,152.5
75%,206.25,216.25,226.0
max,283.0,296.0,303.0


In [49]:
Counter(y)

Counter({0: 169, 1: 171})

In [50]:
# Build the logistic regression model
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

Counter(y_train)

Counter({1: 134, 0: 121})

In [51]:
# Scale and normalize the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.fit_transform(X_test)

X_test_scaled[:5]

array([[-0.1698456 , -0.14639298, -0.29085494],
       [ 0.68864058,  0.51779741,  0.87177873],
       [-0.57047248, -0.4039362 , -0.62494508],
       [ 1.07495936,  1.11421327,  1.21923247],
       [ 1.51851056,  1.49375064,  1.51323179]])

In [52]:
# Naive Random Oversampling
ros = RandomOverSampler(random_state=0)
X_res, y_res = ros.fit_resample(X_train_scaled, y_train)

Counter(y_res)

Counter({1: 134, 0: 134})

In [53]:
# Train the logistic regression model using resampled data
logreg = LogisticRegression(solver='lbfgs', random_state=1)
logreg.fit(X_res, y_res)
y_pred = logreg.predict(X_test)
balanced_accuracy_score(y_test, y_pred)
results = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
results



Unnamed: 0,Actual,Predicted
0,0,0
1,1,1
2,0,0
3,1,0
4,1,0
...,...,...
80,1,1
81,0,0
82,0,0
83,1,0


In [54]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

array([[41,  7],
       [ 8, 29]], dtype=int64)

In [55]:
# Print the classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.84      0.85      0.78      0.85      0.82      0.67        48
          1       0.81      0.78      0.85      0.79      0.82      0.66        37

avg / total       0.82      0.82      0.81      0.82      0.82      0.67        85



In [56]:
# Pull the data from Yahoo Finance for the last 3 days
test_data = yf.download('CMS', '2022-05-10', '2022-05-13')
test_data.head()

[*********************100%***********************]  1 of 1 completed


Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2022-05-10,69.150002,70.199997,67.660004,68.279999,68.279999,1880100
2022-05-11,68.209999,69.720001,68.150002,68.580002,68.580002,1345400
2022-05-12,68.599998,69.050003,67.709999,68.580002,68.580002,2882600


In [57]:
# Format our data
cleaned_test_data = test_data.drop(['Volume', 'Close', 'Adj Close'], axis=1)

cleaned_test_data['Open'] = le.fit_transform(cleaned_test_data['Open'])
cleaned_test_data['High'] = le.fit_transform(cleaned_test_data['High'])
cleaned_test_data['Low'] = le.fit_transform(cleaned_test_data['Low'])

cleaned_scaled_test_data = scaler.fit_transform(cleaned_test_data)

test_pred = logreg.predict(cleaned_scaled_test_data)

In [58]:
# Print results
test_pred

array([0, 1, 1], dtype=int64)