In [574]:
import pandas as np
import yfinance as yf
from datetime import datetime, timedelta
from sklearn.preprocessing import OrdinalEncoder
"""
Note: Never train your model on the entire data set
You need to split the data into test data and validation data to make sure that
you're able to train the model but also make predictions with it based on data
from the dataset its never seen before.
"""

# This will split your data set in the way we need for training.
from sklearn.model_selection import train_test_split

# For predictions
from sklearn.linear_model import LogisticRegression

# Will be used to get some stats on the model's predictions
from sklearn.metrics import classification_report

In [575]:
# Get s&p500 data from the last years.  |  https://finance.yahoo.com/quote/%5EGSPC/

date = datetime.now().date() - timedelta(days=1)
year_now = date.year

minus_five_years = year_now - 1

date_today = str(date)

five_years_ago = (str(minus_five_years) + (date_today[4:]))
print(five_years_ago)

data = yf.download("^GSPC",start=five_years_ago, end=date_today )

  data = yf.download("^GSPC",start=five_years_ago, end=date_today )
[*********************100%***********************]  1 of 1 completed

2024-11-28





In [576]:
data

Price,Close,High,Low,Open,Volume
Ticker,^GSPC,^GSPC,^GSPC,^GSPC,^GSPC
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
2024-11-29,6032.379883,6044.169922,6003.979980,6003.979980,2444420000
2024-12-02,6047.149902,6053.580078,6035.330078,6040.109863,4412470000
2024-12-03,6049.879883,6052.069824,6033.390137,6042.970215,4095000000
2024-12-04,6086.490234,6089.839844,6061.060059,6069.390137,4003390000
2024-12-05,6075.109863,6094.549805,6072.899902,6089.029785,4212020000
...,...,...,...,...,...
2025-11-20,6538.759766,6770.350098,6534.049805,6737.930176,5596080000
2025-11-21,6602.990234,6660.049805,6521.919922,6555.770020,5929930000
2025-11-24,6705.120117,6715.750000,6630.700195,6636.540039,6039740000
2025-11-25,6765.879883,6776.399902,6659.979980,6697.029785,5003330000


In [577]:
# Using price close data to create a column "daily returns"
df = data['Close'].pct_change() * 100
df = df.reset_index()

In [578]:
for i in range(1,6):
  df['Lag ' +str(i)] = df['^GSPC'].shift(i)

In [579]:
df['Volume'] = data.Volume.shift(1).values/1000_000_000

In [580]:
df = df.dropna()

In [581]:
# Direction | 1 for up, 0 for down
df['Direction'] = [1.0 if i > 0 else 0.0 for i in df['^GSPC']]
df

Ticker,Date,^GSPC,Lag 1,Lag 2,Lag 3,Lag 4,Lag 5,Volume,Direction
6,2024-12-09,-0.614421,0.249545,-0.186978,0.605142,0.045145,0.244846,3.92483,0.0
7,2024-12-10,-0.296388,-0.614421,0.249545,-0.186978,0.605142,0.045145,4.55646,0.0
8,2024-12-11,0.816579,-0.296388,-0.614421,0.249545,-0.186978,0.605142,4.04841,1.0
9,2024-12-12,-0.541402,0.816579,-0.296388,-0.614421,0.249545,-0.186978,4.26995,0.0
10,2024-12-13,-0.002647,-0.541402,0.816579,-0.296388,-0.614421,0.249545,3.67801,0.0
...,...,...,...,...,...,...,...,...,...
244,2025-11-20,-1.556728,0.375384,-0.825644,-0.916227,-0.050173,-1.655685,5.02161,0.0
245,2025-11-21,0.982304,-1.556728,0.375384,-0.825644,-0.916227,-0.050173,5.59608,1.0
246,2025-11-24,1.546722,0.982304,-1.556728,0.375384,-0.825644,-0.916227,5.92993,1.0
247,2025-11-25,0.906170,1.546722,0.982304,-1.556728,0.375384,-0.825644,6.03974,1.0


In [582]:
# We're using multiple cols to predict Y so we extract a part of the dataframe
X_VARS = df[['Lag 1', 'Lag 2']]
# The column we're going to be predicting
Y_VAR = df.iloc[:, 8]

# Splitting up test and training variables | 80/20
# Basically makes four separate data frames. 2 of which are for training and 2 of which are for testing.
x_training, x_testing, y_training, y_testing = train_test_split(X_VARS, Y_VAR, train_size=0.8, random_state=12)

In [583]:
# Initialize Logistic Regression Model

log = LogisticRegression()

# Give the model the training data:
log.fit(x_training, y_training)

In [584]:
summary = log.predict(x_testing)

print(x_testing)

log.score(x_testing, y_testing)

Ticker     Lag 1     Lag 2
157    -0.009049  0.537377
37     -0.285513  0.531349
54      0.237697  0.244499
67     -2.697309  0.552062
244     0.375384 -0.825644
236     0.126184 -1.117819
107    -0.769324 -0.638160
153     0.140741 -0.329752
200     0.478912 -0.097017
112     0.724820  3.255878
188    -0.692228 -0.639818
17      0.728727  1.086912
247     1.546722  0.982304
235    -1.117819  0.365356
104     0.629910  0.147999
69      0.488687 -0.756787
100     0.737313  2.025911
13     -0.386399  0.379935
207     0.263554  0.590184
126     0.580029  0.410204
180    -0.243167 -0.585810
205    -0.500906 -0.284662
89      9.515388 -1.570052
192     0.210598 -0.316515
27      0.156036 -1.110391
170     0.728191 -0.485787
98      1.666114  2.511720
195     0.848589  0.298347
12      0.379935 -0.002647
93     -0.172778  0.794471
220    -0.629439  0.402600
18      1.104276  0.728727
190     0.834649  0.510007
204    -0.284662 -0.550216
84      0.672819  0.378124
58     -0.467973 -0.496911
2

0.5306122448979592

In [585]:
print(classification_report(summary, y_testing))

              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00         1
         1.0       0.96      0.54      0.69        48

    accuracy                           0.53        49
   macro avg       0.48      0.27      0.35        49
weighted avg       0.94      0.53      0.68        49

