In [15]:
import yfinance as yf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix


In [16]:
data = yf.download("AAPL", start="2020-01-01", end="2024-12-31")
data.shape



[*********************100%***********************]  1 of 1 completed


(1257, 5)

In [17]:
data.head()

Price,Close,High,Low,Open,Volume
Ticker,AAPL,AAPL,AAPL,AAPL,AAPL
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
2020-01-02,72.620827,72.681274,71.373203,71.627077,135480400
2020-01-03,71.91481,72.676439,71.68995,71.84711,146322800
2020-01-06,72.487854,72.526541,70.783256,71.034717,118387200
2020-01-07,72.146935,72.753816,71.926907,72.497522,108872000
2020-01-08,73.307526,73.60976,71.849548,71.849548,132079200


In [18]:
data['Return'] = data['Close'].pct_change()
data['MA5'] = data['Close'].rolling(window=5).mean()
data['MA10'] = data['Close'].rolling(window=10).mean()
data['Volatility'] = data['Close'].rolling(window=5).std()
data['Target'] = np.where(data['Close'].shift(-1) > data['Close'], 1, 0)  # Up = 1, Down = 0
data = data.dropna()
data.sample(10)


Price,Close,High,Low,Open,Volume,Return,MA5,MA10,Volatility,Target
Ticker,AAPL,AAPL,AAPL,AAPL,AAPL,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
2020-02-03,74.630081,75.797911,73.07297,73.575885,173788400,-0.002746,76.601617,76.521591,1.821567,1
2022-09-23,148.260361,149.285369,146.417336,149.009409,96029900,-0.015124,151.43786,152.165218,2.335041,1
2023-08-03,189.274902,190.463004,188.799665,189.670946,61235200,-0.007322,192.401614,191.846179,2.291738,0
2021-07-12,141.410446,143.19154,140.921137,143.083891,76299700,-0.004203,140.81152,137.47052,1.22251,1
2023-12-26,191.646561,192.48045,191.428159,192.202487,28919300,-0.002841,193.205139,194.311037,1.483744,1
2020-12-22,128.649155,131.117172,126.473775,128.385765,168904800,0.028464,125.504163,122.949321,1.906422,0
2022-11-25,146.216003,146.976161,145.238658,146.413443,35195900,-0.019593,147.817242,147.705687,1.562039,0
2023-05-26,173.690964,174.027605,171.39397,171.601895,54835000,0.014105,171.487024,171.588014,1.608407,1
2024-03-07,167.985474,169.705084,167.478541,168.134567,71765100,-0.000709,171.563861,176.084558,4.644069,1
2020-03-13,67.369125,67.841733,61.305249,64.199045,370732000,0.119808,65.589722,68.490304,3.459381,0


In [19]:
data.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 1248 entries, 2020-01-15 to 2024-12-30
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   (Close, AAPL)   1248 non-null   float64
 1   (High, AAPL)    1248 non-null   float64
 2   (Low, AAPL)     1248 non-null   float64
 3   (Open, AAPL)    1248 non-null   float64
 4   (Volume, AAPL)  1248 non-null   int64  
 5   (Return, )      1248 non-null   float64
 6   (MA5, )         1248 non-null   float64
 7   (MA10, )        1248 non-null   float64
 8   (Volatility, )  1248 non-null   float64
 9   (Target, )      1248 non-null   int64  
dtypes: float64(8), int64(2)
memory usage: 107.2 KB


In [20]:
features = ['Return', 'MA5', 'MA10', 'Volatility']
X = data[features]
y = data['Target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


In [21]:
X_train.sample(7)

Price,Return,MA5,MA10,Volatility
Ticker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
2021-04-14,-0.017853,129.165779,125.658494,1.540703
2024-04-25,0.005147,166.325497,168.016284,2.063813
2021-11-03,0.009798,147.569476,146.799124,1.414671
2021-03-24,-0.019994,118.513005,119.565189,1.519674
2023-10-05,0.007198,171.700757,171.504451,1.405351
2023-06-05,-0.007571,177.259225,174.373125,1.662179
2022-12-15,-0.046854,140.545456,141.444797,3.466192


In [22]:
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)


In [23]:
y_pred[:5]

array([0, 1, 0, 1, 1])

In [24]:
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


Accuracy: 0.472
Confusion Matrix:
 [[ 66 122]
 [ 76 111]]
