In [15]:
# Plan: build a table of OHLC data, add price-independent indicator columns to it, and train a model
# that outputs either 1 or 0 (i.e. buy or sell). Also try a model that outputs the probability of the
# price going up (e.g. via predict_proba()), so that trades can be restricted to high-probability signals.

import yfinance as yf
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier



In [2]:
# Download the price history for one ticker from Yahoo Finance into a pandas DataFrame.
# The ticker symbol and date range are the only knobs for this cell.
ticker, start_date, end_date = 'AAPL', '2016-01-01', '2024-01-01'
data = yf.download(ticker, start=start_date, end=end_date)

[*********************100%%**********************]  1 of 1 completed


In [3]:
# Simple moving averages of the closing price over 20 and 50 rows (trading days).
# Each column is NaN until its window has filled.
close_price = data['Close']
data['20_day_MA'] = close_price.rolling(window=20).mean()
data['50_day_MA'] = close_price.rolling(window=50).mean()

In [4]:
# Convergence/divergence feature: the gap between the 20-day and 50-day moving averages,
# and the day-over-day change of that gap.
#
# The moving averages are NaN until their windows have filled. Those NaNs are deliberately left
# in place: replacing them with 0 would make 'difference' equal the raw 20-day MA while the
# 50-day MA is still unavailable, and would put a spurious jump into 'MACD' on the first row
# where the 50-day MA exists. NaN simply propagates through the subtraction instead.
data['difference'] = data['20_day_MA'] - data['50_day_MA']
# NOTE: despite the column name, this is the one-row change of the SMA gap, not the classic
# EMA-based MACD line. The name is kept because later cells select the column by it.
data['MACD'] = data['difference'].diff()


In [5]:
# RSI feature over a 14-row window, using plain (unsmoothed) averages of gains and losses.
close_change = data['Close'].diff()
# Split each close-to-close change into a gain (>= 0) and a loss (< 0, kept negative).
# The first row's change is NaN, fails both comparisons, and so becomes 0 in both columns.
data['Points_Gained'] = np.where(close_change >= 0, close_change, 0)
data['Points_Lost'] = np.where(close_change < 0, close_change, 0)
avg_gain = data['Points_Gained'].rolling(window=14).sum() / 14
avg_loss = abs(data['Points_Lost'].rolling(window=14).sum() / 14)
# 100 * (1 - loss / (loss + gain)) is algebraically 100 * gain / (gain + loss).
data['RSI'] = 100 * (1 - (avg_loss / (avg_loss + avg_gain)))

In [8]:
# Commodity Channel Index (CCI) feature:
#   CCI = (typical price - SMA of typical price) / (0.015 * mean deviation)
data['typical_price'] = (data['High'] + data['Low'] + data['Close'])/3
# Look-back window for both the SMA and the mean deviation; 50 rows for now, open to tuning.
period = 50
typical_price_windows = data['typical_price'].rolling(window = period)
data['avg_cci'] = typical_price_windows.mean()
# Distance of each row's typical price from that row's SMA. It is no longer an input to
# 'md_cci'; the column is kept so anything that reads it keeps working.
data['values_for_cci'] = abs(data['typical_price'] - data['avg_cci'])
# Mean deviation: within each window, average |typical price - SMA of that same window|.
# A rolling mean of 'values_for_cci' would instead mix deviations measured against a different
# SMA for every row in the window, which is not the CCI definition and also needs
# 2*period - 1 rows of warm-up instead of period.
data['md_cci'] = typical_price_windows.apply(
    lambda window_values: np.abs(window_values - window_values.mean()).mean(),
    raw = True,
)
# A window with zero mean deviation (completely flat prices) yields inf/NaN here.
data['CCI'] = (data['typical_price'] - data['avg_cci'])/(0.015*data['md_cci'])


In [11]:
# Williams %R feature over a 14-row look-back: where the close sits between the window's
# highest high and lowest low, scaled to 0 (close at the high) .. -100 (close at the low).
lookback = 14
data['highest_high'] = data['High'].rolling(window=lookback).max()
data['lowest_low'] = data['Low'].rolling(window=lookback).min()
distance_from_high = data['highest_high'] - data['Close']
window_range = data['highest_high'] - data['lowest_low']
data['williams'] = (distance_from_high / window_range) * (-100)

In [12]:
# Inspect the Williams %R column. The first 13 rows are NaN because the 14-row rolling
# windows for highest_high / lowest_low have not filled yet.
data['williams']

Date
2016-01-04          NaN
2016-01-05          NaN
2016-01-06          NaN
2016-01-07          NaN
2016-01-08          NaN
                ...    
2023-12-22   -63.771054
2023-12-26   -80.121884
2023-12-27   -75.849967
2023-12-28   -70.808841
2023-12-29   -83.118375
Name: williams, Length: 2012, dtype: float64

In [13]:
# Target construction: 'final_difference' is the next row's close minus this row's close,
# and 'output' is 1 when that change is >= 0 (an unchanged close counts as "up"), else 0.
next_close = data['Close'].shift(-1)
data['final_difference'] = next_close - data['Close']
# NOTE: the last row has no next close, so its final_difference is NaN; NaN >= 0 is False,
# so that row is labelled 0 even though its real outcome is unknown.
data['output'] = (data['final_difference'] >= 0).astype(int)
data[['final_difference','output']]

Unnamed: 0_level_0,final_difference,output
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2016-01-04,-0.660000,0
2016-01-05,-0.502501,0
2016-01-06,-1.062500,0
2016-01-07,0.127501,1
2016-01-08,0.392500,1
...,...,...
2023-12-22,-0.550003,0
2023-12-26,0.099991,1
2023-12-27,0.430008,1
2023-12-28,-1.050003,0


In [14]:
# Chronological (unshuffled) split by row position into train / validation / test.
# The first 100 rows are skipped -- presumably to stay clear of the rolling-window warm-up
# NaNs in the features (TODO confirm) -- and rows from position 2001 onward are not used.
feature_columns = ['MACD','RSI','CCI','williams']
features = data[feature_columns]
labels = data['output']

train_rows = slice(100, 1201)
validation_rows = slice(1201, 1601)
test_rows = slice(1601, 2001)

X_train, y_train = features.iloc[train_rows], labels.iloc[train_rows]
X_validation, y_validation = features.iloc[validation_rows], labels.iloc[validation_rows]
X_test, y_test = features.iloc[test_rows], labels.iloc[test_rows]

In [16]:
# Random forest classifier for the up/down label. random_state is fixed at 42 so the fit is
# reproducible; no other hyperparameter is set here, so the library defaults apply.
clf = RandomForestClassifier(random_state = 42)
# Fit on the training split only; the validation and test splits are left untouched here.
clf.fit(X_train,y_train)