In [26]:
import os
import sys
import requests

import numpy as np
import pandas as pd
import yfinance as yf
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
# from sklearn.metrics import confusion_matrix # , plot_confusion_matrix
# from sklearn.metrics import plot_roc_curve
from sklearn.metrics import accuracy_score, classification_report

In [27]:

def grab_price_data():
    """
    Grabs the daily price history for five tickers using the yfinance library 
    and stores the data in a CSV file in the specified format.
    
    Format: close,datetime,high,low,open,symbol,volume
    """

    # Define the list of tickers
    tickers_list = ['AAPL', 'MSFT', 'AMZN', 'KO', 'AXP', 'BA', 'CAT', 'COST', 'DECK', 'FTNT', 'HD', 'LDOS', 'RL', 'BRK/B']

    # Store multiple result sets
    full_price_history = []

    # Fetch the data for all tickers
    for ticker in tickers_list:
        # Grab the daily price history for 1 year
        price_history = yf.download(ticker, start='2022-01-01')

        # Add the required data to the full list in the specified format
        for index, row in price_history.iterrows():
            row_data = {
                'close': row['Close'],
                'datetime': index,
                'high': row['High'],
                'low': row['Low'],
                'open': row['Open'],
                'symbol': ticker,
                'volume': row['Volume']
            }
            full_price_history.append(row_data)

    # Convert the list to a DataFrame
    price_data = pd.DataFrame(full_price_history)

    # Reorder the columns
    price_data = price_data[['close', 'datetime', 'high', 'low', 'open', 'symbol', 'volume']]

    # Dump the data to a CSV file, without an index column
    price_data.to_csv('price_data.csv', index=False)

# Example usage
grab_price_data()


[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed


In [28]:
if os.path.exists('price_data.csv'):

    # Load the data
    price_data = pd.read_csv('price_data.csv')

else:

    # Grab the data and store it.
    grab_price_data()

    # Load the data
    price_data = pd.read_csv('price_data.csv')

# Display the head before moving on.
price_data.head()

Unnamed: 0,close,datetime,high,low,open,symbol,volume
0,182.009995,2022-01-03,182.880005,177.710007,177.830002,AAPL,104487900.0
1,179.699997,2022-01-04,182.940002,179.119995,182.630005,AAPL,99310400.0
2,174.919998,2022-01-05,180.169998,174.639999,179.610001,AAPL,94537600.0
3,172.0,2022-01-06,175.300003,171.639999,172.699997,AAPL,96904000.0
4,172.169998,2022-01-07,174.139999,171.029999,172.889999,AAPL,86709100.0


In [29]:
# I Just need the Close
price_data = price_data[['symbol','datetime','close','high','low','open','volume']]


'''
    First, for average investors, the return of an asset is a complete and scale–free 
    summary of the investment opportunity. Second, return series are easier to 
    handle than prices series as they have more attractive statistical properties
'''


# sort the values by symbol and then date
price_data.sort_values(by = ['symbol','datetime'], inplace = True)

# calculate the change in price
price_data['change_in_price'] = price_data['close'].diff()


In [30]:
# identify rows where the symbol changes
mask = price_data['symbol'] != price_data['symbol'].shift(1)

# For those rows, let's make the value null
price_data['change_in_price'] = np.where(mask == True, np.nan, price_data['change_in_price'])

# print the rows that have a null value, should only be 5
price_data[price_data.isna().any(axis = 1)]


Unnamed: 0,symbol,datetime,close,high,low,open,volume,change_in_price
0,AAPL,2022-01-03,182.009995,182.880005,177.710007,177.830002,104487900.0,
1338,AMZN,2022-01-03,170.404495,170.703506,166.160507,167.550003,63520000.0,
2676,AXP,2022-01-03,168.210007,168.399994,164.399994,164.509995,3236400.0,
3345,BA,2022-01-03,207.860001,210.550003,203.339996,204.0,9060200.0,
4014,CAT,2022-01-03,207.0,208.600006,205.800003,207.330002,2055600.0,
4683,COST,2022-01-03,566.710022,567.469971,555.51001,565.030029,2714100.0,
5352,DECK,2022-01-03,367.940002,372.910004,365.26001,367.23999,247100.0,
6021,FTNT,2022-01-03,66.624001,72.424004,66.487999,72.424004,9515000.0,
6690,HD,2022-01-03,408.640015,417.839996,403.26001,416.570007,3715700.0,
2007,KO,2022-01-03,59.299999,59.310001,58.380001,58.82,20187300.0,


In [31]:
# define the number of days out you want to predict
days_out = 30

# Group by symbol, then apply the rolling function and grab the Min and Max.
price_data_smoothed = price_data.groupby(['symbol'])[['close','low','high','open','volume']].transform(lambda x: x.ewm(span = days_out).mean())

# Join the smoothed columns with the symbol and datetime column from the old data frame.
smoothed_df = pd.concat([price_data[['symbol','datetime']], price_data_smoothed], axis=1, sort=False)

smoothed_df

Unnamed: 0,symbol,datetime,close,low,high,open,volume
0,AAPL,2022-01-03,182.009995,177.710007,182.880005,177.830002,1.044879e+08
1,AAPL,2022-01-04,180.816496,178.438501,182.911004,180.310003,1.018129e+08
2,AAPL,2022-01-05,178.718556,177.087016,181.935770,180.060947,9.922436e+07
3,AAPL,2022-01-06,176.867349,175.586165,180.107374,178.032737,9.858502e+07
4,AAPL,2022-01-07,175.798588,174.549527,178.749652,176.862639,9.588296e+07
...,...,...,...,...,...,...,...
8692,RL,2024-08-26,168.059318,165.684726,170.456758,168.201789,8.257994e+05
8693,RL,2024-08-27,168.342588,165.925066,170.594386,168.343609,7.983284e+05
8694,RL,2024-08-28,168.415324,166.015062,170.614103,168.464022,7.810556e+05
8695,RL,2024-08-29,168.560787,166.158607,170.716419,168.588278,7.592198e+05


In [32]:
# define the number of days out you want to predict
days_out = 30

# create a new column that will house the flag, and for each group calculate the diff compared to 30 days ago. Then use Numpy to define the sign.
smoothed_df['Signal_Flag'] = smoothed_df.groupby('symbol')['close'].transform(lambda x : np.sign(x.diff(days_out)))

# print the first 50 rows
smoothed_df.head(50)

Unnamed: 0,symbol,datetime,close,low,high,open,volume,Signal_Flag
0,AAPL,2022-01-03,182.009995,177.710007,182.880005,177.830002,104487900.0,
1,AAPL,2022-01-04,180.816496,178.438501,182.911004,180.310003,101812900.0,
2,AAPL,2022-01-05,178.718556,177.087016,181.93577,180.060947,99224360.0,
3,AAPL,2022-01-06,176.867349,175.586165,180.107374,178.032737,98585020.0,
4,AAPL,2022-01-07,175.798588,174.549527,178.749652,176.862639,95882960.0,
5,AAPL,2022-01-10,175.092625,173.301472,177.527006,175.340089,98011980.0,
6,AAPL,2022-01-11,175.090442,172.872287,177.121075,174.817746,94228780.0,
7,AAPL,2022-01-12,175.159029,173.176201,177.130268,175.020944,91198010.0,
8,AAPL,2022-01-13,174.734598,172.978039,177.057323,175.129453,90241340.0,
9,AAPL,2022-01-14,174.513947,172.727768,176.622895,174.627139,88942220.0,


In [33]:
# Calculate the 14 day RSI
n = 14

# First make a copy of the data frame twice
up_df, down_df = price_data[['symbol','change_in_price']].copy(), price_data[['symbol','change_in_price']].copy()

# For up days, if the change is less than 0 set to 0.
up_df.loc['change_in_price'] = up_df.loc[(up_df['change_in_price'] < 0), 'change_in_price'] = 0

# For down days, if the change is greater than 0 set to 0.
down_df.loc['change_in_price'] = down_df.loc[(down_df['change_in_price'] > 0), 'change_in_price'] = 0

# We need change in price to be absolute.
down_df['change_in_price'] = down_df['change_in_price'].abs()

# Calculate the EWMA (Exponential Weighted Moving Average), meaning older values are given less weight compared to newer values.
ewma_up = up_df.groupby('symbol')['change_in_price'].transform(lambda x: x.ewm(span = n).mean())
ewma_down = down_df.groupby('symbol')['change_in_price'].transform(lambda x: x.ewm(span = n).mean())

# Calculate the Relative Strength
relative_strength = ewma_up / ewma_down

# Calculate the Relative Strength Index
relative_strength_index = 100.0 - (100.0 / (1.0 + relative_strength))

# Add the info to the data frame.
price_data['down_days'] = down_df['change_in_price']
price_data['up_days'] = up_df['change_in_price']
price_data['RSI'] = relative_strength_index

# Display the head.
price_data.head(30)

Unnamed: 0,symbol,datetime,close,high,low,open,volume,change_in_price,down_days,up_days,RSI
0,AAPL,2022-01-03,182.009995,182.880005,177.710007,177.830002,104487900.0,,,,
1,AAPL,2022-01-04,179.699997,182.940002,179.119995,182.630005,99310400.0,-2.309998,2.309998,0.0,0.0
2,AAPL,2022-01-05,174.919998,180.169998,174.639999,179.610001,94537600.0,-4.779999,4.779999,0.0,0.0
3,AAPL,2022-01-06,172.0,175.300003,171.639999,172.699997,96904000.0,-2.919998,2.919998,0.0,0.0
4,AAPL,2022-01-07,172.169998,174.139999,171.029999,172.889999,86709100.0,0.169998,0.0,0.169998,2.180947
5,AAPL,2022-01-10,172.190002,172.5,168.169998,169.080002,106765600.0,0.020004,0.0,0.020004,2.469756
6,AAPL,2022-01-11,175.080002,175.179993,170.820007,172.320007,76138300.0,2.889999,0.0,2.889999,34.638407
7,AAPL,2022-01-12,175.529999,177.179993,174.820007,176.119995,74805200.0,0.449997,0.0,0.449997,38.294976
8,AAPL,2022-01-13,172.190002,176.619995,171.789993,175.779999,84505800.0,-3.339996,3.339996,0.0,25.890548
9,AAPL,2022-01-14,173.070007,173.779999,171.089996,171.339996,80440800.0,0.880005,0.0,0.880005,32.534183


In [34]:
# Calculate the Stochastic Oscillator
n = 14

# Make a copy of the high and low column.
low_14, high_14 = price_data[['symbol','low']].copy(), price_data[['symbol','high']].copy()

# Group by symbol, then apply the rolling function and grab the Min and Max.
low_14 = low_14.groupby('symbol')['low'].transform(lambda x: x.rolling(window = n).min())
high_14 = high_14.groupby('symbol')['high'].transform(lambda x: x.rolling(window = n).max())

# Calculate the Stochastic Oscillator.
k_percent = 100 * ((price_data['close'] - low_14) / (high_14 - low_14))

# Add the info to the data frame.
price_data['low_14'] = low_14
price_data['high_14'] = high_14
price_data['k_percent'] = k_percent

# Display the head.
price_data.head(30)

Unnamed: 0,symbol,datetime,close,high,low,open,volume,change_in_price,down_days,up_days,RSI,low_14,high_14,k_percent
0,AAPL,2022-01-03,182.009995,182.880005,177.710007,177.830002,104487900.0,,,,,,,
1,AAPL,2022-01-04,179.699997,182.940002,179.119995,182.630005,99310400.0,-2.309998,2.309998,0.0,0.0,,,
2,AAPL,2022-01-05,174.919998,180.169998,174.639999,179.610001,94537600.0,-4.779999,4.779999,0.0,0.0,,,
3,AAPL,2022-01-06,172.0,175.300003,171.639999,172.699997,96904000.0,-2.919998,2.919998,0.0,0.0,,,
4,AAPL,2022-01-07,172.169998,174.139999,171.029999,172.889999,86709100.0,0.169998,0.0,0.169998,2.180947,,,
5,AAPL,2022-01-10,172.190002,172.5,168.169998,169.080002,106765600.0,0.020004,0.0,0.020004,2.469756,,,
6,AAPL,2022-01-11,175.080002,175.179993,170.820007,172.320007,76138300.0,2.889999,0.0,2.889999,34.638407,,,
7,AAPL,2022-01-12,175.529999,177.179993,174.820007,176.119995,74805200.0,0.449997,0.0,0.449997,38.294976,,,
8,AAPL,2022-01-13,172.190002,176.619995,171.789993,175.779999,84505800.0,-3.339996,3.339996,0.0,25.890548,,,
9,AAPL,2022-01-14,173.070007,173.779999,171.089996,171.339996,80440800.0,0.880005,0.0,0.880005,32.534183,,,


In [35]:
# Calculate the Williams %R
n = 14

# Make a copy of the high and low column.
low_14, high_14 = price_data[['symbol','low']].copy(), price_data[['symbol','high']].copy()

# Group by symbol, then apply the rolling function and grab the Min and Max.
low_14 = low_14.groupby('symbol')['low'].transform(lambda x: x.rolling(window = n).min())
high_14 = high_14.groupby('symbol')['high'].transform(lambda x: x.rolling(window = n).max())

# Calculate William %R indicator.
r_percent = ((high_14 - price_data['close']) / (high_14 - low_14)) * - 100

# Add the info to the data frame.
price_data['r_percent'] = r_percent

# Display the head.
price_data.head(30)

Unnamed: 0,symbol,datetime,close,high,low,open,volume,change_in_price,down_days,up_days,RSI,low_14,high_14,k_percent,r_percent
0,AAPL,2022-01-03,182.009995,182.880005,177.710007,177.830002,104487900.0,,,,,,,,
1,AAPL,2022-01-04,179.699997,182.940002,179.119995,182.630005,99310400.0,-2.309998,2.309998,0.0,0.0,,,,
2,AAPL,2022-01-05,174.919998,180.169998,174.639999,179.610001,94537600.0,-4.779999,4.779999,0.0,0.0,,,,
3,AAPL,2022-01-06,172.0,175.300003,171.639999,172.699997,96904000.0,-2.919998,2.919998,0.0,0.0,,,,
4,AAPL,2022-01-07,172.169998,174.139999,171.029999,172.889999,86709100.0,0.169998,0.0,0.169998,2.180947,,,,
5,AAPL,2022-01-10,172.190002,172.5,168.169998,169.080002,106765600.0,0.020004,0.0,0.020004,2.469756,,,,
6,AAPL,2022-01-11,175.080002,175.179993,170.820007,172.320007,76138300.0,2.889999,0.0,2.889999,34.638407,,,,
7,AAPL,2022-01-12,175.529999,177.179993,174.820007,176.119995,74805200.0,0.449997,0.0,0.449997,38.294976,,,,
8,AAPL,2022-01-13,172.190002,176.619995,171.789993,175.779999,84505800.0,-3.339996,3.339996,0.0,25.890548,,,,
9,AAPL,2022-01-14,173.070007,173.779999,171.089996,171.339996,80440800.0,0.880005,0.0,0.880005,32.534183,,,,


In [36]:
# Calculate the MACD
ema_26 = price_data.groupby('symbol')['close'].transform(lambda x: x.ewm(span = 26).mean())
ema_12 = price_data.groupby('symbol')['close'].transform(lambda x: x.ewm(span = 12).mean())
macd = ema_12 - ema_26

# Calculate the EMA
ema_9_macd = macd.ewm(span = 9).mean()

# Store the data in the data frame.
price_data['MACD'] = macd
price_data['MACD_EMA'] = ema_9_macd

# Print the head.
price_data.head(30)

Unnamed: 0,symbol,datetime,close,high,low,open,volume,change_in_price,down_days,up_days,RSI,low_14,high_14,k_percent,r_percent,MACD,MACD_EMA
0,AAPL,2022-01-03,182.009995,182.880005,177.710007,177.830002,104487900.0,,,,,,,,,0.0,0.0
1,AAPL,2022-01-04,179.699997,182.940002,179.119995,182.630005,99310400.0,-2.309998,2.309998,0.0,0.0,,,,,-0.051827,-0.028793
2,AAPL,2022-01-05,174.919998,180.169998,174.639999,179.610001,94537600.0,-4.779999,4.779999,0.0,0.0,,,,,-0.214265,-0.104806
3,AAPL,2022-01-06,172.0,175.300003,171.639999,172.699997,96904000.0,-2.919998,2.919998,0.0,0.0,,,,,-0.38832,-0.200847
4,AAPL,2022-01-07,172.169998,174.139999,171.029999,172.889999,86709100.0,0.169998,0.0,0.169998,2.180947,,,,,-0.467959,-0.280307
5,AAPL,2022-01-10,172.190002,172.5,168.169998,169.080002,106765600.0,0.020004,0.0,0.020004,2.469756,,,,,-0.506181,-0.341531
6,AAPL,2022-01-11,175.080002,175.179993,170.820007,172.320007,76138300.0,2.889999,0.0,2.889999,34.638407,,,,,-0.391053,-0.354064
7,AAPL,2022-01-12,175.529999,177.179993,174.820007,176.119995,74805200.0,0.449997,0.0,0.449997,38.294976,,,,,-0.286166,-0.337747
8,AAPL,2022-01-13,172.190002,176.619995,171.789993,175.779999,84505800.0,-3.339996,3.339996,0.0,25.890548,,,,,-0.374889,-0.346327
9,AAPL,2022-01-14,173.070007,173.779999,171.089996,171.339996,80440800.0,0.880005,0.0,0.880005,32.534183,,,,,-0.387045,-0.35545


In [37]:
# Calculate the Price Rate of Change
n = 9

# Calculate the Rate of Change in the Price, and store it in the Data Frame.
price_data['Price_Rate_Of_Change'] = price_data.groupby('symbol')['close'].transform(lambda x: x.pct_change(periods = n))

# Print the first 30 rows
price_data.head(30)

Unnamed: 0,symbol,datetime,close,high,low,open,volume,change_in_price,down_days,up_days,RSI,low_14,high_14,k_percent,r_percent,MACD,MACD_EMA,Price_Rate_Of_Change
0,AAPL,2022-01-03,182.009995,182.880005,177.710007,177.830002,104487900.0,,,,,,,,,0.0,0.0,
1,AAPL,2022-01-04,179.699997,182.940002,179.119995,182.630005,99310400.0,-2.309998,2.309998,0.0,0.0,,,,,-0.051827,-0.028793,
2,AAPL,2022-01-05,174.919998,180.169998,174.639999,179.610001,94537600.0,-4.779999,4.779999,0.0,0.0,,,,,-0.214265,-0.104806,
3,AAPL,2022-01-06,172.0,175.300003,171.639999,172.699997,96904000.0,-2.919998,2.919998,0.0,0.0,,,,,-0.38832,-0.200847,
4,AAPL,2022-01-07,172.169998,174.139999,171.029999,172.889999,86709100.0,0.169998,0.0,0.169998,2.180947,,,,,-0.467959,-0.280307,
5,AAPL,2022-01-10,172.190002,172.5,168.169998,169.080002,106765600.0,0.020004,0.0,0.020004,2.469756,,,,,-0.506181,-0.341531,
6,AAPL,2022-01-11,175.080002,175.179993,170.820007,172.320007,76138300.0,2.889999,0.0,2.889999,34.638407,,,,,-0.391053,-0.354064,
7,AAPL,2022-01-12,175.529999,177.179993,174.820007,176.119995,74805200.0,0.449997,0.0,0.449997,38.294976,,,,,-0.286166,-0.337747,
8,AAPL,2022-01-13,172.190002,176.619995,171.789993,175.779999,84505800.0,-3.339996,3.339996,0.0,25.890548,,,,,-0.374889,-0.346327,
9,AAPL,2022-01-14,173.070007,173.779999,171.089996,171.339996,80440800.0,0.880005,0.0,0.880005,32.534183,,,,,-0.387045,-0.35545,-0.049118


In [38]:
def obv(group):

    # Grab the volume and close column.
    volume = group['volume']
    change = group['close'].diff()

    # intialize the previous OBV
    prev_obv = 0
    obv_values = []

    # calculate the On Balance Volume
    for i, j in zip(change, volume):

        if i > 0:
            current_obv = prev_obv + j
        elif i < 0:
            current_obv = prev_obv - j
        else:
            current_obv = prev_obv

        # OBV.append(current_OBV)
        prev_obv = current_obv
        obv_values.append(current_obv)
    
    # Return a panda series.
    return pd.Series(obv_values, index = group.index)
        

# apply the function to each group
obv_groups = price_data.groupby('symbol').apply(obv)

# add to the data frame, but drop the old index, before adding it.
price_data['On Balance Volume'] = obv_groups.reset_index(level=0, drop=True)

# display the data frame.
price_data.head(30)

  obv_groups = price_data.groupby('symbol').apply(obv)


Unnamed: 0,symbol,datetime,close,high,low,open,volume,change_in_price,down_days,up_days,RSI,low_14,high_14,k_percent,r_percent,MACD,MACD_EMA,Price_Rate_Of_Change,On Balance Volume
0,AAPL,2022-01-03,182.009995,182.880005,177.710007,177.830002,104487900.0,,,,,,,,,0.0,0.0,,0.0
1,AAPL,2022-01-04,179.699997,182.940002,179.119995,182.630005,99310400.0,-2.309998,2.309998,0.0,0.0,,,,,-0.051827,-0.028793,,-99310400.0
2,AAPL,2022-01-05,174.919998,180.169998,174.639999,179.610001,94537600.0,-4.779999,4.779999,0.0,0.0,,,,,-0.214265,-0.104806,,-193848000.0
3,AAPL,2022-01-06,172.0,175.300003,171.639999,172.699997,96904000.0,-2.919998,2.919998,0.0,0.0,,,,,-0.38832,-0.200847,,-290752000.0
4,AAPL,2022-01-07,172.169998,174.139999,171.029999,172.889999,86709100.0,0.169998,0.0,0.169998,2.180947,,,,,-0.467959,-0.280307,,-204042900.0
5,AAPL,2022-01-10,172.190002,172.5,168.169998,169.080002,106765600.0,0.020004,0.0,0.020004,2.469756,,,,,-0.506181,-0.341531,,-97277300.0
6,AAPL,2022-01-11,175.080002,175.179993,170.820007,172.320007,76138300.0,2.889999,0.0,2.889999,34.638407,,,,,-0.391053,-0.354064,,-21139000.0
7,AAPL,2022-01-12,175.529999,177.179993,174.820007,176.119995,74805200.0,0.449997,0.0,0.449997,38.294976,,,,,-0.286166,-0.337747,,53666200.0
8,AAPL,2022-01-13,172.190002,176.619995,171.789993,175.779999,84505800.0,-3.339996,3.339996,0.0,25.890548,,,,,-0.374889,-0.346327,,-30839600.0
9,AAPL,2022-01-14,173.070007,173.779999,171.089996,171.339996,80440800.0,0.880005,0.0,0.880005,32.534183,,,,,-0.387045,-0.35545,-0.049118,49601200.0


In [39]:
#  Create a column we wish to predict
'''
    In this case, let's create an output column that will be 1 if the closing price at time 't' is greater than 't-1' and 0 otherwise.
    In other words, if the today's closing price is greater than yesterday's closing price it would be 1.
'''

# Group by the `Symbol` column, then grab the `Close` column.
close_groups = price_data.groupby('symbol')['close']

# Apply the lambda function which will return -1.0 for down, 1.0 for up and 0.0 for no change.
close_groups = close_groups.transform(lambda x : np.sign(x.diff()))

# add the data to the main dataframe.
price_data['Prediction'] = close_groups

# for simplicity in later sections I'm going to make a change to our prediction column. To keep this as a binary classifier I'll change flat days and consider them up days.
price_data.loc[price_data['Prediction'] == 0.0] = 1.0

# print the head
price_data.head(50)

# OPTIONAL CODE: Dump the data frame to a CSV file to examine the data yourself.
# price_data.to_csv('final_metrics.csv')

Unnamed: 0,symbol,datetime,close,high,low,open,volume,change_in_price,down_days,up_days,RSI,low_14,high_14,k_percent,r_percent,MACD,MACD_EMA,Price_Rate_Of_Change,On Balance Volume,Prediction
0,AAPL,2022-01-03,182.009995,182.880005,177.710007,177.830002,104487900.0,,,,,,,,,0.0,0.0,,0.0,
1,AAPL,2022-01-04,179.699997,182.940002,179.119995,182.630005,99310400.0,-2.309998,2.309998,0.0,0.0,,,,,-0.051827,-0.028793,,-99310400.0,-1.0
2,AAPL,2022-01-05,174.919998,180.169998,174.639999,179.610001,94537600.0,-4.779999,4.779999,0.0,0.0,,,,,-0.214265,-0.104806,,-193848000.0,-1.0
3,AAPL,2022-01-06,172.0,175.300003,171.639999,172.699997,96904000.0,-2.919998,2.919998,0.0,0.0,,,,,-0.38832,-0.200847,,-290752000.0,-1.0
4,AAPL,2022-01-07,172.169998,174.139999,171.029999,172.889999,86709100.0,0.169998,0.0,0.169998,2.180947,,,,,-0.467959,-0.280307,,-204042900.0,1.0
5,AAPL,2022-01-10,172.190002,172.5,168.169998,169.080002,106765600.0,0.020004,0.0,0.020004,2.469756,,,,,-0.506181,-0.341531,,-97277300.0,1.0
6,AAPL,2022-01-11,175.080002,175.179993,170.820007,172.320007,76138300.0,2.889999,0.0,2.889999,34.638407,,,,,-0.391053,-0.354064,,-21139000.0,1.0
7,AAPL,2022-01-12,175.529999,177.179993,174.820007,176.119995,74805200.0,0.449997,0.0,0.449997,38.294976,,,,,-0.286166,-0.337747,,53666200.0,1.0
8,AAPL,2022-01-13,172.190002,176.619995,171.789993,175.779999,84505800.0,-3.339996,3.339996,0.0,25.890548,,,,,-0.374889,-0.346327,,-30839600.0,-1.0
9,AAPL,2022-01-14,173.070007,173.779999,171.089996,171.339996,80440800.0,0.880005,0.0,0.880005,32.534183,,,,,-0.387045,-0.35545,-0.049118,49601200.0,1.0


In [40]:
# We need to remove all rows that have an NaN value.
print('Before NaN Drop we have {} rows and {} columns'.format(price_data.shape[0], price_data.shape[1]))

# Any row that has a `NaN` value will be dropped.
price_data = price_data.dropna()

# Display how much we have left now.
print('After NaN Drop we have {} rows and {} columns'.format(price_data.shape[0], price_data.shape[1]))

# Print the head.
price_data.head()

Before NaN Drop we have 8697 rows and 20 columns
After NaN Drop we have 8528 rows and 20 columns


Unnamed: 0,symbol,datetime,close,high,low,open,volume,change_in_price,down_days,up_days,RSI,low_14,high_14,k_percent,r_percent,MACD,MACD_EMA,Price_Rate_Of_Change,On Balance Volume,Prediction
13,AAPL,2022-01-21,162.410004,166.330002,162.300003,164.419998,122848900.0,-2.099991,2.099991,0.0,12.926265,162.300003,182.940002,0.532949,-99.467051,-1.551824,-0.84097,-0.056688,-350439900.0,-1.0
14,AAPL,2022-01-24,161.619995,162.300003,154.699997,160.020004,162294600.0,-0.790009,0.790009,0.0,12.132826,154.699997,182.940002,24.504238,-75.495762,-1.840502,-1.048167,-0.061386,-512734500.0,-1.0
15,AAPL,2022-01-25,159.779999,162.759995,157.020004,158.979996,115798400.0,-1.839996,1.839996,0.0,10.414818,154.699997,180.169998,19.94504,-80.05496,-2.148011,-1.274507,-0.087389,-628532900.0,-1.0
16,AAPL,2022-01-26,159.690002,164.389999,157.820007,163.5,108275300.0,-0.089996,0.089996,0.0,10.33225,154.699997,177.179993,22.197538,-77.802462,-2.361443,-1.496902,-0.090241,-736808200.0,-1.0
17,AAPL,2022-01-27,159.220001,163.839996,158.279999,162.449997,121954600.0,-0.470001,0.470001,0.0,9.861151,154.699997,177.179993,20.106784,-79.893216,-2.52554,-1.706403,-0.075324,-858762800.0,-1.0


In [41]:
# Grab our X & Y Columns.
X_Cols = price_data[['RSI','k_percent','r_percent','Price_Rate_Of_Change','MACD','On Balance Volume']]
Y_Cols = price_data['Prediction']

# Split X and y into X_
X_train, X_test, y_train, y_test = train_test_split(X_Cols, Y_Cols, random_state = 0)

# Create a Random Forest Classifier
rand_frst_clf = RandomForestClassifier(n_estimators = 100, oob_score = True, criterion = "gini", random_state = 0)

# Fit the data to the model
rand_frst_clf.fit(X_train, y_train)

# Make predictions
y_pred = rand_frst_clf.predict(X_test)

In [42]:
print('Correct Prediction (%): ', accuracy_score(y_test, rand_frst_clf.predict(X_test), normalize = True) * 100.0)

Correct Prediction (%):  72.93621013133207


In [43]:
# Define the traget names
target_names = ['Down Day', 'Up Day']

# Build a classifcation report
report = classification_report(y_true = y_test, y_pred = y_pred, target_names = target_names, output_dict = True)

# Add it to a data frame, transpose it for readability.
report_df = pd.DataFrame(report).transpose()
report_df

Unnamed: 0,precision,recall,f1-score,support
Down Day,0.707602,0.723829,0.715623,1003.0
Up Day,0.749548,0.734278,0.741834,1129.0
accuracy,0.729362,0.729362,0.729362,0.729362
macro avg,0.728575,0.729053,0.728729,2132.0
weighted avg,0.729815,0.729362,0.729503,2132.0


In [44]:
feature_imp = pd.Series(rand_frst_clf.feature_importances_, index=X_Cols.columns).sort_values(ascending=False)
feature_imp

r_percent               0.195148
k_percent               0.190083
RSI                     0.188588
Price_Rate_Of_Change    0.152263
MACD                    0.151979
On Balance Volume       0.121940
dtype: float64

In [45]:
# Number of trees in random forest
# Number of trees is not a parameter that should be tuned, but just set large enough usually. There is no risk of overfitting in random forest with growing number of # trees, as they are trained independently from each other. 
n_estimators = list(range(200, 2000, 200))

# Number of features to consider at every split
max_features = ['sqrt', None, 'log2']

# Maximum number of levels in tree
# Max depth is a parameter that most of the times should be set as high as possible, but possibly better performance can be achieved by setting it lower.
max_depth = list(range(10, 110, 10))
max_depth.append(None)

# Minimum number of samples required to split a node
# Higher values prevent a model from learning relations which might be highly specific to the particular sample selected for a tree. Too high values can also lead to # under-fitting hence depending on the level of underfitting or overfitting, you can tune the values for min_samples_split.
min_samples_split = [2, 5, 10, 20, 30, 40]

# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 7, 12, 14, 16 ,20]

# Method of selecting samples for training each tree
bootstrap = [True, False]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

print(random_grid)


{'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800], 'max_features': ['sqrt', None, 'log2'], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None], 'min_samples_split': [2, 5, 10, 20, 30, 40], 'min_samples_leaf': [1, 2, 7, 12, 14, 16, 20], 'bootstrap': [True, False]}


In [46]:
# New Random Forest Classifier to house optimal parameters
rf = RandomForestClassifier()

# Specfiy the details of our Randomized Search
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=0, n_jobs = -1, error_score="raise")

# Fit the random search model
rf_random.fit(X_train, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits
[CV] END bootstrap=False, max_depth=70, max_features=sqrt, min_samples_leaf=20, min_samples_split=40, n_estimators=400; total time=   4.9s
[CV] END bootstrap=False, max_depth=70, max_features=sqrt, min_samples_leaf=20, min_samples_split=40, n_estimators=400; total time=   4.9s
[CV] END bootstrap=False, max_depth=70, max_features=sqrt, min_samples_leaf=20, min_samples_split=40, n_estimators=400; total time=   5.0s
[CV] END bootstrap=True, max_depth=90, max_features=sqrt, min_samples_leaf=7, min_samples_split=40, n_estimators=200; total time=   2.3s
[CV] END bootstrap=True, max_depth=90, max_features=sqrt, min_samples_leaf=7, min_samples_split=40, n_estimators=200; total time=   2.4s
[CV] END bootstrap=True, max_depth=90, max_features=sqrt, min_samples_leaf=7, min_samples_split=40, n_estimators=200; total time=   2.3s
[CV] END bootstrap=False, max_depth=10, max_features=None, min_samples_leaf=16, min_samples_split=2, n_estima

In [72]:
# With the new Random Classifier trained we can proceed to our regular steps, prediction.
rf_random.predict(X_test)

'''
    ACCURACY
'''
# Once the predictions have been made, then grab the accuracy score.
print('Correct Prediction (%): ', accuracy_score(y_test, rf_random.predict(X_test), normalize = True) * 100.0)


'''
    CLASSIFICATION REPORT
'''
# Define the traget names
target_names = ['Down Day', 'Up Day']

# Build a classifcation report
report = classification_report(y_true = y_test, y_pred = y_pred, target_names = target_names, output_dict = True)

# Add it to a data frame, transpose it for readability.
report_df = pd.DataFrame(report).transpose()
display(report_df)
print('\n')

'''
    FEATURE IMPORTANCE
'''
# Calculate feature importance and store in pandas series
feature_imp = pd.Series(rand_frst_clf.feature_importances_, index=X_Cols.columns).sort_values(ascending=False)
display(feature_imp)

Correct Prediction (%):  74.671669793621


Unnamed: 0,precision,recall,f1-score,support
Down Day,0.707602,0.723829,0.715623,1003.0
Up Day,0.749548,0.734278,0.741834,1129.0
accuracy,0.729362,0.729362,0.729362,0.729362
macro avg,0.728575,0.729053,0.728729,2132.0
weighted avg,0.729815,0.729362,0.729503,2132.0






r_percent               0.195148
k_percent               0.190083
RSI                     0.188588
Price_Rate_Of_Change    0.152263
MACD                    0.151979
On Balance Volume       0.121940
dtype: float64

In [48]:
rf_random.best_estimator_

In [69]:
# Predict for the future

cur_data = price_data[['RSI', 'k_percent', 'r_percent', 'Price_Rate_Of_Change', 'MACD', 'On Balance Volume']].tail(1)
preds = rf_random.predict(cur_data)
cert = rf_random.predict_proba(cur_data)[0]  # Assuming binary classification

# Determine the predicted value and its associated probability
if cert[0] > cert[1]:
    val = "Down"
    prob = cert[0]
else:
    val = "Up"
    prob = cert[1]

# Output the result
print(f"Prediction: {val}, Confidence: {prob * 100:.2f}%")



Prediction: Down, Confidence: 57.52%


In [81]:
principle = 1000
time_span = 30

data = price_data[['RSI', 'k_percent', 'r_percent', 'Price_Rate_Of_Change', 'MACD', 'On Balance Volume']].tail(time_span)
ps = rf_random.predict(data)



1.0
-1.0
-1.0
-1.0
1.0
1.0
1.0
1.0
1.0
-1.0
-1.0
1.0
-1.0
1.0
1.0
-1.0
1.0
-1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
-1.0
-1.0
-1.0
-1.0
-1.0
