In [108]:
import os
import sys
import requests

import numpy as np
import pandas as pd
import yfinance as yf
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
# from sklearn.metrics import confusion_matrix # , plot_confusion_matrix
# from sklearn.metrics import plot_roc_curve
from sklearn.metrics import accuracy_score, classification_report

In [109]:

def grab_price_data():
    """
    Grabs the daily price history for five tickers using the yfinance library 
    and stores the data in a CSV file in the specified format.
    
    Format: close,datetime,high,low,open,symbol,volume
    """

    # Define the list of tickers
    tickers_list = ['JPM', 'COST', 'IBM', 'HD', 'ARWR']

    # Store multiple result sets
    full_price_history = []

    # Fetch the data for all tickers
    for ticker in tickers_list:
        # Grab the daily price history for 1 year
        price_history = yf.download(ticker, start='2022-01-01', end='2023-01-01')

        # Add the required data to the full list in the specified format
        for index, row in price_history.iterrows():
            row_data = {
                'close': row['Close'],
                'datetime': index,
                'high': row['High'],
                'low': row['Low'],
                'open': row['Open'],
                'symbol': ticker,
                'volume': row['Volume']
            }
            full_price_history.append(row_data)

    # Convert the list to a DataFrame
    price_data = pd.DataFrame(full_price_history)

    # Reorder the columns
    price_data = price_data[['close', 'datetime', 'high', 'low', 'open', 'symbol', 'volume']]

    # Dump the data to a CSV file, without an index column
    price_data.to_csv('price_data.csv', index=False)

# Example usage
grab_price_data()


[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed


In [110]:
if os.path.exists('price_data.csv'):

    # Load the data
    price_data = pd.read_csv('price_data.csv')

else:

    # Grab the data and store it.
    grab_price_data()

    # Load the data
    price_data = pd.read_csv('price_data.csv')

# Display the head before moving on.
price_data.head()

Unnamed: 0,close,datetime,high,low,open,symbol,volume
0,161.699997,2022-01-03,162.639999,159.509995,159.860001,JPM,13120900.0
1,167.830002,2022-01-04,168.580002,164.229996,164.309998,JPM,20195800.0
2,163.779999,2022-01-05,168.360001,163.729996,167.820007,JPM,17539400.0
3,165.520004,2022-01-06,167.369995,163.869995,166.910004,JPM,14047500.0
4,167.160004,2022-01-07,167.529999,165.059998,165.669998,JPM,13913300.0


In [111]:
# I Just need the Close
price_data = price_data[['symbol','datetime','close','high','low','open','volume']]


'''
    First, for average investors, the return of an asset is a complete and scale–free 
    summary of the investment opportunity. Second, return series are easier to 
    handle than prices series as they have more attractive statistical properties
'''


# sort the values by symbol and then date
price_data.sort_values(by = ['symbol','datetime'], inplace = True)

# calculate the change in price
price_data['change_in_price'] = price_data['close'].diff()


In [112]:
# identify rows where the symbol changes
mask = price_data['symbol'] != price_data['symbol'].shift(1)

# For those rows, let's make the value null
price_data['change_in_price'] = np.where(mask == True, np.nan, price_data['change_in_price'])

# print the rows that have a null value, should only be 5
price_data[price_data.isna().any(axis = 1)]


Unnamed: 0,symbol,datetime,close,high,low,open,volume,change_in_price
1004,ARWR,2022-01-03,69.970001,70.089996,63.720001,66.25,674400.0,
251,COST,2022-01-03,566.710022,567.469971,555.51001,565.030029,2714100.0,
753,HD,2022-01-03,408.640015,417.839996,403.26001,416.570007,3715700.0,
502,IBM,2022-01-03,136.039993,136.289993,133.630005,134.070007,4605900.0,
0,JPM,2022-01-03,161.699997,162.639999,159.509995,159.860001,13120900.0,


In [113]:
# define the number of days out you want to predict
days_out = 30

# Group by symbol, then apply the rolling function and grab the Min and Max.
price_data_smoothed = price_data.groupby(['symbol'])[['close','low','high','open','volume']].transform(lambda x: x.ewm(span = days_out).mean())

# Join the smoothed columns with the symbol and datetime column from the old data frame.
smoothed_df = pd.concat([price_data[['symbol','datetime']], price_data_smoothed], axis=1, sort=False)

smoothed_df

Unnamed: 0,symbol,datetime,close,low,high,open,volume
1004,ARWR,2022-01-03,69.970001,63.720001,70.089996,66.250000,6.744000e+05
1005,ARWR,2022-01-04,67.422834,63.991252,69.955666,67.469334,7.742717e+05
1006,ARWR,2022-01-05,64.845876,62.551619,68.420177,66.359492,8.790275e+05
1007,ARWR,2022-01-06,63.758644,61.305748,66.675983,64.684370,8.044873e+05
1008,ARWR,2022-01-07,62.664561,60.733355,65.668964,63.727774,7.490426e+05
...,...,...,...,...,...,...,...
246,JPM,2022-12-23,131.134909,129.854456,132.354754,131.224043,1.031736e+07
247,JPM,2022-12-27,131.173947,129.899330,132.346060,131.246364,1.000082e+07
248,JPM,2022-12-28,131.256919,129.970986,132.414702,131.285308,9.925132e+06
249,JPM,2022-12-29,131.383569,130.121890,132.469883,131.391417,9.709652e+06


In [114]:
# define the number of days out you want to predict
days_out = 30

# create a new column that will house the flag, and for each group calculate the diff compared to 30 days ago. Then use Numpy to define the sign.
smoothed_df['Signal_Flag'] = smoothed_df.groupby('symbol')['close'].transform(lambda x : np.sign(x.diff(days_out)))

# print the first 50 rows
smoothed_df.head(50)

Unnamed: 0,symbol,datetime,close,low,high,open,volume,Signal_Flag
1004,ARWR,2022-01-03,69.970001,63.720001,70.089996,66.25,674400.0,
1005,ARWR,2022-01-04,67.422834,63.991252,69.955666,67.469334,774271.666667,
1006,ARWR,2022-01-05,64.845876,62.551619,68.420177,66.359492,879027.545354,
1007,ARWR,2022-01-06,63.758644,61.305748,66.675983,64.68437,804487.348317,
1008,ARWR,2022-01-07,62.664561,60.733355,65.668964,63.727774,749042.576659,
1009,ARWR,2022-01-10,62.051334,60.012764,64.497318,62.650265,708107.56262,
1010,ARWR,2022-01-11,61.734593,59.685398,63.838817,62.126161,682664.351296,
1011,ARWR,2022-01-12,61.18619,59.447381,63.434557,61.878663,669391.257028,
1012,ARWR,2022-01-13,60.670675,59.144696,62.792047,61.379948,646677.29105,
1013,ARWR,2022-01-14,60.291477,58.632143,62.284087,60.779478,639681.362301,


In [115]:
# Calculate the 14 day RSI
n = 14

# First make a copy of the data frame twice
up_df, down_df = price_data[['symbol','change_in_price']].copy(), price_data[['symbol','change_in_price']].copy()

# For up days, if the change is less than 0 set to 0.
up_df.loc['change_in_price'] = up_df.loc[(up_df['change_in_price'] < 0), 'change_in_price'] = 0

# For down days, if the change is greater than 0 set to 0.
down_df.loc['change_in_price'] = down_df.loc[(down_df['change_in_price'] > 0), 'change_in_price'] = 0

# We need change in price to be absolute.
down_df['change_in_price'] = down_df['change_in_price'].abs()

# Calculate the EWMA (Exponential Weighted Moving Average), meaning older values are given less weight compared to newer values.
ewma_up = up_df.groupby('symbol')['change_in_price'].transform(lambda x: x.ewm(span = n).mean())
ewma_down = down_df.groupby('symbol')['change_in_price'].transform(lambda x: x.ewm(span = n).mean())

# Calculate the Relative Strength
relative_strength = ewma_up / ewma_down

# Calculate the Relative Strength Index
relative_strength_index = 100.0 - (100.0 / (1.0 + relative_strength))

# Add the info to the data frame.
price_data['down_days'] = down_df['change_in_price']
price_data['up_days'] = up_df['change_in_price']
price_data['RSI'] = relative_strength_index

# Display the head.
price_data.head(30)

Unnamed: 0,symbol,datetime,close,high,low,open,volume,change_in_price,down_days,up_days,RSI
1004,ARWR,2022-01-03,69.970001,70.089996,63.720001,66.25,674400.0,,,,
1005,ARWR,2022-01-04,65.040001,69.830002,64.245003,68.610001,867700.0,-4.93,4.93,0.0,0.0
1006,ARWR,2022-01-05,60.18,65.639999,59.945,64.349998,1068700.0,-4.860001,4.860001,0.0,0.0
1007,ARWR,2022-01-06,60.900002,62.09,58.029999,60.279999,608500.0,0.720001,0.0,0.720001,8.338192
1008,ARWR,2022-01-07,58.950001,62.25,58.790001,60.48,560800.0,-1.950001,1.950001,0.0,6.61463
1009,ARWR,2022-01-10,59.529999,59.68,57.049999,58.220001,539800.0,0.579998,0.0,0.579998,12.800605
1010,ARWR,2022-01-11,60.220001,60.689999,58.119999,59.619999,561000.0,0.690002,0.0,0.690002,20.068663
1011,ARWR,2022-01-12,58.220001,61.248001,58.16,60.540001,597600.0,-2.0,2.0,0.0,15.693833
1012,ARWR,2022-01-13,57.580002,58.939999,57.330002,58.389999,510500.0,-0.639999,0.639999,0.0,14.524741
1013,ARWR,2022-01-14,57.810001,58.959999,55.278,56.849998,593900.0,0.23,0.0,0.23,17.085939


In [116]:
# Calculate the Stochastic Oscillator
n = 14

# Make a copy of the high and low column.
low_14, high_14 = price_data[['symbol','low']].copy(), price_data[['symbol','high']].copy()

# Group by symbol, then apply the rolling function and grab the Min and Max.
low_14 = low_14.groupby('symbol')['low'].transform(lambda x: x.rolling(window = n).min())
high_14 = high_14.groupby('symbol')['high'].transform(lambda x: x.rolling(window = n).max())

# Calculate the Stochastic Oscillator.
k_percent = 100 * ((price_data['close'] - low_14) / (high_14 - low_14))

# Add the info to the data frame.
price_data['low_14'] = low_14
price_data['high_14'] = high_14
price_data['k_percent'] = k_percent

# Display the head.
price_data.head(30)

Unnamed: 0,symbol,datetime,close,high,low,open,volume,change_in_price,down_days,up_days,RSI,low_14,high_14,k_percent
1004,ARWR,2022-01-03,69.970001,70.089996,63.720001,66.25,674400.0,,,,,,,
1005,ARWR,2022-01-04,65.040001,69.830002,64.245003,68.610001,867700.0,-4.93,4.93,0.0,0.0,,,
1006,ARWR,2022-01-05,60.18,65.639999,59.945,64.349998,1068700.0,-4.860001,4.860001,0.0,0.0,,,
1007,ARWR,2022-01-06,60.900002,62.09,58.029999,60.279999,608500.0,0.720001,0.0,0.720001,8.338192,,,
1008,ARWR,2022-01-07,58.950001,62.25,58.790001,60.48,560800.0,-1.950001,1.950001,0.0,6.61463,,,
1009,ARWR,2022-01-10,59.529999,59.68,57.049999,58.220001,539800.0,0.579998,0.0,0.579998,12.800605,,,
1010,ARWR,2022-01-11,60.220001,60.689999,58.119999,59.619999,561000.0,0.690002,0.0,0.690002,20.068663,,,
1011,ARWR,2022-01-12,58.220001,61.248001,58.16,60.540001,597600.0,-2.0,2.0,0.0,15.693833,,,
1012,ARWR,2022-01-13,57.580002,58.939999,57.330002,58.389999,510500.0,-0.639999,0.639999,0.0,14.524741,,,
1013,ARWR,2022-01-14,57.810001,58.959999,55.278,56.849998,593900.0,0.23,0.0,0.23,17.085939,,,


In [117]:
# Calculate the Williams %R
n = 14

# Make a copy of the high and low column.
low_14, high_14 = price_data[['symbol','low']].copy(), price_data[['symbol','high']].copy()

# Group by symbol, then apply the rolling function and grab the Min and Max.
low_14 = low_14.groupby('symbol')['low'].transform(lambda x: x.rolling(window = n).min())
high_14 = high_14.groupby('symbol')['high'].transform(lambda x: x.rolling(window = n).max())

# Calculate William %R indicator.
r_percent = ((high_14 - price_data['close']) / (high_14 - low_14)) * - 100

# Add the info to the data frame.
price_data['r_percent'] = r_percent

# Display the head.
price_data.head(30)

Unnamed: 0,symbol,datetime,close,high,low,open,volume,change_in_price,down_days,up_days,RSI,low_14,high_14,k_percent,r_percent
1004,ARWR,2022-01-03,69.970001,70.089996,63.720001,66.25,674400.0,,,,,,,,
1005,ARWR,2022-01-04,65.040001,69.830002,64.245003,68.610001,867700.0,-4.93,4.93,0.0,0.0,,,,
1006,ARWR,2022-01-05,60.18,65.639999,59.945,64.349998,1068700.0,-4.860001,4.860001,0.0,0.0,,,,
1007,ARWR,2022-01-06,60.900002,62.09,58.029999,60.279999,608500.0,0.720001,0.0,0.720001,8.338192,,,,
1008,ARWR,2022-01-07,58.950001,62.25,58.790001,60.48,560800.0,-1.950001,1.950001,0.0,6.61463,,,,
1009,ARWR,2022-01-10,59.529999,59.68,57.049999,58.220001,539800.0,0.579998,0.0,0.579998,12.800605,,,,
1010,ARWR,2022-01-11,60.220001,60.689999,58.119999,59.619999,561000.0,0.690002,0.0,0.690002,20.068663,,,,
1011,ARWR,2022-01-12,58.220001,61.248001,58.16,60.540001,597600.0,-2.0,2.0,0.0,15.693833,,,,
1012,ARWR,2022-01-13,57.580002,58.939999,57.330002,58.389999,510500.0,-0.639999,0.639999,0.0,14.524741,,,,
1013,ARWR,2022-01-14,57.810001,58.959999,55.278,56.849998,593900.0,0.23,0.0,0.23,17.085939,,,,


In [118]:
# Calculate the MACD
ema_26 = price_data.groupby('symbol')['close'].transform(lambda x: x.ewm(span = 26).mean())
ema_12 = price_data.groupby('symbol')['close'].transform(lambda x: x.ewm(span = 12).mean())
macd = ema_12 - ema_26

# Calculate the EMA
ema_9_macd = macd.ewm(span = 9).mean()

# Store the data in the data frame.
price_data['MACD'] = macd
price_data['MACD_EMA'] = ema_9_macd

# Print the head.
price_data.head(30)

Unnamed: 0,symbol,datetime,close,high,low,open,volume,change_in_price,down_days,up_days,RSI,low_14,high_14,k_percent,r_percent,MACD,MACD_EMA
1004,ARWR,2022-01-03,69.970001,70.089996,63.720001,66.25,674400.0,,,,,,,,,0.0,0.0
1005,ARWR,2022-01-04,65.040001,69.830002,64.245003,68.610001,867700.0,-4.93,4.93,0.0,0.0,,,,,-0.110609,-0.061449
1006,ARWR,2022-01-05,60.18,65.639999,59.945,64.349998,1068700.0,-4.860001,4.860001,0.0,0.0,,,,,-0.291646,-0.155792
1007,ARWR,2022-01-06,60.900002,62.09,58.029999,60.279999,608500.0,0.720001,0.0,0.720001,8.338192,,,,,-0.340842,-0.218479
1008,ARWR,2022-01-07,58.950001,62.25,58.790001,60.48,560800.0,-1.950001,1.950001,0.0,6.61463,,,,,-0.437994,-0.28378
1009,ARWR,2022-01-10,59.529999,59.68,57.049999,58.220001,539800.0,0.579998,0.0,0.579998,12.800605,,,,,-0.463362,-0.332457
1010,ARWR,2022-01-11,60.220001,60.689999,58.119999,59.619999,561000.0,0.690002,0.0,0.690002,20.068663,,,,,-0.440391,-0.359772
1011,ARWR,2022-01-12,58.220001,61.248001,58.16,60.540001,597600.0,-2.0,2.0,0.0,15.693833,,,,,-0.513027,-0.396602
1012,ARWR,2022-01-13,57.580002,58.939999,57.330002,58.389999,510500.0,-0.639999,0.639999,0.0,14.524741,,,,,-0.587273,-0.440648
1013,ARWR,2022-01-14,57.810001,58.959999,55.278,56.849998,593900.0,0.23,0.0,0.23,17.085939,,,,,-0.61955,-0.480732


In [119]:
# Calculate the Price Rate of Change
n = 9

# Calculate the Rate of Change in the Price, and store it in the Data Frame.
price_data['Price_Rate_Of_Change'] = price_data.groupby('symbol')['close'].transform(lambda x: x.pct_change(periods = n))

# Print the first 30 rows
price_data.head(30)

Unnamed: 0,symbol,datetime,close,high,low,open,volume,change_in_price,down_days,up_days,RSI,low_14,high_14,k_percent,r_percent,MACD,MACD_EMA,Price_Rate_Of_Change
1004,ARWR,2022-01-03,69.970001,70.089996,63.720001,66.25,674400.0,,,,,,,,,0.0,0.0,
1005,ARWR,2022-01-04,65.040001,69.830002,64.245003,68.610001,867700.0,-4.93,4.93,0.0,0.0,,,,,-0.110609,-0.061449,
1006,ARWR,2022-01-05,60.18,65.639999,59.945,64.349998,1068700.0,-4.860001,4.860001,0.0,0.0,,,,,-0.291646,-0.155792,
1007,ARWR,2022-01-06,60.900002,62.09,58.029999,60.279999,608500.0,0.720001,0.0,0.720001,8.338192,,,,,-0.340842,-0.218479,
1008,ARWR,2022-01-07,58.950001,62.25,58.790001,60.48,560800.0,-1.950001,1.950001,0.0,6.61463,,,,,-0.437994,-0.28378,
1009,ARWR,2022-01-10,59.529999,59.68,57.049999,58.220001,539800.0,0.579998,0.0,0.579998,12.800605,,,,,-0.463362,-0.332457,
1010,ARWR,2022-01-11,60.220001,60.689999,58.119999,59.619999,561000.0,0.690002,0.0,0.690002,20.068663,,,,,-0.440391,-0.359772,
1011,ARWR,2022-01-12,58.220001,61.248001,58.16,60.540001,597600.0,-2.0,2.0,0.0,15.693833,,,,,-0.513027,-0.396602,
1012,ARWR,2022-01-13,57.580002,58.939999,57.330002,58.389999,510500.0,-0.639999,0.639999,0.0,14.524741,,,,,-0.587273,-0.440648,
1013,ARWR,2022-01-14,57.810001,58.959999,55.278,56.849998,593900.0,0.23,0.0,0.23,17.085939,,,,,-0.61955,-0.480732,-0.173789


In [120]:
def obv(group):

    # Grab the volume and close column.
    volume = group['volume']
    change = group['close'].diff()

    # intialize the previous OBV
    prev_obv = 0
    obv_values = []

    # calculate the On Balance Volume
    for i, j in zip(change, volume):

        if i > 0:
            current_obv = prev_obv + j
        elif i < 0:
            current_obv = prev_obv - j
        else:
            current_obv = prev_obv

        # OBV.append(current_OBV)
        prev_obv = current_obv
        obv_values.append(current_obv)
    
    # Return a panda series.
    return pd.Series(obv_values, index = group.index)
        

# apply the function to each group
obv_groups = price_data.groupby('symbol').apply(obv)

# add to the data frame, but drop the old index, before adding it.
price_data['On Balance Volume'] = obv_groups.reset_index(level=0, drop=True)

# display the data frame.
price_data.head(30)

  obv_groups = price_data.groupby('symbol').apply(obv)


Unnamed: 0,symbol,datetime,close,high,low,open,volume,change_in_price,down_days,up_days,RSI,low_14,high_14,k_percent,r_percent,MACD,MACD_EMA,Price_Rate_Of_Change,On Balance Volume
1004,ARWR,2022-01-03,69.970001,70.089996,63.720001,66.25,674400.0,,,,,,,,,0.0,0.0,,0.0
1005,ARWR,2022-01-04,65.040001,69.830002,64.245003,68.610001,867700.0,-4.93,4.93,0.0,0.0,,,,,-0.110609,-0.061449,,-867700.0
1006,ARWR,2022-01-05,60.18,65.639999,59.945,64.349998,1068700.0,-4.860001,4.860001,0.0,0.0,,,,,-0.291646,-0.155792,,-1936400.0
1007,ARWR,2022-01-06,60.900002,62.09,58.029999,60.279999,608500.0,0.720001,0.0,0.720001,8.338192,,,,,-0.340842,-0.218479,,-1327900.0
1008,ARWR,2022-01-07,58.950001,62.25,58.790001,60.48,560800.0,-1.950001,1.950001,0.0,6.61463,,,,,-0.437994,-0.28378,,-1888700.0
1009,ARWR,2022-01-10,59.529999,59.68,57.049999,58.220001,539800.0,0.579998,0.0,0.579998,12.800605,,,,,-0.463362,-0.332457,,-1348900.0
1010,ARWR,2022-01-11,60.220001,60.689999,58.119999,59.619999,561000.0,0.690002,0.0,0.690002,20.068663,,,,,-0.440391,-0.359772,,-787900.0
1011,ARWR,2022-01-12,58.220001,61.248001,58.16,60.540001,597600.0,-2.0,2.0,0.0,15.693833,,,,,-0.513027,-0.396602,,-1385500.0
1012,ARWR,2022-01-13,57.580002,58.939999,57.330002,58.389999,510500.0,-0.639999,0.639999,0.0,14.524741,,,,,-0.587273,-0.440648,,-1896000.0
1013,ARWR,2022-01-14,57.810001,58.959999,55.278,56.849998,593900.0,0.23,0.0,0.23,17.085939,,,,,-0.61955,-0.480732,-0.173789,-1302100.0


In [121]:
#  Create a column we wish to predict
'''
    In this case, let's create an output column that will be 1 if the closing price at time 't' is greater than 't-1' and 0 otherwise.
    In other words, if the today's closing price is greater than yesterday's closing price it would be 1.
'''

# Group by the `Symbol` column, then grab the `Close` column.
close_groups = price_data.groupby('symbol')['close']

# Apply the lambda function which will return -1.0 for down, 1.0 for up and 0.0 for no change.
close_groups = close_groups.transform(lambda x : np.sign(x.diff()))

# add the data to the main dataframe.
price_data['Prediction'] = close_groups

# for simplicity in later sections I'm going to make a change to our prediction column. To keep this as a binary classifier I'll change flat days and consider them up days.
price_data.loc[price_data['Prediction'] == 0.0] = 1.0

# print the head
price_data.head(50)

# OPTIONAL CODE: Dump the data frame to a CSV file to examine the data yourself.
# price_data.to_csv('final_metrics.csv')

Unnamed: 0,symbol,datetime,close,high,low,open,volume,change_in_price,down_days,up_days,RSI,low_14,high_14,k_percent,r_percent,MACD,MACD_EMA,Price_Rate_Of_Change,On Balance Volume,Prediction
1004,ARWR,2022-01-03,69.970001,70.089996,63.720001,66.25,674400.0,,,,,,,,,0.0,0.0,,0.0,
1005,ARWR,2022-01-04,65.040001,69.830002,64.245003,68.610001,867700.0,-4.93,4.93,0.0,0.0,,,,,-0.110609,-0.061449,,-867700.0,-1.0
1006,ARWR,2022-01-05,60.18,65.639999,59.945,64.349998,1068700.0,-4.860001,4.860001,0.0,0.0,,,,,-0.291646,-0.155792,,-1936400.0,-1.0
1007,ARWR,2022-01-06,60.900002,62.09,58.029999,60.279999,608500.0,0.720001,0.0,0.720001,8.338192,,,,,-0.340842,-0.218479,,-1327900.0,1.0
1008,ARWR,2022-01-07,58.950001,62.25,58.790001,60.48,560800.0,-1.950001,1.950001,0.0,6.61463,,,,,-0.437994,-0.28378,,-1888700.0,-1.0
1009,ARWR,2022-01-10,59.529999,59.68,57.049999,58.220001,539800.0,0.579998,0.0,0.579998,12.800605,,,,,-0.463362,-0.332457,,-1348900.0,1.0
1010,ARWR,2022-01-11,60.220001,60.689999,58.119999,59.619999,561000.0,0.690002,0.0,0.690002,20.068663,,,,,-0.440391,-0.359772,,-787900.0,1.0
1011,ARWR,2022-01-12,58.220001,61.248001,58.16,60.540001,597600.0,-2.0,2.0,0.0,15.693833,,,,,-0.513027,-0.396602,,-1385500.0,-1.0
1012,ARWR,2022-01-13,57.580002,58.939999,57.330002,58.389999,510500.0,-0.639999,0.639999,0.0,14.524741,,,,,-0.587273,-0.440648,,-1896000.0,-1.0
1013,ARWR,2022-01-14,57.810001,58.959999,55.278,56.849998,593900.0,0.23,0.0,0.23,17.085939,,,,,-0.61955,-0.480732,-0.173789,-1302100.0,1.0


In [122]:
# We need to remove all rows that have an NaN value.
print('Before NaN Drop we have {} rows and {} columns'.format(price_data.shape[0], price_data.shape[1]))

# Any row that has a `NaN` value will be dropped.
price_data = price_data.dropna()

# Display how much we have left now.
print('After NaN Drop we have {} rows and {} columns'.format(price_data.shape[0], price_data.shape[1]))

# Print the head.
price_data.head()

Before NaN Drop we have 1255 rows and 20 columns
After NaN Drop we have 1190 rows and 20 columns


Unnamed: 0,symbol,datetime,close,high,low,open,volume,change_in_price,down_days,up_days,RSI,low_14,high_14,k_percent,r_percent,MACD,MACD_EMA,Price_Rate_Of_Change,On Balance Volume,Prediction
1017,ARWR,2022-01-21,50.84,53.490002,50.82,51.619999,766400.0,-1.669998,1.669998,0.0,7.580371,50.82,70.089996,0.103791,-99.896209,-1.386952,-0.891782,-0.137574,-4137100.0,-1.0
1018,ARWR,2022-01-24,52.790001,53.32,47.292,49.310001,1130500.0,1.950001,0.0,1.950001,24.893617,47.292,69.830002,24.394359,-75.605641,-1.416202,-1.000491,-0.11322,-3006600.0,1.0
1019,ARWR,2022-01-25,50.990002,52.52,49.34,51.740002,667600.0,-1.799999,1.799999,0.0,20.752876,47.292,65.639999,20.154796,-79.845204,-1.529766,-1.109412,-0.153271,-3674200.0,-1.0
1020,ARWR,2022-01-26,50.529999,54.380001,50.259998,52.07,843300.0,-0.460003,0.460003,0.0,19.78257,47.292,62.25,21.647272,-78.352728,-1.625857,-1.21508,-0.132085,-4517500.0,-1.0
1021,ARWR,2022-01-27,48.200001,51.560001,48.07,51.200001,585800.0,-2.329998,2.329998,0.0,15.536964,47.292,62.25,6.070337,-93.929663,-1.826394,-1.339586,-0.162904,-5103300.0,-1.0


In [123]:
# Grab our X & Y Columns.
X_Cols = price_data[['RSI','k_percent','r_percent','Price_Rate_Of_Change','MACD','On Balance Volume']]
Y_Cols = price_data['Prediction']

# Split X and y into X_
X_train, X_test, y_train, y_test = train_test_split(X_Cols, Y_Cols, random_state = 0)

# Create a Random Forest Classifier
rand_frst_clf = RandomForestClassifier(n_estimators = 100, oob_score = True, criterion = "gini", random_state = 0)

# Fit the data to the model
rand_frst_clf.fit(X_train, y_train)

# Make predictions
y_pred = rand_frst_clf.predict(X_test)

In [124]:
print('Correct Prediction (%): ', accuracy_score(y_test, rand_frst_clf.predict(X_test), normalize = True) * 100.0)

Correct Prediction (%):  70.80536912751678


In [125]:
# Define the traget names
target_names = ['Down Day', 'Up Day']

# Build a classifcation report
report = classification_report(y_true = y_test, y_pred = y_pred, target_names = target_names, output_dict = True)

# Add it to a data frame, transpose it for readability.
report_df = pd.DataFrame(report).transpose()
report_df

Unnamed: 0,precision,recall,f1-score,support
Down Day,0.710345,0.695946,0.703072,148.0
Up Day,0.705882,0.72,0.712871,150.0
accuracy,0.708054,0.708054,0.708054,0.708054
macro avg,0.708114,0.707973,0.707971,298.0
weighted avg,0.708099,0.708054,0.708004,298.0


In [126]:
feature_imp = pd.Series(rand_frst_clf.feature_importances_, index=X_Cols.columns).sort_values(ascending=False)
feature_imp

k_percent               0.219495
r_percent               0.211106
RSI                     0.169626
Price_Rate_Of_Change    0.149134
MACD                    0.140463
On Balance Volume       0.110176
dtype: float64

In [127]:
# Number of trees in random forest
# Number of trees is not a parameter that should be tuned, but just set large enough usually. There is no risk of overfitting in random forest with growing number of # trees, as they are trained independently from each other. 
n_estimators = list(range(200, 2000, 200))

# Number of features to consider at every split
max_features = ['sqrt', None, 'log2']

# Maximum number of levels in tree
# Max depth is a parameter that most of the times should be set as high as possible, but possibly better performance can be achieved by setting it lower.
max_depth = list(range(10, 110, 10))
max_depth.append(None)

# Minimum number of samples required to split a node
# Higher values prevent a model from learning relations which might be highly specific to the particular sample selected for a tree. Too high values can also lead to # under-fitting hence depending on the level of underfitting or overfitting, you can tune the values for min_samples_split.
min_samples_split = [2, 5, 10, 20, 30, 40]

# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 7, 12, 14, 16 ,20]

# Method of selecting samples for training each tree
bootstrap = [True, False]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

print(random_grid)


{'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800], 'max_features': ['sqrt', None, 'log2'], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None], 'min_samples_split': [2, 5, 10, 20, 30, 40], 'min_samples_leaf': [1, 2, 7, 12, 14, 16, 20], 'bootstrap': [True, False]}


In [128]:
# New Random Forest Classifier to house optimal parameters
rf = RandomForestClassifier()

# Specfiy the details of our Randomized Search
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=0, n_jobs = -1, error_score="raise")

# Fit the random search model
rf_random.fit(X_train, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits
[CV] END bootstrap=False, max_depth=70, max_features=sqrt, min_samples_leaf=20, min_samples_split=40, n_estimators=400; total time=   0.9s
[CV] END bootstrap=False, max_depth=70, max_features=sqrt, min_samples_leaf=20, min_samples_split=40, n_estimators=400; total time=   0.9s
[CV] END bootstrap=False, max_depth=70, max_features=sqrt, min_samples_leaf=20, min_samples_split=40, n_estimators=400; total time=   0.9s
[CV] END bootstrap=True, max_depth=90, max_features=sqrt, min_samples_leaf=7, min_samples_split=40, n_estimators=200; total time=   0.5s
[CV] END bootstrap=False, max_depth=10, max_features=None, min_samples_leaf=16, min_samples_split=2, n_estimators=400; total time=   1.5s
[CV] END bootstrap=False, max_depth=10, max_features=None, min_samples_leaf=16, min_samples_split=2, n_estimators=400; total time=   1.6s
[CV] END bootstrap=False, max_depth=10, max_features=None, min_samples_leaf=16, min_samples_split=2, n_esti

In [131]:
# With the new Random Classifier trained we can proceed to our regular steps, prediction.
rf_random.predict(X_test)


'''
    ACCURACY
'''
# Once the predictions have been made, then grab the accuracy score.
print('Correct Prediction (%): ', accuracy_score(y_test, rf_random.predict(X_test), normalize = True) * 100.0)


'''
    CLASSIFICATION REPORT
'''
# Define the traget names
target_names = ['Down Day', 'Up Day']

# Build a classifcation report
report = classification_report(y_true = y_test, y_pred = y_pred, target_names = target_names, output_dict = True)

# Add it to a data frame, transpose it for readability.
report_df = pd.DataFrame(report).transpose()
display(report_df)
print('\n')

'''
    FEATURE IMPORTANCE
'''
# Calculate feature importance and store in pandas series
feature_imp = pd.Series(rand_frst_clf.feature_importances_, index=X_Cols.columns).sort_values(ascending=False)
display(feature_imp)

Correct Prediction (%):  73.15436241610739


Unnamed: 0,precision,recall,f1-score,support
Down Day,0.710345,0.695946,0.703072,148.0
Up Day,0.705882,0.72,0.712871,150.0
accuracy,0.708054,0.708054,0.708054,0.708054
macro avg,0.708114,0.707973,0.707971,298.0
weighted avg,0.708099,0.708054,0.708004,298.0






k_percent               0.219495
r_percent               0.211106
RSI                     0.169626
Price_Rate_Of_Change    0.149134
MACD                    0.140463
On Balance Volume       0.110176
dtype: float64

In [130]:
rf_random.best_estimator_