In [None]:
# Package imports 
from collections import Counter, deque
import datetime
import numpy as np 
import os
import pandas as pd
from pandas_datareader import data as pdr
import pickle
import random
import requests
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM, BatchNormalization
from tensorflow.keras.callbacks import TensorBoard, ModelCheckpoint
import time
   
hm_units = 10 # How many candles into the future the model will predict
req = 0.002 # Fractional price increase (0.002 = 0.2%) needed 'hm_units' candles ahead to count as a 'buy'
SEQ_LEN = 60 # How many preceding candles will be included in each data point
BATCH_SIZE = 128 # Initial size of chunks to be fed through Neural Network
EPOCHS = 30 # Initial number of epochs the network will run through
CORRELATION_COEFFICIENT = 0.92 # (Not used for this model)

NAME = f'{SEQ_LEN}-SEQ-{hm_units}-PRED-{int(time.time())}' # Naming convention for model names

#data_split_date = '2021-07-22 13:30:00' # Date defining the split between training and testing data

# API key used to access data from the source. Set the POLYGON_API_KEY environment
# variable to supply your own key without editing the notebook.
# NOTE(review): the fallback literal is a credential committed to the notebook; it is
# kept only so existing runs keep working. Rotate it and remove the fallback.
api = os.environ.get('POLYGON_API_KEY', 'gDvQNVWDC2mhOlN1Z9if6JEyDM08CpeC')

In [None]:
#This function determines whether the model should buy or sell shares of a company. 
#One of two values will be returned, either a 0, or a 1
#If a '1' is returned, it means that the model predicts that the price will increase by 'req' % 
#in 'hm_units' candles, and that the price will NOT fall by -'req' % before hitting the 
#'hm_units' candle. If a '0' is returned, it simply means that neither of the requirements are met, 
#and the model should not buy shares of the company

#The function takes three parameters: the current price, the future price (after 'hm_units' candles),
#and the list of future LOW prices that immediately follow the current price. This 'future_lows' list will be exactly 'hm_units' long.
def classify(current_price, future_price, future_lows, requirement = None):
    """Label a candle as a buy (1) or not (0).

    A candle is a buy when the price 'hm_units' candles ahead has risen by at least
    `requirement` (a fraction of `current_price`) AND none of the intermediate lows
    has fallen by `requirement` or more, i.e. the stop loss was never hit.

    Parameters:
        current_price: price of the candle being labelled.
        future_price: price at the end of the prediction horizon.
        future_lows: LOW prices of the candles that follow the current one.
        requirement: fractional move that counts as significant; defaults to the
            module-level `req` when omitted.

    Returns:
        1 for a buy, otherwise 0.
    """
    #Sets the threshold value for when a price movement is considered to be significant
    if requirement is None:
        requirement = req

    #Fractional change between 'current_price' and 'future_price'
    future_change = (future_price - current_price) / current_price

    #Without a significant increase at the horizon there is nothing to buy
    if future_change < requirement:
        return 0

    #The stop loss is expressed as a fractional change, so each low has to be converted to a
    #change relative to 'current_price' before comparing. (Comparing the raw low prices against
    #-requirement could never trigger, because prices are positive.)
    for price in future_lows:
        if (price - current_price) / current_price <= -requirement:
            return 0

    return 1
    

In [None]:
#Calculates the ATR of a specific stock at any given time (for minute data)
def Minute_ATR_Column(ticker):
    """Return the 14-candle Average True Range of `ticker`'s stored minute data.

    Reads 'MINUTE_STOCK_DATA//{ticker}.csv' and needs the '{ticker}_High',
    '{ticker}_Low' and '{ticker}_Close' columns.

    The true range of a candle is the largest of:
        high - low, |high - previous close|, |low - previous close|.
    The first candle has no previous close, so its true range is simply high - low.

    Returns:
        A Series named 'ATRS', indexed like the CSV rows. Each value is the sum of the
        last (up to) 14 true ranges divided by 14, so the first 13 values are scaled
        down rather than being averaged over fewer candles.
    """
    df = pd.read_csv(f'MINUTE_STOCK_DATA//{ticker}.csv')

    high = df[f'{ticker}_High']
    low = df[f'{ticker}_Low']
    previous_close = df[f'{ticker}_Close'].shift(1)

    #Row-wise maximum of the three candidate ranges. max() skips the NaN produced by the
    #shift on the first row, which leaves high - low there.
    true_ranges = pd.concat(
        [high - low, (high - previous_close).abs(), (low - previous_close).abs()],
        axis = 1,
    ).max(axis = 1)

    #Converts each 'true range' into an 'average true range' of the last 14 candles (ATR)
    atr = true_ranges.rolling(window = 14, min_periods = 0).sum().div(14)
    atr.name = 'ATRS'
    return atr
       

In [None]:
#Creates the moving average columns for the minute data. Includes the 9MA, 50MA, 100MA, 200MA
#The 'daily' and 'minute' parameters determine which time frame is used to calculate the MA's
def moving_averages_column(ticker, daily = False, minute = False):
    """Return the 9, 50, 100 and 200 period moving averages of `ticker`'s close price.

    `daily` selects the stored daily data and `minute` the stored minute data. When both
    flags are set the daily averages are returned; when neither is set the result is None.

    Returns:
        A tuple of four Series (9MA, 50MA, 100MA, 200MA), each named after its time
        frame and window, or None if no time frame was requested.
    """
    #Daily is checked first, so it wins when both flags are True
    if daily:
        csv_path = f'DAILY_STOCK_DATA//{ticker}.csv'
        name_prefix = f'{ticker}_Daily_MA_'
    elif minute:
        csv_path = f'MINUTE_STOCK_DATA//{ticker}.csv'
        name_prefix = f'{ticker}_Minute_MA_'
    else:
        return None

    closes = pd.read_csv(csv_path)[f'{ticker}_Close']

    #min_periods = 0 lets the first rows average over however many candles exist so far
    averages = []
    for window in (9, 50, 100, 200):
        rolling_mean = closes.rolling(window = window, min_periods = 0).mean()
        averages.append(rolling_mean.rename(f'{name_prefix}{window}'))

    return tuple(averages)
    
#moving_averages_column('AAPL')

In [None]:
#This function will get the daily and minute data for a specific ticker, and it will save it to a CSV to be accessed later.
#The desired ticker must be passed as an argument, as well as the start/end date of the data (string). 
#The 'ignore_after_hours' parameter (boolean) will dictate whether the dataset only contains active hours data, 
#or if it includes pre/post market. If the pre/post market data is also taken, the user must account for 
#a high volume of zero volume periods, since the stocks are traded less frequently during these times. 
#The 'include_minute' and 'include_daily' parameters are booleans which dictate which type of data will be acquired:
#daily data or minute data. Both are initially set to True

#Dates must be in format yyyy-mm-dd
def _polygon_aggs_url(ticker, timespan, first_day, last_day, sort = 'asc'):
    """Build the Polygon aggregates URL for 1-`timespan` candles between two yyyy-mm-dd days."""
    return f'https://api.polygon.io/v2/aggs/ticker/{ticker}/range/1/{timespan}/{first_day}/{last_day}?adjusted=true&sort={sort}&limit=50000&apiKey={api}'


def _polygon_aggs_frame(url, ticker):
    """Download one page of candles from `url` and rename its columns for `ticker`."""
    payload = requests.get(url).json()

    #A payload without 'results' (for example an API error message) used to surface as a
    #bare KeyError: 'results'. Keep the exception type but say what actually came back.
    if 'results' not in payload:
        raise KeyError(f"No 'results' in the response for {ticker}: {payload}")

    candles = pd.DataFrame(payload['results'])
    candles.reset_index(inplace = True)

    #'vw' and 'n' are never used downstream; tolerate responses that omit them
    candles.drop(['vw', 'n'], axis = 1, inplace = True, errors = 'ignore')

    candles['t'] = pd.to_datetime(candles['t'], unit = 'ms')
    candles.rename(columns = {'c': f'{ticker}_Close',
                              'o': f'{ticker}_Open',
                              'h': f'{ticker}_High',
                              'l': f'{ticker}_Low',
                              'v': f'{ticker}_Volume',
                              't': 'Date'}, inplace = True)
    return candles


#Dates must be in format yyyy-mm-dd
def get_ticker_data(ticker, start_date, end_date, ignore_after_hours = True, include_minute = True, include_daily = True):
    """Download minute and/or daily candles for `ticker` between two dates.

    Parameters:
        ticker: symbol to download.
        start_date, end_date: strings in yyyy-mm-dd format.
        ignore_after_hours: when True, minute candles before 13:30 or from 20:00 onwards
            (clock time of the epoch-derived timestamps) are dropped.
            NOTE(review): the window is fixed, so it does not follow daylight-saving
            changes of the exchange — confirm this is intended.
        include_minute: fetch minute candles.
        include_daily: fetch daily candles.

    Returns:
        (minute_df, daily_df) when both are requested, the single requested frame when
        only one is, and None (after printing a message) when neither is requested.

    Raises:
        KeyError: if a response carries no 'results' entry.
    """
    print(f'Getting {ticker} data')

    #'asc' (ascending) puts the oldest dates first. DO NOT CHANGE: the chunk stitching below
    #relies on it.
    sort = 'asc' #or desc

    #How many days the 'date_tracker' is advanced by for every additional chunk of minute data
    increment = 60

    #First and last candle time of the retained trading window, in minutes since midnight
    session_open = 13 * 60 + 30
    session_close = 20 * 60

    start = start_date
    end = end_date

    #Datetime versions of the requested range, used to decide when to stop requesting chunks
    end_date_time = datetime.datetime(int(end[:4]), int(end[5:7]), int(end[8:10]))
    date_tracker = datetime.datetime(int(start[:4]), int(start[5:7]), int(start[8:10]))

    if include_minute:
        chunks = [_polygon_aggs_frame(_polygon_aggs_url(ticker, 'minute', str(date_tracker)[:10], end, sort), ticker)]

        #Each call is limited to 50000 candles, so further chunks are requested starting every
        #'increment' days. The chunks overlap; the duplicates are removed below. The loop stops
        #'increment' days before the end date so that no chunk starts beyond the requested range.
        while date_tracker < (end_date_time - datetime.timedelta(days = increment)):
            date_tracker += datetime.timedelta(days = increment)
            chunks.append(_polygon_aggs_frame(_polygon_aggs_url(ticker, 'minute', str(date_tracker)[:10], end, sort), ticker))

        #Stitch the chunks together by column NAME, so a chunk whose columns arrive in a
        #different order cannot be mixed into the wrong feature
        minutes = pd.concat(chunks, ignore_index = True)

        if ignore_after_hours:
            #Vectorised replacement for deleting pre/post market rows one at a time
            minute_of_day = minutes['Date'].dt.hour * 60 + minutes['Date'].dt.minute
            minutes = minutes[(minute_of_day >= session_open) & (minute_of_day < session_close)]

        #Creates the DataFrame containing the minute data
        minute_data_df = minutes[[f'{ticker}_Volume',
                                  f'{ticker}_High',
                                  f'{ticker}_Low',
                                  f'{ticker}_Open',
                                  f'{ticker}_Close',
                                  'Date']].reset_index(drop = True)

        minute_data_df.drop_duplicates(inplace = True)

        #minute_data_df.to_csv(f'MINUTE_STOCK_DATA//{ticker}.csv')

    if include_daily:
        daily_data_df = _polygon_aggs_frame(_polygon_aggs_url(ticker, 'day', start, end, sort), ticker)

        #daily_data.to_csv(f'DAILY_STOCK_DATA//{ticker}.csv')

    #Returns the proper DataFrames depending on which (or both) of the flags are True
    if include_minute and include_daily:
        return minute_data_df, daily_data_df
    elif include_minute:
        return minute_data_df
    elif include_daily:
        return daily_data_df
    #If neither flag is True, prints an error statement (and implicitly returns None)
    else:
        print("No data type specified")


In [None]:
def update_ticker_data(ticker, end_date, ignore_after_hours = True, include_minute = True, include_daily = True):
    """Extend the stored minute data of `ticker` up to `end_date`.

    Reads 'MINUTE_STOCK_DATA//{ticker}.csv', downloads the minute candles from the day of
    its last row up to `end_date` (yyyy-mm-dd), appends them and removes duplicate rows.
    Nothing is written to disk; the caller decides what to do with the result.

    Parameters:
        ticker: symbol to update.
        end_date: last day to download, as a yyyy-mm-dd string.
        ignore_after_hours: when True, downloaded candles before 13:30 or from 20:00
            onwards are dropped (same fixed window as the initial download).
        include_minute: update the minute data.
        include_daily: daily updates are not implemented; see Raises.

    Returns:
        The combined minute DataFrame, or None when `include_minute` is False.

    Raises:
        NotImplementedError: if `include_daily` is True. The daily frame was never
            built here, so this path previously ended in a NameError after all the
            minute work had already been done.
        KeyError: if the response carries no 'results' entry.
    """
    if not include_minute and not include_daily:
        print("No time window specified")

    if include_daily:
        raise NotImplementedError('update_ticker_data cannot update daily data yet; call it with include_daily = False')

    if include_minute:
        stored = pd.read_csv(f'MINUTE_STOCK_DATA//{ticker}.csv')

        #The last stored candle's day is where the download resumes; the overlap on that day
        #is removed by drop_duplicates below
        start_date = str(stored['Date'].iloc[-1])[:10]
        end = end_date

        sort = 'asc'

        minute_url = f'https://api.polygon.io/v2/aggs/ticker/{ticker}/range/1/minute/{start_date}/{end}?adjusted=true&sort={sort}&limit=50000&apiKey={api}'

        #Requests the minute stock data from the url listed above and creates the proper dataframe
        minute_data = requests.get(minute_url).json()
        if 'results' not in minute_data:
            raise KeyError(f"No 'results' in the response for {ticker}: {minute_data}")

        fresh = pd.DataFrame(minute_data['results'])
        fresh['t'] = pd.to_datetime(fresh['t'], unit = 'ms')
        fresh = fresh.rename(columns = {'c': f'{ticker}_Close',
                                        'o': f'{ticker}_Open',
                                        'h': f'{ticker}_High',
                                        'l': f'{ticker}_Low',
                                        'v': f'{ticker}_Volume',
                                        't': 'Date'})

        if ignore_after_hours:
            #Keep candles from 13:30 up to (not including) 20:00. Done on the downloaded
            #frame's own 'Date' column instead of reusing the stored frame's column positions.
            minute_of_day = fresh['Date'].dt.hour * 60 + fresh['Date'].dt.minute
            fresh = fresh[(minute_of_day >= 13 * 60 + 30) & (minute_of_day < 20 * 60)].copy()

        #The stored dates are strings, so the new ones must be too or duplicates would never match
        fresh['Date'] = fresh['Date'].map(str)

        output_columns = [f'{ticker}_High',
                          f'{ticker}_Low',
                          f'{ticker}_Open',
                          f'{ticker}_Close',
                          f'{ticker}_Volume',
                          'Date']

        #Selecting by name drops the CSV's 'Unnamed: 0' index column and any extra API fields
        minute_df = pd.concat([stored[output_columns], fresh[output_columns]], ignore_index = True)
        minute_df.drop_duplicates(inplace = True)

    print(f'Updating {ticker}')

    if include_minute:
        return minute_df
    
# Extends the stored AAPL minute data up to 2022-10-10 and overwrites the CSV it was read from.
# NOTE(review): this runs a network request and rewrites the data file every time the cell is executed.
tst = update_ticker_data(ticker = 'AAPL', end_date = '2022-10-10', include_daily = False)
tst.to_csv('MINUTE_STOCK_DATA//AAPL.csv')

In [None]:
# Writes 'new_df' to the AAPL test-data CSV.
# NOTE(review): 'new_df' is not defined in any cell visible here, so this cell fails on a
# fresh kernel — confirm where it comes from or remove the cell.
new_df.to_csv(f'Test_Data//AAPL.csv')

In [None]:
# Timestamp ('YYYY-MM-DD HH:MM:SS') of the first candle that belongs to the testing data.
# NOTE(review): the reformat_minute_for_DL call further down passes its own split_date literal
# ('2022-07-22 13:30:00') instead of this constant — confirm which value is intended.
data_split_date = '2021-07-22 13:30:00' # Date defining the split between training and testing data

In [None]:
#This method processes the minute data for Deep Learning. It adds a 'target_column' which signifies whether the 
#neural network should predict a long or a short position for a stock.
#If split_data is true, the method splits the data into a training and testing set, otherwise it returns all of the data
#If read_local is True, it reads the data from the local CSV file. If it is False, the data is downloaded
#for the range given by the 'start' and 'end' parameters. The 'drop_date' parameter determines whether
#the 'Date' column is removed from the returned DF
def _add_minute_indicators(df, ticker):
    """Add the 14-candle ATR and the 9/50/100/200 close moving averages of `ticker` to `df` in place."""
    high = df[f'{ticker}_High']
    low = df[f'{ticker}_Low']
    close = df[f'{ticker}_Close']
    previous_close = close.shift(1)

    #True range: largest of high-low, |high-previous close|, |low-previous close|. The first
    #row has no previous close, and max() skips the resulting NaN, leaving high-low.
    true_range = pd.concat(
        [high - low, (high - previous_close).abs(), (low - previous_close).abs()],
        axis = 1,
    ).max(axis = 1)

    #Sum of the last (up to) 14 true ranges divided by 14
    df[f'{ticker}_Minute_ATR'] = true_range.rolling(window = 14, min_periods = 0).sum().div(14)

    for window in (9, 50, 100, 200):
        df[f'{ticker}_{window}_MA'] = close.rolling(window = window, min_periods = 0).mean()


def process_minute_data(ticker, start = None, end = None, read_local = True, split_data = True, split_date = None, drop_date = True):
    """Prepare minute data for deep learning by adding indicators and a target column.

    Parameters:
        ticker: symbol to process.
        start, end: yyyy-mm-dd range to download; only used when `read_local` is False.
        read_local: read 'MINUTE_STOCK_DATA//{ticker}.csv' instead of downloading.
        split_data: split the result at `split_date`; the rows from `split_date` onwards
            are written to 'Test_Data//{ticker}.csv' and only the earlier rows are returned.
        split_date: 'YYYY-MM-DD HH:MM:SS' value that must match a row's 'Date' exactly.
        drop_date: remove the 'Date' column (and the CSV's unnamed index column).

    Returns:
        The processed DataFrame: the price columns, '{ticker}_Minute_ATR', the four moving
        averages and, as the LAST column, '{ticker}_target' (1 = buy, 0 = don't buy).

    Raises:
        ValueError: if `split_data` is True and `split_date` is not one of the dates.
    """
    if read_local:
        #Loads the minute data from csv file if read_local is True
        df = pd.read_csv(f'MINUTE_STOCK_DATA//{ticker}.csv')
    else:
        df = get_ticker_data(ticker = ticker, start_date = start, end_date = end, include_daily = False)
        #The downloaded frame keeps gaps in its index where duplicates were removed
        df = df.reset_index(drop = True)

    if split_data:
        dates_list = [str(date) for date in df['Date'].to_list()]
        if split_date not in dates_list:
            raise ValueError(f'split_date {split_date!r} does not match any candle of {ticker}')
        split_date_index = dates_list.index(split_date)

    if drop_date:
        #'Unnamed: 0' is the index column that to_csv stored in the file; it only exists for local data
        df = df.drop(columns = [col for col in ('Date', 'Unnamed: 0') if col in df.columns])

    #Each candle is classified from its own close, the close 'hm_units' candles later, and the
    #'hm_units' lows in between. Plain lists are sliced here because looking every value up
    #through .iloc is very slow on hundreds of thousands of rows.
    closes = df[f'{ticker}_Close'].to_list()
    lows = df[f'{ticker}_Low'].to_list()

    labelled = max(len(df) - hm_units, 0)
    target_column = [
        classify(closes[i], closes[i + hm_units], lows[i + 1 : i + hm_units + 1])
        for i in range(labelled)
    ]

    #The last 'hm_units' candles have no future data to classify against, so they are simply
    #filled with zeros. Padding up to len(df) also keeps frames shorter than 'hm_units' valid.
    target_column.extend([0] * (len(df) - labelled))

    #Indicators are computed from the frame itself, so downloaded data (read_local = False) no
    #longer gets the ATR and moving averages of whatever happens to be in the local CSV
    _add_minute_indicators(df, ticker)

    #Adds the 'target_column' list to the dataframe; it must stay the last column
    df[f'{ticker}_target'] = target_column

    if split_data:
        #Splits the data into training and testing data
        train_df = df.iloc[:split_date_index].copy()
        test_df = df.iloc[split_date_index:]

        #Writes the testing data to a csv to be accessed later
        test_df.to_csv(f'Test_Data//{ticker}.csv')

        return train_df
    else:
        return df
    
#tst = process_minute_data('AAPL', start = '2022-06-01', end = '2022-08-01', read_local = False, split_data = False)

In [None]:
def reformat_minute_for_DL(ticker, start_date = None, end_date = None, read_local = True, split_data = True, split_date = None, drop_date = True):
    """Turn processed minute data into balanced, shuffled sequences for the network.

    All arguments are forwarded to process_minute_data. Every column except
    '{ticker}_target' is replaced by its percent change, then each run of SEQ_LEN
    consecutive rows becomes one sample labelled with the target of its last row.
    The samples are balanced to an equal number of buys and non-buys.

    Returns:
        (X, y): X holds the sequences, y holds the matching targets.
    """
    frame = process_minute_data(ticker = ticker, start = start_date, end = end_date, read_local = read_local, split_data = split_data, split_date = split_date, drop_date = drop_date)

    #Missing values become zero; rows containing infinities are removed
    frame.fillna(0, inplace = True)
    frame = frame.replace([np.inf, -np.inf], np.nan)
    frame.dropna(inplace = True)

    #Scales/Normalizes data: everything but the target becomes a percent change
    target_name = f'{ticker}_target'
    for name in frame.columns:
        if name == target_name:
            continue
        frame[name] = frame[name].pct_change()

    #Percent changes from a zero value are infinite and the first row has none at all
    frame = frame.replace([np.inf, -np.inf], 0)
    frame.fillna(0, inplace = True)

    #Sliding window over the rows: the deque always holds the latest SEQ_LEN feature rows
    sequential_data = []
    window = deque(maxlen = SEQ_LEN)

    for row in frame.values:
        window.append(list(row[:-1]))
        if len(window) == SEQ_LEN:
            sequential_data.append([np.array(window), row[-1]])

    random.shuffle(sequential_data)

    #Balances the data so that there are an equal number of 1's and 0's (otherwise the model
    #could reach a high accuracy by always guessing the more frequent value)
    buys = [[sequence, label] for sequence, label in sequential_data if label == 1]
    sells = [[sequence, label] for sequence, label in sequential_data if label == 0]

    random.shuffle(buys)
    random.shuffle(sells)

    smaller_class = min(len(buys), len(sells))
    buys = buys[:smaller_class]
    sells = sells[:smaller_class]

    #Shows the user that the model has been balanced
    print("Num buys/sells:")
    print(len(buys))
    print(len(sells))

    sequential_data = buys + sells
    random.shuffle(sequential_data)

    #X contains the data and y contains the targets
    X = [sequence for sequence, _ in sequential_data]
    y = [label for _, label in sequential_data]

    return np.array(X), np.array(y)

#tst = reformat_minute_for_DL(ticker = 'AAPL', start_date = '2022-08-01', end_date = '2022-08-07', read_local = False, split_data = False)

In [None]:
#Builds the balanced AAPL sequences (X) and their targets (y) from locally stored minute data
X, y = reformat_minute_for_DL(
    ticker = 'AAPL',
    start_date = '2022-04-01',
    end_date = '2022-10-10',
    read_local = True,
    split_data = True,
    split_date = '2022-07-22 13:30:00',
    drop_date = True,
)

In [None]:
def do_dl(ticker):
    """Train the stacked-LSTM buy / no-buy classifier for `ticker`.

    Sequences and targets come from reformat_minute_for_DL. The last 5% of the
    returned samples are held out as validation data; the rest are used for
    training with the module-level BATCH_SIZE and EPOCHS. Training is logged to
    TensorBoard under Records/logs/ and checkpoints are written under
    Records/models/.

    Parameters
    ----------
    ticker : str
        Symbol forwarded to reformat_minute_for_DL.

    Raises
    ------
    ValueError
        If so few sequences are returned that the 5% validation slice is empty.
    """
    #NOTE(review): only `ticker` is passed here, so this relies on reformat_minute_for_DL
    #having defaults for its remaining arguments - TODO confirm
    X, y = reformat_minute_for_DL(ticker)

    #Splits the data into training data and validation data (last 5% = validation)
    n_val = int(len(X) * 0.05)
    if n_val == 0:
        #With n_val == 0 the slice X[:-0] would be empty and training would fail obscurely
        raise ValueError(
            f"Not enough data to train on for {ticker}: got {len(X)} sequences, "
            "need at least 20 for a 5% validation split"
        )

    #NOTE(review): reformat_minute_for_DL shuffles overlapping sliding windows before returning,
    #so validation windows share candles with training windows - val_accuracy is likely optimistic
    X_train, y_train = X[:-n_val], y[:-n_val]
    X_test, y_test = X[-n_val:], y[-n_val:]

    print("Number of training data points:")
    print(len(X_train))

    model = Sequential()

    #Only the first layer needs input_shape; later layers infer it from the previous layer
    model.add(LSTM(64, input_shape = (X_train.shape[1:]), return_sequences = True))
    model.add(Dropout(0.2))
    model.add(BatchNormalization())

    model.add(LSTM(64, return_sequences = True))
    model.add(Dropout(0.2))
    model.add(BatchNormalization())

    #Last recurrent layer returns only its final state, which feeds the dense head
    model.add(LSTM(64))
    model.add(Dropout(0.2))
    model.add(BatchNormalization())

    model.add(Dense(256, activation = 'relu')) #512
    model.add(Dropout(0.2))
    #Two output units: class 0 and class 1
    model.add(Dense(2, activation = 'softmax'))

    opt = tf.keras.optimizers.Adam(learning_rate = 0.001, decay = 1e-6)

    model.compile(loss = 'sparse_categorical_crossentropy', optimizer = opt, metrics = ['accuracy'])

    tensorboard = TensorBoard(log_dir = f'Records/logs/{NAME}')

    #The {epoch}/{val_accuracy} placeholders are left in the path on purpose: Keras fills them in per epoch
    filepath = 'RNN_Final-{epoch: 02d}-{val_accuracy: .3f}'
    #The keyword arguments belong to ModelCheckpoint, not to str.format (which would silently drop them)
    checkpoint = ModelCheckpoint(
        'Records/models/{}.model'.format(filepath),
        monitor = 'val_accuracy',
        verbose = 1,
        save_best_only = True,
        mode = 'max',
    )

    model.fit(
        X_train,
        y_train,
        batch_size = BATCH_SIZE,
        epochs = EPOCHS,
        validation_data = (X_test, y_test),
        callbacks = [tensorboard, checkpoint],
    )
    

In [39]:
#Back testing

def _class_hit_rate(predictions, targets, label):
    """Return (hit_rate, n_true) for one class.

    hit_rate is the fraction of samples whose true class equals `label` that
    were also predicted as `label`. It is NaN when no sample has that true
    class, so an empty class cannot raise ZeroDivisionError.
    """
    of_class = targets == label
    n_true = int(of_class.sum())
    if n_true == 0:
        return float('nan'), 0
    n_hit = int((predictions[of_class] == label).sum())
    return n_hit / float(n_true), n_true


def back_test(ticker, model_location, num_candles = 390):
    """Back test a saved model on a random contiguous window of test data.

    Reads Test_Data//{ticker}.csv, applies the same pct_change preprocessing
    and SEQ_LEN-long sliding windows used by reformat_minute_for_DL, picks a
    random run of `num_candles` consecutive sequences, and compares the
    model's argmax predictions with the true targets. Results are printed.

    Note that the printed "Accuracy of 'k' predictions" figures are the share
    of samples whose TRUE class is k that were predicted as k.

    Parameters
    ----------
    ticker : str
        Symbol; selects the CSV file and the f'{ticker}_target' column.
    model_location : str
        Path handed to tf.keras.models.load_model.
    num_candles : int, optional
        How many consecutive candles to back test (390 = one day).

    Returns
    -------
    float
        Share of true class-1 samples predicted as 1, or NaN when the sampled
        window contains no class-1 samples.

    Raises
    ------
    ValueError
        If num_candles < 1 or the test data yields fewer than num_candles sequences.
    """
    if num_candles < 1:
        raise ValueError("num_candles must be at least 1")

    print("Attempting model back test")

    #Processing the testing data
    df = pd.read_csv(f'Test_Data//{ticker}.csv')
    #Drops the saved index column if the CSV has one
    df = df.drop(columns = ['Unnamed: 0'], errors = 'ignore')

    #Missing values become 0; rows holding an infinite value are removed
    df.fillna(0, inplace = True)

    df = df.replace([np.inf, -np.inf], np.nan)
    df.dropna(inplace = True)

    #Scales/Normalizes data (every column except the target becomes a percent change)
    for col in df.columns:
        if col != f'{ticker}_target':
            df[col] = df[col].pct_change()

    #pct_change leaves NaN in the first row and inf after a 0 value; both become 0
    df = df.replace([np.inf, -np.inf], 0)
    df.fillna(0, inplace = True)

    #Sliding window: each sample is the last SEQ_LEN rows of features plus the newest row's target
    #NOTE(review): the last column is used as the target - assumes f'{ticker}_target' is the final column, TODO confirm
    sequential_data = []
    prev_units = deque(maxlen = SEQ_LEN)

    for row in df.values:
        prev_units.append([n for n in row[:-1]])
        if len(prev_units) == SEQ_LEN:
            sequential_data.append([np.array(prev_units), row[-1]])

    #Last index at which a full window of num_candles sequences still fits
    last_start = len(sequential_data) - num_candles
    if last_start < 0:
        raise ValueError(
            f"Not enough test data for {ticker}: {len(sequential_data)} sequences available, "
            f"{num_candles} required"
        )

    #randrange excludes its upper bound, hence the + 1 so the final window can be chosen too
    index = random.randrange(last_start + 1)

    window = sequential_data[index:index + num_candles]
    X = np.array([seq for seq, _ in window])
    y = np.array([target for _, target in window])

    #Testing candle number 'index' and the following num_candles - 1 candles as well
    print(f"Testing candle + {num_candles - 1}: {index}")

    model = tf.keras.models.load_model(model_location)

    #Creates list of prediction probabilities (one probability per class for every sample);
    #the class with the highest probability is taken as the model's guess
    prediction_probabilities = model.predict(X)
    predictions = np.argmax(prediction_probabilities, axis = 1)

    #Accuracy validation
    print("Total accuracy:")
    print(float((predictions == y).sum()) / float(len(y)))

    rate_0, n_true_0 = _class_hit_rate(predictions, y, 0)
    print("Accuracy of '0' predictions:")
    print(rate_0)

    rate_1, n_true_1 = _class_hit_rate(predictions, y, 1)
    print("Accuracy of '1' predictions:")
    print(rate_1)

    print("number of true buys/sells")
    print(n_true_1, n_true_0)

    return rate_1

In [None]:
#Overrides of the notebook-level settings for the next training / back test run

#Labelling settings
hm_units = 10   #How many candles into the future the model predicts
req = 0.002     #% increase required for a candle to count as a 'buy'

#Training settings
EPOCHS = 20
BATCH_SIZE = 128    #128 = original value; 32 = potentially optimized value
LSIZE = 128         #128 = original value; 64 = potentially optimized value

#do_dl('AAPL')

In [None]:
#Repeats the back test on independently sampled windows and reports the mean returned score
NUM_BACK_TESTS = 40    #Single source of truth for both the loop count and the average

accuracy = []

for _ in range(NUM_BACK_TESTS):
    accuracy.append(back_test('AAPL', 'Records//Successful Models//AAPL.model'))

#np.nanmean ignores NaN entries, so one run with an undefined score does not poison the average
print(np.nanmean(accuracy))

In [40]:
#back_test('AAPL', 'Records//models//RNN_Final- 20- 0.714.model')
#Single back test run of the saved AAPL model; the returned score is shown as the cell output
back_test(
    ticker = 'AAPL',
    model_location = 'Records//Successful Models//AAPL.model',
)

Attempting model back test
Testing candle + 389: 8911
Total accuracy:
0.09230769230769231
Accuracy of '0' predictions:
0.0
Accuracy of '1' predictions:
1.0
number of true buys/sells
36 354


1.0

In [None]:
def simulate_trading(ticker, start, end):
    df = get_ticker_data(ticker = ticker, start_date = start, end_date = end, include_daily = False)
    
    