In [1]:
from collections import Counter, deque
import datetime
import matplotlib.pyplot as plt
from matplotlib import style
import numpy as np
import os
import pandas as pd
from pandas_datareader import data as pdr
import pickle
import random
import requests
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM, BatchNormalization
from tensorflow.keras.callbacks import TensorBoard, ModelCheckpoint
import time

style.use('ggplot')

with open("sp500tickers_unrevised.pickle", "rb") as f:
            tickers = pickle.load(f)
       

    
dropped_tickers = ('BALL', 'BRK-B', 'BF-B', 'KMX', 'WBD', 'CEG', 'PARA', 'WTW', 'META', 'OGN', 'BBWI')

for i in dropped_tickers:
    if i in tickers:
        del tickers[tickers.index(i)]
        
hm_units = 10
req = 0.002
SEQ_LEN = 60
BATCH_SIZE = 132
EPOCHS = 30
NAME = f'{SEQ_LEN}-SEQ-{hm_units}-PRED-{int(time.time())}'
CORRELATION_COEFFICIENT = 0.92

#api keys that will be used to access data from the source
        
api = 'gDvQNVWDC2mhOlN1Z9if6JEyDM08CpeC'

In [2]:
#This function determines whether the model should buy or sell shares of a company. 
#One of three values will be returned, either a -1, 0, or 1
#If a -1 is returned, that means that the model predicts a percent decrease in price, 
#equivalant in magnitude to the 'req' variable, over the next (hm_units) candles, and it
#also predicts that the price will *not* increase in value by a significant degree before
#it reaches the prdicted candle. (ie, the model tries to predict instances of increase/decrease
#that will not result in the user being 'wicked out' of the trade, should the price fluctuate 
#between low and high values very aggressively.

#The function takes two parameters: the current price, and the list of future prices the immediately follow the
#current price. This 'future_prices' list will be exactly 'hm_units' long.
def classify(current_price, future_price, future_lows):
    
    #Sets the threshold value for when a price movement is considered to be significant 
    requirement = req
    
    #Creates a list of %changes in relation to 'current_price' for each future price within 'future_prices' 
    lows = []
    for price in future_lows:
        lows.append((price - current_price) / current_price)
        
    future_change = (future_price - current_price) / current_price
    
    #If the last value (value to be predicted) is greater than or equal to the threshold (a significant increase is predicted),
    #the method will prepare to return either a 1 (significant increase w/o prior decrease) or a 0 (significant increase *with*
    #prior decrease
    if future_change >= requirement:
        
        #Checks to see if any of the values in the 'futures' list (excluding the very last value (prediction value))
        #fall below the requirement (hit the stop loss of the trade before the shares are sold)
        for low in future_lows:
            if low <= -requirement:
                return 0
            
        return 1
    
    else:
        return 0
    
    #If, in 'hm_units' candles, there are no significant increases or decreases, the function simply returns a zero
    

In [3]:
#Calculates the ATR of a specific stock at any given time
def Minute_ATR_Column(ticker):
    df = pd.read_csv(f'MINUTE_STOCK_DATA//{ticker}.csv')
    #df.set_index('Date', inplace = True)
    
    df.drop([f'{ticker}_Open', f'{ticker}_Volume'], axis = 1, inplace = True)
    
    atrs = []
    true_ranges = []
    
    df[f'{ticker}_Close'] = (df[f'{ticker}_Close'].shift(1))
    
    true_ranges.append(df[f'{ticker}_High'].iloc[0] - df[f'{ticker}_Low'].iloc[0])
    
    for i in range(1, len(df)):
        ranges = []
        ranges.append(df[f'{ticker}_High'].iloc[i] - df[f'{ticker}_Low'].iloc[i])
        ranges.append(abs(df[f'{ticker}_High'].iloc[i] - df[f'{ticker}_Close'].iloc[i]))
        ranges.append(abs(df[f'{ticker}_Low'].iloc[i] - df[f'{ticker}_Close'].iloc[i]))
        true_ranges.append(max(ranges))
    
    TRS = {'Ranges': true_ranges, 'Date': df.index.values}
    true_ranges_DF = pd.DataFrame(data = TRS)
    true_ranges_DF.set_index('Date', inplace = True)
    
    #df = df.join(true_ranges_DF, how = 'outer')
    df['ATRS'] = true_ranges_DF['Ranges'].rolling(window = 14, min_periods = 0).sum().div(14)
    return df['ATRS']
       

In [4]:
def moving_averages_column(ticker, daily = False, minute = False):
    
    if daily:
        dfd = pd.read_csv(f'DAILY_STOCK_DATA//{ticker}.csv') #.format(ticker.replace('.', '-')))
        dfd = dfd[f'{ticker}_Close'].to_frame()
        dfd[f'{ticker}_Daily_MA_9'] = dfd[f'{ticker}_Close'].rolling(window = 9, min_periods = 0).mean()
        dfd[f'{ticker}_Daily_MA_50'] = dfd[f'{ticker}_Close'].rolling(window = 50, min_periods = 0).mean()
        dfd[f'{ticker}_Daily_MA_100'] = dfd[f'{ticker}_Close'].rolling(window = 100, min_periods = 0).mean()
        dfd[f'{ticker}_Daily_MA_200'] = dfd[f'{ticker}_Close'].rolling(window = 200, min_periods = 0).mean()
    
        return dfd[f'{ticker}_Daily_MA_9'], dfd[f'{ticker}_Daily_MA_50'], dfd[f'{ticker}_Daily_MA_100'], dfd[f'{ticker}_Daily_MA_200']
    
    if minute:
        dfm = pd.read_csv(f'MINUTE_STOCK_DATA//{ticker}.csv') #.format(ticker.replace('.', '-')))
        dfm = dfm[f'{ticker}_Close'].to_frame()
        dfm[f'{ticker}_Minute_MA_9'] = dfm[f'{ticker}_Close'].rolling(window = 9, min_periods = 0).mean()
        dfm[f'{ticker}_Minute_MA_50'] = dfm[f'{ticker}_Close'].rolling(window = 50, min_periods = 0).mean()
        dfm[f'{ticker}_Minute_MA_100'] = dfm[f'{ticker}_Close'].rolling(window = 100, min_periods = 0).mean()
        dfm[f'{ticker}_Minute_MA_200'] = dfm[f'{ticker}_Close'].rolling(window = 200, min_periods = 0).mean()
    
        return dfm[f'{ticker}_Minute_MA_9'], dfm[f'{ticker}_Minute_MA_50'], dfm[f'{ticker}_Minute_MA_100'], dfm[f'{ticker}_Minute_MA_200']
    
#moving_averages_column('AAPL')

In [5]:
#This function will get the daily and minute data for a specific ticker, and it will save it to a CSV to be accessed later.
#The desired ticker must be passed as an argument, as well as the start/end date of the data (string). 
#The 'ignore_after_hours' parameter (boolean) will dictate whether the dataset only contains active hours data, 
#or if it includes pre/post market. If the pre/post market data is also taken, the user must account for 
#a high volume of zero volume periods, since the stocks are traded less frequently during these times. 
#The 'minute' and 'daily' paramaters are booleans which dicate which type of data will be acquired:
#daily data or minute data. Both initially set to True

#Dates must be in format yyyy-mm-dd
def get_ticker_data(ticker, start_date, end_date, ignore_after_hours = True, include_minute = True, include_daily = True):
    
        print(f'Getting {ticker} data')
    
        #The 'sort' variable dictates the method by which the data is sorted. asc (Ascending) means the oldest dates are at the top
        #and desc (Descending) means the newest dates are at the top. DO NOT CHANGE. This variable was only put in place to make it easier
        #for me to develop/test the code while I was writing it. 
        sort = 'asc' #or desc
        
        #This variable dictates how many days the 'date_tracker' objecd (created below) will be advanced by each time new data is added
        #To the stock minute dataframe
        increment = 60
        
        #This is the file path to store the final Minute stock data
        minute_data_path = f'MINUTE_STOCK_DATA//{ticker}'
        
        #This is the file path to store the final Daily stock data
        daily_data_path = f'DAILY_STOCK_DATA//{ticker}'
        
        #Creates 'start'/'end' strings from the 'start_date' and 'end_date' parameters that can be accessed during execution of the method
        start = start_date
        end = end_date
        
        #Creates a datetime object that stores the 'end' date (to be compared to during the while loop)
        end_date_time = datetime.datetime(int(end[:4]), int(end[5:7]), int(end[8:10]))
        
        #This 'date_tracker' object will allow the following while loop determine when to exit the loop
        date_tracker = datetime.datetime(int(start[:4]), int(start[5:7]), int(start[8:10]))
        
        #Url string for the initial 'df1' data
        minute_url = f'https://api.polygon.io/v2/aggs/ticker/{ticker}/range/1/minute/{str(date_tracker)[:10]}/{end}?adjusted=true&sort={sort}&limit=50000&apiKey={api}'
        
        #If the 'minute' parameter is set to True
        if include_minute:
    
            #Requests the minute stock data from the url listed above and creates the proper dataframe    
            firstData = requests.get(minute_url).json()
            df1 = pd.DataFrame(firstData['results'])
            df1.reset_index(inplace = True)
            df1.drop(['vw', 'n'], axis = 1, inplace = True)

            df1['t'] = pd.to_datetime(df1['t'], unit = 'ms')
            df1.rename(columns = {'c': f'{ticker}_Close', 
                                 'o': f'{ticker}_Open', 
                                 'h': f'{ticker}_High',
                                 'l': f'{ticker}_Low',
                                 'v': f'{ticker}_Volume',
                                 't': 'Date'}, inplace = True)
            
            #List of features contained within the initial stock data
            features_list = []

            #Populates the list of features
            for col in df1.columns:
                features_list.append(col)

            #The list of lists containing data from each feature within the minute stock data. The index of each list will 
            #correspond with the index of the the list's data's respective feature within the 'features' list
            df1_data_list = []

            #Appends a list form of each column (feature) of data to the 'df1_data_list'
            for col in df1.columns:
                df1_data_list.append(df1[col].to_list())

            #Status update
            '''print("Collecting and compiling data")'''

            #Since each data call is limited to 50000 candles, this while loop is needed to join multiple 50000 candle chunks of data together
            #The 'date_tracker' object will tell the while loop when it should exit the loop, and that is when the 'date-tracker' object's date
            #Surpasses the requested end_date_time minus seventy days. The reason the seventy is subtracted, is so that an error is not thrown 
            #when the date_tracker (which is continually incremented by 70 days throughout the loop) surpasses the requested end_date_time, 
            #thereby possibly requesting data from the future (which obviously doesn't exist)
            while date_tracker < (end_date_time - datetime.timedelta(days = increment)):

                #Prints the first date of each obtained chunk of data (since data can only be collected in chunks of 50000)
                #print(date_tracker)

                #Second url that grabs the data that will be added to the end of the initial 'df1' dataframe
                minute_url2 = f'https://api.polygon.io/v2/aggs/ticker/{ticker}/range/1/minute/{str(date_tracker + datetime.timedelta(days = increment))[:10]}/{end}?adjusted=true&sort={sort}&limit=50000&apiKey={api}'

                #Acquires the 'to-be-added' data and formats it accordingly
                secondData = requests.get(minute_url2).json()
                df2 = pd.DataFrame(secondData['results'])
                df2.reset_index(inplace = True)
                df2.drop(['vw', 'n'], axis = 1, inplace = True)
                df2['t'] = pd.to_datetime(df2['t'], unit = 'ms')
                df2.rename(columns = {'c': f'{ticker}_Close', 
                                     'o': f'{ticker}_Open', 
                                     'h': f'{ticker}_High',
                                     'l': f'{ticker}_Low',
                                     'v': f'{ticker}_Volume',
                                     't': 'Date'}, inplace = True)

                #Creates a list that will contain list formats of each features' data from the 'df2' dataframe
                df2_data_list = []

                #Populates the above list
                for col in df2.columns:
                    df2_data_list.append(df2[col].to_list())

                #Appends the contents of the newly aqcuired list to the initial list
                for i in range(len(features_list)):
                    df1_data_list[i].extend(df2_data_list[i])

                #Increments the date_tracker object by 'increment' days
                date_tracker += datetime.timedelta(days = increment)

            #Progress tracker for the progression of the below code
            progress = 0.0

            #If the 'ignore_after_hours' paramater is read as True, excecute the following code
            if ignore_after_hours:

                #List containing all of the indeces that are to be deleted
                indeces_to_delete = []

                #Status update
                '''print("Finding all pre/post market data")'''

                #This loop will iterate through every data point in the data and determine the indeces of all 
                #pre/post market data
                for i in range(len(df1_data_list[features_list.index('Date')])):

                    #Prints the progress of this loop
                    '''if(progress % 50000 == 0):
                        print(progress / float(len(df1_data_list[features_list.index('Date')])))

                    progress += 1'''

                    #Sets the hour/minute values of each iteration
                    hour = int(str(df1_data_list[features_list.index('Date')][i])[11:13])
                    minute = int(str(df1_data_list[features_list.index('Date')][i])[14:16])

                    #Appends the index of all of the candles that occur before 1:30 pm and after
                    #8:00 pm (since those are the beginning and end times for the recorded stock data

                    if hour < 13:
                        indeces_to_delete.append(i)
                    elif hour >= 20:
                        indeces_to_delete.append(i)
                    elif hour == 13 and minute < 30:
                        indeces_to_delete.append(i)

                #Status update
                '''print("Deleting all pre/post market data")'''

                #This variable accounts for the shift in the data that occurs every time an index is deleted.
                #The 'indeces_to_delete' list contains all of the indeces of candles that occur during pre/post market,
                #but every time one of those is deleted in the loop below, the true index of these candles is shifted 
                #down by one. The 'compensation' variable keeps track of this, and acouunts for it during the filtering process.
                compensation = 0

                #Tracks the progress of the loop
                progress = 0.0

                #Iterate through every index of the 'indeces_to_delete' list
                for i in indeces_to_delete:

                    #Prints the progress to the user
                    '''if(progress % 30000 == 0):
                        print(progress / float(len(indeces_to_delete)))

                    progress += 1'''

                    #Deletes the current loop iteration's respective index for every feature within
                    #The list of features 'df1_data_list'
                    for feature in range(len(features_list)):
                        del df1_data_list[feature][i - compensation]

                    #Increases 'compensation' by one to account for the shift in data caused by index deletion
                    compensation += 1

            #Status update
            '''print(f'Exporting {ticker} minute data')'''

            #Creates the DataFrame containing the minute data and writes it to a CSV File
            minute_data_df = pd.DataFrame()

            minute_data_df[f'{ticker}_Volume'] = df1_data_list[features_list.index(f'{ticker}_Volume')]
            minute_data_df[f'{ticker}_High'] = df1_data_list[features_list.index(f'{ticker}_High')]
            minute_data_df[f'{ticker}_Low'] = df1_data_list[features_list.index(f'{ticker}_Low')]
            minute_data_df[f'{ticker}_Open'] = df1_data_list[features_list.index(f'{ticker}_Open')]
            minute_data_df[f'{ticker}_Close'] = df1_data_list[features_list.index(f'{ticker}_Close')]
            minute_data_df['Date'] = df1_data_list[features_list.index('Date')]

            minute_data_df.drop_duplicates(inplace = True)

            #minute_data_df.to_csv(f'MINUTE_STOCK_DATA//{ticker}.csv')

            #Status update
            '''print(f'Exporting {ticker} Daily data')'''
            
        #If the 'daily' parameter is set to True
        if include_daily:
        
            #Beginning of Daily Data collection
            daily_url = f'https://api.polygon.io/v2/aggs/ticker/{ticker}/range/1/day/{start}/{end}?adjusted=true&sort={sort}&limit=50000&apiKey={api}'

            daily_data_df = requests.get(daily_url).json()
            daily_data_df = pd.DataFrame(daily_data_df['results'])

            daily_data_df.reset_index(inplace = True)
            daily_data_df.drop(['vw', 'n'], axis = 1, inplace = True)

            daily_data_df['t'] = pd.to_datetime(daily_data_df['t'], unit = 'ms')
            daily_data_df.rename(columns = {'c': f'{ticker}_Close', 
                                 'o': f'{ticker}_Open', 
                                 'h': f'{ticker}_High',
                                 'l': f'{ticker}_Low',
                                 'v': f'{ticker}_Volume',
                                 't': 'Date'}, inplace = True)

            #daily_data.to_csv(f'DAILY_STOCK_DATA//{ticker}.csv')
            
        #Returns the proper DateFrames depending on which (or both) of the 'minute'/'daily' parameters are True
        if include_minute and include_daily:
            return minute_data_df, daily_data_df
        elif include_minute:
            return minute_data_df
        elif include_daily:
            return daily_data_df
        #If neither the 'minute' nor the 'daily' parameter is True, prints an error statement
        else:
            print("No data type specified")
        '''print(f'Done with {ticker}')'''


In [6]:
#Calls the 'get_ticker_data' function for every ticker in the SP500 (excluding some oddities)
def acquire_all_data(start, end):
    
    for ticker in range(0, len(tickers)):
        try:
            mdf, ddf = get_ticker_data(tickers[ticker], start, end)
            mdf.to_csv(f'MINUTE_STOCK_DATA//{tickers[ticker]}.csv')
            ddf.to_csv(f'DAILY_STOCK_DATA//{tickers[ticker]}.csv')
        except:
            print("Too may calls, delaying for thirteen seconds and retrying")
            time.sleep(13)
            try:
                mdf, ddf = get_ticker_data(tickers[ticker], start, end)
                mdf.to_csv(f'MINUTE_STOCK_DATA//{tickers[ticker]}.csv')
                ddf.to_csv(f'DAILY_STOCK_DATA//{tickers[ticker]}.csv')
            except:
                print(f"Unable to acquire ticker {tickers[ticker]} daily/minute data")
            continue
            
    #Gets minute data including after hours
    for ticker in range(0, len(tickers)):
        try:
            mdfaf= get_ticker_data(tickers[ticker], start, end, include_daily = False, ignore_after_hours = False)
            mdfaf.to_csv(f'MINUTE_STOCK_DATA_AFTERHOURS//{tickers[ticker]}.csv')
        except:
            print("Too may calls, delaying for thirteen seconds and retrying")
            time.sleep(13)
            try:
                mdfaf = get_ticker_data(tickers[ticker], start, end, include_daily = False, ignore_after_hours = False)
                mdfaf.to_csv(f'MINUTE_STOCK_DATA_AFTERHOURS//{tickers[ticker]}.csv')
            except:
                print(f"Unable to acquire ticker {tickers[ticker]} after hours data")
            continue
    print('Complete')

#acquire_all_data(start = '2017-07-08', end = '2022-07-24')

In [7]:
def update_ticker_data(ticker, end_date, ignore_after_hours = True, include_minute = True, include_daily = True):
    if not include_minute and not include_daily:
        print("No time window specified")
        
    if include_minute:
        df1 = pd.read_csv(f'MINUTE_STOCK_DATA//{ticker}.csv')
        df1.drop(['Unnamed: 0'], inplace = True, axis = 1)
        
        features_list = []
        
        for col in df1.columns:
            features_list.append(col)
        
        start_date = df1['Date'].to_list()[-1][:10]
        end = end_date
        
        sort = 'asc'
        
        minute_url = f'https://api.polygon.io/v2/aggs/ticker/{ticker}/range/1/minute/{start_date}/{end}?adjusted=true&sort={sort}&limit=50000&apiKey={api}'
  
        #Requests the minute stock data from the url listed above and creates the proper dataframe    
        minute_data = requests.get(minute_url).json()
        df2 = pd.DataFrame(minute_data['results'])
        df2.reset_index(inplace = True)
        df2.drop(['vw', 'n', 'index', 'a', 'op'], axis = 1, inplace = True)

        df2['t'] = pd.to_datetime(df2['t'], unit = 'ms')
        df2.rename(columns = {'c': f'{ticker}_Close', 
                             'o': f'{ticker}_Open', 
                             'h': f'{ticker}_High',
                             'l': f'{ticker}_Low',
                             'v': f'{ticker}_Volume',
                             't': 'Date'}, inplace = True)
        
        features_list2 = []
        
        for col in df2.columns:
            features_list2.append(col)
      
        df1_data_list = []
        
        for col in df1.columns:
            df1_data_list.append(df1[col].to_list())
                
        df2_data_list = []
        
        for col in df2.columns:
            df2_data_list.append(df2[col].to_list())
            
        if ignore_after_hours:

            #List containing all of the indeces that are to be deleted
            indeces_to_delete = []

            #Status update
            '''print("Finding all pre/post market data")'''
            
            progress = 0.0

            #This loop will iterate through every data point in the data and determine the indeces of all 
            #pre/post market data
            for i in range(len(df2_data_list[features_list.index('Date')])):

                #Prints the progress of this loop
                '''if(progress % 50000 == 0):
                    print(progress / float(len(df1_data_list[features_list.index('Date')])))'''

                progress += 1

                #Sets the hour/minute values of each iteration
                hour = int(str(df2_data_list[features_list.index('Date')][i])[11:13])
                minute = int(str(df2_data_list[features_list.index('Date')][i])[14:16])

                #Appends the index of all of the candles that occur before 1:30 pm and after
                #8:00 pm (since those are the beginning and end times for the recorded stock data

                if hour < 13:
                    indeces_to_delete.append(i)
                elif hour >= 20:
                    indeces_to_delete.append(i)
                elif hour == 13 and minute < 30:
                    indeces_to_delete.append(i)

            #Status update
            '''print("Deleting all pre/post market data")'''

            #This variable accounts for the shift in the data that occurs every time an index is deleted.
            #The 'indeces_to_delete' list contains all of the indeces of candles that occur during pre/post market,
            #but every time one of those is deleted in the loop below, the true index of these candles is shifted 
            #down by one. The 'compensation' variable keeps track of this, and acouunts for it during the filtering process.
            compensation = 0

            #Tracks the progress of the loop
            progress = 0.0

            #Iterate through every index of the 'indeces_to_delete' list
            for i in indeces_to_delete:

                #Prints the progress to the user
                '''if(progress % 30000 == 0):
                    print(progress / float(len(indeces_to_delete)))'''

                progress += 1

                #Deletes the current loop iteration's respective index for every feature within
                #The list of features 'df1_data_list'
                for feature in range(len(features_list)):
                    del df2_data_list[feature][i - compensation]

                #Increases 'compensation' by one to account for the shift in data caused by index deletion
                compensation += 1

        for i in range(len(df2_data_list[features_list2.index('Date')])):
            df2_data_list[features_list2.index('Date')][i] = str(df2_data_list[features_list2.index('Date')][i])
            
        for feature in features_list:
            df1_data_list[features_list.index(feature)].extend(df2_data_list[features_list2.index(feature)])
            
        minute_df = pd.DataFrame()
        
        minute_df[f'{ticker}_High'] = df1_data_list[features_list.index(f'{ticker}_High')]
        minute_df[f'{ticker}_Low'] = df1_data_list[features_list.index(f'{ticker}_Low')]
        minute_df[f'{ticker}_Open'] = df1_data_list[features_list.index(f'{ticker}_Open')]
        minute_df[f'{ticker}_Close'] = df1_data_list[features_list.index(f'{ticker}_Close')]
        minute_df[f'{ticker}_Volume'] = df1_data_list[features_list.index(f'{ticker}_Volume')]
        minute_df['Date'] = df1_data_list[features_list.index('Date')]
       
        minute_df.drop_duplicates(inplace = True)
    
    print(f'Updating {ticker}')
 
    if include_minute and include_daily:
        return minute_df#, daily_df
    elif include_minute:
        return minute_df
    elif include_daily:
        return daily_df
    
#tst = update_ticker_data('AAPL', '2022-07-22')
#tst.to_csv('MINUTE_STOCK_DATA//AAPL.csv')

In [8]:
def update_all_ticker_data(end):
    for ticker in range(0, len(tickers)):
        try:
            mdf, ddf = update_ticker_data(tickers[ticker], end_date = end)
            mdf.to_csv(f'MINUTE_STOCK_DATA//{tickers[ticker]}.csv')
            ddf.to_csv(f'DAILY_STOCK_DATA//{tickers[ticker]}.csv')
        except:
            print("Too may calls, delaying for thirteen seconds and retrying")
            time.sleep(13)
            try:
                mdf, ddf = get_ticker_data(tickers[ticker], start, end)
                mdf.to_csv(f'MINUTE_STOCK_DATA//{tickers[ticker]}.csv')
                ddf.to_csv(f'DAILY_STOCK_DATA//{tickers[ticker]}.csv')
            except:
                print(f"Unable to update ticker {tickers[ticker]} daily/minute data")
            continue
    
    #Updates after hours minute data
    for ticker in range(0, len(tickers)):
        try:
            mdfaf = update_ticker_data(tickers[ticker], end_date = end, include_daily = False, ignore_after_hours = False)
            mdfaf.to_csv(f'MINUTE_STOCK_DATA_AFTERHOURS//{tickers[ticker]}.csv')
        except:
            print("Too may calls, delaying for thirteen seconds and retrying")
            time.sleep(13)
            try:
                mdfaf = get_ticker_data(tickers[ticker], start, end, include_daily = False, ignore_after_hours = False)
                mdfaf.to_csv(f'MINUTE_STOCK_DATA_AFTERHOURS//{tickers[ticker]}.csv')
            except:
                print(f"Unable to update ticker {tickers[ticker]} after hours data")
            continue
    print('Complete')


In [9]:
#This method processes the minute data for Deep Learning. It adds a 'target_column' which signifies whether the 
#neural network should predict a long or a short position for a stoc. 
def process_minute_data(ticker):
    
    #Loads the minute data from csv file
    df = pd.read_csv(f'MINUTE_STOCK_DATA//{ticker}.csv')
    
    #Drops the 'Date' and the other extraneous column (idk how it got there I didn't make it), since they will not be needed
    df.drop(['Date', 'Unnamed: 0'], axis = 1, inplace = True)
    
    #This list will contain lists of the future values for a specific candle. Each list within 'future_column' will
    #be 'hm_units' long. This list is never put into the data frame that will be fed through the neural network. It is
    #created for the soul purpose of creating the 'target_column' which tells the Neural network whether or not its guess
    #was correct. Each list of futures within this list will befed through the 'classify' method, along with the close 
    #price of the candle that directly preceeds the first index of the list. Again, this is only a temporary list, and 
    #will be discarded after the function has completed its process. The 'future_column' list will also be 'hm_units'
    #shorter than the final data frame bc the 'classify' function can't classify values that run outside of the index
    #of the close prices. (ie, if the 'classify' function must create a result 'hm_units' into the future, and it 
    #is fed the very last data point of the close prices, there is no future data to calculate %discrepency against.
    future_lows_column = []
    
    #This loop appends a list of 'hm_units' future values to the 'future_column' list
    for i in range(len(df) - hm_units):
        
        
        future_lows = []
        
        #Appends each future value to the 'futures' list, skipping the first (since that is the current value)
        for ii in range(1, hm_units + 1):
            future_lows.append(df[f'{ticker}_Low'].iloc[i + ii])
    
        future_lows_column.append(future_lows)
    
    #Target column which will be added to the data frame. This column essentially uses the 'classify' function
    #to calculate whether a stock will significantly increase or decrease during the next 'hm_units' minute candles.
    target_column = []
    
    #Feeds every close value of the current ticker and every list within the 'future_column' list through the 'classify'
    #function, and appends it to the 'target_column'
    for i in range(len(future_lows_column)):
        target_column.append(classify(df[f'{ticker}_Close'].iloc[i], df[f'{ticker}_Close'].iloc[i + hm_units], future_lows_column[i]))
    
    #Since the 'target_column' can't make predictions with data it doesn't have (when reaching the end of the close prices,
    #The last 'hm_units' of the 'target_column' is simply filled with a zero. Bc each dataframe has roughly 150k to 200k 
    #datapoints, these zeros should not affect accuracy in the slightest
    for i in range(hm_units):
        target_column.append(0)
        
    
    #Adds the minute atr column to the data frame
    df[f'{ticker}_Minute_ATR'] = Minute_ATR_Column(ticker)
    
    #Adds a 9, 50, 100, and 200 MA to the model
    df[f'{ticker}_9_MA'], df[f'{ticker}_50_MA'], df[f'{ticker}_100_MA'], df[f'{ticker}_200_MA'] = moving_averages_column(ticker, minute = True)
    
    #Adds the 'target_column' list to the dataframe, and returns the dataframe
    df[f'{ticker}_target'] = target_column
    
    return df


In [10]:
#splits sequential data into a training and testing set
def split_sequential_data(seq_data):
    
    last_20pct = -int(len(seq_data) * 0.20)
    
    training_data = seq_data[:last_20pct]
    backtesting_data = seq_data[last_20pct:]
    
    return training_data, backtesting_data

In [11]:
def reformat_minute_for_DL(ticker):
    
    df = process_minute_data(ticker)
    
    df.fillna(0, inplace = True)
    
    df = df.replace([np.inf, -np.inf], np.nan)
    df.dropna(inplace = True)
    
    #Scales/Normalizes data
    for col in df.columns:
        #if col != f'{ticker}_target' or col != f'{ticker}_RSI_Weights':
        if col != f'{ticker}_target':
            df[col] = df[col].pct_change()
        
    df = df.replace([np.inf, -np.inf], 0)
    df.fillna(0, inplace = True)
    
    sequential_data = []
    prev_units = deque(maxlen = SEQ_LEN)
    
    for i in df.values:
        prev_units.append([n for n in i[:-1]])
        if len(prev_units) == SEQ_LEN:
            sequential_data.append([np.array(prev_units), i[-1]])
            
    print("Number of total data points:")
    print(len(sequential_data))
    
    training_sequential, testing_sequential = split_sequential_data(sequential_data)
    
    random.shuffle(training_sequential)
    
    buys = []
    sells = []
 
    for seq, target in training_sequential:
        if target == 0:
            sells.append([seq, target])
        elif target == 1:
            buys.append([seq, target])    
   
    random.shuffle(buys)
    random.shuffle(sells)

    lower = min(len(buys), len(sells))
    
    buys = buys[:lower]
    sells = sells[:lower]
    
    print("Num buys/sells:")
    print(len(buys))
    print(len(sells))

    training_sequential = buys + sells
    random.shuffle(training_sequential)
    
    
    X = []
    y = []
    
    for seq, target in training_sequential:
        X.append(seq)
        y.append(target)
    
    return np.array(X), np.array(y), testing_sequential

#X, y = preprocess_df_minute('AAPL')


In [12]:
def do_dl(ticker):
    X, y, back_testing_data = reformat_minute_for_DL(ticker)
    
    last_5pct = -int(len(X) * 0.05)
    
    X_train = X[:last_5pct]
    y_train = y[:last_5pct]
    X_test = X[last_5pct:]
    y_test = y[last_5pct:]
    
    print("Number of training data points:")
    print(len(X_train))
   
    model = Sequential()
    model.add(LSTM(64, input_shape = (X_train.shape[1:]), return_sequences = True))
    model.add(Dropout(0.2))
    model.add(BatchNormalization())
    
    model.add(LSTM(64, input_shape = (X_train.shape[1:]), return_sequences = True))
    model.add(Dropout(0.2))
    model.add(BatchNormalization())
    
    model.add(LSTM(64, input_shape = (X_train.shape[1:])))
    model.add(Dropout(0.2))
    model.add(BatchNormalization())
              
    model.add(Dense(256, activation = 'relu')) #512
    model.add(Dropout(0.2))
    model.add(Dense(2, activation = 'softmax'))
    
    opt = tf.keras.optimizers.Adam(learning_rate = 0.001, decay = 1e-6)
    
    model.compile(loss = 'sparse_categorical_crossentropy', optimizer = opt, metrics = ['accuracy']) 
    
    tensorboard = TensorBoard(log_dir = f'Records/logs/{NAME}')
    
    filepath = 'RNN_Final-{epoch: 02d}-{val_accuracy: .3f}'
    checkpoint = ModelCheckpoint('Records/models/{}.model'.format(filepath, monitor = 'val_accuracy', verbose = 1, save_best_only = True, mode = 'max'))
  
    history = model.fit(X_train, y_train, batch_size = BATCH_SIZE, epochs = EPOCHS, validation_data = (X_test, y_test), callbacks = [tensorboard, checkpoint]) 
    
    return back_testing_data  

In [13]:
#Back testing

def back_test(data, model_location, test_all = False, test_random_week = True):
    
    print("Attempting model back test")
    
    if not test_random_week and not test_all:
        print("No testing method specified")
        
    if test_random_week:
        
        model = tf.keras.models.load_model(model_location)

        index = random.randrange(len(data) - 390)
        
        print(f"Testing day: {index}")
        
        X = []
        y = []
        
        for i in range(390):
            X.append(data[index + i][0])
            y.append(data[index + i][1])
 
        X = np.array(X)
        y = np.array(y)

        prediction_probabilities = model.predict(X)
        predictions = np.argmax(prediction_probabilities, axis = 1)

        total_count = 0.0
        count0 = 0.0
        count1 = 0.0

        print("Total accuracy:")
        for i in range(len(y)):
            if(predictions[i] == y[i]):
                total_count += 1.0

        print(total_count / float(len(y)))
        
        indeces_0 = []

        for i in range(len(y)):
            if y[i] == 0:
                indeces_0.append(i)

        print("Accuracy of '0' predictions:")
        for i in indeces_0:
            if(predictions[i] == 0):
                count0 += 1.0

        print(count0 / float(len(indeces_0)))
        
        indeces_1 = []

        print("Accuracy of '1' predictions:")
        for i in range(len(y)):
            if y[i] == 1:
                indeces_1.append(i)

        for i in indeces_1:
            if(predictions[i] == 1):
                count1 += 1.0

        print(count1 / float(len(indeces_1)))
    
    print("number of true buys/sells")
    print(len(indeces_1), len(indeces_0))
    
    return count1 / float(len(indeces_1))

In [177]:
BATCH_SIZE = 128 #128 - original --- 32 = optimized (potentially)
LSIZE = 128 #128 - original --- 64 = optimized (porentially)
EPOCHS = 20
hm_units = 10
req = .002

back_test_data = do_dl('AAPL')


Number of total data points:
487236
Num buys/sells:
49979
49979
Number of training data points:
94961
Epoch 1/20



INFO:tensorflow:Assets written to: Records/models\RNN_Final- 1- 0.570.model\assets


INFO:tensorflow:Assets written to: Records/models\RNN_Final- 1- 0.570.model\assets


Epoch 2/20



INFO:tensorflow:Assets written to: Records/models\RNN_Final- 2- 0.639.model\assets


INFO:tensorflow:Assets written to: Records/models\RNN_Final- 2- 0.639.model\assets


Epoch 3/20



INFO:tensorflow:Assets written to: Records/models\RNN_Final- 3- 0.650.model\assets


INFO:tensorflow:Assets written to: Records/models\RNN_Final- 3- 0.650.model\assets


Epoch 4/20



INFO:tensorflow:Assets written to: Records/models\RNN_Final- 4- 0.517.model\assets


INFO:tensorflow:Assets written to: Records/models\RNN_Final- 4- 0.517.model\assets


Epoch 5/20



INFO:tensorflow:Assets written to: Records/models\RNN_Final- 5- 0.653.model\assets


INFO:tensorflow:Assets written to: Records/models\RNN_Final- 5- 0.653.model\assets


Epoch 6/20



INFO:tensorflow:Assets written to: Records/models\RNN_Final- 6- 0.641.model\assets


INFO:tensorflow:Assets written to: Records/models\RNN_Final- 6- 0.641.model\assets


Epoch 7/20



INFO:tensorflow:Assets written to: Records/models\RNN_Final- 7- 0.667.model\assets


INFO:tensorflow:Assets written to: Records/models\RNN_Final- 7- 0.667.model\assets


Epoch 8/20



INFO:tensorflow:Assets written to: Records/models\RNN_Final- 8- 0.569.model\assets


INFO:tensorflow:Assets written to: Records/models\RNN_Final- 8- 0.569.model\assets


Epoch 9/20



INFO:tensorflow:Assets written to: Records/models\RNN_Final- 9- 0.516.model\assets


INFO:tensorflow:Assets written to: Records/models\RNN_Final- 9- 0.516.model\assets


Epoch 10/20



INFO:tensorflow:Assets written to: Records/models\RNN_Final- 10- 0.630.model\assets


INFO:tensorflow:Assets written to: Records/models\RNN_Final- 10- 0.630.model\assets


Epoch 11/20



INFO:tensorflow:Assets written to: Records/models\RNN_Final- 11- 0.646.model\assets


INFO:tensorflow:Assets written to: Records/models\RNN_Final- 11- 0.646.model\assets


Epoch 12/20



INFO:tensorflow:Assets written to: Records/models\RNN_Final- 12- 0.657.model\assets


INFO:tensorflow:Assets written to: Records/models\RNN_Final- 12- 0.657.model\assets


Epoch 13/20



INFO:tensorflow:Assets written to: Records/models\RNN_Final- 13- 0.719.model\assets


INFO:tensorflow:Assets written to: Records/models\RNN_Final- 13- 0.719.model\assets


Epoch 14/20



INFO:tensorflow:Assets written to: Records/models\RNN_Final- 14- 0.698.model\assets


INFO:tensorflow:Assets written to: Records/models\RNN_Final- 14- 0.698.model\assets


Epoch 15/20



INFO:tensorflow:Assets written to: Records/models\RNN_Final- 15- 0.684.model\assets


INFO:tensorflow:Assets written to: Records/models\RNN_Final- 15- 0.684.model\assets


Epoch 16/20



INFO:tensorflow:Assets written to: Records/models\RNN_Final- 16- 0.695.model\assets


INFO:tensorflow:Assets written to: Records/models\RNN_Final- 16- 0.695.model\assets


Epoch 17/20



INFO:tensorflow:Assets written to: Records/models\RNN_Final- 17- 0.685.model\assets


INFO:tensorflow:Assets written to: Records/models\RNN_Final- 17- 0.685.model\assets


Epoch 18/20



INFO:tensorflow:Assets written to: Records/models\RNN_Final- 18- 0.694.model\assets


INFO:tensorflow:Assets written to: Records/models\RNN_Final- 18- 0.694.model\assets


Epoch 19/20



INFO:tensorflow:Assets written to: Records/models\RNN_Final- 19- 0.730.model\assets


INFO:tensorflow:Assets written to: Records/models\RNN_Final- 19- 0.730.model\assets


Epoch 20/20



INFO:tensorflow:Assets written to: Records/models\RNN_Final- 20- 0.746.model\assets


INFO:tensorflow:Assets written to: Records/models\RNN_Final- 20- 0.746.model\assets




In [178]:
accuracy = []

for i in range(40):
    #accuracy.append(back_test(back_test_data, 'Records//models//RNN_Final- 24- 0.711.model'))
    accuracy.append(back_test(back_test_data, 'Records//Successful Models//AAPL.model'))
    
print(sum(accuracy) / 40.0)

Attempting model back test
Testing day: 86208
Total accuracy:
0.41794871794871796
Accuracy of '0' predictions:
0.33974358974358976
Accuracy of '1' predictions:
0.7307692307692307
number of true buys/sells
78 312
Attempting model back test
Testing day: 67061
Total accuracy:
0.41025641025641024
Accuracy of '0' predictions:
0.38461538461538464
Accuracy of '1' predictions:
0.6410256410256411
number of true buys/sells
39 351
Attempting model back test
Testing day: 95339
Total accuracy:
0.6948717948717948
Accuracy of '0' predictions:
0.7376093294460642
Accuracy of '1' predictions:
0.3829787234042553
number of true buys/sells
47 343
Attempting model back test
Testing day: 61330
Total accuracy:
0.3923076923076923
Accuracy of '0' predictions:
0.30194805194805197
Accuracy of '1' predictions:
0.7317073170731707
number of true buys/sells
82 308
Attempting model back test
Testing day: 85916
Total accuracy:
0.49230769230769234
Accuracy of '0' predictions:
0.4720670391061452
Accuracy of '1' predictio

In [152]:
back_test(back_test_data, 'Records//models//RNN_Final- 20- 0.735.model')

Attempting model back test
Testing index (+ one week): 28552
Total accuracy:
0.6194871794871795
Accuracy of '0' predictions:
0.6577095329494562
Accuracy of '1' predictions:
0.46511627906976744
number of true buys/sells
387 1563


0.46511627906976744

In [None]:
def simulate_trading: