In [2]:
from collections import Counter, deque
import datetime
import matplotlib.pyplot as plt
from matplotlib import style
import numpy as np
import os
import pandas as pd
from pandas_datareader import data as pdr
import pickle
import random
import requests
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM, BatchNormalization
from tensorflow.keras.callbacks import TensorBoard, ModelCheckpoint
import time

style.use('ggplot')

#Loads the ticker list from disk (the file name suggests these are the S&P 500 tickers).
#NOTE(review): pickle.load can execute arbitrary code while loading, so this file must come from a trusted source.
with open("sp500tickers_unrevised.pickle", "rb") as f:
    tickers = pickle.load(f)

#Tickers that are excluded from the data collection
dropped_tickers = ('BALL', 'BRK-B', 'BF-B', 'KMX', 'WBD', 'CEG', 'PARA', 'WTW', 'META', 'OGN', 'BBWI')

#Removes the first occurrence of every dropped ticker, skipping the ones that are not in the list
#(the original del/index form already required 'tickers' to be a list)
for dropped_ticker in dropped_tickers:
    if dropped_ticker in tickers:
        tickers.remove(dropped_ticker)

#Configuration constants. NAME embeds SEQ_LEN, hm_units and the current Unix time so that every run gets a unique label.
hm_units = 10
req = 0.002
SEQ_LEN = 60
BATCH_SIZE = 132
EPOCHS = 30
NAME = f'{SEQ_LEN}-SEQ-{hm_units}-PRED-{int(time.time())}'
CORRELATION_COEFFICIENT = 0.92

#API key that is used to access data from the source (api.polygon.io).
#The key is read from the POLYGON_API_KEY environment variable so that it does not have to live in the notebook.
#The literal below is only kept as a fallback so existing runs keep working.
#TODO: rotate this key and remove the hardcoded fallback, since it has been stored in plain text.
api = os.environ.get('POLYGON_API_KEY', 'gDvQNVWDC2mhOlN1Z9if6JEyDM08CpeC')

In [2]:
#This function gets the daily and/or minute data for a specific ticker and returns it as pandas DataFrame(s);
#the caller is responsible for saving the result to a CSV to be accessed later.
#The desired ticker must be passed as an argument, as well as the start/end date of the data (string).
#The 'ignore_after_hours' parameter (boolean) dictates whether the minute dataset only contains active-hours data
#or also includes pre/post market. If the pre/post market data is also taken, the user must account for
#a large number of zero-volume periods, since the stocks are traded less frequently during these times.
#The 'include_minute' and 'include_daily' parameters are booleans which dictate which type of data will be acquired:
#daily data or minute data. Both default to True.

#Dates must be in the format yyyy-mm-dd
def _fetch_polygon_aggregates(url, ticker, keep_row_index = False):
    """Download one page of aggregate candles and return it as a formatted DataFrame.

    Parameters:
        url: fully built aggregates request url.
        ticker: symbol used to prefix the renamed price/volume columns.
        keep_row_index: when True, the positional row number is kept as an 'index' column
            (the daily frame has always been returned with that column).

    Returns:
        DataFrame with '{ticker}_Close/_Open/_High/_Low/_Volume' columns and a 'Date' column.

    Raises:
        KeyError: if the response carries no 'results' field.
    """
    payload = requests.get(url).json()

    #Same exception type as a plain payload['results'] lookup, but the message shows what the server answered
    if 'results' not in payload:
        raise KeyError(f"Response for {ticker} has no 'results' field: {payload!r}")

    candles = pd.DataFrame(payload['results'])

    if keep_row_index:
        candles = candles.reset_index()

    #'vw' and 'n' are not used anywhere; errors = 'ignore' tolerates responses that leave them out
    candles = candles.drop(columns = ['vw', 'n'], errors = 'ignore')

    #'t' is an epoch timestamp in milliseconds, so the resulting 'Date' values are timezone-naive UTC
    candles['t'] = pd.to_datetime(candles['t'], unit = 'ms')

    return candles.rename(columns = {'c': f'{ticker}_Close',
                                     'o': f'{ticker}_Open',
                                     'h': f'{ticker}_High',
                                     'l': f'{ticker}_Low',
                                     'v': f'{ticker}_Volume',
                                     't': 'Date'})


def _regular_session_mask(utc_dates):
    """Return a boolean Series that is True for candles inside the 09:30-16:00 New York session.

    The comparison is done in New York local time. A fixed 13:30-20:00 UTC window is only
    correct while daylight-saving time is in effect; during standard time the session is
    14:30-21:00 UTC, so a fixed window keeps an hour of pre-market and drops the final hour.
    """
    new_york = utc_dates.dt.tz_localize('UTC').dt.tz_convert('America/New_York')
    minute_of_day = new_york.dt.hour * 60 + new_york.dt.minute

    #09:30 is included and 16:00 is excluded, mirroring the original 13:30 <= t < 20:00 rule
    return (minute_of_day >= 9 * 60 + 30) & (minute_of_day < 16 * 60)


def get_ticker_data(ticker, start_date, end_date, ignore_after_hours = True, include_minute = True, include_daily = True):
    """Download minute and/or daily candles for one ticker from api.polygon.io.

    Parameters:
        ticker: symbol to download.
        start_date: first day of the window, as a 'yyyy-mm-dd' string.
        end_date: last day of the window, as a 'yyyy-mm-dd' string.
        ignore_after_hours: when True, minute candles outside the regular 09:30-16:00
            New York session are removed. Has no effect on the daily data.
        include_minute: when True, the minute data is downloaded.
        include_daily: when True, the daily data is downloaded.

    Returns:
        (minute_df, daily_df) when both flags are True, a single DataFrame when only one
        flag is True, and None (after printing a message) when neither is True.
        The minute frame has the columns '{ticker}_Volume', '{ticker}_High', '{ticker}_Low',
        '{ticker}_Open', '{ticker}_Close', 'Date' with duplicate rows removed. The daily frame
        additionally keeps the positional 'index' column.

    Raises:
        KeyError: if a response carries no 'results' field.
        ValueError: if a date string cannot be parsed as yyyy-mm-dd.
    """
    print(f'Getting {ticker} data')

    #'asc' (ascending) puts the oldest candles first. DO NOT CHANGE: the chunking below relies on it
    sort = 'asc'

    #Distance between the start dates of two consecutive minute-data requests
    increment = datetime.timedelta(days = 60)

    #Only the first ten characters are parsed, the same characters the original slicing looked at
    date_tracker = datetime.datetime.strptime(start_date[:10], '%Y-%m-%d')
    end_date_time = datetime.datetime.strptime(end_date[:10], '%Y-%m-%d')

    def aggregates_url(timespan, from_date):
        #Builds the request url for one-unit candles of the given timespan from 'from_date' until 'end_date'
        return (f'https://api.polygon.io/v2/aggs/ticker/{ticker}/range/1/{timespan}/{from_date}/{end_date}'
                f'?adjusted=true&sort={sort}&limit=50000&apiKey={api}')

    if include_minute:

        #Each request is limited to 50000 candles, so the window is walked in 60 day steps and the chunks are
        #joined together. Consecutive chunks can overlap; the overlap is removed by drop_duplicates below.
        #NOTE(review): assumes 60 calendar days never hold more than 50000 minute candles - TODO confirm
        chunks = [_fetch_polygon_aggregates(aggregates_url('minute', date_tracker.date().isoformat()), ticker)]

        #The last request starts less than 'increment' days before the end date, so that no chunk
        #is requested with a start date that lies beyond the end date
        while date_tracker < end_date_time - increment:
            date_tracker += increment
            chunks.append(_fetch_polygon_aggregates(aggregates_url('minute', date_tracker.date().isoformat()), ticker))

        #concat aligns the chunks by column name, so the column order of a response does not matter
        minute_candles = pd.concat(chunks, ignore_index = True)

        #Removes the pre/post market candles with a single vectorized filter
        if ignore_after_hours:
            minute_candles = minute_candles[_regular_session_mask(minute_candles['Date'])]

        #Fixed column order and a fresh 0..n-1 index before the duplicates are dropped
        minute_columns = [f'{ticker}_Volume', f'{ticker}_High', f'{ticker}_Low',
                          f'{ticker}_Open', f'{ticker}_Close', 'Date']
        minute_data_df = minute_candles[minute_columns].reset_index(drop = True).drop_duplicates()

    if include_daily:
        daily_data_df = _fetch_polygon_aggregates(aggregates_url('day', start_date), ticker, keep_row_index = True)

    #Returns the proper DataFrames depending on which (or both) of the 'include_minute'/'include_daily' parameters are True
    if include_minute and include_daily:
        return minute_data_df, daily_data_df
    elif include_minute:
        return minute_data_df
    elif include_daily:
        return daily_data_df
    #If neither parameter is True, prints an error statement (and implicitly returns None)
    else:
        print("No data type specified")


In [3]:
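#Calls the 'get_ticker_data' function for every ticker in the S&P 500 list from index 191 onward
#(excluding the dropped tickers) and saves the minute data, including after hours, to a CSV per ticker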
def acquire_all_data(start, end, start_index = 191):
    """Download the minute data (including after hours) for the tickers and save one CSV per ticker.

    Every ticker from position 'start_index' of the global 'tickers' list onward is requested
    through 'get_ticker_data' with include_daily = False and ignore_after_hours = False, and the
    result is written to 'MINUTE_STOCK_DATA_AFTERHOURS//{ticker}.csv'. When a ticker fails, the
    function waits thirteen seconds and tries once more; a second failure is reported and the
    ticker is skipped.

    Parameters:
        start: first day of the window, as a 'yyyy-mm-dd' string.
        end: last day of the window, as a 'yyyy-mm-dd' string.
        start_index: position in 'tickers' to start from. Defaults to 191, the offset that was
            previously hardcoded; pass 0 to process the whole list.
    """
    for symbol in tickers[start_index:]:

        #First attempt plus a single retry
        for attempt in range(2):
            try:
                mdfaf = get_ticker_data(symbol, start, end, include_daily = False, ignore_after_hours = False)
                mdfaf.to_csv(f'MINUTE_STOCK_DATA_AFTERHOURS//{symbol}.csv')
                break
            #'Exception' instead of a bare except, so that interrupting the kernel (KeyboardInterrupt)
            #stops the download instead of being treated as a failed request
            except Exception as error:
                if attempt == 0:
                    print("Too may calls, delaying for thirteen seconds and retrying")
                    time.sleep(13)
                else:
                    #Shows the underlying error, since not every failure is caused by the rate limit
                    print(f"Unable to acquire ticker {symbol} after hours data: {error!r}")
    print('Complete')

#Starts the download for the window 2017-07-08 through 2022-07-24 as soon as this cell is executed.
#This issues network requests for every processed ticker, so the cell is slow to run.
acquire_all_data(start = '2017-07-08', end = '2022-07-24')

Getting FRC data
Getting FE data
Getting FIS data
Getting FISV data
Getting FLT data
Getting FMC data
Getting F data
Getting FTNT data
Getting FTV data
Getting FBHS data
Getting FOXA data
Getting FOX data
Getting BEN data
Getting FCX data
Getting GRMN data
Getting IT data
Getting GE data
Getting GNRC data
Getting GD data
Getting GIS data
Getting GM data
Getting GPC data
Getting GILD data
Getting GL data
Getting GPN data
Getting GS data
Getting HAL data
Getting HIG data
Getting HAS data
Getting HCA data
Getting PEAK data
Getting HSIC data
Getting HSY data
Getting HES data
Getting HPE data
Getting HLT data
Getting HOLX data
Getting HD data
Getting HON data
Getting HRL data
Getting HST data
Getting HWM data
Getting HPQ data
Getting HUM data
Getting HII data
Getting HBAN data
Getting IBM data
Getting IEX data
Getting IDXX data
Getting ITW data
Getting ILMN data
Getting INCY data
Getting IR data
Getting INTC data
Getting ICE data
Getting IP data
Getting IPG data
Getting IFF data
Getting INT