In [27]:
import bs4 as bs
from collections import Counter, deque
import datetime as dt
import yfinance as yf
import matplotlib.pyplot as plt
from matplotlib import style
import numpy as np
import os
import pandas as pd
from pandas_datareader import data as pdr
import pickle
import random
import requests
from sklearn import svm, neighbors
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.ensemble import VotingClassifier, RandomForestClassifier
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM, BatchNormalization
from tensorflow.keras.callbacks import TensorBoard, ModelCheckpoint
import time

style.use('ggplot')

#Variables
tickers = []
with open("Deep Learning//sp500tickers.pickle", "rb") as f:
            tickers = pickle.load(f)
        
with open("Deep Learning//AAPLMinute.pickle", "rb") as f:
            AAPL_Minute_df = pickle.load(f)
        
        
hm_days = 7
SEQ_LEN = 60
FUTURE_PERIOD_PREDICT = 1
BATCH_SIZE = 128
EPOCHS = 5
NAME = f'{SEQ_LEN}-SEQ-{FUTURE_PERIOD_PREDICT}-PRED-{int(time.time())}'
features = ('_Open', '_High', '_Low', '', '_Volume', '_ATR', '_RSI', '_RSI_Weights', '_MA_9', '_MA_50' , '_MA_100', '_MA_200')
CORRELATION_COEFFICIENT = 0.98

In [10]:
#AAPL_Minute_df = AAPL_Minute_df.drop(['t', 'vw', 'n'], axis = 1)
#AAPL_Minute_df

In [11]:
#Classifies if a percent change should be a 1 or a 0
#based on the 'requirement' variable
def classify(*args):
    cols = [c for c in args]
    requirement = 0.004
    for col in cols:
        if col >= requirement:
            return 1
        else:
            return 0

In [12]:
#Calculates the ATR of a specific stock at any given time
def ATR_Column(ticker):
    df = pd.read_csv('Deep Learning//stock_dfs//{}.csv'.format(ticker.replace('.', '-')))
    df.set_index('Date', inplace = True)
    
    df.drop(['Close', 'Open', 'Volume'], axis = 1, inplace = True)
    
    df.rename(columns = {'Adj Close': '{}'.format(ticker), 
                             'High': '{}_High'.format(ticker),
                             'Low': '{}_Low'.format(ticker)}, inplace = True)
    atrs = []
    true_ranges = []
    
    df[ticker] = (df[ticker].shift(1))
    
    true_ranges.append(df['{}_High'.format(ticker)].iloc[0] - df['{}_Low'.format(ticker)].iloc[0])
    for i in range(1, len(df)):
        ranges = []
        ranges.append(df['{}_High'.format(ticker)].iloc[i] - df['{}_Low'.format(ticker)].iloc[i])
        ranges.append(abs(df['{}_High'.format(ticker)].iloc[i] - df['{}'.format(ticker)].iloc[i]))
        ranges.append(abs(df['{}_Low'.format(ticker)].iloc[i] - df['{}'.format(ticker)].iloc[i]))
        true_ranges.append(max(ranges))
    
    TRS = {'Ranges': true_ranges, 'Date': df.index.values}
    true_ranges_DF = pd.DataFrame(data = TRS)
    true_ranges_DF.set_index('Date', inplace = True)
    
    #df = df.join(true_ranges_DF, how = 'outer')
    df['ATRS'] = true_ranges_DF['Ranges'].rolling(window = 14, min_periods = 0).sum().div(14)
    return df['ATRS']
       
#ATR_Column('AAPL')

In [13]:
def RSI_Column(ticker):
    df = pd.read_csv('Deep Learning//stock_dfs//{}.csv'.format(ticker.replace('.', '-')))
    df.set_index('Date', inplace = True)
    
    df = df['Adj Close'].to_frame()
    
    close_prices = df.values.tolist()
    
    changes_list = []
    changes_list.append(0)
    
    for i in range(1, len(close_prices)):
        changes_list.append(close_prices[i][0] - close_prices[i - 1][0])
       
    changes_dict = {'Changes': changes_list, 'Date': df.index.values}
    
    changes_column = pd.DataFrame(data = changes_dict)
    changes_column.set_index('Date', inplace = True)
    
    df['Changes'] = changes_column['Changes']
    
    #for i in range(1, len(close_prices)):
        
    gain_list = []
    
    for i in range(len(close_prices)):
        if(df['Changes'].iloc[i] >= 0):
            gain_list.append(df['Changes'].iloc[i])
        else:
            gain_list.append(0.0)
    
    gain_dict = {'Gain': gain_list, 'Date': df.index.values}
    gain_column = pd.DataFrame(data = gain_dict)
    gain_column.set_index('Date', inplace = True)
    
    df['Gains'] = gain_column['Gain']
            
    loss_list = []
    
    for i in range(len(close_prices)):
        if(df['Changes'].iloc[i] < 0):
            loss_list.append(abs(df['Changes'].iloc[i]))
        else:
            loss_list.append(0.0)
    
    loss_dict = {'Loss': loss_list, 'Date': df.index.values}
    loss_column = pd.DataFrame(data = loss_dict)
    loss_column.set_index('Date', inplace = True)
    
    df['Loss'] = loss_column['Loss']
    
    df['Avg_Gain'] = df['Gains'].rolling(window = 14, min_periods = 0).mean()
    df['Avg_Loss'] = df['Loss'].rolling(window = 14, min_periods = 0).mean()
    
    df['RS'] = df['Avg_Gain'] / (df['Avg_Loss'])
    df['RS'].iloc[0] = 0
    
    RSI_List = []
    
    for i in range(len(close_prices)):
        RSI_List.append(100 - (100 / (df['RS'].iloc[i] + 1)))
        
    RSI_Dict = {'RSI': RSI_List, 'Date': df.index.values}
    
    RSI_Column = pd.DataFrame(data = RSI_Dict)
    RSI_Column.set_index('Date', inplace = True)
    
    df['RSI'] = RSI_Column['RSI']
    
    weights_list = []
    
    for i in range(len(df)):
        if(df['RSI'].iloc[i] >= 70):
            weights_list.append(1)
        elif(df['RSI'].iloc[i] <= 30):
            weights_list.append(-1)
        else:
            weights_list.append(0)
            
    weights_dict = {'RSI_Weights': weights_list, 'Date': df.index.values}
    
    RSI_Weights_Column = pd.DataFrame(data = weights_dict)
    RSI_Weights_Column.set_index('Date', inplace = True)
    
    df['RSI_Weights'] = RSI_Weights_Column['RSI_Weights']
    
    
    
    return df['RSI'], df['RSI_Weights']
    
#RSI_Column('AAPL')

In [14]:
def moving_averages_column(ticker):
    df = pd.read_csv('Deep Learning//stock_dfs//{}.csv'.format(ticker.replace('.', '-')))
    df.set_index('Date', inplace = True)
    
    df = df['Adj Close'].to_frame()
    
    df['MA_9'] = df['Adj Close'].rolling(window = 9, min_periods = 0).mean()
    df['MA_50'] = df['Adj Close'].rolling(window = 50, min_periods = 0).mean()
    df['MA_100'] = df['Adj Close'].rolling(window = 100, min_periods = 0).mean()
    df['MA_200'] = df['Adj Close'].rolling(window = 200, min_periods = 0).mean()
    
    return df['MA_9'], df['MA_50'], df['MA_100'], df['MA_200']
    
#moving_averages_column('AAPL')

In [28]:
#Saves the list of tickers contained within the SP 500
def save_sp500_tickers():
    resp = requests.get('https://en.wikipedia.org/wiki/List_of_S%26P_500_companies')
    soup = bs.BeautifulSoup(resp.text, 'lxml')
    table = soup.find('table', {'class': 'wikitable sortable'})
    
    for row in table.findAll('tr') [1:]:
        ticker = row.findAll('td')[0].text.replace('.','-')
        ticker = ticker[:-1]
        tickers.append(ticker)
    with open("Deep Learning//sp500tickers.pickle", "wb") as f:
        pickle.dump(tickers, f)

#save_sp500_tickers()

In [29]:
#Retrieves the stock data from each ticker on the sp500 from yahoo
def get_data_from_yahoo(reload_sp500=False):
    if reload_sp500:
        tickers = save_sp500_tickers()
    else:
        with open("Deep Learning//sp500tickers.pickle", "rb") as f:
            tickers = pickle.load(f)
    if not os.path.exists('Deep Learning//stock_dfs'):
        os.makedirs('Deep Learning//stock_dfs')

    start = dt.datetime(2000, 1, 1)
    end = dt.datetime(2022, 5, 1)
    
    for ticker in tickers:
        path = 'Deep Learning//stock_dfs//{}'.format(ticker)
        print(ticker)
        if not os.path.exists(path):
            df = pdr.get_data_yahoo(ticker, start, end)
            df.reset_index(inplace=True)
            df.set_index("Date", inplace=True)
            df.to_csv('Deep Learning//stock_dfs//{}.csv'.format(ticker))
        else:
            print('Already have {}'.format(ticker))


get_data_from_yahoo()

MMM
AOS
ABT
ABBV
ABMD
ACN
ATVI
ADM
ADBE
ADP
AAP
AES
AFL
A
APD
AKAM
ALK
ALB
ARE
ALGN
ALLE
LNT
ALL
GOOGL
GOOG
MO
AMZN
AMCR
AMD
AEE
AAL
AEP
AXP
AIG
AMT
AWK
AMP
ABC
AME
AMGN
APH
ADI
ANSS
ANTM
AON
APA
AAPL
AMAT
APTV
ANET
AJG
AIZ
T
ATO
ADSK
AZO
AVB
AVY
BKR
BALL
BAC
BBWI
BAX
BDX
WRB
BRK-B
BBY
BIO
TECH
BIIB
BLK
BK
BA
BKNG
BWA
BXP
BSX
BMY
AVGO
BR
BRO
BF-B
CHRW
CDNS
CZR
CPT
CPB
COF
CAH
KMX
CCL
CARR
CTLT
CAT
CBOE
CBRE
CDW
CE
CNC
CNP
CDAY
CF
CRL
SCHW
CHTR
CVX
CMG
CB
CHD
CI
CINF
CTAS
CSCO
C
CFG
CTXS
CLX
CME
CMS
KO
CTSH
CL
CMCSA
CMA
CAG
COP
ED
STZ
CEG
COO
CPRT
GLW
CTVA
COST
CTRA
CCI
CSX
CMI
CVS
DHI
DHR
DRI
DVA
DE
DAL
XRAY
DVN
DXCM
FANG
DLR
DFS
DISH


RemoteDataError: Unable to read URL: https://finance.yahoo.com/quote/DISH/history?period1=946728000&period2=1651489199&interval=1d&frequency=1d&filter=history
Response Text:
b'<html><meta charset=\'utf-8\'><script>if(window!=window.top){document.write(\'<p>Content is currently unavailable.</p><img src="//geo.yahoo.com/p?s=1197757039&t=\'+new Date().getTime()+\'&_R=\'+encodeURIComponent(document.referrer)+\'&err=404&err_url=\'+\'https%3A%2F%2Ffinance.yahoo.com%2Fquote%2FDISH%2Fhistory%3Fperiod1%3D946728000%26period2%3D1651489199%26interval%3D1d%26frequency%3D1d%26filter%3Dhistory\'+\'" width="0px" height="0px"/>\');}else{window.location.replace(\'https://www.yahoo.com/?err=404&err_url=https%3A%2F%2Ffinance.yahoo.com%2Fquote%2FDISH%2Fhistory%3Fperiod1%3D946728000%26period2%3D1651489199%26interval%3D1d%26frequency%3D1d%26filter%3Dhistory\');}</script><noscript><META http-equiv="refresh" content="0;URL=\'https://www.yahoo.com/?err=404&err_url=https%3A%2F%2Ffinance.yahoo.com%2Fquote%2FDISH%2Fhistory%3Fperiod1%3D946728000%26period2%3D1651489199%26interval%3D1d%26frequency%3D1d%26filter%3Dhistory\'"></noscript></html>'

In [None]:
#Creates a single dataframe containing all stock data for 
#Each stock on the sp 500

#features = ('_Open', '_High', '_Low', '', '_Volume', '_ATR', '_RSI', '_RSI_Weights', '_MA_9', '_MA_50' , '_MA_100', '_MA_200')

def create_joint_df(target_ticker):
    main_df = pd.DataFrame()
    
    for count, ticker in enumerate(tickers):
        df = pd.read_csv('Deep Learning//stock_dfs//{}.csv'.format(ticker.replace('.', '-')))
        df.set_index('Date', inplace = True)
        df.rename(columns = {'Adj Close': '{}'.format(ticker), 
                             'Open': '{}_Open'.format(ticker),
                             'Volume': '{}_Volume'.format(ticker),
                             'High': '{}_High'.format(ticker),
                             'Low': '{}_Low'.format(ticker)}, inplace = True)
        df.drop(['Close'], axis = 1, inplace = True)
   
    df[f'{target_ticker}_ATR'] = ATR_Column(target_ticker)
    df[f'{target_ticker}_RSI'], df[f'{target_ticker}_RSI_Weights'] = RSI_Column(target_ticker)
    df[f'{target_ticker}_MA_9'], df[f'{target_ticker}_MA_50'], df[f'{target_ticker}_MA_100'], df[f'{target_ticker}_MA_200'] = moving_averages_column(target_ticker)


    #Extra optimization if needed
    '''df[f'{ticker}'] = df[f'{ticker}'].astype(np.float32)
    df[f'{ticker}_Open'] = df[f'{ticker}_Open'].astype(np.float32)
    df[f'{ticker}_High'] = df[f'{ticker}_High'].astype(np.float32)
    df[f'{ticker}_Low'] = df[f'{ticker}_Low'].astype(np.float32)
    df[f'{ticker}_Volume'] = df[f'{ticker}_Volume'].astype(np.int32)
    df[f'{ticker}_ATR'] = df[f'{ticker}_ATR'].astype(np.float16)
    df[f'{ticker}_RSI'] = df[f'{ticker}_RSI'].astype(np.float16)
    df[f'{ticker}_RSI_Weights'] = df[f'{ticker}_RSI_Weights'].astype(np.int8)
    df[f'{ticker}_MA_9'] = df[f'{ticker}_MA_9'].astype(np.float16)
    df[f'{ticker}_MA_50'] = df[f'{ticker}_MA_50'].astype(np.float16)
    df[f'{ticker}_MA_100'] = df[f'{ticker}_MA_100'].astype(np.float16)
    df[f'{ticker}_MA_200'] = df[f'{ticker}_MA_200'].astype(np.float16)'''

        
        if main_df.empty:
            main_df = df
        else:
            main_df = main_df.join(df, how = 'outer')
        if count % 30 == 0:
            print(count)
   
    main_df.to_csv('Deep Learning//sp500_joined_closes.csv')
                         
create_joint_df()

In [None]:
def filter_high_correlation(df, ticker, corr_level):
    
    df_corr = df.corr()
    df_corr = df_corr[f'{ticker}']
    
    for stock_ticker in tickers:
        
        for i in range(len(features)):
            if(i == 3):
                continue
            df_corr = df_corr.drop(f'{stock_ticker}{features[i]}')
        
    df_corr = df_corr.drop(f'{ticker}')

    correlated_tickers = []
    
    correlated_data = pd.DataFrame()
    
    for i in range(len(df_corr)):
        if abs(df_corr.iloc[i]) >= corr_level:
            correlated_tickers.append(df_corr.index.tolist()[i])
    
    for stock_ticker in correlated_tickers:
        if correlated_data.empty:
            correlated_data = df[f'{stock_ticker}{features[0]}'].to_frame()
            for feature in range(1, len(features)):
                correlated_data = correlated_data.join(df[f'{stock_ticker}{features[feature]}'], how = 'outer')
        else:
            for feature in range(len(features)):
                correlated_data = correlated_data.join(df[f'{stock_ticker}{features[feature]}'], how = 'outer')
                
    for feature in features:
        correlated_data[f'{ticker}{feature}'] = df[f'{ticker}{feature}']
    
    correlated_data.to_csv(f'Deep Learning//Correlated_Data//{ticker}-Correlated_stock_data.csv')
        
    #return correlated_data

data = pd.read_csv('Deep Learning//sp500_joined_closes.csv', index_col = 0)
data.fillna(0, inplace = True)
    
data.fillna(0, inplace = True)

filter_high_correlation(data, 'AAPL', CORRELATION_COEFFICIENT)


In [None]:
#Removes NA values and adds a shifted 'future' column
#Calls the function to filter out the low correlations
def process_data_for_labels(ticker):
    #df = pd.read_csv('Deep Learning//sp500_joined_closes.csv', index_col = 0)
    #df.fillna(0, inplace = True)
    
    #df.fillna(0, inplace = True)
    
    #df = filter_high_correlation(df, ticker, CORRELATION_COEFFICIENT)
    
    df = pd.read_csv(f'Deep Learning//Correlated_Data//{ticker}-Correlated_stock_data.csv')
    df.set_index('Date', inplace = True)
    
    df['{}_future'.format(ticker)] = ((df[ticker].shift(-hm_days) - df[ticker]) / df[ticker]) 
    df['{}_target'.format(ticker)] = list(map(classify,  df['{}_future'.format(ticker)]))

    
    print(df.info())
    return df

process_data_for_labels('AAPL')