In [1]:
#Imports
import bs4 as bs
from collections import Counter, deque
import datetime as dt
import yfinance as yf
import matplotlib.pyplot as plt
from matplotlib import style
import numpy as np
import os
import pandas as pd
from pandas_datareader import data as pdr
import pickle
import random
import requests
from sklearn import svm, neighbors
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.ensemble import VotingClassifier, RandomForestClassifier
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM, BatchNormalization
from tensorflow.keras.callbacks import TensorBoard, ModelCheckpoint
import time

style.use('ggplot')

#Variables
tickers = []

with open("Deep Learning//sp500tickers.pickle", "rb") as f:
            tickers = pickle.load(f)
        
with open("Deep Learning//AAPLMinute.pickle", "rb") as f:
            AAPL_Minute_df = pickle.load(f)
        
        
hm_days = 7
SEQ_LEN = 60
FUTURE_PERIOD_PREDICT = 1
BATCH_SIZE = 128
EPOCHS = 5
NAME = f'{SEQ_LEN}-SEQ-{FUTURE_PERIOD_PREDICT}-PRED-{int(time.time())}'
features = ('_Open', '_High', '_Low', '', '_Volume', '_ATR', '_RSI', '_RSI_Weights', '_MA_9', '_MA_50' , '_MA_100', '_MA_200')
CORRELATION_COEFFICIENT = 0.98

In [3]:
#Classifies if a percent change should be a 1 or a 0
#based on the 'requirement' variable
def classify(*args):
    cols = [c for c in args]
    requirement = 0.004
    for col in cols:
        if col >= requirement:
            return 1
        else:
            return 0

In [4]:
#Calculates the ATR of a specific stock at any given time
def ATR_Column(ticker):
    df = pd.read_csv('Deep Learning//stock_dfs//{}.csv'.format(ticker.replace('.', '-')))
    df.set_index('Date', inplace = True)
    
    df.drop(['Close', 'Open', 'Volume'], axis = 1, inplace = True)
    
    df.rename(columns = {'Adj Close': '{}'.format(ticker), 
                             'High': '{}_High'.format(ticker),
                             'Low': '{}_Low'.format(ticker)}, inplace = True)
    atrs = []
    true_ranges = []
    
    df[ticker] = (df[ticker].shift(1))
    
    true_ranges.append(df['{}_High'.format(ticker)].iloc[0] - df['{}_Low'.format(ticker)].iloc[0])
    for i in range(1, len(df)):
        ranges = []
        ranges.append(df['{}_High'.format(ticker)].iloc[i] - df['{}_Low'.format(ticker)].iloc[i])
        ranges.append(abs(df['{}_High'.format(ticker)].iloc[i] - df['{}'.format(ticker)].iloc[i]))
        ranges.append(abs(df['{}_Low'.format(ticker)].iloc[i] - df['{}'.format(ticker)].iloc[i]))
        true_ranges.append(max(ranges))
    
    TRS = {'Ranges': true_ranges, 'Date': df.index.values}
    true_ranges_DF = pd.DataFrame(data = TRS)
    true_ranges_DF.set_index('Date', inplace = True)
    
    #df = df.join(true_ranges_DF, how = 'outer')
    df['ATRS'] = true_ranges_DF['Ranges'].rolling(window = 14, min_periods = 0).sum().div(14)
    return df['ATRS']
       
#ATR_Column('AAPL')

In [5]:
def RSI_Column(ticker):
    df = pd.read_csv('Deep Learning//stock_dfs//{}.csv'.format(ticker.replace('.', '-')))
    df.set_index('Date', inplace = True)
    
    df = df['Adj Close'].to_frame()
    
    close_prices = df.values.tolist()
    
    changes_list = []
    changes_list.append(0)
    
    for i in range(1, len(close_prices)):
        changes_list.append(close_prices[i][0] - close_prices[i - 1][0])
       
    changes_dict = {'Changes': changes_list, 'Date': df.index.values}
    
    changes_column = pd.DataFrame(data = changes_dict)
    changes_column.set_index('Date', inplace = True)
    
    df['Changes'] = changes_column['Changes']
    
    #for i in range(1, len(close_prices)):
        
    gain_list = []
    
    for i in range(len(close_prices)):
        if(df['Changes'].iloc[i] >= 0):
            gain_list.append(df['Changes'].iloc[i])
        else:
            gain_list.append(0.0)
    
    gain_dict = {'Gain': gain_list, 'Date': df.index.values}
    gain_column = pd.DataFrame(data = gain_dict)
    gain_column.set_index('Date', inplace = True)
    
    df['Gains'] = gain_column['Gain']
            
    loss_list = []
    
    for i in range(len(close_prices)):
        if(df['Changes'].iloc[i] < 0):
            loss_list.append(abs(df['Changes'].iloc[i]))
        else:
            loss_list.append(0.0)
    
    loss_dict = {'Loss': loss_list, 'Date': df.index.values}
    loss_column = pd.DataFrame(data = loss_dict)
    loss_column.set_index('Date', inplace = True)
    
    df['Loss'] = loss_column['Loss']
    
    df['Avg_Gain'] = df['Gains'].rolling(window = 14, min_periods = 0).mean()
    df['Avg_Loss'] = df['Loss'].rolling(window = 14, min_periods = 0).mean()
    
    df['RS'] = df['Avg_Gain'] / (df['Avg_Loss'])
    df['RS'].iloc[0] = 0
    
    RSI_List = []
    
    for i in range(len(close_prices)):
        RSI_List.append(100 - (100 / (df['RS'].iloc[i] + 1)))
        
    RSI_Dict = {'RSI': RSI_List, 'Date': df.index.values}
    
    RSI_Column = pd.DataFrame(data = RSI_Dict)
    RSI_Column.set_index('Date', inplace = True)
    
    df['RSI'] = RSI_Column['RSI']
    
    weights_list = []
    
    for i in range(len(df)):
        if(df['RSI'].iloc[i] >= 70):
            weights_list.append(1)
        elif(df['RSI'].iloc[i] <= 30):
            weights_list.append(-1)
        else:
            weights_list.append(0)
            
    weights_dict = {'RSI_Weights': weights_list, 'Date': df.index.values}
    
    RSI_Weights_Column = pd.DataFrame(data = weights_dict)
    RSI_Weights_Column.set_index('Date', inplace = True)
    
    df['RSI_Weights'] = RSI_Weights_Column['RSI_Weights']
    
    
    
    return df['RSI'], df['RSI_Weights']
    
#RSI_Column('AAPL')

In [6]:
def moving_averages_column(ticker):
    df = pd.read_csv('Deep Learning//stock_dfs//{}.csv'.format(ticker.replace('.', '-')))
    df.set_index('Date', inplace = True)
    
    df = df['Adj Close'].to_frame()
    
    df['MA_9'] = df['Adj Close'].rolling(window = 9, min_periods = 0).mean()
    df['MA_50'] = df['Adj Close'].rolling(window = 50, min_periods = 0).mean()
    df['MA_100'] = df['Adj Close'].rolling(window = 100, min_periods = 0).mean()
    df['MA_200'] = df['Adj Close'].rolling(window = 200, min_periods = 0).mean()
    
    return df['MA_9'], df['MA_50'], df['MA_100'], df['MA_200']
    
#moving_averages_column('AAPL')

In [7]:
#Saves the list of tickers contained within the SP 500
def save_sp500_tickers():
    resp = requests.get('https://en.wikipedia.org/wiki/List_of_S%26P_500_companies')
    soup = bs.BeautifulSoup(resp.text, 'lxml')
    table = soup.find('table', {'class': 'wikitable sortable'})
    
    for row in table.findAll('tr') [1:]:
        ticker = row.findAll('td')[0].text.replace('.','-')
        ticker = ticker[:-1]
        tickers.append(ticker)
    with open("Deep Learning//sp500tickers.pickle", "wb") as f:
        pickle.dump(tickers, f)

#save_sp500_tickers()

In [8]:
#Retrieves the stock data from each ticker on the sp500 from yahoo
def get_data_from_yahoo(reload_sp500=False):
    if reload_sp500:
        tickers = save_sp500_tickers()
    else:
        with open("Deep Learning//sp500tickers.pickle", "rb") as f:
            tickers = pickle.load(f)
    if not os.path.exists('Deep Learning//stock_dfs'):
        os.makedirs('Deep Learning//stock_dfs')

    start = dt.datetime(2000, 1, 1)
    end = dt.datetime(2022, 5, 1)
    
    for ticker in tickers:
        path = 'Deep Learning//stock_dfs//{}'.format(ticker)
        print(ticker)
        if not os.path.exists(path):
            df = pdr.get_data_yahoo(ticker, start, end)
            df.reset_index(inplace=True)
            df.set_index("Date", inplace=True)
            df.to_csv('Deep Learning//stock_dfs//{}.csv'.format(ticker))
        else:
            print('Already have {}'.format(ticker))


#get_data_from_yahoo()

In [9]:
#Creates a single dataframe containing all stock data for 
#Each stock on the sp 500

#features = ('_Open', '_High', '_Low', '', '_Volume', '_ATR', '_RSI', '_RSI_Weights', '_MA_9', '_MA_50' , '_MA_100', '_MA_200')

def create_joint_df(target_ticker):
    main_df = pd.DataFrame()
    
    for count, ticker in enumerate(tickers):
        df = pd.read_csv('Deep Learning//stock_dfs//{}.csv'.format(ticker.replace('.', '-')))
        df.set_index('Date', inplace = True)
        df.rename(columns = {'Adj Close': '{}'.format(ticker), 
                             'Open': '{}_Open'.format(ticker),
                             'Volume': '{}_Volume'.format(ticker),
                             'High': '{}_High'.format(ticker),
                             'Low': '{}_Low'.format(ticker)}, inplace = True)
        df.drop(['Close'], axis = 1, inplace = True)
    
    df[f'{target_ticker}_ATR'] = ATR_Column(target_ticker)
    df[f'{target_ticker}_RSI'], df[f'{target_ticker}_RSI_Weights'] = RSI_Column(target_ticker)
    df[f'{target_ticker}_MA_9'], df[f'{target_ticker}_MA_50'], df[f'{target_ticker}_MA_100'], df[f'{target_ticker}_MA_200'] = moving_averages_column(target_ticker)

        
        #Extra optimization if needed
        '''df[f'{ticker}'] = df[f'{ticker}'].astype(np.float32)
        df[f'{ticker}_Open'] = df[f'{ticker}_Open'].astype(np.float32)
        df[f'{ticker}_High'] = df[f'{ticker}_High'].astype(np.float32)
        df[f'{ticker}_Low'] = df[f'{ticker}_Low'].astype(np.float32)
        df[f'{ticker}_Volume'] = df[f'{ticker}_Volume'].astype(np.int32)
        df[f'{ticker}_ATR'] = df[f'{ticker}_ATR'].astype(np.float16)
        df[f'{ticker}_RSI'] = df[f'{ticker}_RSI'].astype(np.float16)
        df[f'{ticker}_RSI_Weights'] = df[f'{ticker}_RSI_Weights'].astype(np.int8)
        df[f'{ticker}_MA_9'] = df[f'{ticker}_MA_9'].astype(np.float16)
        df[f'{ticker}_MA_50'] = df[f'{ticker}_MA_50'].astype(np.float16)
        df[f'{ticker}_MA_100'] = df[f'{ticker}_MA_100'].astype(np.float16)
        df[f'{ticker}_MA_200'] = df[f'{ticker}_MA_200'].astype(np.float16)'''
        
        
        if main_df.empty:
            main_df = df
        else:
            main_df = main_df.join(df, how = 'outer')
        if count % 30 == 0:
            print(count)
   
    main_df.to_csv('Deep Learning//sp500_joined_closes.csv')
                         
create_joint_df()

In [10]:
def filter_high_correlation(df, ticker, corr_level):
    
    df_corr = df.corr()
    df_corr = df_corr[f'{ticker}']
    
    for stock_ticker in tickers:
        
        for i in range(len(features)):
            if(i == 3):
                continue
            df_corr = df_corr.drop(f'{stock_ticker}{features[i]}')
        
    df_corr = df_corr.drop(f'{ticker}')

    correlated_tickers = []
    
    correlated_data = pd.DataFrame()
    
    for i in range(len(df_corr)):
        if abs(df_corr.iloc[i]) >= corr_level:
            correlated_tickers.append(df_corr.index.tolist()[i])
    
    for stock_ticker in correlated_tickers:
        if correlated_data.empty:
            correlated_data = df[f'{stock_ticker}{features[0]}'].to_frame()
            for feature in range(1, len(features)):
                correlated_data = correlated_data.join(df[f'{stock_ticker}{features[feature]}'], how = 'outer')
        else:
            for feature in range(len(features)):
                correlated_data = correlated_data.join(df[f'{stock_ticker}{features[feature]}'], how = 'outer')
                
    for feature in features:
        correlated_data[f'{ticker}{feature}'] = df[f'{ticker}{feature}']
    
    correlated_data.to_csv(f'Deep Learning//Correlated_Data//{ticker}-Correlated_stock_data.csv')
        
    #return correlated_data

data = pd.read_csv('Deep Learning//sp500_joined_closes.csv', index_col = 0)
data.fillna(0, inplace = True)
    
data.fillna(0, inplace = True)

filter_high_correlation(data, 'AAPL', CORRELATION_COEFFICIENT)


  correlated_data[f'{ticker}{feature}'] = df[f'{ticker}{feature}']
  correlated_data[f'{ticker}{feature}'] = df[f'{ticker}{feature}']
  correlated_data[f'{ticker}{feature}'] = df[f'{ticker}{feature}']
  correlated_data[f'{ticker}{feature}'] = df[f'{ticker}{feature}']
  correlated_data[f'{ticker}{feature}'] = df[f'{ticker}{feature}']
  correlated_data[f'{ticker}{feature}'] = df[f'{ticker}{feature}']
  correlated_data[f'{ticker}{feature}'] = df[f'{ticker}{feature}']
  correlated_data[f'{ticker}{feature}'] = df[f'{ticker}{feature}']
  correlated_data[f'{ticker}{feature}'] = df[f'{ticker}{feature}']
  correlated_data[f'{ticker}{feature}'] = df[f'{ticker}{feature}']
  correlated_data[f'{ticker}{feature}'] = df[f'{ticker}{feature}']
  correlated_data[f'{ticker}{feature}'] = df[f'{ticker}{feature}']


In [12]:
#Removes NA values and adds a shifted 'future' column
#Calls the function to filter out the low correlations
def process_data_for_labels(ticker):
    #df = pd.read_csv('Deep Learning//sp500_joined_closes.csv', index_col = 0)
    #df.fillna(0, inplace = True)
    
    #df.fillna(0, inplace = True)
    
    #df = filter_high_correlation(df, ticker, CORRELATION_COEFFICIENT)
    
    df = pd.read_csv(f'Deep Learning//Correlated_Data//{ticker}-Correlated_stock_data.csv')
    df.set_index('Date', inplace = True)
    
    df['{}_future'.format(ticker)] = ((df[ticker].shift(-hm_days) - df[ticker]) / df[ticker]) 
    df['{}_target'.format(ticker)] = list(map(classify,  df['{}_future'.format(ticker)]))

    
    print(df.info())
    return df

process_data_for_labels('AAPL')

<class 'pandas.core.frame.DataFrame'>
Index: 5618 entries, 2000-01-03 to 2022-04-29
Columns: 194 entries, DHR_Open to AAPL_target
dtypes: float64(167), int64(27)
memory usage: 8.4+ MB
None


Unnamed: 0_level_0,DHR_Open,DHR_High,DHR_Low,DHR,DHR_Volume,DHR_ATR,DHR_RSI,DHR_RSI_Weights,DHR_MA_9,DHR_MA_50,...,AAPL_Volume,AAPL_ATR,AAPL_RSI,AAPL_RSI_Weights,AAPL_MA_9,AAPL_MA_50,AAPL_MA_100,AAPL_MA_200,AAPL_future,AAPL_target
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2000-01-03,9.097801,9.168878,8.718726,5.626462,1474642,0.03217,0.00,-1,5.625,5.625,...,535796800,0.006897,0.000,-1,0.8594,0.8594,0.8594,0.8594,-0.221105,0
2000-01-04,8.860879,8.872726,8.434420,5.422277,2424850,0.26400,0.00,-1,5.523,5.523,...,512377600,0.016070,0.000,-1,0.8230,0.8230,0.8230,0.8230,-0.056098,0
2000-01-05,8.576573,8.766111,8.410728,5.482780,2854844,0.50300,22.86,-1,5.510,5.510,...,778321600,0.030360,13.710,-1,0.8150,0.8150,0.8150,0.8150,-0.034255,0
2000-01-06,8.493650,8.884572,8.292267,5.543276,1281013,0.74600,37.22,0,5.520,5.520,...,767972800,0.041560,7.523,-1,0.7935,0.7935,0.7935,0.7935,0.094079,1
2000-01-07,8.718726,8.813495,8.671342,5.588651,1751632,0.97950,44.90,0,5.530,5.530,...,460734400,0.053900,24.550,-1,0.7876,0.7876,0.7876,0.7876,0.070980,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-04-25,259.450000,260.880000,253.000000,260.460000,4322500,9.26000,30.52,0,274.800,278.200,...,96046400,4.360000,25.920,-1,166.0000,166.8000,169.5000,159.0000,,0
2022-04-26,257.790000,259.470000,250.550000,250.980000,3287500,9.58000,27.67,-1,271.500,277.800,...,95623200,4.510000,23.920,-1,164.8000,166.6000,169.4000,159.1000,,0
2022-04-27,253.340000,257.820000,251.180000,252.280000,2852000,9.46000,27.08,-1,268.000,277.200,...,88063200,4.477000,26.170,-1,163.2000,166.4000,169.2000,159.1000,,0
2022-04-28,255.740000,259.260000,252.120000,257.000000,4352100,9.02000,21.64,-1,265.800,277.200,...,130216800,4.793000,39.030,0,163.1000,166.1000,169.4000,159.2000,,0


In [10]:
def preprocess_df(ticker):
    
    df = process_data_for_labels('AAPL')
    df = df.drop('{}_future'.format(ticker), axis = 1)
    
    df.fillna(0, inplace = True)
    
    df = df.replace([np.inf, -np.inf], np.nan)
    df.dropna(inplace = True)
    
    #Scales/Normalizes data
    for col in df.columns:
        if col != f'{ticker}_target' or col != f'{ticker}_RSI_Weights':
            df[col] = df[col].pct_change()
        
    df = df.replace([np.inf, -np.inf], 0)
    df.fillna(0, inplace = True)
    
    sequential_data = []
    prev_days = deque(maxlen = SEQ_LEN)
    
    for i in df.values:
        prev_days.append([n for n in i[:-1]])
        if len(prev_days) == SEQ_LEN:
            sequential_data.append([np.array(prev_days), i[-1]])
    random.shuffle(sequential_data)
    
    buys = []
    sells = []
    
    for seq, target in sequential_data:
        if target == 0:
            sells.append([seq, target])
        elif target == 1:
            buys.append([seq, target])
            
    random.shuffle(buys)
    random.shuffle(sells)
    
    lower = min(len(buys), len(sells))
    
    buys = buys[:lower]
    sells = sells[:lower]
    
    sequential_data = buys + sells
    random.shuffle(sequential_data)
    
    
    X = []
    y = []
    
    for seq, target in sequential_data:
        X.append(seq)
        y.append(target)
    
    return np.array(X), np.array(y)

#preprocess_df('AAPL')

In [11]:
def do_dl(ticker):
    X, y = preprocess_df(ticker)
    #print(X.dtype, y.dtype)
    #print(np.shape(X))
  
    last_5pct = -int(0.05 * len(X))
    
    X_test= X[last_5pct:]
    X_train = X[:last_5pct]
    y_test = y[last_5pct:]
    y_train = y[:last_5pct]
    
    #print(type(X_test), type(y_test))
    model = Sequential()
    model.add(LSTM(256, input_shape = (X_train.shape[1:]), return_sequences = True))
    model.add(Dropout(0.2))
    model.add(BatchNormalization())
    
    model.add(LSTM(256, input_shape = (X_train.shape[1:]), return_sequences = True))
    model.add(Dropout(0.2))
    model.add(BatchNormalization())
    
    model.add(LSTM(256, input_shape = (X_train.shape[1:])))
    model.add(Dropout(0.2))
    model.add(BatchNormalization())
              
    model.add(Dense(512, activation = 'relu'))
    model.add(Dropout(0.2))
    
    model.add(Dense(2, activation = 'softmax'))
    
    opt = tf.keras.optimizers.Adam(learning_rate = 0.001, decay = 1e-6)
    
    model.compile(loss = 'sparse_categorical_crossentropy', optimizer = opt, metrics = ['accuracy'])
    
    tensorboard = TensorBoard(log_dir = f'Deep Learning/logs/{NAME}')
    
    filepath = 'RNN_Final-{epoch: 02d}-{val_accuracy: .3f}'
    checkpoint = ModelCheckpoint('Deep Learning/models/{}.model'.format(filepath, monitor = 'val_accuracy', verbose = 1, save_best_only = True, mode = 'max'))
    
    history = model.fit(X_train, y_train, batch_size = BATCH_SIZE, epochs = EPOCHS, validation_data = (X_test, y_test), callbacks = [tensorboard, checkpoint]) 
          

In [163]:
do_dl('AAPL')

<class 'pandas.core.frame.DataFrame'>
Index: 5618 entries, 2000-01-03 to 2022-04-29
Columns: 194 entries, DHR_Open to AAPL_target
dtypes: float64(167), int64(27)
memory usage: 8.4+ MB
None
Epoch 1/5



INFO:tensorflow:Assets written to: Deep Learning/models\RNN_Final- 1- 0.542.model\assets


INFO:tensorflow:Assets written to: Deep Learning/models\RNN_Final- 1- 0.542.model\assets


Epoch 2/5



INFO:tensorflow:Assets written to: Deep Learning/models\RNN_Final- 2- 0.542.model\assets


INFO:tensorflow:Assets written to: Deep Learning/models\RNN_Final- 2- 0.542.model\assets


Epoch 3/5



INFO:tensorflow:Assets written to: Deep Learning/models\RNN_Final- 3- 0.594.model\assets


INFO:tensorflow:Assets written to: Deep Learning/models\RNN_Final- 3- 0.594.model\assets


Epoch 4/5



INFO:tensorflow:Assets written to: Deep Learning/models\RNN_Final- 4- 0.590.model\assets


INFO:tensorflow:Assets written to: Deep Learning/models\RNN_Final- 4- 0.590.model\assets


Epoch 5/5



INFO:tensorflow:Assets written to: Deep Learning/models\RNN_Final- 5- 0.618.model\assets


INFO:tensorflow:Assets written to: Deep Learning/models\RNN_Final- 5- 0.618.model\assets




In [212]:
#Next Goal:

#Maybe VWAP
#Support resistance algs

