## Importing Dependencies

In [1]:
import pandas as pd
import requests
from tqdm.notebook import tqdm
import yfinance as yf
import datetime
from concurrent.futures import ThreadPoolExecutor
from sklearn.preprocessing import MinMaxScaler

## Fetching tickers from NASDAQ Screener

In [2]:
def fetch_nasdaq_data(timeout=30):
    """Download the NASDAQ stock screener listing as parsed JSON.

    Args:
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the notebook indefinitely.

    Returns:
        The decoded JSON body of the screener response.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or timeout.
    """
    url = "https://api.nasdaq.com/api/screener/stocks?tableonly=true&limit=25&offset=0&exchange=nasdaq&download=true"
    # A browser-like User-Agent is sent with the request.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:85.0) Gecko/20100101 Firefox/85.0'
    }
    response = requests.get(url, headers=headers, timeout=timeout)
    # Fail loudly on an HTTP error instead of trying to JSON-decode an error page.
    response.raise_for_status()
    return response.json()

In [3]:
# Pull the screener payload and tabulate the records found under data -> rows.
nasdaq_data = fetch_nasdaq_data()
screener_rows = nasdaq_data['data']['rows']
df = pd.DataFrame(screener_rows)
print(df.shape)
df.head()

(3887, 12)


Unnamed: 0,symbol,name,lastsale,netchange,pctchange,volume,marketCap,country,ipoyear,industry,sector,url
0,AACG,ATA Creativity Global American Depositary Shares,$0.8779,-0.0176,-1.965%,45932,28092914.0,China,2008.0,Other Consumer Services,Real Estate,/market-activity/stocks/aacg
1,AADI,Aadi Bioscience Inc. Common Stock,$1.9221,-0.0379,-1.934%,73898,47312172.0,United States,,Biotechnology: Pharmaceutical Preparations,Health Care,/market-activity/stocks/aadi
2,AAL,American Airlines Group Inc. Common Stock,$10.794,0.054,0.503%,25740995,7088462037.0,United States,,Air Freight/Delivery Services,Consumer Discretionary,/market-activity/stocks/aal
3,AAME,Atlantic American Corporation Common Stock,$1.68,0.03,1.818%,3815,34271593.0,United States,,Life Insurance,Finance,/market-activity/stocks/aame
4,AAOI,Applied Optoelectronics Inc. Common Stock,$14.34,0.59,4.291%,889691,586444553.0,United States,2013.0,Semiconductors,Technology,/market-activity/stocks/aaoi


In [4]:
# Extracting stocks whose name column mentions one of the share/stock terms

to_include = 'common share|common stock|common|stock|share'
lowered_names = df['name'].str.lower()
name_matches = lowered_names.str.contains(to_include, case=False, na=False)
df = df[name_matches]
print(df.shape)
df.head()

(3384, 12)


Unnamed: 0,symbol,name,lastsale,netchange,pctchange,volume,marketCap,country,ipoyear,industry,sector,url
0,AACG,ATA Creativity Global American Depositary Shares,$0.8779,-0.0176,-1.965%,45932,28092914.0,China,2008.0,Other Consumer Services,Real Estate,/market-activity/stocks/aacg
1,AADI,Aadi Bioscience Inc. Common Stock,$1.9221,-0.0379,-1.934%,73898,47312172.0,United States,,Biotechnology: Pharmaceutical Preparations,Health Care,/market-activity/stocks/aadi
2,AAL,American Airlines Group Inc. Common Stock,$10.794,0.054,0.503%,25740995,7088462037.0,United States,,Air Freight/Delivery Services,Consumer Discretionary,/market-activity/stocks/aal
3,AAME,Atlantic American Corporation Common Stock,$1.68,0.03,1.818%,3815,34271593.0,United States,,Life Insurance,Finance,/market-activity/stocks/aame
4,AAOI,Applied Optoelectronics Inc. Common Stock,$14.34,0.59,4.291%,889691,586444553.0,United States,2013.0,Semiconductors,Technology,/market-activity/stocks/aaoi


In [5]:
# Filtering out irrelevant tickers: names mentioning preferred/preference shares,
# depositary shares, units or rights are dropped, and only rows whose country is
# 'United States' are kept.

# df is a filtered slice from the previous cell; .assign() builds a new frame rather
# than writing a column into that slice, which avoids pandas' SettingWithCopyWarning.
df = df.assign(name=df['name'].str.lower())
to_exclude = ['preferred', 'depositary', 'preference', 'unit ', ' right', 'units ']
pattern = '|'.join(to_exclude)
df = df[~df['name'].str.contains(pattern, case=False, na=False)]
df = df[df['country'] == 'United States']

In [6]:
# Preview the frame after the name-exclusion and country filters above.
df.head()

Unnamed: 0,symbol,name,lastsale,netchange,pctchange,volume,marketCap,country,ipoyear,industry,sector,url
1,AADI,aadi bioscience inc. common stock,$1.9221,-0.0379,-1.934%,73898,47312172.0,United States,,Biotechnology: Pharmaceutical Preparations,Health Care,/market-activity/stocks/aadi
2,AAL,american airlines group inc. common stock,$10.794,0.054,0.503%,25740995,7088462037.0,United States,,Air Freight/Delivery Services,Consumer Discretionary,/market-activity/stocks/aal
3,AAME,atlantic american corporation common stock,$1.68,0.03,1.818%,3815,34271593.0,United States,,Life Insurance,Finance,/market-activity/stocks/aame
4,AAOI,applied optoelectronics inc. common stock,$14.34,0.59,4.291%,889691,586444553.0,United States,2013.0,Semiconductors,Technology,/market-activity/stocks/aaoi
5,AAON,aaon inc. common stock,$107.26,-0.06,-0.056%,164716,8689470254.0,United States,,Industrial Machinery/Components,Industrials,/market-activity/stocks/aaon


In [7]:
# Collect the remaining rows whose name mentions neither 'ordinary' nor 'common';
# these are the rows to be removed from df in the next step.

filter_words = ['ordinary', 'common']
pattern = '|'.join(filter_words)
mentions_filter_word = df['name'].str.contains(pattern, case=False, na=False)
exclude_df = df.loc[~mentions_filter_word]

In [8]:
# Anti-join: an outer merge with indicator=True tags every row with its origin, and
# only the rows that appear in df but not in exclude_df ('left_only') are kept.

merged = pd.merge(df, exclude_df, how='outer', indicator=True)
only_in_df = merged['_merge'] == 'left_only'
df = merged.loc[only_in_df].drop(columns='_merge')

In [9]:
# Sanity check: size and first rows of the frame after the anti-join cleanup above.
print(df.shape)
df.head()

(2509, 12)


Unnamed: 0,symbol,name,lastsale,netchange,pctchange,volume,marketCap,country,ipoyear,industry,sector,url
0,AADI,aadi bioscience inc. common stock,$1.9221,-0.0379,-1.934%,73898,47312172.0,United States,,Biotechnology: Pharmaceutical Preparations,Health Care,/market-activity/stocks/aadi
1,AAL,american airlines group inc. common stock,$10.794,0.054,0.503%,25740995,7088462037.0,United States,,Air Freight/Delivery Services,Consumer Discretionary,/market-activity/stocks/aal
2,AAME,atlantic american corporation common stock,$1.68,0.03,1.818%,3815,34271593.0,United States,,Life Insurance,Finance,/market-activity/stocks/aame
3,AAOI,applied optoelectronics inc. common stock,$14.34,0.59,4.291%,889691,586444553.0,United States,2013.0,Semiconductors,Technology,/market-activity/stocks/aaoi
4,AAON,aaon inc. common stock,$107.26,-0.06,-0.056%,164716,8689470254.0,United States,,Industrial Machinery/Components,Industrials,/market-activity/stocks/aaon


## Filtering stocks for training

In [10]:
# Ticker symbols of the cleaned frame, as a plain Python list.
symbols = df['symbol'].tolist()

In [11]:
# Symbol -> first-trade-date string (or None), filled in by the thread-pool loop below.
first_trade_dates = {}

def fetch(symbol):
    """Look up the first trade date of ``symbol`` through yfinance.

    Args:
        symbol: Ticker symbol passed to ``yf.Ticker``.

    Returns:
        A ``(symbol, first_trade_date)`` tuple. ``first_trade_date`` is the
        ``firstTradeDateEpochUtc`` value rendered in UTC as
        ``'%Y-%m-%d %H:%M:%S'``, or ``None`` when the ticker's info has no
        such key or the value is ``None``.
    """
    # Read ``info`` once and tolerate a missing/empty mapping.
    info = yf.Ticker(symbol).info or {}
    first_trade_timestamp = info.get('firstTradeDateEpochUtc')
    # The key can be present with a None value; fromtimestamp(None) would raise.
    if first_trade_timestamp is None:
        return symbol, None
    # datetime.timezone.utc is the same object as datetime.UTC but also exists
    # on Python versions older than 3.11.
    first_trade = datetime.datetime.fromtimestamp(first_trade_timestamp, datetime.timezone.utc)
    return symbol, first_trade.strftime('%Y-%m-%d %H:%M:%S')

# Submit one lookup per symbol to an 8-worker pool, then collect the results in
# submission order, recording each one and advancing the progress bar.
with ThreadPoolExecutor(max_workers=8) as executor, \
        tqdm(total=len(symbols), desc='Fetching First Trade Date', unit=' Stock') as pbar:
    futures = {executor.submit(fetch, symbol): symbol for symbol in symbols}
    for pending in futures:
        symbol, first_trade_date = pending.result()
        first_trade_dates[symbol] = first_trade_date
        pbar.update(1)

Fetching First Trade Date:   0%|          | 0/2509 [00:00<?, ? Stock/s]

In [12]:
# Keep only the symbols whose first trade date is known and falls on or before the
# cutoff; the stored value stays the original date string.
cutoff_date = datetime.datetime.strptime('2014-09-28', '%Y-%m-%d')

stocks_before_cutoff = {}
for symbol, date in first_trade_dates.items():
    if date is None:
        continue
    # Only the calendar-day part of the 'YYYY-MM-DD HH:MM:SS' string is compared.
    first_trade_day = datetime.datetime.strptime(date.split(' ')[0], '%Y-%m-%d')
    if first_trade_day <= cutoff_date:
        stocks_before_cutoff[symbol] = date

In [13]:
# Symbols that passed the cutoff, in dictionary (insertion) order.
stocks_list = list(stocks_before_cutoff)

In [14]:
# Report what share of the cleaned universe passed the cutoff and the dataset size
# that implies.
# NOTE(review): 2517 is presumably the expected number of daily rows per ticker over
# the ten-year window — confirm.
selected_count = len(stocks_list)
percentage_of_stocks = selected_count / len(df) * 100
total_rows = selected_count * 2517
print(f'Percentage of stocks used for training: {percentage_of_stocks:.2f}%')
print(f'Total expected rows: {total_rows}')

Percentage of stocks used for training: 51.26%
Total expected rows: 3236862


## Technical Indicators calculation

In [15]:
def exponential_moving_avg(df, window_size=15, center=True):
    """Exponential moving average of the ``'Adj Close'`` column.

    Args:
        df: DataFrame that has an ``'Adj Close'`` column.
        window_size: Passed to ``Series.ewm`` as ``span``.
        center: When truthy, the series is shifted by ``int(window_size / 2)``
            rows before smoothing and the result is handed to
            ``_remove_trailing_data``. When falsy, the plain EMA of the
            unshifted series is returned.

    Returns:
        The smoothed series (for ``center`` falsy); otherwise whatever
        ``_remove_trailing_data`` returns.

    NOTE(review): ``_remove_trailing_data`` is not defined in the visible part
    of this notebook. Unless it is defined elsewhere, the default
    ``center=True`` path raises ``NameError``; every call in the data-fetching
    cell passes ``center=False``.
    """
    prices = df['Adj Close']
    if not center:
        return prices.ewm(span=window_size).mean()
    shifted_ema = prices.shift(int(window_size / 2)).ewm(span=window_size).mean()
    return _remove_trailing_data(shifted_ema, window_size)

In [16]:
def macd_line(df, ema1_window_size=12, ema2_window_size=26, center=True):
    """MACD line: the EMA over ``ema1_window_size`` minus the EMA over ``ema2_window_size``.

    Both averages come from ``exponential_moving_avg`` on the same ``df`` and
    share the ``center`` flag.
    """
    first_ema = exponential_moving_avg(df, window_size=ema1_window_size, center=center)
    second_ema = exponential_moving_avg(df, window_size=ema2_window_size, center=center)
    return first_ema - second_ema

In [17]:
def macd_signal(df, window_size=9, ema1_window_size=12, ema2_window_size=26, center=True):
    """MACD signal line: an EMA (span ``window_size``) taken over the MACD line.

    The MACD line is placed in a one-column frame named ``'Adj Close'`` because
    ``exponential_moving_avg`` reads its input from that column.
    """
    line_values = macd_line(
        df,
        ema1_window_size=ema1_window_size,
        ema2_window_size=ema2_window_size,
        center=center,
    )
    macd_line_df = pd.DataFrame({'Adj Close': line_values})
    return exponential_moving_avg(macd_line_df, window_size=window_size, center=center)

## Fetching and storing data

In [18]:
# Download window: the ten years ending at the moment this cell runs.
# NOTE(review): the listing cutoff in the earlier filtering cell is the fixed literal
# '2014-09-28', so the two drift apart when the notebook is re-run later — confirm intent.
end_date = pd.Timestamp.today()
start_date = end_date - pd.DateOffset(years=10)

In [19]:
# Download the price history of every selected ticker, attach the indicator columns,
# and stack everything into one long frame.
#
# Per-ticker frames are collected in a list and concatenated once at the end:
# concatenating onto a growing frame inside the loop re-copies all previously
# accumulated rows on every iteration.
ticker_frames = []

for ticker in tqdm(stocks_list, desc='Fetching Stock Data', unit=' Ticker'):
    # progress=False silences yfinance's own per-download progress bar; the tqdm bar
    # already reports progress.
    stock_data = yf.download(ticker, start=start_date, end=end_date, progress=False)
    if stock_data.empty:
        # Nothing came back for this ticker, so it contributes no rows.
        continue
    stock_data['Ticker'] = ticker
    stock_data = stock_data[['Ticker', 'Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume']].copy()
    stock_data['EMA'] = exponential_moving_avg(stock_data, window_size=10, center=False)
    stock_data['MACD Line'] = macd_line(stock_data, ema1_window_size=10, ema2_window_size=20, center=False)
    stock_data['MACD Signal'] = macd_signal(stock_data, window_size=10, ema1_window_size=10, ema2_window_size=20, center=False)
    ticker_frames.append(stock_data)

# pd.concat raises on an empty list, so fall back to an empty frame.
df = pd.concat(ticker_frames, axis=0) if ticker_frames else pd.DataFrame()

Fetching Stock Data:   0%|          | 0/1286 [00:00<?, ? Ticker/s]

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%********

In [24]:
# Inspect the size and first rows of the combined per-ticker frame.
print(df.shape)
df.head()

(3233027, 10)


Unnamed: 0_level_0,Ticker,Open,High,Low,Close,Adj Close,Volume,EMA,MACD Line,MACD Signal
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2014-10-06,AAL,36.310001,36.5,34.860001,34.93,33.013432,12789300,33.013432,0.0,0.0
2014-10-07,AAL,34.75,35.02,34.060001,34.09,32.219528,10030100,32.576785,-0.019848,-0.010916
2014-10-08,AAL,34.529999,34.549999,32.599998,33.0,31.189327,22753600,32.019036,-0.060842,-0.030986
2014-10-09,AAL,33.5,33.700001,31.940001,31.98,30.225296,22634100,31.428079,-0.11641,-0.05913
2014-10-10,AAL,31.549999,32.25,29.780001,30.780001,29.091137,31018700,30.757206,-0.193838,-0.097801


In [21]:
# Persist the combined frame to a CSV file in the current working directory.
df.to_csv('10_Year_Historical.csv')

## TODO

In [22]:
# TODO: Experiment with technical indicators