## Importing Dependencies

In [1]:
import pandas as pd
import requests
from tqdm.notebook import tqdm
import yfinance as yf
import datetime
from concurrent.futures import ThreadPoolExecutor

## Fetching tickers from NASDAQ Screener

In [2]:
def fetch_nasdaq_data():
    url = "https://api.nasdaq.com/api/screener/stocks?tableonly=true&limit=25&offset=0&exchange=nasdaq&download=true"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:85.0) Gecko/20100101 Firefox/85.0'
    }
    response = requests.get(url, headers=headers)
    return response.json()

In [3]:
nasdaq_data = fetch_nasdaq_data()
df = pd.DataFrame(nasdaq_data['data']['rows'])
print(df.shape)
df.head()

(3933, 12)


Unnamed: 0,symbol,name,lastsale,netchange,pctchange,volume,marketCap,country,ipoyear,industry,sector,url
0,AACG,ATA Creativity Global American Depositary Shares,$1.005,0.005,0.50%,37054,32160131.0,China,2008.0,Other Consumer Services,Real Estate,/market-activity/stocks/aacg
1,AADI,Aadi Bioscience Inc. Common Stock,$2.11,-0.04,-1.86%,126223,51937300.0,United States,,Biotechnology: Pharmaceutical Preparations,Health Care,/market-activity/stocks/aadi
2,AAL,American Airlines Group Inc. Common Stock,$13.09,0.28,2.186%,38275131,8596254221.0,United States,,Air Freight/Delivery Services,Consumer Discretionary,/market-activity/stocks/aal
3,AAME,Atlantic American Corporation Common Stock,$1.67,0.04,2.454%,3371,34067596.0,United States,,Life Insurance,Finance,/market-activity/stocks/aame
4,AAOI,Applied Optoelectronics Inc. Common Stock,$18.57,-0.43,-2.263%,2810086,759433428.0,United States,2013.0,Semiconductors,Technology,/market-activity/stocks/aaoi


In [4]:
# Extracing stocks based on some terms in name column

to_include = 'common share|common stock|common|stock|share'
df = df[df['name'].str.lower().str.contains(to_include, case=False, na=False)]
print(df.shape)
df.head()

(3395, 12)


Unnamed: 0,symbol,name,lastsale,netchange,pctchange,volume,marketCap,country,ipoyear,industry,sector,url
0,AACG,ATA Creativity Global American Depositary Shares,$1.005,0.005,0.50%,37054,32160131.0,China,2008.0,Other Consumer Services,Real Estate,/market-activity/stocks/aacg
1,AADI,Aadi Bioscience Inc. Common Stock,$2.11,-0.04,-1.86%,126223,51937300.0,United States,,Biotechnology: Pharmaceutical Preparations,Health Care,/market-activity/stocks/aadi
2,AAL,American Airlines Group Inc. Common Stock,$13.09,0.28,2.186%,38275131,8596254221.0,United States,,Air Freight/Delivery Services,Consumer Discretionary,/market-activity/stocks/aal
3,AAME,Atlantic American Corporation Common Stock,$1.67,0.04,2.454%,3371,34067596.0,United States,,Life Insurance,Finance,/market-activity/stocks/aame
4,AAOI,Applied Optoelectronics Inc. Common Stock,$18.57,-0.43,-2.263%,2810086,759433428.0,United States,2013.0,Semiconductors,Technology,/market-activity/stocks/aaoi


In [5]:
# Filtering out irrelevant tickers that include ADRs, ETFs, Rights, Units etc.

df['name'] = df['name'].str.lower()
to_exclude = ['preferred', 'depositary', 'preference', 'unit ', ' right', 'units ']
pattern = '|'.join(to_exclude)
df = df[~df['name'].str.contains(pattern, case=False, na=False)]
df = df[df['country'] == 'United States']

In [6]:
df.head()

Unnamed: 0,symbol,name,lastsale,netchange,pctchange,volume,marketCap,country,ipoyear,industry,sector,url
1,AADI,aadi bioscience inc. common stock,$2.11,-0.04,-1.86%,126223,51937300.0,United States,,Biotechnology: Pharmaceutical Preparations,Health Care,/market-activity/stocks/aadi
2,AAL,american airlines group inc. common stock,$13.09,0.28,2.186%,38275131,8596254221.0,United States,,Air Freight/Delivery Services,Consumer Discretionary,/market-activity/stocks/aal
3,AAME,atlantic american corporation common stock,$1.67,0.04,2.454%,3371,34067596.0,United States,,Life Insurance,Finance,/market-activity/stocks/aame
4,AAOI,applied optoelectronics inc. common stock,$18.57,-0.43,-2.263%,2810086,759433428.0,United States,2013.0,Semiconductors,Technology,/market-activity/stocks/aaoi
5,AAON,aaon inc. common stock,$109.34,-1.77,-1.593%,524547,8857977602.0,United States,,Industrial Machinery/Components,Industrials,/market-activity/stocks/aaon


In [7]:
# Filtering out other irrelevant assets that may contain the filter words

filter_words = ['ordinary', 'common']
pattern = '|'.join(filter_words)
exclude_df = df[~df['name'].str.contains(pattern, case=False, na=False)]

In [8]:
# Cleaning out the irrelevant assets

merged = df.merge(exclude_df, how='outer', indicator=True)
df = merged[merged['_merge'] == 'left_only'].drop(columns='_merge')

In [9]:
print(df.shape)
df.head()

(2512, 12)


Unnamed: 0,symbol,name,lastsale,netchange,pctchange,volume,marketCap,country,ipoyear,industry,sector,url
0,AADI,aadi bioscience inc. common stock,$2.11,-0.04,-1.86%,126223,51937300.0,United States,,Biotechnology: Pharmaceutical Preparations,Health Care,/market-activity/stocks/aadi
1,AAL,american airlines group inc. common stock,$13.09,0.28,2.186%,38275131,8596254221.0,United States,,Air Freight/Delivery Services,Consumer Discretionary,/market-activity/stocks/aal
2,AAME,atlantic american corporation common stock,$1.67,0.04,2.454%,3371,34067596.0,United States,,Life Insurance,Finance,/market-activity/stocks/aame
3,AAOI,applied optoelectronics inc. common stock,$18.57,-0.43,-2.263%,2810086,759433428.0,United States,2013.0,Semiconductors,Technology,/market-activity/stocks/aaoi
4,AAON,aaon inc. common stock,$109.34,-1.77,-1.593%,524547,8857977602.0,United States,,Industrial Machinery/Components,Industrials,/market-activity/stocks/aaon


## Filtering stocks for training

In [10]:
symbols = list(df['symbol'])

In [11]:
first_trade_dates = {}

def fetch(symbol):
    stock = yf.Ticker(symbol)
    if 'firstTradeDateEpochUtc' in stock.info:
        first_trade_timestamp = stock.info['firstTradeDateEpochUtc']
        return symbol, datetime.datetime.fromtimestamp(first_trade_timestamp, datetime.UTC).strftime('%Y-%m-%d %H:%M:%S')
    return symbol, None

with ThreadPoolExecutor(max_workers=8) as executor:
    with tqdm(total=len(symbols), desc='Fetching First Trade Date', unit=' Stock') as pbar:
        futures = {executor.submit(fetch, symbol): symbol for symbol in symbols}
        for future in futures:
            symbol, first_trade_date = future.result()
            first_trade_dates[symbol] = first_trade_date
            pbar.update(1)

Fetching First Trade Date:   0%|          | 0/2512 [00:00<?, ? Stock/s]

In [12]:
cutoff_date = datetime.datetime.strptime('2014-09-28', '%Y-%m-%d')

stocks_before_cutoff = {
    symbol: date for symbol, date in first_trade_dates.items()
    if date is not None and datetime.datetime.strptime(date.split(' ')[0], '%Y-%m-%d') <= cutoff_date
}

In [13]:
stocks_list = list(stocks_before_cutoff.keys())

In [14]:
percentage_of_stocks = len(stocks_list) / len(df) * 100
total_rows = len(stocks_list) * 2517
print(f'Percentage of stocks used for training: {percentage_of_stocks:.2f}%')
print(f'Total expected rows: {total_rows}')

Percentage of stocks used for training: 50.96%
Total expected rows: 3221760


## Fetching and storing data

In [15]:
end_date = pd.Timestamp.today()
start_date = end_date - pd.DateOffset(years=10)

In [16]:
with tqdm(total=len(stocks_list), desc='Fetching Stock Data', unit=' Ticker') as pbar:
    for ticker in stocks_list:
        stock_data = yf.download(ticker, start=start_date, end=end_date)
        stock_data['Ticker'] = ticker
        stock_data = stock_data[['Ticker', 'Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume']]
        pbar.update(1)

Fetching Stock Data:   0%|          | 0/1280 [00:00<?, ? Ticker/s]

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%********

In [18]:
df.to_csv('10_Year_Historical.csv')