## Importing Dependencies

In [3]:
import pandas as pd
import requests
from tqdm.notebook import tqdm
import yfinance as yf
import datetime
from concurrent.futures import ThreadPoolExecutor
from sklearn.preprocessing import MinMaxScaler

## Fetching tickers from NASDAQ Screener

In [4]:
def fetch_nasdaq_data(timeout=30):
    """Download the NASDAQ stock screener listing and return the decoded JSON.

    Parameters
    ----------
    timeout : float, optional
        Seconds to wait for the server before giving up. Without a timeout a
        stalled connection would block the notebook indefinitely.

    Returns
    -------
    dict
        The JSON payload returned by the screener endpoint.

    Raises
    ------
    requests.HTTPError
        If the endpoint answers with a 4xx/5xx status.
    """
    # NOTE(review): `download=true` presumably makes the endpoint ignore
    # `limit=25` and return the full listing -- confirm against the API.
    url = "https://api.nasdaq.com/api/screener/stocks?tableonly=true&limit=25&offset=0&exchange=nasdaq&download=true"
    # A browser-like User-Agent is sent with the request.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:85.0) Gecko/20100101 Firefox/85.0'
    }
    response = requests.get(url, headers=headers, timeout=timeout)
    # Fail loudly on an error status instead of trying to parse an error page.
    response.raise_for_status()
    return response.json()

In [None]:
# Download the screener payload and load its row records into a DataFrame.
nasdaq_data = fetch_nasdaq_data()
# The row records are read from the payload's data -> rows entry.
df = pd.DataFrame(nasdaq_data['data']['rows'])
print(df.shape)
df.head()

In [None]:
# Extracting stocks based on some terms in the name column

to_include = 'common share|common stock|common|stock|share'
# case=False already makes the match case-insensitive, so the names do not
# need to be lower-cased before matching.
df = df[df['name'].str.contains(to_include, case=False, na=False)]
print(df.shape)
df.head()

In [7]:
# Filtering out irrelevant tickers that include ADRs, ETFs, Rights, Units etc.

# `df` is a filtered slice at this point, so build a new frame with the
# lower-cased names instead of assigning a column in place (in-place
# assignment on a slice triggers pandas' SettingWithCopyWarning).
df = df.assign(name=df['name'].str.lower())
to_exclude = ['preferred', 'depositary', 'preference', 'unit ', ' right', 'units ']
pattern = '|'.join(to_exclude)
# Drop every row whose name matches any of the excluded terms.
df = df[~df['name'].str.contains(pattern, case=False, na=False)]
# Keep only rows whose country column is exactly 'United States'.
df = df[df['country'] == 'United States']

In [None]:
# Preview the first rows of the current `df`.
df.head()

In [9]:
# Collecting the rows to discard: assets whose name contains NEITHER
# 'ordinary' nor 'common' (case-insensitive; missing names count as no match).

filter_words = ['ordinary', 'common']
pattern = '|'.join(filter_words)
# `~` inverts the match, so exclude_df holds the rows WITHOUT any filter word.
exclude_df = df[~df['name'].str.contains(pattern, case=False, na=False)]

In [10]:
# Cleaning out the irrelevant assets

# exclude_df is a row subset of df that still carries df's index labels, so the
# rows can be removed directly by label. This avoids joining on every column,
# which would multiply exact duplicate rows and hash all columns as join keys.
# Row order is preserved; the index is renumbered afterwards.
df = df.drop(index=exclude_df.index).reset_index(drop=True)

In [None]:
# Sanity-check the cleaned ticker frame: overall shape, then the first rows.
print(df.shape)
df.head()

## Filtering stocks for training

In [12]:
# Ticker symbols of the remaining rows, as a plain Python list.
symbols = df['symbol'].tolist()

In [None]:
first_trade_dates = {}

def fetch(symbol):
    stock = yf.Ticker(symbol)
    if 'firstTradeDateEpochUtc' in stock.info:
        first_trade_timestamp = stock.info['firstTradeDateEpochUtc']
        return symbol, datetime.datetime.fromtimestamp(first_trade_timestamp, datetime.UTC).strftime('%Y-%m-%d %H:%M:%S')
    return symbol, None

# Look up every symbol's first trade date on a small thread pool and record
# the results in `first_trade_dates`.
with ThreadPoolExecutor(max_workers=8) as executor:
    with tqdm(total=len(symbols), desc='Fetching First Trade Date', unit=' Stock') as pbar:
        futures = {executor.submit(fetch, symbol): symbol for symbol in symbols}
        for future, symbol in futures.items():
            try:
                _, first_trade_date = future.result()
            except Exception as exc:
                # One failed lookup must not abort the whole batch: report it
                # and record the symbol as having no known first trade date.
                print(f'Could not fetch first trade date for {symbol}: {exc}')
                first_trade_date = None
            first_trade_dates[symbol] = first_trade_date
            pbar.update(1)

In [12]:
# Keep only tickers whose first trade date is known and falls on or before
# the cutoff day.
cutoff_date = datetime.datetime.strptime('2014-09-28', '%Y-%m-%d')

stocks_before_cutoff = dict(
    (ticker, traded_at)
    for ticker, traded_at in first_trade_dates.items()
    # Only the date part (text before the first space) is compared.
    if traded_at is not None
    and datetime.datetime.strptime(traded_at.split(' ')[0], '%Y-%m-%d') <= cutoff_date
)

In [13]:
# Symbols that passed the cutoff filter, in dictionary order.
stocks_list = list(stocks_before_cutoff)

In [None]:
# Rows expected per ticker. Presumably the number of trading days in the
# 10-year download window -- TODO confirm.
EXPECTED_ROWS_PER_TICKER = 2517

# Guard against an empty ticker frame so the summary cannot raise
# ZeroDivisionError.
percentage_of_stocks = len(stocks_list) / len(df) * 100 if len(df) else 0.0
total_rows = len(stocks_list) * EXPECTED_ROWS_PER_TICKER
print(f'Percentage of stocks used for training: {percentage_of_stocks:.2f}%')
print(f'Total expected rows: {total_rows}')

## Technical Indicators calculation

In [29]:
def exponential_moving_avg(df, window_size=15, center=True):
    """Exponential moving average of ``df['Adj Close']``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame providing an ``'Adj Close'`` column.
    window_size : int, optional
        Passed to ``ewm`` as ``span``.
    center : bool, optional
        When equal to ``True`` the prices are first shifted by
        ``int(window_size / 2)`` rows and the result is passed through
        ``_remove_trailing_data``; otherwise the plain EMA is returned.

    Returns
    -------
    The EMA series (or whatever ``_remove_trailing_data`` returns when
    ``center`` is true).
    """
    prices = df['Adj Close']
    if center == True:
        half_window = int(window_size / 2)
        shifted_ema = prices.shift(half_window).ewm(span=window_size).mean()
        # NOTE(review): `_remove_trailing_data` is not defined in the visible
        # part of this notebook; this branch needs it to be in scope.
        return _remove_trailing_data(shifted_ema, window_size)
    return prices.ewm(span=window_size).mean()

In [30]:
def macd_line(df, ema1_window_size=12, ema2_window_size=26, center=True):
    """MACD line: the first EMA minus the second EMA.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame handed unchanged to ``exponential_moving_avg``.
    ema1_window_size, ema2_window_size : int, optional
        Window sizes of the minuend and subtrahend EMAs respectively.
    center : bool, optional
        Forwarded to both ``exponential_moving_avg`` calls.

    Returns
    -------
    The element-wise difference of the two EMA results.
    """
    first_ema = exponential_moving_avg(df, window_size=ema1_window_size, center=center)
    second_ema = exponential_moving_avg(df, window_size=ema2_window_size, center=center)
    return first_ema - second_ema

In [31]:
def macd_signal(df, window_size=9, ema1_window_size=12, ema2_window_size=26, center=True):
    """MACD signal line: an EMA taken over the MACD line.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame handed unchanged to ``macd_line``.
    window_size : int, optional
        Window size of the EMA applied to the MACD line.
    ema1_window_size, ema2_window_size : int, optional
        Window sizes forwarded to ``macd_line``.
    center : bool, optional
        Forwarded to both ``macd_line`` and ``exponential_moving_avg``.

    Returns
    -------
    The result of ``exponential_moving_avg`` applied to the MACD line.
    """
    line = macd_line(
        df,
        ema1_window_size=ema1_window_size,
        ema2_window_size=ema2_window_size,
        center=center,
    )
    # exponential_moving_avg reads the 'Adj Close' column, so the MACD line is
    # wrapped in a one-column frame under that name.
    wrapped = pd.DataFrame({'Adj Close': line})
    return exponential_moving_avg(wrapped, window_size=window_size, center=center)

## Fetching and storing data

In [32]:
# Download window: from ten years before today up to today.
end_date = pd.Timestamp.today()
start_date = end_date - pd.DateOffset(years=10)

In [None]:
# Download price history per ticker, add the indicator columns, and combine
# everything into one frame. Frames are collected in a list and concatenated
# once, because concatenating inside the loop copies the growing frame on
# every iteration.
frames = []

for ticker in tqdm(stocks_list, desc='Fetching Stock Data', unit=' Ticker'):
    # auto_adjust=False requests the separate 'Adj Close' column that the
    # indicator helpers read; progress=False silences yfinance's own
    # per-download progress output so only the tqdm bar is shown.
    # NOTE(review): recent yfinance releases may return MultiIndex columns
    # even for a single ticker -- confirm against the installed version.
    stock_data = yf.download(ticker, start=start_date, end=end_date, auto_adjust=False, progress=False)
    # Skip tickers for which nothing was downloaded.
    if stock_data.empty:
        continue
    stock_data['Ticker'] = ticker
    stock_data = stock_data[['Ticker', 'Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume']].copy()
    stock_data['EMA'] = exponential_moving_avg(stock_data, window_size=10, center=False)
    stock_data['MACD Line'] = macd_line(stock_data, ema1_window_size=10, ema2_window_size=20, center=False)
    stock_data['MACD Signal'] = macd_signal(stock_data, window_size=10, ema1_window_size=10, ema2_window_size=20, center=False)
    frames.append(stock_data)

# pd.concat raises on an empty list, so fall back to an empty frame.
df = pd.concat(frames, axis=0) if frames else pd.DataFrame()

In [None]:
# Sanity-check the combined frame: overall shape, then the first rows.
print(df.shape)
df.head()

In [60]:
# Persist the combined frame to a CSV file in the working directory.
df.to_csv('10_Year_Historical.csv')

## TODO

In [None]:
# TODO: Experiment with technical indicators