In [101]:
import sqlite3 as sq
from bs4 import BeautifulSoup
import pandas as pd
import asyncio
from aiohttp import ClientSession, TCPConnector
import random
import nest_asyncio
import datetime

In [102]:
# Load the ticker universe from the spreadsheet.
# Forward slashes are used on purpose: the previous '..\src\...' literal relied
# on invalid escape sequences ('\s', '\C') and only resolved on Windows.
tickers_csv = pd.read_excel('../src/Copy of NYSE_Stocks.xlsx')
# De-duplicate (keeping first-seen order) so the count printed below really is
# the number of unique symbols and no ticker is fetched twice.
tickers_from_file = tickers_csv['Symbol'].drop_duplicates().tolist()
ticker_count = len(tickers_from_file)
print(f"Number of unique tickers: {ticker_count}")

Number of unique tickers: 8970


In [103]:
def get_unprocessed_tickers(tickers_from_file):
    """Return the tickers that still need to be fetched or refreshed.

    A ticker is selected when any of the following holds:
      * it is stored with a 'Market Cap' of 'N/A',
      * its stored 'Earnings Date Max' parses (format "%b %d, %Y") to a date
        earlier than today,
      * it appears in ``tickers_from_file`` but not in the database table.

    Args:
        tickers_from_file: iterable of ticker symbols read from the source file.

    Returns:
        A new list of ticker symbols (order unspecified). If the table cannot
        be read for any reason, a copy of ``tickers_from_file`` is returned so
        that everything gets processed.
    """
    try:
        conn = sq.connect('../market_data.db')
        try:
            df = pd.read_sql('SELECT * FROM mkt_cap_next_earnings', conn)
        finally:
            # Release the handle even when the query fails (e.g. missing table).
            conn.close()

        # Rows whose market cap could not be scraped last time.
        na_tickers = df.loc[df['Market Cap'] == 'N/A', 'Ticker'].tolist()

        # Unparseable dates (including 'N/A') become NaT; NaT compares False,
        # so those rows are never reported as outdated.
        today = pd.Timestamp(datetime.date.today())
        earnings_max = pd.to_datetime(df['Earnings Date Max'], format="%b %d, %Y", errors='coerce')
        outdated_earnings_tickers = df.loc[earnings_max < today, 'Ticker'].tolist()

        to_reprocess = set(na_tickers) | set(outdated_earnings_tickers)

        # Set lookup keeps the "not yet stored" scan linear in the input size.
        tickers_in_db = set(df['Ticker'])
        not_in_db = {ticker for ticker in tickers_from_file if ticker not in tickers_in_db}

        return list(to_reprocess | not_in_db)

    except Exception as e:
        print(f"Error: {str(e)}. Probably the table doesn't exist yet.")
        # If table doesn't exist, process all tickers from file. Return a copy
        # so callers that reorder the result do not mutate the input list.
        return list(tickers_from_file)

In [104]:
# --- Scraper configuration -------------------------------------------------
# Connection cap handed to TCPConnector(limit=...) in fetch_all.
MAX_CONCURRENT_REQUESTS = 20
# Value passed as `timeout=` to session.get() in fetch.
# NOTE(review): presumably seconds for the whole request — confirm against the aiohttp docs.
REQUEST_TIMEOUT = 20
# Argument to asyncio.sleep() when fetch pauses before another attempt (seconds).
RETRY_DELAY = 2
# fetch loops over range(MAX_RETRIES), so this is the total number of attempts
# per ticker rather than the number of extra retries.
MAX_RETRIES = 3
# NOTE(review): presumably required because the notebook kernel already runs an
# event loop while get_main_data_for_tickers calls loop.run_until_complete() — confirm.
nest_asyncio.apply()
# Browser User-Agent strings; get_main_data_for_tickers picks one with
# random.choice() and sends it with every request of that run.
USER_AGENTS = [
    # Google Chrome
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36',
    # Mozilla Firefox
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:88.0) Gecko/20100101 Firefox/88.0',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:87.0) Gecko/20100101 Firefox/87.0',
    'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:78.0) Gecko/20100101 Firefox/78.0',
    # Safari
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Safari/605.1.15',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.1.2 Safari/605.1.15',
    # Microsoft Edge
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36 Edg/91.0.864.48',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36 Edg/88.0.705.68',
    # Opera
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36 OPR/58.0.3135.127',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36 OPR/60.0.3255.170'
]

async def fetch(session, url, headers, ticker, idx, total):
    """Download one quote page, making up to MAX_RETRIES attempts.

    Args:
        session: HTTP client session exposing ``get(url, headers=..., timeout=...)``
            as an async context manager whose response has an awaitable ``text()``.
        url: page to download.
        headers: request headers sent with every attempt.
        ticker: symbol the page belongs to; echoed back in the result.
        idx: 1-based position of this ticker, used only in the progress message.
        total: number of tickers in the run, used only in the progress message.

    Returns:
        ``(ticker, html)`` on success, or ``(ticker, '')`` once every attempt
        has failed. A tuple is always returned, even when MAX_RETRIES is 0,
        so callers can unpack the result unconditionally.
    """
    for attempt in range(1, MAX_RETRIES + 1):
        try:
            async with session.get(url, headers=headers, timeout=REQUEST_TIMEOUT) as response:
                response_text = await response.text()
                print(f"Processed {idx}/{total}: {ticker}")
                # Pause while still holding the connection so the overall
                # request rate stays throttled.
                await asyncio.sleep(random.uniform(1, 3))  # Sleep between requests
                return ticker, response_text
        except asyncio.TimeoutError:
            problem = f"Timeout for {ticker} on attempt {attempt}."
        except Exception as e:
            problem = f"Error for {ticker} on attempt {attempt}. Reason: {str(e)}."

        if attempt < MAX_RETRIES:
            print(f"{problem} Retrying...")
            # Back off after any kind of failure, not just timeouts, so a
            # failing host is not hit again immediately.
            await asyncio.sleep(RETRY_DELAY)
        else:
            print(f"{problem} Giving up.")

    return ticker, ''

async def fetch_all(ticker_list, headers):
    """Fetch the Yahoo Finance quote page for every ticker in ``ticker_list``.

    Args:
        ticker_list: sequence of ticker symbols.
        headers: request headers shared by all requests.

    Returns:
        A list with one ``fetch`` result per ticker, in the same order as
        ``ticker_list``.
    """
    total = len(ticker_list)
    # Admit at most MAX_CONCURRENT_REQUESTS coroutines into fetch() at a time.
    # Without this gate every request starts immediately and its timeout clock
    # runs while it is merely queued for a pooled connection, so tickers far
    # down a long list can time out before anything is sent.
    slots = asyncio.Semaphore(MAX_CONCURRENT_REQUESTS)

    async def _fetch_when_slot_free(session, ticker, position):
        # Wait for a free slot, then delegate to fetch() unchanged.
        async with slots:
            url = f'https://finance.yahoo.com/quote/{ticker}?p={ticker}'
            return await fetch(session, url, headers, ticker, position, total)

    async with ClientSession(connector=TCPConnector(limit=MAX_CONCURRENT_REQUESTS)) as session:
        tasks = [
            _fetch_when_slot_free(session, ticker, position)
            for position, ticker in enumerate(ticker_list, start=1)
        ]
        # gather() preserves input order regardless of completion order.
        return await asyncio.gather(*tasks)

def extract_main_data(html_content, ticker):
    """Pull market cap and earnings-date bounds out of a quote page.

    Args:
        html_content: raw HTML of the quote page (may be empty).
        ticker: symbol the page belongs to; copied into the result.

    Returns:
        A dict with the keys 'Ticker', 'Market Cap', 'Earnings Date Min' and
        'Earnings Date Max'. Fields that cannot be found are 'N/A'; if parsing
        raises, the error is printed and all three data fields are 'N/A'.
    """
    try:
        document = BeautifulSoup(html_content, 'lxml')

        cap_cell = document.find("td", {"data-test": "MARKET_CAP-value"})
        if cap_cell:
            market_cap = cap_cell.text
        else:
            market_cap = "N/A"

        date_cell = document.find("td", {"data-test": "EARNINGS_DATE-value"})
        if not date_cell:
            earnings_date_min = earnings_date_max = "N/A"
        elif "-" in date_cell.text:
            # A range such as "<start> - <end>": exactly two parts are expected,
            # anything else fails the unpacking and lands in the except branch.
            earnings_date_min, earnings_date_max = (
                piece.strip() for piece in date_cell.text.split("-")
            )
        else:
            # A single date serves as both bounds.
            earnings_date_min = earnings_date_max = date_cell.text
    except Exception as e:
        print(f"Error extracting data for {ticker}. Error: {e}")
        market_cap = earnings_date_min = earnings_date_max = 'N/A'

    return {
        'Ticker': ticker,
        'Market Cap': market_cap,
        'Earnings Date Min': earnings_date_min,
        'Earnings Date Max': earnings_date_max,
    }

def get_main_data_for_tickers(tickers_from_file):
    """Scrape pending tickers and merge the results into the database table.

    Steps: determine which tickers need work, download and parse their quote
    pages, merge the new rows over the stored ones (new rows win on duplicate
    'Ticker'), and rewrite the 'mkt_cap_next_earnings' table.

    Args:
        tickers_from_file: candidate ticker symbols.

    Returns:
        The consolidated DataFrame that was written to the database, or
        ``None`` when there was nothing to process.
    """
    unprocessed_tickers = get_unprocessed_tickers(tickers_from_file)

    if not unprocessed_tickers:
        print("No new tickers to process. Exiting.")
        return

    # One User-Agent is chosen per run and reused for every request.
    headers = {
        'User-Agent': random.choice(USER_AGENTS)
    }

    loop = asyncio.get_event_loop()
    new_results = loop.run_until_complete(fetch_all(unprocessed_tickers, headers))
    new_data_list = [extract_main_data(html, ticker) for ticker, html in new_results]
    new_df = pd.DataFrame(new_data_list)

    conn = sq.connect('../market_data.db')
    try:
        # On the very first run the table does not exist yet. Check for it
        # explicitly so the freshly scraped results are saved rather than lost
        # to a failed SELECT after the whole download has finished.
        table_exists = conn.execute(
            "SELECT 1 FROM sqlite_master WHERE type = 'table' AND name = ?",
            ('mkt_cap_next_earnings',),
        ).fetchone() is not None

        if table_exists:
            current_df = pd.read_sql('SELECT * FROM mkt_cap_next_earnings', conn)
            frames = [current_df, new_df]
        else:
            frames = [new_df]

        # keep='last' lets freshly scraped rows override stored ones.
        consolidated_df = pd.concat(frames).drop_duplicates(subset='Ticker', keep='last')

        # Save the consolidated data back to DB
        consolidated_df.to_sql('mkt_cap_next_earnings', conn, if_exists='replace', index=False)
    finally:
        # Always release the database handle, including on read/write errors.
        conn.close()

    na_counter = consolidated_df[consolidated_df['Market Cap'] == 'N/A'].shape[0]
    print(f"Number of tickers with 'N/A' Market Cap: {na_counter}")

    return consolidated_df


In [105]:
# Determine the pending tickers, then randomise their order.
# random.sample() builds a shuffled copy, whereas random.shuffle() reorders in
# place; a copy guarantees the list loaded from the spreadsheet is never
# reordered as a side effect, even if the helper returns that same list object.
pending_tickers = get_unprocessed_tickers(tickers_from_file)
tickers_to_process = random.sample(pending_tickers, k=len(pending_tickers))
# NOTE: the result is None when there was nothing to process.
df = get_main_data_for_tickers(tickers_to_process)

Processed 15/3850: AVMU
Processed 7/3850: ENRG
Processed 2/3850: CQQQ
Processed 11/3850: PFLD
Processed 8/3850: BNKD
Processed 14/3850: IUS
Processed 18/3850: IWD
Processed 3/3850: SCHZ
Processed 1/3850: OBOR
Processed 17/3850: XC
Processed 13/3850: PWZ
Processed 10/3850: FXB
Processed 5/3850: IVES
Processed 20/3850: BSMS
Processed 9/3850: QLD
Processed 16/3850: SCYB
Processed 21/3850: BBIP
Processed 19/3850: MEDX
Processed 4/3850: WKSP
Processed 6/3850: CLOZ
Processed 22/3850: QPX
Processed 23/3850: FXA
Processed 24/3850: TWIO
Processed 25/3850: UST
Processed 30/3850: GDX
Processed 33/3850: FDNI
Processed 36/3850: EAOR
Processed 31/3850: MFV
Processed 35/3850: ICOW
Processed 27/3850: JULZ
Processed 29/3850: BSMO
Processed 26/3850: QVMM
Processed 28/3850: RODE
Processed 34/3850: FPAG
Processed 38/3850: FLNC
Processed 39/3850: ECLN
Processed 40/3850: JULT
Processed 41/3850: STXT
Processed 42/3850: BBUS
Processed 32/3850: ORTX
Processed 44/3850: BWEB
Processed 37/3850: GRID
Processed 45/