In [6]:
import asyncio
import sqlite3 as sq
from io import StringIO

import aiohttp
import nest_asyncio
import pandas as pd
from aiohttp import ClientSession, TCPConnector
# Patch asyncio so loop.run_until_complete() can be called from inside the
# notebook (presumably because the Jupyter kernel already runs an event loop).
nest_asyncio.apply()


# Number of tickers handed to fetch_chunk() per batch.
CHUNK_SIZE = 100
# Maximum number of attempts made per ticker before giving up.
MAX_RETRIES = 3
# Pause between attempts, passed to asyncio.sleep() (seconds).
RETRY_DELAY = 2
# Connection limit given to aiohttp's TCPConnector for each chunk's session.
MAX_CONCURRENT_REQUESTS = 20

In [7]:
# Load the ticker universe from the Excel sheet. Forward slashes avoid the
# invalid "\s" / "\C" escape sequences of the previous Windows-style literal
# and resolve on every OS (matching the '../market_data.db' path used later).
tickers_csv = pd.read_excel('../src/Copy of NYSE_Stocks.xlsx')

# Drop blank cells and repeated symbols (first-seen order is kept) so the
# count printed below really is the number of *unique* tickers and no "nan"
# ends up interpolated into a request URL.
tickers_list = tickers_csv['Symbol'].dropna().drop_duplicates().tolist()

ticker_count = len(tickers_list)
print(f"Number of unique tickers: {ticker_count}")

  self._parser.feed(data)


Number of unique tickers: 8970


In [8]:
def _format_earnings_table(table, ticker):
    """Return a renamed copy of an 'Earnings History' table, led by a Ticker column.

    The first five columns are renamed positionally to
    ``Metrics, Q-4, Q-3, Q-2, Q-1``; any further columns keep their names.
    The parsed input table is left untouched.
    """
    quarter_labels = ['Metrics', 'Q-4', 'Q-3', 'Q-2', 'Q-1']
    # zip() stops at the shorter input, so a table with fewer than five
    # columns is renamed as far as it goes instead of raising IndexError.
    formatted = table.rename(columns=dict(zip(table.columns, quarter_labels)))
    formatted.insert(0, 'Ticker', ticker)
    return formatted


async def get_earnings_history_for_ticker(session, ticker, current, total, headers):
    """Download and parse the 'Earnings History' table for one ticker.

    Args:
        session: Open aiohttp client session used to issue the GET request.
        ticker: Symbol interpolated into the Yahoo Finance analysis URL.
        current: Position of this ticker within its batch (progress output only).
        total: Size of the batch (progress output only).
        headers: HTTP headers sent with the request.

    Returns:
        A DataFrame with columns ``Ticker, Metrics, Q-4, Q-3, Q-2, Q-1``, or an
        empty DataFrame when the page has no 'Earnings History' table or every
        attempt failed. The function never returns ``None``.
    """
    print(f"Processing {ticker} {current}/{total}")
    url = f'https://finance.yahoo.com/quote/{ticker}/analysis?p={ticker}'

    for attempt in range(1, MAX_RETRIES + 1):
        try:
            async with session.get(url, headers=headers) as response:
                # Surface HTTP errors (rate limiting, server errors) as
                # exceptions so they go through the retry path below instead
                # of having an error page parsed as if it were data.
                response.raise_for_status()
                response_text = await response.text()

            # Passing literal HTML to read_html is deprecated; wrap it in a
            # file-like object. Raises ValueError when the page has no tables.
            tables = pd.read_html(StringIO(response_text))
        except Exception as e:
            if attempt == MAX_RETRIES:
                print(f"Error for {ticker} on attempt {attempt}. Reason: {e}. Giving up.")
                return pd.DataFrame()
            print(f"Error for {ticker} on attempt {attempt}. Reason: {e}. Retrying...")
            await asyncio.sleep(RETRY_DELAY)
            continue

        for table in tables:
            if 'Earnings History' in table.columns:
                return _format_earnings_table(table, ticker)

        # The page loaded and parsed but carries no earnings table; asking
        # again would return the same page, so do not retry.
        return pd.DataFrame()

    # Only reachable when MAX_RETRIES is zero or negative.
    return pd.DataFrame()

async def fetch_chunk(ticker_chunk, headers):
    """Fetch the earnings history of every ticker in one chunk concurrently.

    A single client session, whose connector is capped at
    MAX_CONCURRENT_REQUESTS connections, is shared by all requests of the
    chunk. Results are returned as a list in the same order as ``ticker_chunk``.
    """
    connector = TCPConnector(limit=MAX_CONCURRENT_REQUESTS)
    async with ClientSession(connector=connector) as session:
        chunk_total = len(ticker_chunk)
        pending = []
        # Positions are 1-based and relative to this chunk (progress output).
        for position, ticker in enumerate(ticker_chunk, start=1):
            pending.append(
                get_earnings_history_for_ticker(session, ticker, position, chunk_total, headers)
            )
        return await asyncio.gather(*pending)

def get_data_for_tickers(ticker_list):
    """Scrape the earnings history for every ticker and combine the results.

    The tickers are processed in batches of CHUNK_SIZE; each batch is fetched
    concurrently by fetch_chunk() on the current event loop.

    Args:
        ticker_list: Sequence of ticker symbols.

    Returns:
        One DataFrame holding the rows of every ticker that produced data, or
        an empty DataFrame when there was nothing to fetch or nothing came back.
    """
    # Nothing to fetch: return early. Without this guard pd.concat() on an
    # empty list would raise "No objects to concatenate".
    if len(ticker_list) == 0:
        return pd.DataFrame()

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    loop = asyncio.get_event_loop()

    fetched_frames = []
    for start in range(0, len(ticker_list), CHUNK_SIZE):
        ticker_chunk = ticker_list[start:start + CHUNK_SIZE]
        chunk_data = loop.run_until_complete(fetch_chunk(ticker_chunk, headers))
        fetched_frames.extend(chunk_data)

    # Failed or table-less tickers come back as None / empty frames; keep only
    # frames that actually carry rows so concat never sees an all-empty input.
    usable_frames = [
        frame for frame in fetched_frames
        if frame is not None and not frame.empty
    ]
    if not usable_frames:
        return pd.DataFrame()

    return pd.concat(usable_frames, ignore_index=True)


In [9]:
# Scrape the earnings history for every loaded ticker (long-running: at least
# one HTTP request per ticker), then show the combined frame as the cell output.
df = get_data_for_tickers(ticker_list=tickers_list)
df

Processing A 1/100
Processing AA 2/100
Processing AAA 3/100
Processing AAAU 4/100
Processing AAC 5/100
Processing AACG 6/100
Processing AACI 7/100
Processing AACT 8/100
Processing AADI 9/100
Processing AADR 10/100
Processing AAIC 11/100
Processing AAL 12/100
Processing AAMC 13/100
Processing AAME 14/100
Processing AAN 15/100
Processing AAOI 16/100
Processing AAON 17/100
Processing AAP 18/100
Processing AAPB 19/100
Processing AAPD 20/100
Processing AAPL 21/100
Processing AAPU 22/100
Processing AAT 23/100
Processing AAXJ 24/100
Processing AB 25/100
Processing ABBV 26/100
Processing ABC 27/100
Processing ABCB 28/100
Processing ABCL 29/100
Processing ABCM 30/100
Processing ABEO 31/100
Processing ABEQ 32/100
Processing ABEV 33/100
Processing ABG 34/100
Processing ABIO 35/100
Processing ABL 36/100
Processing ABM 37/100
Processing ABNB 38/100
Processing ABOS 39/100
Processing ABR 40/100
Processing ABSI 41/100
Processing ABT 42/100
Processing ABUS 43/100
Processing ABVC 44/100
Processing AC 45

KeyboardInterrupt: 

Error for ALCC on attempt 2. Reason: Session is closed. Retrying...


: 

In [None]:
# Persist the scraped earnings data to SQLite (the database file is created
# if it doesn't exist).
if df.empty:
    # if_exists='replace' drops the existing table first, so writing an empty
    # frame would wipe previously stored earnings.
    print("No earnings data to store; leaving the 'earnings' table untouched.")
else:
    conn = sq.connect('../market_data.db')
    try:
        # Replace the "earnings" table with the freshly scraped data.
        df.to_sql('earnings', conn, if_exists='replace', index=False)
    finally:
        # Close the database connection even when the write fails.
        conn.close()