In [9]:
import sqlite3 as sq
import time
from io import StringIO

import pandas as pd
import requests

In [10]:
# Load the ticker universe from the Excel workbook.
# Forward slashes keep the path portable across operating systems and avoid the
# invalid "\s" / "\C" escape sequences of a backslash-separated string literal.
tickers_csv = pd.read_excel('../src/Copy of NYSE_Stocks.xlsx')
tickers_list= tickers_csv['Symbol'].tolist()

# NOTE(review): this is the row count of the Symbol column; duplicates are not
# removed, so "unique" in the message only holds if the sheet has no repeats.
ticker_count = len(tickers_list)
print(f"Number of unique tickers: {ticker_count}")

Number of unique tickers: 8970


In [11]:
def get_earnings_history_for_ticker(ticker, current, total, timeout=10):
    """Scrape the "Earnings History" table for one ticker from Yahoo Finance.

    Parameters
    ----------
    ticker : str
        Symbol interpolated into the Yahoo Finance analysis URL.
    current : int
        Position of this ticker in the overall run (used for the progress line only).
    total : int
        Total number of tickers in the run (used for the progress line only).
    timeout : float, optional
        Seconds to wait for the HTTP request before giving up. Defaults to 10.

    Returns
    -------
    pandas.DataFrame
        The earnings table with columns ``Ticker, Metrics, Q-4, Q-3, Q-2, Q-1``
        (any further columns of the scraped table keep their original names),
        or an empty DataFrame when the request fails, the page contains no
        tables, or no table has an "Earnings History" column.
    """
    print(f"Processing {ticker} {current}/{total}")

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    url = f'https://finance.yahoo.com/quote/{ticker}/analysis?p={ticker}'

    # A timeout plus explicit error handling keeps one unreachable page from
    # hanging or aborting a run over the whole ticker list.
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException as exc:
        print(f"Request failed for ticker {ticker}: {exc}")
        return pd.DataFrame()

    try:
        # Wrap the HTML in a file-like object: passing a literal HTML string
        # to read_html is deprecated in recent pandas releases.
        tables = pd.read_html(StringIO(response.text))
    except ValueError:
        # read_html raises ValueError when the document contains no tables.
        print(f"No tables found for ticker {ticker}")
        return pd.DataFrame()

    # Standard names for the first five columns of the earnings table.
    standard_names = ['Metrics', 'Q-4', 'Q-3', 'Q-2', 'Q-1']

    for table in tables:
        if 'Earnings History' not in table.columns:
            continue

        if len(table.columns) < len(standard_names):
            # Positional renaming below needs five columns; skip odd layouts
            # instead of failing with an IndexError.
            print(f"Unexpected Earnings History layout for ticker {ticker}")
            continue

        # Rename columns to standard quarter names (returns a new frame).
        columns_mapping = dict(zip(table.columns[:len(standard_names)], standard_names))
        earnings = table.rename(columns=columns_mapping)
        earnings['Ticker'] = ticker

        # Make 'Ticker' the first column, keeping the others in order.
        ordered = ['Ticker'] + [col for col in earnings.columns if col != 'Ticker']
        return earnings[ordered]

    return pd.DataFrame()  # Return empty dataframe if table not found

def get_data_for_tickers(ticker_list):
    """Collect the earnings-history tables of several tickers into one frame.

    Calls ``get_earnings_history_for_ticker`` once per entry of ``ticker_list``
    (passing a 1-based position and the list length for progress reporting)
    and stacks the non-empty results with a fresh 0..n-1 index.

    Parameters
    ----------
    ticker_list : list
        Ticker symbols to process, in order.

    Returns
    -------
    pandas.DataFrame
        All collected rows, or an empty DataFrame when the list is empty or
        no ticker produced any data.
    """
    total_tickers = len(ticker_list)

    # Gather the per-ticker frames first and concatenate once at the end;
    # concatenating inside the loop re-copies all accumulated rows each time.
    frames = []
    for idx, ticker in enumerate(ticker_list, start=1):
        frame = get_earnings_history_for_ticker(ticker, idx, total_tickers)
        # Skip empty results so they cannot influence the combined dtypes.
        if not frame.empty:
            frames.append(frame)

    if not frames:
        # pd.concat raises on an empty list, so return the empty frame directly.
        return pd.DataFrame()

    return pd.concat(frames, ignore_index=True)

In [12]:
# Scrape the earnings history for every symbol in tickers_list.
# NOTE(review): the recorded output of this cell reports "x/20", while
# tickers_list is built from the whole Symbol column — presumably the list was
# shortened in a cell that is no longer present; confirm before a fresh run.
df = get_data_for_tickers(tickers_list)

# Preview the first rows only instead of rendering the entire frame.
df.head(10)

Processing A 1/20
Processing AA 2/20
Processing AAA 3/20
Processing AAAU 4/20
Processing AAC 5/20
No tables found for ticker AAC
Processing AACG 6/20
Processing AACI 7/20
No tables found for ticker AACI
Processing AACT 8/20
No tables found for ticker AACT
Processing AADI 9/20
Processing AADR 10/20
Processing AAIC 11/20
Processing AAL 12/20
Processing AAMC 13/20
No tables found for ticker AAMC
Processing AAME 14/20
Processing AAN 15/20
Processing AAOI 16/20
Processing AAON 17/20
Processing AAP 18/20
Processing AAPB 19/20
Processing AAPD 20/20


Unnamed: 0,Ticker,Metrics,Q-4,Q-3,Q-2,Q-1
0,A,EPS Est.,1.39,1.3,1.26,1.36
1,A,EPS Actual,1.53,1.37,1.27,1.43
2,A,Difference,0.14,0.07,0.01,0.07
3,A,Surprise %,10.10%,5.40%,0.80%,5.10%
4,AA,EPS Est.,0.19,-0.92,-0.11,-0.52
5,AA,EPS Actual,-0.33,-0.7,-0.23,-0.35
6,AA,Difference,-0.52,0.22,-0.12,0.17
7,AA,Surprise %,-273.70%,23.90%,-109.10%,32.70%
8,AACG,EPS Est.,,-0.15,-0.27,-0.25
9,AACG,EPS Actual,,-0.13,-0.01,0.08


In [13]:
# Connect to SQLite database (will be created if doesn't exist)
conn = sq.connect('../market_data.db')

try:
    # Write the data to the "earnings" table, replacing the table if it already exists
    df.to_sql('earnings', conn, if_exists='replace', index=False)
finally:
    # Close the database connection even if the write above fails
    conn.close()

In [14]:
# Optional: also export the earnings table to an Excel file (uses the xlsxwriter engine).
# df.to_excel('earnings.xlsx', engine='xlsxwriter')