In [25]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
import time
import sqlite3 as sq

In [26]:
# Load the ticker universe. Despite the variable name, the source is an
# Excel workbook, not a CSV file.
tickers_csv = pd.read_excel('Copy of NYSE_Stocks.xlsx')

# Clean the 'Symbol' column before using it:
#   * dropna          - blank cells would otherwise reach the URL template as "nan"
#   * astype/strip    - guarantee plain strings without stray whitespace
#   * drop_duplicates - keeps first-seen order, so the count printed below
#                       really is a count of *unique* tickers and no symbol
#                       is requested twice downstream
tickers_list = (
    tickers_csv['Symbol']
    .dropna()
    .astype(str)
    .str.strip()
    .loc[lambda symbols: symbols.ne("")]
    .drop_duplicates()
    .tolist()
)

ticker_count = len(tickers_list)
print(f"Number of unique tickers: {ticker_count}")

Number of unique tickers: 8970


In [27]:
def _split_earnings_date(raw_text):
    """Split the text of an earnings-date cell into ``(earliest, latest)``.

    A range written with a spaced separator (``"A - B"``) is split on the
    first ``" - "``, so hyphens inside either date are left alone. Text with
    exactly one bare hyphen (``"A-B"``) is split on that hyphen. Anything
    else is treated as a single date and returned for both ends. Surrounding
    whitespace is stripped from every returned value. Never raises for
    string input.
    """
    text = raw_text.strip()
    if " - " in text:
        earliest, _, latest = text.partition(" - ")
        return earliest.strip(), latest.strip()
    if text.count("-") == 1:
        earliest, _, latest = text.partition("-")
        return earliest.strip(), latest.strip()
    return text, text


def extract_main_data(html_content, ticker):
    """Parse one quote page and return its market cap and earnings dates.

    The page is searched for the ``<td>`` cells whose ``data-test`` attribute
    is ``MARKET_CAP-value`` and ``EARNINGS_DATE-value``.

    Args:
        html_content: HTML markup of the quote page.
        ticker: Symbol the page belongs to; copied into the 'Ticker' column
            and used in the error message.

    Returns:
        A one-row DataFrame with the columns 'Ticker', 'Market Cap',
        'Earnings Date Min' and 'Earnings Date Max'. A cell that is missing
        from the page is reported as the string "N/A". If parsing fails for
        any reason, the error is printed and an empty DataFrame with the same
        columns is returned instead (best effort - this function does not
        raise).
    """
    columns = ['Ticker', 'Market Cap', 'Earnings Date Min', 'Earnings Date Max']
    try:
        soup = BeautifulSoup(html_content, 'html.parser')

        # Extract Market Cap
        market_cap_tag = soup.find("td", {"data-test": "MARKET_CAP-value"})
        market_cap = market_cap_tag.text if market_cap_tag is not None else "N/A"

        # Extract Earnings Date. The cell holds either one date or a range;
        # the helper never raises, so an unexpected date format can no longer
        # cause the whole row (market cap included) to be discarded.
        earnings_date_tag = soup.find("td", {"data-test": "EARNINGS_DATE-value"})
        if earnings_date_tag is not None:
            earnings_date_min, earnings_date_max = _split_earnings_date(earnings_date_tag.text)
        else:
            earnings_date_min, earnings_date_max = "N/A", "N/A"

        return pd.DataFrame({
            'Ticker': [ticker],
            'Market Cap': [market_cap],
            'Earnings Date Min': [earnings_date_min],
            'Earnings Date Max': [earnings_date_max],
        })
    except Exception as e:
        print(f"Error extracting data for {ticker}. Error: {e}")
        return pd.DataFrame(columns=columns)

def get_main_data_for_tickers(ticker_list, sleep_duration=5, timeout=30):
    """Download the Yahoo Finance quote page for each ticker and collect the parsed rows.

    Args:
        ticker_list: Iterable of ticker symbols; each one is substituted into
            the quote URL.
        sleep_duration: Seconds to pause between consecutive requests. No
            pause is made before the first request or after the last one.
        timeout: Seconds passed to ``requests.get`` as its ``timeout`` so a
            stalled connection cannot hang the whole run.

    Returns:
        A DataFrame made of the frames returned by ``extract_main_data`` for
        every ticker whose page could be downloaded, with a fresh 0..n-1
        index. Tickers whose request raised are reported with ``print`` and
        left out. If nothing was collected, an empty DataFrame is returned.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    base_url = 'https://finance.yahoo.com/quote/{}?p={}'

    # Collect the per-ticker frames and concatenate once at the end; calling
    # pd.concat inside the loop re-copies every earlier row on each iteration.
    frames = []

    for position, ticker in enumerate(ticker_list):
        # Throttle between requests only.
        if position:
            time.sleep(sleep_duration)

        url = base_url.format(ticker, ticker)
        try:
            response = requests.get(url, headers=headers, timeout=timeout)
        except requests.RequestException as e:
            # A single network failure must not throw away everything that
            # has been scraped so far: report it and move on.
            print(f"Error fetching data for {ticker}. Error: {e}")
            continue

        if not response.ok:
            # The body is still parsed, so the ticker keeps its row; the
            # warning explains why its values may come back as "N/A".
            print(f"Warning: HTTP {response.status_code} for {ticker}; parsing the response anyway.")

        frames.append(extract_main_data(response.text, ticker))

    if not frames:
        # pd.concat raises on an empty list.
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True)

In [28]:
# Scrape every ticker in `tickers_list`. The function pauses between requests
# (5 seconds by default), so this cell is slow for a long list.
df = get_main_data_for_tickers(ticker_list=tickers_list)
# Bare last expression: the notebook renders the resulting frame.
df


In [None]:
# Connect to SQLite database (will be created if doesn't exist)
conn = sq.connect('main_data.db')
try:
    # Write the scraped rows to the "main_data" table. if_exists='replace'
    # drops any existing table of that name first; index=False keeps the
    # DataFrame index out of the table.
    df.to_sql('main_data', conn, if_exists='replace', index=False)
finally:
    # Close the database connection even when the write above fails, so the
    # database file is not left held open by the kernel.
    conn.close()