In [6]:
# Load US listed stock symbols and their listing market/exchange from NASDAQ Trader files
import pandas as pd

# Symbol-directory files read below; both are parsed as pipe-delimited text.
nasdaq_url = "https://ftp.nasdaqtrader.com/dynamic/SymDir/nasdaqlisted.txt"
other_url = "https://ftp.nasdaqtrader.com/dynamic/SymDir/otherlisted.txt"

# Read every column as str so no value is coerced to a number on load.
nasdaq_raw, other_raw = (
    pd.read_csv(listing_url, sep="|", dtype=str)
    for listing_url in (nasdaq_url, other_url)
)

nasdaq_raw.head(), other_raw.head()

URLError: <urlopen error [Errno 110] Connection timed out>

In [4]:
# Build a US stock universe (symbols + market) from Wikipedia index constituents (fallback when NASDAQ Trader FTP is unreachable)
# Then we'll enrich industry using yfinance in a later cell.
import pandas as pd
import io
import requests

# Wikipedia pages used as the source of index constituents (one page per index).
sp500_url = "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies"
nasdaq100_url = "https://en.wikipedia.org/wiki/Nasdaq-100"
dow30_url = "https://en.wikipedia.org/wiki/Dow_Jones_Industrial_Average"

# Wikipedia often blocks default Python user agents; fetch HTML with a browser-like UA then parse.
# These headers are passed to requests.get by read_html_url.
WIKI_HEADERS = {
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0 Safari/537.36",
    "Accept-Language": "en-US,en;q=0.9",
}

def read_html_url(url: str, **read_html_kwargs):
    """Download ``url`` with the ``WIKI_HEADERS`` request headers and parse its HTML tables.

    Args:
        url: Page to fetch.
        **read_html_kwargs: Forwarded unchanged to ``pd.read_html``
            (for example ``match="Symbol"``).

    Returns:
        Whatever ``pd.read_html`` returns for the downloaded page.

    Raises:
        requests.HTTPError: If the response status is an error
            (raised by ``raise_for_status``).
    """
    response = requests.get(url, headers=WIKI_HEADERS, timeout=30)
    response.raise_for_status()
    # Hand pandas the raw bytes so decoding is left to the HTML parser.
    html_bytes = io.BytesIO(response.content)
    return pd.read_html(html_bytes, **read_html_kwargs)

# S&P 500: use the first table that read_html matched on "Symbol".
# The "NYSE/NASDAQ" label is a mixed placeholder; it can be refined via yfinance later if needed.
sp500_tbls = read_html_url(sp500_url, match="Symbol")
sp500 = (
    sp500_tbls[0]
    .rename(columns={"Symbol": "symbol", "Security": "name"})
    .assign(market="NYSE/NASDAQ")
    [["symbol", "name", "market"]]
)

# NASDAQ-100: the page holds several tables; the constituents table is the one
# with a 'Ticker' column plus a 'Company' or 'Name' column.
ndx_tbls = read_html_url(nasdaq100_url)
ndx = None
for t in ndx_tbls:
    # str(c) per label (rather than columns.astype(str)) keeps the scan working
    # when a table has multi-level headers, i.e. MultiIndex columns.
    cols = [str(c).lower() for c in t.columns]
    if "ticker" in cols and ("company" in cols or "name" in cols):
        ndx = t.copy()
        break
if ndx is None:
    raise ValueError("Could not find NASDAQ-100 constituents table with a Ticker column")

# Normalize column names with the same rule the scan used, so the lookups
# below see exactly the labels that made the table match.
ndx.columns = [str(c).lower() for c in ndx.columns]
# The scan guarantees that at least one of 'company' / 'name' is present.
name_col = "company" if "company" in ndx.columns else "name"
# Select before renaming: if the table had both 'company' and 'name',
# renaming first would leave two columns called 'name'.
ndx = (
    ndx[["ticker", name_col]]
    .rename(columns={"ticker": "symbol", name_col: "name"})
    .assign(market="NASDAQ")
)
ndx = ndx[["symbol", "name", "market"]]

# Dow 30 constituents table: pick the first table that has a 'symbol' column
# and a column whose header mentions 'company' or 'component'.
dow_tbls = read_html_url(dow30_url)
dow = None
for t in dow_tbls:
    # str(c) per label (rather than columns.astype(str)) keeps the scan working
    # when a table has multi-level headers, i.e. MultiIndex columns.
    cols = [str(c).lower() for c in t.columns]
    if "symbol" in cols and any("company" in c or "component" in c for c in cols):
        dow = t.copy()
        break
if dow is None:
    # fallback: just take first table and try common columns
    dow = dow_tbls[0].copy()

# Lower-case the headers, then locate the symbol and name columns.
dow.columns = [str(c).lower() for c in dow.columns]
sym_col = next((c for c in ("symbol", "ticker") if c in dow.columns), None)
# Prefer an exact header name ...
name_col = next((c for c in ("company", "component", "name") if c in dow.columns), None)
if name_col is None:
    # ... otherwise apply the same substring rule as the scan above, so a table
    # accepted there (e.g. header "company name") is not rejected here.
    name_col = next(
        (c for c in dow.columns if c != sym_col and ("company" in c or "component" in c)),
        None,
    )
if sym_col is None or name_col is None:
    raise ValueError(f"Could not normalize Dow table columns: {dow.columns.tolist()}")

# Select before renaming so a pre-existing 'name' column cannot end up duplicated.
dow = (
    dow[[sym_col, name_col]]
    .rename(columns={sym_col: "symbol", name_col: "name"})
    .assign(market="NYSE/NASDAQ")
)
dow = dow[["symbol", "name", "market"]]

# Combine and de-duplicate
combined = pd.concat([sp500, ndx, dow], ignore_index=True)

# Record missing values BEFORE casting: astype(str) turns NaN into the literal
# string "nan", after which dropna() can no longer detect it.
has_symbol = combined["symbol"].notna()
has_name = combined["name"].notna()

symbol_text = combined["symbol"].astype(str).str.strip().str.replace(".", "-", regex=False)  # BRK.B -> BRK-B
name_text = combined["name"].astype(str).str.strip().where(has_name)  # missing names stay NaN

combined = combined.assign(symbol=symbol_text, name=name_text)
# Drop rows with no usable symbol (missing, or blank after stripping).
combined = combined[has_symbol & (combined["symbol"] != "")]

# Symbols that arrived with the specific "NASDAQ" label (the NASDAQ-100 rows).
# keep="first" below would otherwise keep only the generic "NYSE/NASDAQ" label
# for any of them that also appears in an earlier frame.
nasdaq_symbols = set(combined.loc[combined["market"] == "NASDAQ", "symbol"])

universe = combined.drop_duplicates(subset=["symbol"], keep="first").reset_index(drop=True)
universe = universe.assign(
    market=universe["market"].mask(universe["symbol"].isin(nasdaq_symbols), "NASDAQ")
)

universe.head(), universe.shape

(  symbol                 name       market
 0    MMM                   3M  NYSE/NASDAQ
 1    AOS          A. O. Smith  NYSE/NASDAQ
 2    ABT  Abbott Laboratories  NYSE/NASDAQ
 3   ABBV               AbbVie  NYSE/NASDAQ
 4    ACN            Accenture  NYSE/NASDAQ,
 (517, 3))

In [5]:
# Enrich the universe with industry (and sector) using yfinance; then build an index-by-name CSV
import time
import numpy as np
import yfinance as yf

# Work on a copy to avoid mutating upstream data unexpectedly
univ = universe.copy()

# yfinance can be rate-limited; keep batches modest
symbols = univ["symbol"].astype(str).unique().tolist()

industry_map = {}
sector_map = {}
failed_symbols = []  # (symbol, repr(exception)) for every lookup that raised

batch_size = 50
sleep_s = 1.0

for i in range(0, len(symbols), batch_size):
    batch = symbols[i:i+batch_size]
    tickers = yf.Tickers(" ".join(batch))
    # tickers.tickers is a dict symbol->Ticker
    for sym, tkr in tickers.tickers.items():
        try:
            # .info is the only lookup needed here: it carries industry and sector.
            full = tkr.info or {}
            industry_map[sym] = full.get("industry")
            sector_map[sym] = full.get("sector")
        except Exception as exc:
            # Best effort: leave this symbol blank, but remember the failure
            # so it is reported below instead of disappearing silently.
            industry_map[sym] = None
            sector_map[sym] = None
            failed_symbols.append((sym, repr(exc)))
    # Pause between batches only; nothing follows the final batch.
    if i + batch_size < len(symbols):
        time.sleep(sleep_s)

if failed_symbols:
    print(f"yfinance lookup failed for {len(failed_symbols)} of {len(symbols)} symbols; first few: {failed_symbols[:5]}")

univ["industry"] = univ["symbol"].map(industry_map)
univ["sector"] = univ["symbol"].map(sector_map)

# Export frame indexed by stock name.
# Names can repeat (e.g., share classes / similar names); num_dup_names counts
# how many distinct names occur more than once.
name_dupes = univ["name"].value_counts()
num_dup_names = int(name_dupes.gt(1).sum())

# Every row is kept even when names repeat, so the resulting index is not
# guaranteed to be unique.
out = (
    univ[["name", "industry", "market", "symbol", "sector"]]
    .copy()
    .sort_values(["name", "symbol"], kind="stable")
    .set_index("name")
)

csv_path = "stocks_market_industry_indexed_by_name.csv"
out.to_csv(csv_path)

csv_path, out.shape, num_dup_names, out.head(10)

('stocks_market_industry_indexed_by_name.csv',
 (517, 4),
 0,
                                                 industry       market symbol  \
 name                                                                           
 3M                                         Conglomerates  NYSE/NASDAQ    MMM   
 A. O. Smith               Specialty Industrial Machinery  NYSE/NASDAQ    AOS   
 AES Corporation                  Utilities - Diversified  NYSE/NASDAQ    AES   
 APA Corporation                            Oil & Gas E&P  NYSE/NASDAQ    APA   
 ASML Holding         Semiconductor Equipment & Materials       NASDAQ   ASML   
 AT&T                                    Telecom Services  NYSE/NASDAQ      T   
 AbbVie                      Drug Manufacturers - General  NYSE/NASDAQ   ABBV   
 Abbott Laboratories                      Medical Devices  NYSE/NASDAQ    ABT   
 Accenture                Information Technology Services  NYSE/NASDAQ    ACN   
 Adobe Inc.                        Software - A