In [None]:
# In fetch_market_data.py

import pandas as pd
import requests
from bs4 import BeautifulSoup
import os
import logging
import io
import yfinance as yf
from tqdm import tqdm
import time # Import the time module

# --- Configuration ---
# Root logger: INFO verbosity, each line prefixed with timestamp and level.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# Folder that receives the generated CSV; created up front when missing.
OUTPUT_DIR = "index_market_data"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# --- Ticker Scraping Engine ---

def _clean_lse_ticker(symbol):
    """Turn a scraped London Stock Exchange code into a '.L'-suffixed symbol.

    Codes that already end in '.L' are returned unchanged. Otherwise a
    trailing dot is dropped ('BP.' -> 'BP.L') and an inner dot becomes a dash
    ('BT.A' -> 'BT-A.L'), mirroring the dot-to-dash rule used for the US
    indices in INDEX_CONFIG.
    """
    if symbol.endswith('.L'):
        return symbol
    return f"{symbol.rstrip('.').replace('.', '-')}.L"


# One entry per index. Each entry tells the Wikipedia scraper where to look:
#   name             - label used in log messages and in the output "Index" column
#   url              - page that holds the constituents table
#   table_identifier - attributes used to locate the <table> element on that page
#   ticker_column    - header of the column that contains the symbols
#   clean_fn         - maps a scraped symbol to the symbol requested from yfinance
INDEX_CONFIG = {
    "sp500": {
        "name": "S&P 500",
        "url": "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies",
        "table_identifier": {'id': 'constituents'},
        "ticker_column": "Symbol",
        "clean_fn": lambda s: s.replace('.', '-'),
    },
    "dowjones": {
        "name": "Dow Jones Industrial Average",
        "url": "https://en.wikipedia.org/wiki/Dow_Jones_Industrial_Average",
        "table_identifier": {'class': 'wikitable'},
        "ticker_column": "Symbol",
        "clean_fn": lambda s: s,
    },
    "nasdaq100": {
        "name": "NASDAQ 100",
        "url": "https://en.wikipedia.org/wiki/Nasdaq-100",
        "table_identifier": {'id': 'constituents'},
        "ticker_column": "Ticker",
        "clean_fn": lambda s: s.replace('.', '-'),
    },
    "nifty50": {
        "name": "Nifty 50",
        "url": "https://en.wikipedia.org/wiki/NIFTY_50",
        "table_identifier": {'class': 'wikitable sortable'},
        "ticker_column": "Symbol",
        "clean_fn": lambda s: f"{s}.NS",
    },
    "ftse100": {
        "name": "FTSE 100",
        "url": "https://en.wikipedia.org/wiki/FTSE_100_Index",
        "table_identifier": {'id': 'constituents'},
        "ticker_column": "Ticker",
        # Previously any code containing a dot (e.g. 'BT.A', 'BP.') was passed
        # through without the '.L' suffix.
        "clean_fn": _clean_lse_ticker,
    },
    # NOTE(review): confirm that this page and its 'constituents' table exist;
    # the scraper logs an error and yields no tickers for the index otherwise.
    "russell2000": {
        "name": "Russell 2000",
        "url": "https://en.wikipedia.org/wiki/List_of_Russell_2000_companies",
        "table_identifier": {'id': 'constituents'},
        "ticker_column": "Ticker",
        "clean_fn": lambda s: s.replace('.', '-'),
    },
}

# Generic Wikipedia scraper shared by every entry in INDEX_CONFIG.
def get_tickers_from_wikipedia(config):
    """Scrape the constituent tickers of one index from its Wikipedia page.

    Args:
        config: Mapping with the keys 'name', 'url', 'table_identifier',
            'ticker_column' and 'clean_fn' (see INDEX_CONFIG).

    Returns:
        A list of ticker strings after 'clean_fn' has been applied. An empty
        list is returned when the table or ticker column cannot be found or
        when any error occurs while downloading/parsing; failures are logged
        and never raised.
    """
    name = config['name']
    url = config['url']
    logging.info(f"Scraping {name} tickers from Wikipedia...")
    try:
        # An explicit User-Agent is sent with the request.
        headers = {'User-Agent': 'Mozilla/5.0'}
        response = requests.get(url, headers=headers, timeout=15)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'lxml')
        table = soup.find('table', config['table_identifier'])
        if table is None:
            logging.error(f"Could not find the specified table for {name}.")
            return []
        df = pd.read_html(io.StringIO(str(table)))[0]
        # Flatten multi-row headers into single "level0_level1" column names.
        if isinstance(df.columns, pd.MultiIndex):
            df.columns = ['_'.join(map(str, col)).strip() for col in df.columns.values]
        ticker_col = config['ticker_column']
        if ticker_col not in df.columns:
            logging.error(f"Ticker column '{ticker_col}' not found for {name}. Available columns: {df.columns.tolist()}")
            return []
        # Blank cells are parsed as NaN; drop them before the string cast so
        # they do not turn into the literal ticker "nan". Surrounding
        # whitespace is stripped and empty results are skipped.
        symbols = df[ticker_col].dropna().astype(str).str.strip()
        clean = config['clean_fn']
        tickers = [clean(symbol) for symbol in symbols if symbol]
        logging.info(f"SUCCESS: Found {len(tickers)} tickers for {name}.")
        return tickers
    except Exception as e:
        # Best effort: one failing index must not abort the whole run.
        logging.error(f"An error occurred while scraping {name}: {e}")
        return []

# The Russell 2000 is scraped through INDEX_CONFIG like every other index, so no dedicated get_russell_2000_tickers() function is needed.

def get_usd_conversion_rates(currencies):
    """Look up a to-USD conversion rate for every distinct currency code.

    Args:
        currencies: Iterable of currency codes; duplicates, 'USD' and missing
            values (None/NaN) are skipped.

    Returns:
        Dict mapping currency code to its rate. 'USD' is always present with
        1.0. A currency whose rate could not be obtained maps to None.
    """
    rates = {'USD': 1.0}
    logging.info("Fetching currency conversion rates to USD...")
    for currency in set(currencies):
        if currency == 'USD' or pd.isna(currency):
            continue
        # NOTE(review): codes are used verbatim in the "<code>USD=X" symbol.
        # If the data source reports minor-unit codes (e.g. 'GBp'), such a
        # lookup will presumably yield no rate - confirm against real data.
        rate = None
        try:
            rate_info = yf.Ticker(f"{currency}USD=X").info
            rate = rate_info.get('previousClose') or rate_info.get('regularMarketPrice')
        except Exception as e:
            logging.warning(f"Could not fetch conversion rate for {currency}: {e}")
        finally:
            # Throttle after every attempt. The pause used to sit inside the
            # try body and was skipped whenever the lookup raised.
            time.sleep(0.1)

        if rate:
            rates[currency] = rate
            logging.info(f"  > Rate for {currency} to USD: {rate}")
        else:
            # Missing or zero quote: record the gap explicitly for callers.
            rates[currency] = None
            logging.warning(f"No conversion rate available for {currency}.")
    return rates

def get_market_data(ticker_list):
    """Fetch market capitalisation for each ticker and convert it to USD.

    Args:
        ticker_list: Iterable of ticker symbols to query through yfinance.

    Returns:
        DataFrame with the columns "Ticker", "Company Name",
        "Market Cap (Local)", "Currency" and "Market Cap (USD)". Tickers
        without a market cap or currency, tickers whose lookup raised, and
        rows whose currency has no conversion rate are left out. When nothing
        could be fetched the frame is empty and has no columns.
    """
    market_data = []
    for ticker_symbol in tqdm(ticker_list, desc="Fetching Market Caps"):
        try:
            info = yf.Ticker(ticker_symbol).info
            market_cap = info.get('marketCap')
            currency = info.get('currency')
            if market_cap is not None and currency is not None:
                market_data.append({
                    "Ticker": ticker_symbol,
                    "Company Name": info.get('longName', ticker_symbol),
                    "Market Cap (Local)": market_cap,
                    "Currency": currency
                })
        except Exception as e:
            # Still best effort (one bad ticker must not abort the run), but
            # the failure is now logged instead of being dropped silently.
            logging.warning(f"Skipping {ticker_symbol}: could not fetch market data ({e})")
            continue
    df = pd.DataFrame(market_data)

    if not df.empty:
        unique_currencies = df['Currency'].unique()
        conversion_rates = get_usd_conversion_rates(unique_currencies)

        def convert_to_usd(row):
            """Return the row's market cap in USD, or 0 when no rate is known."""
            rate = conversion_rates.get(row['Currency'])
            if rate is not None:
                return row['Market Cap (Local)'] * rate
            return 0

        df['Market Cap (USD)'] = df.apply(convert_to_usd, axis=1)
        # The 0 placeholder (missing rate) is filtered out together with any
        # other non-positive value.
        df = df[df['Market Cap (USD)'] > 0]

    return df

if __name__ == "__main__":
    # One {"Ticker", "Index"} record per constituent of every configured index.
    all_ticker_mappings = [
        {"Ticker": ticker, "Index": config['name']}
        for config in INDEX_CONFIG.values()
        for ticker in get_tickers_from_wikipedia(config)
    ]

    if not all_ticker_mappings:
        # Every scrape failed: a frame built from an empty list has no
        # 'Ticker' column, so the lookup below would raise KeyError.
        logging.error("No tickers could be scraped for any index. The output file was not created.")
    else:
        # A ticker may belong to several indices; only exact
        # (Ticker, Index) duplicates are removed.
        df_mappings = pd.DataFrame(all_ticker_mappings).drop_duplicates()

        # Query each distinct ticker once, regardless of how many indices list it.
        unique_tickers = sorted(df_mappings['Ticker'].unique())
        df_market_data = get_market_data(unique_tickers)

        if not df_market_data.empty:
            # Inner join: tickers without market data drop out of the result.
            final_df = pd.merge(df_mappings, df_market_data, on="Ticker", how="inner")
            final_indices = final_df['Index'].unique()
            logging.info(f"Final dataset contains data for the following indices: {list(final_indices)}")
            if len(final_indices) < len(INDEX_CONFIG):
                logging.warning("Some indices may be missing from the final output.")

            output_path = os.path.join(OUTPUT_DIR, "market_data_with_caps.csv")
            final_df.to_csv(output_path, index=False)
            logging.info(f"Successfully saved combined data to: {output_path}")
        else:
            logging.error("No market data could be fetched. The output file was not created.")

    logging.info("Data fetching process complete.")

In [None]:
import dash
from dash import dcc, html
from dash.dependencies import Input, Output
import plotly.express as px
import pandas as pd

# --- Helper function for clear number formatting ---
def format_large_number(n):
    """Render a dollar amount compactly, e.g. $20B, $500B or $1T.

    Missing values (None/NaN) give "N/A"; amounts below one million are
    written out in full with thousands separators.
    """
    if pd.isna(n):
        return "N/A"
    # Checked from the largest unit down; the first unit that fits wins.
    for unit_size, unit_suffix in ((1e12, "T"), (1e9, "B"), (1e6, "M")):
        if n >= unit_size:
            return f"${n/unit_size:.0f}{unit_suffix}"
    return f"${n:,.0f}"

# --- Load and Prepare the Data ---
# Only the file read is guarded, so the handler cannot mask unrelated errors.
try:
    df = pd.read_csv("index_market_data/market_data_with_caps.csv")
except FileNotFoundError:
    # exit() is a convenience of the interactive `site` module and exits with
    # status 0. Raising SystemExit with a message writes it to stderr and
    # terminates with a non-zero status instead.
    raise SystemExit("Error: 'market_data_with_caps.csv' not found. Please run the data fetching script first.")

# Rows without a USD market cap cannot be plotted.
df = df.dropna(subset=['Market Cap (USD)'])

# Distinct index names, sorted, used to populate the dropdown.
index_options = sorted(df['Index'].unique())

# --- Initialize the Dash App ---
app = dash.Dash(__name__)
# Module-level handle on the app's underlying server object
# (presumably for an external WSGI host - confirm against the deployment).
server = app.server

# --- Define the App Layout ---
# Preselect the S&P 500 when it is present, otherwise the first index available.
_initial_index = 'S&P 500' if 'S&P 500' in index_options else index_options[0]

# Title, short explanation and the index selector.
_page_header = html.Div([
    html.H1(
        "Stock Index Market Cap Explorer",
        style={'textAlign': 'center', 'color': '#2c3e50'},
    ),
    html.P(
        "Select an index to visualize the market capitalization (in USD) of its components. Bubble size is proportional to market cap.",
        style={'textAlign': 'center', 'maxWidth': '800px', 'margin': '0 auto 30px auto', 'color': '#555'},
    ),
    dcc.Dropdown(
        id='index-dropdown',
        options=[{'label': i, 'value': i} for i in index_options],
        value=_initial_index,
        clearable=False,
        style={'width': '50%', 'margin': '0 auto'},
    ),
])

# Bubble chart wrapped in a loading spinner.
_chart_area = dcc.Loading(
    id="loading-spinner",
    type="circle",
    children=dcc.Graph(id='market-cap-bubble-chart', style={'height': '75vh'}),
)

app.layout = html.Div(
    style={'fontFamily': 'Arial, sans-serif', 'backgroundColor': '#f9f9f9', 'padding': '20px'},
    children=[_page_header, _chart_area],
)

# --- Define the Callback for Interactivity ---
def _log_axis_ticks(low, high, max_ticks=12):
    """Return tick positions for a log axis that bracket the range [low, high].

    Ticks follow a 1-2-5 progression per decade. If that yields more than
    `max_ticks` positions, only the powers of ten are kept. Both bounds must
    be positive and finite.
    """
    import math  # local import: `math` is not imported at file level

    first_decade = math.floor(math.log10(low))
    last_decade = math.floor(math.log10(high)) + 1
    ticks = []
    for mantissas in ((1, 2, 5), (1,)):
        candidates = [
            mantissa * 10.0 ** exponent
            for exponent in range(first_decade, last_decade + 1)
            for mantissa in mantissas
        ]
        # Closest tick at or below the minimum / at or above the maximum, so
        # that the extreme bubbles sit between two labelled grid lines.
        lower = max((t for t in candidates if t <= low), default=candidates[0])
        upper = min((t for t in candidates if t >= high), default=candidates[-1])
        ticks = [t for t in candidates if lower <= t <= upper]
        if len(ticks) <= max_ticks:
            break
    return ticks


@app.callback(
    Output('market-cap-bubble-chart', 'figure'),
    [Input('index-dropdown', 'value')]
)
def update_bubble_chart(selected_index):
    """Build the market-cap bubble chart for the index chosen in the dropdown.

    Args:
        selected_index: Value of the 'index-dropdown' component; rows whose
            'Index' column equals it are plotted.

    Returns:
        A Plotly figure: one bubble per ticker, with the USD market cap driving
        the (logarithmic) y position, the bubble size and the colour.
    """
    filtered_df = df[df['Index'] == selected_index].copy()

    def describe_cap(n):
        """Two-decimal hover label in trillions, billions or millions."""
        if n >= 1e12:
            return f"${n/1e12:.2f} Trillion"
        if n >= 1e9:
            return f"${n/1e9:.2f} Billion"
        return f"${n/1e6:.2f} Million"

    filtered_df['Market Cap Formatted USD'] = filtered_df['Market Cap (USD)'].apply(describe_cap)

    fig = px.scatter(
        filtered_df,
        x="Ticker",
        y="Market Cap (USD)",
        size="Market Cap (USD)",
        color="Market Cap (USD)",
        hover_name="Company Name",
        custom_data=['Market Cap (Local)', 'Currency', 'Market Cap Formatted USD'],
        log_y=True,
        size_max=80,
        color_continuous_scale=px.colors.sequential.Plasma,
        title=f"Market Cap Distribution for the {selected_index} (in USD)"
    )

    # The ticks used to be a fixed list from $20B to $3T, which leaves the axis
    # unlabelled for an index whose market caps lie outside that band. Derive
    # them from the plotted data; fall back to the fixed list when there is no
    # positive, finite value to work with.
    caps = filtered_df['Market Cap (USD)']
    usable_caps = caps[(caps > 0) & (caps != float('inf'))]
    if usable_caps.empty:
        tick_values = [
            2e10, 5e10,           # $20B, $50B
            1e11, 2e11, 5e11,     # $100B, $200B, $500B
            1e12, 2e12, 3e12,     # $1T, $2T, $3T
        ]
    else:
        tick_values = _log_axis_ticks(float(usable_caps.min()), float(usable_caps.max()))
    tick_text = [format_large_number(v) for v in tick_values]

    fig.update_layout(
        xaxis={'title': 'Companies (Hover for Details)', 'showticklabels': False, 'showgrid': False},
        yaxis={
            'title': 'Market Cap in USD (Logarithmic Scale)',
            'gridcolor': '#e0e0e0',
            'tickvals': tick_values,  # tick positions, in data units
            'ticktext': tick_text     # label shown at each position
        },
        paper_bgcolor='rgba(0,0,0,0)',
        plot_bgcolor='rgba(0,0,0,0)',
        font=dict(color='#2c3e50', size=14),
        title_font_size=24
    )

    # customdata order matches `custom_data` above: local cap, currency, USD label.
    fig.update_traces(
        hovertemplate=(
            '<b>%{hovertext}</b> (%{x})<br>'
            '<br>'
            '<b>Market Cap (USD): %{customdata[2]}</b><br>'
            'Market Cap (Local): %{customdata[0]:,.0f} %{customdata[1]}'
            '<extra></extra>'
        )
    )

    return fig

# --- Run the App ---
# Start the app only when this file is run as a script; importing the module
# does not start it. Debug mode is switched off explicitly.
if __name__ == '__main__':
    app.run(debug=False)