# MSCI World Equity Index Inclusion Analysis

This notebook analyzes how stock returns behave around a stock's inclusion in the MSCI World Equity Index.

**Note:** A complete programmatic list of historical index changes typically requires a paid data subscription (e.g., Bloomberg, FactSet, MSCI). 
Below, we use a manually compiled sample of recent additions (2024-2025) and provide code to fetch their data using `yfinance`.

In [None]:
!pip install yfinance pandas matplotlib seaborn statsmodels

In [None]:
import yfinance as yf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from datetime import datetime, timedelta

In [None]:
def get_msci_additions_from_csv(csv_path='msci_additions.csv'):
    """
    Load the list of MSCI World Index additions from ``csv_path``.

    The 'Announcement Date' and 'Effective Date' columns are converted to
    pandas datetimes. Returns the resulting DataFrame, or None (after
    printing a notice) when the file does not exist.
    """
    try:
        additions = pd.read_csv(csv_path)
    except FileNotFoundError:
        print(f"File {csv_path} not found. Please ensure the CSV exists.")
        return None

    # Both date columns arrive as text and are needed as datetimes downstream
    for date_col in ('Announcement Date', 'Effective Date'):
        additions[date_col] = pd.to_datetime(additions[date_col])
    return additions

additions_df = get_msci_additions_from_csv()
if additions_df is None:
    # The loader already reported the missing file; nothing to analyze
    tickers = []
else:
    print("Loaded additions data:")
    print(additions_df.head())
    # Ticker symbols drive the price download in the next cell
    tickers = additions_df['Ticker'].tolist()
    print(f"\nAnalyzing {len(tickers)} stocks: {tickers}")

In [None]:
def fetch_data(tickers, period="5y"):
    """
    Download price history for each ticker and attach a daily return column.

    Returns a dict mapping ticker -> history DataFrame, where each frame gains
    a 'Return' column (percentage change of 'Close'). Tickers that come back
    empty or raise during the download are reported via print and left out.
    """
    fetched = {}
    for ticker in tickers:
        try:
            history = yf.Ticker(ticker).history(period=period)
            if history.empty:
                print(f"No data for {ticker}")
                continue
            # Simple daily return from consecutive closing prices
            history['Return'] = history['Close'].pct_change()
            fetched[ticker] = history
            print(f"Fetched {ticker}")
        except Exception as e:
            # Best-effort download: one bad ticker must not stop the rest
            print(f"Error fetching {ticker}: {e}")
    return fetched

# Download price history for the tickers gathered above; tickers that fail or
# return no rows are simply absent from the resulting dict.
stock_data = fetch_data(tickers)

In [None]:
def load_ff5_data(filepath='F-F_Research_Data_5_Factors_2x3_daily.csv'):
    """
    Read a Fama-French 5 Factor daily CSV into a date-indexed DataFrame.

    The first three lines of the file are skipped, the first column is treated
    as a YYYYMMDD date, rows whose date cannot be parsed are discarded, and all
    remaining values are divided by 100. The 'Mkt-RF' column is renamed to
    'Mkt_RF'. Returns None (after printing the error) if anything fails.
    """
    try:
        # The leading lines of the file are descriptive text, not data
        raw = pd.read_csv(filepath, skiprows=3)

        # Whatever the first column is called in the file, it holds the date
        raw = raw.rename(columns={raw.columns[0]: 'Date'})
        raw['Date'] = pd.to_datetime(raw['Date'], format='%Y%m%d', errors='coerce')

        # Unparseable dates become NaT and are dropped here; then scale the
        # values by 1/100
        factors = raw.dropna(subset=['Date']).set_index('Date') / 100.0

        # Underscore form of the market factor name is what the regression uses
        factors = factors.rename(columns={'Mkt-RF': 'Mkt_RF'})

        print("Loaded Fama-French 5 Factor Data")
        return factors
    except Exception as e:
        print(f"Error loading FF5 data: {e}")
        return None

# Load the factor data from the default path; load_ff5_data returns None on
# failure, so only preview the frame when loading succeeded.
ff5_data = load_ff5_data()
if ff5_data is not None:
    print(ff5_data.head())

In [None]:
def calculate_abnormal_returns_ff5(stock_data, ff5_data, additions_df,
                                   estimation_window=250,
                                   gap_window=30,
                                   event_window=30,
                                   min_estimation_obs=50):
    """
    Calculates Abnormal Returns (AR) using the Fama-French 5 Factor Model.

    Methodology (per row of ``additions_df``):
    1. Estimation window: the ``estimation_window`` CALENDAR days that end
       ``gap_window`` calendar days before the Announcement Date, so the betas
       are estimated on pre-announcement data only.
    2. Regress the stock's excess return (Return - RF) on a constant and the
       five factors (Mkt_RF, SMB, HML, RMW, CMA) with OLS.
    3. Event window: +/- ``event_window`` TRADING days (rows of the merged
       data) around the Effective Date, where day 0 is the available date
       nearest to the Effective Date.
    4. AR = realized excess return - (alpha + sum(beta_k * factor_k)).

    Parameters:
        stock_data: dict mapping ticker -> DataFrame with a DatetimeIndex and
            a 'Return' column.
        ff5_data: date-indexed DataFrame with columns Mkt_RF, SMB, HML, RMW,
            CMA and RF, in the same units as 'Return'.
        additions_df: DataFrame with 'Ticker', 'Announcement Date' and
            'Effective Date' columns (dates as datetimes).
        estimation_window: length of the estimation window in calendar days.
        gap_window: calendar days between the end of the estimation window and
            the announcement.
        event_window: trading days kept on each side of day 0.
        min_estimation_obs: minimum number of complete observations required
            to run the regression.

    Returns:
        DataFrame indexed by date with columns 'Event_Day', 'AR' and 'Ticker'
        (one stacked section per usable addition), or an empty DataFrame when
        nothing could be computed, including when an input is None.
    """
    factor_cols = ['Mkt_RF', 'SMB', 'HML', 'RMW', 'CMA']

    # The loaders in this notebook signal failure by returning None; treat that
    # as "nothing to compute" rather than crashing on attribute access.
    if additions_df is None or ff5_data is None or not stock_data:
        return pd.DataFrame()

    all_ar_data = []

    for _, row in additions_df.iterrows():
        ticker = row['Ticker']
        announce_date = row['Announcement Date']
        effective_date = row['Effective Date']

        if ticker not in stock_data:
            continue
        # Both windows are anchored on these dates; without them nothing can be placed
        if pd.isna(announce_date) or pd.isna(effective_date):
            continue

        # Align stock returns with the factor data on a tz-naive daily index
        returns = stock_data[ticker][['Return']].copy()
        returns.index = returns.index.tz_localize(None).normalize()
        merged = returns.join(ff5_data, how='inner')
        if merged.empty:
            continue
        merged['Excess_Return'] = merged['Return'] - merged['RF']

        # --- Step 1: Estimation window ---
        # End the window 'gap_window' days BEFORE the announcement to avoid
        # information leakage into the beta estimates.
        est_end_date = announce_date - timedelta(days=gap_window)
        est_start_date = est_end_date - timedelta(days=estimation_window)
        estimation_data = merged.loc[est_start_date:est_end_date].dropna()

        if len(estimation_data) < min_estimation_obs:
            continue

        # has_constant='add' guarantees an intercept column named 'const' even
        # if one of the factor columns happens to be constant in the window.
        X = sm.add_constant(estimation_data[factor_cols], has_constant='add')
        y = estimation_data['Excess_Return']
        try:
            params = sm.OLS(y, X).fit().params
        except Exception as e:
            print(f"Regression failed for {ticker}: {e}")
            continue

        # --- Step 2: Event window ---
        # Pull twice the window in calendar days as a buffer for non-trading
        # days; it is trimmed to +/- event_window trading days below.
        evt_start = effective_date - timedelta(days=event_window * 2)
        evt_end = effective_date + timedelta(days=event_window * 2)
        event_data = merged.loc[evt_start:evt_end].copy()

        if event_data.empty:
            continue

        dates = event_data.index
        # If the data stops before (or starts after) the effective date, the
        # "nearest" row is just the edge of the sample and would be mislabeled
        # as day 0, so skip the event instead.
        if effective_date < dates[0] or effective_date > dates[-1]:
            print(f"Skipping {ticker}: data does not span effective date {effective_date.date()}")
            continue

        # Expected excess return from the fitted parameters, matched by column
        # name so column order can never misalign coefficients and factors.
        expected_excess_return = params['const'] + event_data[factor_cols].dot(params[factor_cols])

        # Abnormal Return (AR) = Actual Excess Return - Expected Excess Return
        event_data['AR'] = event_data['Excess_Return'] - expected_excess_return

        # Relative time index in trading days: t=0 is the row nearest the effective date
        loc_zero = int(np.argmin(np.abs(dates - effective_date)))
        event_data['Event_Day'] = np.arange(len(event_data)) - loc_zero

        # Keep strictly +/- event_window; copy so adding 'Ticker' does not
        # write into a view of the wider frame.
        in_window = event_data['Event_Day'].between(-event_window, event_window)
        event_slice = event_data.loc[in_window, ['Event_Day', 'AR']].copy()
        event_slice['Ticker'] = ticker
        all_ar_data.append(event_slice)

    if not all_ar_data:
        return pd.DataFrame()

    return pd.concat(all_ar_data)

# Both loaders return None on failure; without inputs there is nothing to
# compute, so fall back to an empty result instead of crashing.
if additions_df is None or ff5_data is None:
    print("Skipping abnormal return calculation: additions or factor data not loaded.")
    ar_df = pd.DataFrame()
else:
    ar_df = calculate_abnormal_returns_ff5(stock_data, ff5_data, additions_df, event_window=40)

if not ar_df.empty:
    print("Calculated FF5 Abnormal Returns")
    print(ar_df.head())

In [None]:
# Statistical Analysis of CAR (Cumulative Abnormal Returns)

if not ar_df.empty:
    # Pivot to have tickers as columns, Event_Day as index
    ar_pivot = ar_df.pivot_table(index='Event_Day', columns='Ticker', values='AR')

    # 1. Average Abnormal Return (AAR) per event day, across tickers
    aar = ar_pivot.mean(axis=1)

    # 2. Cumulative Average Abnormal Return (CAAR), accumulated from the
    #    first event day present in the data
    caar = aar.cumsum()

    # 3. Cross-sectional t-statistics
    # Standard deviation of ARs across tickers for each day, and how many
    # tickers actually contributed on that day
    std_ar = ar_pivot.std(axis=1)
    n_stocks = ar_pivot.count(axis=1)

    # Simple test statistic: sqrt(N) * (AAR / StdDev_AR).
    # (No adjustment for serial or cross-sectional correlation.)
    t_stats = np.sqrt(n_stocks) * (aar / std_ar)

    # Approximate standard error of CAAR, assuming independence over time:
    # Var(CAAR_t) = sum over days up to t of StdErr(AAR_day)^2
    se_aar = std_ar / np.sqrt(n_stocks)
    se_caar = np.sqrt((se_aar ** 2).cumsum())

    # Plotting CAAR with an approximate 95% band
    plt.figure(figsize=(12, 6))
    plt.plot(caar.index, caar.values, label='CAAR (FF5 Adjusted)', color='blue', linewidth=2)
    plt.fill_between(caar.index,
                     (caar - 1.96 * se_caar).values,
                     (caar + 1.96 * se_caar).values,
                     color='blue', alpha=0.15, label='Approx. 95% CI')

    plt.title('Cumulative Average Abnormal Returns (CAAR) around Effective Date')
    plt.xlabel('Days Relative to Effective Date (0)')
    plt.ylabel('Cumulative Abnormal Return')
    plt.axvline(x=0, color='red', linestyle='--', label='Effective Date')
    plt.axhline(y=0, color='black', linewidth=1)
    plt.grid(True, alpha=0.3)
    plt.legend()
    plt.show()

    # Identifying Optimal Windows
    # Entry candidate: the day with the minimum CAAR on or before t=0
    pre_event_caar = caar[caar.index <= 0].dropna()
    if pre_event_caar.empty:
        print("No pre-event CAAR observations; cannot identify entry/exit days.")
    else:
        buy_signal_day = pre_event_caar.idxmin()

        # Exit candidate: the CAAR peak on or AFTER the entry day, since an
        # exit that precedes the entry is not a tradeable window
        post_entry_caar = caar.loc[buy_signal_day:]
        sell_signal_day = post_entry_caar.idxmax()

        print("Analysis Results:")
        print(f"Lowest Cumulative Return Day (Potential Entry): Day {buy_signal_day} (CAAR: {pre_event_caar.min():.2%})")
        print(f"Peak Cumulative Return Day (Potential Exit): Day {sell_signal_day} (CAAR: {post_entry_caar.max():.2%})")

    print("\nDaily Statistics around Event:")
    stats_df = pd.DataFrame({'AAR': aar, 'CAAR': caar, 'T-Stat': t_stats})
    # Show window from -10 to +10
    print(stats_df.loc[-10:10])