# MSCI additions — data collection
Fetch Polygon daily bars for MSCI additions and cache to parquet.


In [13]:
import pandas as pd
import requests
import time
from pathlib import Path
import keys

# Read the Polygon credential from the local `keys` module; stop immediately
# when the attribute is absent or falsy so later cells never issue
# unauthenticated requests.
PG_KEY = keys.PG_KEY if hasattr(keys, "PG_KEY") else None
if not PG_KEY:
    raise ValueError("Polygon API key missing; set PG_KEY in keys.py")


In [14]:
# Inputs for the event windows: daily factor returns and the MSCI additions list.

# Factor file: parse the YYYYMMDD 'Date' column, index by it, drop rows with
# missing values, then divide every column by 100 (presumably percent -> decimal;
# confirm against the source file).
FF5 = (
    pd.read_csv('F-F_Research_Data_5_Factors_2x3_daily.csv')
    .assign(Date=lambda raw: pd.to_datetime(raw['Date'], format="%Y%m%d"))
    .set_index('Date')
    .dropna()
    .div(100)
)

# Additions file: both date columns become datetimes so they can be compared
# and shifted when building the fetch windows.
adds_MSCI = pd.read_csv('msci_additions.csv')
_date_cols = ['Announcement Date', 'Effective Date']
adds_MSCI[_date_cols] = adds_MSCI[_date_cols].apply(pd.to_datetime)


In [15]:
# Polygon helpers

def fetch_polygon_daily(symbol: str, start: str, end: str, adjusted: bool = True, api_key: str = None):
    """Download daily aggregate bars for one ticker from Polygon.

    Parameters
    ----------
    symbol : str
        Ticker placed in the request path.
    start, end : str
        Range bounds placed in the request path (this notebook passes
        'YYYY-MM-DD' strings).
    adjusted : bool
        Sent as the lowercase ``adjusted`` query parameter.
    api_key : str, optional
        Overrides the module-level ``PG_KEY`` when truthy.

    Returns
    -------
    pandas.DataFrame
        Bars indexed by 'Date' in ascending order, with o/h/l/c/v renamed to
        Open/High/Low/Close/Volume; any other response fields keep their
        original names. An empty DataFrame when the response has no results.

    Raises
    ------
    requests.HTTPError
        When the response status is an error (via ``raise_for_status``).
    """
    key = api_key or PG_KEY
    url = f"https://api.polygon.io/v2/aggs/ticker/{symbol}/range/1/day/{start}/{end}"
    params = {"adjusted": str(adjusted).lower(), "sort": "asc", "limit": 50000}
    # Send the key as a bearer token instead of an `apiKey` query parameter:
    # HTTP error messages include the full request URL, and callers print those
    # messages, so a key in the query string can end up in saved notebook output.
    headers = {"Authorization": f"Bearer {key}"}
    r = requests.get(url, params=params, headers=headers, timeout=30)
    r.raise_for_status()
    # "results" may be missing or null; treat both as "no bars".
    rows = r.json().get("results", []) or []
    if not rows:
        return pd.DataFrame()
    df = pd.DataFrame(rows)
    # 't' is interpreted as epoch milliseconds.
    # NOTE(review): the resulting timestamps are timezone-naive and may carry a
    # non-midnight time component — confirm before joining on calendar dates.
    df['Date'] = pd.to_datetime(df['t'], unit='ms')
    df = df.rename(columns={'o': 'Open', 'h': 'High', 'l': 'Low', 'c': 'Close', 'v': 'Volume'})
    return df.set_index('Date').sort_index()


def fetch_polygon_window(additions_df: pd.DataFrame, days_back: int = 365, delay_sec: float = 0):
    """Fetch a pre-event window of daily bars for every row of an additions table.

    For each row the window starts ``days_back`` calendar days before the
    earlier of 'Announcement Date' and 'Effective Date' and ends on the
    'Effective Date'. A simple 'Return' column (percentage change of 'Close')
    is added to each ticker's frame.

    Parameters
    ----------
    additions_df : pandas.DataFrame
        Must provide 'Ticker', 'Announcement Date' and 'Effective Date'
        columns; the date columns are expected to hold datetimes.
    days_back : int
        Calendar days to look back from the earlier of the two dates.
    delay_sec : float
        Pause after each request attempt (successful, empty or failed).

    Returns
    -------
    (dict, pandas.DataFrame)
        ``data`` maps ticker -> that ticker's frame indexed by 'Date';
        ``combined`` stacks all fetched frames under a ('Ticker', 'Date')
        MultiIndex, or is an empty DataFrame when nothing was fetched.
        A ticker that appears in several rows overwrites its earlier entry in
        ``data`` but is appended again to ``combined``.
    """
    data = {}
    frames = []
    rows = zip(
        additions_df['Ticker'],
        additions_df['Announcement Date'],
        additions_df['Effective Date'],
    )
    for ticker, announce, effective in rows:
        # A missing date would make the window arithmetic / strftime raise and
        # abort the whole batch; skip just that row (no request, so no pause).
        if pd.isna(announce) or pd.isna(effective):
            print(f"Skipping {ticker}: missing announcement/effective date")
            continue
        start_date = (min(announce, effective) - pd.Timedelta(days=days_back)).strftime('%Y-%m-%d')
        end_date = effective.strftime('%Y-%m-%d')
        try:
            df = fetch_polygon_daily(ticker, start_date, end_date, adjusted=True)
            if df.empty:
                print(f"No data for {ticker} ({start_date} to {end_date})")
                # The `finally` clause below applies the pause exactly once.
                continue
            df['Return'] = df['Close'].pct_change()
            data[ticker] = df
            tmp = df.reset_index()
            tmp['Ticker'] = ticker
            tmp = tmp.set_index(['Ticker', 'Date']).sort_index()
            frames.append(tmp)
            print(f"Fetched {ticker}: {len(df)} rows")
        except Exception as e:
            # Best effort: one failing ticker must not stop the remaining ones.
            print(f"Error fetching {ticker}: {e}")
        finally:
            time.sleep(delay_sec)
    combined = pd.concat(frames) if frames else pd.DataFrame()
    return data, combined


#### Fetch Polygon prices
Pull daily bars from ~1 year before the earlier of the announcement/effective dates through the effective date.


In [16]:
# Run the download for every MSCI addition: a 365-day look-back and no pause
# between requests. `polygon_data` is the per-ticker dict, `polygon_prices`
# the stacked frame.
polygon_data, polygon_prices = fetch_polygon_window(
    adds_MSCI,
    days_back=365,
    delay_sec=0,
)
print(f"Combined rows: {len(polygon_prices)}")


Fetched NBIS: 264 rows
No data for CWV (2024-11-06 to 2025-11-25)
Fetched INSM: 264 rows
Fetched RKLB: 264 rows
Fetched SOFI: 264 rows
Fetched AFRM: 264 rows
No data for RYA (2024-05-13 to 2025-05-30)
No data for SIG.AX (2024-05-13 to 2025-05-30)
No data for IAG.L (2024-05-13 to 2025-05-30)
Fetched UAL: 263 rows
Fetched RDDT: 236 rows
Fetched NTRA: 263 rows
No data for CEN.NZ (2024-02-12 to 2025-02-28)
No data for MCY.NZ (2024-02-12 to 2025-02-28)
No data for SMDS.L (2024-02-12 to 2025-02-28)
Fetched SPOT: 265 rows
Fetched CVNA: 265 rows
No data for CVC.AS (2023-11-07 to 2024-11-25)
No data for GALD.SW (2023-08-13 to 2024-08-30)
No data for ZEAL.CO (2023-08-13 to 2024-08-30)
No data for ADDT-B.ST (2023-08-13 to 2024-08-30)
Fetched MSTR: 264 rows
Fetched PSTG: 264 rows
Fetched EME: 264 rows
Fetched IOT: 263 rows
Fetched CRBG: 263 rows
Fetched APP: 263 rows
Fetched VST: 263 rows
Fetched SMCI: 263 rows
Fetched VRT: 263 rows
Fetched CELH: 263 rows
No data for LDO.MI (2022-11-14 to 2023-11-

#### Cache to parquet
Save MultiIndex `polygon_prices` for reuse in analysis.


In [18]:
# Write the stacked price frame to a parquet file (path is relative to the
# working directory) so later analysis can reload it instead of refetching.
prices_path = Path('polygon_prices.parquet')
polygon_prices.to_parquet(path=prices_path)
print(f"Saved to {prices_path} (rows={len(polygon_prices)})")


Saved to polygon_prices.parquet (rows=11842)
