In [19]:
from pathlib import Path
import os

import pandas as pd
import numpy as np

try:
    import yfinance as yf
except ImportError:
    !pip install -q yfinance
    import yfinance as yf



In [11]:
# Cache directory for every CSV written by this notebook. The trailing path
# separator is deliberate: later cells build file names by plain string
# concatenation (f"{data_dir}name.csv").
data_dir = 'data' + os.sep

# exist_ok=True keeps the cell safe to re-run and avoids a separate
# "does it exist?" check against a differently spelled path.
os.makedirs(data_dir, exist_ok=True)

In [12]:
# Sample window (ISO dates, YYYY-MM-DD) shared by the download cells below.
start_date, end_date = '2014-01-01', '2025-12-13'

## Download data from Yahoo Finance

In [13]:
def _first_matching(columns, *needles):
    """Return the first column label whose string form contains any needle, else None."""
    return next((c for c in columns if any(n in str(c) for n in needles)), None)


def download_data(tickers, description, start=None, end=None):
    """Download price/volume history for each ticker into one long table.

    Args:
        tickers: Sequence of Yahoo Finance symbols; each one is fetched with
            its own ``yf.download`` call.
        description: Sequence of company names, aligned one-to-one with
            ``tickers``.
        start: Optional start date passed to ``yf.download``. When omitted,
            the notebook-level ``start_date`` is looked up at call time.
        end: Optional end date passed to ``yf.download``. When omitted, the
            notebook-level ``end_date`` is looked up at call time.

    Returns:
        DataFrame with columns ``Ticker, Company, Date, Adj Close Price,
        Volume``, one row per (ticker, date). ``Adj Close Price`` holds the
        first column whose label contains ``Adj``; if there is none, the first
        column whose label contains ``Close`` is used instead.

    Raises:
        ValueError: If ``tickers`` and ``description`` differ in length, or if
            no ticker returned any data.
        KeyError: If a non-empty download lacks a price or a volume column.
    """
    if len(tickers) != len(description):
        raise ValueError(
            f"tickers and description must have the same length "
            f"({len(tickers)} != {len(description)})"
        )

    # Fall back to the notebook-level window only when the caller gave none.
    start = start_date if start is None else start
    end = end_date if end is None else end

    rows = []
    for ticker, company in zip(tickers, description):
        data = yf.download(ticker, start=start, end=end)
        if data.empty:
            print(f"Warning: no data for {ticker}")
            continue
        data = data.reset_index()

        # Labels may be plain strings or MultiIndex tuples depending on the
        # yfinance version, so match on the string form of each label.
        cols = list(data.columns)

        date_col = _first_matching(cols, 'Date', 'date')
        if date_col is None:
            # After reset_index() the former index is the first column.
            date_col = cols[0]

        price_col = _first_matching(cols, 'Adj')
        if price_col is None:
            price_col = _first_matching(cols, 'Close')

        volume_col = _first_matching(cols, 'Volume', 'Vol')

        if price_col is None or volume_col is None:
            raise KeyError(f"Adj Close or Volume column not found for {ticker}. Columns: {cols}")

        # Flatten to plain labels first, then prepend the identifying columns.
        frame = data[[date_col, price_col, volume_col]].copy()
        frame.columns = ['Date', 'Adj Close Price', 'Volume']
        frame.insert(0, 'Company', company)
        frame.insert(0, 'Ticker', ticker)
        rows.append(frame)

    if not rows:
        # pd.concat([]) would raise an opaque "No objects to concatenate".
        raise ValueError(f"No data downloaded for any of the tickers: {list(tickers)}")

    return pd.concat(rows, ignore_index=True)

In [14]:
def get_log_ret(df):
    """Turn a long price table into a wide table of log returns.

    ``df`` needs the columns ``Date``, ``Ticker`` and ``Adj Close Price``.
    The result has one row per date and one column per ticker; every date on
    which any ticker has a missing return (including the first date) is
    dropped.
    """
    prices = df.pivot(index='Date', columns='Ticker', values='Adj Close Price')
    # Price relative to the previous row, per ticker.
    growth = prices.div(prices.shift(1))
    return np.log(growth).dropna()

## Risk-free rates

In [44]:
def get_rf(start_date, end_date, data_dir=None, ticker="^IRX"):
    """Get 13-week T-bill (^IRX) data and return the annual log risk-free rate.

    The quoted value is treated as an annual rate in percent and converted
    with ``log(1 + rate / 100)``. Results are cached as CSV so later calls
    with the same arguments do not hit the network.

    Args:
        start_date: Start of the window, passed to ``yf.download``.
        end_date: End of the window, passed to ``yf.download``.
        data_dir: Directory prefix (including trailing separator) for the
            cache file. When omitted, the notebook-level ``data_dir`` is
            looked up at call time.
        ticker: Yahoo Finance symbol of the rate series.

    Returns:
        Series named ``rf_annual`` indexed by ``Date``, with missing rates
        dropped.

    Raises:
        ValueError: If the download returns no rows (nothing is cached then).
    """
    if data_dir is None:
        # Resolve at call time so a re-run configuration cell is honoured.
        data_dir = globals()["data_dir"]

    # The default ticker keeps the historical file name; any other ticker
    # gets its own file so two series never share one cache entry.
    if ticker == "^IRX":
        suffix = ""
    else:
        suffix = "_" + "".join(ch if ch.isalnum() else "_" for ch in ticker)
    p = Path(f"{data_dir}rf_annual_{start_date}_to_{end_date}{suffix}.csv")

    if p.exists():
        rf_df = pd.read_csv(p, parse_dates=['Date'])
    else:
        raw = yf.download(ticker, start=start_date, end=end_date)
        if raw.empty:
            # Refuse to cache an empty result; it would mask later retries.
            raise ValueError(
                f"No data returned for {ticker} between {start_date} and {end_date}"
            )

        # Some yfinance versions return (field, ticker) MultiIndex columns
        # even for a single symbol; keep only the field names.
        if isinstance(raw.columns, pd.MultiIndex):
            raw.columns = raw.columns.get_level_values(0)
        raw = raw.reset_index()

        rate_col = 'Adj Close' if 'Adj Close' in raw.columns else 'Close'
        rf_df = pd.DataFrame({
            'Date': raw['Date'],
            'rf_annual': np.log(1 + raw[rate_col] / 100),
        }).dropna()

        p.parent.mkdir(parents=True, exist_ok=True)
        rf_df.to_csv(p, index=False)

    return rf_df.set_index('Date')['rf_annual']



In [46]:
# Despite the name, this is the Series returned by get_rf (indexed by Date).
rf_df = get_rf(start_date=start_date, end_date=end_date)

In [17]:
def get_ticker_data(tickers, descriptions, data_dir=None, start_date=None, end_date=None, n_ports=None):
    """Download and cache ticker data, return prices and log returns.

    Args:
        tickers: Sequence of ticker symbols.
        descriptions: Sequence of company names aligned with ``tickers``.
        data_dir: Directory prefix (including trailing separator) for the
            cache file. Defaults to the notebook-level ``data_dir``.
        start_date: Start date used in the cache file name. Defaults to the
            notebook-level ``start_date``.
        end_date: End date used in the cache file name. Defaults to the
            notebook-level ``end_date``.
        n_ports: Unused; kept so existing calls keep working.

    Returns:
        Tuple ``(df, lr)``: the long price table and the log returns computed
        from it by ``get_log_ret``.

    Note:
        ``download_data`` is called without dates, so the download window is
        the notebook-level ``start_date``/``end_date``. Passing different
        dates here only changes the cache file name.
    """
    # Resolve defaults at call time; defaults bound at definition time go
    # stale when the configuration cell is re-run without this one.
    scope = globals()
    if data_dir is None:
        data_dir = scope["data_dir"]
    if start_date is None:
        start_date = scope["start_date"]
    if end_date is None:
        end_date = scope["end_date"]

    ticker_str = f"{len(tickers)}" if len(tickers) > 4 else ""
    p = Path(f"{data_dir}df{ticker_str}_{start_date}_to_{end_date}.csv")

    df = None
    if p.exists():
        cached = pd.read_csv(p, parse_dates=['Date'])
        # The file name only encodes how many tickers were requested (and
        # nothing at all for four or fewer), so confirm the cached file holds
        # exactly the requested symbols before trusting it.
        if 'Ticker' in cached.columns and set(cached['Ticker']) == set(tickers):
            df = cached

    if df is None:
        df = download_data(tickers, descriptions)
        p.parent.mkdir(parents=True, exist_ok=True)
        df.to_csv(p, index=False)

    lr = get_log_ret(df)
    return df, lr

## Download data and compute log returns for four tickers

In [22]:
# Four-stock universe. Each (symbol, company name) pair sits on one line so
# the two parallel lists cannot drift out of alignment.
tickers, description = map(list, zip(
    ("AAPL", "Apple"),
    ("AMZN", "Amazon"),
    ("CSCO", "CISCO"),
    ("IBM", "IBM"),
))

# Long price table (cached on disk) and the log returns derived from it.
df, lr = get_ticker_data(tickers=tickers, descriptions=description)

# Persist the log returns next to the cached prices.
lr.to_csv(f"{data_dir}log_returns_{start_date}_to_{end_date}.csv")


## Download data and compute log returns for ten tickers

In [47]:
# Ten-stock universe. Each (symbol, company name) pair sits on one line so
# the two parallel lists cannot drift out of alignment.
tickers1, description1 = map(list, zip(
    ("AAPL", "Apple"),
    ("AMZN", "Amazon"),
    ("CSCO", "CISCO"),
    ("IBM", "IBM"),
    ("TSLA", "Tesla"),
    ("META", "Meta"),
    ("ABNB", "Airbnb"),
    ("UPS", "UPS"),
    ("NFLX", "Netflix"),
    ("MRNA", "Moderna"),
))

# Long price table (cached on disk) and the log returns derived from it.
df1, lr1 = get_ticker_data(tickers=tickers1, descriptions=description1)

# Persist the log returns next to the cached prices.
lr1.to_csv(f"{data_dir}log_returns10_{start_date}_to_{end_date}.csv")

## Download data and compute log returns for twenty tickers

In [48]:
# Twenty-symbol universe. Each (symbol, name) pair sits on one line so the
# two parallel lists cannot drift out of alignment.
tickers2, description2 = map(list, zip(
    ("AAPL", "Apple"),
    ("AMZN", "Amazon"),
    ("CSCO", "CISCO"),
    ("IBM", "IBM"),
    ("TSLA", "Tesla"),
    ("META", "Meta"),
    ("ABNB", "Airbnb"),
    ("UPS", "UPS"),
    ("NFLX", "Netflix"),
    ("MRNA", "Moderna"),
    ("^IXIC", "NASDAQ"),
    ("T", "AT&T"),
    ("GE", "General Electric"),
    ("FMC", "FMC"),
    ("AMC", "AMC"),
    ("JPM", "JPMorgan"),
    ("DIS", "Disney"),
    ("CVX", "Chevron"),
    ("GOOGL", "Google"),
    ("BA", "Boeing"),
))

# Long price table (cached on disk) and the log returns derived from it.
df2, lr2 = get_ticker_data(tickers=tickers2, descriptions=description2)

# Persist the log returns next to the cached prices.
lr2.to_csv(f"{data_dir}log_returns20_{start_date}_to_{end_date}.csv")