In [1]:
import numpy as np
import pandas as pd
import yfinance as yf
from statsmodels.tsa.stattools import coint

# Get S&P 500 constituents
url = 'https://en.wikipedia.org/wiki/List_of_S%26P_500_companies'
sp500_table = pd.read_html(url)
sp500_symbols = sp500_table[0]['Symbol'].tolist()

# Download historical data and handle tickers with errors
data_frames = []
symbols = []

for symbol in sp500_symbols:
    try:
        ticker_data = yf.download(symbol, start='2018-01-01', end='2023-01-01')['Adj Close']
        data_frames.append(ticker_data.rename(symbol))
        symbols.append(symbol)
    except Exception as e:
        print(f"Failed to download data for {symbol}: {e}. Skipping...")

# Concatenate all data at once
data = pd.concat(data_frames, axis=1)

# Drop tickers with too many missing values
missing_threshold = 0.1  # Drop tickers with more than 10% missing data
data = data.dropna(thresh=int(data.shape[0] * (1 - missing_threshold)), axis=1)

# Handle remaining missing data: fill forward, then fill backward
data.fillna(method='ffill', inplace=True)
data.fillna(method='bfill', inplace=True)

# Ensure no inf or nan values are present
data.replace([np.inf, -np.inf], np.nan, inplace=True)
data.dropna(inplace=True)

# Check if the data is empty after cleaning
if data.empty:
    raise ValueError("Data is empty after cleaning. Please check the data source and handling steps.")

# Calculate daily returns
returns = data.pct_change().dropna()

# Check if the returns DataFrame is empty
if returns.empty:
    raise ValueError("Returns data is empty after calculating percentage changes. Please check the data source and handling steps.")

# Function to perform the cointegration test for a pair
def coint_test(stock1, stock2, significance_level=0.05):
    result = coint(stock1, stock2)
    p_value = result[1]
    return p_value < significance_level

# Function to find cointegrated pairs
def find_cointegrated_pairs(data, significance_level=0.05):
    n = data.shape[1]
    if n < 2:
        raise ValueError("Not enough data to perform cointegration test.")
    keys = data.columns
    pairs = []
    for i in range(n):
        for j in range(i+1, n):
            if coint_test(data[keys[i]], data[keys[j]], significance_level):
                pairs.append((keys[i], keys[j]))
    return pairs

# Find cointegrated pairs
cointegrated_pairs = find_cointegrated_pairs(returns)

print(f"Cointegrated pairs: {cointegrated_pairs}")


[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%*******

$BF.B: possibly delisted; No price data found  (1d 2018-01-01 -> 2023-01-01)



[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%******

Cointegrated pairs: [('MMM', 'AOS'), ('MMM', 'ABT'), ('MMM', 'ABBV'), ('MMM', 'ACN'), ('MMM', 'ADBE'), ('MMM', 'AMD'), ('MMM', 'AES'), ('MMM', 'AFL'), ('MMM', 'A'), ('MMM', 'APD'), ('MMM', 'AKAM'), ('MMM', 'ALB'), ('MMM', 'ARE'), ('MMM', 'ALGN'), ('MMM', 'ALLE'), ('MMM', 'LNT'), ('MMM', 'ALL'), ('MMM', 'GOOGL'), ('MMM', 'GOOG'), ('MMM', 'MO'), ('MMM', 'AMZN'), ('MMM', 'AMCR'), ('MMM', 'AEE'), ('MMM', 'AAL'), ('MMM', 'AEP'), ('MMM', 'AXP'), ('MMM', 'AIG'), ('MMM', 'AMT'), ('MMM', 'AWK'), ('MMM', 'AMP'), ('MMM', 'AME'), ('MMM', 'AMGN'), ('MMM', 'APH'), ('MMM', 'ADI'), ('MMM', 'ANSS'), ('MMM', 'AON'), ('MMM', 'APA'), ('MMM', 'AAPL'), ('MMM', 'AMAT'), ('MMM', 'APTV'), ('MMM', 'ACGL'), ('MMM', 'ADM'), ('MMM', 'ANET'), ('MMM', 'AJG'), ('MMM', 'AIZ'), ('MMM', 'T'), ('MMM', 'ATO'), ('MMM', 'ADSK'), ('MMM', 'ADP'), ('MMM', 'AZO'), ('MMM', 'AVB'), ('MMM', 'AVY'), ('MMM', 'AXON'), ('MMM', 'BKR'), ('MMM', 'BALL'), ('MMM', 'BAC'), ('MMM', 'BK'), ('MMM', 'BBWI'), ('MMM', 'BAX'), ('MMM', 'BDX'), ('MM