In [None]:
from statsmodels.tsa.stattools import coint
import pandas as pd
from dotenv import load_dotenv
import os
from statsmodels.tsa.stattools import adfuller
import statsmodels.api as sm
# Load environment variables from a .env file (python-dotenv) so that settings
# read later through os.getenv -- such as `stock_data_path` -- are available.
load_dotenv()

def load_data(file_path):
  """Load a tabular data file into a pandas DataFrame.

  The format is chosen from the file extension (matched case-insensitively).

  Args:
    file_path: Path (str or os.PathLike) to a ``.parquet`` or ``.csv`` file.

  Returns:
    pandas.DataFrame with the file contents. Parquet files are read with the
    ``fastparquet`` engine.

  Raises:
    ValueError: If ``file_path`` is empty/None or its extension is not
      ``.parquet`` or ``.csv``.
  """
  # Fail loudly instead of returning None (which would only surface later as
  # an unrelated error in the caller).
  if not file_path:
    raise ValueError("file_path is empty; expected a path to a .parquet or .csv file")
  path = os.fspath(file_path)
  extension = path.lower()
  if extension.endswith('.parquet'):
    return pd.read_parquet(path, engine='fastparquet')
  if extension.endswith('.csv'):
    return pd.read_csv(path)
  raise ValueError(f"Unsupported file type for {path!r}; expected .parquet or .csv")


def get_close_cols(df):
  """Return the 'Date' column together with every column whose name contains 'Close'.

  Args:
    df: DataFrame that has a 'Date' column.

  Returns:
    A DataFrame with 'Date' first, followed by the matching columns in their
    original order.
  """
  selected = ['Date']
  for name in df.columns:
    if 'Close' in name:
      selected.append(name)
  return df[selected]
  
def engle_granger_test(series1, series2, significance=0.05, alpha=0.05):
  """Fit an OLS regression of ``series2`` on ``series1`` with an intercept.

  Args:
    series1: Explanatory series (regressor).
    series2: Dependent series (regressand).
    significance: Accepted but not used by this function.
    alpha: Accepted but not used by this function.

  Returns:
    The fitted statsmodels OLS results object; callers read ``.resid`` from it.
  """
  # add_constant appends the column that gives the model its intercept term.
  regressors = sm.add_constant(series1)
  return sm.OLS(series2, regressors).fit()

def find_cointegrated_pairs(significance=0.05,alpha=0.05):
  """Screen a fixed list of tickers for cointegrated pairs and print the best five.

  The data file named by the ``stock_data_path`` environment variable is
  loaded and reduced to its ``Close__<TICKER>`` columns. For every unordered
  ticker pair, the second ticker's close is regressed on the first (OLS with
  intercept) and an ADF test is run on the residuals. Pairs whose ADF p-value
  is below ``significance`` are kept, sorted by ascending p-value, and up to
  five of them are printed.

  NOTE(review): the ADF p-value is taken as-is, although the residuals come
  from an estimated regression; ``statsmodels.tsa.stattools.coint`` applies
  the Engle-Granger-specific critical values and may be the better choice.

  Args:
    significance: p-value threshold below which a pair is reported.
    alpha: Accepted but not used by this function.

  Returns:
    None. Results are printed to stdout.

  Raises:
    RuntimeError: If the ``stock_data_path`` environment variable is not set.
  """
  file_path = os.getenv('stock_data_path')
  if not file_path:
    raise RuntimeError("Environment variable 'stock_data_path' is not set")
  df = get_close_cols(load_data(file_path))
  tickers = ['NVDA', 'AMD', 'MSFT', 'GOOGL', 'AAPL', 'V', 'MA', 'CRM', 'ADBE', 'INTC', 'QCOM', 'CSCO', 'ANET', 'ORCL', 'SAP', 'UBER', 'LYFT', 'META', 'SNAP']

  # Engle-Granger style two-step procedure over every unordered ticker pair.
  copairs = []
  for idx, ticker1 in enumerate(tickers):
    for ticker2 in tickers[idx + 1:]:
      col1 = f"Close__{ticker1}"
      col2 = f"Close__{ticker2}"
      # Keep only rows where both prices are present so that NaNs do not
      # propagate into the regression and the ADF test.
      pair = df[[col1, col2]].dropna()
      if pair.empty:
        continue
      results = engle_granger_test(pair[col1], pair[col2])
      # Step two: ADF test on the regression residuals; index 1 is the p-value.
      p_value = adfuller(results.resid)[1]
      # A p-value below the significance level rejects the unit-root null,
      # i.e. the residuals look stationary, so the pair is kept as a candidate.
      if p_value < significance:
        copairs.append((p_value, (ticker1, ticker2)))
  copairs.sort(key=lambda x: x[0])
  print("Top 5 cointegrated pairs:")
  print("p-value, (ticker1,ticker2)")
  # Slicing tolerates fewer than five qualifying pairs.
  for p_value, pair_names in copairs[:5]:
    print(p_value, pair_names)

# Test all ticker pair combinations.
# The function prints the p-values of the best pairs; it does not return them.
find_cointegrated_pairs()

Top 5 cointegrated pairs:
p-value, (ticker1,ticker2)
8.906588015028236e-05 ('V', 'MA')
0.001686847965818568 ('LYFT', 'SNAP')
0.0030841812599151524 ('MA', 'ORCL')
0.0054026249824672325 ('NVDA', 'MA')
0.0061153620321635046 ('SAP', 'META')
