## Cointegrated Variables

In [52]:
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.stattools import coint
from statsmodels.tsa.vector_ar.vecm import VECM

import yfinance as yf
import numpy as np
import pandas as pd

In [42]:
# Load the daily price history for each ticker from its local CSV file.
# Both files are read with the same options: the 'Date' column is parsed
# as datetimes and becomes the row index.
price_csv_options = {'parse_dates': ['Date'], 'index_col': 'Date'}

google_data = pd.read_csv('data/GOOGL.csv', **price_csv_options)
apple_data = pd.read_csv('data/AAPL.csv', **price_csv_options)

In [43]:
# Display the Apple frame to confirm that the Date index and the
# Open/High/Low/Close/Volume columns loaded as expected.
# NOTE(review): the 'Unnamed: 0' column appears to be a leftover row counter
# saved with the CSV; it is not used by the analysis below.
apple_data

Unnamed: 0_level_0,Unnamed: 0,Open,High,Low,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2017-01-09,0,27.407523,27.751425,27.405200,27.649183,134247600
2017-01-10,1,27.598065,27.739809,27.488855,27.677071,97848400
2017-01-11,2,27.591087,27.867603,27.558556,27.825777,110354400
2017-01-12,3,27.628269,27.721216,27.467936,27.709597,108344800
2017-01-13,4,27.677066,27.795573,27.607356,27.660801,104447600
...,...,...,...,...,...,...
2024-06-12,1868,207.369995,220.199997,206.899994,213.070007,198134300
2024-06-13,1869,214.740005,216.750000,211.600006,214.240005,97862700
2024-06-14,1870,213.850006,215.169998,211.300003,212.490005,70122700
2024-06-17,1871,213.369995,218.949997,212.720001,216.669998,93728300


In [44]:
# Display the Google frame to confirm that the Date index and the
# Open/High/Low/Close/Volume columns loaded as expected.
# NOTE(review): the 'Unnamed: 0' column appears to be a leftover row counter
# saved with the CSV; it is not used by the analysis below.
google_data

Unnamed: 0_level_0,Unnamed: 0,Open,High,Low,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2017-01-09,0,41.271134,41.473900,41.033907,41.311588,28178000
2017-01-10,1,41.306095,41.422962,41.109822,41.253155,23948000
2017-01-11,2,41.283622,41.447431,41.026417,41.445435,26508000
2017-01-12,3,41.371518,41.471406,41.003441,41.428955,26990000
2017-01-13,4,41.502365,41.684655,41.428453,41.499371,25804000
...,...,...,...,...,...,...
2024-06-12,1868,178.250000,180.410004,176.110001,177.789993,27864700
2024-06-13,1869,176.110001,176.740005,174.880005,175.160004,20913300
2024-06-14,1870,174.220001,177.059998,174.149994,176.789993,18063600
2024-06-17,1871,175.460007,178.360001,174.809998,177.240005,19618500


### Testing for Stationarity

In [45]:
def adf_test(series):
    '''
    Run the Augmented Dickey-Fuller test on ``series`` and print a report.

    The series is handed to ``adfuller`` with its default settings. The
    first four entries of the returned tuple are printed with their labels
    (test statistic, p-value, lags used, observations used), followed by
    one line per critical value and a trailing blank gap.

    Parameters
    ----------
    series : array-like
        Observations to test; passed to ``adfuller`` unchanged.

    Returns
    -------
    None
        The report is written to stdout only.
    '''
    adf_output = adfuller(series)

    # zip stops after the four labels, so only the headline figures print here.
    headline_labels = ('ADF Statistic', 'p-value', '# Lags Used', 'Number of Observations Used')
    for label, value in zip(headline_labels, adf_output):
        print(f'{label}: {value}')

    # Element 4 maps significance levels (e.g. '1%') to critical values.
    critical_values = adf_output[4]
    for level in critical_values:
        print(f'Critical Value {level}: {critical_values[level]}')

    print("\n")

In [46]:
# ADF test on Apple's closing prices in levels (not differenced).
# A large p-value means the unit-root null cannot be rejected, i.e. the
# series is treated as non-stationary.
adf_test(apple_data['Close'])

ADF Statistic: -0.16749913317425194
p-value: 0.9422863752570497
# Lags Used: 0
Number of Observations Used: 1872
Critical Value 1%: -3.4338480179204556
Critical Value 5%: -2.863085177979608
Critical Value 10%: -2.567592596439203




In [47]:
# ADF test on Google's closing prices in levels (not differenced).
# A large p-value means the unit-root null cannot be rejected, i.e. the
# series is treated as non-stationary.
adf_test(google_data['Close'])

ADF Statistic: 0.20347529085206237
p-value: 0.9724558401866943
# Lags Used: 9
Number of Observations Used: 1863
Critical Value 1%: -3.433864939927475
Critical Value 5%: -2.8630926485318433
Critical Value 10%: -2.567596574226058




### Testing for Cointegration

### Engle-Granger Test

In [48]:
# Engle-Granger cointegration test between the Apple and Google closing prices.
# The result holds the test statistic, the p-value and the critical values.
coint_result = coint(apple_data['Close'], google_data['Close'])
eg_statistic, eg_pvalue, eg_critical_values = coint_result

print('---Engle-Granger Cointegration Test---')
print(f'Cointegration Test Statistic: {eg_statistic}')
print(f'p-value: {eg_pvalue}')
print(f'Critical Values: {eg_critical_values}')


---Engle-Granger Cointegration Test---
Cointegration Test Statistic: -2.373910484271602
p-value: 0.3373090317685453
Critical Values: [-3.90229994 -3.33939589 -3.04671637]


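Interpretation:
* The null hypothesis is that the two series are *not* cointegrated. If the test statistic is lower (more negative) than the critical value at a given level (1%, 5%, or 10%, listed in that order), we reject the null hypothesis of no cointegration at that level.
* A low p-value (e.g., < 0.05) suggests that the series are cointegrated.
* Here the statistic (-2.37) is above even the 10% critical value (-3.05) and the p-value is 0.34, so we cannot reject the null: this sample gives no evidence that the Apple and Google closing prices are cointegrated.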

### Error Correction Model (ECM)

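If the variables are cointegrated, you can proceed to estimate an Error Correction Model (ECM), which models both the short-term dynamics and the adjustment back toward the long-term equilibrium. The cell below fits the multivariate version, a Vector Error Correction Model (VECM). Note that the Engle-Granger test above did not find cointegration between Apple and Google, so the fit is shown for illustration only.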

In [51]:
# Pair the two closing-price series on their shared Date index instead of by
# row position: positional pairing (via .values) would silently misalign the
# series if a trading day were missing from one of the CSV files.
close_prices = pd.concat(
    {'Apple': apple_data['Close'], 'Google': google_data['Close']},
    axis=1,
    join='inner',
)

# Check for NaN values: prints the number of missing observations per column.
print(close_prices.isnull().sum())

# Drop incomplete rows and switch to a plain integer index, so `data` has the
# same shape and column names as before (a DatetimeIndex without a frequency
# may make statsmodels emit a warning).
data = close_prices.dropna().reset_index(drop=True)

# k_ar_diff=1 includes one lagged difference of each series.
# NOTE(review): coint_rank=1 assumes exactly one cointegrating relation. The
# Engle-Granger test earlier in this notebook did not reject "no
# cointegration" for this pair, so treat the fitted model as illustrative.
vecm_model = VECM(data, k_ar_diff=1, coint_rank=1)
vecm_fit = vecm_model.fit()

# Summary of the VECM model
print(vecm_fit.summary())


Apple     0
Google    0
dtype: int64
Det. terms outside the coint. relation & lagged endog. parameters for equation Apple
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
L1.Apple      -0.0346      0.029     -1.202      0.229      -0.091       0.022
L1.Google      0.0157      0.034      0.469      0.639      -0.050       0.081
Det. terms outside the coint. relation & lagged endog. parameters for equation Google
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
L1.Apple      -0.0253      0.025     -1.025      0.305      -0.074       0.023
L1.Google     -0.0391      0.029     -1.357      0.175      -0.096       0.017
               Loading coefficients (alpha) for equation Apple                
                 coef    std err          z      P>|z|      [0.025      0.975]
--

## Cola vs Pepsi

In [40]:
# Download daily price history for both tickers from Yahoo Finance in a
# single request.
# auto_adjust=False is passed explicitly because recent yfinance releases
# default to auto_adjust=True, which omits the 'Adj Close' column used here.
# Requesting a list of tickers makes ['Adj Close'] a frame with one column per
# ticker, so each ticker can be selected as a plain Series.
adj_close = yf.download(
    ['KO', 'PEP'], start='2020-01-01', end='2023-01-01', auto_adjust=False
)['Adj Close']
ko = adj_close['KO']
pep = adj_close['PEP']

# Create DataFrame; rows where either price is missing are dropped because
# the ADF and Engle-Granger tests below do not accept NaN values.
df_stocks = pd.DataFrame({'KO': ko, 'PEP': pep}).dropna()

# Check for Stationarity (ADF Test) on each price series in levels
adf_test(df_stocks['KO'])
adf_test(df_stocks['PEP'])

# Test for Cointegration (Engle-Granger); index 1 of the result is the p-value
coint_result_stocks = coint(df_stocks['KO'], df_stocks['PEP'])
print(f'Engle-Granger Test p-value: {coint_result_stocks[1]}')


[*********************100%***********************]  1 of 1 completed


[*********************100%***********************]  1 of 1 completed

ADF Statistic: -0.987211447103292
p-value: 0.7579000078934196
# Lags Used: 12
Number of Observations Used: 743
Critical Value 1%: -3.439181811684251
Critical Value 5%: -2.865437807603377
Critical Value 10%: -2.5688456132154935


ADF Statistic: -0.8326557351201618
p-value: 0.8093410247745684
# Lags Used: 9
Number of Observations Used: 746
Critical Value 1%: -3.439146171679794
Critical Value 5%: -2.865422101274577
Critical Value 10%: -2.568837245865348


Engle-Granger Test p-value: 0.09340001664225933



