In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# import yfinance
import yfinance as yf

# import library for plotting candlestick charts
import mplfinance as mpf

In [None]:
# Exploratory pull: one year of hourly Coca-Cola (KO) bars from Yahoo Finance.
# The bare expression on the last line renders the frame as the cell output.
equity1_df = yf.download(
    'KO',
    period='1y',
    interval='1h',
)
equity1_df

In [None]:
# Hourly price history for Coca-Cola (KO) and PepsiCo (PEP).
# NOTE(review): Yahoo is understood to serve intraday bars only for a limited
# trailing window -- confirm this date range is still downloadable.
# For a long daily history instead, drop `interval` and widen the dates
# (e.g. start='2008-01-01').
START_DATE = '2022-05-01'
END_DATE = '2023-12-31'


def _download_hourly(ticker):
    """Download hourly OHLCV bars for `ticker` between START_DATE and END_DATE.

    Returns a DataFrame with flat, single-level column names that includes an
    'Adj Close' column.
    """
    # auto_adjust=False is requested explicitly so the separate 'Adj Close'
    # column plotted below is present regardless of the library default.
    prices = yf.download(
        ticker,
        start=START_DATE,
        end=END_DATE,
        interval='1h',
        auto_adjust=False,
    )
    # Some yfinance releases return (field, ticker) MultiIndex columns even for
    # a single ticker; keep only the field level so later cells can use plain
    # names such as 'Close'. (Assumes level 0 is the field name -- TODO confirm
    # against the installed yfinance version.)
    if isinstance(prices.columns, pd.MultiIndex):
        prices.columns = prices.columns.get_level_values(0)
    return prices


coke_df = _download_hourly('KO')
pepsi_df = _download_hourly('PEP')

# plot the adjusted close price data
plt.figure(figsize=(12, 8))
plt.plot(coke_df['Adj Close'], label='Coke')
plt.plot(pepsi_df['Adj Close'], label='Pepsi')
plt.title('Coke vs Pepsi Adj Close Price History')
plt.xlabel('Date')
plt.ylabel('Adj Close')
# Without a legend the `label=` arguments above are never displayed.
plt.legend()
plt.show()


In [None]:
# Candlestick chart (with a volume panel) of the 300 most recent Coke bars.
recent_coke_bars = coke_df.tail(300)
mpf.plot(
    recent_coke_bars,
    type='candle',
    style='yahoo',
    volume=True,
    title='Coke Candlestick Chart',
)


In [None]:
# Prefix each ticker's columns so both frames can sit side by side in one table.
coke_df_new = coke_df.add_prefix('Coke_')
pepsi_df_new = pepsi_df.add_prefix('Pepsi_')

# Outer join on the index: keep every row that appears in either frame.
df = coke_df_new.join(pepsi_df_new, how='outer')

# Spread between the two closes (Coke minus Pepsi).
df['Difference'] = df['Coke_Close'].sub(df['Pepsi_Close'])

# Plot the raw spread over time.
df['Difference'].plot(
    title='Coke vs Pepsi Close Price Difference',
    figsize=(12, 8),
)
plt.show()

In [None]:
PERIOD = 100  # look-back length (in rows) of the rolling mean

# Column names are derived from PERIOD so runs with different windows can coexist.
ma_col = f'{PERIOD}_MA_Difference'
dev_col = f'{PERIOD}_MA_Difference_Difference'

# Rolling mean of the spread, then the spread's deviation from that mean.
df[ma_col] = df['Difference'].rolling(PERIOD).mean()
df[dev_col] = df['Difference'].sub(df[ma_col])

# Plot the deviation with a dashed zero line for reference.
df[dev_col].plot(
    title=f'{PERIOD} Period MA Difference Difference',
    figsize=(12, 8),
)
plt.axhline(0, linestyle='--', color='red')
plt.show()

In [None]:
# Standard-deviation bands around the detrended spread (spread minus its
# PERIOD-row moving average), plus flags for rows that fall outside the bands.
MULTIPLIER = 1.75      # band half-width, in rolling standard deviations
STD_WINDOW_MULT = 3    # the std window is PERIOD * STD_WINDOW_MULT rows

deviation_col = f'{PERIOD}_MA_Difference_Difference'

# Compute the rolling std once; the bands are symmetric around zero.
rolling_std = df[deviation_col].rolling(window=PERIOD * STD_WINDOW_MULT).std()
df['Upper_Band'] = rolling_std * MULTIPLIER
df['Lower_Band'] = -rolling_std * MULTIPLIER

# Flag breaches using the SAME series the bands were built from. Comparing the
# raw 'Difference' (a price-level spread) against bands centred on zero would
# flag rows based on the spread's level rather than its deviation.
# Rows where a band is still NaN (warm-up) compare as False.
df['Outside_Upper'] = df[deviation_col] > df['Upper_Band']
df['Outside_Lower'] = df[deviation_col] < df['Lower_Band']

# Plot the deviation, both bands, and a marker on every breach.
ax = df[deviation_col].plot(
    figsize=(12, 8),
    label='Deviation from MA',
    title=f'{PERIOD} Period MA Difference Difference',
)
df['Upper_Band'].plot(ax=ax, label='Upper Band', color='g')
df['Lower_Band'].plot(ax=ax, label='Lower Band', color='r')
# Guard against an empty selection, which has nothing to draw.
if df['Outside_Upper'].any():
    df.loc[df['Outside_Upper'], deviation_col].plot(
        ax=ax, linestyle='none', marker='^', color='g', label='Above upper band'
    )
if df['Outside_Lower'].any():
    df.loc[df['Outside_Lower'], deviation_col].plot(
        ax=ax, linestyle='none', marker='v', color='r', label='Below lower band'
    )

ax.axhline(0, color='red', linestyle='--')
ax.legend()
plt.show()


In [None]:
# Mean-reversion backtest on the detrended spread.
#
# Rules, evaluated row by row:
#   * flat and deviation > Upper_Band  -> short the spread
#   * flat and deviation < Lower_Band  -> long the spread
#   * short and deviation <= 0         -> close the short
#   * long  and deviation >= 0         -> close the long
#
# Exits are checked before entries, so a row that both closes one side and
# breaches the opposite band records the closed trade's return and then opens
# the new position (no P&L is dropped on a flip).
#
# The realised return of a trade is written to df['return'] on its exit row and
# is 0.0 everywhere else. It is the change in the spread divided by the summed
# Coke and Pepsi closes at entry. The spread itself is not used as the
# denominator because it can be negative or near zero, which would flip or
# blow up the sign of the result.
#
# Rows where the deviation or a band is NaN never trigger a rule, because
# comparisons against NaN are False.
signal = df[f'{PERIOD}_MA_Difference_Difference'].to_numpy()
upper_band = df['Upper_Band'].to_numpy()
lower_band = df['Lower_Band'].to_numpy()
spread = df['Difference'].to_numpy()
gross_price = (df['Coke_Close'] + df['Pepsi_Close']).to_numpy()

# Collect results in a float array and assign the column once at the end;
# chained `df['return'].iloc[i] = ...` writes are not reliable in pandas.
trade_returns = np.zeros(len(df), dtype=float)

position = 0          # -1 = short the spread, 0 = flat, 1 = long the spread
entry = np.nan        # spread value at entry of the open trade
entry_cost = np.nan   # Coke close + Pepsi close at entry of the open trade

for i in range(len(df)):
    # --- exits: close once the deviation has reverted to (or through) zero ---
    if position == -1 and signal[i] <= 0:
        exitv = spread[i]
        trade_returns[i] = (entry - exitv) / entry_cost
        position = 0
        # print(f'Closing the short at {exitv}')
    elif position == 1 and signal[i] >= 0:
        exitv = spread[i]
        trade_returns[i] = (exitv - entry) / entry_cost
        position = 0
        # print(f'Closing the long at {exitv}')

    # --- entries: only while flat ---
    if position == 0:
        if signal[i] > upper_band[i]:
            position = -1
            entry = spread[i]
            entry_cost = gross_price[i]
            print(f'Shorting the spread at {entry}')
        elif signal[i] < lower_band[i]:
            position = 1
            entry = spread[i]
            entry_cost = gross_price[i]
            print(f'Longing the spread at {entry}')

df['return'] = trade_returns

In [None]:
# Cumulative return: turn each row's return into a growth factor (1 + r)
# and take the running product.
growth_factors = df['return'].add(1)
plt.plot(growth_factors.cumprod())

In [None]:
# rather than ma, try arima model
# Import ARIMA from `statsmodels.tsa.arima.model`: the legacy
# `statsmodels.tsa.arima_model.ARIMA` class has been removed from current
# statsmodels releases and can no longer be instantiated.
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.stattools import adfuller

# Augmented Dickey-Fuller test on the raw spread. NaNs (rows missing either
# close) are dropped first because the test needs a complete series.
# With the default arguments, result[0] is the test statistic, result[1] the
# p-value and result[4] the dict of critical values.
result = adfuller(df['Difference'].dropna())
print(f'ADF Statistic: {result[0]}')
print(f'p-value: {result[1]}')
print(f'Critical Values: {result[4]}')
