### Optimal pairs through correlation strategy as outlined in:
#### (ChenH, ChenS, ChenZ, LiF (2017) Empirical investigation of an equity pairs trading strategy. Management Science)

    * Calculate Pearson correlation for all pair combinations and pick the pairs with the maximal correlation
    * Assets from cluster #1 in DBSCAN (energy cluster)

In [21]:
import pandas as pd
import pandas_ta as ta
import matplotlib.pyplot as plt
import numpy as np
import scipy.stats as stats
from scipy.stats import pearsonr
import yfinance as yf
import itertools
import math
import random
from numpy import linalg as LA

In [50]:
#Asset list from DBSCAN clusters - utilities
lst=('LNT','AEE','AEP','ATO','CNP','CMS','ED','D','DTE','DUK','EIX','ETR','EVRG','ES','EXC','FE')

#dates for model calibration
start_data='2021-09-01'
end_data='2022-09-01'

In [51]:
pair_order_list = itertools.combinations(lst,2)
pairs=list(pair_order_list)

In [59]:
ret_df= pd.DataFrame()
for j in range(0,len(lst)):
    X = yf.download(lst[j], start = start_data, end=end_data)
    X.reset_index(inplace=True)
    X.tail()
    X['Log_Returns'] = np.log(X['Adj Close']) - np.log(X['Adj Close'].shift(1))
    x = X['Log_Returns'].drop(labels=0, axis=0)
    date=X['Date'].drop(labels=0,axis=0)
    date=date.to_frame()

    x_df=x.to_frame().dropna()

    b=x_df.rename(columns={'Log_Returns': lst[j]})
    b=b[lst[j]].values
    ret_df[lst[j]]=b

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%********

In [60]:
ret_df

Unnamed: 0,LNT,AEE,AEP,ATO,CNP,CMS,ED,D,DTE,DUK,EIX,ETR,EVRG,ES,EXC,FE
0,0.006629,0.002476,0.004740,0.000509,0.010141,0.008572,0.008986,0.006869,0.006242,0.004982,0.015166,0.010432,0.004797,0.010419,0.010030,0.012351
1,-0.009391,-0.009826,-0.005735,-0.008172,0.006575,-0.009956,-0.010819,-0.006486,-0.011364,-0.009137,-0.008151,-0.010165,-0.006548,-0.006825,-0.005605,-0.008301
2,-0.028380,-0.016351,-0.016730,-0.014151,-0.005412,-0.022417,-0.018786,-0.024017,-0.012249,-0.023747,-0.000341,-0.000533,-0.018714,-0.024875,-0.006242,-0.028698
3,0.022015,0.026237,0.019712,0.015688,0.025638,0.018715,0.026879,0.017103,0.017040,0.020430,0.007815,0.020500,0.017399,0.023787,0.017023,0.012143
4,-0.006241,-0.006239,-0.003092,-0.015064,0.003394,-0.004336,-0.007045,0.007934,-0.006284,-0.006763,-0.007304,-0.016596,-0.003221,-0.022896,-0.002585,-0.014746
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
246,0.006199,0.006040,0.006989,0.007341,0.003094,0.004317,0.005849,-0.000119,0.003940,-0.002076,0.013357,0.000922,0.004269,0.008763,0.007228,0.008095
247,-0.013400,-0.016012,-0.015205,-0.013615,-0.019339,-0.012717,-0.009394,-0.014578,-0.011716,-0.017956,-0.011768,-0.003440,-0.013438,-0.013446,-0.024523,-0.018245
248,0.003687,0.004841,0.010843,0.003185,0.010652,0.009841,0.006878,0.006359,0.002100,0.004313,0.005471,0.003859,0.009882,0.003161,0.006465,-0.000249
249,-0.015317,-0.020256,-0.015568,-0.016726,-0.013806,-0.019339,-0.009215,-0.015306,-0.015475,-0.014017,-0.011988,-0.021839,-0.011178,-0.012263,-0.021110,-0.011263


In [61]:
pearson_corr_list=[]

for i in range(0,len(pairs)):
    corr= pearsonr(ret_df[pairs[i][0]],ret_df[pairs[i][1]])[0]
    pearson_corr_list.append(corr)


sort_corr_list=list(zip(pairs,pearson_corr_list))
sort_corr_list.sort(key = lambda x: x[1])


In [62]:
sdd1=[]
sdd2=[]
for i in range(0,len(sort_corr_list)):
    sdd1.append(sort_corr_list[i][0][0])
    sdd2.append(sort_corr_list[i][0][1])

selected_stocks = []
selected_pairs = []
opt_asset1=[]
opt_asset2=[]

for i in range(0,len(sort_corr_list)):
    s1=sdd1[i]
    s2=sdd2[i]

    if (s1 not in selected_stocks) and (s2 not in selected_stocks):
        selected_stocks.append(s1)
        selected_stocks.append(s2)
        pair=s1+' and '+s2
        selected_pairs.append(pair)

    if len(selected_pairs) == math.comb(len(lst),2):
        break

opt_asset1=selected_stocks[0:len(selected_stocks)-1:2]
opt_asset2=selected_stocks[1:len(selected_stocks):2]

print('Pairs with maximum Pearson correlation:',selected_pairs)


Pairs with maximum Pearson correlation: ['EIX and FE', 'ATO and EXC', 'CNP and ETR', 'D and EVRG', 'AEP and ED', 'DTE and ES', 'AEE and DUK', 'LNT and CMS']
