### Selecting lags with the partial autocorrelation function (PACF)

In [1]:
# Description:
# Generating the benchmark for the proposed method

In [2]:
# Import libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import yfinance as yf


from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.stattools import pacf
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.stattools import adfuller
import warnings
warnings.filterwarnings("ignore")

from sklearn.compose import TransformedTargetRegressor
from sklearn.preprocessing import MinMaxScaler

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import ElasticNet
from sklearn.svm import SVR

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
import math

import warnings
warnings.filterwarnings("ignore")

In [3]:
# Download the SPY price history with yfinance
# (the data is fetched remotely by yf.download, not read from a local folder)

# Date range passed to yf.download as start / end
startdate = '2012-04-01'
enddate = '2022-06-01'

# Ticker symbol to download
datasource = 'SPY'
data = yf.download(datasource,start = startdate,end = enddate)

# Show the downloaded frame as the cell output
data

[*********************100%***********************]  1 of 1 completed


Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2012-04-02,140.639999,142.210007,140.360001,141.839996,116.761978,151741100
2012-04-03,141.639999,141.880005,140.429993,141.259995,116.284515,155806700
2012-04-04,140.220001,140.339996,139.339996,139.860001,115.132027,146896000
2012-04-05,139.380005,140.199997,139.259995,139.789993,115.074417,137439400
2012-04-09,138.029999,139.839996,137.839996,138.220001,113.782028,127555900
...,...,...,...,...,...,...
2022-05-24,392.559998,395.149994,386.959991,393.890015,392.195831,91448800
2022-05-25,392.309998,399.450012,391.890015,397.369995,395.660858,91472900
2022-05-26,398.670013,407.040009,398.450012,405.309998,403.566711,82168300
2022-05-27,407.910004,415.380005,407.700012,415.260010,413.473907,84768700


In [4]:
# ADF test and differencing operation

def adf_with_diff(series, significance=0.05):
    """Return ``series`` as a pandas Series, first-differenced if the ADF test
    does not reject a unit root.

    Parameters
    ----------
    series : pandas.Series or array-like
        Univariate time series to test.
    significance : float, default 0.05
        p-value threshold of the Augmented Dickey-Fuller test. When the
        p-value is above it, the series is treated as non-stationary and
        differenced once.

    Returns
    -------
    pandas.Series
        The input unchanged (stationary case), or its first difference with
        the leading NaN removed (non-stationary case).

    Notes
    -----
    Differencing is applied at most once and the differenced series is not
    re-tested.
    """
    # Convert up-front so that `.diff()` is available for any array-like input,
    # not only for inputs that already are pandas Series.
    series = pd.Series(series)

    # adfuller returns (statistic, p-value, ...); index 1 is the p-value.
    p_value = adfuller(series)[1]
    if p_value > significance:
        return series.diff().dropna()
    return series

In [5]:
# Lag selection according to PACF value

def pacf_lag_selection(data, nlags=30, threshold=0.05):
    """Select, for every column of ``data``, the lags whose PACF is large.

    Each column is passed through ``adf_with_diff`` (differenced once when
    the ADF test finds it non-stationary) and its partial autocorrelation
    function is computed up to ``nlags``. A lag is kept when the absolute
    PACF value is strictly greater than ``threshold``.

    Parameters
    ----------
    data : pandas.DataFrame
        One time series per column.
    nlags : int, default 30
        Largest lag passed to ``pacf``.
    threshold : float, default 0.05
        Minimum absolute PACF value for a lag to be selected.

    Returns
    -------
    list of list of int
        One list of selected lags per column, in column order. Lag 0 is
        never included.
    """
    lag_list = []
    for column in data.columns:
        series = adf_with_diff(data[column])
        significant = np.abs(pacf(series, nlags=nlags)) > threshold
        # Exclude lag 0 explicitly by its value instead of slicing off the
        # first selected entry, so a real lag is never dropped by accident.
        selected = [lag for lag, keep in enumerate(significant) if keep and lag > 0]
        lag_list.append(selected)

    return lag_list

In [6]:
def lag_to_data(lag_list,data,target):
    """Build the supervised-learning table of lagged features plus target.

    Parameters
    ----------
    lag_list : list of list of int
        ``lag_list[k]`` holds the lags to apply to the k-th column of
        ``data`` (positional match with ``data.columns``).
    data : pandas.DataFrame
        Source time series, one per column.
    target : column label
        Column of ``data`` used, unshifted, as the prediction target.

    Returns
    -------
    pandas.DataFrame
        One column per (column, lag) pair in ``lag_list`` order, followed by
        the target as the last column. Columns are labelled with positional
        integers, the row index is the positional row number in ``data``,
        and every row containing a NaN (e.g. the rows lost to shifting) is
        dropped.

    Side effects
    ------------
    Prints the number of feature columns.
    """
    cols = data.columns

    # Start from an empty float block so the dtype promotion matches a
    # float array even when no lag is selected; pieces are collected first
    # and concatenated once instead of re-copying the array on every lag.
    pieces = [np.empty((data.shape[0], 0))]
    for position, lags in enumerate(lag_list):
        for lag in lags:
            pieces.append(pd.DataFrame(data[cols[position]]).shift(lag))
    pieces.append(pd.DataFrame(data[target]))

    data_selected = pd.DataFrame(np.concatenate(pieces, axis=1)).dropna()

    print('Number of selected features: ', data_selected.shape[1] - 1)

    return data_selected

In [7]:
# Function splitting the data into train and test sets

# Fraction of the rows (counted from the top) assigned to the training set
train_test_ratio = 0.7

def train_test_split(data, train_test_ratio):
    """Cut ``data`` into a leading training part and a trailing test part.

    The rows are not shuffled: the first ``round(n_rows * train_test_ratio)``
    rows form the training set and the remaining rows form the test set.

    Parameters
    ----------
    data : pandas.DataFrame
        Table to split; row order is preserved.
    train_test_ratio : float
        Fraction of rows that goes to the training set.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, test)``.
    """
    n_train = round(train_test_ratio * data.shape[0])
    leading_rows = data.iloc[:n_train, :]
    trailing_rows = data.iloc[n_train:, :]
    return leading_rows, trailing_rows

In [8]:
def linear_regression_result(train,test):
    """Fit a linear regression on ``train`` and print its RMSE on ``test``.

    In both frames the last column is the target and all preceding columns
    are features. The regressor is wrapped in ``TransformedTargetRegressor``
    with a ``MinMaxScaler`` applied to the target; the features are passed
    through as-is.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Training and evaluation tables with identical column layout.

    Returns
    -------
    None
        The RMSE is printed, framed by a header line and blank-ish lines.
    """
    print(' ')
    print('RMSE of Linear Regression:')

    # Everything but the last column is a feature; the last column is kept
    # two-dimensional (a one-column frame) as the target.
    features_train, target_train = train.iloc[:, :-1], train.iloc[:, -1:]
    features_test, target_test = test.iloc[:, :-1], test.iloc[:, -1:]

    scaled_target_model = TransformedTargetRegressor(
        regressor=LinearRegression(),
        transformer=MinMaxScaler(),
    )
    scaled_target_model.fit(features_train, target_train)
    predictions = scaled_target_model.predict(features_test)

    rmse = math.sqrt(mean_squared_error(predictions, target_test))
    print(rmse)
    print(' ')

In [9]:
def random_forest_result(train,test,random_state=None):
    """Fit a random forest on ``train`` and print its RMSE on ``test``.

    In both frames the last column is the target and all preceding columns
    are features. The regressor is wrapped in ``TransformedTargetRegressor``
    with a ``MinMaxScaler`` applied to the target; the features are passed
    through as-is.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Training and evaluation tables with identical column layout.
    random_state : int or None, default None
        Seed forwarded to ``RandomForestRegressor``. Pass an integer to make
        the printed RMSE reproducible between runs; ``None`` keeps the
        unseeded behaviour.

    Returns
    -------
    None
        The RMSE is printed.
    """
    print(' ')
    # The header previously read "Linear Regression" (copy-paste slip).
    print('RMSE of Random Forest:')

    train_x, train_y = train.iloc[:, :-1], train.iloc[:, -1:]
    test_x, test_y = test.iloc[:, :-1], test.iloc[:, -1:]

    model = TransformedTargetRegressor(
        regressor=RandomForestRegressor(random_state=random_state),
        transformer=MinMaxScaler(),
    ).fit(train_x, train_y)
    y_pred = model.predict(test_x)

    # Arguments follow the documented (y_true, y_pred) order; MSE is
    # symmetric, so the value is unchanged.
    print(math.sqrt(mean_squared_error(test_y, y_pred)))
    print(' ')

In [10]:
def elastic_net_result(train,test):
    """Grid-search an ElasticNet on ``train`` and print its RMSE on ``test``.

    In both frames the last column is the target and all preceding columns
    are features. ``alpha`` and ``l1_ratio`` are chosen by ``GridSearchCV``
    (default cross-validation, scored by negative MSE) and the estimator
    refit on the full training set with the winning parameters is used for
    prediction. Unlike the other ``*_result`` helpers in this notebook, the
    target is not min-max scaled here.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Training and evaluation tables with identical column layout.

    Returns
    -------
    None
        The RMSE is printed.
    """
    print(' ')
    # The header previously read "Linear Regression" (copy-paste slip).
    print('RMSE of Elastic Net:')

    train_x, train_y = train.iloc[:, :-1], train.iloc[:, -1:]
    test_x, test_y = test.iloc[:, :-1], test.iloc[:, -1:]

    # Hyper-parameter grid.
    # NOTE(review): alpha=0.0 reduces ElasticNet to unpenalised least
    # squares, which scikit-learn advises against for this estimator --
    # confirm it is meant to be part of the grid.
    grid = {
        'alpha': [0.0, 1.0, 10.0, 100.0],
        'l1_ratio': [0.1, 0.5, 0.7, 0.9, 0.95, 0.99, 1],
    }

    # NOTE(review): the default CV splitter is not time-aware; verify this
    # is acceptable for time-ordered rows.
    search = GridSearchCV(ElasticNet(), grid, scoring='neg_mean_squared_error')
    search.fit(train_x, train_y)

    # With the default refit=True the search already holds the estimator
    # refit on all of train_x/train_y with the best parameters, so a second
    # manual fit is unnecessary.
    best_model = search.best_estimator_

    y_pred = best_model.predict(test_x)

    # Arguments follow the documented (y_true, y_pred) order; MSE is
    # symmetric, so the value is unchanged.
    print(math.sqrt(mean_squared_error(test_y, y_pred)))
    print(' ')

In [11]:
def svr_result(train,test):
    """Fit an RBF-kernel SVR on ``train`` and print its RMSE on ``test``.

    In both frames the last column is the target and all preceding columns
    are features. The regressor is wrapped in ``TransformedTargetRegressor``
    with a ``MinMaxScaler`` applied to the target.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Training and evaluation tables with identical column layout.

    Returns
    -------
    None
        The RMSE is printed.
    """
    print(' ')
    # The header previously read "Linear Regression" (copy-paste slip).
    print('RMSE of SVR:')

    train_x, train_y = train.iloc[:, :-1], train.iloc[:, -1:]
    test_x, test_y = test.iloc[:, :-1], test.iloc[:, -1:]

    # NOTE(review): only the target is scaled; the features reach the
    # distance-based RBF kernel unscaled -- confirm this is intended.
    model = TransformedTargetRegressor(
        regressor=SVR(kernel='rbf'),
        transformer=MinMaxScaler(),
    ).fit(train_x, train_y)
    y_pred = model.predict(test_x)

    # Arguments follow the documented (y_true, y_pred) order; MSE is
    # symmetric, so the value is unchanged.
    print(math.sqrt(mean_squared_error(test_y, y_pred)))
    print(' ')

In [12]:
def benchmark_pacf(data,target):
    """Run the full PACF-based benchmark and print one RMSE per model.

    Steps: select lags per column with ``pacf_lag_selection``, assemble the
    lagged feature table with ``lag_to_data``, split it with
    ``train_test_split`` using the module-level ``train_test_ratio``, then
    evaluate the linear regression, random forest, elastic net and SVR
    helpers in that order.

    Parameters
    ----------
    data : pandas.DataFrame
        One time series per column.
    target : column label
        Column of ``data`` to predict.

    Returns
    -------
    None
        All results are printed by the helpers.
    """
    selected_lags = pacf_lag_selection(data)
    lagged_table = lag_to_data(selected_lags, data, target)
    train, test = train_test_split(lagged_table, train_test_ratio)

    evaluators = (
        linear_regression_result,
        random_forest_result,
        elastic_net_result,
        svr_result,
    )
    for evaluate in evaluators:
        evaluate(train, test)

In [13]:
# Run the benchmark on the downloaded frame with the 'Close' column as target;
# the RMSE of each model is printed below.
benchmark_pacf(data,'Close')

Number of selected features:  27
 
RMSE of Linear Regression:
4.802422072868402
 
 
RMSE of Linear Regression:
98.58506061660857
 
 
RMSE of Linear Regression:
4.974613381857405
 
 
RMSE of Linear Regression:
151.61102339559093
 
