In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from statsmodels.tsa.stattools import adfuller
from scipy import stats
from statsmodels.tsa.api import VAR
from statsmodels.tools.eval_measures import rmse, aic
from statsmodels.tsa.ar_model import AutoReg
from statsmodels.tsa.stattools import grangercausalitytests
import pickle

In [2]:
df_aapl = pd.read_csv("df_aaple.csv")
df_small = df_aapl.iloc[:,:4]
df_small.drop(columns="Adj. Close", inplace=True)
df_small["P/E"] = df_small["P/E (LTM)"]
df_small.drop(columns="P/E (LTM)", inplace=True)
df_small["# Buys"] = df_aapl["# Buys"]

df_small_raw = df_small

def algo(df, max_lag):

    # Step 1: Tranformation for stationarity
    features = list(df.columns)[1:]

    for feature in features:
        result = adfuller(df[feature], autolag=None)
        counter = 0
        while result[1] > 0.05:
            df[feature] = df[feature] - df[feature].shift(1)
            #df_small.dropna()
            counter += 1
            #dropna(inplace=False) because it drops one observation for each feature
            result = adfuller(df.dropna()[feature], autolag=None)
        print(f'Order of integration for feature "{feature}" is {counter}')
    df.dropna(inplace=True)
    df.reset_index(drop=True, inplace=True)

    # Step 2: Building a univariate model and finding the optimal l
    BICs = []
    for i in list(range(max_lag)):
        model = AutoReg(df.iloc[:,1], lags=i).fit()
        BICs.append(model.bic)

    min_bic_ind = BICs.index(min(BICs))

    # model = AutoReg(df_small.iloc[:,1], lags=min_bic_ind).fit()
    # model.summary()


    # Step 2: Bulding augmented model and finding the optimal w for each Xi
    
    # Defining dictionary to store all augmented models
    aug_models = {}
    feature_n_dfs = {}
    feature_n_dfs_merge = []
    
    for n in list(range(1, len(features))):
        columns = []
        for i in list(range(1, max_lag+1)):
            columns.append(features[n]+".L"+str(i))

        feature_n_df = pd.DataFrame(columns=columns)
        for i in list(range(max_lag)):
            feature_n_df[columns[i]] = df[features[n]].shift(i+1)

        feature_n_df.fillna(1, inplace=True)

        BICs = []
        #Why do I have max_lag-1 and then i+1?
        for i in list(range(max_lag-1)):
            model = AutoReg(df.iloc[:,1], lags=min_bic_ind, exog=feature_n_df.iloc[:,:i+1]).fit()
            BICs.append(model.bic)

        min_bic_ind_aug = BICs.index(min(BICs))
        #Full and Partial autocorrelation plot?
        feature_n_df1 = feature_n_df
        feature_n_df = feature_n_df.iloc[:,:min_bic_ind_aug+1]

        model = AutoReg(df.iloc[:,1], lags=min_bic_ind, exog=feature_n_df).fit()

        if grangercausalitytests(df[[features[1], features[0]]], maxlag=[min_bic_ind_aug+1])[min_bic_ind_aug+1][0]['params_ftest'][1] <= 0.05:
            aug_models[features[n]] = model
            feature_n_dfs[features[n]] = feature_n_df1
            feature_n_dfs_merge.append(feature_n_df)
            #model.summary()
        else:
            continue


        # aug_models[features[n]] = model
        # feature_n_dfs[features[n]] = feature_n_df1
        # feature_n_dfs_merge.append(feature_n_df)
        # #model.summary()
    feature_n_dfs_merge = pd.concat(feature_n_dfs_merge, axis=1)

    fin_model = AutoReg(df.iloc[:,1], lags=min_bic_ind, exog=feature_n_dfs_merge).fit()

    MAE = np.nanmean(abs(fin_model.predict() - df.iloc[:,1]))

    return fin_model, aug_models, feature_n_dfs, feature_n_dfs_merge, MAE

In [3]:
fin_model, aug_models, dfs, dfs_merged, MAE = algo(df=df_small, max_lag=20)

Order of integration for feature "Close" is 1
Order of integration for feature "P/E" is 1
Order of integration for feature "# Buys" is 1

Granger Causality
number of lags (no zero) 3
ssr based F test:         F=3.2990  , p=0.0196  , df_denom=4243, df_num=3
ssr based chi2 test:   chi2=9.9132  , p=0.0193  , df=3
likelihood ratio test: chi2=9.9017  , p=0.0194  , df=3
parameter F test:         F=3.2990  , p=0.0196  , df_denom=4243, df_num=3

Granger Causality
number of lags (no zero) 1
ssr based F test:         F=1.0619  , p=0.3028  , df_denom=4249, df_num=1
ssr based chi2 test:   chi2=1.0627  , p=0.3026  , df=1
likelihood ratio test: chi2=1.0626  , p=0.3026  , df=1
parameter F test:         F=1.0619  , p=0.3028  , df_denom=4249, df_num=1


In [4]:
fin_model.summary()

0,1,2,3
Dep. Variable:,Close,No. Observations:,4253.0
Model:,AutoReg-X(8),Log Likelihood,-4775.972
Method:,Conditional MLE,S.D. of innovations,0.745
Date:,"Fri, 20 May 2022",AIC,9577.944
Time:,18:04:55,BIC,9660.539
Sample:,8,HQIC,9607.134
,4253,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,0.0382,0.012,3.306,0.001,0.016,0.061
Close.L1,-0.0775,0.017,-4.523,0.000,-0.111,-0.044
Close.L2,-0.0835,0.017,-4.863,0.000,-0.117,-0.050
Close.L3,0.0711,0.017,4.132,0.000,0.037,0.105
Close.L4,-0.0555,0.015,-3.623,0.000,-0.086,-0.025
Close.L5,-0.0665,0.015,-4.337,0.000,-0.097,-0.036
Close.L6,-0.0303,0.015,-1.975,0.048,-0.060,-0.000
Close.L7,0.0118,0.015,0.768,0.442,-0.018,0.042
Close.L8,-0.0852,0.015,-5.591,0.000,-0.115,-0.055

0,1,2,3,4
,Real,Imaginary,Modulus,Frequency
AR.1,-1.2331,-0.5655j,1.3566,-0.4316
AR.2,-1.2331,+0.5655j,1.3566,0.4316
AR.3,-0.4798,-1.1793j,1.2732,-0.3115
AR.4,-0.4798,+1.1793j,1.2732,0.3115
AR.5,1.2138,-0.5986j,1.3534,-0.0729
AR.6,1.2138,+0.5986j,1.3534,0.0729
AR.7,0.5681,-1.3507j,1.4653,-0.1866
AR.8,0.5681,+1.3507j,1.4653,0.1866


In [5]:
MAE

0.4564910413023767

In [33]:
predicted = fin_model.predict()
predicted

0            NaN
1            NaN
2            NaN
3            NaN
4            NaN
          ...   
4248   -0.098360
4249   -0.133332
4250   -0.263861
4251    0.030596
4252    0.030820
Length: 4253, dtype: float64

In [17]:
dfs['P/E']

Unnamed: 0,P/E.L1,P/E.L2,P/E.L3,P/E.L4,P/E.L5,P/E.L6,P/E.L7,P/E.L8,P/E.L9,P/E.L10,...,P/E.L21,P/E.L22,P/E.L23,P/E.L24,P/E.L25,P/E.L26,P/E.L27,P/E.L28,P/E.L29,P/E.L30
0,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,...,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000
1,0.070944,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,...,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000
2,0.303125,0.070944,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,...,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000
3,0.374069,0.303125,0.070944,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,...,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000
4,-0.451463,0.374069,0.303125,0.070944,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,...,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4248,0.020755,-0.026415,-0.145283,0.009434,0.069811,0.360377,0.141509,0.045283,0.181132,0.028302,...,0.149056,0.509433,-1.056278,-0.407115,0.201581,-0.450593,-0.142292,-0.230237,0.400198,-0.025692
4249,0.309434,0.020755,-0.026415,-0.145283,0.009434,0.069811,0.360377,0.141509,0.045283,0.181132,...,0.652830,0.149056,0.509433,-1.056278,-0.407115,0.201581,-0.450593,-0.142292,-0.230237,0.400198
4250,0.150943,0.309434,0.020755,-0.026415,-0.145283,0.009434,0.069811,0.360377,0.141509,0.045283,...,-0.256604,0.652830,0.149056,0.509433,-1.056278,-0.407115,0.201581,-0.450593,-0.142292,-0.230237
4251,0.054717,0.150943,0.309434,0.020755,-0.026415,-0.145283,0.009434,0.069811,0.360377,0.141509,...,0.335849,-0.256604,0.652830,0.149056,0.509433,-1.056278,-0.407115,0.201581,-0.450593,-0.142292


In [10]:
aug_models['P/E'].summary()

0,1,2,3
Dep. Variable:,Close,No. Observations:,4253.0
Model:,AutoReg-X(8),Log Likelihood,-4293.157
Method:,Conditional MLE,S.D. of innovations,0.665
Date:,"Fri, 13 May 2022",AIC,8614.314
Time:,14:47:41,BIC,8703.263
Sample:,8,HQIC,8645.749
,4253,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,0.0376,0.010,3.649,0.000,0.017,0.058
Close.L1,-0.0688,0.015,-4.498,0.000,-0.099,-0.039
Close.L2,-0.0807,0.015,-5.268,0.000,-0.111,-0.051
Close.L3,0.0524,0.015,3.410,0.001,0.022,0.082
Close.L4,-0.0549,0.014,-4.015,0.000,-0.082,-0.028
Close.L5,-0.0479,0.014,-3.498,0.000,-0.075,-0.021
Close.L6,-0.0278,0.014,-2.025,0.043,-0.055,-0.001
Close.L7,0.0164,0.014,1.197,0.231,-0.010,0.043
Close.L8,-0.0605,0.014,-4.441,0.000,-0.087,-0.034

0,1,2,3,4
,Real,Imaginary,Modulus,Frequency
AR.1,-1.2627,-0.5953j,1.3960,-0.4299
AR.2,-1.2627,+0.5953j,1.3960,0.4299
AR.3,-0.4911,-1.2416j,1.3352,-0.3100
AR.4,-0.4911,+1.2416j,1.3352,0.3100
AR.5,1.2768,-0.6376j,1.4272,-0.0737
AR.6,1.2768,+0.6376j,1.4272,0.0737
AR.7,0.6122,-1.4002j,1.5282,-0.1844
AR.8,0.6122,+1.4002j,1.5282,0.1844
