In [155]:
#pip install import-ipynb
import import_ipynb
import warnings
warnings.filterwarnings('ignore')

In [6]:
from utility import regression

In [191]:
import numpy as np
import pandas as pd

import statsmodels
from sklearn.linear_model import LinearRegression as LR

from statsmodels.tsa.stattools import coint
import matplotlib.pyplot as plt
import seaborn as sns

import yfinance as yf
from scipy.stats import norm



In [20]:
# Read the price table from disk and discard its 'Date' column,
# then preview the first rows.
df = pd.read_csv('data.csv').drop(columns='Date')
df.head()

Unnamed: 0,BABA,FB
0,76.690002,102.220001
1,78.629997,102.730003
2,77.330002,102.970001
3,72.720001,97.919998
4,70.800003,97.330002


In [21]:
# Wrap the loaded table in two separate DataFrame objects.
# NOTE(review): neither name is read by the VAR class below, and both are
# reassigned in the statsmodels cross-check cell — confirm they are still needed.
x = pd.DataFrame(data=df)
y = pd.DataFrame(data=df)

In [242]:
class VAR():
    """
    Vector autoregression of order `lag`, estimated by ordinary least squares.

    Every column of the input frame is regressed on the first `lag` lags of
    all columns plus a constant term. Fitting happens in the constructor;
    afterwards the following attributes are available:

    coef          DataFrame (regressors x equations) of estimated coefficients
    coef_std      DataFrame of coefficient standard errors
    tstats        DataFrame of coefficient / standard error
    p_value       DataFrame of two-sided p-values from the standard normal
    error         DataFrame of residuals
    residual_cov  residual cross-product matrix divided by the number of rows
    coef_cov      Kronecker product of inv(X'X) and residual_cov

    NOTE(review): lagged values that do not exist (the first `lag` rows) are
    filled with 0 instead of being dropped, so those rows still enter the
    regression. This is kept because the notebook's statsmodels cross-check
    builds its regressors the same way — confirm this is intended.
    """
    def __init__(self, df, lag, * args):
        """
        Build the lagged design matrix from `df` and fit the model.

        df   : pandas DataFrame, one column per series, rows in time order
        lag  : int, number of lags of every column used as regressors
        args : accepted for backward compatibility and ignored
        """
        self.df = df
        self.lag = lag
        self.coef = np.array([])
        self.columns_name = self.name_lag()
        self.data = self.process_data()
        # Copy the selection: addconst() adds a column to self.x, and writing
        # into a plain selection of self.data raises SettingWithCopyWarning.
        self.x = self.data[self.indepvar_name].copy()
        self.y = self.data[df.columns]

        self.addconst()
        self.fit()

    def name_lag(self,):
        """
        Build the column labels of the combined (current + lagged) table.

        Side effect: sets self.indepvar_name to the lagged labels, ordered
        lag 1 for every series, then lag 2 for every series, and so on.

        Returns the original column labels followed by the lagged labels.
        """
        base = list(self.df.columns)
        self.indepvar_name = [
            f'(Lag_{j + 1}, {col})'
            for j in range(self.lag)
            for col in base
        ]
        return base + self.indepvar_name

    def process_data(self,):
        """
        Return self.df side by side with its shifted copies for lags
        1..self.lag, relabelled with self.columns_name, missing values
        replaced by 0 and the index reset to 0..n-1.
        """
        # Always shift the frame given to the constructor (self.df), never a
        # notebook-level global that happens to be called `df`.
        frames = [self.df] + [self.df.shift(k) for k in range(1, self.lag + 1)]
        r_df = pd.concat(frames, axis=1)
        r_df.columns = self.columns_name
        return r_df.fillna(0).reset_index(drop=True)

    def addconst(self):
        """
        Append a column of ones named 'constant' to the regressor matrix and
        record its label in self.indepvar_name (only once, so that calling
        this method again does not duplicate the label).
        """
        self.x['constant'] = np.ones(len(self.x))
        if 'constant' not in self.indepvar_name:
            self.indepvar_name.append('constant')

    def fit(self):
        """
        Estimate coefficients, residuals, standard errors, t-statistics and
        p-values from self.x and self.y, storing them on the instance.
        """
        n_obs = self.y.shape[0]
        n_reg = self.x.shape[1]

        # inv(X'X) is needed for both the coefficients and their covariance,
        # so compute it once.
        xtx_inv = np.linalg.inv(self.x.T.dot(self.x))
        xty = self.x.T.dot(self.y)

        # Coefficient
        self.coef = pd.DataFrame(xtx_inv.dot(xty.to_numpy()),
                                 index=self.x.columns,
                                 columns=self.y.columns)
        self.error = self.y - self.x.dot(self.coef)
        self.residual_cov = self.error.T.dot(self.error) / n_obs
        self.coef_cov = np.kron(xtx_inv, self.residual_cov.to_numpy())

        # residual_cov divides by n_obs; rescale to n_obs - n_reg for the
        # standard errors. The diagonal of the Kronecker product is ordered
        # regressor-major, which matches a row-major reshape to coef.shape.
        dof_scale = n_obs / (n_obs - n_reg)
        std = np.sqrt(dof_scale * np.diag(self.coef_cov)).reshape(self.coef.shape)
        self.tstats = self.coef / std

        self.coef_std = pd.DataFrame(std,
                                     index=self.x.columns,
                                     columns=self.y.columns)

        # norm.sf(t) equals 1 - norm.cdf(t) but keeps precision for large |t|.
        self.p_value = pd.DataFrame(2 * norm.sf(np.abs(self.tstats.to_numpy())),
                                    index=self.x.columns,
                                    columns=self.y.columns)

    def report(self):
        """
        Return one DataFrame with the coefficient estimates, their standard
        errors and t-statistics under a two-level column header.
        """
        return pd.concat({'Estimate Coefficient': self.coef,
                          'SD of Estimate': self.coef_std,
                          't-Statistic': self.tstats, }, axis=1)

    def _penalised_log_det(self, weight):
        """Log-determinant of residual_cov plus weight * (#coefficients / #rows)."""
        n_obs = self.y.shape[0]
        n_coef = self.x.shape[1] * self.y.shape[1]
        return np.log(np.linalg.det(self.residual_cov)) + weight * n_coef / n_obs

    def AIC(self):
        """Information criterion with penalty weight 2."""
        return self._penalised_log_det(2)

    def BIC(self):
        """Information criterion with penalty weight log(number of rows)."""
        return self._penalised_log_det(np.log(self.y.shape[0]))

    def IC(self, lag):
        """
        Fit models with 1..lag lags on this instance's data and tabulate
        their AIC and BIC.

        lag : int, largest lag order to evaluate

        Returns a DataFrame indexed by 'Lag' with columns 'AIC' and 'BIC'.
        """
        orders = [p + 1 for p in range(lag)]
        rows = []
        for p in orders:
            # One fit per lag order serves both criteria, and it uses self.df
            # rather than whatever global `df` exists in the notebook.
            candidate = type(self)(self.df, p)
            rows.append([candidate.AIC(), candidate.BIC()])
        ic = pd.DataFrame(rows, index=orders, columns=['AIC', 'BIC'],)
        ic.index.name = 'Lag'
        return ic

        
        
        
        
    

In [243]:
# Fit a VAR with a single lag on the loaded table and display the
# coefficient / standard error / t-statistic summary.
model = VAR(df, 1)
model.report()


Unnamed: 0_level_0,Estimate Coefficient,Estimate Coefficient,SD of Estimate,SD of Estimate,t-Statistic,t-Statistic
Unnamed: 0_level_1,BABA,FB,BABA,FB,BABA,FB
"(Lag_1, BABA)",1.001204,0.033235,0.007256,0.008212,137.980277,4.047076
"(Lag_1, FB)",-0.007545,0.945897,0.009562,0.010822,-0.789104,87.406558
constant,1.314239,4.012441,0.641629,0.726166,2.048285,5.525516


In [244]:
# Flag every coefficient whose two-sided p-value is below 0.05.
model.p_value.lt(0.05)

Unnamed: 0,BABA,FB
"(Lag_1, BABA)",True,True
"(Lag_1, FB)",False,True
constant,True,True


In [246]:
# Tabulate AIC and BIC for lag orders 1 through 5.
model.IC(lag=5)

Unnamed: 0_level_0,AIC,BIC
Lag,Unnamed: 1_level_1,Unnamed: 2_level_1
1,5.328707,5.354243
2,5.330497,5.373057
3,5.335523,5.395107
4,5.335252,5.41186
5,5.340507,5.434138


In [299]:
# Testing: cross-check the hand-written VAR estimates for the FB equation against statsmodels OLS

In [1]:
from sklearn import linear_model
import statsmodels.api as sm


In [14]:
# Regressors: the previous row of every column (the first row is zero-filled)
# with an intercept column added by statsmodels.
x = sm.add_constant(df.shift(1).fillna(0))
# Response: the FB column, kept as a one-column DataFrame.
y = df.FB.to_frame()


In [15]:
# Display the regressor matrix built in the previous cell (intercept plus lagged columns).
x

Unnamed: 0,const,BABA,FB
0,1.0,0.000000,0.000000
1,1.0,76.690002,102.220001
2,1.0,78.629997,102.730003
3,1.0,77.330002,102.970001
4,1.0,72.720001,97.919998
...,...,...,...
1190,1.0,272.950012,249.020004
1191,1.0,269.730011,249.529999
1192,1.0,271.089996,254.820007
1193,1.0,276.010010,256.820007


In [16]:
# Ordinary least squares of y on x with statsmodels; show the fitted coefficients.
model = sm.OLS(endog=y, exog=x)
result = model.fit()
result.params

const    4.012441
BABA     0.033235
FB       0.945897
dtype: float64

In [17]:
# Display the t-statistics of the statsmodels fit from the previous cell.
result.tvalues

const     5.525516
BABA      4.047076
FB       87.406558
dtype: float64