## Derivations of beta β
A linear regression takes the form of:  
               
$$ y = bx + ε $$ 

where x is the regressor, b is x's coefficient, and ε is noise. Its estimate can be expressed by:  
               
$$ \hat{y} = \hat{b} x $$ 
               
As the Ordinary Least Squares method states, the best value of beta minimizes the squared difference between the estimated value $ \hat{y} $ and the actual y. Thus we have:  
               
$$ \underset{b}{\arg\min}\ (y - \hat{y})^2 $$ 

Let:  
        
$$ F =  (y - \hat{y})^2$$
$$F = (y - \hat{b} x)^2 $$
Take the derivative of F with respect to $\hat{b}$ and set it equal to 0 to find the minimum:  
  
$${dF\over{d\hat{b}}} = 2(y - \hat{b} x)(-x) = 0 $$  

$$(y - \hat{b} x)(-x) = 0 $$  

$$ \hat{b} x x' = xy $$  

and the expression for the estimated beta is:   
$$ \hat{b} = (xx')^{-1} xy $$

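Now, for the standard deviation of beta, first calculate the variance of the estimate:  
$$ Var(\hat{b}) = Var((xx')^{-1} xy) $$
$$ Var(\hat{b}) = Var((xx')^{-1} x (b x + ε)) $$
$$ Var(\hat{b}) = Var((xx')^{-1} x b x + (xx')^{-1} x ε) $$
The first term contains no randomness (b is a fixed, unknown constant), so it does not contribute to the variance:  
$$ Var(\hat{b}) = Var((xx')^{-1} x ε) $$
Treating x as given and writing $Var(ε) = σ^2$, this becomes  
$$ Var(\hat{b}) = (xx')^{-1} x \, Var(ε) \, x' (xx')^{-1} = σ^2 (xx')^{-1} $$
and the standard deviation of $\hat{b}$ is the square root of this variance.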




<table>
<tbody><tr>
<th colspan="5">Critical values for Dickey–Fuller <i>t</i>-distribution.
</th></tr>
<tr>
<td></td>
<td colspan="2">Without trend</td>
<td colspan="2">With trend
</td></tr>
<tr>
<td>Sample size</td>
<td>1%</td>
<td>5%</td>
<td>1%</td>
<td>5%
</td></tr>
<tr>
<td>T = 25</td>
<td>−3.75</td>
<td>−3.00</td>
<td>−4.38</td>
<td>−3.60
</td></tr>
<tr>
<td>T = 50</td>
<td>−3.58</td>
<td>−2.93</td>
<td>−4.15</td>
<td>−3.50
</td></tr>
<tr>
<td>T = 100</td>
<td>−3.51</td>
<td>−2.89</td>
<td>−4.04</td>
<td>−3.45
</td></tr>
<tr>
<td>T = 250</td>
<td>−3.46</td>
<td>−2.88</td>
<td>−3.99</td>
<td>−3.43
</td></tr>
<tr>
<td>T = 500</td>
<td>−3.44</td>
<td>−2.87</td>
<td>−3.98</td>
<td>−3.42
</td></tr>
<tr>
<td>T = ∞</td>
<td>−3.43</td>
<td>−2.86</td>
<td>−3.96</td>
<td>−3.41
</td></tr>
<tr>
<td colspan="5">Source<sup id="cite_ref-Fuller1976_2-0" class="reference"><a href="#cite_note-Fuller1976-2">[2]</a></sup><sup class="reference" style="white-space:nowrap;">:<span>373</span></sup>
</td></tr></tbody>
</table>

# import section


In [1]:
import warnings
warnings.filterwarnings('ignore')
from colorit import *
init_colorit()

import numpy as np
import pandas as pd

import statsmodels
from sklearn.linear_model import LinearRegression as LR

from statsmodels.tsa.stattools import coint
import matplotlib.pyplot as plt
import seaborn as sns

from scipy.stats import norm
 

In [2]:
class LinearRegression():
    """Ordinary least squares with an intercept, fitted on construction.

    Parameters
    ----------
    x : pandas.DataFrame
        Regressors, one column per variable. The frame is copied and a
        ``'constant'`` column of ones is appended to the copy, so the
        caller's frame is left untouched.
    y : pandas.DataFrame
        Dependent variable(s), one column per equation. ``report`` reads
        ``y.columns`` to label its output.

    Attributes
    ----------
    coef : coefficients, shape (n_regressors + 1, n_equations).
    error : numpy.ndarray of residuals ``y - x @ coef``.
    residual_cov : ``error' error / n_obs``.
    coef_cov : Kronecker product of ``inv(x'x)`` and ``residual_cov``.
    coef_std : flat array of coefficient standard deviations, scaled by
        ``n_obs / (n_obs - n_params)``.
    tstats : ``coef / coef_std``, same shape as ``coef``.

    After ``report`` has been called, ``coef``, ``coef_std`` and ``tstats``
    are labelled DataFrames instead of arrays.
    """
    def __init__(self, x, y):
        # Copy first: addconst() writes a new column, and that must not leak
        # into the DataFrame owned by the caller.
        self.x = x.copy()
        self.y = y
        self.addconst()
        self.fit()

    def addconst(self):
        """Append (or overwrite) a ``'constant'`` column of ones on ``self.x``."""
        self.x['constant'] = np.ones(len(self.x))

    def fit(self):
        """Estimate the coefficients and their standard deviations / t-statistics."""
        design = np.asarray(self.x, dtype=float)
        target = np.asarray(self.y, dtype=float)
        n_obs, n_params = design.shape

        # inv(x'x) is needed for both the estimate and its covariance, so it
        # is computed once and reused.
        xtx_inv = np.linalg.inv(design.T.dot(design))
        self.coef = xtx_inv.dot(design.T.dot(target))

        self.error = target - design.dot(self.coef)
        self.residual_cov = self.error.T.dot(self.error) / n_obs
        self.coef_cov = np.kron(xtx_inv, self.residual_cov)
        # residual_cov divides by n_obs; this factor converts it to the
        # (n_obs - n_params) denominator.
        dof_scale = n_obs / (n_obs - n_params)
        self.coef_std = np.sqrt(dof_scale * np.diag(self.coef_cov))
        self.tstats = self.coef / self.coef_std.reshape(self.coef.shape)

    def return_coef(self):
        """Return the current ``coef`` attribute."""
        return self.coef

    def _as_frame(self, values):
        """Label ``values`` with regressor rows and equation columns."""
        shape = (len(self.x.columns), len(self.y.columns))
        # The reshape matters for coef_std, which fit() stores flat; without
        # it a multi-column y cannot be tabulated.
        return pd.DataFrame(np.asarray(values).reshape(shape),
                            index=self.x.columns,
                            columns=self.y.columns)

    def report(self):
        """Return coefficients, standard deviations and t-statistics side by side.

        The three attributes are rebound to labelled DataFrames as a side
        effect. The result has one row per regressor and column-level keys
        'Estimate Coefficient', 'SD of Estimate' and 't-Statistic'.
        """
        self.coef = self._as_frame(self.coef)
        self.coef_std = self._as_frame(self.coef_std)
        self.tstats = self._as_frame(self.tstats)
        return pd.concat({'Estimate Coefficient': self.coef,
                          'SD of Estimate': self.coef_std,
                          't-Statistic': self.tstats}, axis=1)
        

In [65]:
class VAR():
    """
    Vector autoregression of a given lag order, fitted by least squares
    on construction.

    Every column of ``df`` is regressed on ``lag`` lags of all columns plus
    a constant. Besides the fit itself the class offers AIC/BIC, a lag
    selection table and a stability check.
    """
    def __init__(self, df, lag):
        """
        df  : pandas DataFrame, one column per series
        lag : number of lags to include (positive integer)
        """
        self.df = df
        self.lag = lag
        self.coef = np.array([])
        self.columns_name = self.name_lag()
        self.data = self.process_data()
        # .copy() so that addconst() writes into an independent frame rather
        # than into a slice of self.data.
        self.x = self.data[self.indepvar_name].copy()
        self.y = self.data[df.columns]
        self.addconst()
        self.fit()

    def name_lag(self):
        """Build the lagged-regressor labels.

        Stores the labels '(Lag_<j>, <column>)' in ``self.indepvar_name``
        (lag-major order) and returns the original column names followed by
        those labels.
        """
        # str.format instead of '+' so non-string column labels also work.
        self.indepvar_name = ['(Lag_{}, {})'.format(j + 1, col)
                              for j in range(self.lag)
                              for col in self.df.columns]
        return list(self.df.columns) + self.indepvar_name

    def process_data(self):
        """Return ``df`` next to its 1..lag shifted copies, re-indexed from 0."""
        frames = [self.df] + [self.df.shift(k + 1) for k in range(self.lag)]
        stacked = pd.concat(frames, axis=1)
        stacked.columns = self.columns_name
        # NOTE(review): the first `lag` rows have no lagged values and are
        # padded with zeros here rather than dropped, so they take part in
        # the regression. Kept as-is because changing it alters every
        # estimate; confirm whether this is intended.
        return stacked.fillna(0).reset_index(drop=True)

    def addconst(self):
        """Add the intercept column to ``self.x`` and register its label."""
        self.x['constant'] = np.ones(len(self.x))
        self.indepvar_name.append('constant')

    def fit(self):
        """Estimate coefficients, residual covariance, standard deviations,
        t-statistics and two-sided normal p-values."""
        design = np.asarray(self.x, dtype=float)
        target = np.asarray(self.y, dtype=float)
        n_obs, n_params = design.shape

        # inv(x'x) feeds both the estimate and its covariance: compute once.
        xtx_inv = np.linalg.inv(design.T.dot(design))
        self.coef = pd.DataFrame(xtx_inv.dot(design.T.dot(target)),
                                 index=self.x.columns,
                                 columns=self.y.columns)
        self.error = self.y - self.x.dot(self.coef)
        self.residual_cov = self.error.T.dot(self.error) / len(self.y)
        self.coef_cov = np.kron(xtx_inv, self.residual_cov)
        # residual_cov divides by n_obs; rescale to n_obs - n_params.
        dof_scale = n_obs / (n_obs - n_params)
        std = np.sqrt(dof_scale * np.diag(self.coef_cov)).reshape(self.coef.shape)
        self.tstats = self.coef / std

        self.coef_std = pd.DataFrame(std,
                                     index=self.x.columns,
                                     columns=self.y.columns)

        self.p_value = pd.DataFrame(2 * (1 - norm.cdf(np.abs(np.asarray(self.tstats)))),
                                    index=self.indepvar_name,
                                    columns=self.df.columns)

    def report(self):
        """Return coefficients, standard deviations and t-statistics side by side."""
        return pd.concat({'Estimate Coefficient': self.coef,
                          'SD of Estimate': self.coef_std,
                          't-Statistic': self.tstats}, axis=1)

    def _ic_penalty_base(self):
        """Number of estimated coefficients divided by the number of rows."""
        return self.x.shape[1] * self.y.shape[1] / self.y.shape[0]

    def AIC(self):
        """log det(residual_cov) + 2 * (number of coefficients) / n_obs."""
        return np.log(np.linalg.det(self.residual_cov)) + 2 * self._ic_penalty_base()

    def BIC(self):
        """log det(residual_cov) + log(n_obs) * (number of coefficients) / n_obs."""
        return (np.log(np.linalg.det(self.residual_cov))
                + np.log(self.y.shape[0]) * self._ic_penalty_base())

    def IC(self, lag):
        """Tabulate AIC and BIC for lag orders 1..lag, smallest value styled red."""
        orders = list(range(1, lag + 1))
        rows = []
        for p in orders:
            # One fit per candidate order serves both criteria.
            candidate = VAR(self.df, p)
            rows.append([candidate.AIC(), candidate.BIC()])
        ic = pd.DataFrame(rows, index=orders, columns=['AIC', 'BIC'])
        ic.index.name = 'Lag'
        return ic.style.apply(self.color_min_red)

    def _companion_eigenvalues(self):
        """Eigenvalues of the companion matrix built from ``self.coef``."""
        # Rows of coef are regressors, columns are equations; transposing the
        # lag rows gives the top block row [A_1 ... A_p] of the companion matrix.
        lag_block = np.asarray(self.coef.drop('constant'), dtype=float).T
        n_vars, n_state = lag_block.shape
        companion = np.zeros((n_state, n_state))
        companion[:n_vars, :] = lag_block
        if n_state > n_vars:
            # Identity below the first block row shifts each lag down by one.
            companion[n_vars:, :n_state - n_vars] = np.eye(n_state - n_vars)
        return np.linalg.eigvals(companion)

    def is_stable(self):
        """Return True when every companion eigenvalue has modulus below 1."""
        # The modulus is what matters: roots may be complex or negative.
        return bool(np.all(np.abs(self._companion_eigenvalues()) < 1))

    def stability(self):
        """Print whether the fitted VAR is stable (see ``is_stable``)."""
        if not self.is_stable():
            print('Stability status: ' + color('[x] UNSTABLE', Colors.red))
        else:
            print('Stability status: ' + color('[o] STABLE', (9, 86, 146)))

    def color_min_red(self, s):
        """Styler helper: 'color: red' for the minimum of ``s``, '' elsewhere."""
        is_min = s == s.min()
        return ['color: %s' % 'red' if v else '' for v in is_min]
        
        


In [70]:
class ADF:
    """Unit-root regression on a residual series.

    Regresses the first difference of the series on its lagged level and
    its lagged first difference (an intercept is added by
    ``LinearRegression``), then compares the t-statistic of the lagged
    level with a critical value.

    Parameters
    ----------
    residual : pandas.DataFrame
        Single-column frame holding the series to test; only the first
        column name is used for labelling.
    critical_value : float, default -3.45
        Threshold the t-statistic must fall below to reject the unit-root
        hypothesis. The default keeps the value that was previously
        hard-coded.
    """
    def __init__(self, residual, critical_value=-3.45):
        name = residual.columns[0]
        self.critical_value = critical_value
        # Row label of the lagged level; report() looks the t-statistic up
        # under this exact label, whatever the input column is called.
        self.level_label = '(Lag 1, {})'.format(name)
        diff_label = '(Lag 1, Δ{})'.format(name)

        self.e_lag_1 = residual.shift()
        self.delta_e = residual.diff()
        self.delta_e.columns = ['(Δ{})'.format(name)]

        self.delta_e_lag_1 = self.e_lag_1.diff()

        self.x = pd.DataFrame(index=residual.index)
        self.x[self.level_label] = self.e_lag_1.iloc[:, 0]
        self.x[diff_label] = self.delta_e_lag_1.iloc[:, 0]

        # Shifting and differencing leave NaNs in the leading rows; drop them
        # and align the dependent variable to the surviving rows.
        self.x = self.x.dropna()
        self.y = self.delta_e.loc[self.x.index]

    def report(self):
        """Fit the regression, print the verdict and return the styled table.

        Prints whether the t-statistic of the lagged level is below
        ``critical_value`` and returns the regression report as a pandas
        Styler with cells below the critical value coloured red.
        """
        # Hand over a copy so the regression cannot alter self.x.
        model = LinearRegression(self.x.copy(), self.y)
        summary = model.report()
        t_value = summary['t-Statistic'].loc[self.level_label]
        if float(t_value.iloc[0]) < self.critical_value:
            print("The T-value is {} which is lower than {}".format(t_value.values, self.critical_value))
            print('[*]We '+color('reject', (18, 96, 184))+
                  ' the  𝐻0  hypothesis of unit root.'+
                  color(' The residuals are stationary.',(18, 96, 184)))
        else:
            print("The T-value is {} which is higher than {}".format(t_value.values, self.critical_value))
            print('[*]We '+color('fail to reject', (235, 30, 6))+
                  ' the  𝐻0  hypothesis of unit root.'+
                  color(' The residuals are NOT stationary.',(235, 30, 6)))
        styler = summary.style
        # Styler.applymap was renamed to Styler.map in newer pandas; use
        # whichever this installation provides.
        cellwise = getattr(styler, 'map', None) or styler.applymap
        return cellwise(self.color_negative_red)

    def color_negative_red(self, val):
        """Styler helper: red for values below the critical value, else black."""
        shade = 'red' if val < self.critical_value else 'black'
        return 'color: %s' % shade
    
 
            
        
        

In [71]:
#coint_prices = pd.read_csv('data.csv',index_col = 'Date')
#FB = pd.DataFrame(coint_prices['FB'])
#BABA = pd.DataFrame(coint_prices['BABA'])
#model = LinearRegression(x = BABA, y = FB) 
#model.report()


In [74]:
#residuals = pd.DataFrame(model.error.reshape(1,1237)[0],
#                         index = BABA.index,
#                         columns = ['Residual'])
#residuals

In [73]:
#adf_test = ADF(residuals)
#adf_test.report()