# Lecture 3 Notes

In [1]:
# Import Statements 
import numpy as np
import pandas as pd
import matplotlib as mp
import statsmodels.api as sm

In [2]:
# Simulate 100 observations from the true model y = 3 + 4*x + epsilon.
mu, sigma = 0, 5  # mean and standard deviation of the normal error term
x = np.random.uniform(low=1, high=8, size=100)  # regressor: 100 uniform draws between 1 and 8
epsilon = np.random.normal(loc=mu, scale=sigma, size=100)  # one error draw per observation
y = 3 + 4 * x + epsilon

In [3]:
# Sanity check: the simulated sample should contain 100 values
len(x)

100

In [18]:
# Display the simulated regressor values.
# NOTE(review): this cell's execution count (18) is out of sequence with its
# neighbours (3 and 4), so presumably it was run after the later cells -- confirm
# the array shown is the one the models below were fitted on.
x

array([3.32132191, 6.58668959, 6.9477287 , 1.52494073, 3.36103043,
       6.4639821 , 6.77195713, 4.57186916, 4.30886201, 5.09318282,
       3.25759061, 2.27962201, 3.78230191, 3.62780766, 3.74324611,
       7.04541533, 2.63379684, 2.76687606, 4.31811943, 7.15080551,
       3.16050068, 1.11872232, 1.32043501, 5.00881499, 2.63923119,
       5.8019218 , 5.91257329, 7.65740666, 3.82450083, 5.12067468,
       3.63230498, 6.90390328, 5.82024257, 3.38904059, 7.09965435,
       2.99927326, 5.3039367 , 5.31620953, 5.00659273, 4.18955506,
       6.21530147, 7.69324738, 1.12521265, 2.77881796, 3.95999406,
       4.53370773, 7.58046665, 1.79414054, 7.25362158, 1.31781988,
       5.02318508, 7.98945136, 5.45598042, 6.30429438, 5.40638497,
       1.3827024 , 6.88168023, 3.64721219, 3.28008848, 3.07861999,
       4.21657041, 1.94341635, 1.60550584, 1.98520321, 5.75080479,
       3.11044532, 5.69547374, 4.01985049, 6.97249944, 3.94754409,
       1.33973816, 4.80779032, 2.9107861 , 2.00988129, 1.97921

In [4]:
# Fit OLS on the raw regressor only.
# No constant column is passed in, so this model has no intercept term.
model_reg = sm.OLS(endog=y, exog=x).fit()

In [5]:
model_reg.summary()

# The table lists only x1 and its coefficient: there is no intercept (const) row,
# because the regressors passed to sm.OLS contained no constant column.

0,1,2,3
Dep. Variable:,y,R-squared (uncentered):,0.954
Model:,OLS,Adj. R-squared (uncentered):,0.954
Method:,Least Squares,F-statistic:,2070.0
Date:,"Sat, 12 Oct 2024",Prob (F-statistic):,3.59e-68
Time:,22:21:34,Log-Likelihood:,-300.77
No. Observations:,100,AIC:,603.5
Df Residuals:,99,BIC:,606.2
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
x1,4.7628,0.105,45.494,0.000,4.555,4.971

0,1,2,3
Omnibus:,2.539,Durbin-Watson:,2.216
Prob(Omnibus):,0.281,Jarque-Bera (JB):,2.097
Skew:,-0.156,Prob(JB):,0.35
Kurtosis:,3.637,Cond. No.,1.0


In [6]:
# Add a constant column to x so that the model can estimate an intercept
x_updated = sm.add_constant(x, prepend=True)

In [7]:
# Refit OLS, this time on the regressors that include the constant column
model_updated = sm.OLS(endog=y, exog=x_updated).fit()

In [8]:
# Statistical significance of the intercept: as a rule of thumb, when the p-value
# of `const` is above 0.05 the intercept is not significant and we don't need it.
# (In the run recorded below, P>|t| for const is 0.021.)
model_updated.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.736
Model:,OLS,Adj. R-squared:,0.733
Method:,Least Squares,F-statistic:,272.8
Date:,"Sat, 12 Oct 2024",Prob (F-statistic):,4.49e-30
Time:,22:21:34,Log-Likelihood:,-298.04
No. Observations:,100,AIC:,600.1
Df Residuals:,98,BIC:,605.3
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,2.8171,1.200,2.348,0.021,0.436,5.198
x1,4.2140,0.255,16.517,0.000,3.708,4.720

0,1,2,3
Omnibus:,2.097,Durbin-Watson:,2.244
Prob(Omnibus):,0.35,Jarque-Bera (JB):,1.506
Skew:,-0.21,Prob(JB):,0.471
Kurtosis:,3.43,Cond. No.,12.2


In [9]:
# Some problems...

# The fit is not returning exactly the right result:
# the estimates differ from the true model, most visibly in the intercept.

# In the true model the intercept is 3, but the OLS model we created estimated a different intercept (2.8171 in the recorded run).


In [10]:
# Why did this happen?
# Our true model is unknown; we run the model on one sample, and then we re-sample and run it again.
# Resample, run it again
# The coefficients and values we get across these runs should be close to the true ones,
# but depending on the model, we might suffer from the variance issue.

# Bias vs. variance trade-off - overfitting is possible, vs. experiencing high variance

In [11]:
# Sampling importance:
# If you always sample from the same limited dataset...

# When you average the estimated values of beta over many samples, the average will be close to the true model
# If you want to run a sophisticated model:
# when you have data for each column, check for non-normality.
# If you can, scale the dataset so that each column follows a standard normal distribution; it will be better

# Even if the model obeys all the assumptions of OLS, we can still fail
# If X is 0.1

In [12]:
# GLS in Python

# The error terms no longer satisfy the OLS assumptions: each error term now
# depends on the previous one, i.e. the errors are autocorrelated.
# We now generate autocorrelated error terms:
#     epsilon[i+1] = 0.4 * epsilon[i] + 0.6 * (fresh normal draw with mean mu, std sigma)
# Each draw is requested as a scalar (no size argument). Assigning a 1-element
# array to a single array slot is deprecated since NumPy 1.25 and raised a
# DeprecationWarning on these two assignments.
epsilon[0] = np.random.normal(mu, sigma)
for i in range(len(epsilon) - 1):
    epsilon[i+1] = 0.4*epsilon[i] + 0.6*np.random.normal(mu, sigma)


In [13]:
# Rebuild y from the same true model (intercept 3, slope 4), now using the
# autocorrelated error terms.
# What if we use OLS instead of GLS?
y = 3 + 4*x + epsilon

In [14]:
# Plain OLS (with an intercept) on the data whose errors are autocorrelated
x_updated = sm.add_constant(x, prepend=True)
model_OLS = sm.OLS(endog=y, exog=x_updated).fit()
model_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.83
Model:,OLS,Adj. R-squared:,0.828
Method:,Least Squares,F-statistic:,477.5
Date:,"Sat, 12 Oct 2024",Prob (F-statistic):,1.87e-39
Time:,22:21:35,Log-Likelihood:,-263.12
No. Observations:,100,AIC:,530.2
Df Residuals:,98,BIC:,535.4
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,4.3162,0.846,5.102,0.000,2.637,5.995
x1,3.9319,0.180,21.852,0.000,3.575,4.289

0,1,2,3
Omnibus:,11.76,Durbin-Watson:,1.03
Prob(Omnibus):,0.003,Jarque-Bera (JB):,28.932
Skew:,-0.264,Prob(JB):,5.22e-07
Kurtosis:,5.582,Cond. No.,12.2


In [15]:
from scipy.linalg import toeplitz

# toeplitz() returns a symmetric matrix: all you need to do is enter the first
# row and the rest is filled in automatically. The example below has 8 elements
# in its first row, so the result is an 8x8 symmetric matrix.
# This is useful for building autocorrelation structures.
# (This call is not the last expression in the cell, so its result is not displayed.)
toeplitz(np.array([1,0.5,0,0,0,0,0,0]))

rho = 0.4  # the lag-1 correlation; starting from lag 2, the correlation is set to zero
# NOTE(review): the error terms were generated recursively (each one is 0.4 times
# the previous one plus noise), so their correlation at lag 2 and beyond is not
# exactly zero -- confirm that this banded covariance is the intended choice.

# Build the covariance matrix of the errors from its first row [1, rho, 0, ..., 0],
# with one entry per observation instead of a hard-coded count of zeros.
n_obs = len(y)
first_row = np.append([1, rho], np.zeros(n_obs - 2))
cov_matrix = sigma**2 * toeplitz(first_row)
sm.GLS(y, x_updated, sigma=cov_matrix).fit().summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.881
Model:,GLS,Adj. R-squared:,0.88
Method:,Least Squares,F-statistic:,726.0
Date:,"Sat, 12 Oct 2024",Prob (F-statistic):,4.18e-47
Time:,22:21:35,Log-Likelihood:,-256.35
No. Observations:,100,AIC:,516.7
Df Residuals:,98,BIC:,521.9
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,3.9074,0.796,4.908,0.000,2.327,5.487
x1,4.0152,0.149,26.944,0.000,3.720,4.311

0,1,2,3
Omnibus:,5.58,Durbin-Watson:,2.108
Prob(Omnibus):,0.061,Jarque-Bera (JB):,5.747
Skew:,-0.347,Prob(JB):,0.0565
Kurtosis:,3.948,Cond. No.,9.19


In [16]:
# We see an improvement in the model performance

In [17]:
# In summary: the GLS model captured the constant more closely than the OLS model could (3.9074 vs. 4.3162 in the recorded runs; the true value is 3).
# Even when R-squared is high, OLS failed to capture the right y-intercept
# 