# 测试statsmodels回归API、有无截距项回归和残差均值

In [1]:
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt
%matplotlib inline

In [20]:
np.random.seed(42)
x = np.random.random(1000) * 10  # 产生0-10的均匀粉笔
y = 1.5 * x + 2 + 2 * np.random.normal(0,1)   

# 有无截距项回归

结论：
1. 有无截距项回归的回归系数是不同的，原因是python识别常数项之后进行无截距项回归
2. 自变量中截距项为0 == 无截距项回归
3. 有截距项回归的残差均值==0，无截距项回归残差均值！=0

In [21]:
x_cons = sm.add_constant(x)
x_cons

array([[1.        , 3.74540119],
       [1.        , 9.50714306],
       [1.        , 7.31993942],
       ...,
       [1.        , 1.36818631],
       [1.        , 9.50237354],
       [1.        , 4.46005773]])

In [22]:
coef_cons = sm.OLS(y,x_cons).fit().params
coef_cons

array([2.355402, 1.5     ])

In [23]:
coef_no_cons = sm.OLS(y,x).fit().params
coef_no_cons

array([1.85464203])

有无截距项回归的回归系数是不同的，原因是python识别常数项之后进行无截距项回归

In [24]:
#令常数项为0,2分别尝试

In [25]:
x_cons_0 = x_cons.copy()
x_cons_0[:,0] = 0
x_cons_0

array([[0.        , 3.74540119],
       [0.        , 9.50714306],
       [0.        , 7.31993942],
       ...,
       [0.        , 1.36818631],
       [0.        , 9.50237354],
       [0.        , 4.46005773]])

In [26]:
coef_cons_0 = sm.OLS(y,x_cons_0).fit().params
coef_cons_0

array([0.        , 1.85464203])

In [27]:
x_cons_2 = x_cons.copy()
x_cons_2[:,0] = 2
x_cons_2

array([[2.        , 3.74540119],
       [2.        , 9.50714306],
       [2.        , 7.31993942],
       ...,
       [2.        , 1.36818631],
       [2.        , 9.50237354],
       [2.        , 4.46005773]])

In [28]:
coef_cons_2 = sm.OLS(y,x_cons_2).fit().params
coef_cons_2

array([1.177701, 1.5     ])

自变量中截距项为0 == 无截距项回归

In [30]:
residue_no_cons = y - coef_no_cons*x
sum(residue_no_cons)

616.7462116491785

In [32]:
residue_cons = y - coef_cons[1]*x - coef_cons[0]
sum(residue_cons)

-8.192557743313955e-12

有截距项回归的残差均值==0，无截距项回归残差均值！=0

# 虚拟变量的识别

结论:
1. 非dummy变量回归系数：除了无截距项+一个虚拟变量之外其他都相等，即虚拟变量的个数不想要非虚拟变量系数的估计
2. 有截距项、一个虚拟变量时 对虚拟变量系数的估计最准确

In [66]:
x_mat = np.arange(3000).reshape(1000,3)
x_mat[:,:] = 0

In [80]:
x_mat[:,0] = x
x_mat

array([[3, 0, 0],
       [9, 0, 0],
       [7, 0, 0],
       ...,
       [1, 0, 0],
       [9, 0, 0],
       [4, 0, 0]])

In [92]:
x_mat[:,1] = 1
x_mat[500:,1] = 0

In [93]:
sum(x_mat)

array([4398,  500,    0])

In [94]:
x_mat[:,2] = 1 - x_mat[:,1]
x_mat

array([[3, 1, 0],
       [9, 1, 0],
       [7, 1, 0],
       ...,
       [1, 0, 1],
       [9, 0, 1],
       [4, 0, 1]])

In [139]:
y = 1.5 * x_mat[:,0] + 2 * x_mat[:,1] + 3 * x_mat[:,2] + 2 +  2 * np.random.normal(0,1) 
y

array([13.2691956, 22.2691956, 19.2691956, 16.2691956, 10.2691956,
       10.2691956,  8.7691956, 20.7691956, 17.7691956, 19.2691956,
        8.7691956, 22.2691956, 20.7691956, 11.7691956, 10.2691956,
       10.2691956, 13.2691956, 16.2691956, 14.7691956, 11.7691956,
       17.7691956, 10.2691956, 11.7691956, 13.2691956, 14.7691956,
       19.2691956, 10.2691956, 16.2691956, 16.2691956,  8.7691956,
       17.7691956, 10.2691956,  8.7691956, 22.2691956, 22.2691956,
       20.7691956, 13.2691956,  8.7691956, 17.7691956, 14.7691956,
       10.2691956, 14.7691956,  8.7691956, 22.2691956, 11.7691956,
       17.7691956, 13.2691956, 16.2691956, 16.2691956, 10.2691956,
       22.2691956, 19.2691956, 22.2691956, 20.7691956, 16.2691956,
       22.2691956,  8.7691956, 10.2691956,  8.7691956, 13.2691956,
       13.2691956, 11.7691956, 20.7691956, 13.2691956, 11.7691956,
       16.2691956, 10.2691956, 20.7691956,  8.7691956, 22.2691956,
       19.2691956, 10.2691956,  8.7691956, 20.7691956, 19.2691

In [140]:
# 无截距项，两个虚拟变量

In [141]:
coef_no_cons_twodummy = sm.OLS(y,x_mat).fit().params
coef_no_cons_twodummy

array([1.5      , 8.7691956, 9.7691956])

In [142]:
residue_1 = x_mat.dot(coef_no_cons_twodummy)
sum(residue_1)

15866.195600663334

In [143]:
# 有截距项，两个虚拟变量

In [144]:
x_mat_cons = sm.add_constant(x_mat)
coef_cons_twodummy = sm.OLS(y,x_mat_cons).fit().params
coef_cons_twodummy

array([6.17946373, 1.5       , 2.58973187, 3.58973187])

In [145]:
residue_2 = x_mat_cons.dot(coef_cons_twodummy)
sum(residue_2)

15866.195600663332

In [146]:
# 有截距项，一个虚拟变量

In [147]:
coef_cons_onedummy = sm.OLS(y,x_mat_cons[:,[0,1,2]]).fit().params
coef_cons_onedummy

array([ 9.7691956,  1.5      , -1.       ])

In [148]:
residue_3 = x_mat_cons[:,[0,1,2]].dot(coef_cons_onedummy)
sum(residue_3)

15866.19560066333

In [149]:
# 无虚拟变量，一个虚拟变量

In [150]:
coef_no_cons_onedummy = sm.OLS(y,x_mat[:,[0,1]]).fit().params
coef_no_cons_onedummy

array([2.68450516, 3.46024347])

In [151]:
residue = x_mat[:,[0,1]].dot(coef_no_cons_onedummy)
sum(residue)

13536.575430944515

In [137]:
sum(residue_1 - residue_2)

1.2040146657454898e-11

In [138]:
sum(residue_2 - residue_3)

-1.7555290554582825e-11