# 测试statsmodels回归API、有无截距项回归和残差均值

In [1]:
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt
%matplotlib inline

In [56]:
np.random.seed(42)
x = np.random.random(1000) * 10  # 产生0-10的均匀粉笔
y = 1.5 * x + 2 + 2 * np.random.randn(1000)   

# 有无截距项回归

结论：
1. 有无截距项回归的回归系数是不同的，原因是python识别常数项之后进行无截距项回归
2. 自变量中截距项为0 == 无截距项回归
3. 有截距项回归的残差均值==0，无截距项回归残差均值！=0

In [57]:
x_cons = sm.add_constant(x)
x_cons

array([[1.        , 3.74540119],
       [1.        , 9.50714306],
       [1.        , 7.31993942],
       ...,
       [1.        , 1.36818631],
       [1.        , 9.50237354],
       [1.        , 4.46005773]])

In [58]:
coef_cons = sm.OLS(y,x_cons).fit().params
coef_cons

array([2.34956052, 1.46904297])

In [59]:
coef_no_cons = sm.OLS(y,x).fit().params
coef_no_cons

array([1.82280547])

有无截距项回归的回归系数是不同的，原因是python识别常数项之后进行无截距项回归

In [60]:
#令常数项为0,2分别尝试

In [61]:
x_cons_0 = x_cons.copy()
x_cons_0[:,0] = 0
x_cons_0

array([[0.        , 3.74540119],
       [0.        , 9.50714306],
       [0.        , 7.31993942],
       ...,
       [0.        , 1.36818631],
       [0.        , 9.50237354],
       [0.        , 4.46005773]])

In [62]:
coef_cons_0 = sm.OLS(y,x_cons_0).fit().params
coef_cons_0

array([0.        , 1.82280547])

In [63]:
x_cons_2 = x_cons.copy()
x_cons_2[:,0] = 2
x_cons_2

array([[2.        , 3.74540119],
       [2.        , 9.50714306],
       [2.        , 7.31993942],
       ...,
       [2.        , 1.36818631],
       [2.        , 9.50237354],
       [2.        , 4.46005773]])

In [64]:
coef_cons_2 = sm.OLS(y,x_cons_2).fit().params
coef_cons_2

array([1.17478026, 1.46904297])

自变量中截距项为0 == 无截距项回归

In [65]:
residue_no_cons = y - coef_no_cons*x
sum(residue_no_cons)

615.2166580241596

In [66]:
residue_cons = y - coef_cons[1]*x - coef_cons[0]
sum(residue_cons)

-2.4438229218048946e-12

有截距项回归的残差均值==0，无截距项回归残差均值！=0

# 虚拟变量的识别

结论:
1. 非dummy变量回归系数：除了无截距项+一个虚拟变量之外其他都相等（==0），即虚拟变量的个数不想要非虚拟变量系数的估计
2. 有截距项、一个虚拟变量时 对虚拟变量系数的估计最准确

In [67]:
x_mat = np.arange(3000).reshape(1000,3)
x_mat[:,:] = 0

In [68]:
x_mat[:,0] = x
x_mat

array([[3, 0, 0],
       [9, 0, 0],
       [7, 0, 0],
       ...,
       [1, 0, 0],
       [9, 0, 0],
       [4, 0, 0]])

In [69]:
x_mat[:,1] = 1
x_mat[500:,1] = 0

In [70]:
sum(x_mat)

array([4398,  500,    0])

In [71]:
x_mat[:,2] = 1 - x_mat[:,1]
x_mat

array([[3, 1, 0],
       [9, 1, 0],
       [7, 1, 0],
       ...,
       [1, 0, 1],
       [9, 0, 1],
       [4, 0, 1]])

In [8]:
y = 1.5 * x_mat[:,0] + 2 * x_mat[:,1]  + 2 +  2 * np.random.normal(0,1) 
y

array([ 5.82931128, 14.82931128, 11.82931128,  8.82931128,  2.82931128,
        2.82931128,  1.32931128, 13.32931128, 10.32931128, 11.82931128,
        1.32931128, 14.82931128, 13.32931128,  4.32931128,  2.82931128,
        2.82931128,  5.82931128,  8.82931128,  7.32931128,  4.32931128,
       10.32931128,  2.82931128,  4.32931128,  5.82931128,  7.32931128,
       11.82931128,  2.82931128,  8.82931128,  8.82931128,  1.32931128,
       10.32931128,  2.82931128,  1.32931128, 14.82931128, 14.82931128,
       13.32931128,  5.82931128,  1.32931128, 10.32931128,  7.32931128,
        2.82931128,  7.32931128,  1.32931128, 14.82931128,  4.32931128,
       10.32931128,  5.82931128,  8.82931128,  8.82931128,  2.82931128,
       14.82931128, 11.82931128, 14.82931128, 13.32931128,  8.82931128,
       14.82931128,  1.32931128,  2.82931128,  1.32931128,  5.82931128,
        5.82931128,  4.32931128, 13.32931128,  5.82931128,  4.32931128,
        8.82931128,  2.82931128, 13.32931128,  1.32931128, 14.82

In [72]:
# 无截距项，两个虚拟变量

In [73]:
coef_no_cons_twodummy = sm.OLS(y,x_mat).fit().params
coef_no_cons_twodummy

array([1.46301462, 3.15488781, 3.07971549])

In [74]:
residue_1 = y - x_mat.dot(coef_no_cons_twodummy)
sum(residue_1)

-5.565770067050835e-12

In [75]:
# 有截距项，两个虚拟变量

In [76]:
x_mat_cons = sm.add_constant(x_mat)
coef_cons_twodummy = sm.OLS(y,x_mat_cons).fit().params
coef_cons_twodummy

array([2.0782011 , 1.46301462, 1.07668671, 1.00151439])

In [77]:
residue_2 = y - x_mat_cons.dot(coef_cons_twodummy)
sum(residue_2)

-7.887468456146962e-12

In [78]:
# 有截距项，一个虚拟变量

In [79]:
coef_cons_onedummy = sm.OLS(y,x_mat_cons[:,[0,1,2]]).fit().params
coef_cons_onedummy

array([3.07971549, 1.46301462, 0.07517232])

In [80]:
residue_3 = y - x_mat_cons[:,[0,1,2]].dot(coef_cons_onedummy)
sum(residue_3)

-2.148947686464453e-12

In [82]:
# 无截距项，一个虚拟变量

In [83]:
coef_no_cons_onedummy = sm.OLS(y,x_mat[:,[0,1]]).fit().params
coef_no_cons_onedummy

array([1.83642703, 1.48125338])

In [84]:
residue = y - x_mat[:,[0,1]].dot(coef_no_cons_onedummy)
sum(residue)

734.407173547797

# 改变虚拟变量的个别值，构造“虚假”虚拟变量

In [85]:
x_mat_false = x_mat.copy()
x_mat_false[500,1] = 9
x_mat_false[500,1]

9

In [86]:
y = 1.5 * x_mat[:,0] + 2 * x_mat[:,1]  + 2 +  2 * np.random.normal(0,1) 

In [87]:
x_mat_false_cons = sm.add_constant(x_mat_false)

In [88]:
coef_false_cons_onedummy = sm.OLS(y,x_mat_false_cons[:,[0,1,2]]).fit().params
coef_false_cons_onedummy

array([-0.58793459,  1.49975826,  1.52527076])

In [89]:
coef_false_cons_twodummy = sm.OLS(y,x_mat_false_cons).fit().params
coef_false_cons_twodummy

array([ 1.18736507e+00,  1.50000000e+00,  1.09912079e-14, -2.00000000e+00])

In [90]:
sum(y - x_mat_false_cons.dot(coef_false_cons_twodummy))

-7.724043626922139e-12

In [91]:
coef_false_twodummy = sm.OLS(y,x_mat_false).fit().params
coef_false_twodummy

array([ 1.53633594,  0.88097256, -0.98524566])

In [92]:
sum(y - x_mat_false.dot(coef_false_twodummy))

71.76742152737387