In [4]:
import pandas as pd
from sklearn import metrics
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# Advertising dataset; the first CSV column is used as the row index.
# NOTE(review): the file is fetched over plain HTTP from an external host —
# confirm the URL is still reachable before relying on Restart & Run All.
data = pd.read_csv(
    'http://www-bcf.usc.edu/~gareth/ISL/Advertising.csv',
    index_col=0,
)

# Try both feature sets below (swap which line is commented out) to see how
# the different regression methods behave.
all_data = data[['TV', 'radio', 'newspaper']]
# all_data = data[['TV','radio']]
all_label = data['sales']

# Hold out 25% of the rows for testing; random_state pins the split.
train_data, test_data, train_label, test_label = train_test_split(
    all_data,
    all_label,
    test_size=0.25,
    random_state=1,
)

# Baseline model: plain linear regression (fit returns the estimator itself).
linreg = LinearRegression().fit(train_data, train_label)
test_pred = linreg.predict(test_data)

# Root Mean Squared Error (RMSE) on the held-out rows.
print(np.sqrt(metrics.mean_squared_error(y_true=test_label, y_pred=test_pred)))

1.4046514230328955


In [5]:
from sklearn.linear_model import Ridge
# Ridge regression (L2 penalty) with a fixed penalty strength alpha = 0.5.
model = Ridge(alpha=0.5).fit(train_data, train_label)
test_pred = model.predict(test_data)
# RMSE on the held-out rows.
print(np.sqrt(metrics.mean_squared_error(y_true=test_label, y_pred=test_pred)))

1.4046675192134495


In [6]:
from sklearn.linear_model import RidgeCV
# Ridge regression where alpha is chosen by cross-validation from the
# candidate list below.
model = RidgeCV(alphas=[0.1, 1.0, 10.0]).fit(train_data, train_label)
test_pred = model.predict(test_data)
# RMSE on the held-out rows.
print(np.sqrt(metrics.mean_squared_error(y_true=test_label, y_pred=test_pred)))

1.4049734594787202


In [7]:
from sklearn.linear_model import Lasso
# Lasso (L1 penalty) with a fixed penalty strength alpha = 0.1.
model = Lasso(alpha=0.1).fit(train_data, train_label)
test_pred = model.predict(test_data)
# RMSE on the held-out rows.
print(np.sqrt(metrics.mean_squared_error(y_true=test_label, y_pred=test_pred)))

1.4057483336366394


In [8]:
from sklearn.linear_model import LassoCV
# Lasso where alpha is picked from the candidate list by 5-fold
# cross-validation.
model = LassoCV(alphas=[0.1, 1.0, 10.0], cv=5).fit(train_data, train_label)
test_pred = model.predict(test_data)
# RMSE on the held-out rows.
print(np.sqrt(metrics.mean_squared_error(y_true=test_label, y_pred=test_pred)))

1.4169216816103538


In [9]:
from sklearn.linear_model import LassoLars
# Lasso fitted with least-angle regression (LARS), alpha fixed at 0.1.
model = LassoLars(alpha=0.1).fit(train_data, train_label)
test_pred = model.predict(test_data)
# RMSE on the held-out rows.
print(np.sqrt(metrics.mean_squared_error(y_true=test_label, y_pred=test_pred)))

2.1789417950559846


In [10]:
from sklearn.linear_model import LassoLarsCV
# LARS-based Lasso with alpha selected by 5-fold cross-validation.
model = LassoLarsCV(cv=5).fit(train_data, train_label)
test_pred = model.predict(test_data)
# RMSE on the held-out rows.
print(np.sqrt(metrics.mean_squared_error(y_true=test_label, y_pred=test_pred)))

1.4026240169662034


In [11]:
from sklearn.linear_model import LassoLarsIC
# LARS-based Lasso with alpha selected by an information criterion (AIC/BIC);
# the library-default criterion is used here.
model = LassoLarsIC().fit(train_data, train_label)
test_pred = model.predict(test_data)
# RMSE on the held-out rows.
print(np.sqrt(metrics.mean_squared_error(y_true=test_label, y_pred=test_pred)))

1.4048770912814341


In [12]:
from sklearn.linear_model import ElasticNet
# Elastic net (combined L1 + L2 penalty): overall strength alpha = 0.1,
# with l1_ratio = 0.1 setting the L1/L2 mix.
model = ElasticNet(alpha=0.1, l1_ratio=0.1).fit(train_data, train_label)
test_pred = model.predict(test_data)
# RMSE on the held-out rows.
print(np.sqrt(metrics.mean_squared_error(y_true=test_label, y_pred=test_pred)))

1.4051815501709382


In [13]:
from sklearn.linear_model import ElasticNetCV
# Elastic net with its hyper-parameters tuned by 5-fold cross-validation.
model = ElasticNetCV(cv=5).fit(train_data, train_label)
test_pred = model.predict(test_data)
# RMSE on the held-out rows.
print(np.sqrt(metrics.mean_squared_error(y_true=test_label, y_pred=test_pred)))

1.4180560183339896


In [14]:
from sklearn.linear_model import OrthogonalMatchingPursuit
# Orthogonal matching pursuit, with the number of non-zero coefficients
# set to 2.
model = OrthogonalMatchingPursuit(n_nonzero_coefs=2).fit(train_data, train_label)
test_pred = model.predict(test_data)
# RMSE on the held-out rows.
print(np.sqrt(metrics.mean_squared_error(y_true=test_label, y_pred=test_pred)))

1.3879034699382897


In [15]:
from sklearn.linear_model import OrthogonalMatchingPursuitCV
# Orthogonal matching pursuit where the number of non-zero coefficients is
# chosen by 5-fold cross-validation.
model = OrthogonalMatchingPursuitCV(cv=5).fit(train_data, train_label)
test_pred = model.predict(test_data)
# RMSE on the held-out rows.
print(np.sqrt(metrics.mean_squared_error(y_true=test_label, y_pred=test_pred)))

1.3879034699382897


In [16]:
from sklearn.linear_model import BayesianRidge
# Bayesian ridge regression with the library-default settings.
model = BayesianRidge().fit(train_data, train_label)
test_pred = model.predict(test_data)
# RMSE on the held-out rows.
print(np.sqrt(metrics.mean_squared_error(y_true=test_label, y_pred=test_pred)))

1.4137303264637149


In [17]:
from sklearn.linear_model import ARDRegression
# ARD regression (sparse Bayesian learning) with the library-default settings.
model = ARDRegression().fit(train_data, train_label)
test_pred = model.predict(test_data)
# RMSE on the held-out rows.
print(np.sqrt(metrics.mean_squared_error(y_true=test_label, y_pred=test_pred)))

1.3890642344625879


In [28]:
from sklearn.linear_model import HuberRegressor
# Huber regression, intended for data containing strong outliers;
# epsilon is set to 1.5.
model = HuberRegressor(epsilon=1.5).fit(train_data, train_label)
test_pred = model.predict(test_data)
# RMSE on the held-out rows.
print(np.sqrt(metrics.mean_squared_error(y_true=test_label, y_pred=test_pred)))

1.3579951669009485


# 线性回归整理
$$h_{w}(x^{i})=w^{T}x^{i}+b$$

|name|中文名称|损失函数|正则项|优化方法|验证方法|超参数|参数|描述|
|--|--|--|--|--|--|--|--|--|
|LinearRegression|线性回归|$$ J(W)=\frac{1}{2M}\sum_{i=0}^{M}(h_{w}(x^{i})-y^{i})^{2}$$|-|最小二乘法|-|-|w，b|解决基本的线性回归问题|
|Ridge|岭回归|$$J(W)=\frac{1}{2M}\sum_{i=0}^{M}(h_{w}(x^{i})-y^{i})^{2}+\alpha \sum_{j=0}^{N}w^{2}$$|L2|最小二乘法|-|$$\alpha$$|w，b|解决过拟合问题，但参数均不为0|
|RidgeCV|岭回归|$$J(W)=\frac{1}{2M}\sum_{i=0}^{M}(h_{w}(x^{i})-y^{i})^{2}+\alpha \sum_{j=0}^{N}w^{2}$$|L2|最小二乘法|交叉验证|$$list(\alpha)$$|w，b|在超参数列表中找到一个最好的超参数|
|Lasso|最小绝对值收敛和选择算子、套索算法|$$J(W)=\frac{1}{2M}\sum_{i=0}^{M}(h_{w}(x^{i})-y^{i})^{2}+\alpha \sum_{j=0}^{N}\left | w \right |$$|L1|坐标下降算法|-|$$\alpha$$|w，b|允许参数为0，参数为稀疏矩阵|
|LassoCV|最小绝对值收敛和选择算子、套索算法|$$J(W)=\frac{1}{2M}\sum_{i=0}^{M}(h_{w}(x^{i})-y^{i})^{2}+\alpha \sum_{j=0}^{N}\left | w \right |$$|L1|坐标下降算法|交叉验证|$$list(\alpha)$$|w，b|在超参数列表中找到一个最好的超参数|
|LassoLars|最小绝对值收敛和选择算子、套索算法|$$J(W)=\frac{1}{2M}\sum_{i=0}^{M}(h_{w}(x^{i})-y^{i})^{2}+\alpha \sum_{j=0}^{N}\left | w \right |$$|L1|最小角回归法|-|$$\alpha$$|w，b|样本数远小于样本特征数,可以看到回归路径|
|LassoLarsCV|最小绝对值收敛和选择算子、套索算法|$$J(W)=\frac{1}{2M}\sum_{i=0}^{M}(h_{w}(x^{i})-y^{i})^{2}+\alpha \sum_{j=0}^{N}\left | w \right |$$|L1|最小角回归法|交叉验证|$$list(\alpha)$$|w，b|在超参数列表中找到一个最好的超参数|
|LassoLarsIC|最小绝对值收敛和选择算子、套索算法|$$J(W)=\frac{1}{2M}\sum_{i=0}^{M}(h_{w}(x^{i})-y^{i})^{2}+\alpha \sum_{j=0}^{N}\left | w \right |$$|L1|最小角回归法|AIC,BIC|-|α，w，b|自动求超参|
|ElasticNet|弹性网络|$J(W)=\frac{1}{2M}\sum_{i=0}^{M}(h_{w}(x^{i})-y^{i})^{2}+\\\alpha(\beta  \sum_{j=0}^{N}\left | w \right |+\frac{(1-\beta)}{2}\sum_{j=0}^{N}w^{2})$|L1, L2|坐标轴下降法|-|$\alpha,\beta$|w，b|少部分参数为0，防止过拟合|
|ElasticNetCV|弹性网络|$J(W)=\frac{1}{2M}\sum_{i=0}^{M}(h_{w}(x^{i})-y^{i})^{2}+\\\alpha(\beta  \sum_{j=0}^{N}\left | w \right |+\frac{(1-\beta)}{2}\sum_{j=0}^{N}w^{2})$|L1, L2|坐标轴下降法|交叉验证|-|$\alpha,\beta,w, b$|自动求超参数|
|OrthogonalMatchingPursuit|正交匹配追踪法|$ J(W)=\frac{1}{2M}\sum_{i=0}^{M}(h_{w}(x^{i})-y^{i})^{2}\\ \text{subject to } \lVert w \rVert_{0}\leq n_{\text{nonzero coefs}}$|-|前向选择算法|-|n|w，b|自行设置非0参数的个数，精度较低|
|OrthogonalMatchingPursuitCV|正交匹配追踪法|$ J(W)=\frac{1}{2M}\sum_{i=0}^{M}(h_{w}(x^{i})-y^{i})^{2}\\ \text{subject to } \lVert w \rVert_{0}\leq n_{\text{nonzero coefs}}$|-|前向选择算法|交叉验证|-|n,w，b|自动求超参数，精度较低|
|BayesianRidge|贝叶斯岭回归|$$J(W)=\frac{1}{2M}\sum_{i=0}^{M}(h_{w}(x^{i})-y^{i})^{2}+\alpha \sum_{j=0}^{N}w^{2}$$|L2|最大似然|-|-|$\alpha,w,b$|对病态数据鲁棒性很高，时间花费大|
|ARDRegression|稀疏贝叶斯学习 |$$J(W)=\frac{1}{2M}\sum_{i=0}^{M}(h_{w}(x^{i})-y^{i})^{2}+\alpha \sum_{j=0}^{N}w^{2}$$|L2|最大似然|-|-|$\alpha,w,b$|对病态数据鲁棒性很高，时间花费大|
|HuberRegressor|- |-|-|-|-|-|-|适用于具有强离群点的数据|


# 总结
线性回归= 基本模型+损失函数(优化目标)+正则化方式+优化方法

sklearn中实现的各种线性回归算法，基本都是在上述公式的基础上变化而来，各有优劣。选择模型时，应从数据量（计算速度）、精度要求以及数据自身的特点入手，选择不同的模型。例如，广告投放方式及花费的数据中存在大量的离群点，所以用HuberRegressor效果最好。