In [23]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# 导入线性回归模型
from sklearn.linear_model import LinearRegression

# 导入数据集划分对象
from sklearn.model_selection import train_test_split

# Load the Boston housing dataset from the CMU StatLib mirror.
# The file is whitespace-delimited; a raw string is used for the regex
# separator so that `\s` is not treated as an (invalid) string escape.
data_url = "http://lib.stat.cmu.edu/datasets/boston"
# skiprows=22 drops the first 22 lines of the file (presumably the
# descriptive preamble); header=None because no column-name row is parsed.
raw_df = pd.read_csv(data_url, sep=r"\s+", skiprows=22, header=None)
# Each record spans two consecutive parsed rows: the even-indexed row is used
# in full, and the first two columns of the following odd-indexed row are
# appended to it to form the feature matrix.
data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
# The third column of every odd-indexed row is used as the regression target.
target = raw_df.values[1::2, 2]

# 导入回归模型的评估指标
from sklearn.metrics import mean_squared_error, r2_score  

from IPython.core.interactiveshell import InteractiveShell  # controls which expressions a cell echoes

# "all" makes the notebook display the result of every expression statement
# in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

# Matplotlib text rendering: avoid a garbled minus sign on axis ticks and
# select a font that can display Chinese characters.
plt.rcParams.update({
    'axes.unicode_minus': False,
    'font.sans-serif': ['Simhei'],
})
plt.style.use('ggplot')


In [24]:
# Bind the loaded feature matrix and target vector to the short names used
# by the following cells, then echo the feature matrix dimensions.
x, y = data, target
x.shape

(506, 13)

In [25]:
# Split the data into training and test subsets: 30% is held out for testing,
# and the random state is fixed so the split is repeatable.
Xtrain, Xtest, Ytrain, Ytest = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=123,
)

In [30]:
# Create the linear regression estimator and fit it on the training split
reg = LinearRegression().fit(Xtrain, Ytrain)

# Predict targets for the held-out test features
y_pred = reg.predict(Xtest)

# Evaluate on the test split: mean squared error, then R^2
mean_squared_error(Ytest, y_pred)
r2_score(Ytest, y_pred)

# Inspect the fitted model: intercept first, then the per-feature coefficients
reg.intercept_
reg.coef_

28.405854810508355

0.6485645742370689

28.98127038809571

array([-1.00993843e-01,  3.99582498e-02,  7.53760168e-02,  2.64563972e-01,
       -1.43409174e+01,  4.83318293e+00, -7.44105854e-03, -1.32674018e+00,
        2.73495492e-01, -1.30440007e-02, -9.56625955e-01,  6.42468925e-03,
       -4.86570773e-01])