# 线性回归

## 线性回归分析房屋信息

In [2]:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator
from sklearn.preprocessing import StandardScaler

In [3]:
#从源文件加载数据，并输出查看数据的各项特征
lines = np.loadtxt('USA_Housing.csv', delimiter=',', dtype='str')
header = lines[0]
lines = lines[1:].astype(float)

print('数据特征：', ','.join(header[:-1]))
print('数据标签', header[-1])
print('数据总条数', len(lines))

数据特征： Avg. Area Income,Avg. Area House Age,Avg. Area Number of Rooms,Avg. Area Number of Bedrooms,Area Population
数据标签 Price
数据总条数 5000


In [4]:
#划分训练集和测试集
ratio = 0.8
split = int(len(lines) * ratio)
np.random.seed(0)
lines = np.random.permutation(lines)
train, test = lines[:split], lines[split:]

In [5]:
# 数据归一化
scaler = StandardScaler()
scaler.fit(train)
train = scaler.transform(train)
test = scaler.transform(test)

In [6]:
# 划分输入输出标签
x_train, y_train = train[:, :-1], train[:, -1].flatten()
x_test, y_test = test[:, :-1], test[:, 1].flatten()

In [7]:
# 在X矩阵最后添加一列1，代表常数项
X = np.concatenate([x_train, np.ones((len(x_train), 1))], axis=1)
# @ 表示矩阵相乘，X.T表示矩阵的转置，np.linalg.inv函数可以计算矩阵的逆
theta = np.linalg.inv(X.T @ X) @ X.T @ y_train
print('回归系数：', theta)

回归系数： [ 6.50881254e-01  4.67222833e-01  3.38466198e-01  6.17275856e-03
  4.26857089e-01 -1.46133106e-14]


In [9]:
# 在测试集上使用回归系数进行预测
X_test = np.concatenate([x_test, np.ones((len(x_test), 1))], axis=1)
y_pred = X_test @ theta

In [10]:
# 计算预测值和真实值之间的RMSE
rmse_loss = np.sqrt(np.square(y_test - y_pred).mean())
print('RMSE：', rmse_loss)

RMSE： 1.0051766837744343


## 使用sklearn中的线性模型

In [11]:
from sklearn.linear_model import LinearRegression

In [12]:
# 初始化线性模型
linreg = LinearRegression()
linreg.fit(x_train, y_train)

In [13]:
print('回归系数：', linreg.coef_, linreg.intercept_)
y_pred = linreg.predict(x_test)

回归系数： [0.65088125 0.46722283 0.3384662  0.00617276 0.42685709] -1.4635041882766183e-14


In [14]:
# 计算预测值和真实值之间的RMSE
rmse_test = np.sqrt(np.square(y_test - y_pred).mean())
print('RMSE：', rmse_loss)

RMSE： 1.0051766837744343
