## 第二章 机器学习实践作业

### 1.在线性回归实验中，我们生成了100个训练样本，如果生成5000个训练样本，模型的拟合效果会发生怎样的变化？请实践并给出结论。<span style="color:red">(必修题)</span>
<span style="color:red">提示：课节2：机器学习中[项目]第2章（上）：线性回归理论解读</span>

#### 数据构建

In [1]:
# Ground-truth function; its parameters default to w=1.2 and b=0.5.
def linear_func(x,w=1.2,b=0.5):
    """
    Evaluate the line ``w * x + b``.

    Inputs:
       - x: value(s) at which to evaluate the line; anything supporting
         multiplication by ``w`` and addition of ``b``
       - w: slope, defaults to 1.2
       - b: intercept, defaults to 0.5
    Outputs:
       - the result of ``w * x + b``
    """
    return w * x + b

In [2]:
import paddle

def create_toy_data(func, interval, sample_num, noise = 0.0, add_outlier = False, outlier_ratio = 0.001):
    """
    Generate a noisy toy data set by sampling a given function.

    Inputs:
       - func: callable applied to the sampled feature tensor to get the clean labels
       - interval: (low, high) range from which x is drawn uniformly
       - sample_num: number of samples to draw
       - noise: standard deviation of the zero-mean Gaussian label noise
       - add_outlier: whether to turn some labels into outliers
       - outlier_ratio: fraction of the samples turned into outliers
    Outputs:
       - X: feature tensor, shape=[sample_num] (callers reshape it to [N, 1] when needed)
       - y: label tensor, func(X) plus noise; shape=[sample_num] for an element-wise func
    """

    # Uniform sampling: paddle.rand draws sample_num values in [0, 1),
    # which are then scaled and shifted into the requested interval.
    low, high = interval[0], interval[1]
    X = paddle.rand(shape = [sample_num]) * (high - low) + low
    y = func(X)

    # Gaussian label noise with mean 0 and standard deviation `noise`.
    # The shape is given as a plain list: the API documents a list/tuple or a
    # 1-D tensor, whereas paddle.to_tensor(int) is 0-D on newer Paddle releases.
    epsilon = paddle.normal(0, noise, [y.shape[0]])
    y = y + epsilon

    if add_outlier:
        # Outliers are made by scaling randomly chosen existing labels by 5;
        # no extra samples are appended.
        outlier_num = int(len(y)*outlier_ratio)
        if outlier_num != 0:
            # paddle.randint draws uniformly distributed indices in [0, len(y)).
            outlier_idx = paddle.randint(len(y),shape = [outlier_num])
            y[outlier_idx] = y[outlier_idx] * 5
    return X, y

In [3]:
from matplotlib import pyplot as plt # matplotlib 是 Python 的绘图库

# Build the toy regression data: a 100-sample training set, a 50-sample test set
# and a 5000-sample training set, all drawn from the same noisy linear function.
func = linear_func
interval = (-10,10)
train_num = 100 # number of training samples
test_num = 50 # number of test samples
noise = 2
X_train, y_train = create_toy_data(func=func, interval=interval, sample_num=train_num, noise = noise, add_outlier = False)
X_test, y_test = create_toy_data(func=func, interval=interval, sample_num=test_num, noise = noise, add_outlier = False)

# Larger training set used to study the effect of the sample count.
X_train_large, y_train_large = create_toy_data(func=func, interval=interval, sample_num=5000, noise = noise, add_outlier = False)

# paddle.linspace returns a Tensor with num evenly spaced values between start and stop; the output length is num
X_underlying = paddle.linspace(interval[0],interval[1],train_num) 
y_underlying = linear_func(X_underlying)

# Plot the data (only the 100-sample training set and the test set are drawn)
plt.scatter(X_train, y_train, marker='*', facecolor="none", edgecolor='#e4007f', s=50, label="train data")
plt.scatter(X_test, y_test, facecolor="none", edgecolor='#f19ec2', s=50, label="test data")
plt.plot(X_underlying, y_underlying, c='#000000', label=r"underlying distribution")
plt.legend(fontsize='x-large') # add a legend to the figure
plt.savefig('ml-vis.pdf') # save the figure to a PDF file
plt.show()

  from collections import MutableMapping
  from collections import Iterable, Mapping
  from collections import Sized
W0726 15:01:31.400933  3473 gpu_resources.cc:61] Please NOTE: device: 0, GPU Compute Capability: 7.0, Driver API Version: 11.2, Runtime API Version: 10.1
W0726 15:01:31.405462  3473 gpu_resources.cc:91] device: 0, cuDNN Version: 7.6.
  if isinstance(obj, collections.Iterator):
  return list(data) if isinstance(data, collections.MappingView) else data


<Figure size 640x480 with 1 Axes>

#### 模型构建

In [4]:
import paddle
from nndl.op import Op

paddle.seed(10) #设置随机种子

# Linear operator
class Linear(Op):
    """Linear map ``y = X w + b`` whose parameters are kept in ``self.params``."""

    def __init__(self, input_size):
        """
        Inputs:
           - input_size: length of the feature vector the model consumes
        """

        self.input_size = input_size

        # Model parameters: 'w' is drawn by paddle.randn with shape [input_size, 1],
        # 'b' starts as a single zero.
        self.params = {}
        self.params['w'] = paddle.randn(shape=[self.input_size,1],dtype='float32')
        self.params['b'] = paddle.zeros(shape=[1],dtype='float32')

    def __call__(self, X):
        return self.forward(X)

    # Forward pass
    def forward(self, X):
        """
        Inputs:
           - X: tensor, shape=[N, D]; the N sample vectors are stacked as rows,
             which differs from the vector notation used in the textbook
        Outputs:
           - y_pred: tensor; shape=[N, 1] with the [D, 1] weight created in
             __init__. If params['w'] is later replaced by a 1-D tensor of
             shape [D], the matmul result is 1-D with shape [N] instead.
        """

        N, D = X.shape

        # With no input features the prediction is just the bias for every sample.
        if self.input_size == 0:
            return paddle.full(shape=[N,1], fill_value=self.params['b'])

        # Validate the feature dimension of the input.
        assert D == self.input_size, (
            "expected {} input features, got {}".format(self.input_size, D))

        # paddle.matmul computes the product of the two tensors.
        return paddle.matmul(X, self.params['w']) + self.params['b']

# Note: to stay consistent with later chapters, X stacks the N sample vectors as rows, which differs from the vector notation used in the textbook
input_size = 3
N = 2
X = paddle.randn(shape=[N, input_size],dtype='float32') # draw 2 samples with 3 features each
model = Linear(input_size)
y_pred = model(X)
print("y_pred:", y_pred) # one prediction per sample, so there are 2 outputs as well

y_pred: Tensor(shape=[2, 1], dtype=float32, place=Place(gpu:0), stop_gradient=True,
       [[-7.15093279],
        [ 3.71817803]])


#### 损失函数

In [5]:
import paddle

# Mean squared error (MSE)
def mean_squared_error(y_true, y_pred):
    """
    Compute the mean of the squared differences between labels and predictions.

    Inputs:
       - y_true: tensor, ground-truth labels
       - y_pred: tensor, predicted labels
    Outputs:
       - error: the tensor returned by paddle.mean over all elements
         (callers in this file convert it with .item())

    Only the leading dimension of the two tensors is checked; the subtraction
    itself is left to the tensors' own arithmetic.
    """

    assert y_true.shape[0] == y_pred.shape[0]

    residual = y_true - y_pred
    # paddle.square squares element-wise; paddle.mean with the default
    # axis=None averages over every element.
    squared_residual = paddle.square(residual)
    return paddle.mean(squared_residual)

#### 模型优化

In [6]:
def optimizer_lsm(model, X, y, reg_lambda=0):
    """
    Fit a linear model in closed form by (optionally regularised) least squares.

    Inputs:
       - model: object with a `params` dict; its 'w' and 'b' entries are overwritten
       - X: tensor, feature data, shape=[N, D]
       - y: tensor, label data (the callers in this file pass shape [N, 1])
       - reg_lambda: float, regularisation coefficient, defaults to 0
    Outputs:
       - model: the same model object, with updated parameters
    """
    _, D = X.shape

    # Per-feature mean of the inputs and mean of the labels.
    feature_mean = paddle.mean(X,axis=0).T
    label_mean = paddle.mean(y)

    # Centre the features; paddle.subtract broadcasts the mean over the rows.
    centered = paddle.subtract(X,feature_mean)

    # Every centred entry is zero: predict the label mean with zero weights.
    if paddle.all(centered==0):
        model.params['b'] = label_mean
        model.params['w'] = paddle.zeros(shape=[D])
        return model

    # Invert (Xc^T Xc + reg_lambda * I) with paddle.inverse.
    gram = paddle.matmul(centered.T,centered) + reg_lambda*paddle.eye(num_rows = (D))
    gram_inv = paddle.inverse(gram)

    projected = paddle.matmul(gram_inv,centered.T)
    w = paddle.matmul(projected,(y-label_mean))
    b = label_mean-paddle.matmul(feature_mean,w)

    model.params['b'] = b
    model.params['w'] = paddle.squeeze(w,axis=-1)
    return model

#### 模型训练

In [7]:
# Fit a single-feature linear model on the training set by least squares
# and print the learned parameters.
input_size = 1
model = Linear(input_size)
model = optimizer_lsm(model,X_train.reshape([-1,1]),y_train.reshape([-1,1]))
print("w_pred:",model.params['w'].item(), "b_pred: ", model.params['b'].item())

# Training error: MSE between the training labels and the model's predictions.
y_train_pred = model(X_train.reshape([-1,1])).squeeze()
train_error = mean_squared_error(y_true=y_train, y_pred=y_train_pred).item()
print("train error: ",train_error)

w_pred: 1.1782047748565674 b_pred:  0.8127316832542419
train error:  3.8648953437805176


In [8]:
# Repeat the fit on the larger training set (X_train_large / y_train_large)
# and print the learned parameters.
model_large = Linear(input_size)
model_large = optimizer_lsm(model_large,X_train_large.reshape([-1,1]),y_train_large.reshape([-1,1]))
print("w_pred large:",model_large.params['w'].item(), "b_pred large: ", model_large.params['b'].item())

# Training error of the model fitted on the larger set.
y_train_pred_large = model_large(X_train_large.reshape([-1,1])).squeeze()
train_error_large = mean_squared_error(y_true=y_train_large, y_pred=y_train_pred_large).item()
print("train error large: ",train_error_large)

w_pred large: 1.2004742622375488 b_pred large:  0.5069178342819214
train error large:  3.966343879699707


#### 模型评估

In [9]:
# Test error (MSE) of the model fitted on the smaller training set.
y_test_pred = model(X_test.reshape([-1,1])).squeeze()
test_error = mean_squared_error(y_true=y_test, y_pred=y_test_pred).item()
print("test error: ",test_error)

test error:  5.9800310134887695


In [10]:
# Test error (MSE) of the model fitted on the larger training set, on the same test set.
y_test_pred_large = model_large(X_test.reshape([-1,1])).squeeze()
test_error_large = mean_squared_error(y_true=y_test, y_pred=y_test_pred_large).item()
print("test error large: ",test_error_large)

test error large:  5.949461936950684


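**观察**到：训练样本从100个增加到5000个后，参数估计更接近真实值（w：1.178→1.200，b：0.813→0.507；真实值为 w=1.2，b=0.5），但测试误差几乎保持不变（5.980→5.949）。原因是线性模型在100个样本时已经能较好地拟合数据，剩余误差主要来自标签噪声（噪声标准差为2，方差约为4），增加样本数量无法消除这部分误差。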

### 2.在基于线性回归模型的波士顿房价预测任务中，与房屋售价相关的特征比较少。那我们换一个更为复杂的数据集House Prices - Advanced Regression Techniques：[https://www.kaggle.com/competitions/house-prices-advanced-regression-techniques/data](https://www.kaggle.com/competitions/house-prices-advanced-regression-techniques/data)，通过特征选择、数据处理等过程，呈现出结果。<span style="color:red">(附加题&加分题)</span>
<span style="color:red">提示：课节2：机器学习中第2章（下）：基于线性回归完成波士顿房价预测任务</span>

In [11]:
class Runner(object):
    """Skeleton of the runner interface; every operation is a placeholder."""

    def __init__(self, model, optimizer, loss_fn, metric):
        # Keep the model, optimizer, loss function and evaluation metric.
        self.model, self.optimizer = model, optimizer
        self.loss_fn, self.metric = loss_fn, metric

    def train(self, train_dataset, dev_dataset=None, **kwargs):
        """Model training (placeholder: does nothing and returns None)."""

    def evaluate(self, data_set, **kwargs):
        """Model evaluation (placeholder: does nothing and returns None)."""

    def predict(self, x, **kwargs):
        """Model prediction (placeholder: does nothing and returns None)."""

    def save_model(self, save_path):
        """Model saving (placeholder: does nothing and returns None)."""

    def load_model(self, model_path):
        """Model loading (placeholder: does nothing and returns None)."""

In [12]:
import pandas as pd

# Load the training and test CSV files from ./data
train_data = pd.read_csv("./data/train.csv")
test_data = pd.read_csv("./data/test.csv")

# Print (rows, columns) of each table
print(train_data.shape)
print(test_data.shape)

(1460, 81)
(1459, 80)


训练数据集包括1460个样本，每个样本80个特征和1个标签， 而测试数据集包含1459个样本，每个样本80个特征。

In [13]:
# Preview the first four rows, showing the first four and the last three columns
print(train_data.iloc[0:4, [0, 1, 2, 3, -3, -2, -1]])

   Id  MSSubClass MSZoning  LotFrontage SaleType SaleCondition  SalePrice
0   1          60       RL         65.0       WD        Normal     208500
1   2          20       RL         80.0       WD        Normal     181500
2   3          60       RL         68.0       WD        Normal     223500
3   4          70       RL         60.0       WD       Abnorml     140000


可以观察到：第一个特征是ID， 这有助于模型识别每个训练样本。 虽然这很方便，但它不携带任何用于预测的信息。 因此，在将数据提供给模型之前，我们将其从数据集中删除。

In [14]:
# Stack the train and test features: drop the first column of both tables
# and additionally the last column of the training table.
all_features = pd.concat((train_data.iloc[:, 1:-1], test_data.iloc[:, 1:]))

print(all_features.shape)

(2919, 79)


 首先，我们将所有缺失的值替换为相应特征的平均值。然后，为了将所有特征放在一个共同的尺度上， 我们通过将特征重新缩放到零均值和单位方差来标准化数据。

In [15]:
# If the test data were unavailable, the mean and standard deviation could be computed from the training data instead
# Select the columns whose dtype is not 'object' and standardise each of them
numeric_features = all_features.dtypes[all_features.dtypes != 'object'].index
all_features[numeric_features] = all_features[numeric_features].apply(
    lambda x: (x - x.mean()) / (x.std()))

# After standardisation every column mean is zero, so missing values can be set to 0
all_features[numeric_features] = all_features[numeric_features].fillna(0)

In [16]:
# "dummy_na=True" treats "na" (a missing value) as a valid feature value and creates an indicator feature for it
all_features = pd.get_dummies(all_features, dummy_na=True)
all_features.shape

(2919, 331)

从pandas格式中提取特征，将其转换为张量表示用于训练。

In [17]:
# Split the stacked feature table back into its train and test parts
# and convert features and labels to float32 tensors.
n_train = train_data.shape[0]
train_features = paddle.to_tensor(all_features[:n_train].values, dtype='float32')
test_features = paddle.to_tensor(all_features[n_train:].values, dtype='float32')
# Labels are the SalePrice column reshaped into a single column
train_labels = paddle.to_tensor(train_data.SalePrice.values.reshape(-1, 1), dtype='float32')

train_features.shape, test_features.shape, train_labels.shape

([1460, 331], [1459, 331], [1460, 1])

In [18]:
# Bundle features and labels into the (X, y) pair that Runner.train unpacks
train_dataset = (train_features, train_labels)

In [19]:
# Instantiate the model; the input width is read from the feature tensor
# instead of being hard-coded, so it follows any change in preprocessing.
input_size = train_features.shape[1]
model=Linear(input_size)

In [20]:
# Use log-RMSE to stay consistent with the Kaggle evaluation metric

def log_rmse(net, features, labels):
    """
    Root-mean-squared error between the logarithms of predictions and labels.

    Inputs:
       - net: callable mapping `features` to a tensor of predictions
       - features: tensor passed to `net`
       - labels: tensor of target values with one entry per prediction
    Outputs:
       - float, the log-RMSE
    """
    # Clip predictions below 1 up to 1 to keep the logarithm stable.
    clipped_preds = paddle.clip(net(features), 1, float('inf'))

    # Flatten both sides so a [N] prediction and a [N, 1] label are compared
    # element by element instead of broadcasting to an [N, N] matrix.
    log_preds = paddle.log(paddle.reshape(clipped_preds, [-1]))
    log_labels = paddle.log(paddle.reshape(labels, [-1]))

    # Mean squared error of the logs, computed inline (the previous code
    # called a `loss` function that is not defined in this file).
    rmse = paddle.sqrt(paddle.mean(paddle.square(log_preds - log_labels)))
    return rmse.item()

In [21]:
import paddle
import os
from nndl.opitimizer import optimizer_lsm

class Runner(object):
    """Runner that fits a model with a closed-form optimizer and saves/loads its parameters."""

    def __init__(self, model, optimizer, loss_fn, metric):
        """
        Inputs:
           - model: callable model exposing a `params` attribute
           - optimizer: callable invoked as optimizer(model, X, y, reg_lambda)
           - loss_fn: not used by the training below; stored for inspection only
           - metric: callable invoked as metric(y_pred, y) by `evaluate`
        """
        # Model
        self.model = model
        # Evaluation metric
        self.metric = metric
        # Optimizer
        self.optimizer = optimizer
        # Loss function (may be None)
        self.loss_fn = loss_fn

    def train(self, dataset, reg_lambda, model_dir):
        """Fit the model on `dataset` = (X, y), then save its parameters under `model_dir`."""
        X, y = dataset
        self.optimizer(self.model, X, y, reg_lambda)

        # Save the model
        self.save_model(model_dir)

    def evaluate(self, dataset, **kwargs):
        """Return metric(y_pred, y) for `dataset` = (X, y).

        NOTE: the metric is called with (predictions, labels); a metric with a
        different signature, e.g. (net, features, labels), cannot be used here.
        """
        X, y = dataset
        y_pred = self.model(X)
        return self.metric(y_pred, y)

    def predict(self, X, **kwargs):
        """Return the model's output for X."""
        return self.model(X)

    def save_model(self, model_dir):
        """Write the model parameters to `<model_dir>/params.pdtensor`, creating the directory if needed."""
        os.makedirs(model_dir, exist_ok=True)

        params_saved_path = os.path.join(model_dir,'params.pdtensor')
        # Save the parameters of the model owned by this runner,
        # not those of whatever module-level `model` happens to exist.
        paddle.save(self.model.params,params_saved_path)

    def load_model(self, model_dir):
        """Replace the model parameters with those stored in `<model_dir>/params.pdtensor`."""
        params_saved_path = os.path.join(model_dir,'params.pdtensor')
        self.model.params=paddle.load(params_saved_path)

# Use the closed-form least-squares routine as the optimizer
optimizer = optimizer_lsm

# Instantiate the Runner; no loss function is passed, and log_rmse is the metric
runner = Runner(model, optimizer=optimizer,loss_fn=None, metric=log_rmse)

In [22]:
# Directory in which the model is saved
saved_dir = './work/models'

# Start training
runner.train(train_dataset, reg_lambda=0.1, model_dir=saved_dir)

In [23]:
# Print the learned weight next to the feature column it belongs to, then the bias.
columns_list = all_features.columns.to_list()
weights = runner.model.params['w'].tolist()
b = runner.model.params['b'].item()

# Pair each column name with its weight directly instead of indexing both lists.
for column_name, weight in zip(columns_list, weights):
    print(column_name,"weight:",weight)

print("b:",b)

MSSubClass weight: -2672.570068359375
LotFrontage weight: 267.6170654296875
LotArea weight: 5165.794921875
OverallQual weight: 9862.0380859375
OverallCond weight: 6298.2138671875
YearBuilt weight: 8863.8203125
YearRemodAdd weight: 1894.0037841796875
MasVnrArea weight: 3717.07763671875
BsmtFinSF1 weight: 6722.0185546875
BsmtFinSF2 weight: 1598.1077880859375
BsmtUnfSF weight: -101.866943359375
TotalBsmtSF weight: 7564.85595703125
1stFlrSF weight: 5657.9287109375
2ndFlrSF weight: 13275.34765625
LowQualFinSF weight: -1462.463623046875
GrLivArea weight: 15145.9404296875
BsmtFullBath weight: 1171.6802978515625
BsmtHalfBath weight: -55.33821105957031
FullBath weight: 2337.06005859375
HalfBath weight: 958.1455078125
BedroomAbvGr weight: -2902.66064453125
KitchenAbvGr weight: -2922.5517578125
TotRmsAbvGrd weight: 3245.8173828125
Fireplaces weight: 3888.0185546875
GarageYrBlt weight: -552.3795166015625
GarageCars weight: 4234.49560546875
GarageArea weight: 3041.69287109375
WoodDeckSF weight: 201

In [30]:
# Reload the saved parameters and predict on the test features
runner.load_model(saved_dir)
preds = runner.predict(test_features).detach().numpy()
# Reformat for export to Kaggle: flatten the predictions into one SalePrice column
test_data['SalePrice'] = pd.Series(preds.reshape([1, -1])[0])
# Keep only the Id and SalePrice columns and write them to submission.csv
submission = pd.concat([test_data['Id'], test_data['SalePrice']], axis=1)
submission.to_csv('submission.csv', index=False)

+ kaggle评测得分0.18711，评价指标为log-RMSE。为与测试指标保持一致，作业中Runner的评估指标（metric）也从MSE改为log-RMSE；模型参数仍由最小二乘法（optimizer_lsm）直接求解。

![](https://ai-studio-static-online.cdn.bcebos.com/4fadf2e9b0b84ccfa7cd270d0a3597a3d57ead861321436cb78d365447625ded)



###  3.小明想用线性回归模型来建模房子各个属性与房价之间的关系，包括房子的楼层数、每层的长、宽、高、底面周长、底面面积、底面面积的2倍等。请分析这些房屋属性同时用来预测房价是否合理？<span style="color:red">(附加题&简答题&加分题)</span>

我认为**不合理**。

*底面面积的2倍*和*底面面积*是可以相互推断出的，存在直接的线性相关关系；对矩形底面而言，*底面周长*等于2×（长+宽），同样是长和宽的线性组合。特征之间完全共线会使最小二乘法中的矩阵 $X^\top X$ 奇异（不可逆），参数的解不唯一，这样并不是一个好的特征工程。在特征选择中我们往往希望去除掉冗余的特征，它们不能提供更多的预测信息，反而有可能造成模型的过学习。
