# 汽车油耗预测实战

- 利用全连接网络模型来完成汽车的效能指标 MPG(Mile Per Gallon，
- 每加仑 燃油英里数)的预测问题实战。 
### 1. 准备数据

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, losses

In [2]:
# 下载数据集
# dataset_path = keras.utils.get_file("auto-mpg.data","http://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/autompg.data")
# 利用 pandas 读取数据集，字段有效能（公里数每加仑），气缸数，排量，马力，重量, 加速度，型号年份，产地
column_names = ['MPG','Cylinders','Displacement','Horsepower','Weight', 'Acceleration', 'Model Year', 'Origin'] 
raw_dataset = pd.read_csv('auto-mpg.data', names=column_names, na_values = "?", comment='\t', sep=" ", skipinitialspace=True) 
dataset = raw_dataset.copy()
dataset.head()

Unnamed: 0,MPG,Cylinders,Displacement,Horsepower,Weight,Acceleration,Model Year,Origin
0,18.0,8,307.0,130.0,3504.0,12.0,70,1
1,15.0,8,350.0,165.0,3693.0,11.5,70,1
2,18.0,8,318.0,150.0,3436.0,11.0,70,1
3,16.0,8,304.0,150.0,3433.0,12.0,70,1
4,17.0,8,302.0,140.0,3449.0,10.5,70,1


In [3]:
dataset.shape

(398, 8)

**清除数据缺失项**

In [4]:
dataset.isna().sum()  # 统计空白数据
# dataset = dataset.dropna() # 删除空白数据项

MPG             0
Cylinders       0
Displacement    0
Horsepower      6
Weight          0
Acceleration    0
Model Year      0
Origin          0
dtype: int64

In [5]:
dataset.shape

(398, 8)

In [6]:
# 处理类别型数据，其中 origin 列代表了类别 1,2,3,分布代表产地：美国、欧洲、日本 
# 先弹出(删除并返回)origin 这一列 
origin = dataset.pop('Origin')
# 根据 origin 列来写入新的 3 个列 
dataset['USA'] = (origin == 1)*1.0 
dataset['Europe'] = (origin == 2)*1.0 
dataset['Japan'] = (origin == 2)*1.0
dataset.tail() # 查看新表格的后几项

Unnamed: 0,MPG,Cylinders,Displacement,Horsepower,Weight,Acceleration,Model Year,USA,Europe,Japan
393,27.0,4,140.0,86.0,2790.0,15.6,82,1.0,0.0,0.0
394,44.0,4,97.0,52.0,2130.0,24.6,82,0.0,1.0,1.0
395,32.0,4,135.0,84.0,2295.0,11.6,82,1.0,0.0,0.0
396,28.0,4,120.0,79.0,2625.0,18.6,82,1.0,0.0,0.0
397,31.0,4,119.0,82.0,2720.0,19.4,82,1.0,0.0,0.0


**切分数据为训练集和测试集**

In [7]:
# 切分为训练集和测试集 
train_dataset = dataset.sample(frac=0.8,random_state=0) 
test_dataset = dataset.drop(train_dataset.index) 
# 移动 MPG 油耗效能这一列为真实标签 Y 
train_labels = train_dataset.pop('MPG') 
test_labels = test_dataset.pop('MPG') 

In [8]:
# 查看训练集的输入 X 的统计数据 
train_stats = train_dataset.describe() 
train_stats = train_stats.transpose() # 转置 

In [21]:
# 标准化数据
def norm(x):
    return (x - train_stats['mean']) / train_stats['std']
normed_train_data = norm(train_dataset)
normed_test_data = norm(test_dataset)

In [22]:
print(normed_train_data.shape,train_labels.shape)
print(normed_test_data.shape, test_labels.shape)

(318, 9) (318,)
(80, 9) (80,)


In [23]:
train_db = tf.data.Dataset.from_tensor_slices((normed_train_data.values,train_labels.values)) # 构建 Dataset 对象 
train_db = train_db.shuffle(100).batch(32) # 随机打散，批量化 

In [24]:
train_db

<BatchDataset shapes: ((None, 9), (None,)), types: (tf.float64, tf.float64)>

### 2. 创建网络

In [None]:
class Network(keras.Model):
    # 回归网络
    def __init__(self):
        super(Network, self).__init__()
        # 创建3个全连接层
        self.fc1 = layers.Dense(64, activation='relu')
        self.fc2 = layers.Dense(64, activation='relu')
        self.fc3 = layers.Dense(1)

    def call(self, inputs, training=None, mask=None):
        # 依次通过3个全连接层
        x = self.fc1(inputs)
        x = self.fc2(x)
        x = self.fc3(x)

        return x

In [None]:
model = Network()
# 通过 build 函数完成内部张量的创建，其中 4 为任意设置的 batch 数量，9 为输入特征长度 
model.build(input_shape=(None, 9))
model.summary()
# 创建优化器，指定学习率
optimizer = tf.keras.optimizers.RMSprop(0.001)

In [None]:
tf.keras.backend.set_floatx('float64')
train_mae_losses = []
test_mae_losses = []
for epoch in range(100):
    for step, (x,y) in enumerate(train_db): # 遍历一次训练集
        # 梯度记录器，训练时使用
        with tf.GradientTape() as tape:
            out = model(x) # 通过网络获得输出
            loss = tf.reduce_mean(losses.MSE(y, out)) # 计算 MSE
            mae_loss = tf.reduce_mean(losses.MAE(y, out)) # MAE

        if step % 10 == 0:
            print(epoch, step, float(loss))

        grads = tape.gradient(loss, model.trainable_variables)
        optimizer.apply_gradients(zip(grads, model.trainable_variables))

    train_mae_losses.append(float(mae_loss))
    out = model(tf.constant(normed_test_data.values))
    test_mae_losses.append(tf.reduce_mean(losses.MAE(test_labels, out)))

In [None]:
plt.figure()
plt.xlabel('Epoch')
plt.ylabel('MAE')
plt.plot(train_mae_losses,  label='Train')

plt.plot(test_mae_losses, label='Test')
plt.legend()