# Project one – predicting the fuel efficiency of a car

## Working with feature columns

In [3]:
import pandas as pd 
url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data'
column_names = ['MPG', 'Cylinders', 'Displacement', 'Horsepower','Weight', 'Acceleration', 'Model Year', 'Origin']

df = pd.read_csv(url, names=column_names, na_values='?', comment='\t', sep=" ", skipinitialspace=True)

## drop the NA rows
df = df.dropna()
df = df.reset_index(drop=True)

In [4]:
## train/test splits:
import sklearn
import sklearn.model_selection 

df_train, df_test = sklearn.model_selection.train_test_split(df, train_size=0.8, random_state=1)
train_stats = df_train.describe().transpose()

In [5]:
numeric_column_names = ['Cylinders', 'Displacement', 'Horsepower', 'Weight', 'Acceleration' ]
df_train_norm, df_test_norm = df_train.copy(), df_test.copy()

for col_name in numeric_column_names:
    mean = train_stats.loc[col_name, 'mean']
    std = train_stats.loc[col_name, 'std']
    df_train_norm.loc[:, col_name] = (df_train_norm.loc[:, col_name]-mean)/std
    df_test_norm.loc[:, col_name] = (df_test_norm.loc[:, col_name] - mean)/std

df_train_norm.tail()

Unnamed: 0,MPG,Cylinders,Displacement,Horsepower,Weight,Acceleration,Model Year,Origin
203,28.0,-0.824303,-0.90102,-0.736562,-0.950031,0.255202,76,3
255,19.4,0.351127,0.4138,-0.340982,0.29319,0.548737,78,1
72,13.0,1.526556,1.144256,0.713897,1.339617,-0.625403,72,1
235,30.5,-0.824303,-0.89128,-1.053025,-1.072585,0.475353,77,1
37,14.0,1.526556,1.563051,1.636916,1.47042,-1.35924,71,1


In [8]:
# Group the ModelYear feature values into buckets
import torch

boundries = torch.tensor([73, 76, 79])

v = torch.tensor(df_train_norm['Model Year'].values)
df_train_norm['Model Year Bucketed'] = torch.bucketize(v, boundries, right=True)

v = torch.tensor(df_test_norm['Model Year'].values)
df_test_norm['Model Year Bucketed'] = torch.bucketize(v, boundries, right=True)

numeric_column_names.append('Model Year Bucketed')

In [9]:
# use the one-hot-encoding approach on the categorical feature in order to convert it into the dense format
from torch.nn.functional import one_hot
total_origin = len(set(df_train_norm['Origin']))

origin_encoded = one_hot(torch.from_numpy(df_train_norm['Origin'].values) % total_origin)
X_train_numeric = torch.tensor(df_train_norm[numeric_column_names].values)
X_train = torch.cat([X_train_numeric, origin_encoded], 1).float()   

origin_encoded = one_hot(torch.from_numpy(df_test_norm['Origin'].values) % total_origin)
X_test_numeric = torch.tensor(df_test_norm[numeric_column_names].values)
X_test = torch.cat([X_test_numeric, origin_encoded], 1).float()

In [10]:
X_train

tensor([[-0.8243, -0.5309, -0.4992,  ...,  0.0000,  1.0000,  0.0000],
        [ 0.3511,  0.3456,  0.1865,  ...,  0.0000,  1.0000,  0.0000],
        [-0.8243, -0.8913, -0.5256,  ...,  0.0000,  0.0000,  1.0000],
        ...,
        [ 1.5266,  1.1443,  0.7139,  ...,  0.0000,  1.0000,  0.0000],
        [-0.8243, -0.8913, -1.0530,  ...,  0.0000,  1.0000,  0.0000],
        [ 1.5266,  1.5631,  1.6369,  ...,  0.0000,  1.0000,  0.0000]])

In [11]:
y_train = torch.tensor(df_train_norm['MPG'].values).float()
y_test = torch.tensor(df_test_norm['MPG'].values).float()

## Training a DNN regression model

In [12]:
# create a data loader that uses a batch size of 8
from torch.utils.data import DataLoader, TensorDataset

train_ds =  TensorDataset(X_train, y_train)
batch_size = 8
torch.manual_seed(1)
train_dl = DataLoader(train_ds, batch_size, shuffle=True)

In [16]:
# build a model with two fully connected layers where one has 8 hidden units and another has 4
import torch.nn as nn

hidden_units = [8, 4]
input_size = X_train.shape[1]
all_layers = []

for unit in hidden_units:
    layer = nn.Linear(input_size, unit)
    all_layers.append(layer)
    all_layers.append(nn.ReLU())
    input_size = unit
all_layers.append(nn.Linear(hidden_units[-1], 1))

model = nn.Sequential(*all_layers) #translate the lists into layers
model

Sequential(
  (0): Linear(in_features=10, out_features=8, bias=True)
  (1): ReLU()
  (2): Linear(in_features=8, out_features=4, bias=True)
  (3): ReLU()
  (4): Linear(in_features=4, out_features=1, bias=True)
)

In [18]:
# define the MSE loss function for regression and use stochastic gradient descent for optimization

loss_fn = nn.MSELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.001)

In [20]:
# train the model for 200 epochs and display the train loss for every 20 epochs
torch.manual_seed(1)
num_epochs = 200 
log_epochs = 20

for epoch in range(num_epochs):
    loss_hist_train = 0
    for x_batch, y_batch in train_dl:
        pred = model(x_batch)[:, 0]
        loss = loss_fn(pred, y_batch)
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        loss_hist_train += loss.item()
    if epoch % log_epochs == 0:
        print(f'Epoch {epoch} Loss ' f'{loss_hist_train/len(train_dl):.4f}')

Epoch 0 Loss 8.0101
Epoch 20 Loss 7.1461
Epoch 40 Loss 6.8620
Epoch 60 Loss 6.6624
Epoch 80 Loss 6.4889
Epoch 100 Loss 6.4133
Epoch 120 Loss 6.2305
Epoch 140 Loss 6.3004
Epoch 160 Loss 7.0872
Epoch 180 Loss 5.7517


In [21]:
# evaluate the regression performance of the trained model on the test dataset.

with torch.no_grad():
    pred = model(X_test.float())[:, 0]
    loss = loss_fn(pred, y_test)
    print(f'Test MSE: {loss.item():.4f}')
    print(f'Test MAE: {nn.L1Loss()(pred, y_test).item():.4f}')

Test MSE: 9.0929
Test MAE: 2.1459
