# Project 1: Car fuel efficiency

In [5]:
import pandas as pd

# Source file is read straight from the UCI repository; it has no header row,
# so the column labels are supplied explicitly below.
url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data'
column_names = [
    "MPG",
    "Cylinders",
    "Displacement",
    "Horsepower",
    "Weight",
    "Acceleration",
    "Model Year",
    "Origin",
]
df = pd.read_csv(
    url,
    names=column_names,
    na_values="?",           # "?" entries are parsed as NaN
    comment='\t',            # anything after a tab on a line is ignored
    sep=" ",
    skipinitialspace=True,   # runs of spaces count as a single separator
)

In [6]:
# Quick sanity check of the loaded frame: (number of rows, number of columns).
df.shape

(398, 8)

In [7]:
# Drop the rows without values (any NaN), then renumber the index from 0
# so that it is contiguous again.
df = df.dropna().reset_index(drop=True)

In [9]:
# Train/test split, followed by standardisation of the continuous features.
import sklearn
import sklearn.model_selection
from pandas import DataFrame
from typing import Tuple
split: Tuple[DataFrame,DataFrame] = sklearn.model_selection.train_test_split(
    df,
    train_size=0.8,
    random_state=1,
)
df_train, df_test = split
# Per-column summary statistics computed on the training rows only;
# both splits are scaled with these values.
train_stats = df_train.describe().transpose()

numeric_column_names = ['Cylinders','Displacement','Horsepower','Weight','Acceleration']
df_train_norm, df_test_norm = df_train.copy(), df_test.copy()
# Cast to float first so the scaled values are stored in float columns.
for frame in (df_train_norm, df_test_norm):
    frame[numeric_column_names] = frame[numeric_column_names].astype(float)
for col_name in numeric_column_names:
    mean, std = train_stats.loc[col_name,'mean'], train_stats.loc[col_name,'std']
    # z-score: subtract the training mean and divide by the training std
    for frame in (df_train_norm, df_test_norm):
        frame.loc[:,col_name] = (frame.loc[:,col_name]-mean)/std
df_train_norm.tail()

Unnamed: 0,MPG,Cylinders,Displacement,Horsepower,Weight,Acceleration,Model Year,Origin
203,28.0,-0.824303,-0.90102,-0.736562,-0.950031,0.255202,76,3
255,19.4,0.351127,0.4138,-0.340982,0.29319,0.548737,78,1
72,13.0,1.526556,1.144256,0.713897,1.339617,-0.625403,72,1
235,30.5,-0.824303,-0.89128,-1.053025,-1.072585,0.475353,77,1
37,14.0,1.526556,1.563051,1.636916,1.47042,-1.35924,71,1


Next, we group the fine-grained model-year information into buckets, assigning each car to one of four
year buckets as follows:
$$bucket=\begin{cases}
    0 \text{ if }year < 73 \\
    1 \text{ if }73 \leq year < 76 \\
    2 \text{ if }76 \leq year < 79 \\
    3 \text{ if }year \geq 79 \\
\end{cases}$$

In [10]:
import torch
# Bucket edges for 'Model Year'; three edges give four buckets numbered 0..3.
boundaries = torch.tensor([73,76,79])
# Apply the same bucketing to both splits.
for frame in (df_train_norm, df_test_norm):
    v = torch.tensor(frame['Model Year'].values)
    # right=True places a value that equals an edge in the upper bucket
    # (e.g. 73 -> bucket 1), matching the "lower <= year < upper" definition.
    frame['Model Year Bucketed'] = torch.bucketize(v,boundaries,right=True).numpy()
# Register the new feature only once: an unconditional append would add the
# column a second time whenever this cell is re-run, changing the feature count.
if 'Model Year Bucketed' not in numeric_column_names:
    numeric_column_names.append('Model Year Bucketed')

For categorical features, we can either encode the categories as one-hot vectors or use
`nn.Embedding` to map each category to a dense vector of floats (the `nn.Embedding` weights are trainable, which makes it
well suited to features with many distinct categories).

In [11]:
# Here we will use one-hot-encoding
from torch.nn.functional import one_hot
# Number of distinct 'Origin' categories, taken from the training split.
total_origin = len(set(df_train_norm['Origin']))
# The modulo folds the raw codes into the range [0, total_origin) that one_hot needs.
# NOTE(review): this assumes the codes are the integers 1..total_origin - confirm.
# num_classes is passed explicitly: without it one_hot sizes its output from the
# largest value present, so the two splits could end up with different widths.
origin_encoded = one_hot(
    torch.from_numpy(df_train_norm['Origin'].values) % total_origin,
    num_classes=total_origin,
)
x_train_numeric = torch.tensor(df_train_norm[numeric_column_names].values)
x_train = torch.cat((x_train_numeric,origin_encoded),1).float()
origin_encoded = one_hot(
    torch.from_numpy(df_test_norm['Origin'].values) % total_origin,
    num_classes=total_origin,
)
x_test_numeric = torch.tensor(df_test_norm[numeric_column_names].values)
x_test = torch.cat((x_test_numeric,origin_encoded),1).float()

In [12]:
# Build the regression targets (the MPG column) as float32 tensors.
y_train = torch.tensor(df_train_norm["MPG"].to_numpy(), dtype=torch.float32)
y_test = torch.tensor(df_test_norm["MPG"].to_numpy(), dtype=torch.float32)

# Training a DNN regression model

In [18]:
from torch.utils.data import TensorDataset, DataLoader
# Pair every feature row with its target so they are batched together.
train_ds = TensorDataset(x_train,y_train)
batch_size = 8
# Seed the global RNG before building the loader.
torch.manual_seed(1)
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)

In [19]:
import torch.nn as nn
# Creating the model: a stack of Linear+ReLU blocks followed by a single-output
# Linear layer for the regression target.
hidden_units = [8, 4]
input_size = x_train.shape[1]
all_layers = []
for hidden_unit in hidden_units:
    all_layers.append(nn.Linear(input_size,hidden_unit))
    all_layers.append(nn.ReLU())
    # the next layer consumes what this one produces
    input_size = hidden_unit

# input_size now holds the width feeding the output layer. Using it instead of
# hidden_units[-1] gives the same result for a non-empty list and also works
# when hidden_units is empty (plain linear regression) instead of raising IndexError.
all_layers.append(nn.Linear(input_size,1))
model = nn.Sequential(*all_layers)
model

Sequential(
  (0): Linear(in_features=9, out_features=8, bias=True)
  (1): ReLU()
  (2): Linear(in_features=8, out_features=4, bias=True)
  (3): ReLU()
  (4): Linear(in_features=4, out_features=1, bias=True)
)

In [20]:
# Mean squared error as the training objective.
loss_fn = nn.MSELoss()
# Plain stochastic gradient descent over all model parameters.
optimizer = torch.optim.SGD(params=model.parameters(), lr=1e-3)

In [21]:
torch.manual_seed(1)
num_epochs = 200   # total passes over the training data
log_epochs = 20    # print the loss every log_epochs epochs
# Use the "mps" backend when it is available and fall back to the CPU otherwise,
# so that this cell does not fail on machines without that backend.
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
model = model.to(device)

In [None]:
# Training cycle
# Send each batch to whatever device the model's parameters are on, rather than
# to a hard-coded backend name, so the loop follows the model wherever it was moved.
device = next(model.parameters()).device
for epoch in range(num_epochs):
    loss_hist_train = 0  # running sum of the per-batch mean losses for this epoch
    for x_batch, y_batch in train_dl:
        x_batch, y_batch = x_batch.to(device), y_batch.to(device)
        # Select column 0 of the model output so pred is a flat vector like y_batch.
        pred: torch.Tensor = model(x_batch)[:,0]
        loss: torch.Tensor = loss_fn(pred,y_batch)
        loss.backward()
        optimizer.step()
        # Reset the gradients so they do not accumulate into the next batch.
        optimizer.zero_grad()
        loss_hist_train += loss.item()
    if epoch % log_epochs == 0:
        # len(train_dl) is the number of batches per epoch, so the printed value
        # is the average of the per-batch losses.
        print(f'Epoch {epoch} Loss {loss_hist_train/len(train_dl):.4f}')

Epoch 0 Loss 536.1047
Epoch 20 Loss 8.4361
Epoch 40 Loss 7.8695
Epoch 60 Loss 7.1891
Epoch 80 Loss 6.7062
Epoch 100 Loss 6.7599
Epoch 120 Loss 6.3124
Epoch 140 Loss 6.6864
Epoch 160 Loss 6.7648
Epoch 180 Loss 6.2156


In [26]:
# Testing on the test dataset
# Evaluate on the device the model's parameters are on instead of a hard-coded
# backend name.
device = next(model.parameters()).device
with torch.no_grad():  # no gradients are needed for evaluation
    # Move the targets once and reuse them for both metrics.
    y_true = y_test.to(device)
    pred = model(x_test.float().to(device))[:,0]
    loss = loss_fn(pred,y_true)
    print(f'Test MSE: {loss.item():.4f}')
    print(f'Test MAE: {nn.L1Loss()(pred,y_true).item():.4f}')

Test MSE: 9.6130
Test MAE: 2.1211
