In [1]:
import torch
from torch import nn
from torch.nn.functional import one_hot
from torch.utils.data import DataLoader,TensorDataset
import pandas as pd
from ucimlrepo import fetch_ucirepo

import sklearn
import sklearn.model_selection

In [2]:
# fetch dataset 
auto_mpg = fetch_ucirepo(id=9) 
  
# data (as pandas dataframes) 
X = auto_mpg.data.features 
y = auto_mpg.data.targets
df = pd.concat([y,X],axis=1)
df.head()

Unnamed: 0,mpg,displacement,cylinders,horsepower,weight,acceleration,model_year,origin
0,18.0,307.0,8,130.0,3504,12.0,70,1
1,15.0,350.0,8,165.0,3693,11.5,70,1
2,18.0,318.0,8,150.0,3436,11.0,70,1
3,16.0,304.0,8,150.0,3433,12.0,70,1
4,17.0,302.0,8,140.0,3449,10.5,70,1


In [3]:
# Check for missing values
print(f'Shape: {df.shape}')
df.apply(lambda x: sum(x.isna()),axis='index')

Shape: (398, 8)


mpg             0
displacement    0
cylinders       0
horsepower      6
weight          0
acceleration    0
model_year      0
origin          0
dtype: int64

In [4]:
# Drop datapoints with missing data
df = df.dropna()
df.reset_index(drop=True)

Unnamed: 0,mpg,displacement,cylinders,horsepower,weight,acceleration,model_year,origin
0,18.0,307.0,8,130.0,3504,12.0,70,1
1,15.0,350.0,8,165.0,3693,11.5,70,1
2,18.0,318.0,8,150.0,3436,11.0,70,1
3,16.0,304.0,8,150.0,3433,12.0,70,1
4,17.0,302.0,8,140.0,3449,10.5,70,1
...,...,...,...,...,...,...,...,...
387,27.0,140.0,4,86.0,2790,15.6,82,1
388,44.0,97.0,4,52.0,2130,24.6,82,2
389,32.0,135.0,4,84.0,2295,11.6,82,1
390,28.0,120.0,4,79.0,2625,18.6,82,1


In [5]:
# train / test splits
df_train,df_val = sklearn.model_selection.train_test_split(df,train_size=0.8,random_state=47)

In [6]:
# normalization
train_stats = df_train.describe().transpose()
numeric_columns = ['displacement','cylinders','horsepower','weight','acceleration']
df_train_norm, df_val_norm = df_train.copy(), df_val.copy()
for col in numeric_columns:
    mean = train_stats.loc[col,'mean']
    std = train_stats.loc[col,'std']
    df_train_norm.loc[:,col] = (df_train_norm.loc[:,col] - mean) / std
    df_val_norm.loc[:,col] = (df_val_norm.loc[:,col] - mean) / std
df_train_norm.tail()

Unnamed: 0,mpg,displacement,cylinders,horsepower,weight,acceleration,model_year,origin
73,13.0,1.054933,1.458082,0.650544,1.293824,-0.553654,72,1
266,30.0,-0.920014,-0.858566,-0.94024,-0.960426,0.332124,78,1
329,44.6,-0.98616,-0.858566,-0.965897,-1.314284,-0.624516,80,3
396,28.0,-0.712124,-0.858566,-0.658004,-0.415137,1.076178,82,1
137,13.0,1.461262,1.458082,1.1637,1.991098,-0.376498,74,1


In [7]:
sorted(df_train_norm['model_year'].unique())

[70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82]

In [8]:
# Bucketize model_year col
boundaries = torch.tensor([73,76,79]) # (-inf,73),[73,76),[76,79),[79,inf)

v = torch.tensor(df_train_norm['model_year'].values)
df_train_norm['model_year'] = torch.bucketize(v,boundaries,right=True)

v = torch.tensor(df_val_norm['model_year'].values)
df_val_norm['model_year'] = torch.bucketize(v,boundaries,right=True)

numeric_columns.append('model_year')
df_train_norm.tail()

Unnamed: 0,mpg,displacement,cylinders,horsepower,weight,acceleration,model_year,origin
73,13.0,1.054933,1.458082,0.650544,1.293824,-0.553654,0,1
266,30.0,-0.920014,-0.858566,-0.94024,-0.960426,0.332124,2,1
329,44.6,-0.98616,-0.858566,-0.965897,-1.314284,-0.624516,3,3
396,28.0,-0.712124,-0.858566,-0.658004,-0.415137,1.076178,3,1
137,13.0,1.461262,1.458082,1.1637,1.991098,-0.376498,1,1


In [9]:
sorted(df_train_norm['origin'].unique())

[1, 2, 3]

In [10]:
# one-hot encode origin
n_origin = len(df_train_norm['origin'].unique())

origin_encoded = one_hot(torch.from_numpy(df_train_norm['origin'].values) % n_origin)
x_train_numeric = torch.tensor(df_train_norm[numeric_columns].values)
x_train = torch.cat([x_train_numeric,origin_encoded],axis=1).float()

origin_encoded = one_hot(torch.from_numpy(df_val_norm['origin'].values) % n_origin)
x_val_numeric = torch.tensor(df_val_norm[numeric_columns].values)
x_val = torch.cat([x_val_numeric,origin_encoded],axis=1).float()

y_train = torch.tensor(df_train_norm['mpg'].values).float()
y_val = torch.tensor(df_val_norm['mpg'].values).float()

In [11]:
x_train.shape

torch.Size([313, 9])

In [84]:
torch.manual_seed(47)

train_ds = TensorDataset(x_train,y_train)
batch_size = 8
train_dl = DataLoader(train_ds,batch_size,shuffle=True)
hidden_units = [8,4]
input_size = x_train.shape[1]
all_layers = []

for hidden_unit in hidden_units:
    all_layers.append(nn.Linear(input_size,hidden_unit))
    all_layers.append(nn.ReLU())
    input_size = hidden_unit
all_layers.append(nn.Linear(hidden_units[-1],1))

model = nn.Sequential(*all_layers)
model

Sequential(
  (0): Linear(in_features=9, out_features=8, bias=True)
  (1): ReLU()
  (2): Linear(in_features=8, out_features=4, bias=True)
  (3): ReLU()
  (4): Linear(in_features=4, out_features=1, bias=True)
)

In [85]:
loss_fn = nn.MSELoss()
optimizer = torch.optim.SGD(model.parameters(),lr=1e-3)

In [86]:
num_epochs = 201
log_epochs = 20

for epoch in range(num_epochs):
    loss_hist_train = 0
    for x_batch,y_batch in train_dl:
        preds = model(x_batch)[:,0]
        loss = loss_fn(preds,y_batch)
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        loss_hist_train += loss.item()
    if epoch % log_epochs == 0:
        print(f'Epoch {epoch}   loss {loss_hist_train/len(train_dl):.4f}')

with torch.no_grad():
    preds = model(x_val.float())[:,0]
    loss = loss_fn(preds,y_val)
    print(f'Val MSE: {loss.item():.4f}')
    print(f'Val MAE: {nn.L1Loss()(preds,y_val).item():.4f}')

Epoch 0   loss 318.2053
Epoch 20   loss 7.9971
Epoch 40   loss 7.8842
Epoch 60   loss 7.9316
Epoch 80   loss 7.1628
Epoch 100   loss 6.9388
Epoch 120   loss 6.9286
Epoch 140   loss 6.8324
Epoch 160   loss 6.8620
Epoch 180   loss 6.6586
Epoch 200   loss 6.3261
Val MSE: 7.0224
Val MAE: 2.0085


In [19]:
# r squared
1-(7.0224/torch.var(y_val)).item()

0.869275689125061