Predicting Car Mileage

Dataset: Auto MPG Dataset

Features: Cylinders, displacement, horsepower, weight, acceleration, model year, origin.

Target: Miles per gallon (mpg).

Steps
Load and preprocess the dataset.
Define a model using Keras.
Tune hyperparameters (number of layers, units, learning rate).
Train and evaluate the model.
Make predictions.

In [1]:
!pip install -q -U keras-tuner


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/129.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━[0m [32m122.9/129.1 kB[0m [31m4.5 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m129.1/129.1 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25h

In [40]:
import tensorflow as tf
from tensorflow import keras

In [41]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np
import keras_tuner as kt

In [42]:
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data"

In [43]:
column_names = ['MPG', 'Cylinders', 'Displacement', 'Horsepower', 'Weight', 'Acceleration', 'Model Year', 'Origin']

In [44]:
'''
Assigns proper column names.
Treats '?' as missing data (NaN).
Ignores comment lines starting with tabs.
Handles space-separated values and trims extra spaces.
'''

dataset = pd.read_csv(url, names = column_names, sep=' ', comment='\t', skipinitialspace=True, na_values = '?')

In [45]:
dataset.head()

Unnamed: 0,MPG,Cylinders,Displacement,Horsepower,Weight,Acceleration,Model Year,Origin
0,18.0,8,307.0,130.0,3504.0,12.0,70,1
1,15.0,8,350.0,165.0,3693.0,11.5,70,1
2,18.0,8,318.0,150.0,3436.0,11.0,70,1
3,16.0,8,304.0,150.0,3433.0,12.0,70,1
4,17.0,8,302.0,140.0,3449.0,10.5,70,1


In [46]:
dataset.isnull().sum()

Unnamed: 0,0
MPG,0
Cylinders,0
Displacement,0
Horsepower,6
Weight,0
Acceleration,0
Model Year,0
Origin,0


In [47]:
dataset.dtypes

Unnamed: 0,0
MPG,float64
Cylinders,int64
Displacement,float64
Horsepower,float64
Weight,float64
Acceleration,float64
Model Year,int64
Origin,int64


In [48]:
dataset['Origin'].value_counts()

Unnamed: 0_level_0,count
Origin,Unnamed: 1_level_1
1,249
3,79
2,70


In [49]:
dataset.shape

(398, 8)

In [50]:
dataset.isna().sum()

Unnamed: 0,0
MPG,0
Cylinders,0
Displacement,0
Horsepower,6
Weight,0
Acceleration,0
Model Year,0
Origin,0


In [51]:
dataset = dataset.dropna()

Why is This Important?

1. Eliminates Misinterpretation of Numerical Data:

2. The "Origin" column is categorical, but represented as numbers.
Without one-hot encoding, models might misinterpret the numerical values (e.g., "3 > 2 > 1") as meaningful, which isn't true.
Improves Model Performance:

3. Many models (especially linear regression and neural networks) work better with one-hot encoded categorical variables.

4. Machine-Readable Format:

By converting "Origin" into binary columns, the model can directly use these columns for training.


In [52]:
# One hot encoding with the  Origin column (categorical feature)

In [54]:
'''
The use of .astype(float) ensures:

Compatibility with machine learning models.
Uniformity in the dataset's numeric format.
Avoidance of potential errors when using Boolean values.

'''
dataset['USA'] = (dataset['Origin']==1).astype(float)
dataset['Europe'] = (dataset['Origin'] == 2).astype(float)
dataset['Japan'] = (dataset['Origin']==3).astype(float)

dataset = dataset.drop(columns = 'Origin', axis =1)


In [63]:
dataset.shape

(392, 10)

In [56]:
# Splitting dataset into features and target

In [57]:
x = dataset.drop('MPG', axis =1)
y = dataset['MPG']

In [60]:
x.shape

(392, 9)

In [61]:
y.shape

(392,)

In [64]:
x.head()

Unnamed: 0,Cylinders,Displacement,Horsepower,Weight,Acceleration,Model Year,USA,Europe,Japan
0,8,307.0,130.0,3504.0,12.0,70,1.0,0.0,0.0
1,8,350.0,165.0,3693.0,11.5,70,1.0,0.0,0.0
2,8,318.0,150.0,3436.0,11.0,70,1.0,0.0,0.0
3,8,304.0,150.0,3433.0,12.0,70,1.0,0.0,0.0
4,8,302.0,140.0,3449.0,10.5,70,1.0,0.0,0.0


In [65]:
y.head()

Unnamed: 0,MPG
0,18.0
1,15.0
2,18.0
3,16.0
4,17.0


In [67]:
# Spliting the data into train and test split
x_train, x_test, y_train, y_test  = train_test_split(x,y, test_size = 0.2, random_state = 42)

In [68]:
# Standardize the data

scalar = StandardScaler()

x_train = scalar.fit_transform(x_train)
x_test = scalar.transform(x_test)

In [77]:
# Defining model building function for the keras tuner


def model_builder(hp):

  model = keras.Sequential()

  model.add(keras.layers.Dense(units=hp.Int('units', min_value=32, max_value=512, step=32),
                               activation = 'relu', input_shape = (x_train.shape[1],)))

  # Add a tunable number of layers

  for i in range(hp.Int('num_layers', 1, 3)):
    model.add(keras.layers.Dense(units=hp.Int(f"units_{i}", min_value=32, max_value=512, step=32),
                                 activation = 'relu'))

  # Output layer

  model.add(keras.layers.Dense(1))

  model.compile(optimizer = keras.optimizers.Adam(learning_rate= hp.Choice('learning_rate', [1e-2, 1e-3, 1e-4])),
                 loss = 'mse',
                 metrics = ['mae'])

  return model


In [78]:
# Initialize the tuner

tuner = kt.Hyperband(
    model_builder,
    objective = 'val_mae',
    max_epochs = 20,
    factor = 3,
    directory = 'auto_mpg',
    project_name = 'mpg_tuning'
)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [79]:
# Stop early if there is no improvement

stop_early = keras.callbacks.EarlyStopping(monitor='val_loss', patience=5)

In [80]:
tuner.search(x_train,y_train, epochs = 20, validation_split = 0.2, callbacks = [stop_early])

Trial 30 Complete [00h 00m 11s]
val_mae: 1.927830457687378

Best val_mae So Far: 1.703674077987671
Total elapsed time: 00h 02m 21s


In [81]:
# Get the optimal hyperparameters
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]

print(f"""
The optimal number of units in the first layer is {best_hps.get('units')},
the optimal number of layers is {best_hps.get('num_layers')},
and the best learning rate is {best_hps.get('learning_rate')}.
""")


The optimal number of units in the first layer is 352,
the optimal number of layers is 2,
and the best learning rate is 0.01.



In [82]:
# Build the model with the best hyperparameters
model = tuner.hypermodel.build(best_hps)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [84]:
history = model.fit(x_train, y_train, epochs=50, validation_split=0.2, callbacks=[stop_early])


Epoch 1/50
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 53ms/step - loss: 318.0388 - mae: 14.8135 - val_loss: 142.9904 - val_mae: 8.4091
Epoch 2/50
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step - loss: 69.8680 - mae: 5.6475 - val_loss: 33.1589 - val_mae: 4.8421
Epoch 3/50
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step - loss: 30.7363 - mae: 4.5468 - val_loss: 14.6024 - val_mae: 2.8774
Epoch 4/50
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step - loss: 15.7096 - mae: 3.0823 - val_loss: 10.5823 - val_mae: 2.4492
Epoch 5/50
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step - loss: 10.9081 - mae: 2.5745 - val_loss: 9.9134 - val_mae: 2.3125
Epoch 6/50
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step - loss: 9.5256 - mae: 2.2818 - val_loss: 8.1131 - val_mae: 2.0876
Epoch 7/50
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step - loss: 7.8

In [86]:
# Evaluate the model
test_loss, test_mae = model.evaluate(x_test, y_test)
print(f"Test MAE: {test_mae}")


[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 8.7587 - mae: 2.3474 
Test MAE: 2.351032018661499


In [88]:
# Predict on test data
predictions = model.predict(x_test[:10])
print("Predictions for first 10 samples:", predictions.flatten())
print("True values for first 10 samples:", y_test[:10].values)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 163ms/step
Predictions for first 10 samples: [26.468546 23.809534 37.497158 26.335276 30.476772 31.540688 13.674051
 32.262436 21.26896  32.380955]
True values for first 10 samples: [26.  21.6 36.1 26.  27.  28.  13.  26.  19.  29. ]


In [90]:
import numpy as np

# Assuming x_test is the test dataset, and you want to predict for the first sample
single_sample = np.expand_dims(x_test[0], axis=0)  # Adds an extra dimension to match input shape
prediction = model.predict(single_sample)

print("Prediction for the first test sample:", prediction)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 122ms/step
Prediction for the first test sample: [[26.468555]]


In [94]:
y_test[0]

18.0