<a href="https://colab.research.google.com/github/Raptor-sj22/ML-Training/blob/main/Linear_Regression/Linear_Regression_Exercise_3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Project Summary

This code performs linear regression using TensorFlow/Keras to predict car fuel efficiency (`mpg`) based on input features: `cylinders`, `displacement`, and `weight`. The input features are normalized using a `Normalization` layer to improve training stability. The model is trained on **300 samples** and validated on **98 samples**, using **150 epochs**, a **batch size of 2**, and a **learning rate of 0.001**. It uses **Mean Squared Error (MSE)** as the loss function and **Stochastic Gradient Descent (SGD)** as the optimizer.


In [None]:
#@title Install required libraries

!pip install keras~=3.8.0 \
  matplotlib~=3.10.0 \
  numpy~=2.0.0 \
  pandas~=2.2.0 \
  tensorflow~=2.18.0

print('\n\nAll requirements successfully installed.')



All requirements successfully installed.


In [None]:
#@title Import Packages

import numpy as np
import plotly.express as px
import pandas as pd
import seaborn as sns
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, optimizers
import matplotlib.pyplot as plt
# Status line confirming that every import in this cell ran without raising
print(" All Packages successfuly imported.")

 All Packages successfuly imported.


In [None]:
#@title Defining Data
# Build the URL of the CSV from its Google Drive file id
file_id = '1tmYsEsEWWvAznAT4k6vFY-qalJx-1w_t'
url = 'https://drive.google.com/uc?id=' + file_id

# Read the remote CSV into a DataFrame
Original_Dataset = pd.read_csv(url)

# Show the table itself, then its dtypes / non-null counts, then summary
# statistics (describe() is the cell's last expression, so it is rendered)
display(Original_Dataset)
Original_Dataset.info()
Original_Dataset.describe()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
0,18.0,8,307.0,130,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165,3693,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150,3436,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150,3433,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140,3449,10.5,70,1,ford torino
...,...,...,...,...,...,...,...,...,...
393,27.0,4,140.0,86,2790,15.6,82,1,ford mustang gl
394,44.0,4,97.0,52,2130,24.6,82,2,vw pickup
395,32.0,4,135.0,84,2295,11.6,82,1,dodge rampage
396,28.0,4,120.0,79,2625,18.6,82,1,ford ranger


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           398 non-null    float64
 1   cylinders     398 non-null    int64  
 2   displacement  398 non-null    float64
 3   horsepower    398 non-null    object 
 4   weight        398 non-null    int64  
 5   acceleration  398 non-null    float64
 6   model year    398 non-null    int64  
 7   origin        398 non-null    int64  
 8   car name      398 non-null    object 
dtypes: float64(3), int64(4), object(2)
memory usage: 28.1+ KB


Unnamed: 0,mpg,cylinders,displacement,weight,acceleration,model year,origin
count,398.0,398.0,398.0,398.0,398.0,398.0,398.0
mean,23.514573,5.454774,193.425879,2970.424623,15.56809,76.01005,1.572864
std,7.815984,1.701004,104.269838,846.841774,2.757689,3.697627,0.802055
min,9.0,3.0,68.0,1613.0,8.0,70.0,1.0
25%,17.5,4.0,104.25,2223.75,13.825,73.0,1.0
50%,23.0,4.0,148.5,2803.5,15.5,76.0,1.0
75%,29.0,8.0,262.0,3608.0,17.175,79.0,2.0
max,46.6,8.0,455.0,5140.0,24.8,82.0,3.0


In [None]:
#@title Plotting the correlation matrix
# Keep only the float64 / int64 columns of the dataset
numeric_data = Original_Dataset.select_dtypes(include=['float64', 'int64'])

# Pairwise correlation between the numeric columns
corr_matrix = numeric_data.corr()

# Draw the matrix as a heatmap with each cell annotated to two decimals
fig = px.imshow(
    corr_matrix,
    text_auto=".2f",
    color_continuous_scale='bluered',
    title='Correlation Matrix Heatmap',
    aspect='auto',
)
fig.update_layout(width=1000, height=1000)
fig.show()

In [None]:
#@title Cleaning Data
# Training split: the first 300 rows, restricted to the target and the three
# predictor columns, with any row containing a NaN removed
clean_data = (
    Original_Dataset
    .iloc[:300][['mpg', 'cylinders', 'displacement', 'weight']]
    .dropna()
)

# Pull each column out as a NumPy array: y is the target, X1..X3 the features
y, X1, X2, X3 = (
    clean_data[column].to_numpy()
    for column in ('mpg', 'cylinders', 'displacement', 'weight')
)

# Display the results
display("Data successfully cleaned from NaN values", f"Data Length: {len(X1)}")

'Data successfully cleaned from NaN values'

'Data Length: 300'

In [None]:
#@title Create the 3D scatter plot using Plotly Express
# One point per training row: the three features on the axes, mpg as colour
fig = px.scatter_3d(
    clean_data,
    x='cylinders',
    y='displacement',
    z='weight',
    color='mpg',
    title="Interactive 3D Scatter Plot",
    width=700,
    height=700,
)

# Render the interactive figure
fig.show()

In [None]:
#@title Normalizing Data
# Stack the three 1-D feature arrays column-wise into a single feature matrix
# (one row per sample, one column per feature)
X_raw = np.column_stack((X1, X2, X3))

# Build the normalization layer once (the original created a second, unused
# instance first) and let it learn the feature statistics from the raw data
normalizer = layers.Normalization()
normalizer.adapt(X_raw)  # Important: adapt to raw (unscaled) data
display( f"Data successfully Normalized." )

'Data successfully Normalized.'

In [None]:
#@title Defining and Training the model
# Hyperparameters
epochs = 150
batch_size = 2
learning_rate = 0.001
seed = 42

# Seed the Python, NumPy and backend random generators so that weight
# initialisation and batch shuffling repeat on Restart & Run All
# (hardware-level nondeterminism may still cause small differences).
keras.utils.set_random_seed(seed)

# Feature matrix: one row per sample, columns are (X1, X2, X3)
X = np.column_stack((X1, X2, X3))

# Normalization layer, built once and adapted to the raw (unscaled) features
normalizer = layers.Normalization()
normalizer.adapt(X)

display( f"Data successfully Normalized." )


class RMSELogger(keras.callbacks.Callback):
    """Keras callback that records the training RMSE after every epoch.

    Attributes:
        epoch_nums: 1-based epoch numbers, in the order they finished.
        rmses: square root of the 'loss' value reported for each epoch
            (the model is compiled with MSE loss, so this is an RMSE).
    """

    def on_train_begin(self, logs=None):
        # Reset the history so that re-fitting starts from empty lists
        self.epoch_nums = []
        self.rmses = []

    def on_epoch_end(self, epoch, logs=None):
        # `logs` defaults to None and may lack 'loss'; skip instead of crashing
        loss = (logs or {}).get('loss')
        if loss is None:
            return
        self.epoch_nums.append(epoch + 1)  # +1 to make it human-readable (start from 1)
        self.rmses.append(np.sqrt(loss))


# Build the model with normalization as the first layer. Keeping a handle on
# the Dense layer avoids fetching it later by its position in model.layers.
linear_layer = layers.Dense(1)          # Simple linear regression
model = keras.Sequential([
    normalizer,                          # Automatically normalizes inputs
    linear_layer,
])

# Compile the model
optimizer = optimizers.SGD(learning_rate=learning_rate)
model.compile(optimizer=optimizer, loss='mse')

# Train model with RMSE logger
rmse_logger = RMSELogger()
model.fit(X, y, epochs=epochs, batch_size=batch_size, verbose=1, callbacks=[rmse_logger])

# Plot RMSE vs Epochs with explicit x-axis (epoch numbers)
# Create a DataFrame for plotting with plotly(required syntax of plotly!)
rmse_data = pd.DataFrame({'Epoch': rmse_logger.epoch_nums, 'RMSE': rmse_logger.rmses})

fig = px.line(rmse_data, x='Epoch', y='RMSE', title='RMSE vs. Epochs', labels={'Epoch': 'Epoch', 'RMSE': 'RMSE'})
fig.update_layout(width=500, height=500)
fig.show()

# Get the weights learned on the normalized inputs: [kernel, bias]
trained_weights = linear_layer.get_weights()

normalized_w = trained_weights[0].flatten()
normalized_b = trained_weights[1][0]

# Get mean and standard deviation from the normalizer, flattened to one value
# per feature so the arithmetic below does not depend on any broadcast axes
means = normalizer.mean.numpy().reshape(-1)
stds = np.sqrt(normalizer.variance.numpy()).reshape(-1)

# Transform weights back to the original scale:
#   y = sum(w_i * (x_i - mean_i) / std_i) + b
#     = sum((w_i / std_i) * x_i) + (b - sum(w_i * mean_i / std_i))
# Actual_weights is kept 2-D with shape (1, n_features) because it is indexed
# as Actual_weights[0, i] when it is printed and used.
Actual_weights = (normalized_w / stds).reshape(1, -1)
Actual_bias = normalized_b - np.sum((normalized_w * means) / stds)

# Print true weights and bias
print(f"\nActual Learned Model (original scale): y = {Actual_weights[0,0]} * X1 + {Actual_weights[0,1]} * X2 + {Actual_weights[0,2]} * X3 + {Actual_bias}")


'Data successfully Normalized.'

Epoch 1/150
[1m150/150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 404.8569
Epoch 2/150
[1m150/150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 222.9604
Epoch 3/150
[1m150/150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 122.6292
Epoch 4/150
[1m150/150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 73.5398
Epoch 5/150
[1m150/150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 43.9569
Epoch 6/150
[1m150/150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - loss: 24.4394
Epoch 7/150
[1m150/150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss: 18.2931
Epoch 8/150
[1m150/150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss: 12.6413
Epoch 9/150
[1m150/150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 6ms/step - loss: 10.6356
Epoch 10/150
[1m150/150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [


Actual Learned Model (original scale): y = -0.16004440188407898 * X1 + -0.009427981451153755 * X2 + -0.0048990556970238686 * X3 + 39.107688903808594


In [None]:
#@title Validating the model
# Hold-out split: rows 300..397 (the 98 rows after the training slice), with
# any row containing a NaN removed
validate_clean_data = Original_Dataset[['mpg', 'cylinders', 'displacement','weight']].iloc[300:300+98].dropna()

# The validation features get their own names. Reusing y / X1 / X2 / X3 would
# overwrite the training arrays, so re-running the training cell afterwards
# would silently fit the model on the validation rows.
X1_val = validate_clean_data['cylinders'].to_numpy()
X2_val = validate_clean_data['displacement'].to_numpy()
X3_val = validate_clean_data['weight'].to_numpy()
Actual_mpg = validate_clean_data['mpg'].to_numpy()

# Flatten first so the three coefficients unpack the same way whether
# Actual_weights has shape (3,) or (1, 3)
w1, w2, w3 = np.ravel(Actual_weights)

# Apply the learned linear model (original, un-normalized scale) by hand
Predicted_mpg = w1 * X1_val + w2 * X2_val + w3 * X3_val + Actual_bias

# Per-sample absolute error
Loss_L1 = np.abs(Actual_mpg - Predicted_mpg)
df = pd.DataFrame({
    'Actual_mpg': Actual_mpg,
    'Predicted_mpg': Predicted_mpg,
    'Loss_L1': Loss_L1
})
display(df)

# The total grows with the number of validation rows; the mean absolute error
# is reported as well because it does not depend on the sample count.
Total_loss = Loss_L1.sum()
Mean_loss = Loss_L1.mean()
print('total loss',Total_loss)
print('mean absolute error',Mean_loss)

Unnamed: 0,Actual_mpg,Predicted_mpg,Loss_L1
0,23.9,18.621288,5.278712
1,34.2,26.699651,7.500349
2,34.5,26.944603,7.555397
3,31.8,27.770040,4.029960
4,37.3,27.174576,10.125424
...,...,...,...
93,27.0,23.479228,3.520772
94,44.0,27.118008,16.881992
95,32.0,25.951401,6.048599
96,28.0,24.476132,3.523868


total loss 671.4917900493368
