<a href="https://colab.research.google.com/github/Raptor-sj22/ML-Training/blob/main/Linear_Regression/Linear_Regression_Exercise_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Project Summary

This notebook builds a linear regression model with TensorFlow/Keras to predict taxi trip tips (`TIPS`) from `TRIP_MILES` and `PICKUP_COMMUNITY_AREA`, using a cleaned subset of the Chicago Taxi dataset. The model is trained on the first **10,000 rows** and validated on the next **110 rows** (rows with missing values are dropped from both slices, so the actual sample counts may be slightly lower), using **50 epochs**, a **batch size of 350**, and a **learning rate of 0.0001**. The model is optimized with **Stochastic Gradient Descent (SGD)** on a mean-squared-error loss, and training progress is reported as **Root Mean Squared Error (RMSE)**.


In [None]:
#@title Install required libraries

!pip install keras~=3.8.0 \
  matplotlib~=3.10.0 \
  numpy~=2.0.0 \
  pandas~=2.2.0 \
  tensorflow~=2.18.0

print('\n\nAll requirements successfully installed.')

In [None]:
#@title Import Packages
import numpy as np
import plotly.express as px
import pandas as pd
import seaborn as sns
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, optimizers
import matplotlib.pyplot as plt
# Confirmation message; a plain string suffices because nothing is interpolated.
print(" All Packages successfuly imported")

In [None]:
#@title Defining Data

# Read the Chicago Taxi training CSV directly from its public download URL.
chicago_taxi_dataset = pd.read_csv(
    "https://download.mlcc.google.com/mledu-datasets/chicago_taxi_train.csv"
)

# Inspect the data three ways: the frame itself, its column dtypes / non-null
# counts, and (as the cell's final expression) summary statistics.
display(chicago_taxi_dataset)
chicago_taxi_dataset.info()
chicago_taxi_dataset.describe()

In [None]:
#@title Plotting the correlation matrix

# Keep only the float64/int64 columns; everything else is left out of the matrix
numeric_data = chicago_taxi_dataset.select_dtypes(include=['float64', 'int64'])

# Pairwise correlation between every remaining column
corr_matrix = numeric_data.corr()

# Draw the matrix as an annotated heatmap (two-decimal cell labels)
fig = px.imshow(
    corr_matrix,
    text_auto=".2f",
    color_continuous_scale='bluered',
    title='Correlation Matrix Heatmap',
    aspect='auto',
)
fig.update_layout(width=1000, height=1000)
fig.show()

In [None]:
#@title Cleaning Data

# Take the first 10,000 rows, keep only the three model columns, and drop every
# row that has a NaN in TRIP_MILES, PICKUP_COMMUNITY_AREA or TIPS.
clean_data = (
    chicago_taxi_dataset
    .iloc[:10000][['TRIP_MILES', 'PICKUP_COMMUNITY_AREA', 'TIPS']]
    .dropna()
)

# Split into the two feature arrays (X1, X2) and the target array (y)
X1 = clean_data['TRIP_MILES'].to_numpy()
X2 = clean_data['PICKUP_COMMUNITY_AREA'].to_numpy()
y = clean_data['TIPS'].to_numpy()

# Report how many rows survived the NaN filter
display(
    "Data successfully cleaned from NaN values",
    f"Data Length: {len(X1)}",
)

In [None]:
#@title Create the 3D scatter plot using Plotly Express

# Features on the x/y axes, tip amount on z; points are coloured by the tip
# values held in `y`.
fig = px.scatter_3d(
    clean_data,
    x='TRIP_MILES',
    y='PICKUP_COMMUNITY_AREA',
    z='TIPS',
    title="Interactive 3D Scatter Plot: TRIP_MILES, PICKUP_COMMUNITY_AREA, and Tips",
    color=y,
    color_continuous_scale='bluered',
    width=700,
    height=700,
)
fig.show()

In [None]:
#@title Defining and Training the model


# Stack the two 1-D feature arrays side by side -> shape (n_samples, 2)
X = np.stack([X1, X2], axis=1)

# Hyperparameters
epochs, batch_size, learning_rate = 50, 350, 0.0001

# Custom callback to log RMSE and epoch numbers
class RMSELogger(keras.callbacks.Callback):
    """Keras callback that records sqrt(loss) at the end of every epoch.

    The recorded value is the square root of the ``loss`` entry Keras reports
    for the epoch, so it is an RMSE only when the model is compiled with a
    mean-squared-error loss.

    Attributes:
        epoch_nums: 1-based epoch numbers, one per recorded epoch.
        rmses: sqrt(loss) values aligned with ``epoch_nums``.
    """

    def __init__(self):
        super().__init__()
        # Create the history up front so the attributes exist even before fit().
        self.epoch_nums = []
        self.rmses = []

    def on_train_begin(self, logs=None):
        """Clear the history so each training run starts from an empty log."""
        self.epoch_nums = []
        self.rmses = []

    def on_epoch_end(self, epoch, logs=None):
        """Record the 1-based epoch number and sqrt(loss) for the finished epoch.

        Nothing is recorded when ``logs`` is missing or carries no ``loss``
        entry, instead of failing on ``None``.
        """
        loss = (logs or {}).get('loss')
        if loss is None:
            return
        self.epoch_nums.append(epoch + 1)  # +1 to make it human-readable (start from 1)
        self.rmses.append(np.sqrt(loss))

# Seed Python, NumPy and the Keras backend so that weight initialisation and
# batch shuffling (and therefore the learned formula) are repeatable.
keras.utils.set_random_seed(42)

# Build model: one Dense unit over two inputs, i.e. y = w1*X1 + w2*X2 + b
model = keras.Sequential([
    keras.Input(shape=(2,)),   # Replaces input_shape warning
    layers.Dense(1)
])

# Compile the model
optimizer = optimizers.SGD(learning_rate=learning_rate)
model.compile(optimizer=optimizer, loss='mse')

# Train model with RMSE logger
rmse_logger = RMSELogger()
model.fit(X, y, epochs=epochs, batch_size=batch_size, verbose=1, callbacks=[rmse_logger])

# Plot RMSE vs Epochs with explicit x-axis (epoch numbers)
# Plotly Express expects a DataFrame, so wrap the logged values in one
rmse_data = pd.DataFrame({'Epoch': rmse_logger.epoch_nums, 'RMSE': rmse_logger.rmses})

fig = px.line(rmse_data, x='Epoch', y='RMSE', title='RMSE vs. Epochs', labels={'Epoch': 'Epoch', 'RMSE': 'RMSE'})
fig.update_layout(width=500, height=500)
fig.show()

# Print the trained model formula.
# get_weights() returns [kernel, bias]; for Dense(1) on two inputs the kernel
# has shape (2, 1) and the bias shape (1,). Flatten them to plain floats so the
# formula prints as numbers rather than one-element arrays.
weights = model.get_weights()
w1, w2 = (float(w) for w in weights[0].ravel())
b = float(weights[1][0])

print(f"\nLearned Model Formula: y = {w1:.6g} * X1 + {w2:.6g} * X2 + {b:.6g}")

In [None]:
#@title Validating the model

# Hold-out slice: the 110 rows right after the 10,000 training rows, with any
# row containing a NaN dropped.
validate_clean_data = chicago_taxi_dataset[['TRIP_MILES', 'PICKUP_COMMUNITY_AREA', 'TIPS']].iloc[10000:10000+110].dropna()

# Use dedicated names for the validation features. Rebinding X1/X2 here would
# break any re-run of the training cell, which rebuilds its input from X1/X2.
X1_val = validate_clean_data['TRIP_MILES'].values
X2_val = validate_clean_data['PICKUP_COMMUNITY_AREA'].values

Actual_Tips = validate_clean_data['TIPS'].values
# Apply the learned formula y = w1*X1 + w2*X2 + b to the validation features
Predicted_Tips = (w1 * X1_val) + (w2 * X2_val) + b
# Per-row absolute (L1) error between actual and predicted tips
Loss_L1 = np.abs(Actual_Tips - Predicted_Tips)

df = pd.DataFrame({
    'Actual_Tips': Actual_Tips,
    'Predicted_Tips': Predicted_Tips,
    'Loss_L1': Loss_L1
})
df