## Packages

In [2]:
import numpy as np
import pandas as pd
import kagglehub
import plotly.express as px
import plotly.graph_objects as go
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

## Dataset import

In [3]:
path = kagglehub.dataset_download("debajyotipodder/co2-emission-by-vehicles")
df = pd.read_csv(path + "/CO2 Emissions_Canada.csv")



## Data Exploration, Transformation & Cleaning

In [4]:
print("\nFirst 5 rows of the dataset:")
print(df.head())


First 5 rows of the dataset:
    Make       Model Vehicle Class  Engine Size(L)  Cylinders Transmission  \
0  ACURA         ILX       COMPACT             2.0          4          AS5   
1  ACURA         ILX       COMPACT             2.4          4           M6   
2  ACURA  ILX HYBRID       COMPACT             1.5          4          AV7   
3  ACURA     MDX 4WD   SUV - SMALL             3.5          6          AS6   
4  ACURA     RDX AWD   SUV - SMALL             3.5          6          AS6   

  Fuel Type  Fuel Consumption City (L/100 km)  \
0         Z                               9.9   
1         Z                              11.2   
2         Z                               6.0   
3         Z                              12.7   
4         Z                              12.1   

   Fuel Consumption Hwy (L/100 km)  Fuel Consumption Comb (L/100 km)  \
0                              6.7                               8.5   
1                              7.7                            

In [5]:
print("\nLast 5 rows of the dataset:")
print(df.tail())



Last 5 rows of the dataset:
       Make        Model   Vehicle Class  Engine Size(L)  Cylinders  \
7380  VOLVO  XC40 T5 AWD     SUV - SMALL             2.0          4   
7381  VOLVO  XC60 T5 AWD     SUV - SMALL             2.0          4   
7382  VOLVO  XC60 T6 AWD     SUV - SMALL             2.0          4   
7383  VOLVO  XC90 T5 AWD  SUV - STANDARD             2.0          4   
7384  VOLVO  XC90 T6 AWD  SUV - STANDARD             2.0          4   

     Transmission Fuel Type  Fuel Consumption City (L/100 km)  \
7380          AS8         Z                              10.7   
7381          AS8         Z                              11.2   
7382          AS8         Z                              11.7   
7383          AS8         Z                              11.2   
7384          AS8         Z                              12.2   

      Fuel Consumption Hwy (L/100 km)  Fuel Consumption Comb (L/100 km)  \
7380                              7.7                               9.4   
738

#### Vehicle Classes:

In [6]:
print(df["Vehicle Class"].unique())

['COMPACT' 'SUV - SMALL' 'MID-SIZE' 'TWO-SEATER' 'MINICOMPACT'
 'SUBCOMPACT' 'FULL-SIZE' 'STATION WAGON - SMALL' 'SUV - STANDARD'
 'VAN - CARGO' 'VAN - PASSENGER' 'PICKUP TRUCK - STANDARD' 'MINIVAN'
 'SPECIAL PURPOSE VEHICLE' 'STATION WAGON - MID-SIZE'
 'PICKUP TRUCK - SMALL']


#### Transforming categorical features into numerical based on the expected gradation of impact on the CO2 emission

In [7]:


# Vehicle classes sorted from those usually associated with being smaller
# to those usually associated with being larger
vehicle_classes = [
    "TWO-SEATER", "MINICOMPACT", "SUBCOMPACT", "COMPACT", "MID-SIZE", "FULL-SIZE",
    "STATION WAGON - SMALL", "STATION WAGON - MID-SIZE", "SUV - SMALL", "SUV - STANDARD",
    "PICKUP TRUCK - SMALL", "PICKUP TRUCK - STANDARD", "MINIVAN",
    "VAN - PASSENGER", "VAN - CARGO", "SPECIAL PURPOSE VEHICLE"
]

# Fuel types sorted from those usually associated with being less polluting
# to those usually associated with greater polution.
fuel_types = ["N", "E", "Z", "X", "D"]


vehicle_class_numbers = {cls: i for i, cls in enumerate(vehicle_classes)}
vehicle_fuel_types = {cls: i for i, cls in enumerate(fuel_types)}

df["Vehicle Class"] = df["Vehicle Class"].map(vehicle_class_numbers)
df["Fuel Type"] = df["Fuel Type"].map(vehicle_fuel_types)

In [8]:
print(df["Vehicle Class"].head())
print(df["Vehicle Class"].tail())
print(df["Fuel Type"].head())
print(df["Fuel Type"].tail())

0    3
1    3
2    3
3    8
4    8
Name: Vehicle Class, dtype: int64
7380    8
7381    8
7382    8
7383    9
7384    9
Name: Vehicle Class, dtype: int64
0    2
1    2
2    2
3    2
4    2
Name: Fuel Type, dtype: int64
7380    2
7381    2
7382    2
7383    2
7384    2
Name: Fuel Type, dtype: int64


## Getting rid of untransformable categorical values and data irrelevan & biased for the model training such as brands, IDs.

In [9]:
df.drop('Make', axis=1, inplace=True)
df.drop('Model', axis=1, inplace=True)
df.drop('Transmission', axis=1, inplace=True)

## Check for rows with missing values

In [10]:
df.isna().any(axis=1).sum()

0

## Data normalization

In [11]:
features = df.columns.difference(["CO2 Emissions(g/km)"])
y_variable = "CO2 Emissions(g/km)"

scaler = MinMaxScaler()
df_normalized = pd.DataFrame(scaler.fit_transform(df[features]), columns=features)

## Manual Approach of Building a Linear Regression Model

In [18]:

X = df.drop('CO2 Emissions(g/km)', axis=1)
y = df['CO2 Emissions(g/km)']

# Removing a last sample for a manual test of the CO2 emission prediction
X_manual_test = X.iloc[-1:]
y_manual_test = y.iloc[-1]

X = X.iloc[:-1]
y = y.iloc[:-1]

In [51]:
def train_test_splt(x, y, test_set_percentage):
    x_test_size = (round((x.shape[0]) / 100) * test_set_percentage)
    y_test_size = (round((y.shape[0]) / 100) * test_set_percentage)

    x_train = np.array(x[:x_test_size])
    y_train = np.array(y[:y_test_size])
    x_test = np.array(x[x_test_size:])
    y_test = np.array(y[y_test_size:])

    return x_train, y_train, x_test, y_test

In [52]:
x_train, y_train, x_test, y_test = train_test_splt(X, y, 40)

In [139]:
W = np.zeros(X.shape[1])
b = 0
linear_equation = lambda X: np.dot(X,W)+b

In [140]:

def gradient_descent():
    global W, b

 
    learning_rate = 0.001

    iteration = 0
    while True:
        W_old, b_old = W.copy(), b
        predictions = linear_equation(x_train)
        errors = predictions - y_train
        errors = errors.reshape(-1, 1)
        w_gradient = np.mean(errors * x_train, axis=0)
        b_gradient = np.mean(errors)

        W = W - learning_rate * w_gradient
        b = b - learning_rate * b_gradient

        w_change = abs(W_old - W)
        b_change = abs(b_old - b)

        if iteration % 1000 == 0:
            print(f"Iteration {iteration}")
            print(f"Average W change: {np.mean(w_change):.10f}")
            print(f"b change: {b_change:.10f}")

        min_tolerated_difference = 0.001
        if (np.mean(w_change) < min_tolerated_difference and b_change < min_tolerated_difference): break

        iteration+=1

In [142]:
gradient_descent()

Iteration 0
Average W change: 0.0000028397
b change: 0.0002094029
Iteration 1000
Average W change: 0.0000028344
b change: 0.0002089689
Iteration 2000
Average W change: 0.0000028291
b change: 0.0002085357
Iteration 3000
Average W change: 0.0000028237
b change: 0.0002081034
Iteration 4000
Average W change: 0.0000028184
b change: 0.0002076721
Iteration 5000
Average W change: 0.0000028132
b change: 0.0002072416
Iteration 6000
Average W change: 0.0000028079
b change: 0.0002068120
Iteration 7000
Average W change: 0.0000028026
b change: 0.0002063833
Iteration 8000
Average W change: 0.0000027973
b change: 0.0002059555
Iteration 9000
Average W change: 0.0000027921
b change: 0.0002055286
Iteration 10000
Average W change: 0.0000027869
b change: 0.0002051026
Iteration 11000
Average W change: 0.0000027816
b change: 0.0002046774
Iteration 12000
Average W change: 0.0000027764
b change: 0.0002042531
Iteration 13000
Average W change: 0.0000027712
b change: 0.0002038297
Iteration 14000
Average W change:

In [143]:
print("OPTIMAL WEIGHTS")
print(f"W: {W}")
print(f"b: {b}")

OPTIMAL WEIGHTS
W: [-1.66894945  1.51466697  8.10900455 26.18248679  2.32210757  5.67930609
  3.20931136 -3.01507044]
b: 108.57130919312597


In [144]:
mse = lambda predicted, actual: np.mean((predicted - actual)**2)

In [145]:
train_error = mse(linear_equation(x_train), y_train)
test_error = mse(linear_equation(x_test), y_test)

In [146]:
print(f"Train error: {train_error}")
print(f"Test error: {test_error}")

Train error: 257.24401392723735
Test error: 245.92038369581027


In [148]:
manual_test_prediction = linear_equation(X_manual_test)[0]
manual_test_actual_value = y_manual_test

print("MANUAL TEST with a prediction for a single value.")
print(f"Predicted value: {manual_test_prediction}")
print(f"Actual value: {manual_test_actual_value}")

MANUAL TEST with a prediction for a single value.
Predicted value: 215.06856533529373
Actual value: 248


## Visualiziation of the data

In [None]:
fig = go.Figure()

for feature in features:
    fig.add_trace(go.Scatter(
        x=df_normalized[feature],
        y=df[y_variable],
        mode='markers',
        name=feature
    ))

fig.update_layout(
    title="Normalized Features vs CO2 Emissions",
    xaxis_title="Normalized Feature Values",
    yaxis_title="CO2 Emissions (g/km)",
    legend_title="Features",
)

fig.show()

## Applying the linear regression model with polynomial features

In [119]:

X = df.drop('CO2 Emissions(g/km)', axis=1)
y = df['CO2 Emissions(g/km)']

# Removing a last sample for a manual test of the CO2 emission prediction
X_manual_test = X.iloc[-1:]
y_manual_test = y.iloc[-1]

X = X.iloc[:-1]
y = y.iloc[:-1]


In [120]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [121]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [122]:
model = LinearRegression()
model.fit(X_train_scaled, y_train)

In [None]:
rmse = np.sqrt(mean_squared_error(y_test, y_predicted))
print(f"Root Mean Squared Error: {rmse:.2f}")


In [118]:
print("Model Score (R²):", model.score(X_test_scaled, y_test))

Model Score (R²): 0.9324495944393014


## Manual CO2 emission prediction test

In [127]:
X_manual_test_scaled = scaler.fit_transform(X_manual_test)
predicted_emission = model.predict(X_manual_test_scaled)[0]
actual_emission = y_manual_test
print(f"Predicted CO2 emission {predicted_emission}, while actual value is {actual_emission}")
print(f"Difference just {predicted_emission - actual_emission}")
print("Pretty good acuracy.")

Predicted CO2 emission 251.3812425935331, while actual value is 248
Difference just 3.381242593533102
Pretty good acuracy.


## Regression Line vs Actual Values Visualization

In [128]:
y_test_predicted = model.predict(X_test_scaled)
X_test_original = pd.DataFrame(scaler.inverse_transform(X_test_scaled), columns=X_train.columns)

for feature in X_train.columns:
    fig = px.scatter(
        x=X_test_original[feature],
        y=y_test,
        title=f"Regression: {feature} vs CO₂ Emissions",
        labels={"x": feature, "y": "CO₂ Emissions (g/km)"},
        trendline="ols"
    )

    fig.add_scatter(
        x=X_test_original[feature], y=y_test_predicted,
        mode="markers",
        name="Predicted",
        marker=dict(color="red")
    )

    fig.show()