## Packages

In [36]:
import numpy as np
import pandas as pd
import kagglehub
import plotly as plt
from sklearn.preprocessing import OneHotEncoder

## Dataset import

In [37]:
# Fetch the Kaggle dataset; the returned value is used below as the folder prefix
# of the CSV file.
path = kagglehub.dataset_download("debajyotipodder/co2-emission-by-vehicles")

# Build the CSV location first, then read it into the working DataFrame.
csv_file = path + "/CO2 Emissions_Canada.csv"
df = pd.read_csv(csv_file)



## Data Exploration, Transformation & Cleaning

In [38]:
# Show the heading and the first rows with a single print call; sep="\n" puts the
# table on its own line, exactly like two consecutive prints would.
print("\nFirst 5 rows of the dataset:", df.head(), sep="\n")


First 5 rows of the dataset:
    Make       Model Vehicle Class  Engine Size(L)  Cylinders Transmission  \
0  ACURA         ILX       COMPACT             2.0          4          AS5   
1  ACURA         ILX       COMPACT             2.4          4           M6   
2  ACURA  ILX HYBRID       COMPACT             1.5          4          AV7   
3  ACURA     MDX 4WD   SUV - SMALL             3.5          6          AS6   
4  ACURA     RDX AWD   SUV - SMALL             3.5          6          AS6   

  Fuel Type  Fuel Consumption City (L/100 km)  \
0         Z                               9.9   
1         Z                              11.2   
2         Z                               6.0   
3         Z                              12.7   
4         Z                              12.1   

   Fuel Consumption Hwy (L/100 km)  Fuel Consumption Comb (L/100 km)  \
0                              6.7                               8.5   
1                              7.7                            

In [39]:
# Show the heading and the last rows with a single print call; sep="\n" puts the
# table on its own line, exactly like two consecutive prints would.
print("\nLast 5 rows of the dataset:", df.tail(), sep="\n")



Last 5 rows of the dataset:
       Make        Model   Vehicle Class  Engine Size(L)  Cylinders  \
7380  VOLVO  XC40 T5 AWD     SUV - SMALL             2.0          4   
7381  VOLVO  XC60 T5 AWD     SUV - SMALL             2.0          4   
7382  VOLVO  XC60 T6 AWD     SUV - SMALL             2.0          4   
7383  VOLVO  XC90 T5 AWD  SUV - STANDARD             2.0          4   
7384  VOLVO  XC90 T6 AWD  SUV - STANDARD             2.0          4   

     Transmission Fuel Type  Fuel Consumption City (L/100 km)  \
7380          AS8         Z                              10.7   
7381          AS8         Z                              11.2   
7382          AS8         Z                              11.7   
7383          AS8         Z                              11.2   
7384          AS8         Z                              12.2   

      Fuel Consumption Hwy (L/100 km)  Fuel Consumption Comb (L/100 km)  \
7380                              7.7                               9.4   
738

#### Vehicle Classes:

In [40]:
# Collect the distinct values of the "Vehicle Class" column, then display them.
unique_vehicle_classes = df["Vehicle Class"].unique()
print(unique_vehicle_classes)

['COMPACT' 'SUV - SMALL' 'MID-SIZE' 'TWO-SEATER' 'MINICOMPACT'
 'SUBCOMPACT' 'FULL-SIZE' 'STATION WAGON - SMALL' 'SUV - STANDARD'
 'VAN - CARGO' 'VAN - PASSENGER' 'PICKUP TRUCK - STANDARD' 'MINIVAN'
 'SPECIAL PURPOSE VEHICLE' 'STATION WAGON - MID-SIZE'
 'PICKUP TRUCK - SMALL']


#### Encoding vehicle classes as ordinal numbers, ordered from the typically smallest class to the typically largest

In [41]:


# Vehicle classes ordered from typically smaller to typically larger vehicles.
# A class's position in this list becomes its ordinal code.
vehicle_classes = [
    "TWO-SEATER", "MINICOMPACT", "SUBCOMPACT", "COMPACT", "MID-SIZE", "FULL-SIZE",
    "STATION WAGON - SMALL", "STATION WAGON - MID-SIZE", "SUV - SMALL", "SUV - STANDARD",
    "PICKUP TRUCK - SMALL", "PICKUP TRUCK - STANDARD", "MINIVAN",
    "VAN - PASSENGER", "VAN - CARGO", "SPECIAL PURPOSE VEHICLE"
]

# Label -> ordinal code (0 for the first entry of vehicle_classes).
vehicle_class_numbers = {cls: i for i, cls in enumerate(vehicle_classes)}

# Encode only while the column still holds text labels. Series.map() turns every
# value that is not a key of the dict into NaN, so re-running this cell on an
# already encoded column would otherwise replace all codes with NaN.
if not pd.api.types.is_numeric_dtype(df["Vehicle Class"]):
    # Fail loudly on labels missing from the ordering instead of letting map()
    # silently convert them to NaN. Missing values are left untouched.
    unknown_classes = set(df["Vehicle Class"].dropna().unique()) - set(vehicle_class_numbers)
    if unknown_classes:
        raise ValueError(f"Unmapped vehicle classes: {sorted(unknown_classes)}")

    df["Vehicle Class"] = df["Vehicle Class"].map(vehicle_class_numbers)


In [42]:
print(df["Vehicle Class"].head())
print(df["Vehicle Class"].tail())

0    3
1    3
2    3
3    8
4    8
Name: Vehicle Class, dtype: int64
7380    8
7381    8
7382    8
7383    9
7384    9
Name: Vehicle Class, dtype: int64


## Removing categorical columns that cannot be transformed and data that is irrelevant or would bias model training, such as brands and IDs