In [None]:
# Import python libraries

In [4]:
import numpy as np
import pandas as pd
import joblib

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

In [None]:
# Loading Data

In [6]:
data = pd.read_csv("vehicle_emissions.csv")

In [8]:
data.head()

Unnamed: 0,Model_Year,Make,Model,Vehicle_Class,Engine_Size,Cylinders,Transmission,Fuel_Consumption_in_City(L/100 km),Fuel_Consumption_in_City_Hwy(L/100 km),Fuel_Consumption_comb(L/100km),CO2_Emissions,Smog_Level
0,2021,Acura,ILX,Compact,2.4,4,AM8,9.9,7.0,8.6,199,3
1,2021,Acura,NSX,Two-seater,3.5,6,AM9,11.1,10.8,11.0,256,3
2,2021,Acura,RDX SH-AWD,SUV: Small,2.0,4,AS10,11.0,8.6,9.9,232,6
3,2021,Acura,RDX SH-AWD A-SPEC,SUV: Small,2.0,4,AS10,11.3,9.1,10.3,242,6
4,2021,Acura,TLX SH-AWD,Compact,2.0,4,AS10,11.2,8.0,9.8,230,7


In [9]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 935 entries, 0 to 934
Data columns (total 12 columns):
 #   Column                                  Non-Null Count  Dtype  
---  ------                                  --------------  -----  
 0   Model_Year                              935 non-null    int64  
 1   Make                                    935 non-null    object 
 2   Model                                   935 non-null    object 
 3   Vehicle_Class                           935 non-null    object 
 4   Engine_Size                             935 non-null    float64
 5   Cylinders                               935 non-null    int64  
 6   Transmission                            935 non-null    object 
 7   Fuel_Consumption_in_City(L/100 km)      935 non-null    float64
 8   Fuel_Consumption_in_City_Hwy(L/100 km)  935 non-null    float64
 9   Fuel_Consumption_comb(L/100km)          935 non-null    float64
 10  CO2_Emissions                           935 non-null    int64 

In [None]:
# Create features and target variables

In [16]:
X = data.drop(['CO2_Emissions'], axis=1)
y = data['CO2_Emissions']

In [17]:
X

Unnamed: 0,Model_Year,Make,Model,Vehicle_Class,Engine_Size,Cylinders,Transmission,Fuel_Consumption_in_City(L/100 km),Fuel_Consumption_in_City_Hwy(L/100 km),Fuel_Consumption_comb(L/100km),Smog_Level
0,2021,Acura,ILX,Compact,2.4,4,AM8,9.9,7.0,8.6,3
1,2021,Acura,NSX,Two-seater,3.5,6,AM9,11.1,10.8,11.0,3
2,2021,Acura,RDX SH-AWD,SUV: Small,2.0,4,AS10,11.0,8.6,9.9,6
3,2021,Acura,RDX SH-AWD A-SPEC,SUV: Small,2.0,4,AS10,11.3,9.1,10.3,6
4,2021,Acura,TLX SH-AWD,Compact,2.0,4,AS10,11.2,8.0,9.8,7
...,...,...,...,...,...,...,...,...,...,...,...
930,2021,Volvo,XC40 T5 AWD,SUV: Small,2.0,4,AS8,10.7,7.7,9.4,5
931,2021,Volvo,XC60 T5 AWD,SUV: Small,2.0,4,AS8,11.1,8.3,9.9,5
932,2021,Volvo,XC60 T6 AWD,SUV: Small,2.0,4,AS8,11.7,8.6,10.3,7
933,2021,Volvo,XC90 T5 AWD,SUV: Standard,2.0,4,AS8,11.5,8.4,10.1,5


In [18]:
y

0      199
1      256
2      232
3      242
4      230
      ... 
930    219
931    230
932    240
933    236
934    245
Name: CO2_Emissions, Length: 935, dtype: int64

In [None]:
# Split Categorical and Numerical Features

In [19]:
numerical_cols = ["Model_Year", "Engine_Size", "Cylinders", "Fuel_Consumption_in_City(L/100 km)", "Fuel_Consumption_in_City_Hwy(L/100 km)", "Fuel_Consumption_comb(L/100km)", "Smog_Level"]
categorical_cols = ["Make", "Model", "Vehicle_Class", "Transmission"]

In [None]:
# Start the pipeline w/ Encoding

In [25]:
numerical_pipeline = Pipeline([
('imputer', SimpleImputer(strategy="mean")),
('Scaler', StandardScaler())
])

categorical_pipeline = Pipeline([
('imputer', SimpleImputer(strategy="most_frequent")),
('encoder', OneHotEncoder(handle_unknown='ignore'))
 ])

In [None]:
# Join the pipelines together

In [26]:
preprocessor = ColumnTransformer([
('num', numerical_pipeline, numerical_cols),
    ('cat', categorical_pipeline, categorical_cols)
])

pipeline = Pipeline([
('preprocessor', preprocessor),
    ('model', RandomForestRegressor())
])

In [None]:
# Split into training and testing datasets

In [29]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
pipeline.fit(X_train, y_train)

0,1,2
,steps,"[('preprocessor', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'mean'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,n_estimators,100
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [61]:
prediction = pipeline.predict(X_test)

In [62]:
prediction

array([308.77, 224.63, 306.99, 511.64, 205.99, 322.97, 114.34, 214.91,
       108.38, 221.01, 205.88, 288.93, 324.74, 206.07, 220.07, 252.62,
       215.06, 241.26, 277.96, 249.29, 324.14, 229.72, 236.7 , 219.08,
       233.55, 255.48, 219.87, 194.35, 321.68, 366.76, 277.53, 321.39,
       327.47, 269.17, 341.65, 215.97, 300.48, 237.05, 205.99, 322.18,
       288.57, 263.97, 158.42, 122.52, 277.87, 288.25, 209.83, 281.6 ,
       178.18, 348.85, 244.35, 346.02, 212.67, 276.67, 205.26, 137.29,
       114.32, 213.8 , 264.76, 237.11, 212.71, 157.31, 215.2 , 210.16,
       232.68, 383.44, 192.75, 357.07, 258.34, 178.25, 241.28, 169.36,
       263.65, 287.78, 277.31, 194.19, 251.29, 318.44, 275.58, 138.32,
       201.42, 184.14, 262.65, 341.27, 294.2 , 273.59, 274.  , 288.12,
       158.45, 197.11, 214.72, 324.63, 164.79, 386.4 , 172.94, 219.16,
       289.  , 197.09, 348.49, 287.74, 288.21, 287.49, 371.25, 323.56,
       166.23, 217.11, 370.43, 189.37, 400.01, 243.78, 189.46, 227.65,
      

In [67]:
pred = pd.DataFrame(prediction, columns=['Predicted_CO2_Emissions'])

In [68]:
pred

Unnamed: 0,Predicted_CO2_Emissions
0,308.77
1,224.63
2,306.99
3,511.64
4,205.99
...,...
182,181.18
183,192.47
184,232.04
185,209.95


In [None]:
# View and Encoding

In [70]:
encoded_cols = pipeline.named_steps['preprocessor'].named_transformers_['cat']['encoder'].get_feature_names_out(categorical_cols)
print(encoded_cols)

['Make_Acura' 'Make_Alfa Romeo' 'Make_Aston Martin' 'Make_Audi' 'Make_BMW'
 'Make_Bentley' 'Make_Bugatti' 'Make_Buick' 'Make_Cadillac'
 'Make_Chevrolet' 'Make_Chrysler' 'Make_Dodge' 'Make_FIAT' 'Make_Ford'
 'Make_GMC' 'Make_Genesis' 'Make_Honda' 'Make_Hyundai' 'Make_Infiniti'
 'Make_Jaguar' 'Make_Jeep' 'Make_Kia' 'Make_Lamborghini' 'Make_Lexus'
 'Make_Lincoln' 'Make_MINI' 'Make_Maserati' 'Make_Mazda'
 'Make_Mercedes-Benz' 'Make_Mitsubishi' 'Make_Nissan' 'Make_Porsche'
 'Make_Ram' 'Make_Rolls-Royce' 'Make_Subaru' 'Make_Toyota'
 'Make_Volkswagen' 'Make_Volvo' 'Model_1500' 'Model_1500 4X4 EcoDiesel'
 'Model_1500 4X4 TRX' 'Model_1500 4X4 eTorque' 'Model_1500 Classic'
 'Model_1500 Classic 4X4' 'Model_1500 EcoDiesel'
 'Model_1500 HFE EcoDiesel' 'Model_1500 HFE eTorque' 'Model_1500 eTorque'
 'Model_228i xDrive Gran Coupe' 'Model_230i xDrive Coupe' 'Model_300'
 'Model_300 AWD' 'Model_430i xDrive Coupe' 'Model_4Runner 4WD'
 'Model_4Runner 4WD (Part-Time 4WD)' 'Model_500X AWD'
 'Model_530i xDriv

In [None]:
# Evaluate model Accuracy

In [71]:
mse = mean_squared_error(y_test, prediction)
rmse = np.sqrt(mse)

r2 = r2_score(y_test, prediction)
mae = mean_absolute_error(y_test, prediction)

print(f'Model Performance:')
print(f'R2 Score: {r2}')
print(f'Root Mean Square Error: {rmse}')
print(f'Mean Absolute Error: {mae}')

Model Performance:
R2 Score: 0.9729456684244937
Root Mean Square Error: 10.439390669004917
Mean Absolute Error: 3.1548128342245985


In [None]:
# To Save Pipeline and use later stage whenever we needed

In [72]:
joblib.dump(pipeline, 'vehicle_emission_pipeline.joblib')

['vehicle_emission_pipeline.joblib']