<a href="https://colab.research.google.com/github/PhoenixAlpha23/CO2-emission-prediction-/blob/main/CO2%20emission%20predictor%20.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd

In [2]:
df= pd.read_csv("/content/CO2 Emissions_Canada.csv")
df.head()

Unnamed: 0,Make,Model,Vehicle Class,Engine Size(L),Cylinders,Transmission,Fuel Type,Fuel Consumption City (L/100 km),Fuel Consumption Hwy (L/100 km),Fuel Consumption Comb (L/100 km),Fuel Consumption Comb (mpg),CO2 Emissions(g/km)
0,ACURA,ILX,COMPACT,2.0,4,AS5,Z,9.9,6.7,8.5,33,196
1,ACURA,ILX,COMPACT,2.4,4,M6,Z,11.2,7.7,9.6,29,221
2,ACURA,ILX HYBRID,COMPACT,1.5,4,AV7,Z,6.0,5.8,5.9,48,136
3,ACURA,MDX 4WD,SUV - SMALL,3.5,6,AS6,Z,12.7,9.1,11.1,25,255
4,ACURA,RDX AWD,SUV - SMALL,3.5,6,AS6,Z,12.1,8.7,10.6,27,244


In [4]:
print(df.columns)

Index(['Model year', 'Make', 'Model', 'Vehicle class', 'Engine size (L)',
       'Cylinders', 'Transmission', 'Fuel type', 'City (L/100 km)',
       'Highway (L/100 km)', 'Combined (L/100 km)', 'Combined (mpg)',
       'CO2 emissions (g/km)', 'CO2 rating', 'Smog rating'],
      dtype='object')


In [3]:
extra_columns= ['Make', 'Model', 'Cylinders','Transmission', 'Fuel Consumption City (L/100 km)',
       'Fuel Consumption Hwy (L/100 km)', 'Fuel Consumption Comb (mpg)']
data= df.drop(extra_columns,axis=1)

In [4]:
print(data.shape)
data.head()

(7385, 5)


Unnamed: 0,Vehicle Class,Engine Size(L),Fuel Type,Fuel Consumption Comb (L/100 km),CO2 Emissions(g/km)
0,COMPACT,2.0,Z,8.5,196
1,COMPACT,2.4,Z,9.6,221
2,COMPACT,1.5,Z,5.9,136
3,SUV - SMALL,3.5,Z,11.1,255
4,SUV - SMALL,3.5,Z,10.6,244


In [5]:
data.columns

Index(['Vehicle Class', 'Engine Size(L)', 'Fuel Type',
       'Fuel Consumption Comb (L/100 km)', 'CO2 Emissions(g/km)'],
      dtype='object')

In [12]:
# Calculate the ratio of each fuel type
Vehicle_type = data['Vehicle Class'].value_counts()
print(Vehicle_type)

Vehicle class
Sport utility vehicle: Small       178
Sport utility vehicle: Standard    128
Mid-size                            82
Pickup truck: Standard              73
Subcompact                          58
Two-seater                          49
Minicompact                         47
Compact                             47
Full-size                           34
Pickup truck: Small                 19
Station wagon: Small                11
Minivan                              7
Station wagon: Mid-size              6
Name: count, dtype: int64


In [6]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [7]:
X= data.drop('CO2 Emissions(g/km)',axis=1)
y= data['CO2 Emissions(g/km)']

In [8]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2)

# Define Pipeline

In [9]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Define columns
categorical_features = ['Vehicle Class', 'Fuel Type']
numerical_features = ['Engine Size(L)', 'Fuel Consumption Comb (L/100 km)']

# Create pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(sparse_output=False, handle_unknown='ignore'), categorical_features) # Update to use OneHotEncoder
    ])
# Create and run pipeline
pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor())
])

# Fit pipeline
pipe.fit(X_train, y_train)

In [11]:
y_pred= pipe.predict(X_test)
print(y_pred)

[191.2        240.81498808 326.53590079 ... 131.88269902 213.69519517
 310.        ]


# Find best hyper parameters

In [23]:
from sklearn.model_selection import GridSearchCV

# Define the parameter grid,
param_grid = {
    'regressor__n_estimators': [100, 200, 300],
    'regressor__max_depth': [None, 5, 10],
    'regressor__min_samples_split': [2, 5, 10],
    'regressor__min_samples_leaf': [1, 2, 4]
}

# Create a GridSearchCV object
grid_search = GridSearchCV(pipe, param_grid=param_grid, cv=5)

# Fit the GridSearchCV object to your data
grid_search.fit(X, y)

# Get the best parameters and the best model
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

print("Best Parameters:", best_params)
print("Best Model:", best_model)

Best Parameters: {'regressor__max_depth': 10, 'regressor__min_samples_leaf': 1, 'regressor__min_samples_split': 2, 'regressor__n_estimators': 100}
Best Model: Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num', StandardScaler(),
                                                  ['Engine Size(L)',
                                                   'Fuel Consumption Comb '
                                                   '(L/100 km)']),
                                                 ('cat',
                                                  OneHotEncoder(handle_unknown='ignore',
                                                                sparse_output=False),
                                                  ['Vehicle Class',
                                                   'Fuel Type'])])),
                ('regressor', RandomForestRegressor(max_depth=10))])


In [24]:
best_y_pred = grid_search.best_estimator_.predict(X_test)

# Error Calculation

In [12]:
print("CO2 Emissions Statistics:")
print(f"Min: {y.min():.2f}")
print(f"Max: {y.max():.2f}")
print(f"Mean: {y.mean():.2f}")
print(f"Standard Deviation: {y.std():.2f}")

CO2 Emissions Statistics:
Min: 96.00
Max: 522.00
Mean: 250.58
Standard Deviation: 58.51


In [26]:
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score
mae= mean_absolute_error(y_test,best_y_pred)
mse = mean_squared_error(y_test, best_y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print("evaluation metrics:")
print(f'MAE:{mae}')
print(f'MSE: {mse}')
print(f'RMSE: {rmse}')
print(f'R2 Score: {r2}')

evaluation metrics:
MAE:1.7858456759002135
MSE: 5.804571555016223
RMSE: 2.409267846258739
R2 Score: 0.9962888737697775


# Save model


In [27]:
# Used joblib because its more compatible with pipelines

import joblib
joblib.dump(pipe, 'co2_emissions_model.pkl')

['co2_emissions_model.pkl']

# Threshold for cargo vehicles

In [28]:
def predict_emissions_with_cargo(model, vehicle_class, engine_size, fuel_type, fuel_consumption, weight):
    """Predicts CO2 emissions for cargo vehicles with adjustable weight impact.
    """

    # Validate inputs
    if vehicle_class not in ['Cargo Van', 'Pickup Truck', 'SUV - STANDARD']:
        raise ValueError("Invalid vehicle class. Only 'Cargo Van', 'Pickup Truck', and 'SUV - STANDARD' are supported for this function.")
    if not isinstance(weight, (int, float)) or weight <= 0 :
        raise ValueError("Invalid weight. Weight should be a positive number.")


    # Create a DataFrame for prediction
    input_data = pd.DataFrame({
        'Vehicle Class': [vehicle_class],
        'Engine Size(L)': [engine_size],
        'Fuel Type': [fuel_type],
        'Fuel Consumption Comb (L/100 km)': [fuel_consumption]
    })

    # Predict emissions
    emissions = model.predict(input_data)[0]

    # Adjust emissions based on weight
    emissions += weight * 0.073 # 73 g/km per kg[based on my research]

    return emissions

# Example usage:
loaded_model = joblib.load('co2_emissions_model.pkl')

# Example prediction with cargo weight

In [30]:
vehicle_class = 'Pickup Truck'
engine_size = 4
fuel_type = 'X' #fuel type
fuel_consumption = 12
weight = 2000

predicted_emissions = predict_emissions_with_cargo(loaded_model, vehicle_class, engine_size, fuel_type, fuel_consumption, weight)

print(f"Predicted CO2 emissions for {vehicle_class} with {weight}kg load: {predicted_emissions:.2f} g/km")


Predicted CO2 emissions for Pickup Truck with 2000kg load: 290.64 g/km
