In [57]:
import pandas as pd
import numpy as np
import joblib
import os
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

In [58]:
class VehicleEmissionPipeline:
    """Helpers for loading vehicle data, preprocessing it and scoring a model.

    The class holds the path of the CSV file, the names of the numerical and
    categorical feature columns, and the name of the regression target. It
    does not train a model itself; it supplies the pieces (data frame,
    column transformer, metric report) that a caller wires together.
    """

    def __init__(self, data_path):
        """Store the data location and the column configuration.

        Args:
            data_path: Path of the CSV file read by ``extract_data``.
        """
        self.data_path = data_path
        # Placeholder for a pipeline object; nothing inside this class
        # assigns it, so callers may attach their fitted pipeline here.
        self.pipeline = None
        # Columns routed through mean imputation + standard scaling.
        self.numerical_cols = [
            "Model_Year", "Engine_Size", "Cylinders",
            "Fuel_Consumption_in_City(L/100 km)",
            "Fuel_Consumption_in_City_Hwy(L/100 km)",
            "Fuel_Consumption_comb(L/100km)", "Smog_Level"
        ]
        # Columns routed through most-frequent imputation + one-hot encoding.
        self.categorical_cols = ["Make", "Model", "Vehicle_Class", "Transmission"]
        # Name of the column the caller is expected to predict.
        self.target = 'CO2_Emissions'

    def extract_data(self):
        """Read the CSV at ``self.data_path`` into a DataFrame.

        Prints the number of rows that were loaded.

        Returns:
            pandas.DataFrame: The contents of the CSV file.

        Raises:
            FileNotFoundError: If ``self.data_path`` does not exist.
        """
        # Guard clause: fail early with a clear message when the path is missing.
        if not os.path.exists(self.data_path):
            raise FileNotFoundError(f"File {self.data_path} not found.")

        df = pd.read_csv(self.data_path)
        print(f"--- Step: Extract | Rows: {df.shape[0]} ---")
        return df

    def build_transformer(self):
        """Build the (unfitted) preprocessing step for the feature columns.

        Numerical columns are mean-imputed and then standardised; categorical
        columns are imputed with the most frequent value and then one-hot
        encoded with ``handle_unknown='ignore'``.

        Returns:
            ColumnTransformer: Transformer covering ``self.numerical_cols``
            and ``self.categorical_cols``. No ``remainder`` is passed, so
            columns outside those two lists are handled by the
            ColumnTransformer default.
        """
        num_pipeline = Pipeline([
            ('imputer', SimpleImputer(strategy="mean")),
            ('scaler', StandardScaler())
        ])
        cat_pipeline = Pipeline([
            ('imputer', SimpleImputer(strategy="most_frequent")),
            ('encoder', OneHotEncoder(handle_unknown='ignore'))
        ])

        return ColumnTransformer([
            ('num', num_pipeline, self.numerical_cols),
            ('cat', cat_pipeline, self.categorical_cols)
        ])

    def evaluate_model(self, y_true, y_pred):
        """Print regression metrics and return them.

        Args:
            y_true: Ground-truth target values.
            y_pred: Predicted target values, aligned with ``y_true``.

        Returns:
            dict: Keys ``"r2"``, ``"mse"``, ``"rmse"`` and ``"mae"`` mapped to
            float scores, so callers can log or compare runs instead of
            parsing the printed report.
        """
        mse = mean_squared_error(y_true, y_pred)
        metrics = {
            "r2": float(r2_score(y_true, y_pred)),
            "mse": float(mse),
            # RMSE is derived from the MSE so the two values always agree.
            "rmse": float(np.sqrt(mse)),
            "mae": float(mean_absolute_error(y_true, y_pred)),
        }

        print("\nModel Performance Metrics:")
        print(f"R2 Score: {metrics['r2']:.4f}")
        print(f"RMSE: {metrics['rmse']:.4f}")
        print(f"MAE: {metrics['mae']:.4f}\n")
        return metrics

In [59]:
# Run configuration: every tunable value for this cell lives here so the seed
# and the artifact path are defined once instead of being repeated below.
DATA_PATH = "vehicle_emissions.csv"
MODEL_PATH = 'vehicle_emission_pipeline.joblib'
RANDOM_STATE = 42
TEST_SIZE = 0.2
N_ESTIMATORS = 100

# Initialize the ETL Object
etl_job = VehicleEmissionPipeline(DATA_PATH)

# EXTRACT: read the CSV and separate the features from the target column.
df = etl_job.extract_data()
X = df.drop(columns=[etl_job.target])
y = df[etl_job.target]

# TRANSFORM: preprocessing and the regressor are chained in one Pipeline, so
# the preprocessing is fitted together with the model on the training rows.
preprocessor = etl_job.build_transformer()
full_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', RandomForestRegressor(n_estimators=N_ESTIMATORS, random_state=RANDOM_STATE))
])

# Data Splitting: hold out TEST_SIZE of the rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE
)

# Train the model, then score it on the held-out rows.
print("Training Model")
full_pipeline.fit(X_train, y_train)
predictions = full_pipeline.predict(X_test)
etl_job.evaluate_model(y_test, predictions)

# Keep the fitted pipeline on the ETL object as well; the class initialises
# `pipeline` to None and nothing else populates it.
etl_job.pipeline = full_pipeline

# Persist the whole fitted pipeline (preprocessing + model) to disk.
joblib.dump(full_pipeline, MODEL_PATH)
print(f"Load or Pipeline saved as '{MODEL_PATH}'")

--- Step: Extract | Rows: 935 ---
Training Model

Model Performance Metrics:
R2 Score: 0.9742
RMSE: 10.1898
MAE: 3.0989

Load or Pipeline saved as 'vehicle_emission_pipeline.joblib'


In [60]:
# Reload the persisted pipeline from disk and predict on a small sample.
# joblib.load unpickles the file, so only load artifacts you trust.
loaded_pipeline = joblib.load('vehicle_emission_pipeline.joblib')

# First five rows of the held-out features.
sample_test = X_test.head(5)

# Header and predictions are emitted on separate lines.
print("Predictions for Sample Data:", loaded_pipeline.predict(sample_test), sep="\n")

Predictions for Sample Data:
[308.91 224.19 306.97 510.01 206.13]
