In [None]:
# ============================================================
# 🚗 VEHICLE EMISSION INDEX PREDICTION SYSTEM (XGBoost)
# Author: Amit Mali
# Description: Predicts continuous Emission Index and classifies via thresholds
# ============================================================

# ✅ IMPORT LIBRARIES
import os
import joblib
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error, r2_score
from xgboost import XGBRegressor
import matplotlib.pyplot as plt

# ============================================================
# 🧩 LOAD DATASET
# ============================================================
# The CSV location can be overridden through the EMISSION_DATA_PATH
# environment variable so the notebook is not tied to a single machine.
# When the variable is unset, the original local path is used unchanged.
data_path = os.environ.get(
    "EMISSION_DATA_PATH",
    r"C:\Users\ADMIN\Desktop\CI LAB\engine_fault_system\data\vehicle_emission_dataset_synthetic_v3_labeled.csv",
)
df = pd.read_csv(data_path)

print("‚úÖ Dataset loaded successfully")
# Rich preview of the first rows (`display` is provided by the notebook kernel).
display(df.head())

# ============================================================
# 🔧 FEATURE & TARGET SPLIT
# ============================================================
# The regression target is the continuous emission index; stop immediately
# if the loaded file does not provide it.
if 'Emission_Index' not in df.columns:
    raise ValueError("‚ùå The dataset must contain an 'Emission_Index' column for regression target.")

# Features are every column except the target and the categorical
# 'Emission Level' label; errors='ignore' tolerates a file without that label.
non_feature_columns = ['Emission Level', 'Emission_Index']
y = df['Emission_Index']
X = df.drop(columns=non_feature_columns, errors='ignore')

print("\nFeatures (X) shape:", X.shape)
print("Target (y) shape:", y.shape)

# Group the feature columns by dtype so numeric and categorical columns can
# be preprocessed differently.
categorical_features = list(X.select_dtypes(include=['object', 'category']).columns)
numerical_features = list(X.select_dtypes(include=['number']).columns)

print("\nNumerical features:", numerical_features)
print("Categorical features:", categorical_features)

# ============================================================
# ✂️ TRAIN-TEST SPLIT
# ============================================================
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split repeatable across runs.
split_options = {"test_size": 0.2, "random_state": 42}
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, **split_options)

print("\nTrain/Test Split:")
print("X_train:", X_train_raw.shape, "| X_test:", X_test_raw.shape)

# ============================================================
# ⚙️ PREPROCESSING PIPELINE
# ============================================================
# Numeric columns are standardized; categorical columns are one-hot encoded.
# handle_unknown='ignore' keeps transform() from raising on a category that
# was not present when the encoder was fitted.
numerical_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

column_steps = [
    ('num', numerical_transformer, numerical_features),
    ('cat', categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_steps)

# Fit on the training rows only, then reuse the fitted transformer on the
# held-out rows so no test-set statistics enter the preprocessing.
X_train = preprocessor.fit_transform(X_train_raw)
X_test = preprocessor.transform(X_test_raw)

print("\n‚úÖ Preprocessing complete ‚Äî Shapes:")
print("X_train:", X_train.shape, "| X_test:", X_test.shape)

# ============================================================
# 🤖 MODEL TRAINING — XGBOOST REGRESSOR
# ============================================================
print("\nüöÄ Training XGBoost Regressor for Emission Index...")

# Hyperparameters are collected in one mapping so they can be read (and
# tuned) in a single place before the estimator is built.
xgb_params = dict(
    n_estimators=400,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.85,
    colsample_bytree=0.85,
    reg_lambda=1.0,
    reg_alpha=0.1,
    random_state=42,
    objective='reg:squarederror',
    n_jobs=-1,
)
xgb_model = XGBRegressor(**xgb_params)

# Train on the preprocessed training matrix.
xgb_model.fit(X_train, y_train)

# ============================================================
# 📈 MODEL EVALUATION
# ============================================================
# Score the held-out rows with mean squared error and R².
y_pred = xgb_model.predict(X_test)
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)

print("\nüìä Model Performance:")
for metric_line in (
    f" - MSE: {mse:.6f}",
    f" - R¬≤ Score: {r2:.4f}",
):
    print(metric_line)

# ------------------------------------------------------------
# Emission Level Categorization (for visualization)
# ------------------------------------------------------------
def categorize_index(value, low_threshold=0.33, high_threshold=0.66):
    """Map a continuous emission index to a coarse level label.

    Parameters
    ----------
    value : float
        Emission index to classify.
    low_threshold : float, default 0.33
        Values strictly below this are labelled "Low".
    high_threshold : float, default 0.66
        Values at or above this are labelled "High"; values in
        [low_threshold, high_threshold) are labelled "Medium".

    Returns
    -------
    str
        "Low", "Medium" or "High"; "Unknown" when ``value`` is missing.

    Raises
    ------
    ValueError
        If ``low_threshold`` is greater than ``high_threshold``.
    """
    # NOTE(review): the default cut-offs may not match how the dataset's own
    # 'Emission Level' column was assigned -- the preview rows show an index of
    # 0.46995 labelled "High" and 0.209511 labelled "Medium". TODO confirm the
    # intended thresholds.
    if low_threshold > high_threshold:
        raise ValueError(
            f"low_threshold ({low_threshold}) must not exceed high_threshold ({high_threshold})"
        )

    # A missing value fails every '<' comparison and would otherwise fall
    # through to "High"; report it explicitly instead.
    if pd.isna(value):
        return "Unknown"

    if value < low_threshold:
        return "Low"
    if value < high_threshold:
        return "Medium"
    return "High"

# Put actual and predicted indices side by side (rows keep y_test's index
# labels) and derive a level label from each prediction.
eval_columns = {'Actual_Index': y_test, 'Predicted_Index': y_pred}
df_eval = pd.DataFrame(eval_columns).assign(
    Predicted_Level=lambda frame: frame['Predicted_Index'].map(categorize_index)
)
print("\nSample predictions (with derived levels):")
print(df_eval.head())

# ============================================================
# 📊 VISUALIZATION
# ============================================================
# Predicted vs. actual scatter using the explicit Axes interface.
fig, ax = plt.subplots(figsize=(7, 6))
ax.scatter(y_test, y_pred, alpha=0.6, edgecolor='k')
# Dashed y = x reference from 0 to 1: points on it are exact predictions.
# NOTE(review): assumes the index lies on a 0-1 scale -- TODO confirm.
ax.plot([0, 1], [0, 1], 'r--', lw=2)
ax.set_xlabel("Actual Emission Index")
ax.set_ylabel("Predicted Emission Index")
ax.set_title("XGBoost Emission Index Prediction Performance")
ax.grid(True)
plt.show()

# ============================================================
# 💾 SAVE MODEL + PREPROCESSOR
# ============================================================
# The output directory can be overridden through the EMISSION_MODEL_DIR
# environment variable; when it is unset, the original local directory is
# used unchanged.
save_dir = os.environ.get(
    "EMISSION_MODEL_DIR",
    r"C:\Users\ADMIN\Desktop\CI LAB\engine_fault_system\models",
)
os.makedirs(save_dir, exist_ok=True)

model_path = os.path.join(save_dir, "emission_index_xgb_model.pkl")
preproc_path = os.path.join(save_dir, "emission_index_preprocessor.pkl")

# The fitted preprocessor is stored next to the model because new data must
# go through the same transformation before it can be scored.
joblib.dump(xgb_model, model_path)
joblib.dump(preprocessor, preproc_path)

print("\nüíæ Saved XGBoost model and preprocessor:")
print(f"   ‚Ä¢ Model: {model_path}")
print(f"   ‚Ä¢ Preprocessor: {preproc_path}")

# ============================================================
# ✅ VERIFY RELOAD
# ============================================================
# Load both artifacts back from disk, score the held-out rows again, and
# require the result to match the in-memory predictions. Any problem is
# reported through the failure message rather than aborting the notebook.
try:
    model_loaded = joblib.load(model_path)
    preproc_loaded = joblib.load(preproc_path)
    X_check = preproc_loaded.transform(X_test_raw)
    preds_check = model_loaded.predict(X_check)
    # Running without an exception is not enough: the reloaded pair must
    # reproduce what the freshly trained model predicted.
    if not np.allclose(preds_check, y_pred):
        raise ValueError("reloaded artifacts do not reproduce the in-memory predictions")
    print(f"\n‚úÖ Reload check passed ‚Äî Sample predictions: {preds_check[:5]}")
except Exception as e:
    print(f"‚ö†Ô∏è Reload verification failed: {e}")


‚úÖ Dataset loaded successfully


Unnamed: 0,Vehicle Type,Fuel Type,Engine Size,Age of Vehicle,Mileage,Speed,Acceleration,Road Type,Traffic Conditions,Temperature,Humidity,Wind Speed,Air Pressure,CO2 Emissions,NOx Emissions,PM2.5 Emissions,VOC Emissions,SO2 Emissions,Emission Level,Emission_Index
0,Motorcycle,Electric,2.747609,22,291288,49.083255,1.887738,Highway,Moderate,13.880458,52.883905,18.938861,955.411642,285.333301,0.535792,0.088781,0.105212,0.028507,Low,0.0597
1,Bus,Electric,5.743714,3,188398,23.460311,3.544147,City,Heavy,19.136947,5.697343,14.716526,1035.763062,212.122597,0.769889,0.160845,0.113082,0.161983,Low,0.054372
2,Bus,Hybrid,4.606368,17,281451,115.50835,4.646886,Highway,Free flow,8.123428,3.953116,18.644522,976.207064,1988.4351,5.936141,0.279258,1.635782,0.326102,High,0.46995
3,Truck,Electric,3.913024,4,151321,18.540217,3.581004,Rural,Heavy,35.994893,0.509678,11.951859,966.919462,150.65371,1.045159,0.216989,0.179697,0.160652,Low,0.05027
4,Truck,Hybrid,1.611297,15,91810,109.596566,3.879303,Highway,Moderate,14.793481,52.450884,2.761138,953.229351,722.650839,6.048134,0.025248,1.674203,0.035663,Medium,0.209511



Features (X) shape: (10000, 18)
Target (y) shape: (10000,)

Numerical features: ['Engine Size', 'Age of Vehicle', 'Mileage', 'Speed', 'Acceleration', 'Temperature', 'Humidity', 'Wind Speed', 'Air Pressure', 'CO2 Emissions', 'NOx Emissions', 'PM2.5 Emissions', 'VOC Emissions', 'SO2 Emissions']
Categorical features: ['Vehicle Type', 'Fuel Type', 'Road Type', 'Traffic Conditions']

Train/Test Split:
X_train: (8000, 18)  | X_test: (2000, 18)
y_train: (8000,)  | y_test: (2000,)

‚úÖ Preprocessing complete ‚Äî Shapes:
X_train: (8000, 28)  | X_test: (2000, 28)

üöÄ Training Gradient Boosting Regressor for Emission Index...

üìä Model Performance:
 - MSE: 0.000037
 - R¬≤ Score: 0.9992

Sample Predictions:
        Actual  Predicted Predicted_Level
6252  0.349310   0.352251          Medium
4684  0.381383   0.376674          Medium
1731  0.059477   0.053777             Low
4742  0.031086   0.028204             Low
4521  0.253320   0.255679             Low

üíæ Saved Emission Index model and p

In [6]:
import joblib
# Ad-hoc check: load a saved preprocessor from disk and print its Python type.
# NOTE(review): this opens "emission_level_preprocessor.pkl", while the
# training cell in this notebook writes "emission_index_preprocessor.pkl" --
# confirm this is the intended artifact.
preprocessor_file = "C:\\Users\\ADMIN\\Desktop\\CI LAB\\engine_fault_system\\models\\emission_level_preprocessor.pkl"
p = joblib.load(preprocessor_file)
print(type(p))



<class 'sklearn.compose._column_transformer.ColumnTransformer'>
