In [None]:
# ==============================================================
# 🚗 Fuel Efficiency Prediction Model Using Machine Learning
# ==============================================================

# 🔹 This notebook trains and evaluates a machine learning model
#    to predict vehicle fuel efficiency (MPG/kmpl) based on 
#    engine and vehicle specifications.

# Author : Your Name
# Date   : YYYY-MM-DD
# ==============================================================

# -----------------------------
# 🧠 Import Libraries
# -----------------------------
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib
import os

sns.set(style="whitegrid")

# -----------------------------
# 📂 Load Dataset
# -----------------------------
# The CSV location defaults to the original author's local Downloads folder.
# Set the FUEL_DATASET_PATH environment variable to point at the file on
# another machine instead of editing this cell.
default_data_path = r"C:\Users\Admin\Downloads\fuel_efficiency_dataset.csv"
data_path = os.environ.get("FUEL_DATASET_PATH", default_data_path)
data = pd.read_csv(data_path)  # raises FileNotFoundError if the path is wrong
print("‚úÖ Dataset loaded successfully!")
print(f"Shape: {data.shape}")
display(data.head())  # `display` comes from the IPython/Jupyter runtime

# -----------------------------
# 🔍 Dataset Information
# -----------------------------
print("\nDataset Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that would emit a stray "None" line).
data.info()
print("\nMissing values per column:\n", data.isnull().sum())
print("\nSummary statistics:\n", data.describe())

# -----------------------------
# 🧹 Data Preprocessing
# -----------------------------
# Drop every row that has at least one missing value.
data = data.dropna()

# Encode categorical variables as integer codes.
# One LabelEncoder is fitted per column and kept in `label_encoders`, so the
# category -> integer mapping can be reproduced (or inverted) at inference
# time instead of being lost when the loop variable is overwritten.
label_encoders = {}
for col in ['fuel_type', 'transmission']:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
    label_encoders[col] = le

# Persist the encoders next to the other model artifacts; without them the
# integer codes the trained model expects cannot be recovered later.
os.makedirs("../models", exist_ok=True)
joblib.dump(label_encoders, "../models/label_encoders.pkl")

print("\n‚úÖ Data encoding completed!")
display(data.head())

# -----------------------------
# 🎯 Feature Selection
# -----------------------------
# 'mpg' is the regression target; every remaining column is used as a feature.
target_column = 'mpg'
y = data[target_column]
X = data.drop(columns=[target_column])
print(f"Features shape: {X.shape}, Target shape: {y.shape}")

# -----------------------------
# 🔀 Train-Test Split and Scaling
# -----------------------------
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit the scaler on the training rows only, then apply that same
# transformation to both partitions.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Persist the fitted scaler, creating the output directory if necessary.
os.makedirs("../models", exist_ok=True)
joblib.dump(scaler, "../models/scaler.pkl")
print("‚úÖ Scaling done and scaler saved as '../models/scaler.pkl'")

# -----------------------------
# 🧠 Model Training
# -----------------------------
# Random forest with 100 trees and a fixed seed; fit() returns the fitted
# estimator, so construction and training are chained.
rf_params = {"n_estimators": 100, "random_state": 42}
model = RandomForestRegressor(**rf_params).fit(X_train_scaled, y_train)
print("‚úÖ Model training completed!")

# -----------------------------
# 📊 Model Evaluation
# -----------------------------
# Predict on the held-out (scaled) test partition.
y_pred = model.predict(X_test_scaled)

# Compute the three regression metrics in a fixed order: MAE, MSE, R^2.
mae, mse, r2 = (
    metric(y_test, y_pred)
    for metric in (mean_absolute_error, mean_squared_error, r2_score)
)

print("\nüìà Model Performance Metrics:")
print(f"Mean Absolute Error (MAE): {mae:.2f}")
print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"R¬≤ Score: {r2:.2f}")

# -----------------------------
# 📉 Visualization - Actual vs Predicted
# -----------------------------
# Scatter the test-set targets against the model's predictions.
plt.figure(figsize=(8,5))
scatter_ax = sns.scatterplot(x=y_test, y=y_pred)
scatter_ax.set_xlabel("Actual MPG")
scatter_ax.set_ylabel("Predicted MPG")
scatter_ax.set_title("Actual vs Predicted Fuel Efficiency")
plt.show()

# -----------------------------
# 📊 Visualization - Feature Importance
# -----------------------------
# Pair each importance score with its feature column name, then order the
# series from the largest score to the smallest.
feature_importance = pd.Series(model.feature_importances_, index=X.columns)
feature_importance = feature_importance.sort_values(ascending=False)

# Horizontal bar chart: score on the x-axis, feature name on the y-axis.
plt.figure(figsize=(8,5))
importance_ax = sns.barplot(x=feature_importance, y=feature_importance.index)
importance_ax.set_title("Feature Importance in Fuel Efficiency Prediction")
importance_ax.set_xlabel("Importance Score")
importance_ax.set_ylabel("Feature")
plt.show()

# -----------------------------
# 💾 Save Trained Model
# -----------------------------
# Serialize the fitted regressor to disk with joblib.
model_file = "../models/fuel_efficiency_model.pkl"
joblib.dump(model, model_file)
print("‚úÖ Model saved successfully as '../models/fuel_efficiency_model.pkl'")

# -----------------------------
# 🚙 Test Sample Prediction
# -----------------------------
# Reload the persisted artifacts and run one prediction through them.
loaded_model = joblib.load("../models/fuel_efficiency_model.pkl")
loaded_scaler = joblib.load("../models/scaler.pkl")

# The scaler was fitted on a DataFrame, so feed it a DataFrame carrying the
# same column names: this avoids scikit-learn's "X does not have valid feature
# names" warning and raises immediately if the number of sample values does
# not match the training columns. Older scikit-learn releases do not record
# `feature_names_in_`, hence the fallback to the training frame's columns.
feature_names = getattr(loaded_scaler, "feature_names_in_", X.columns)

# Example input: one vehicle, values listed in training-column order.
# NOTE(review): the trailing 0 and 1 look like label-encoded categorical
# values — confirm against the dataset's actual column order.
sample_values = [[1500, 110, 1200, 4, 10.0, 2018, 0, 1]]
sample_input = pd.DataFrame(sample_values, columns=feature_names)
sample_scaled = loaded_scaler.transform(sample_input)
prediction = loaded_model.predict(sample_scaled)[0]

print(f"\nüöó Predicted Fuel Efficiency for sample vehicle: {prediction:.2f} km/l or MPG")

# -----------------------------
# ✅ Conclusion
# -----------------------------
# Report the R² actually measured on the test split rather than asserting a
# fixed "close to 1" outcome, which may not hold for a given dataset or run.
print(f"""
‚úÖ Training Summary:
1. Data successfully loaded and preprocessed.
2. Categorical features encoded.
3. Random Forest Regressor trained and evaluated.
4. Model evaluated on the test set with R¬≤ = {r2:.2f}.
5. Model and scaler saved for deployment in Streamlit app.

üéØ Project completed successfully!
""")
