# Model Training & Evaluation

In [None]:
import sys
sys.path.append('../src')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from feature_engineering import engineer_features, prepare_data
from model import train_models
from evaluation import compare_models, evaluate_model

In [None]:
# Read the raw fuel CSV and pass it straight through the project's
# feature-engineering step; prepare_data then returns the three objects
# unpacked below (named X, y and scaler in this notebook).
df = engineer_features(pd.read_csv('../data/raw/fuel_data.csv'))
X, y, scaler = prepare_data(df)

# Quick summary of the prepared feature matrix: column count, row count,
# and the column labels.
n_samples, n_features = X.shape
print(f"Features: {n_features}")
print(f"Samples: {n_samples}")
print(f"\nFeature Names:\n{list(X.columns)}")

In [None]:
# train_models returns five objects, unpacked here as the fitted models plus
# the train/test partitions of X and y that later cells evaluate against.
trained_models, X_train, X_test, y_train, y_test = train_models(X, y)

# Report how many rows ended up in each partition.
train_size = X_train.shape[0]
print(f"\nTraining set: {train_size} samples")
test_size = X_test.shape[0]
print(f"Test set: {test_size} samples")

In [None]:
# Score every trained model on the held-out split. `results` is indexed as
# results[model_name][metric_name] by later cells, so transposing the
# DataFrame puts one model per row and one metric per column.
results = compare_models(trained_models, X_test, y_test)
results_df = pd.DataFrame(results).T

# The bar-chart emoji (U+1F4CA) is written as an escape so the source stays
# ASCII-only and the glyph cannot be corrupted by a re-encoding of the file.
print("\n\U0001F4CA MODEL PERFORMANCE COMPARISON")
print("=" * 60)
print(results_df.round(4))

In [None]:
# Actual-vs-predicted scatter for every trained model, one panel per model.
# The grid is sized from the number of models (6 in of width each, which gives
# the same 18x5 figure for three models) so adding or removing a model neither
# overruns the axes array nor leaves blank panels. At least one panel is
# always requested because plt.subplots rejects a zero-column grid.
n_panels = max(len(trained_models), 1)
fig, axes = plt.subplots(1, n_panels, figsize=(6 * n_panels, 5), squeeze=False)
axes = axes.ravel()  # squeeze=False yields a 2-D array; flatten to 1-D

# End points of the y = x reference line; identical for every panel.
fuel_min, fuel_max = y_test.min(), y_test.max()

for ax, (name, model) in zip(axes, trained_models.items()):
    y_pred = model.predict(X_test)
    ax.scatter(y_test, y_pred, alpha=0.5)
    ax.plot([fuel_min, fuel_max], [fuel_min, fuel_max], 'r--', lw=2)
    ax.set_xlabel('Actual Fuel (L)')
    ax.set_ylabel('Predicted Fuel (L)')
    # \u00b2 is SUPERSCRIPT TWO, escaped to keep the source ASCII-only.
    ax.set_title(f'{name}\nR\u00b2 = {results[name]["R2"]:.4f}')

plt.tight_layout()
plt.show()

In [None]:
# XGBoost Only: Actual vs Predicted Fuel Consumption
xgb_model = trained_models['XGBoost']
y_pred_xgb = xgb_model.predict(X_test)

# End points of the y = x "perfect prediction" reference line.
fuel_min, fuel_max = y_test.min(), y_test.max()

# Pull the two headline metrics out once so the title f-string stays readable.
xgb_r2 = results["XGBoost"]["R2"]
xgb_rmse = results["XGBoost"]["RMSE"]

plt.figure(figsize=(10, 8))
plt.scatter(y_test, y_pred_xgb, alpha=0.6, s=50, edgecolors='black', linewidth=0.5)
plt.plot([fuel_min, fuel_max], [fuel_min, fuel_max], 'r--', lw=3, label='Perfect Prediction')
plt.xlabel('Actual Fuel Consumption (Liters)', fontsize=14, fontweight='bold')
plt.ylabel('Predicted Fuel Consumption (Liters)', fontsize=14, fontweight='bold')
# \u00b2 is SUPERSCRIPT TWO, escaped to keep the source ASCII-only.
plt.title(
    f'XGBoost Model: Actual vs Predicted\nR\u00b2 = {xgb_r2:.4f} | RMSE = {xgb_rmse:.2f} L',
    fontsize=16,
    fontweight='bold',
)
plt.legend(fontsize=12)
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

In [None]:
# Feature importance of the XGBoost model, plotted in descending order and
# followed by a printed list of the highest-ranked features.
best_model = trained_models['XGBoost']
feature_names = X.columns
importances = best_model.feature_importances_
indices = np.argsort(importances)[::-1]  # positions sorted from most to least important

bar_positions = range(len(importances))
plt.figure(figsize=(10, 6))
plt.bar(bar_positions, importances[indices])
plt.xticks(bar_positions, [feature_names[i] for i in indices], rotation=45, ha='right')
plt.title('Feature Importance (XGBoost)')
plt.ylabel('Importance Score')
plt.tight_layout()
plt.show()

# Cap the listing at the number of available features so a model with fewer
# than five inputs does not raise IndexError; with five or more features the
# printed output is unchanged.
top_n = min(5, len(indices))
print(f"\nTop {top_n} Most Important Features:")
for rank, feature_idx in enumerate(indices[:top_n], start=1):
    print(f"{rank}. {feature_names[feature_idx]}: {importances[feature_idx]:.4f}")

In [None]:
# Residual diagnostics for best_model: residual = actual - predicted on the
# test split, shown against the predictions and as a histogram.
y_pred = best_model.predict(X_test)
residuals = y_test - y_pred

fig, axes = plt.subplots(1, 2, figsize=(14, 5))
scatter_ax, hist_ax = axes

# Left panel: residuals versus predicted value, with a zero reference line.
scatter_ax.scatter(y_pred, residuals, alpha=0.5)
scatter_ax.axhline(y=0, color='r', linestyle='--')
scatter_ax.set(
    xlabel='Predicted Fuel (L)',
    ylabel='Residuals',
    title='Residual Plot',
)

# Right panel: distribution of the residuals.
hist_ax.hist(residuals, bins=30, edgecolor='black')
hist_ax.set(
    xlabel='Residuals',
    ylabel='Frequency',
    title='Residual Distribution',
)

plt.tight_layout()
plt.show()

In [None]:
# Side-by-side bar charts comparing the models on each metric, one panel per
# metric column taken from results_df.
metric_names = ['MAE', 'RMSE', 'R2']
bar_colors = ['#1f77b4', '#ff7f0e', '#2ca02c']
metrics_df = results_df[metric_names]

fig, axes = plt.subplots(1, 3, figsize=(15, 4))

for ax, metric in zip(axes, metric_names):
    metrics_df[metric].plot(kind='bar', ax=ax, color=bar_colors)
    ax.set_title(f'{metric} Comparison')
    ax.set_ylabel(metric)
    ax.set_xlabel('Model')
    ax.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()