In [4]:
!pip install seaborn




[notice] A new release of pip is available: 25.0.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [6]:
# ============================================================
# Energy Efficiency Project – Visualization (Saved to Files)
# ============================================================

import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# set_theme() is the preferred spelling; sns.set() is only kept as an alias of it.
sns.set_theme(style="whitegrid")

# ------------------------------------------------------------
# Configuration
# ------------------------------------------------------------
# Input CSV, resolved relative to the current working directory.
DATASET_PATH = "../dataset/raw/energy_efficiency_full_dataset.csv"
# Folder that receives every figure saved below.
OUTPUT_DIR = "visualizations"

# Create the output folder up front; exist_ok=True makes re-running the cell safe.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Columns plotted as inputs in the sections below.
FEATURES = [
    "Relative Compactness",
    "Surface Area",
    "Wall Area",
    "Roof Area",
    "Overall Height",
    "Orientation",
    "Glazing Area",
    "Glazing Area Distribution"
]

# Columns plotted as prediction targets.
TARGETS = ["Heating Load", "Cooling Load"]

# ------------------------------------------------------------
# Load Dataset
# ------------------------------------------------------------
df = pd.read_csv(DATASET_PATH)

# Fail early, with a readable message, if the CSV lacks any column the plots
# below index into; otherwise the KeyError would surface mid-plotting.
missing_columns = [col for col in FEATURES + TARGETS if col not in df.columns]
if missing_columns:
    raise KeyError(
        f"{DATASET_PATH} is missing expected columns: {missing_columns}"
    )

# ------------------------------------------------------------
# 1. Target Distributions
# ------------------------------------------------------------
# One figure with two side-by-side panels: a histogram plus KDE curve per target.
target_fig, (heating_ax, cooling_ax) = plt.subplots(1, 2, figsize=(12, 5))

sns.histplot(df["Heating Load"], kde=True, ax=heating_ax)
heating_ax.set_title("Heating Load Distribution")

sns.histplot(df["Cooling Load"], kde=True, ax=cooling_ax)
cooling_ax.set_title("Cooling Load Distribution")

target_fig.tight_layout()
target_fig.savefig(f"{OUTPUT_DIR}/01_target_distribution.png")
# Release the figure once it is on disk so it is not kept open in the kernel.
plt.close(target_fig)

# ------------------------------------------------------------
# 2. Feature Distributions
# ------------------------------------------------------------
# DataFrame.hist opens its own figure holding one histogram per feature column;
# grab that figure so the remaining calls act on it explicitly.
df[FEATURES].hist(figsize=(16, 12), bins=30)
feature_fig = plt.gcf()
feature_fig.suptitle("Feature Distributions", y=1.02)
feature_fig.tight_layout()
# The title is anchored at y=1.02, i.e. above the top edge of the canvas.
# A plain savefig crops to the canvas and would drop it from the PNG;
# bbox_inches="tight" expands the saved area to include it.
feature_fig.savefig(
    f"{OUTPUT_DIR}/02_feature_distributions.png",
    bbox_inches="tight",
)
plt.close(feature_fig)

# ------------------------------------------------------------
# 3. Correlation Heatmap
# ------------------------------------------------------------
# Pairwise correlations across every feature and both targets.
corr = df[FEATURES + TARGETS].corr()

heatmap_fig, heatmap_ax = plt.subplots(figsize=(14, 10))

# annot/fmt print each coefficient to two decimals inside its cell.
heatmap_options = dict(
    annot=True,
    fmt=".2f",
    cmap="coolwarm",
    linewidths=0.5,
)
sns.heatmap(corr, ax=heatmap_ax, **heatmap_options)

heatmap_ax.set_title("Feature–Target Correlation Heatmap")
heatmap_fig.tight_layout()
heatmap_fig.savefig(f"{OUTPUT_DIR}/03_correlation_heatmap.png")
plt.close(heatmap_fig)

# ------------------------------------------------------------
# 4. Feature vs Target Scatter Plots
# ------------------------------------------------------------
# One figure per target, with one scatter panel per feature on a 3x3 grid.
for target in TARGETS:
    scatter_fig = plt.figure(figsize=(18, 12))
    for i, feature in enumerate(FEATURES):
        ax = scatter_fig.add_subplot(3, 3, i + 1)
        sns.scatterplot(x=df[feature], y=df[target], alpha=0.6, ax=ax)
        ax.set_xlabel(feature)
        ax.set_ylabel(target)

    scatter_fig.suptitle(f"{target} vs Features", y=1.02)
    scatter_fig.tight_layout()
    # The title sits at y=1.02, above the canvas edge; without
    # bbox_inches="tight" the saved PNG is cropped to the canvas and loses it.
    scatter_fig.savefig(
        f"{OUTPUT_DIR}/04_{target.lower().replace(' ', '_')}_vs_features.png",
        bbox_inches="tight",
    )
    # Close inside the loop so figures do not pile up across targets.
    plt.close(scatter_fig)

# ------------------------------------------------------------
# 5. Actual vs Predicted Plot
# ------------------------------------------------------------
def save_actual_vs_pred(y_true, y_pred, label):
    """Save a scatter plot of predicted against actual values.

    A dashed diagonal spanning the range of ``y_true`` is drawn as the
    reference for perfect predictions. The figure is written to
    ``OUTPUT_DIR`` as ``05_actual_vs_pred_<label>.png``, where ``<label>``
    is ``label`` lower-cased with spaces replaced by underscores.

    Args:
        y_true: Actual values; must provide ``.min()`` and ``.max()``.
        y_pred: Predicted values, plotted on the y-axis against ``y_true``.
        label: Name shown in the title and used to build the file name.
    """
    fig, ax = plt.subplots(figsize=(6, 6))
    ax.scatter(y_true, y_pred, alpha=0.6)

    # Endpoints of the y = x reference line, taken from the actual values only.
    lo, hi = y_true.min(), y_true.max()
    ax.plot([lo, hi], [lo, hi], linestyle="--")

    ax.set_xlabel("Actual")
    ax.set_ylabel("Predicted")
    ax.set_title(f"Actual vs Predicted – {label}")
    fig.tight_layout()

    slug = label.lower().replace(' ', '_')
    fig.savefig(f"{OUTPUT_DIR}/05_actual_vs_pred_{slug}.png")
    plt.close(fig)

# Example usage:
# save_actual_vs_pred(y_test[:, 0], y_pred[:, 0], "Heating Load")
# save_actual_vs_pred(y_test[:, 1], y_pred[:, 1], "Cooling Load")

# ------------------------------------------------------------
# 6. Residual Analysis
# ------------------------------------------------------------
def save_residual_plots(y_true, y_pred, label):
    """Save a two-panel residual diagnostic figure.

    Residuals are computed as ``y_true - y_pred``. The left panel shows
    their histogram with a KDE curve; the right panel plots them against
    the predictions with a dashed zero line. The figure is written to
    ``OUTPUT_DIR`` as ``06_residuals_<label>.png``, where ``<label>`` is
    ``label`` lower-cased with spaces replaced by underscores.

    Args:
        y_true: Actual values; must support subtraction with ``y_pred``.
        y_pred: Predicted values.
        label: Name shown in both titles and used to build the file name.
    """
    residuals = y_true - y_pred

    fig, (hist_ax, scatter_ax) = plt.subplots(1, 2, figsize=(12, 5))

    # Left panel: distribution of the residuals.
    sns.histplot(residuals, kde=True, ax=hist_ax)
    hist_ax.set_title(f"Residual Distribution – {label}")

    # Right panel: residuals against predictions, with a zero reference line.
    scatter_ax.scatter(y_pred, residuals, alpha=0.6)
    scatter_ax.axhline(0, linestyle="--")
    scatter_ax.set_xlabel("Predicted")
    scatter_ax.set_ylabel("Residual")
    scatter_ax.set_title(f"Residuals vs Predicted – {label}")

    fig.tight_layout()

    slug = label.lower().replace(' ', '_')
    fig.savefig(f"{OUTPUT_DIR}/06_residuals_{slug}.png")
    plt.close(fig)

# Example usage:
# save_residual_plots(y_test[:, 0], y_pred[:, 0], "Heating Load")
# save_residual_plots(y_test[:, 1], y_pred[:, 1], "Cooling Load")

# ------------------------------------------------------------
# 7. Model Comparison Plots
# ------------------------------------------------------------
# Hard-coded scores, one row per model: (name, RMSE, R2).
metrics_df = pd.DataFrame(
    [
        ("Naive Mean", 9.95, -0.01),
        ("Linear Regression", 3.08, 0.90),
        ("Decision Tree", 2.35, 0.94),
        ("Random Forest", 1.55, 0.97),
        ("Gradient Boosting", 0.97, 0.99),
    ],
    columns=["Model", "RMSE", "R2"],
)

# (metric column, chart title, output file) for each bar chart to produce.
comparison_charts = [
    ("RMSE", "Model Comparison – RMSE", f"{OUTPUT_DIR}/07_model_comparison_rmse.png"),
    ("R2", "Model Comparison – R² Score", f"{OUTPUT_DIR}/08_model_comparison_r2.png"),
]

for metric, chart_title, chart_path in comparison_charts:
    chart_fig, chart_ax = plt.subplots(figsize=(10, 5))
    sns.barplot(data=metrics_df, x="Model", y=metric, ax=chart_ax)
    chart_ax.set_title(chart_title)
    chart_fig.tight_layout()
    chart_fig.savefig(chart_path)
    plt.close(chart_fig)

# ------------------------------------------------------------
# 8. Feature Importance
# ------------------------------------------------------------
def save_feature_importance(model, model_name="Gradient Boosting"):
    """Save a horizontal bar chart of a fitted model's feature importances.

    Importances are read from ``model.feature_importances_``, labelled with
    the names in ``FEATURES`` and sorted ascending before plotting. The
    figure is always written to ``OUTPUT_DIR/09_feature_importance.png``,
    regardless of ``model_name``.

    Args:
        model: Object exposing ``feature_importances_`` with one value per
            entry of ``FEATURES``, in the same order.
        model_name: Name shown in the chart title. Defaults to
            "Gradient Boosting", the title this function previously
            hard-coded, so existing single-argument calls are unaffected.
    """
    importance = pd.Series(model.feature_importances_, index=FEATURES).sort_values()

    fig, ax = plt.subplots(figsize=(8, 6))
    importance.plot(kind="barh", ax=ax)
    # Title follows the model actually passed in rather than a fixed name.
    ax.set_title(f"Feature Importance – {model_name}")
    ax.set_xlabel("Importance Score")
    fig.tight_layout()
    fig.savefig(f"{OUTPUT_DIR}/09_feature_importance.png")
    plt.close(fig)

# Example usage:
# save_feature_importance(trained_model)

# ------------------------------------------------------------
# 9. Pair Plot (Optional – Heavy)
# ------------------------------------------------------------
# Pairwise plots over every feature and target column; corner=True is passed
# so the mirrored upper-triangle panels are not drawn.
pairplot = sns.pairplot(df[FEATURES + TARGETS], corner=True)
pairplot.savefig(f"{OUTPUT_DIR}/10_pairplot.png")
# Close every figure still open, including the pair plot's own.
plt.close("all")

# Report the configured folder instead of a hard-coded name, so the message
# stays correct if OUTPUT_DIR is changed.
print(f"All visualizations saved successfully in the '{OUTPUT_DIR}/' folder.")


All visualizations saved successfully in the 'visualizations/' folder.
