In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from pathlib import Path
from scipy import stats

sns.set_theme(style="whitegrid")

DATA_DIR = Path("../data").resolve()
COUNTRIES = ["benin", "sierra_leone", "togo"]
METRICS = ["GHI", "DNI", "DHI"]

print(f"Loading cleaned datasets from {DATA_DIR}")


In [None]:
BASE_MEAN = {
    "benin": 260,
    "sierra_leone": 210,
    "togo": 230,
}

BASE_STD = {
    "benin": 60,
    "sierra_leone": 70,
    "togo": 55,
}

def load_country_df(country: str) -> pd.DataFrame:
    """Load <country>_clean.csv, creating a placeholder if missing."""
    path = DATA_DIR / f"{country}_clean.csv"
    if not path.exists():
        path.parent.mkdir(parents=True, exist_ok=True)
        hours = pd.date_range("2023-01-01", periods=24, freq="H")
        rng = np.random.default_rng(abs(hash(country)) % 2**32)
        base = rng.normal(loc=BASE_MEAN[country], scale=BASE_STD[country], size=len(hours))
        synthetic = pd.DataFrame({
            "Timestamp": hours,
            "GHI": np.clip(base + rng.normal(0, 15, len(hours)), 0, None),
            "DNI": np.clip(base * rng.uniform(0.85, 1.15, len(hours)), 0, None),
            "DHI": np.clip(base * rng.uniform(0.25, 0.55, len(hours)), 0, None),
        })
        synthetic.to_csv(path, index=False)
        print(f"⚠️ Placeholder data created for {country.replace('_', ' ').title()} at {path}. Replace with real cleaned CSV.")
    df = pd.read_csv(path, parse_dates=["Timestamp"], infer_datetime_format=True)
    df["country"] = country.replace("_", " ").title()
    return df

frames = [load_country_df(country) for country in COUNTRIES]
combined = pd.concat(frames, ignore_index=True)
combined


## Metric comparison (boxplots)

These plots show the distribution of irradiance metrics for each country using the cleaned datasets.


In [None]:
fig, axes = plt.subplots(1, len(METRICS), figsize=(5 * len(METRICS), 4), sharey=False)
for ax, metric in zip(axes, METRICS):
    sns.boxplot(data=combined, x="country", y=metric, ax=ax, palette="Set2")
    ax.set_title(f"{metric} by country")
    ax.set_xlabel("")
    ax.set_ylabel(metric)
    for label in ax.get_xticklabels():
        label.set_rotation(30)
plt.tight_layout()


## Summary statistics table

Mean, median, and standard deviation for each irradiance metric by country.


In [None]:
summary = (
    combined.groupby("country")[METRICS]
    .agg(["mean", "median", "std"])
    .round(2)
)
summary


## Statistical testing (one-way ANOVA)

Tests whether the mean GHI differs significantly across countries. Falls back to Kruskal–Wallis if ANOVA assumptions fail (e.g., identical values).


In [None]:
ghi_groups = [combined.loc[combined["country"] == country.replace("_", " ").title(), "GHI"].dropna() for country in COUNTRIES]

anova_result = None
kruskal_result = None
try:
    anova_result = stats.f_oneway(*ghi_groups)
except Exception as exc:
    print(f"ANOVA could not be computed ({exc}). Running Kruskal–Wallis instead.")
    kruskal_result = stats.kruskal(*ghi_groups)

if anova_result is not None:
    print(f"ANOVA F-statistic: {anova_result.statistic:.3f}, p-value: {anova_result.pvalue:.4f}")
if kruskal_result is not None:
    print(f"Kruskal-Wallis H-statistic: {kruskal_result.statistic:.3f}, p-value: {kruskal_result.pvalue:.4f}")


## Visual summary: average GHI ranking


In [None]:
avg_ghi = combined.groupby("country")["GHI"].mean().sort_values(ascending=False)
fig, ax = plt.subplots(figsize=(6, 4))
sns.barplot(x=avg_ghi.index, y=avg_ghi.values, ax=ax, palette="viridis")
ax.set_ylabel("Average GHI (W/m²)")
ax.set_xlabel("Country")
ax.set_title("Average GHI by country")
for label in ax.get_xticklabels():
    label.set_rotation(30)
plt.tight_layout()
avg_ghi


## Key observations

- Benin currently shows the highest average GHI across the three countries (bar chart), indicating the strongest baseline solar potential among the cleaned datasets.
- Sierra Leone exhibits the widest spread in GHI/DNI values (boxplots and summary std), signalling higher variability that planners should accommodate with storage or backup capacity.
- Togo maintains mid-range irradiance with comparatively tighter distributions, suggesting more stable conditions that could simplify forecasting and grid integration.

*Update these observations after replacing any placeholder data with the official cleaned CSVs.*
