In [1]:
"""
Script: build_temperature_clusters_and_map_v2.py
Description:
    - Loads the consolidated dataset (dataset_meteo_com_consumo.csv)
    - Aggregates seasonal mean temperature per zone for a selected year
    - Runs K-Means clustering on zones using seasonal means as features
    - Chooses K either via CLI (--k) or automatically via silhouette (K in [2..5])
    - Saves a clusters CSV ready for mapping
    - Generates an interactive Folium map (HTML) coloring zones by cluster
    - NEW: Draws a translucent rectangular bounding box per cluster (to visualize overlap),
           and keeps the optional convex hull outline.

Requirements (install once):
    pip install pandas scikit-learn folium shapely
Usage:
    python build_temperature_clusters_and_map_v2.py --csv dataset_meteo_com_consumo.csv --year 2024
    python build_temperature_clusters_and_map_v2.py --csv dataset_meteo_com_consumo.csv --year 2024 --k 3
    python build_temperature_clusters_and_map_v2.py --csv dataset_meteo_com_consumo.csv --year 2024 --out-prefix clusters_temp_2024
"""

import argparse
from datetime import datetime
import pandas as pd
import numpy as np

# Optional deps for map
try:
    import folium
    from shapely.geometry import MultiPoint
except Exception:
    folium = None
    MultiPoint = None

# ML
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

SEASONS = ["Winter", "Spring", "Summer", "Autumn"]


def log(msg: str):
    now = datetime.now().strftime("%H:%M:%S")
    print(f"[{now}] {msg}", flush=True)


def month_to_season(m: int) -> str:
    if m in (12, 1, 2):
        return "Winter"
    elif m in (3, 4, 5):
        return "Spring"
    elif m in (6, 7, 8):
        return "Summer"
    else:
        return "Autumn"


def seasonal_means(df: pd.DataFrame, year: int) -> pd.DataFrame:
    df = df.copy()
    if "date" in df.columns:
        df["date"] = pd.to_datetime(df["date"], errors="coerce")
    if "year" not in df.columns and "date" in df.columns:
        df["year"] = df["date"].dt.year
    df = df[df["year"] == year].copy()
    if "month" not in df.columns and "date" in df.columns:
        df["month"] = df["date"].dt.month

    req = {"zone", "latitude", "longitude", "tmean_c", "month"}
    missing = [c for c in req if c not in df.columns]
    if missing:
        raise ValueError(f"Missing required columns: {missing}")

    df["season"] = df["month"].apply(month_to_season)
    out = (
        df.groupby(["zone", "latitude", "longitude", "season"], as_index=False)[
            "tmean_c"
        ]
        .mean()
        .rename(columns={"tmean_c": "tmean_c_mean"})
    )
    wide = out.pivot_table(
        index=["zone", "latitude", "longitude"], columns="season", values="tmean_c_mean"
    ).reset_index()
    for s in SEASONS:
        if s not in wide.columns:
            wide[s] = np.nan
    wide = wide[["zone", "latitude", "longitude"] + SEASONS]
    return wide


def choose_k_and_cluster(X: np.ndarray, k: int | None):
    best_labels = None
    best_k = None
    scores = {}
    if k is not None:
        model = KMeans(n_clusters=k, n_init=25, random_state=42)
        labels = model.fit_predict(X)
        return labels, k, scores

    log("Selecting K via silhouette in range [2..5] ...")
    best_score = -1
    for kk in range(2, min(5, len(X)) + 1):
        try:
            model = KMeans(n_clusters=kk, n_init=25, random_state=42)
            labels = model.fit_predict(X)
            score = silhouette_score(X, labels, metric="euclidean")
            scores[kk] = score
            log(f"  K={kk}: silhouette={score:.3f}")
            if score > best_score:
                best_score = score
                best_k = kk
                best_labels = labels
        except Exception as e:
            log(f"  K={kk}: silhouette failed: {e}")
    if best_labels is None:
        best_k = 2
        best_labels = KMeans(n_clusters=2, n_init=25, random_state=42).fit_predict(X)
    log(f"Chosen K={best_k}")
    return best_labels, best_k, scores


def draw_cluster_rectangles(m, df_clusters: pd.DataFrame, colors: list[str]):
    for cl in sorted(df_clusters["cluster"].astype(int).unique()):
        sub = df_clusters[df_clusters["cluster"] == cl]
        south = float(sub["latitude"].min())
        north = float(sub["latitude"].max())
        west = float(sub["longitude"].min())
        east = float(sub["longitude"].max())
        color = colors[cl % len(colors)]
        fg = folium.FeatureGroup(name=f"Cluster {cl} – bbox", show=True)
        m.add_child(fg)
        folium.Rectangle(
            bounds=[[south, west], [north, east]],
            color=color,
            weight=2,
            dash_array="5,5",
            fill=True,
            fill_color=color,
            fill_opacity=0.10,
            tooltip=f"Cluster {cl} bounding box",
        ).add_to(fg)


def build_map(df_clusters: pd.DataFrame, html_path: str):
    if folium is None:
        log("Folium not installed; skipping map generation.")
        return

    center_lat = df_clusters["latitude"].mean()
    center_lon = df_clusters["longitude"].mean()

    m = folium.Map(
        location=[center_lat, center_lon],
        zoom_start=6.3,
        control_scale=True,
        tiles="CartoDB positron",
    )

    palette = [
        "red",
        "blue",
        "green",
        "purple",
        "orange",
        "darkred",
        "cadetblue",
        "darkpurple",
    ]

    groups = {}
    for cl in sorted(df_clusters["cluster"].astype(int).unique()):
        fg = folium.FeatureGroup(name=f"Cluster {cl} – points", show=True)
        m.add_child(fg)
        groups[cl] = fg

    for _, r in df_clusters.iterrows():
        cl = int(r["cluster"])
        color = palette[cl % len(palette)]
        popup = folium.Popup(
            f"<b>{r['zone']}</b><br>"
            + "<br>".join(
                [
                    f"{s}: {r[s]:.1f}°C"
                    for s in ["Winter", "Spring", "Summer", "Autumn"]
                    if pd.notna(r[s])
                ]
            ),
            max_width=300,
        )
        folium.CircleMarker(
            location=[float(r["latitude"]), float(r["longitude"])],
            radius=8,
            color="black",
            weight=1,
            fill=True,
            fill_color=color,
            fill_opacity=0.9,
            tooltip=f"Cluster {cl} — {r['zone']}",
            popup=popup,
        ).add_to(groups[cl])

    # Optional convex hulls
    if MultiPoint is not None:
        try:
            for cl in sorted(df_clusters["cluster"].astype(int).unique()):
                sub = df_clusters[df_clusters["cluster"] == cl]
                pts = [
                    (float(x), float(y))
                    for x, y in zip(sub["longitude"], sub["latitude"])
                ]
                if len(pts) >= 3:
                    hull = MultiPoint(pts).convex_hull
                    if hasattr(hull, "exterior"):
                        coords = [(lat, lon) for lon, lat in hull.exterior.coords]
                        folium.Polygon(
                            locations=coords,
                            color=palette[cl % len(palette)],
                            fill=False,
                            weight=2,
                            tooltip=f"Cluster {cl} hull",
                        ).add_to(m)
        except Exception:
            pass

    # NEW: bounding rectangles
    draw_cluster_rectangles(m, df_clusters, palette)

    folium.LayerControl(collapsed=False).add_to(m)
    m.save(html_path)
    log(f"Interactive map saved to: {html_path}")


def main():
    ap = argparse.ArgumentParser(
        description="Cluster zones by seasonal mean temperatures and build an interactive map with cluster envelopes."
    )
    ap.add_argument(
        "--csv", default="dataset_meteo_com_consumo.csv", help="Path to input dataset"
    )
    ap.add_argument(
        "--year", type=int, default=2024, help="Year to aggregate (e.g., 2024)"
    )
    ap.add_argument(
        "--k",
        type=int,
        default=None,
        help="Number of clusters; if omitted, auto-select via silhouette (2..5)",
    )
    ap.add_argument(
        "--out-prefix",
        default=None,
        help="Output prefix (default: clusters_temp_<year>)",
    )
    args = ap.parse_args()

    out_prefix = args.out_prefix or f"clusters_temp_{args.year}"

    log("=== STEP 1: Loading & aggregating seasonal means ===")
    df = pd.read_csv(args.csv)
    per_zone = seasonal_means(df, args.year)
    log(f"Per-zone seasonal means shape: {per_zone.shape}")

    log("=== STEP 2: Clustering ===")
    X = per_zone[SEASONS].to_numpy(dtype=float)
    if np.isnan(X).any():
        col_means = np.nanmean(X, axis=0, keepdims=True)
        inds = np.where(np.isnan(X))
        X[inds] = np.take(col_means, inds[1], axis=1)

    labels, k_chosen, scores = choose_k_and_cluster(X, args.k)
    per_zone["cluster"] = labels.astype(int)
    per_zone["k_used"] = k_chosen

    csv_out = f"{out_prefix}.csv"
    per_zone.to_csv(csv_out, index=False)
    log(f"Clusters CSV saved to: {csv_out}")

    html_out = f"{out_prefix}_map.html"
    build_map(per_zone, html_out)

    log("=== DONE ===")
    log(
        "Columns in clusters CSV: zone, latitude, longitude, Winter, Spring, Summer, Autumn, cluster, k_used"
    )


if __name__ == "__main__":
    main()


usage: ipykernel_launcher.py [-h] [--csv CSV] [--year YEAR] [--k K]
                             [--out-prefix OUT_PREFIX]
ipykernel_launcher.py: error: unrecognized arguments: -f C:\Users\pcata\AppData\Roaming\jupyter\runtime\kernel-4b26b891-6dbe-4a53-bb31-eb2aa26f801b.json


SystemExit: 2

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [10]:
"""
Script: build_temperature_clusters_and_map_v2.py (Versão Corrigida Final v3)
"""

import argparse
from datetime import datetime
import pandas as pd
import numpy as np
import sys 
# Optional deps for map
try:
    import folium
    from shapely.geometry import MultiPoint
except Exception:
    folium = None
    MultiPoint = None

# ML
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

SEASONS = ["Winter", "Spring", "Summer", "Autumn"]


def log(msg: str):
    now = datetime.now().strftime("%H:%M:%S")
    print(f"[{now}] {msg}", flush=True)


def month_to_season(m: int) -> str:
    if m in (12, 1, 2):
        return "Winter"
    elif m in (3, 4, 5):
        return "Spring"
    elif m in (6, 7, 8):
        return "Summer"
    else:
        return "Autumn"


def seasonal_means(df: pd.DataFrame, year: int) -> pd.DataFrame:
    df = df.copy()
    
    if "year" not in df.columns:
        df["year"] = df["date"].dt.year
    df = df[df["year"] == year].copy()
    if "month" not in df.columns:
        df["month"] = df["date"].dt.month

    # Since the columns have already been cleaned in the main, we assume that ‘zone’, ‘latitude’, ‘longitude’, and ‘tmean_c’ exist in lowercase
    req = {"zone", "latitude", "longitude", "tmean_c", "month"}
    missing = [c for c in req if c not in df.columns]
    if missing:
        # If there is an error here, it is because the column has the wrong name in the CSV file.
        raise ValueError(f"A key column is missing after cleaning: {missing}. Verify that the columns ‘zone’, ‘latitude’, ‘longitude’, and ‘tmean_c’ exist in the CSV.")

    df["season"] = df["month"].apply(month_to_season)
    out = (
        df.groupby(["zone", "latitude", "longitude", "season"], as_index=False)[
            "tmean_c"
        ]
        .mean()
        .rename(columns={"tmean_c": "tmean_c_mean"})
    )
    wide = out.pivot_table(
        index=["zone", "latitude", "longitude"], columns="season", values="tmean_c_mean"
    ).reset_index()
    for s in SEASONS:
        if s not in wide.columns:
            wide[s] = np.nan
    wide = wide[["zone", "latitude", "longitude"] + SEASONS]
    return wide


def choose_k_and_cluster(X: np.ndarray, k: int | None):
    best_labels = None
    best_k = None
    scores = {}
    
    if len(X) < 2:
        log("Data set is too small for clustering (less than 2 samples).")
        return np.array([]), 0, {}

    if k is not None:
        model = KMeans(n_clusters=k, n_init=25, random_state=42)
        labels = model.fit_predict(X)
        return labels, k, scores

    log("Selecting K via silhouette in range [2..5] ...")
    best_score = -1
    max_k = min(5, len(X))
    if max_k < 2:
         log("Data set too small for silhouette score calculation.")
         best_k = 1 
         return np.zeros(len(X), dtype=int), 1, {}

    for kk in range(2, max_k + 1):
        try:
            model = KMeans(n_clusters=kk, n_init=25, random_state=42)
            labels = model.fit_predict(X)
            score = silhouette_score(X, labels, metric="euclidean")
            scores[kk] = score
            log(f"  K={kk}: silhouette={score:.3f}")
            if score > best_score:
                best_score = score
                best_k = kk
                best_labels = labels
        except Exception as e:
            log(f"  K={kk}: silhouette failed: {e}")
            
    if best_labels is None:
        # Fallback to K=2 if silhouette fails for all Ks
        best_k = 2
        best_labels = KMeans(n_clusters=2, n_init=25, random_state=42).fit_predict(X)
        
    log(f"Chosen K={best_k}")
    return best_labels, best_k, scores


def draw_cluster_rectangles(m, df_clusters: pd.DataFrame, colors: list[str]):
    for cl in sorted(df_clusters["cluster"].astype(int).unique()):
        sub = df_clusters[df_clusters["cluster"] == cl]
        south = float(sub["latitude"].min())
        north = float(sub["latitude"].max())
        west = float(sub["longitude"].min())
        east = float(sub["longitude"].max())
        color = colors[cl % len(colors)]
        fg = folium.FeatureGroup(name=f"Cluster {cl} – bbox", show=True)
        m.add_child(fg)
        folium.Rectangle(
            bounds=[[south, west], [north, east]],
            color=color,
            weight=2,
            dash_array="5,5",
            fill=True,
            fill_color=color,
            fill_opacity=0.10,
            tooltip=f"Cluster {cl} bounding box",
        ).add_to(fg)


def build_map(df_clusters: pd.DataFrame, html_path: str):
    if folium is None:
        log("Folium not installed; skipping map generation.")
        return
    
    if df_clusters.empty:
        log("Cannot build map: No data for clustering.")
        return

    center_lat = df_clusters["latitude"].mean()
    center_lon = df_clusters["longitude"].mean()

    m = folium.Map(
        location=[center_lat, center_lon],
        zoom_start=6.3,
        control_scale=True,
        tiles="CartoDB positron",
    )

    palette = [
        "red",
        "blue",
        "green",
        "purple",
        "orange",
        "darkred",
        "cadetblue",
        "darkpurple",
    ]

    groups = {}
    for cl in sorted(df_clusters["cluster"].astype(int).unique()):
        fg = folium.FeatureGroup(name=f"Cluster {cl} – points", show=True)
        m.add_child(fg)
        groups[cl] = fg

    for _, r in df_clusters.iterrows():
        cl = int(r["cluster"])
        color = palette[cl % len(palette)]
        popup = folium.Popup(
            f"<b>{r['zone']}</b><br>"
            + "<br>".join(
                [
                    f"{s}: {r[s]:.1f}°C"
                    for s in ["Winter", "Spring", "Summer", "Autumn"]
                    if pd.notna(r[s])
                ]
            ),
            max_width=300,
        )
        folium.CircleMarker(
            location=[float(r["latitude"]), float(r["longitude"])],
            radius=8,
            color="black",
            weight=1,
            fill=True,
            fill_color=color,
            fill_opacity=0.9,
            tooltip=f"Cluster {cl} — {r['zone']}",
            popup=popup,
        ).add_to(groups[cl])

    # Optional convex hulls
    if MultiPoint is not None:
        try:
            for cl in sorted(df_clusters["cluster"].astype(int).unique()):
                sub = df_clusters[df_clusters["cluster"] == cl]
                pts = [
                    (float(x), float(y))
                    for x, y in zip(sub["longitude"], sub["latitude"])
                ]
                if len(pts) >= 3:
                    hull = MultiPoint(pts).convex_hull
                    if hasattr(hull, "exterior"):
                        coords = [(lat, lon) for lon, lat in hull.exterior.coords]
                        folium.Polygon(
                            locations=coords,
                            color=palette[cl % len(palette)],
                            fill=False,
                            weight=2,
                            tooltip=f"Cluster {cl} hull",
                        ).add_to(m)
        except Exception:
            pass

    # NEW: bounding rectangles
    draw_cluster_rectangles(m, df_clusters, palette)

    folium.LayerControl(collapsed=False).add_to(m)
    m.save(html_path)
    log(f"Interactive map saved to: {html_path}")


def main():
    ap = argparse.ArgumentParser(
        description="Cluster zones by seasonal mean temperatures and build an interactive map with cluster envelopes."
    )
    ap.add_argument(
        "--csv", default="dataset_meteo_com_consumo.csv", help="Path to input dataset"
    )
    ap.add_argument(
        "--year", type=int, default=2024, help="Year to aggregate (e.g., 2024)"
    )
    ap.add_argument(
        "--k",
        type=int,
        default=None,
        help="Number of clusters; if omitted, auto-select via silhouette (2..5)",
    )
    ap.add_argument(
        "--out-prefix",
        default=None,
        help="Output prefix (default: clusters_temp_<year>)",
    )
    
    # CRITICAL CORRECTION FOR EXECUTION IN JUPYTER (resolves the ‘-f’ error)
    if 'ipykernel' in sys.modules:
        if len(sys.argv) > 1 and sys.argv[1].startswith('-f'):
             sys.argv = [sys.argv[0]]
             
    args = ap.parse_args()

    out_prefix = args.out_prefix or f"clusters_temp_{args.year}"

    log("=== STEP 1: Loading & aggregating seasonal means ===")
    
    # 1. Reading CSV (Assuming separator ‘,’)
    df = pd.read_csv(args.csv, sep=',')
    
    # 2. LAST CRITICAL ADJUSTMENT: Standardizes and cleans columns
    df.columns = df.columns.str.lower().str.strip() 
    
    # DIAGNOSTICS: Prints the columns so we can see exactly what is in the DataFrame
    log(f"Columns read (clean): {df.columns.tolist()}")

    # 3. FINAL CHECK of the Column 'date'
    if 'date' not in df.columns:
        if 'data' in df.columns:
             df.rename(columns={'data': 'date'}, inplace=True)
             log("Column ‘date’ found and renamed to 'date'.")
        else:
             raise KeyError(f"The date column was not found. ‘date’ (or ‘data’) was expected. Available columns: {df.columns.tolist()}")


    # 4. ✅ FINAL FILTER ADJUSTMENT: Use dayfirst=True for DD/MM/YYYY Portuguese format
    # This solves the ‘NaT’ problem that was filtering all lines..
    df["date"] = pd.to_datetime(df["date"], dayfirst=True, errors="coerce")
    data_limite = pd.to_datetime("2025-09-30")
    
    # Removes lines where the date failed conversion (now more robust)
    df.dropna(subset=['date'], inplace=True)
    
    df_original_len = len(df)
    # Filtro metodológico
    df = df[df["date"] <= data_limite].copy()
    
    if len(df) == 0:
        log("CRITICAL DATA ERROR: The filter resulted in 0 rows. Please verify that the dates in your CSV are not all AFTER 2025-09-30, or that the actual format is ‘MM/DD/YYYY’ (remove ‘dayfirst=True’).")
        return
    
    log(f"Filtered data: {df_original_len} -> {len(df)} linhas (Corte: 2025-09-30)")


    # Now that the DataFrame is clean and has ‘date’ in datetime format, we can use it.
    per_zone = seasonal_means(df, args.year)
    log(f"Per-zone seasonal means shape: {per_zone.shape}")
    
    if per_zone.empty:
        log("There is no average temperature data per zone for the selected year and filter.")
        return

    log("=== STEP 2: Clustering ===")
    # Note: X should now have 4 columns (SEASONS) and N rows (ZONES > 0)
    X = per_zone[SEASONS].to_numpy(dtype=float)
    if np.isnan(X).any():
        col_means = np.nanmean(X, axis=0, keepdims=True)
        inds = np.where(np.isnan(X))
        X[inds] = np.take(col_means, inds[1], axis=1)

    labels, k_chosen, scores = choose_k_and_cluster(X, args.k)
    
    if len(labels) > 0:
        per_zone["cluster"] = labels.astype(int)
        per_zone["k_used"] = k_chosen
    else:
        log("Clustering skipped: There are no valid samples after cleaning NaNs.")
        return
        
    csv_out = f"{out_prefix}.csv"
    per_zone.to_csv(csv_out, index=False)
    log(f"Clusters CSV saved to: {csv_out}")

    html_out = f"{out_prefix}_map.html"
    build_map(per_zone, html_out)

    log("=== DONE ===")
    log(
        "Columns in clusters CSV: zone, latitude, longitude, Winter, Spring, Summer, Autumn, cluster, k_used"
    )


if __name__ == "__main__":
    main()

[03:19:37] === STEP 1: Loading & aggregating seasonal means ===
[03:19:38] Columns read (clean): ['time', 'tmax_c', 'tmin_c', 'tmean_c', 'precip_mm', 'wind_speed_max', 'wind_gusts_max', 'rad_solar', 'sunshine_sec', 'humidade_relativa', 'nebulosidade_media', 'sunrise', 'sunset', 'date', 'zone', 'latitude', 'longitude', 'hdd18', 'cdd22', 'amp_termica', 'day_length_hours', 'dow', 'is_weekend', 'month', 'year', 'is_holiday', 'is_dst', 'consumo_gwh']
[03:19:38] Filtered data: 83862 -> 82404 lines (Cut: 2025-09-30)
[03:19:38] Per-zone seasonal means shape: (54, 7)
[03:19:38] === STEP 2: Clustering ===
[03:19:38] Selecting K via silhouette in range [2..5] ...
[03:19:41]   K=2: silhouette=0.390
[03:19:42]   K=3: silhouette=0.428
[03:19:42]   K=4: silhouette=0.431
[03:19:42]   K=5: silhouette=0.428
[03:19:42] Chosen K=4
[03:19:42] Clusters CSV saved to: clusters_temp_2024.csv
[03:19:42] Folium not installed; skipping map generation.
[03:19:42] === DONE ===
[03:19:42] Columns in clusters CSV: zo