In [1]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Script: feature_analysis_pack.py
Purpose:
    Run a consistent "feature analysis" on 1..N CSV datasets (national, zonal, consolidated),
    focusing on data integrity, descriptive statistics, and feature relevance w.r.t. daily
    national consumption -- without training predictive models.
"""

import argparse
from datetime import datetime
from pathlib import Path
from typing import List, Optional, Tuple

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Module-level switch for the LaTeX export. main() overwrites it from the
# --latex command-line flag, and process_dataset() reads it to decide whether
# to call export_key_tables_to_latex() after the CSV outputs are written.
LATEX_EXPORT = False


# ------------------------------
# LaTeX helpers
# ------------------------------
def ensure_dir(d: Path):
    """Create directory ``d`` together with any missing parents.

    An already existing directory is left untouched and is not an error.
    """
    d.mkdir(exist_ok=True, parents=True)


def df_to_latex(df: pd.DataFrame, out_tex: Path, caption: str, label: str):
    """Write ``df`` to ``out_tex`` as a plain LaTeX table.

    Float columns are rounded to 4 decimals on a copy (the caller's frame is
    not modified), the index is omitted and cell contents are LaTeX-escaped.
    The export is best-effort: any failure is logged and swallowed so that a
    broken table never aborts the analysis run.

    Args:
        df: Table to export.
        out_tex: Destination ``.tex`` file; its parent directory is created.
        caption: Caption passed through to ``DataFrame.to_latex``.
        label: Label passed through to ``DataFrame.to_latex``.
    """
    try:
        ensure_dir(out_tex.parent)

        rounded = df.copy()
        float_columns = [
            name for name in rounded.columns
            if pd.api.types.is_float_dtype(rounded[name])
        ]
        for name in float_columns:
            rounded[name] = rounded[name].astype(float).round(4)

        latex_options = dict(
            index=False,
            escape=True,
            longtable=False,
            bold_rows=False,
            caption=caption,
            label=label,
        )
        table_body = rounded.to_latex(**latex_options)
        # "%" starts a LaTeX comment, so the banner is invisible in the document.
        banner = "%% Auto-generated by feature_analysis_pack.py -- do not edit by hand\n"
        out_tex.write_text(banner + table_body, encoding="utf-8")
        log(f"Saved LaTeX: {out_tex}")
    except Exception as e:
        log(f"LaTeX export failed for {out_tex.name}: {e}")


def export_key_tables_to_latex(out_dir: Path, ds_name: str):
    """Export the CSV summaries found in ``out_dir`` as LaTeX tables.

    For every known CSV that exists, a ``.tex`` file with the same stem is
    written to ``out_dir / "latex"`` through ``df_to_latex``; CSVs that are
    missing are logged and skipped. The feature correlation matrix is
    additionally exported in long format as ``corr_matrix_long.tex``.

    Args:
        out_dir: Directory holding the CSV outputs of one dataset.
        ds_name: Dataset name, used in the captions and in the
            ``tab:<ds_name>_<suffix>`` labels.
    """
    latex_dir = out_dir / "latex"
    ensure_dir(latex_dir)

    # (CSV file name, caption text, label suffix)
    tables = [
        ("schema_columns.csv", "Schema (columns and dtypes)", "schema"),
        ("missingness_by_column.csv", "Missingness by column (%)", "missingness"),
        ("date_coverage.csv", "Date coverage (min/max)", "date_coverage"),
        ("date_rows_per_year.csv", "Rows per year", "rows_year"),
        ("date_rows_per_year_month.csv", "Rows per year-month", "rows_year_month"),
        ("integrity_checks.csv", "Integrity checks summary", "integrity"),
        (
            "numeric_descriptives.csv",
            "Descriptive statistics of numeric features",
            "descriptives",
        ),
        ("outlier_iqr_summary.csv", "Outlier summary by IQR rule", "outliers"),
        ("correlations_pearson.csv", "Pearson correlation with consumption", "pearson"),
        (
            "correlations_spearman.csv",
            "Spearman correlation with consumption",
            "spearman",
        ),
        (
            "partial_corr_month.csv",
            "Partial correlation (controlling for month)",
            "partial_corr",
        ),
        ("mutual_information.csv", "Mutual information with consumption", "mi"),
        ("corr_matrix.csv", "Feature correlation matrix (flattened)", "corr_matrix"),
    ]
    for csv_name, title, label_suffix in tables:
        csv_path = out_dir / csv_name
        if not csv_path.exists():
            log(f"(skip) CSV not found for LaTeX: {csv_name}")
            continue
        df_to_latex(
            pd.read_csv(csv_path),
            latex_dir / csv_path.with_suffix(".tex").name,
            f"{title} — {ds_name}.",
            f"tab:{ds_name}_{label_suffix}",
        )

    # Long-format (feature, feature_2, pearson) version of the correlation
    # matrix. The matrix CSV normally carries a leading "feature" column; if it
    # does not, the first column is taken as the row labels.
    corr_path = out_dir / "corr_matrix.csv"
    if not corr_path.exists():
        return
    try:
        corr_df = pd.read_csv(corr_path)
        if "feature" not in corr_df.columns:
            corr_df = corr_df.rename(columns={corr_df.columns[0]: "feature"})
        long = corr_df.melt(
            id_vars=["feature"], var_name="feature_2", value_name="pearson"
        )
        df_to_latex(
            long,
            latex_dir / "corr_matrix_long.tex",
            f"Feature correlation matrix (long format) — {ds_name}.",
            f"tab:{ds_name}_corr_matrix_long",
        )
    except Exception as e:
        log(f"LaTeX export (corr matrix long) failed: {e}")


def log(msg: str) -> None:
    """Print ``msg`` to stdout prefixed with the current wall-clock time as ``[HH:MM:SS]``.

    The stream is flushed immediately so progress lines show up as they happen.
    """
    timestamp = f"{datetime.now():%H:%M:%S}"
    print("[" + timestamp + "] " + f"{msg}", flush=True)


def read_csv_robust(path: Path, min_cols: int = 6, min_rows: int = 100) -> pd.DataFrame:
    """Read a CSV whose encoding and separator are not known in advance.

    Every combination of encoding (utf-8, latin1) and separator (comma,
    semicolon, tab) is tried in that order. The first parse with at least
    ``min_cols`` columns and ``min_rows`` rows is returned; the size check is
    what rejects a wrong separator, which typically collapses the file into a
    single column.

    Args:
        path: CSV file to read.
        min_cols: Minimum number of columns for a parse to be accepted.
        min_rows: Minimum number of rows for a parse to be accepted.

    Returns:
        The first DataFrame that satisfies both thresholds.

    Raises:
        RuntimeError: If no combination yields an acceptable frame. The message
            reports the widest shape that did parse and/or the last read error.
    """
    last_err = None
    best_shape = None
    for enc in ("utf-8", "latin1"):
        for sep in (",", ";", "\t"):
            try:
                df = pd.read_csv(path, encoding=enc, sep=sep)
            except Exception as e:
                last_err = e
                continue
            if df.shape[1] >= min_cols and df.shape[0] >= min_rows:
                log(f"Parsed CSV with enc={enc} sep='{sep}' shape={df.shape}")
                return df
            # Remember the widest rejected parse for the error message.
            if best_shape is None or df.shape[1] > best_shape[1]:
                best_shape = df.shape

    if best_shape is None:
        raise RuntimeError(f"Could not parse CSV '{path}'. Last error: {last_err}")
    detail = (
        f"no encoding/separator combination produced at least {min_rows} rows "
        f"and {min_cols} columns (best shape seen: {best_shape})"
    )
    if last_err is not None:
        detail += f"; last read error: {last_err}"
    raise RuntimeError(f"Could not parse CSV '{path}': {detail}")


def normalize_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` whose column names are stripped, lower-cased and snake-cased.

    Surrounding whitespace is removed first and every remaining space becomes
    an underscore. The input frame is left unchanged.
    """

    def _clean(name):
        return name.strip().lower().replace(" ", "_")

    renamed = df.copy()
    renamed.columns = list(map(_clean, renamed.columns))
    return renamed


def detect_columns(
    df: pd.DataFrame,
) -> Tuple[Optional[str], Optional[str], Optional[str]]:
    """Locate the date, zone and consumption columns of a normalized frame.

    Column names are expected to be already lower-cased (see
    ``normalize_columns``).

    Returns:
        ``(date_col, zone_col, consumo_col)``; each entry is ``None`` when no
        matching column exists.
        * date: the first column named exactly "date" or "data", otherwise the
          first column whose name contains either word.
        * zone / consumption: the first name of a fixed priority list that is
          present in the frame.
    """
    columns = list(df.columns)

    date_col = next((c for c in columns if c in ("date", "data")), None)
    if date_col is None:
        date_col = next((c for c in columns if "date" in c or "data" in c), None)

    zone_names = ("zone", "zona", "municipio", "município", "concelho")
    zone_col = next((name for name in zone_names if name in columns), None)

    consumo_names = (
        "consumo_gwh",
        "consumo",
        "target",
        "gwh",
        "consumo_(gwh)",
        "consumo_diario_gwh",
    )
    consumo_col = next((name for name in consumo_names if name in columns), None)

    return date_col, zone_col, consumo_col


def coerce_dates(df: pd.DataFrame, date_col: str) -> pd.DataFrame:
    """Return a copy of ``df`` with ``date_col`` parsed to datetimes.

    Values are parsed day-first with unparseable entries turned into ``NaT``.
    If that yields no valid date at all, the ORIGINAL values are parsed a
    second time with pandas' default settings. The raw column is kept aside
    for this: re-parsing the already coerced all-``NaT`` column could never
    recover anything.

    Args:
        df: Input frame (not modified).
        date_col: Name of the column to convert.
    """
    out = df.copy()
    raw = out[date_col]
    parsed = pd.to_datetime(raw, errors="coerce", dayfirst=True)
    if parsed.isna().all():
        parsed = pd.to_datetime(raw, errors="coerce")
    out[date_col] = parsed
    return out


def derive_meteo_numeric(df: pd.DataFrame) -> Tuple[pd.DataFrame, List[str]]:
    """Coerce the known meteorological columns to numeric and add derived hour columns.

    Works on a copy of ``df``. Every known column that is present is converted
    with ``pd.to_numeric(errors="coerce")``. Two aliases are then derived when
    they do not exist yet: ``sunshine_h`` (``sunshine_sec`` / 3600) and
    ``day_length_h`` (numeric copy of ``day_length_hours``).

    Returns:
        ``(frame, columns)`` where ``columns`` lists the known columns found,
        in catalogue order, followed by any derived column that was added.
    """
    known_columns = (
        "tmean_c",
        "tmax_c",
        "tmin_c",
        "hdd18",
        "cdd22",
        "amp_termica",
        "precip_mm",
        "rad_solar",
        "sunshine_sec",
        "humidade_relativa",
        "nebulosidade_media",
        "wind_speed_max",
        "wind_gusts_max",
        "day_length_hours",
        "sunshine_h",
        "day_length_h",
        "relative_humidity_2m_mean",
        "cloudcover_mean",
    )
    out = df.copy()

    found = []
    for name in known_columns:
        if name not in out.columns:
            continue
        out[name] = pd.to_numeric(out[name], errors="coerce")
        found.append(name)

    has_seconds = "sunshine_sec" in out.columns
    if has_seconds and "sunshine_h" not in out.columns:
        seconds = pd.to_numeric(out["sunshine_sec"], errors="coerce")
        out["sunshine_h"] = seconds / 3600.0
        found.append("sunshine_h")

    has_hours = "day_length_hours" in out.columns
    if has_hours and "day_length_h" not in out.columns:
        out["day_length_h"] = pd.to_numeric(out["day_length_hours"], errors="coerce")
        found.append("day_length_h")

    return out, found


def expand_with_aggregated_columns(
    df: pd.DataFrame, meteo_cols: List[str]
) -> List[str]:
    """Extend ``meteo_cols`` with the aggregate variants present in ``df``.

    Aggregates are the ``<base>_mean`` columns of the national view (e.g.
    ``tmean_c_mean``, ``hdd18_mean``) plus ``tmean_c_min`` / ``tmean_c_max`` /
    ``tmean_c_std``.

    Returns:
        ``meteo_cols`` in their given order, followed by the aggregate columns
        found, without duplicates. The order is deterministic so that
        downstream tables and plots are reproducible between runs.
    """
    bases = (
        "tmean_c",
        "tmax_c",
        "tmin_c",
        "hdd18",
        "cdd22",
        "amp_termica",
        "precip_mm",
        "rad_solar",
        "sunshine_h",
        "humidade_relativa",
        "nebulosidade_media",
        "wind_speed_max",
        "wind_gusts_max",
        "day_length_h",
    )
    candidates = [f"{base}_mean" for base in bases]
    candidates += [f"tmean_c{suffix}" for suffix in ("_min", "_max", "_std")]
    found = [name for name in candidates if name in df.columns]
    # dict.fromkeys removes duplicates while keeping first-seen order.
    return list(dict.fromkeys([*meteo_cols, *found]))


def save_csv(df: pd.DataFrame, path: Path) -> None:
    """Write ``df`` to ``path`` as CSV without the index and log the absolute path and row count."""
    df.to_csv(path, index=False)
    n_rows = len(df)
    absolute = path.resolve()
    log(f"Saved: {absolute} (rows={n_rows:,})")


def plot_line(x, y, title: str, out_path: Path, xlabel: str = "", ylabel: str = ""):
    """Draw ``y`` against ``x`` as a line chart and save it to ``out_path`` (140 dpi).

    The figure is closed after saving so repeated calls do not accumulate
    open figures.
    """
    fig, ax = plt.subplots()
    ax.plot(x, y)
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    fig.tight_layout()
    fig.savefig(out_path, dpi=140)
    plt.close(fig)


def plot_scatter(x, y, title: str, out_path: Path, xlabel: str = "", ylabel: str = ""):
    """Draw a scatter plot of ``y`` against ``x`` (marker size 9) and save it to ``out_path`` (140 dpi).

    The figure is closed after saving.
    """
    fig, ax = plt.subplots()
    ax.scatter(x, y, s=9)
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    fig.tight_layout()
    fig.savefig(out_path, dpi=140)
    plt.close(fig)


def plot_heatmap(mat: np.ndarray, labels: List[str], title: str, out_path: Path):
    """Render the matrix ``mat`` as an image with a colour bar and save it to ``out_path`` (140 dpi).

    ``labels`` are used as tick labels on both axes (rotated 90 degrees on the
    x axis). The figure is closed after saving.
    """
    fig, ax = plt.subplots()
    image = ax.imshow(mat, aspect="auto")
    fig.colorbar(image, ax=ax)
    positions = range(len(labels))
    ax.set_xticks(positions)
    ax.set_xticklabels(labels, rotation=90)
    ax.set_yticks(positions)
    ax.set_yticklabels(labels)
    ax.set_title(title)
    fig.tight_layout()
    fig.savefig(out_path, dpi=140)
    plt.close(fig)


def plot_bar(x, y, title: str, out_path: Path, xlabel: str = "", ylabel: str = ""):
    """Draw a bar chart with heights ``y`` at positions ``x`` and save it to ``out_path`` (140 dpi).

    The figure is closed after saving.
    """
    fig, ax = plt.subplots()
    ax.bar(x, y)
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    fig.tight_layout()
    fig.savefig(out_path, dpi=140)
    plt.close(fig)


# ===============================================================
# INTEGRITY CHECKS
# ---------------------------------------------------------------
# This part checks that the data is complete and consistent:
# - Column schema (names and dtypes)
# - Missing values per column
# - Temporal coverage (min/max date)
# - Year/month record counts
# - Duplicate keys (date, or date + zone) and number of zones per day
# ===============================================================


def integrity_checks(
    df: pd.DataFrame, date_col: str, zone_col: Optional[str], out_dir: Path
) -> None:
    """Write the data-integrity CSV reports for one dataset into ``out_dir``.

    Files written:
        * ``schema_columns.csv`` - column names and dtypes.
        * ``missingness_by_column.csv`` - missing count and percentage per
          column, sorted by percentage (descending).
        * ``date_coverage.csv``, ``date_rows_per_year.csv``,
          ``date_rows_per_year_month.csv`` - only when at least one date is
          valid.
        * ``zones_per_day.csv`` - distinct zones per date (only with a zone
          column).
        * ``integrity_checks.csv`` - one ``(metric, value)`` row per check: the
          number of duplicated keys (date, or date + zone) and, with a zone
          column, the ``describe()`` statistics of zones per day.

    Args:
        df: Dataset with ``date_col`` already converted to datetimes.
        date_col: Name of the date column.
        zone_col: Name of the zone column, or ``None`` for datasets without zones.
        out_dir: Existing output directory.
    """
    schema = pd.DataFrame(
        {"column": df.columns, "dtype": [str(df[c].dtype) for c in df.columns]}
    )
    save_csv(schema, out_dir / "schema_columns.csv")

    miss = pd.DataFrame(
        {
            "column": df.columns,
            "missing_count": [df[c].isna().sum() for c in df.columns],
            "missing_pct": [100.0 * df[c].isna().mean() for c in df.columns],
        }
    ).sort_values("missing_pct", ascending=False)
    save_csv(miss, out_dir / "missingness_by_column.csv")

    dates = df[date_col].dropna()
    if not dates.empty:
        coverage = pd.DataFrame(
            [
                ("min_date", str(dates.min().date())),
                ("max_date", str(dates.max().date())),
            ],
            columns=["metric", "value"],
        )
        year = dates.dt.year.rename("year")
        month = dates.dt.month.rename("month")
        years = dates.groupby(year).size().reset_index(name="rows")
        months = dates.groupby([year, month]).size().reset_index(name="rows")
        save_csv(coverage, out_dir / "date_coverage.csv")
        save_csv(years, out_dir / "date_rows_per_year.csv")
        save_csv(months, out_dir / "date_rows_per_year_month.csv")

    # All checks share one (metric, value) layout so the CSV has no
    # half-empty columns.
    if zone_col:
        dup_key = df.duplicated(subset=[date_col, zone_col]).sum()
        checks = [("duplicates_date_zone", int(dup_key))]
        zones_day = (
            df.groupby(date_col)[zone_col].nunique().reset_index(name="zones_per_day")
        )
        save_csv(zones_day, out_dir / "zones_per_day.csv")
        for stat, value in zones_day["zones_per_day"].describe().items():
            checks.append((f"zones_per_day_{stat}", value))
    else:
        dup_key = df.duplicated(subset=[date_col]).sum()
        checks = [("duplicates_date", int(dup_key))]

    integrity = pd.DataFrame(checks, columns=["metric", "value"])
    save_csv(integrity, out_dir / "integrity_checks.csv")


# ===============================================================
# DESCRIPTIVE STATISTICS
# ---------------------------------------------------------------
# Computes central tendency, dispersion, skewness, kurtosis, and detects outliers via IQR.
# 🧩 EXTEND HERE: Add visualization of feature distributions or boxplots for key variables.
# ===============================================================


def feature_statistics(df: pd.DataFrame, out_dir: Path) -> List[str]:
    """Write descriptive statistics and an IQR outlier summary for the numeric columns.

    ``numeric_descriptives.csv`` holds ``describe()`` plus skewness and
    kurtosis per feature. ``outlier_iqr_summary.csv`` holds the percentage of
    non-missing values below ``Q1 - 1.5 * IQR`` and above ``Q3 + 1.5 * IQR``;
    features with fewer than 10 non-missing values get NaN for both. When the
    frame has no numeric column, both files are written with headers only.

    Returns:
        The names of the numeric (non-boolean) columns that were analysed.
    """
    outlier_header = ["feature", "pct_below_fence", "pct_above_fence"]

    # Select only numeric, non-boolean columns (bool breaks the percentile computation).
    numeric_cols = [
        name
        for name in df.columns
        if pd.api.types.is_numeric_dtype(df[name])
        and not pd.api.types.is_bool_dtype(df[name])
    ]
    if len(numeric_cols) == 0:
        save_csv(
            pd.DataFrame(columns=["feature"]), out_dir / "numeric_descriptives.csv"
        )
        save_csv(pd.DataFrame(columns=outlier_header), out_dir / "outlier_iqr_summary.csv")
        return []

    numeric = df[numeric_cols]
    summary = numeric.describe().T
    summary["skew"] = numeric.skew(numeric_only=True)
    summary["kurtosis"] = numeric.kurtosis(numeric_only=True)

    records = []
    for name in numeric_cols:
        values = df[name].dropna()
        if len(values) >= 10:
            lower_q, upper_q = np.percentile(values, [25, 75])
            spread = upper_q - lower_q
            low_fence = lower_q - 1.5 * spread
            high_fence = upper_q + 1.5 * spread
            share_below = 100.0 * (values < low_fence).mean()
            share_above = 100.0 * (values > high_fence).mean()
            records.append((name, share_below, share_above))
        else:
            # Too few observations for meaningful quartiles.
            records.append((name, np.nan, np.nan))
    outliers = pd.DataFrame(records, columns=outlier_header)

    save_csv(
        summary.reset_index().rename(columns={"index": "feature"}),
        out_dir / "numeric_descriptives.csv",
    )
    save_csv(outliers, out_dir / "outlier_iqr_summary.csv")
    return numeric_cols


def partial_corr_month(
    df: pd.DataFrame, y_col: str, x_cols: List[str], date_col: str
) -> pd.DataFrame:
    """Correlate each feature with ``y_col`` after removing month-of-year effects.

    Target and feature are each regressed on an intercept plus month dummy
    variables (first month dropped); the partial correlation is the Pearson
    correlation of the two residual series. For every feature only the rows
    where both feature and target are finite are used, so missing values in
    one feature neither invalidate its own result nor affect other features.

    Args:
        df: Frame with a datetime ``date_col``.
        y_col: Target column.
        x_cols: Feature columns; names absent from ``df`` are skipped.
        date_col: Datetime column providing the month.

    Returns:
        DataFrame with columns ``feature`` and ``partial_corr_month``. The
        value is NaN when 5 or fewer complete rows exist or a residual series
        is constant. The frame is empty when ``y_col`` is missing or no row
        has both target and date.
    """
    result_columns = ["feature", "partial_corr_month"]
    if y_col not in df.columns:
        return pd.DataFrame(columns=result_columns)
    # Drop rows with missing target OR missing date (avoid NaT -> NaN month)
    d = df.dropna(subset=[y_col, date_col])
    if d.empty:
        return pd.DataFrame(columns=result_columns)

    month_dummies = pd.get_dummies(d[date_col].dt.month, prefix="m", drop_first=True)
    design = np.hstack(
        [np.ones((len(d), 1)), month_dummies.to_numpy(dtype=float)]
    )

    def residualize(target: np.ndarray, X: np.ndarray) -> np.ndarray:
        """Return ``target`` minus its least-squares fit on ``X``."""
        try:
            # The pseudo-inverse keeps this defined when a month has no rows
            # in the current subset (all-zero dummy column).
            beta = np.linalg.pinv(X.T @ X) @ (X.T @ target)
        except np.linalg.LinAlgError:
            return target
        return target - (X @ beta)

    y = d[y_col].astype(float).to_numpy()

    rows = []
    for c in x_cols:
        if c not in d.columns:
            continue
        x = d[c].astype(float).to_numpy()
        complete = np.isfinite(x) & np.isfinite(y)
        r = np.nan
        if complete.sum() > 5:
            x_resid = residualize(x[complete], design[complete])
            y_resid = residualize(y[complete], design[complete])
            if np.std(x_resid) > 0 and np.std(y_resid) > 0:
                r = float(np.corrcoef(y_resid, x_resid)[0, 1])
        rows.append((c, r))
    return pd.DataFrame(rows, columns=result_columns)


def feature_relevance(
    df: pd.DataFrame,
    date_col: str,
    consumo_col: Optional[str],
    meteo_cols: List[str],
    out_dir: Path,
) -> None:
    """Measure how the meteo features relate to consumption and write the results.

    Outputs in ``out_dir`` (rows without a consumption value are ignored):
        * ``correlations_pearson.csv`` / ``correlations_spearman.csv``
        * ``partial_corr_month.csv`` - correlation after removing month effects
        * ``mutual_information.csv`` - requires scikit-learn; computed on rows
          where the target and all usable features are finite
        * ``lag_ccf_basic.csv`` - correlation between consumption and each
          temperature-type feature lagged by 0..7 calendar days
        * ``charts/scatter_consumo_vs_<feature>.png`` for the lagged features

    The function returns early (with a log line) when there is no consumption
    column or none of ``meteo_cols`` exists in ``df``.

    Args:
        df: Dataset with a datetime ``date_col``.
        date_col: Name of the date column.
        consumo_col: Name of the consumption column, or ``None``.
        meteo_cols: Candidate feature columns.
        out_dir: Existing output directory.
    """
    if consumo_col is None or consumo_col not in df.columns:
        log("No consumption column found; skipping relevance vs consumo.")
        return

    if not meteo_cols:
        meteo_cols = expand_with_aggregated_columns(df, meteo_cols)

    cols = [c for c in meteo_cols if c in df.columns]
    if not cols:
        log("No meteo features found; skipping relevance.")
        return

    d = df[[date_col, consumo_col] + cols].dropna(subset=[consumo_col]).copy()

    corr_input = d[[consumo_col] + cols]
    for method in ("pearson", "spearman"):
        with_target = corr_input.corr(method=method)[consumo_col].dropna()
        save_csv(
            with_target.reset_index().rename(
                columns={"index": "feature", consumo_col: method}
            ),
            out_dir / f"correlations_{method}.csv",
        )

    pc = partial_corr_month(d, consumo_col, cols, date_col)
    save_csv(pc, out_dir / "partial_corr_month.csv")

    try:
        from sklearn.feature_selection import mutual_info_regression

        # The estimator rejects NaN/inf, so entirely empty features are left
        # out and only complete rows are used.
        usable = [c for c in cols if d[c].notna().any()]
        mi_frame = (
            d[[consumo_col] + usable]
            .astype(float)
            .replace([np.inf, -np.inf], np.nan)
            .dropna()
        )
        if usable and len(mi_frame) > 0:
            mi = mutual_info_regression(
                mi_frame[usable].values, mi_frame[consumo_col].values, random_state=0
            )
            mi_df = pd.DataFrame({"feature": usable, "mutual_information": mi})
            save_csv(
                mi_df.sort_values("mutual_information", ascending=False),
                out_dir / "mutual_information.csv",
            )
        else:
            log("No complete rows for mutual information; skipping.")
    except Exception as e:
        log(f"sklearn not available or MI failed ({e}); skipping mutual information.")

    # Lags: accept either the base column or its "<base>_mean" aggregate.
    lag_candidates = []
    for base in ["tmean_c", "hdd18", "cdd22"]:
        if base in cols:
            lag_candidates.append(base)
        elif f"{base}_mean" in cols:
            lag_candidates.append(f"{base}_mean")

    # One value per calendar date (mean over repeated dates, e.g. several
    # zones per day). Shifting the date index itself makes "lag" mean calendar
    # days even when dates are missing or repeated.
    daily = (
        d.dropna(subset=[date_col])
        .groupby(date_col)[[consumo_col] + lag_candidates]
        .mean()
    )
    target = daily[consumo_col].astype(float)
    rows = []
    for c in lag_candidates:
        feature = daily[c].astype(float)
        for lag in range(0, 8):
            r = target.corr(feature.shift(lag, freq="D"))
            rows.append((c, lag, r))
    save_csv(
        pd.DataFrame(rows, columns=["feature", "lag_days", "corr"]),
        out_dir / "lag_ccf_basic.csv",
    )

    charts = out_dir / "charts"
    charts.mkdir(parents=True, exist_ok=True)
    for c in lag_candidates:
        merged = d[[date_col, consumo_col, c]].dropna().sort_values(date_col)
        plot_scatter(
            merged[c],
            merged[consumo_col],
            f"Consumption vs {c}",
            charts / f"scatter_consumo_vs_{c}.png",
            xlabel=c,
            ylabel=consumo_col,
        )


def collinearity_and_structure(
    df: pd.DataFrame, meteo_cols: List[str], out_dir: Path
) -> None:
    """Describe the redundancy among the meteo features.

    Outputs in ``out_dir`` (needs at least two of ``meteo_cols`` in ``df``):
        * ``corr_matrix.csv`` and ``charts/corr_matrix_heatmap.png`` - Pearson
          correlation matrix.
        * ``vif.csv`` - variance inflation factor per feature, from an OLS of
          each standardized feature on all others (requires statsmodels and
          more than 50 complete rows). R^2 >= 0.9999 is reported as ``inf``.
        * ``pca_explained.csv`` - explained variance ratio per principal
          component (requires scikit-learn and more than 100 complete rows).

    Skipped steps are reported through ``log``.

    Args:
        df: Dataset containing the feature columns.
        meteo_cols: Candidate feature columns; names absent from ``df`` are ignored.
        out_dir: Existing output directory.
    """
    charts = out_dir / "charts"
    charts.mkdir(parents=True, exist_ok=True)

    cols = [c for c in meteo_cols if c in df.columns]
    if len(cols) < 2:
        log("Fewer than two meteo features available; skipping collinearity analysis.")
        return

    corr = df[cols].corr(method="pearson")
    save_csv(
        corr.reset_index().rename(columns={"index": "feature"}),
        out_dir / "corr_matrix.csv",
    )
    plot_heatmap(
        corr.values,
        cols,
        "Feature Correlation Matrix",
        charts / "corr_matrix_heatmap.png",
    )

    # Complete, finite rows only; VIF and PCA share this standardized matrix.
    complete = df[cols].dropna().astype(float)
    complete = complete.replace([np.inf, -np.inf], np.nan).dropna()
    n_rows = complete.shape[0]
    # The small epsilon keeps constant columns from dividing by zero.
    Z = (complete - complete.mean()) / (complete.std(ddof=0) + 1e-9)

    if n_rows > 50:
        try:
            import statsmodels.api as sm

            vif_rows = []
            for i, col in enumerate(cols):
                y = Z[col].values
                others = sm.add_constant(np.delete(Z.values, i, axis=1))
                r2 = sm.OLS(y, others).fit().rsquared
                vif = 1.0 / (1.0 - r2) if r2 < 0.9999 else np.inf
                vif_rows.append((col, vif))
            vif_df = pd.DataFrame(vif_rows, columns=["feature", "vif"])
            save_csv(vif_df.sort_values("vif", ascending=False), out_dir / "vif.csv")
        except Exception as e:
            log(f"statsmodels not available or VIF failed ({e}); skipping VIF.")
    else:
        log(f"Only {n_rows} complete rows (need more than 50); skipping VIF.")

    if n_rows > 100:
        try:
            from sklearn.decomposition import PCA

            pca = PCA()
            pca.fit(Z)
            ratios = pca.explained_variance_ratio_
            explained = pd.DataFrame(
                {
                    "component": [f"PC{i+1}" for i in range(len(ratios))],
                    "explained_variance_ratio": ratios,
                }
            )
            save_csv(explained, out_dir / "pca_explained.csv")
        except Exception as e:
            log(f"sklearn not available or PCA failed ({e}); skipping PCA.")
    else:
        log(f"Only {n_rows} complete rows (need more than 100); skipping PCA.")


def quick_timeseries(
    df: pd.DataFrame, date_col: str, consumo_col: Optional[str], out_dir: Path
) -> None:
    """Save the basic time-series charts of consumption into ``out_dir / "charts"``.

    Charts: the raw series over time, the mean by calendar month, the mean by
    day of week (0 = Monday) and the autocorrelation function up to lag 60.
    Nothing is written when the consumption column is missing or no row has
    both a date and a consumption value.

    The ACF is computed on one value per calendar day (mean over repeated
    dates, missing days left as gaps) so that a lag of k always means k days;
    products involving a missing day are ignored.

    Args:
        df: Dataset with a datetime ``date_col``.
        date_col: Name of the date column.
        consumo_col: Name of the consumption column, or ``None``.
        out_dir: Existing output directory.
    """
    if consumo_col is None or consumo_col not in df.columns:
        return
    charts = out_dir / "charts"
    charts.mkdir(parents=True, exist_ok=True)

    d = df[[date_col, consumo_col]].dropna().sort_values(date_col)
    if d.empty:
        return

    plot_line(
        d[date_col],
        d[consumo_col],
        "National Daily Consumption (GWh)",
        charts / "consumo_timeseries.png",
        xlabel="Date",
        ylabel="GWh",
    )

    tmp = d.copy()
    tmp["month"] = tmp[date_col].dt.month
    by_month = tmp.groupby("month")[consumo_col].mean().reset_index()
    plot_bar(
        by_month["month"],
        by_month[consumo_col],
        "Average Consumption by Month",
        charts / "consumo_by_month.png",
        xlabel="Month",
        ylabel="GWh",
    )

    tmp["dow"] = tmp[date_col].dt.dayofweek
    by_dow = tmp.groupby("dow")[consumo_col].mean().reset_index()
    plot_bar(
        by_dow["dow"],
        by_dow[consumo_col],
        "Average Consumption by Day of Week (0=Mon)",
        charts / "consumo_by_dow.png",
        xlabel="DOW",
        ylabel="GWh",
    )

    series = d.set_index(date_col)[consumo_col].astype(float)
    # Regular daily grid: collapse repeated dates, expose missing days as NaN.
    daily = series.groupby(series.index.normalize()).mean().asfreq("D")
    x = daily.to_numpy()
    x = x - np.nanmean(x)
    denom = np.nansum(x * x)  # loop-invariant; zero for a constant series
    max_lag = 60
    acf = [1.0]
    for lag in range(1, max_lag + 1):
        if denom > 0:
            acf.append(float(np.nansum(x[:-lag] * x[lag:]) / denom))
        else:
            acf.append(np.nan)
    plt.figure()
    plt.stem(range(0, max_lag + 1), acf)  # compat without 'use_line_collection'
    plt.title("ACF of Daily Consumption (lag up to 60)")
    plt.xlabel("Lag")
    plt.ylabel("Autocorrelation")
    plt.tight_layout()
    plt.savefig(charts / "acf_consumo_lag60.png", dpi=140)
    plt.close()


def process_dataset(in_path: Path, out_root: Path) -> None:
    """Run the complete feature analysis for one CSV file.

    The file is read, its columns are normalized and the date / zone /
    consumption columns are detected. Results are written to
    ``out_root / <file stem>`` (charts in a ``charts`` sub-directory) by, in
    order: integrity checks, descriptive statistics, feature relevance,
    collinearity analysis and quick time-series charts. LaTeX tables are added
    when the module-level ``LATEX_EXPORT`` flag is set.

    Args:
        in_path: CSV file to analyse.
        out_root: Root output directory.

    Raises:
        SystemExit: If no date column can be detected.
        RuntimeError: Propagated from ``read_csv_robust`` when the file cannot
            be parsed.
    """
    log(f"Loading dataset: {in_path.name}")
    df = normalize_columns(read_csv_robust(in_path))
    date_col, zone_col, consumo_col = detect_columns(df)
    if date_col is None:
        raise SystemExit("Date column not found (expected 'date' or 'data').")
    df = coerce_dates(df, date_col)
    df, meteo_cols = derive_meteo_numeric(df)
    meteo_cols = expand_with_aggregated_columns(df, meteo_cols)

    ds_name = in_path.stem
    out_dir = out_root / ds_name
    # parents=True also creates out_dir (and out_root) when they are missing.
    (out_dir / "charts").mkdir(parents=True, exist_ok=True)

    integrity_checks(df, date_col, zone_col, out_dir)
    feature_statistics(df, out_dir)
    feature_relevance(df, date_col, consumo_col, meteo_cols, out_dir)
    collinearity_and_structure(df, meteo_cols, out_dir)
    quick_timeseries(df, date_col, consumo_col, out_dir)

    if LATEX_EXPORT:
        export_key_tables_to_latex(out_dir, ds_name)

    log(f"Finished dataset: {in_path.name}")


# ===============================================================
# MAIN DRIVER
# ---------------------------------------------------------------
# Runs the full analysis for each dataset provided in arguments.
# 🧩 EXTEND HERE: You can add graphical dashboards or feature importance plots here.
# ===============================================================
def main(argv: Optional[List[str]] = None):
    """Command-line entry point: analyse every CSV passed with ``--inputs``.

    Args:
        argv: Argument list to parse, e.g.
            ``["--inputs", "a.csv", "b.csv", "--out-root", "out"]``. With the
            default ``None`` argparse reads ``sys.argv[1:]``. Passing the list
            explicitly allows calling ``main`` from a notebook or another
            script, where ``sys.argv`` belongs to the host process.

    Raises:
        SystemExit: Raised by argparse on invalid arguments (exit code 2) or
            ``--help``.
    """
    parser = argparse.ArgumentParser(
        description="Feature analysis pack for energy datasets (integrity, statistics, relevance)."
    )
    parser.add_argument(
        "--latex",
        action="store_true",
        help="Export key CSV outputs as LaTeX tables (.tex)",
    )
    parser.add_argument(
        "--inputs", nargs="+", required=True, help="Paths to 1..N CSV files"
    )
    parser.add_argument(
        "--out-root", default="eda_features", help="Root output directory"
    )
    args = parser.parse_args(argv)

    # process_dataset() reads this module-level flag.
    global LATEX_EXPORT
    LATEX_EXPORT = bool(args.latex)

    out_root = Path(args.out_root)
    out_root.mkdir(parents=True, exist_ok=True)

    for inp in args.inputs:
        process_dataset(Path(inp), out_root)

    log("All datasets processed.")


# Script entry point. Note that __name__ is also "__main__" inside a Jupyter
# kernel, so running this cell calls main() there too; argparse then parses the
# kernel's own command line and exits because "--inputs" is missing (the
# "SystemExit: 2" recorded in this cell's output).
if __name__ == "__main__":
    main()


usage: ipykernel_launcher.py [-h] [--latex] --inputs INPUTS [INPUTS ...]
                             [--out-root OUT_ROOT]
ipykernel_launcher.py: error: the following arguments are required: --inputs


SystemExit: 2

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [4]:
# Run the script as a separate Python process on three CSV inputs. No --out-root
# is passed, so results go to the default "eda_features" directory, resolved
# against the current working directory.
!python "C:\Users\pcata\OneDrive\Ambiente de Trabalho\feature_analysis_pack.py" \
    --inputs \
    "C:\Users\pcata\OneDrive\Ambiente de Trabalho\final_nacional_diario.csv" \
    "C:\Users\pcata\OneDrive\Ambiente de Trabalho\dataset_meteo_com_consumo.csv" \
    "C:\Users\pcata\OneDrive\Ambiente de Trabalho\final_zonal_diario.csv"


^C
[16:24:03] Loading dataset: final_nacional_diario.csv
[16:24:03] Parsed CSV with enc=utf-8 sep=';' shape=(3931, 24)
[16:24:03] Saved: C:\Users\pcata\00_Thesis\eda_features\final_nacional_diario\schema_columns.csv (rows=24)
[16:24:03] Saved: C:\Users\pcata\00_Thesis\eda_features\final_nacional_diario\missingness_by_column.csv (rows=24)
[16:24:03] Saved: C:\Users\pcata\00_Thesis\eda_features\final_nacional_diario\date_coverage.csv (rows=2)
[16:24:03] Saved: C:\Users\pcata\00_Thesis\eda_features\final_nacional_diario\date_rows_per_year.csv (rows=11)
[16:24:03] Saved: C:\Users\pcata\00_Thesis\eda_features\final_nacional_diario\date_rows_per_year_month.csv (rows=129)
[16:24:03] Saved: C:\Users\pcata\00_Thesis\eda_features\final_nacional_diario\integrity_checks.csv (rows=1)
[16:24:03] Saved: C:\Users\pcata\00_Thesis\eda_features\final_nacional_diario\numeric_descriptives.csv (rows=23)
[16:24:03] Saved: C:\Users\pcata\00_Thesis\eda_features\final_nacional_diario\outlier_iqr_summary.csv (ro