In [1]:
from __future__ import annotations
import pyreadr
import yaml

### WARNING: Running cells in this jupyter notebook at random wil overwrite data files. Run all once, then proceed with caution.

# That said, run this cell first to load the config. Reload this cell to reload config.
config = yaml.load(open('config.yaml', 'r'), Loader=yaml.FullLoader)

In [None]:
from pathlib import Path

IN_DIR = Path.cwd() / config["data_preparation"]["raw_dataset"]
rds_files = sorted(IN_DIR.rglob("*.rds"))

print("Found", len(rds_files), "rds files")
for p in sorted(rds_files, key=lambda x: x.stat().st_size, reverse=True)[:20]:
    print(f"{p}  {p.stat().st_size/1024**3:.2f} GB")

In [None]:
from pathlib import Path
import pyreadr
import pandas as pd

# Run this cell to generate convert rds to csv.

IN_DIR = Path.cwd() / config["data_preparation"]["raw_dataset"]
OUT_DIR = Path.cwd() / config["data_preparation"]["raw_dataset_csv"]
OUT_DIR.mkdir(parents=True, exist_ok=True)

rds_files = sorted(IN_DIR.rglob("*.rds"))
print("Found", len(rds_files), "rds files")

for i, rds_path in enumerate(rds_files, 1):
    rel = rds_path.relative_to(IN_DIR)
    out_csv = (OUT_DIR / rel).with_suffix(".csv")
    out_csv.parent.mkdir(parents=True, exist_ok=True)

    size_gb = rds_path.stat().st_size / 1024**3
    print(f"[{i}/{len(rds_files)}] Reading {rel} ({size_gb:.2f} GB)")

    try:
        result = pyreadr.read_r(str(rds_path))

        # pick largest dataframe if multiple objects
        dfs = [(k, v) for k, v in result.items() if isinstance(v, pd.DataFrame)]
        if not dfs:
            print(f"  [SKIP] no DataFrame inside")
            continue

        name, df = max(dfs, key=lambda t: len(t[1]))
        print(f"  -> writing {out_csv} (rows={len(df)}, cols={df.shape[1]})")

        df.to_csv(out_csv, index=False)
        print("  [OK] wrote", out_csv)

    except MemoryError:
        print(f"  [ERR] MemoryError on {rel} â€” this file is too large for Python/pandas on your RAM.")
    except Exception as e:
        print(f"  [ERR] {rel}: {e}")
    finally:
        # critical: free memory
        try:
            del df
        except Exception:
            pass
        try:
            del result
        except Exception:
            pass
        # gc.collect()

In [3]:
from pathlib import Path
import numpy as np
import pandas as pd


def build_epochs_npz(
    root_dir: str | Path,
    out_path: str | Path,
    value_col: str = "max",
    metric_filter: str = "load_power",
    resample_rule: str = "2h",  # 2-hour averages
    recursive: bool = True,
    min_len: int = 10,
):
    """
    Load all CSVs under root_dir, build list[np.ndarray] of 1D arrays,
    save as compressed NPZ, and return the list.

    Each file becomes one epoch (after filtering + resampling).
    """

    root_dir = Path(root_dir)
    out_path = Path(out_path)

    # ---------------------------
    # Collect CSV files
    # ---------------------------
    pattern = "**/*.csv" if recursive else "*.csv"
    files = sorted(root_dir.glob(pattern))

    if not files:
        raise RuntimeError(f"No CSV files found in {root_dir}")

    print(f"[INFO] Found {len(files)} CSV files")

    epochs = []

    for i, path in enumerate(files):
        try:
            df = pd.read_csv(path)

            # Filter metric
            if "metric" in df.columns:
                df = df[df["metric"].astype(str).str.lower() == metric_filter.lower()]
                if df.empty:
                    continue

            # Require utc
            if "utc" not in df.columns:
                print(f"[WARN] Skipping {path.name}: no 'utc' column")
                continue

            df["utc"] = pd.to_datetime(df["utc"], errors="coerce")
            df = df.dropna(subset=["utc"]).sort_values("utc")

            if value_col not in df.columns:
                print(f"[WARN] Skipping {path.name}: no '{value_col}' column")
                continue

            # Numeric conversion
            df[value_col] = pd.to_numeric(df[value_col], errors="coerce")
            df = df.dropna(subset=[value_col])

            if df.empty:
                continue

            # ---------------------------
            # 2-hour resampling
            # ---------------------------
            df = df.set_index("utc")
            y = (
                df[value_col]
                .resample(resample_rule)
                .mean()
                .dropna()
                .to_numpy(dtype=np.float64)
            )

            if len(y) < min_len:
                continue

            # Optional mean-centering
            y = y - float(np.mean(y))

            epochs.append(y)

            if (i + 1) % 50 == 0:
                print(f"[INFO] Processed {i+1}/{len(files)} files")

        except Exception as e:
            print(f"[ERROR] Failed {path}: {e}")

    if not epochs:
        raise RuntimeError("No valid epochs were built.")

    print(f"[INFO] Built {len(epochs)} epochs")

    # ---------------------------
    # Save NPZ
    # ---------------------------
    # Store each epoch as separate array entry
    save_dict = {f"arr_{i}": arr for i, arr in enumerate(epochs)}
    np.savez_compressed(out_path, **save_dict)

    print(f"[INFO] Saved NPZ to {out_path.resolve()}")

    return epochs

build_epochs_npz(
    config["data_preparation"]["raw_dataset_csv"],
    config["data_preparation"]["npz_data_output"],
    value_col="max",
    metric_filter="load_power",
    recursive=True,
)

[INFO] Found 505 CSV files
[INFO] Processed 50/505 files
[INFO] Processed 100/505 files
[INFO] Processed 150/505 files
[INFO] Processed 200/505 files
[INFO] Processed 250/505 files
[INFO] Built 270 epochs
[INFO] Saved NPZ to C:\Users\plasm\Documents\Coding\HackEurope\hackathon_IMAC\data\processed_npz


[array([ 0.2246578 ,  0.51821138,  0.04939177, ..., -1.07656751,
        -0.47960083,  0.40684556], shape=(1141,)),
 array([-0.19862474, -0.18569141, -0.27396641, ..., -0.36734356,
         0.21250992, -0.13843856], shape=(5124,)),
 array([-0.74597769, -0.63021935, -0.59102769, ..., -0.17077554,
        -0.19979039, -0.84709102], shape=(5124,)),
 array([ 0.82086559,  0.85586559,  0.78686559,  0.79486559, -1.58513441,
         0.79041687,  0.79707771,  0.79786559,  0.77486559,  0.80086559,
         0.80686559,  0.75886559, -1.03868798, -0.03468798, -1.94618704,
        -2.00470818, -2.20048679, -1.721939  ,  0.83353226,  0.88277648,
         0.77886559,  0.06518138, -0.53196094, -1.12682012, -1.63228934,
        -0.12383138, -0.64271774,  0.27367019,  0.87811559,  0.81722823,
         0.7811741 ,  0.77899767,  0.76814649,  0.56139191,  0.24400845,
         0.08889621,  0.19126559, -0.89589198,  0.63128226,  1.85186559,
         0.80686559,  0.79579059, -1.12648884, -1.67513441,  0.80636

In [6]:
from pathlib import Path
from typing import Iterable, Optional, Sequence
import pandas as pd


def _safe_write(df: pd.DataFrame, out_path: Path) -> None:
    """
    Write a window file in a "fast" format if possible (Parquet),
    otherwise fall back to compressed CSV.
    """
    out_path.parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(out_path.with_suffix(".csv"), index=False)


def create_timeseries_windows_dataset(
    dataset_path: str | Path,
    window_months_list: Sequence[int] = (12, 6, 3),
    out_root: str | Path = "data_windows_fast",
    time_col: str = "utc",
    sort: bool = True,
    recursive: bool = True,
    require_full_window: bool = True,
    keep_cols: Optional[Iterable[str]] = None,
) -> None:
    """
    Build a new dataset of time-series windows from CSV files.

    - Reads each CSV in dataset_path (default: recursive).
    - Splits each file into consecutive windows of length window_months.
    - Saves each window as a separate file under:
        out_root/{window_months}m/

    Parameters
    ----------
    dataset_path : folder containing CSVs (e.g. "data_csv")
    window_months_list : e.g. (12,6,3)
    out_root : output folder
    time_col : timestamp column name in the CSV (e.g. "utc")
    require_full_window : if True, only windows that fully fit are saved
                          if False, the final partial window is also saved
    keep_cols : optionally restrict columns saved (e.g. ["utc","min","max"])
                if None, saves all columns
    """
    dataset_path = Path.cwd() / config["data_preparation"]["raw_dataset_csv"]
    out_root = Path.cwd() / config["data_preparation"]["windowed_dataset"]

    pattern = "**/*.csv" if recursive else "*.csv"
    csv_files = sorted(dataset_path.glob(pattern))

    print(f"[INFO] dataset_path={dataset_path.resolve()}")
    print(f"[INFO] out_root={out_root.resolve()}")
    print(f"[INFO] found {len(csv_files)} CSV files")

    if not csv_files:
        return

    for months in window_months_list:
        (out_root / f"{months}m").mkdir(parents=True, exist_ok=True)

    for i, csv_path in enumerate(csv_files, 1):
        try:
            df = pd.read_csv(csv_path)

            if time_col not in df.columns:
                print(f"[SKIP] {csv_path} (missing '{time_col}')")
                continue

            df[time_col] = pd.to_datetime(df[time_col], errors="coerce", utc=False)
            df = df.dropna(subset=[time_col])

            if df.empty:
                print(f"[SKIP] {csv_path} (no valid timestamps)")
                continue

            if sort:
                df = df.sort_values(time_col)

            if keep_cols is not None:
                keep_cols = list(keep_cols)
                # Always keep time_col
                if time_col not in keep_cols:
                    keep_cols = [time_col] + keep_cols
                existing = [c for c in keep_cols if c in df.columns]
                df = df[existing]

            t0 = df[time_col].iloc[0]
            tN = df[time_col].iloc[-1]

            rel = csv_path.relative_to(dataset_path)
            base_stem = rel.with_suffix("")  # preserves subfolders and stem

            for months in window_months_list:
                out_dir = out_root / f"{months}m" / base_stem.parent
                out_dir.mkdir(parents=True, exist_ok=True)

                start = t0
                win_idx = 0

                while True:
                    end = start + pd.DateOffset(months=int(months))

                    if require_full_window and end > tN:
                        break

                    # if not require_full_window, allow last partial window
                    if (not require_full_window) and (start > tN):
                        break

                    # slice: [start, end)
                    if require_full_window:
                        w = df[(df[time_col] >= start) & (df[time_col] < end)]
                    else:
                        w = df[(df[time_col] >= start) & (df[time_col] < min(end, tN))]

                    # If a window ends up empty (e.g. gaps), skip it
                    if not w.empty:
                        out_name = f"{base_stem.name}_{start:%Y%m%d}{end:%Y%m%d}__w{win_idx:04d}"
                        out_path = out_dir / out_name
                        _safe_write(w, out_path)

                    win_idx += 1
                    start = end

            if i % 50 == 0:
                print(f"[INFO] processed {i}/{len(csv_files)}")

        except Exception as e:
            print(f"[ERR] {csv_path}: {e}")

    print("[DONE] Windowed dataset created.")

create_timeseries_windows_dataset(
    dataset_path="data_csv",
    window_months_list=(12, 6, 3),
    out_root="data_windows_fast",
    time_col="utc",
    recursive=True,
    require_full_window=True,
    # keep_cols=["utc", "min", "max"]  # optional: keep only these columns
)

[INFO] dataset_path=C:\Users\plasm\Documents\Coding\HackEurope\hackathon_IMAC\data\raw_data_csv
[INFO] out_root=C:\Users\plasm\Documents\Coding\HackEurope\hackathon_IMAC\data\windowed
[INFO] found 505 CSV files


KeyboardInterrupt: 