In [None]:
import pyreadr
import yaml

### WARNING: Running cells in this Jupyter notebook at random will overwrite data files. Run all once, then proceed with caution.

# That said, run this cell first to load the config. Reload this cell to reload config.
# The file is opened in a `with` block so the handle is closed as soon as
# parsing finishes instead of being left open for the life of the kernel.
with open('config.yaml', 'r') as config_file:
    config = yaml.load(config_file, Loader=yaml.FullLoader)

In [None]:
#CREATES DATASET


from pathlib import Path
from typing import Iterable, Optional, Sequence
import pandas as pd


def _safe_write(df: pd.DataFrame, out_path: Path) -> None:
    """
    Write one window to disk as a plain (uncompressed) CSV without the index.

    The file is written to ``out_path`` with ``.csv`` appended to its name;
    if ``out_path`` already ends in ``.csv`` it is used unchanged. Missing
    parent directories are created.

    Parameters
    ----------
    df : rows of the window to save
    out_path : destination path, normally given without an extension
    """
    out_path = Path(out_path)
    out_path.parent.mkdir(parents=True, exist_ok=True)

    # Append the extension rather than calling with_suffix(): with_suffix()
    # replaces everything after the last dot, so a name such as
    # "station.01_20200101__w0000" would collapse to "station.csv" and every
    # window of that source file would overwrite the previous one.
    if out_path.suffix == ".csv":
        target = out_path
    else:
        target = out_path.with_name(out_path.name + ".csv")

    df.to_csv(target, index=False)


def create_timeseries_windows_dataset(
    dataset_path: str | Path,
    window_months_list: Sequence[int] = (12, 6, 3),
    out_root: str | Path = "data_windows_fast",
    time_col: str = "utc",
    sort: bool = True,
    recursive: bool = True,
    require_full_window: bool = True,
    keep_cols: Optional[Iterable[str]] = None,
) -> None:
    """
    Build a new dataset of time-series windows from CSV files.

    - Reads each CSV in dataset_path (default: recursive).
    - Splits each file into consecutive, non-overlapping windows
      [start, end) of length window_months, starting at the file's earliest
      timestamp.
    - Saves each non-empty window as a separate CSV under:
        out_root/{window_months}m/<sub-folders of the source file>/

    Files that cannot be processed are reported with an "[ERR]" line and
    skipped; they do not abort the run.

    NOTE: if the notebook-level ``config`` mapping is loaded, the entries
    ``config["data_preparation"]["raw_dataset_csv"]`` and
    ``config["data_preparation"]["windowed_dataset"]`` take precedence over
    ``dataset_path`` and ``out_root``. The arguments are only used when that
    config (or the respective key) is not available. Relative paths are
    resolved against the current working directory.

    Parameters
    ----------
    dataset_path : folder containing CSVs (e.g. "data_csv")
    window_months_list : window lengths in months, e.g. (12,6,3);
                         every entry must be a positive integer
    out_root : output folder
    time_col : timestamp column name in the CSV (e.g. "utc")
    sort : if True, rows are sorted by time_col before windowing
    recursive : if True, CSVs in sub-folders are processed as well
    require_full_window : if True, only windows that fully fit are saved
                          if False, the final partial window is also saved
    keep_cols : optionally restrict columns saved (e.g. ["utc","min","max"])
                if None, saves all columns; time_col is always kept

    Raises
    ------
    ValueError : if window_months_list contains a value <= 0 (such a window
                 would never advance and the loop would not terminate)
    """
    invalid_months = [m for m in window_months_list if int(m) <= 0]
    if invalid_months:
        raise ValueError(
            f"window_months_list must contain positive integers, got {invalid_months}"
        )

    # Config values win when present (pre-existing notebook behaviour);
    # otherwise fall back to the arguments so the function also works when
    # the config cell has not been run.
    try:
        prep_cfg = config["data_preparation"]
    except (NameError, KeyError, TypeError):
        prep_cfg = {}
    if not isinstance(prep_cfg, dict):
        prep_cfg = {}
    dataset_path = Path.cwd() / prep_cfg.get("raw_dataset_csv", dataset_path)
    out_root = Path.cwd() / prep_cfg.get("windowed_dataset", out_root)

    pattern = "**/*.csv" if recursive else "*.csv"
    csv_files = sorted(dataset_path.glob(pattern))

    print(f"[INFO] dataset_path={dataset_path.resolve()}")
    print(f"[INFO] out_root={out_root.resolve()}")
    print(f"[INFO] found {len(csv_files)} CSV files")

    if not csv_files:
        return

    for months in window_months_list:
        (out_root / f"{months}m").mkdir(parents=True, exist_ok=True)

    # Normalise the column selection once; it does not depend on the file.
    wanted_cols = None
    if keep_cols is not None:
        wanted_cols = list(keep_cols)
        # Always keep time_col
        if time_col not in wanted_cols:
            wanted_cols = [time_col] + wanted_cols

    for i, csv_path in enumerate(csv_files, 1):
        try:
            df = pd.read_csv(csv_path)

            if time_col not in df.columns:
                print(f"[SKIP] {csv_path} (missing '{time_col}')")
                continue

            df[time_col] = pd.to_datetime(df[time_col], errors="coerce", utc=False)
            df = df.dropna(subset=[time_col])

            if df.empty:
                print(f"[SKIP] {csv_path} (no valid timestamps)")
                continue

            if sort:
                df = df.sort_values(time_col)

            if wanted_cols is not None:
                # Columns missing from this particular file are ignored.
                df = df[[c for c in wanted_cols if c in df.columns]]

            # min/max rather than first/last row so the covered time span is
            # right even when sort=False and the file is not in time order.
            t0 = df[time_col].min()
            tN = df[time_col].max()

            rel = csv_path.relative_to(dataset_path)
            base_stem = rel.with_suffix("")  # preserves subfolders and stem

            for months in window_months_list:
                out_dir = out_root / f"{months}m" / base_stem.parent
                out_dir.mkdir(parents=True, exist_ok=True)

                step = pd.DateOffset(months=int(months))
                start = t0
                win_idx = 0

                while start <= tN:
                    end = start + step

                    # Full-window mode stops at the first window that would
                    # extend past the last timestamp.
                    if require_full_window and end > tN:
                        break

                    # slice: [start, end). No row is later than tN, so in
                    # partial mode this naturally includes the rows at tN
                    # in the final, shorter window.
                    w = df[(df[time_col] >= start) & (df[time_col] < end)]

                    # If a window ends up empty (e.g. gaps), skip it
                    if not w.empty:
                        out_name = f"{base_stem.name}_{start:%Y%m%d}{end:%Y%m%d}__w{win_idx:04d}"
                        out_path = out_dir / out_name
                        _safe_write(w, out_path)

                    win_idx += 1
                    start = end

            if i % 50 == 0:
                print(f"[INFO] processed {i}/{len(csv_files)}")

        except Exception as e:
            # Best effort: report the failing file and carry on with the rest.
            print(f"[ERR] {csv_path}: {e}")

    print("[DONE] Windowed dataset created.")

# Build the 12-, 6- and 3-month window datasets, keeping only windows that
# fit completely and descending into sub-folders.
# NOTE: when the config cell has been run, the input/output folders are taken
# from config["data_preparation"] inside the function.
# Optional: pass keep_cols=["utc", "min", "max"] to keep only these columns.
create_timeseries_windows_dataset(
    "data_csv",
    (12, 6, 3),
    "data_windows_fast",
    "utc",
    require_full_window=True,
    recursive=True,
)