In [6]:
import pyreadr
import yaml
### WARNING: Running cells in this jupyter notebook at random wil overwrite data files. Run all once, then proceed with caution.

# That said, run this cell first to load the config. Reload this cell to reload config.
config = yaml.load(open('config.yaml', 'r'), Loader=yaml.FullLoader)

In [10]:
from pathlib import Path
import pyreadr
import pandas as pd

#This will convert all files in raw_data path in config from rds to csv.

IN_DIR = Path.cwd() / config["data_preparation"]["raw_dataset"]
OUT_DIR = Path.cwd() / config["data_preparation"]["raw_dataset_csv"]

def main():
    print("CWD    :", Path.cwd().resolve())
    print("IN_DIR :", IN_DIR.resolve())
    print("OUT_DIR:", OUT_DIR.resolve())

    if not IN_DIR.exists():
        raise FileNotFoundError(f"Missing folder: {IN_DIR}")

    OUT_DIR.mkdir(parents=True, exist_ok=True)

    # recursive in case there are subfolders
    rds_files = sorted([p for p in IN_DIR.rglob("*") if p.suffix.lower() == ".rds"])
    print(f"Found {len(rds_files)} .rds files under {IN_DIR}")

    for rds_path in rds_files:
        print("Ramiro gay")
        rel = rds_path.relative_to(IN_DIR)
        out_csv = (OUT_DIR / rel).with_suffix(".csv")
        out_csv.parent.mkdir(parents=True, exist_ok=True)

        try:
            result = pyreadr.read_r(str(rds_path))
            if not result:
                print(f"[SKIP] {rel} (empty RDS)")
                continue

            dfs = [(name, obj) for name, obj in result.items() if isinstance(obj, pd.DataFrame)]
            if not dfs:
                print(f"[SKIP] {rel} (no DataFrame inside)")
                continue

            # one CSV per RDS: pick largest DF if multiple exist
            name, df = max(dfs, key=lambda t: len(t[1]))
            df.to_csv(out_csv, index=False)
            print(f"[OK]   {rel} -> {out_csv.relative_to(OUT_DIR)} (used '{name}', rows={len(df)})")

        except Exception as e:
            print(f"[ERR]  {rel}: {e}")

main()

CWD    : C:\Users\plasm\Documents\Coding\HackEurope\hackathon_IMAC
IN_DIR : C:\Users\plasm\Documents\Coding\HackEurope\hackathon_IMAC\data\raw_data
OUT_DIR: C:\Users\plasm\Documents\Coding\HackEurope\hackathon_IMAC\data\raw_data_csv
Found 505 .rds files under C:\Users\plasm\Documents\Coding\HackEurope\hackathon_IMAC\data\raw_data
Ramiro gay
[OK]   load_power_unit_1.rds -> load_power_unit_1.csv (used 'None', rows=136851)
Ramiro gay
[OK]   load_power_unit_10.rds -> load_power_unit_10.csv (used 'None', rows=615901)
Ramiro gay
[OK]   load_power_unit_100.rds -> load_power_unit_100.csv (used 'None', rows=616378)
Ramiro gay
[OK]   load_power_unit_101.rds -> load_power_unit_101.csv (used 'None', rows=26098)
Ramiro gay
[OK]   load_power_unit_102.rds -> load_power_unit_102.csv (used 'None', rows=338560)
Ramiro gay
[OK]   load_power_unit_103.rds -> load_power_unit_103.csv (used 'None', rows=30370)
Ramiro gay
[OK]   load_power_unit_104.rds -> load_power_unit_104.csv (used 'None', rows=616570)
Rami

In [None]:
from pathlib import Path

IN_DIR = Path.cwd() / config["data_preparation"]["raw_data"]
rds_files = sorted(IN_DIR.rglob("*.rds"))

print("Found", len(rds_files), "rds files")
for p in sorted(rds_files, key=lambda x: x.stat().st_size, reverse=True)[:20]:
    print(f"{p}  {p.stat().st_size/1024**3:.2f} GB")

In [None]:
from pathlib import Path
import gc
import pyreadr
import pandas as pd

# Run this cell to generate the windows to train on.

IN_DIR = Path.cwd() / config["data_preparation"]["raw_dataset_csv"]
OUT_DIR = Path.cwd() / config["data_preparation"]["windowed_dataset"]
OUT_DIR.mkdir(parents=True, exist_ok=True)

rds_files = sorted(IN_DIR.rglob("*.rds"))
print("Found", len(rds_files), "rds files")

for i, rds_path in enumerate(rds_files, 1):
    rel = rds_path.relative_to(IN_DIR)
    out_csv = (OUT_DIR / rel).with_suffix(".csv")
    out_csv.parent.mkdir(parents=True, exist_ok=True)

    size_gb = rds_path.stat().st_size / 1024**3
    print(f"[{i}/{len(rds_files)}] Reading {rel} ({size_gb:.2f} GB)")

    try:
        result = pyreadr.read_r(str(rds_path))

        # pick largest dataframe if multiple objects
        dfs = [(k, v) for k, v in result.items() if isinstance(v, pd.DataFrame)]
        if not dfs:
            print(f"  [SKIP] no DataFrame inside")
            continue

        name, df = max(dfs, key=lambda t: len(t[1]))
        print(f"  -> writing {out_csv} (rows={len(df)}, cols={df.shape[1]})")

        df.to_csv(out_csv, index=False)
        print("  [OK] wrote", out_csv)

    except MemoryError:
        print(f"  [ERR] MemoryError on {rel} â€” this file is too large for Python/pandas on your RAM.")
    except Exception as e:
        print(f"  [ERR] {rel}: {e}")
    finally:
        # critical: free memory
        try:
            del df
        except Exception:
            pass
        try:
            del result
        except Exception:
            pass
        # gc.collect()