In [2]:
import gc, time, sys
from pathlib import Path

try:
    import pyreadstat
except ImportError:
    print("Please install pyreadstat: pip install pyreadstat")
    sys.exit(1)

try:
    import pyarrow as pa
    import pyarrow.parquet as pq
except ImportError:
    print("Please install pyarrow: pip install pyarrow")
    sys.exit(1)

# Directory scanned (non-recursively) for .sav files to convert.
INPUT_DIR = Path("/Users/rishaan/Desktop/CERAS-Cognitive-Efficiency-Reasoning-Alignment-System/Dataset")
# Directory that receives one .parquet file per converted .sav file.
OUT_DIR = Path("/Users/rishaan/Desktop/CERAS-Cognitive-Efficiency-Reasoning-Alignment-System/data/raw")
# Maximum number of rows requested from pyreadstat per read (passed as row_limit).
CHUNK_ROWS = 200_000
# Compression codec handed to the Parquet writer.
COMPRESSION = "snappy"

# Create the output directory (and any missing parents) at import time;
# exist_ok=True makes this a no-op when it is already there.
OUT_DIR.mkdir(parents=True, exist_ok=True)

def convert_one_sav_to_parquet(sav_path: Path, out_parquet_path: Path, chunk_rows: int = CHUNK_ROWS):
    """Convert one .sav file to a Parquet file, reading it in row chunks.

    The file is read ``chunk_rows`` rows at a time with ``pyreadstat.read_sav``
    and each chunk is appended to a single Parquet file. Output is written to
    a sibling ``<name>.partial`` file and only renamed to ``out_parquet_path``
    once every chunk has been written, so a failed or interrupted run never
    leaves a truncated file at the final path.

    Args:
        sav_path: Path of the .sav file to read.
        out_parquet_path: Final path of the Parquet file to create.
        chunk_rows: Maximum number of rows to read per chunk.

    Returns:
        None. If the source yields no rows, no output file is created.

    Raises:
        Exception: Any error raised while reading or writing a chunk is
            logged (for read errors) and re-raised.
        KeyboardInterrupt: Re-raised after logging when the user interrupts.
    """
    # Stage the output next to its destination so the final rename stays on
    # the same filesystem.
    tmp_path = out_parquet_path.with_name(out_parquet_path.name + ".partial")
    offset = 0
    writer = None
    schema = None
    completed = False
    start = time.time()
    try:
        while True:
            try:
                df, meta = pyreadstat.read_sav(str(sav_path), row_limit=chunk_rows, row_offset=offset)
            except Exception as e:
                print(f"  [ERROR] reading chunk at offset {offset} for {sav_path.name}: {type(e).__name__}: {e}")
                raise

            nrows = 0 if df is None else df.shape[0]
            if nrows == 0:
                if offset == 0:
                    print(f"  [WARN] No rows read from {sav_path} (empty or unreadable).")
                break

            # The first chunk fixes the schema; later chunks are converted
            # against it so per-chunk type inference (e.g. a column that is
            # entirely missing in one chunk) cannot disagree with the writer.
            table = pa.Table.from_pandas(df, schema=schema, preserve_index=False)

            if writer is None:
                schema = table.schema
                writer = pq.ParquetWriter(str(tmp_path), schema, compression=COMPRESSION)
                print(f"  Created writer for: {out_parquet_path.name}")

            writer.write_table(table)
            print(f"  Wrote rows {offset} .. {offset + nrows - 1}  ({nrows} rows)")

            offset += nrows
            del df, meta, table
            gc.collect()

            # A short chunk means the end of the file was reached; stop here
            # instead of issuing one more read that would return nothing.
            if nrows < chunk_rows:
                break
            time.sleep(0.05)

        completed = True

    except KeyboardInterrupt:
        print("[INTERRUPT] User requested stop.")
        raise
    finally:
        try:
            if writer is not None:
                writer.close()
                print("Parquet writer closed.")
        finally:
            # Discard the staged file unless every chunk was written.
            if not completed and tmp_path.exists():
                tmp_path.unlink()

    if writer is None:
        # Nothing was read, so there is no output file to publish or stat.
        return

    # Publish the finished file under its final name.
    tmp_path.replace(out_parquet_path)

    elapsed = time.time() - start
    print(f"  Completed {sav_path.name} in {elapsed:.1f}s -> {out_parquet_path} (size: {out_parquet_path.stat().st_size / 1024**2:.1f} MB)")

def main():
    """Convert every .sav file in INPUT_DIR to a Parquet file in OUT_DIR.

    Files whose output already exists are skipped. A failure on one file is
    reported and does not stop the remaining files from being processed; a
    summary of failed files is printed at the end.

    Exits the process with status 1 if INPUT_DIR is not a directory, and
    with status 0 if it contains no .sav files.
    """
    # is_dir() is already False for a missing path, so one check suffices.
    if not INPUT_DIR.is_dir():
        print("ERROR: INPUT_DIR not found or not a directory:", INPUT_DIR)
        sys.exit(1)

    # Compare the extension case-insensitively: this finds .sav/.SAV/.Sav
    # alike and lists each file exactly once regardless of how the platform
    # matches glob patterns.
    sav_files = sorted(
        p for p in INPUT_DIR.iterdir()
        if p.is_file() and p.suffix.lower() == ".sav"
    )
    if not sav_files:
        print("No .sav files found in:", INPUT_DIR)
        sys.exit(0)

    print(f"Found {len(sav_files)} .sav file(s) in {INPUT_DIR}\n")
    failed = []
    for sav in sav_files:
        try:
            out_parquet = OUT_DIR / (sav.stem + ".parquet")
            if out_parquet.exists():
                print(f"Skipping {sav.name} — output already exists: {out_parquet.name}")
                continue
            print(f"Converting: {sav.name}")
            convert_one_sav_to_parquet(sav, out_parquet, chunk_rows=CHUNK_ROWS)
        except Exception as exc:
            # Batch boundary: report the failure and move on to the next file.
            print(f"[FAILED] {sav.name}: {type(exc).__name__}: {exc}")
            failed.append(sav.name)
            continue

    if failed:
        print(f"\n{len(failed)} file(s) failed: {', '.join(failed)}")

if __name__ == "__main__":
    # Echo the effective configuration before starting, one setting per line.
    settings = (
        ("INPUT DIR:", INPUT_DIR),
        ("OUTPUT DIR:", OUT_DIR),
        ("CHUNK_ROWS:", CHUNK_ROWS),
        ("COMPRESSION:", COMPRESSION),
    )
    for label, value in settings:
        print(label, value)
    print("Starting conversion (run from terminal, not Jupyter recommended)...\n")
    main()

INPUT DIR: /Users/rishaan/Desktop/CERAS-Cognitive-Efficiency-Reasoning-Alignment-System/Dataset
OUTPUT DIR: /Users/rishaan/Desktop/CERAS-Cognitive-Efficiency-Reasoning-Alignment-System/data/raw
CHUNK_ROWS: 200000
COMPRESSION: snappy
Starting conversion (run from terminal, not Jupyter recommended)...

Found 4 .sav file(s) in /Users/rishaan/Desktop/CERAS-Cognitive-Efficiency-Reasoning-Alignment-System/Dataset

Converting: CY08MSP_SCH_QQQ.SAV
  Created writer for: CY08MSP_SCH_QQQ.parquet
  Wrote rows 0 .. 21628  (21629 rows)
Parquet writer closed.
  Completed CY08MSP_SCH_QQQ.SAV in 2.0s -> /Users/rishaan/Desktop/CERAS-Cognitive-Efficiency-Reasoning-Alignment-System/data/raw/CY08MSP_SCH_QQQ.parquet (size: 5.6 MB)
Converting: CY08MSP_STU_COG.SAV
  Created writer for: CY08MSP_STU_COG.parquet
  Wrote rows 0 .. 199999  (200000 rows)
  Wrote rows 200000 .. 399999  (200000 rows)
  Wrote rows 400000 .. 599999  (200000 rows)
  Wrote rows 600000 .. 613743  (13744 rows)
Parquet writer closed.
  Comp