
# Sample NASA Pipeline — End-to-End (Runs on `data/raw-sample`)

This notebook executes the **entire Phase‑1 pipeline** on the **sample NASA subset** so anyone who clones the repo can run it locally without the full raw dataset.

It will:
1. Point the loaders to `data/raw-sample/NASABatteryAging` via `EVFASTCHARGING_RAW_ROOT`.
2. Build a lightweight **manifest** (inventory).
3. Run the **charge-cycle extraction → 1 Hz resampling → feature engineering** pipeline.
4. Materialize **RL-ready tables** (Tbl1/2/3).
5. Generate a **small set of EDA plots**.
6. Summarize **provenance and dataset stats**.


In [1]:

from pathlib import Path
import os, sys
import pandas as pd

# --- Robust repo root discovery (works no matter where the notebook is opened) ---
def find_repo_root(start: Path, max_up: int = 5) -> Path:
    """Walk upward from ``start`` until a directory holding ``data/`` and ``src/`` is found.

    Args:
        start: Directory to begin the search from; it is resolved to an
            absolute path before the walk starts.
        max_up: Maximum number of parent levels to climb above ``start``.

    Returns:
        The first directory (``start`` itself or one of its ancestors) that
        contains both a ``data`` and a ``src`` sub-directory.

    Raises:
        RuntimeError: If no such directory is found within ``max_up`` levels.
    """
    cur = start.resolve()
    for _ in range(max_up + 1):
        # Require real directories: a stray *file* called "data" or "src" must
        # not make an unrelated folder look like the repo root.
        if (cur / "data").is_dir() and (cur / "src").is_dir():
            return cur
        if cur.parent == cur:
            # Filesystem root reached; climbing further would only re-check it.
            break
        cur = cur.parent
    raise RuntimeError("Could not locate repo root containing both 'data/' and 'src/'.")

# Locate the repository from the current working directory and make the
# packages under `src/` importable (inserted once, at the front of sys.path).
repo_root = find_repo_root(Path.cwd())
src_path = repo_root / "src"
if str(src_path) not in sys.path:
    sys.path.insert(0, str(src_path))

# Export the SAMPLE raw-data location through the environment variable so the
# full raw folder is not used; fail early when the sample data is missing.
sample_raw = repo_root.joinpath("data", "raw-sample", "NASABatteryAging")
assert sample_raw.exists(), f"Sample raw path not found: {sample_raw} — did you place sample data?"
os.environ["EVFASTCHARGING_RAW_ROOT"] = str(sample_raw)

# Directory that receives every artifact produced by this notebook.
processed_dir = repo_root.joinpath("data", "processed")
processed_dir.mkdir(parents=True, exist_ok=True)

# Echo the resolved locations so the run is self-describing.
print(f"Repo root : {repo_root}")
print(f"Using RAW : {os.environ['EVFASTCHARGING_RAW_ROOT']}")
print(f"Processed : {processed_dir}")


Repo root : C:\Users\User\OneDrive\Desktop\Tech Projects\EvFastChargingRL
Using RAW : C:\Users\User\OneDrive\Desktop\Tech Projects\EvFastChargingRL\data\raw-sample\NASABatteryAging
Processed : C:\Users\User\OneDrive\Desktop\Tech Projects\EvFastChargingRL\data\processed


## 1) Build a lightweight manifest (inventory)

In [None]:
from nasa_data_extract.build_manifest import main as manifest_main

# Run the manifest builder, then preview the first 20 rows of the CSV that is
# read back from the processed directory (presumably written by
# manifest_main() - confirm against build_manifest).
manifest_main()
manifest_csv = processed_dir / 'nasa_manifest.csv'
display(
    pd.read_csv(manifest_csv).head(20)
)

found 0 .mat files
Empty DataFrame
Columns: []
Index: []


Unnamed: 0,subset,file,battery_id,cycles_total,cycles_charge,cycles_discharge,cycles_impedance,cycles_other,approx_points_first_charge
0,1. BatteryAgingARC-FY08Q4,1. BatteryAgingARC-FY08Q4\B0005.mat,B0005,616,338,0,278,0,789
1,1. BatteryAgingARC-FY08Q4,1. BatteryAgingARC-FY08Q4\B0006.mat,B0006,616,338,0,278,0,789
2,1. BatteryAgingARC-FY08Q4,1. BatteryAgingARC-FY08Q4\B0007.mat,B0007,616,338,0,278,0,789
3,1. BatteryAgingARC-FY08Q4,1. BatteryAgingARC-FY08Q4\B0018.mat,B0018,319,266,0,53,0,2816
4,2. BatteryAgingARC_25_26_27_28_P1,2. BatteryAgingARC_25_26_27_28_P1\B0025.mat,B0025,80,59,0,21,0,3815
5,2. BatteryAgingARC_25_26_27_28_P1,2. BatteryAgingARC_25_26_27_28_P1\B0026.mat,B0026,80,59,0,21,0,3815
6,2. BatteryAgingARC_25_26_27_28_P1,2. BatteryAgingARC_25_26_27_28_P1\B0027.mat,B0027,80,59,0,21,0,3815
7,2. BatteryAgingARC_25_26_27_28_P1,2. BatteryAgingARC_25_26_27_28_P1\B0028.mat,B0028,80,59,0,21,0,3815
8,3. BatteryAgingARC_25-44,3. BatteryAgingARC_25-44\B0025.mat,B0025,80,59,0,21,0,3815
9,3. BatteryAgingARC_25-44,3. BatteryAgingARC_25-44\B0026.mat,B0026,80,59,0,21,0,3815


## 2) Run the processing pipeline (charge-only → 1 Hz features)

In [None]:

from nasa_data_extract.run_nasa_pipeline import main as run_pipeline
# IMPORTANT: pass an explicit empty argument list. Without it the entry point
# presumably falls back to the process argv, which under Jupyter holds the
# kernel's own arguments (e.g. the connection-file path) - confirm against
# run_nasa_pipeline.main.
csv_path = run_pipeline([])
# Show the value returned by the pipeline; later cells read it with pd.read_csv.
csv_path





[1/3] Extracting charge cycles…


TypeError: c:\Users\User\AppData\Roaming\jupyter\runtime\kernel-v3b25204babf4427332b7955f0d2b4e15dbea6519b.json is not a MATLAB 7.3 file. Load with scipy.io.loadmat() instead.

### Preview processed CSV

In [None]:

# Load the processed feature table and report its size and entity counts.
df = pd.read_csv(csv_path)
rows, cols = df.shape
display(df.head())
print(f'Rows: {rows} Cols: {cols}')
print(
    f"Batteries: {df['battery_id'].nunique()} "
    f"Subsets: {df['subset'].nunique()} "
    f"Cycles: {df['cycle_id'].nunique()}"
)


## 3) Materialize RL tables (Tbl1 / Tbl2 / Tbl3)

In [None]:

# Import through the same top-level package name as the earlier cells
# (`nasa_data_extract`, importable because `<repo>/src` was put on sys.path).
# The former `src.nasa_data_extract` spelling only resolves when the repo root
# itself happens to be on sys.path, and it loads the package a second time
# under a different module name.
from nasa_data_extract import make_tables as mt

# Build the three RL tables from the processed CSV.
tbl_outdir = processed_dir
tbl1 = mt.make_tbl1(csv_path)
tbl2 = mt.make_tbl2(tbl1, soc_target=0.8)
tbl3 = mt.make_tbl3_from_defaults(tbl1)

# Output locations, all inside the processed directory.
tbl1_path = tbl_outdir / 'Tbl1_signals.csv'
tbl2_path = tbl_outdir / 'Tbl2_episodes.csv'
tbl3_path = tbl_outdir / 'Tbl3_metadata.csv'

# Persist without the pandas index column.
tbl1.to_csv(tbl1_path, index=False)
tbl2.to_csv(tbl2_path, index=False)
tbl3.to_csv(tbl3_path, index=False)

# Cell output: the shape of each table.
tbl1.shape, tbl2.shape, tbl3.shape


In [None]:

# For each RL table: print where it was written, then show its first rows.
print(f'Tbl1_signals.csv -> {tbl1_path}')
display(tbl1.head())

print(f'\nTbl2_episodes.csv -> {tbl2_path}')
display(tbl2.head())

print(f'\nTbl3_metadata.csv -> {tbl3_path}')
display(tbl3.head())


## 4) Quick EDA (figures saved to `data/processed/figures/`)

In [None]:

# Import through `nasa_data_extract` (the name made importable by putting
# `<repo>/src` on sys.path) so this cell matches the earlier cells and does not
# depend on the repo root being on sys.path as well.
from nasa_data_extract.eda import eda_plots

fig_dir = processed_dir / 'figures'
eda_plots(csv_path, fig_dir)
# Cell output: names of the PNG files now present in the figures directory.
sorted(p.name for p in fig_dir.glob('*.png'))


## 5) High-level stats & metadata summary

In [None]:

# Headline statistics of the processed table, cast to plain Python numbers.
summary = dict(
    rows=int(df.shape[0]),
    cols=int(df.shape[1]),
    batteries=int(df['battery_id'].nunique()),
    subsets=int(df['subset'].nunique()),
    cycles=int(df['cycle_id'].nunique()),
    crate_min=float(df['C_rate'].min()),
    crate_max=float(df['C_rate'].max()),
    soc_min=float(df['SoC'].min()),
    soc_max=float(df['SoC'].max()),
    temp_min_C=float(df['T_cell'].min()),
    temp_max_C=float(df['T_cell'].max()),
)
summary
