# Environment Setup

In [1]:
# --- MacOS Validator ---
import sys
sys.path.insert(0, './setup_validation')

from macos_validator import main
results = main(run_benchmark=False)

#  --- Path Setup ---
import torch
from pathlib import Path

project_root = Path.cwd()
sys.path.insert(0, './src')


=== DEPENDENCY STATUS ===
 ✓ h5py                 v3.15.1         HDF5 file format support
 ✓ matplotlib           v3.10.7         Plotting library
 ✓ memory_profiler      v0.61.0         Memory profiling tool
 ✓ numpy                v1.26.4         Numerical computing library
 ✓ pandas               v2.3.3          Data manipulation library
 ✓ scipy                v1.16.3         Scientific computing library
 ✓ seaborn              v0.13.2         Statistical visualization library
 ✓ sklearn              v1.7.2          Scikit-learn ML library
 ✓ torch                v2.9.1          PyTorch deep learning framework
 ✓ torchaudio           v2.9.1          PyTorch audio utilities
 ✓ torchvision          v0.24.1         PyTorch vision utilities
 ✓ tqdm                 v4.67.1         Progress bar library
 ✓ yaml                 v6.0.3          Config manipulation library

=== GPU/ACCELERATOR STATUS ===
Platform: arm64
Apple Silicon: Yes

PyTorch MPS:
 Installed: Yes
 Version: 2.9.1
 MPS 

# Config Loading

In [3]:
from src.utils import load_config
config = load_config("config.yaml")
print (config)

{'data_profile': 'data_final', 'model_profile': 'model_final', 'dataset_name': 'sample', 'paths': {'datasetdir': 'data', 'csv_pattern': '*.csv', 'extracted_datadir': 'data_extracted', 'hdf5_filename': 'data_final_sample.h5', 'checkpoint_dir': 'checkpoints'}, 'dataset': {'rated_capacity_ah': 155.0, 'num_cells': 96, 'voltage_window_mv': [3750, 4050]}, 'processing': {'config_file': '{data_profile}.yaml', 'parallel_batch_size': 6}, 'ocv_calibration': {'min_rest_hours': 1.0, 'max_soc_start': 0.65, 'ocv_table_path': None}, 'charging': {'status_value': 3, 'accept_partial_charges': True, 'full_charge_soc_threshold': 0.95, 'full_charge_voltage_v': 4.24}, 'quality_checks': {'min_samples_in_window': 35, 'max_gap_seconds': 90, 'current_bounds_a': [0, 220], 'current_sign_convention': 'physical', 'voltage_smoothing': 'interpolation'}, 'soh_calculation': {'soh_bounds': [0.35, 1.25], 'min_abs_delta_soc': 0.03}, 'model': {'architecture': 'resnet', 'embed_dim': 128, 'seq_len': 151, 'nhead': 4, 'num_laye

# TESTS

In [4]:
# In main.ipynb
from src.data_extract import extract_all_features
from src.utils import load_config
import pandas as pd

config = load_config("config.yaml")
df = pd.read_csv("data/sample/vin1.csv")  # Use one actual file
features = extract_all_features(df, config)
print(features)

None


In [None]:
from src.data_process import get_vehicle_paths
from src.utils import load_config
from src.data_extract import extract_all_features
import pandas as pd

config = load_config("config.yaml")
csv_dir = Path(config['paths']['datasetdir']) / config['dataset_name']
vehicle_paths = get_vehicle_paths(csv_dir, "*.csv")[:5]  # Only first 5

for path in vehicle_paths:
    print(f"\n{'='*50}")
    print(f"Processing: {path.stem}")
    print('='*50)
    
    df = pd.read_csv(path)
    df['vehicleid'] = path.stem
    
    features = extract_all_features(df, config)
    if features:
        print(f"✅ SUCCESS: {len(features['soh_labels'])} samples")
    else:
        print(f"❌ FAILED: No valid segments")

# Data Process & H5 Packaging

## SAMPLE

In [1]:
# -- Paper Sample Feature Extrction --
from src.data_process import extraction_pipeline

config_path = "config.yaml"
extraction_pipeline(config_path)

  from .autonotebook import tqdm as notebook_tqdm


checkpoints/data_paper_sample_checkpoint.json
Data Profile: data_paper
Dataset: sample
Found 300 vehicle files


Processing batches: 100%|██████████| 50/50 [01:37<00:00,  1.94s/it]


EXTRACTION COMPLETE
Total vehicles processed: 300
Successful extractions: 5
No valid segments: 295
Errors: 0
Total samples extracted: 8
SOH range: 0.518 to 0.832
HDF5 file closed safely





In [2]:
# -- Sample Sample Feature Extrction --
from src.data_process import extraction_pipeline

config_path = "config.yaml"
extraction_pipeline(config_path)

checkpoints/data_sample_sample_checkpoint.json
Data Profile: data_sample
Dataset: sample
Found 300 vehicle files


Processing batches:   0%|          | 0/50 [00:00<?, ?it/s]

Processing batches: 100%|██████████| 50/50 [02:08<00:00,  2.57s/it]


EXTRACTION COMPLETE
Total vehicles processed: 300
Successful extractions: 24
No valid segments: 276
Errors: 0
Total samples extracted: 51
SOH range: 0.308 to 1.210
HDF5 file closed safely





In [6]:
# -- Strict Sample Feature Extrction --
from src.data_process import extraction_pipeline

config = "config.yaml"
extraction_pipeline(config)

checkpoints/data_strict_sample_checkpoint.json
Data Profile: data_strict
Dataset: sample
Found 300 vehicle files


Processing batches: 100%|██████████| 50/50 [01:31<00:00,  1.83s/it]


EXTRACTION COMPLETE
Total vehicles processed: 300
Successful extractions: 1
No valid segments: 299
Errors: 0
Total samples extracted: 2
SOH range: 0.512 to 0.515
HDF5 file closed safely





In [7]:
# -- Lenient Sample Feature Extrction --
from src.data_process import extraction_pipeline

config = "config.yaml"
extraction_pipeline(config)

checkpoints/data_lenient_sample_checkpoint.json
Data Profile: data_lenient
Dataset: sample
Found 300 vehicle files


Processing batches:   0%|          | 0/50 [00:00<?, ?it/s]

Processing batches: 100%|██████████| 50/50 [02:42<00:00,  3.25s/it]


EXTRACTION COMPLETE
Total vehicles processed: 300
Successful extractions: 61
No valid segments: 239
Errors: 0
Total samples extracted: 136
SOH range: 0.206 to 1.280
HDF5 file closed safely





In [8]:
# -- Current Sample Feature Extrction --
from src.data_process import extraction_pipeline

config = "config.yaml"
extraction_pipeline(config)

checkpoints/data_current_sample_checkpoint.json
Data Profile: data_current
Dataset: sample
Found 300 vehicle files


Processing batches: 100%|██████████| 50/50 [01:51<00:00,  2.23s/it]


EXTRACTION COMPLETE
Total vehicles processed: 300
Successful extractions: 12
No valid segments: 288
Errors: 0
Total samples extracted: 22
SOH range: 0.353 to 1.072
HDF5 file closed safely





In [9]:
# -- Voltage Sample Feature Extrction --
from src.data_process import extraction_pipeline

config = "config.yaml"
extraction_pipeline(config)

checkpoints/data_voltage_sample_checkpoint.json
Data Profile: data_voltage
Dataset: sample
Found 300 vehicle files


Processing batches: 100%|██████████| 50/50 [01:57<00:00,  2.34s/it]


EXTRACTION COMPLETE
Total vehicles processed: 300
Successful extractions: 15
No valid segments: 285
Errors: 0
Total samples extracted: 27
SOH range: 0.353 to 1.072
HDF5 file closed safely





In [10]:
# -- Temporal Sample Feature Extrction --
from src.data_process import extraction_pipeline

config = "config.yaml"
extraction_pipeline(config)

checkpoints/data_temporal_sample_checkpoint.json
Data Profile: data_temporal
Dataset: sample
Found 300 vehicle files


Processing batches: 100%|██████████| 50/50 [01:39<00:00,  1.99s/it]


EXTRACTION COMPLETE
Total vehicles processed: 300
Successful extractions: 3
No valid segments: 297
Errors: 0
Total samples extracted: 3
SOH range: 0.408 to 0.457
HDF5 file closed safely





In [2]:
# -- Final Sample Feature Extrction --
from src.data_process import extraction_pipeline

config = "config.yaml"
extraction_pipeline(config)

checkpoints/data_final_sample_checkpoint.json
Data Profile: data_final
Dataset: sample
Found 300 vehicle files


Processing batches: 100%|██████████| 100/100 [01:58<00:00,  1.19s/it]


EXTRACTION COMPLETE
Total vehicles processed: 300
Successful extractions: 15
No valid segments: 285
Errors: 0
Total samples extracted: 22
SOH range: 0.434 to 1.093
HDF5 file closed safely





In [6]:
# -- Final Sample Feature Extrction --
from src.data_process import extraction_pipeline

config = "config.yaml"
extraction_pipeline(config)

checkpoints/data_final_sample_checkpoint.json
Data Profile: data_final
Dataset: sample
Found 300 vehicle files


Processing batches: 100%|██████████| 50/50 [01:30<00:00,  1.81s/it]


EXTRACTION COMPLETE
Total vehicles processed: 300
Successful extractions: 15
No valid segments: 285
Errors: 0
Total samples extracted: 22
SOH range: 0.434 to 1.093
HDF5 file closed safely





In [7]:
# -- Final Sample Feature Extrction --
from src.data_process import extraction_pipeline

config = "config.yaml"
extraction_pipeline(config)

checkpoints/data_final_sample_checkpoint.json
Data Profile: data_final
Dataset: sample
Found 300 vehicle files


Processing batches: 100%|██████████| 38/38 [01:24<00:00,  2.23s/it]


EXTRACTION COMPLETE
Total vehicles processed: 300
Successful extractions: 15
No valid segments: 285
Errors: 0
Total samples extracted: 22
SOH range: 0.434 to 1.093
HDF5 file closed safely





In [9]:
# -- Final Sample Feature Extrction --
from src.data_process import extraction_pipeline

config = "config.yaml"
extraction_pipeline(config)

checkpoints/data_final_sample_checkpoint.json
Data Profile: data_final
Dataset: sample
Found 300 vehicle files


Processing batches: 100%|██████████| 34/34 [01:00<00:00,  1.78s/it]


EXTRACTION COMPLETE
Total vehicles processed: 204
Successful extractions: 7
No valid segments: 197
Errors: 0
Total samples extracted: 22
SOH range: 0.434 to 1.093
HDF5 file closed safely





In [10]:
from pathlib import Path
import h5py

# Adjust this import depending on your package layout:
# - if utils.py is in the same folder as the notebook: from utils import load_config
# - if you use a src package: from src.utils import load_config
from utils import load_config  # change to src.utils if needed

# 1. Load merged config (config.yaml + profile)
config = load_config("config.yaml")

# 2. Resolve HDF5 path exactly as in data_process.py
hdf5_path = Path(config["paths"]["extracted_datadir"]) / config["paths"]["hdf5_filename"]
print("HDF5 path:", hdf5_path.resolve())

if not hdf5_path.exists():
    raise FileNotFoundError(f"HDF5 file not found at {hdf5_path}. Run data_process.py first.")

# 3. Open file and list datasets
with h5py.File(hdf5_path, "r") as f:
    print("\nDatasets in file:")
    for name, ds in f.items():
        print(f"  {name}: shape={ds.shape}, dtype={ds.dtype}")

    # Optional: peek at first element of each dataset for sanity
    print("\nFirst sample shapes / values (truncated):")
    for name, ds in f.items():
        try:
            sample = ds[0]
        except Exception:
            # In case of empty or scalar datasets
            sample = ds[()]
        print(f"  {name}: sample shape={getattr(sample, 'shape', None)}")


HDF5 path: /Users/zarz/Code/FYP_HANYANG/data_extracted/data_final_sample.h5

Datasets in file:
  qhi_sequences: shape=(22, 151), dtype=float32
  scalar_features: shape=(22, 15), dtype=float32
  soh_labels: shape=(22,), dtype=float32
  thi_sequences: shape=(22, 151), dtype=float32
  timestamps_end: shape=(22,), dtype=object
  timestamps_start: shape=(22,), dtype=object
  vehicle_ids: shape=(22,), dtype=object
  voltage_maps: shape=(22, 96, 96), dtype=float32

First sample shapes / values (truncated):
  qhi_sequences: sample shape=(151,)
  scalar_features: sample shape=(15,)
  soh_labels: sample shape=()
  thi_sequences: sample shape=(151,)
  timestamps_end: sample shape=None
  timestamps_start: sample shape=None
  vehicle_ids: sample shape=None
  voltage_maps: sample shape=(96, 96)


# Training

In [2]:
from src.train import model_pipeline

config = "config.yaml"
trained_model, test_results = model_pipeline(config)

Data profile: data_final
Dataset: raw
Model architecture: resnet
Train batches: 1 | Val batches: 1 | Test batches: 1
Device: mps
Training for 5 epochs with patience=5
Device: mps
Model parameters: 6,944,484
LR schedule: 10 epoch warm-up + exponential decay (γ=0.95)


Training:   0%|          | 0/1 [00:00<?, ?it/s]

[DEBUG] voltage_map: [-12.500, -12.497], qhi: [-5.080, 3.744], soh: [0.412, 1.161]


                                                                    

Epoch   1/5 | Train MSE: 0.0849 | Val MSE: 0.0053 | Val MAPE:  10.08% | Val RMSE: 0.0728 | LR: 0.000200
   ✓ Best model saved (MAPE: 10.08%)


Training: 100%|██████████| 1/1 [00:01<00:00,  1.60s/it, loss=0.0754]

[DEBUG] voltage_map: [-12.500, -12.497], qhi: [-5.080, 3.744], soh: [0.412, 1.161]


                                                                    

Epoch   2/5 | Train MSE: 0.0754 | Val MSE: 0.0052 | Val MAPE:   9.90% | Val RMSE: 0.0720 | LR: 0.000300
   ✓ Best model saved (MAPE: 9.90%)


Training: 100%|██████████| 1/1 [00:01<00:00,  1.63s/it, loss=0.0619]

[DEBUG] voltage_map: [-12.500, -12.497], qhi: [-5.080, 3.744], soh: [0.412, 1.161]


                                                                    

Epoch   3/5 | Train MSE: 0.0619 | Val MSE: 0.0050 | Val MAPE:   9.62% | Val RMSE: 0.0707 | LR: 0.000400
   ✓ Best model saved (MAPE: 9.62%)


Training: 100%|██████████| 1/1 [00:01<00:00,  1.70s/it, loss=0.0489]

[DEBUG] voltage_map: [-12.500, -12.497], qhi: [-5.080, 3.744], soh: [0.412, 1.161]


                                                                    

Epoch   4/5 | Train MSE: 0.0489 | Val MSE: 0.0048 | Val MAPE:   9.28% | Val RMSE: 0.0692 | LR: 0.000500
   ✓ Best model saved (MAPE: 9.28%)


Training:   0%|          | 0/1 [00:00<?, ?it/s]

[DEBUG] voltage_map: [-12.500, -12.497], qhi: [-5.080, 3.744], soh: [0.412, 1.161]


                                                                    

Epoch   5/5 | Train MSE: 0.0417 | Val MSE: 0.0046 | Val MAPE:   8.93% | Val RMSE: 0.0676 | LR: 0.000600
   ✓ Best model saved (MAPE: 8.93%)

Training complete. Best validation MAPE: 8.93%


Evaluating: 100%|██████████| 1/1 [00:06<00:00,  6.47s/it]



TEST SET EVALUATION
MSE:  0.069394
MAE:  0.180252
RMSE: 0.263428
MAPE: 21.04%

Average Modality Attention Weights:
  Voltage Map:    0.335
  Sequence (Q/T): 0.327
  Point Features: 0.338

✓ Final model saved to: checkpoints/data_final_raw_final.pt


In [3]:
from src.train import model_pipeline

config = "config.yaml"
trained_model, test_results = model_pipeline(config)

Data profile: data_final
Dataset: raw
Model architecture: transformer
Train batches: 1 | Val batches: 1 | Test batches: 1
Device: mps
Training for 5 epochs with patience=5
Device: mps
Model parameters: 6,931,364
LR schedule: 10 epoch warm-up + exponential decay (γ=0.95)


Training:   0%|          | 0/1 [00:00<?, ?it/s]

[DEBUG] voltage_map: [-12.500, -12.497], qhi: [-5.080, 3.744], soh: [0.412, 1.161]


                                                                    

Epoch   1/5 | Train MSE: 0.0951 | Val MSE: 0.0079 | Val MAPE:  13.46% | Val RMSE: 0.0886 | LR: 0.000200
   ✓ Best model saved (MAPE: 13.46%)


Training:   0%|          | 0/1 [00:00<?, ?it/s]

[DEBUG] voltage_map: [-12.500, -12.497], qhi: [-5.080, 3.744], soh: [0.412, 1.161]


                                                                    

Epoch   2/5 | Train MSE: 0.0815 | Val MSE: 0.0077 | Val MAPE:  13.38% | Val RMSE: 0.0875 | LR: 0.000300
   ✓ Best model saved (MAPE: 13.38%)


Training:   0%|          | 0/1 [00:00<?, ?it/s]

[DEBUG] voltage_map: [-12.500, -12.497], qhi: [-5.080, 3.744], soh: [0.412, 1.161]


                                                                    

Epoch   3/5 | Train MSE: 0.0676 | Val MSE: 0.0061 | Val MAPE:  11.69% | Val RMSE: 0.0780 | LR: 0.000400
   ✓ Best model saved (MAPE: 11.69%)


Training:   0%|          | 0/1 [00:00<?, ?it/s]

[DEBUG] voltage_map: [-12.500, -12.497], qhi: [-5.080, 3.744], soh: [0.412, 1.161]


                                                                    

Epoch   4/5 | Train MSE: 0.0560 | Val MSE: 0.0048 | Val MAPE:  10.17% | Val RMSE: 0.0694 | LR: 0.000500
   ✓ Best model saved (MAPE: 10.17%)


Training:   0%|          | 0/1 [00:00<?, ?it/s]

[DEBUG] voltage_map: [-12.500, -12.497], qhi: [-5.080, 3.744], soh: [0.412, 1.161]


                                                                    

Epoch   5/5 | Train MSE: 0.0391 | Val MSE: 0.0019 | Val MAPE:   4.19% | Val RMSE: 0.0433 | LR: 0.000600
   ✓ Best model saved (MAPE: 4.19%)

Training complete. Best validation MAPE: 4.19%


Evaluating: 100%|██████████| 1/1 [00:06<00:00,  6.57s/it]



TEST SET EVALUATION
MSE:  0.056683
MAE:  0.165852
RMSE: 0.238083
MAPE: 19.70%

Average Modality Attention Weights:
  Voltage Map:    0.328
  Sequence (Q/T): 0.399
  Point Features: 0.273

✓ Final model saved to: checkpoints/data_final_raw_final.pt
