# Extract EMD Features for All Cycles

This notebook extends the existing dataset with EMD features extracted from voltage, current, and temperature signals.


In [1]:
import sys
from pathlib import Path
import pandas as pd
import numpy as np
from tqdm import tqdm

# Add src to path
project_root = Path("/Users/siddhantaggarwal/Desktop/Battery_RUL").resolve()
sys.path.append(str(project_root))

from src.features import extract_features_from_file


In [2]:
# Load existing dataset
processed_dir = project_root / "data" / "processed"
df_existing = pd.read_csv(processed_dir / "rul_features.csv")

print(f"Existing dataset: {len(df_existing)} rows")
print(f"Existing features: {len([c for c in df_existing.columns if c not in ['battery_id', 'filename', 'type', 'start_time', 'test_id', 'uid', 'split', 'cycle_index', 'EOL_cycle', 'RUL', 'SOH', 'Capacity', 'Re', 'Rct', 'ambient_temperature']])} features")


Existing dataset: 2750 rows
Existing features: 8 features


In [3]:
# Extract EMD features for all cycles
data_dir = project_root / "cleaned_dataset" / "data"
records = []

print("Extracting EMD features...")
for idx, row in tqdm(df_existing.iterrows(), total=len(df_existing), desc="Processing cycles"):
    filename = row['filename']
    file_path = data_dir / filename
    
    if file_path.exists():
        features = extract_features_from_file(file_path, include_emd=True, max_imfs=5)
        features['filename'] = filename
        records.append(features)
    else:
        print(f"Warning: File not found: {filename}")

feat_df = pd.DataFrame.from_records(records)
print(f"\nExtracted features from {len(feat_df)} cycles")
print(f"Total features: {len(feat_df.columns) - 1}")


Extracting EMD features...


Processing cycles: 100%|██████████| 2750/2750 [00:41<00:00, 65.79it/s] 


Extracted features from 2750 cycles
Total features: 175





In [4]:
# Merge with existing dataset (keep all original columns, add new EMD features)
# Identify overlapping columns
original_feature_cols = ['duration_s', 'voltage_mean', 'voltage_min', 'voltage_max', 
                         'temp_max', 'current_mean_abs', 'coulomb_Ah', 'ir_drop_proxy']

df_enhanced = df_existing.copy()

# Only add EMD features and new statistical features that don't exist
for col in feat_df.columns:
    if col == 'filename':
        continue
    if col not in df_enhanced.columns:
        df_enhanced = df_enhanced.merge(
            feat_df[['filename', col]], 
            on='filename', 
            how='left'
        )

print(f"Enhanced dataset: {len(df_enhanced)} rows")
print(f"Total columns: {len(df_enhanced.columns)}")

# Save enhanced dataset
output_path = processed_dir / "rul_features_with_emd.parquet"
df_enhanced.to_parquet(output_path, index=False)
print(f"\n✅ Saved enhanced dataset: {output_path}")


Enhanced dataset: 2750 rows
Total columns: 190

✅ Saved enhanced dataset: /Users/siddhantaggarwal/Desktop/Battery_RUL/data/processed/rul_features_with_emd.parquet


In [5]:
# Show feature summary
print("\nFeature Summary:")
emd_features = [c for c in df_enhanced.columns if '_imf' in c.lower()]
stat_features = [c for c in df_enhanced.columns if any(x in c for x in ['voltage', 'current', 'temp', 'duration', 'coulomb', 'ir'])]

print(f"EMD features: {len(emd_features)}")
print(f"Statistical features: {len(stat_features)}")
print(f"\nSample EMD features: {emd_features[:10] if emd_features else 'None found'}")
print(f"\nDataset ready for modeling! Total features: {len(df_enhanced.columns)}")



Feature Summary:
EMD features: 159
Statistical features: 176

Sample EMD features: ['voltage_imf1_mean', 'voltage_imf1_std', 'voltage_imf1_energy', 'voltage_imf1_max', 'voltage_imf1_min', 'voltage_imf1_skewness', 'voltage_imf1_kurtosis', 'voltage_imf1_energy_ratio', 'current_imf1_mean', 'current_imf1_std']

Dataset ready for modeling! Total features: 190
