# Feature Engineering for Battery Performance Prediction

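This notebook works from the `*_processed.csv` files in `../data/processed` (written by the previous notebook) and demonstrates: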
- Statistical feature extraction
- Time-series feature engineering
- CyclePatch framework implementation
- Feature selection and importance analysis

In [None]:
# Setup
import sys
import warnings
from pathlib import Path

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import signal, stats
from sklearn.feature_selection import mutual_info_regression
from sklearn.preprocessing import StandardScaler

warnings.filterwarnings('ignore')

# Project imports (the parent directory, which holds `src`, must be on sys.path first)
sys.path.append('..')
from src.data.loader import BatteryDataLoader
from src.features.extractor import BatteryFeatureExtractor, FeatureEngineering
from src.data.cyclepatch import CyclePatchConfig, CyclePatchTokenizer

# Render figures inline in the notebook output.
%matplotlib inline
# NOTE(review): the 'seaborn-v0_8-*' style names are only available in newer
# matplotlib releases (3.6+, as far as I know) — confirm against the pinned version.
plt.style.use('seaborn-v0_8-darkgrid')

## 1. Load Processed Data

In [None]:
# Load processed data from previous notebook.
# The matches are sorted because Path.glob yields files in arbitrary,
# filesystem-dependent order; without sorting, the "sample" battery could
# differ between runs or machines.
data_dir = Path('../data/processed')
battery_files = sorted(data_dir.glob('*_processed.csv'))

print(f"Found {len(battery_files)} processed battery files")

# Fail fast with an actionable message: every later cell needs
# `sample_battery`, so skipping the load would only defer the failure to an
# unrelated-looking NameError further down.
if not battery_files:
    raise FileNotFoundError(
        f"No '*_processed.csv' files found in {data_dir.resolve()}; "
        "run the preprocessing notebook first."
    )

# Load a sample battery (the first file in sorted order)
sample_battery = pd.read_csv(battery_files[0])
battery_id = battery_files[0].stem.replace('_processed', '')
print(f"\nLoaded battery: {battery_id}")
print(f"Shape: {sample_battery.shape}")

# The preview must be the cell's last top-level expression to be rendered;
# nested inside an `if` block it is evaluated and silently discarded.
sample_battery.head()

## 2. Statistical Feature Extraction

In [None]:
# Extractor shared by the rest of this section, configured with a 10-cycle window
extractor = BatteryFeatureExtractor(window_size=10)

# Summarise the first ten capacity readings of the sample battery
window_data = sample_battery['capacity'].head(10)
stat_features = extractor.extract_statistical_features(window_data)

# One line per feature, four decimal places
print("Statistical features extracted:")
for stat_name in stat_features:
    print(f"  {stat_name}: {stat_features[stat_name]:.4f}")

In [None]:
# Fit trend features on the first 50 cycles of the sample battery
cycles = sample_battery['cycle'].to_numpy()[:50]
capacities = sample_battery['capacity'].to_numpy()[:50]

trend_features = extractor.extract_trend_features(cycles, capacities)

print("Trend features:")
for trend_name, trend_value in trend_features.items():
    print(f"  {trend_name}: {trend_value:.6f}")

# Overlay both fitted curves on the measured capacities
plt.figure(figsize=(10, 6))
plt.scatter(cycles, capacities, alpha=0.6, label='Actual')

# First-order fit: slope * cycle + intercept
slope = trend_features['linear_slope']
intercept = trend_features['linear_intercept']
linear_fit = slope * cycles + intercept
plt.plot(cycles, linear_fit, 'r--', label='Linear Fit', linewidth=2)

# Second-order fit: a * cycle^2 + b * cycle + c
quad_a = trend_features['poly2_a']
quad_b = trend_features['poly2_b']
quad_c = trend_features['poly2_c']
poly_fit = quad_a * cycles**2 + quad_b * cycles + quad_c
plt.plot(cycles, poly_fit, 'g--', label='Polynomial Fit', linewidth=2)

plt.xlabel('Cycle Number')
plt.ylabel('Capacity (Ah)')
plt.title('Capacity Trend Analysis')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()

In [None]:
# Degradation features are computed over the full capacity history
deg_features = extractor.extract_degradation_features(sample_battery['capacity'])

print("Degradation features:")
for deg_name in deg_features:
    print(f"  {deg_name}: {deg_features[deg_name]:.4f}")

# Mark the detected knee on the capacity curve.
# NOTE(review): the knee value is drawn directly on the cycle-number axis, so
# it is assumed to be expressed in cycle numbers rather than row positions — confirm.
knee = deg_features['degradation_knee']
knee_label = f"Knee Point (Cycle {int(knee)})"

plt.figure(figsize=(10, 6))
plt.plot(sample_battery['cycle'], sample_battery['capacity'], 'b-', linewidth=2)
plt.axvline(x=knee, color='r', linestyle='--', label=knee_label)
plt.xlabel('Cycle Number')
plt.ylabel('Capacity (Ah)')
plt.title('Degradation Knee Detection')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()

## 3. Multi-Window Feature Engineering

In [None]:
# Engineer features for the sample battery at three window sizes
fe = FeatureEngineering(window_sizes=[5, 10, 20])
engineered_features = fe.engineer_features(sample_battery)

print(f"Total features extracted: {len(engineered_features.columns)}")
print(f"Sample shape: {engineered_features.shape}")
print("\nFeature categories:")

# Tally columns by the text before their first underscore; 'cycle' is skipped
feature_categories = {}
for col in engineered_features.columns:
    if col != 'cycle':
        prefix = col.split('_')[0]
        feature_categories.setdefault(prefix, 0)
        feature_categories[prefix] += 1

# Largest categories first; the stable sort keeps ties in first-seen order
ranked_categories = sorted(feature_categories.items(), key=lambda item: -item[1])
for prefix, n_features in ranked_categories:
    print(f"  {prefix}: {n_features} features")

In [None]:
# One panel per key feature, plotted against cycle number
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
axes = axes.ravel()

# Key features to visualize (10-cycle window)
features_to_plot = [
    'window10_capacity_mean',
    'window10_capacity_std',
    'window10_trend_linear_slope',
    'window10_degradation_capacity_fade_rate'
]

for idx, feature in enumerate(features_to_plot):
    # A feature missing from the engineered frame leaves its panel empty
    if feature not in engineered_features.columns:
        continue
    ax = axes[idx]
    ax.plot(engineered_features['cycle'], engineered_features[feature],
            linewidth=2, alpha=0.8)
    ax.set(
        xlabel='Cycle Number',
        ylabel=feature.replace('_', ' ').title(),
        title=f'Evolution of {feature}',
    )
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 4. CyclePatch Feature Extraction

In [None]:
# CyclePatch configuration: 10-cycle patches with stride 5, built from six per-cycle columns
cp_config = CyclePatchConfig(
    patch_size=10,
    stride=5,
    embed_dim=128,
    features=[
        'capacity',
        'voltage_mean',
        'current_mean',
        'temperature_mean',
        'soh',
        'capacity_fade',
    ],
)

# Tokenizer built from that configuration
tokenizer = CyclePatchTokenizer(cp_config)

# Take the configured columns as an array and cut it into patches
cycle_features = sample_battery[cp_config.features].to_numpy()
patches = tokenizer.create_patches(cycle_features)

print(f"Original data shape: {cycle_features.shape}")
print(f"Patches shape: {patches.shape}")
# Report each of the first three axes of the patch array under its label
for axis_label, axis in (("Number of patches", 0),
                         ("Patch size", 1),
                         ("Features per cycle", 2)):
    print(f"{axis_label}: {patches.shape[axis]}")

In [None]:
# One panel per configured feature, each showing the first few patches
fig, axes = plt.subplots(3, 2, figsize=(12, 12))
axes = axes.ravel()

# Pairing features with the six axes caps the loop at six panels
for feature_idx, (ax, feature_name) in enumerate(zip(axes, cp_config.features)):
    # Plot first 5 patches (fewer if fewer exist)
    for patch_idx, patch in enumerate(patches[:5]):
        ax.plot(patch[:, feature_idx], label=f'Patch {patch_idx}',
                alpha=0.7, linewidth=2)

    ax.set_xlabel('Position in Patch')
    ax.set_ylabel(feature_name.replace('_', ' ').title())
    ax.set_title(f'{feature_name} Across Patches')
    ax.grid(True, alpha=0.3)
    # The legend is shown on the first panel only
    if feature_idx == 0:
        ax.legend()

plt.tight_layout()
plt.show()

In [None]:
# One positional-encoding row is requested per patch
n_positions = len(patches)
pos_encoding = tokenizer.create_positional_encoding(n_positions)

# Heatmap of the transposed encoding: patch position on x, embedding dimension on y
encoding_image = pos_encoding.T
plt.figure(figsize=(12, 6))
plt.imshow(encoding_image, aspect='auto', cmap='coolwarm')
plt.colorbar(label='Encoding Value')
plt.xlabel('Patch Position')
plt.ylabel('Embedding Dimension')
plt.title('CyclePatch Positional Encoding')
plt.show()

print(f"Positional encoding shape: {pos_encoding.shape}")

## 5. Feature Importance Analysis

In [None]:
# Prepare data for feature importance.
# Identifier and target columns are excluded by name so the targets cannot
# appear among the predictors.
excluded_cols = {'cycle', 'rul_current', 'soh_current',
                 'capacity_current', 'battery_id'}
feature_cols = [col for col in engineered_features.columns
                if col not in excluded_cols]

X = engineered_features[feature_cols].fillna(0)
# `.ffill()` replaces the deprecated `fillna(method='ffill')` spelling.
y_capacity = engineered_features['capacity_current'].ffill()

# Forward-filling cannot repair missing values at the start of the series, and
# mutual_info_regression rejects NaN targets, so rows still lacking a target
# are dropped from both X and y.
has_target = y_capacity.notna()
X = X.loc[has_target]
y_capacity = y_capacity.loc[has_target]

# Calculate mutual information (seeded so the estimate is reproducible)
mi_scores = mutual_info_regression(X, y_capacity, random_state=42)

# Create feature importance dataframe, most informative first
feature_importance = pd.DataFrame({
    'feature': feature_cols,
    'importance': mi_scores
}).sort_values('importance', ascending=False)

# Plot top features. top_n is capped at the number of features available:
# with fewer than 20, a fixed range(20) would not match the data length
# and barh would raise.
top_n = min(20, len(feature_importance))
top_features = feature_importance.head(top_n)

plt.figure(figsize=(10, 8))
plt.barh(range(top_n), top_features['importance'].values)
plt.yticks(range(top_n), top_features['feature'].values)
# barh draws position 0 at the bottom; invert so the top-ranked feature is on top.
plt.gca().invert_yaxis()
plt.xlabel('Mutual Information Score')
plt.title(f'Top {top_n} Features by Mutual Information')
plt.grid(True, alpha=0.3, axis='x')
plt.tight_layout()
plt.show()

print("Top 10 most important features:")
print(feature_importance.head(10))

In [None]:
# Correlation analysis with target variables
target_cols = ['capacity_current', 'soh_current', 'rul_current']
available_targets = [col for col in target_cols if col in engineered_features.columns]

if available_targets:
    # Correlate every feature with each target that has at least one
    # observed value; all-NaN targets are skipped.
    correlations = {}
    for target in available_targets:
        target_series = engineered_features[target]
        if target_series.notna().any():
            # `.ffill()` replaces the deprecated `fillna(method='ffill')` spelling.
            correlations[target] = engineered_features[feature_cols].corrwith(
                target_series.ffill()
            )

    if correlations:
        # Restrict the heatmap to the 15 features ranked highest by mutual information
        top_corr_features = feature_importance.head(15)['feature'].values

        # Columns are exactly the targets that were correlated. Selecting
        # `available_targets` instead would raise KeyError whenever an
        # all-NaN target had been skipped above.
        corr_matrix = pd.DataFrame(correlations).loc[top_corr_features]

        plt.figure(figsize=(8, 10))
        sns.heatmap(corr_matrix, annot=True, fmt='.3f', cmap='coolwarm',
                    center=0, cbar_kws={'label': 'Correlation'})
        plt.title('Feature Correlations with Target Variables')
        plt.tight_layout()
        plt.show()
    else:
        print("No target column has any non-missing values; skipping correlation heatmap.")

## 6. Feature Engineering for Multiple Batteries

In [None]:
# Engineer features for at most the first five battery files
max_batteries = min(5, len(battery_files))
all_features = []

for csv_path in battery_files[:max_batteries]:
    # The battery id is the file stem without its '_processed' suffix
    battery_id = csv_path.stem.replace('_processed', '')
    battery_data = pd.read_csv(csv_path)

    # Same multi-window pipeline as the sample battery, tagged with its source
    features = fe.engineer_features(battery_data)
    features['battery_id'] = battery_id
    all_features.append(features)

    n_cycles, n_columns = features.shape
    print(f"Processed {battery_id}: {n_cycles} cycles, {n_columns} features")

# Stack the per-battery frames into one table with a fresh index
combined_features = pd.concat(all_features, ignore_index=True)
print(f"\nCombined dataset: {combined_features.shape}")

In [None]:
# Per-battery mean and standard deviation of every feature column
feature_stats = combined_features.groupby('battery_id')[feature_cols].agg(['mean', 'std'])

# Bar charts (mean with std error bars) for a few representative features
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
axes = axes.ravel()

repr_features = [
    'window10_capacity_mean',
    'window10_voltage_mean',
    'window10_temperature_mean',
    'window10_trend_linear_slope'
]

for idx, feature in enumerate(repr_features):
    # A feature with no statistics leaves its panel empty
    if (feature, 'mean') not in feature_stats.columns:
        continue

    ax = axes[idx]
    means = feature_stats[(feature, 'mean')]
    stds = feature_stats[(feature, 'std')]

    # One bar per battery, labelled with the battery id
    x = range(len(means))
    ax.bar(x, means.values, yerr=stds.values, capsize=5, alpha=0.7)
    ax.set_xticks(x)
    ax.set_xticklabels(means.index, rotation=45)
    ax.set(
        ylabel=feature.replace('_', ' ').title(),
        title=f'{feature} Across Batteries',
    )
    ax.grid(True, alpha=0.3, axis='y')

plt.tight_layout()
plt.show()

## 7. Feature Normalization and Preprocessing

In [None]:
# Standardise every feature column; the scaler is fitted on the whole combined dataset
scaler = StandardScaler()
normalized_features = combined_features.copy()
normalized_features[feature_cols] = scaler.fit_transform(combined_features[feature_cols])

# Overlay original and normalized histograms for a few features
fig, axes = plt.subplots(2, 2, figsize=(12, 8))
axes = axes.ravel()

sample_features = ['window10_capacity_mean', 'window10_voltage_mean',
                   'window10_temperature_mean', 'window10_trend_linear_slope']

for idx, feature in enumerate(sample_features):
    # A feature missing from the combined frame leaves its panel empty
    if feature not in combined_features.columns:
        continue

    ax = axes[idx]

    # Original first, then normalized, with identical binning options
    for series_label, frame in (('Original', combined_features),
                                ('Normalized', normalized_features)):
        ax.hist(frame[feature], bins=30, alpha=0.5,
                label=series_label, density=True)

    ax.set(
        xlabel=feature.replace('_', ' ').title(),
        ylabel='Density',
        title=f'{feature} Distribution',
    )
    ax.legend()
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
# Save engineered features
output_dir = Path('../data/features')
output_dir.mkdir(exist_ok=True, parents=True)

# Save individual battery features. A single groupby pass replaces one
# boolean-mask scan of the whole frame per battery; sort=False keeps the
# batteries in first-appearance order.
for battery_id, battery_features in combined_features.groupby('battery_id', sort=False):
    battery_features.to_csv(output_dir / f'{battery_id}_features.csv', index=False)

# Save combined features (the un-normalized values; normalization is
# reproducible from the scaler saved below)
combined_features.to_csv(output_dir / 'all_batteries_features.csv', index=False)

# Save normalization parameters (joblib is imported in the setup cell)
joblib.dump(scaler, output_dir / 'feature_scaler.pkl')

# Save feature names, one per line, in the column order the scaler was fitted on
with open(output_dir / 'feature_names.txt', 'w') as f:
    for feature in feature_cols:
        f.write(f"{feature}\n")

print(f"Features saved to {output_dir}")
print(f"Total features: {len(feature_cols)}")
print(f"Total samples: {len(combined_features)}")

## 8. Summary

### Key Features Engineered:

1. **Statistical Features**:
   - Mean, std, min, max, range
   - Skewness, kurtosis
   - Percentiles (25th, 50th, 75th)

2. **Trend Features**:
   - Linear regression slope and intercept
   - Polynomial coefficients
   - R-squared values

3. **Degradation Features**:
   - Capacity fade rate
   - Fade acceleration
   - Cycles to 80% SOH
   - Degradation knee point

4. **CyclePatch Features**:
   - Tokenized patches
   - Positional encodings
   - Multi-scale representations

### Feature Importance Insights:
- Window-based capacity statistics are most informative
- Trend features capture degradation patterns effectively
- Temperature and voltage features provide complementary information