In [6]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import warnings
warnings.filterwarnings('ignore')

# Load the final, cleaned dataset
df = pd.read_csv('../data/processed/merged_battery_degradation.csv')

# Ensure the data is sorted by battery and cycle
df = df.sort_values(['battery_id', 'cycle_index']).reset_index(drop=True)

print(f"Dataset shape: {df.shape}")
print(f"Batteries: {df['battery_id'].nunique()}")
print(f"Datasets: {df['dataset'].unique()}")

Dataset shape: (4870, 10)
Batteries: 42
Datasets: ['NASA' 'Oxford']


In [7]:
df.columns

Index(['battery_id', 'cycle_index', 'capacity', 'voltage', 'current',
       'temperature', 'time', 'norm_capacity', 'RUL', 'dataset'],
      dtype='object')

In [8]:
# Select features
feature_columns = ['norm_capacity', 'voltage', 'current', 'temperature', 'time']
target_column = 'RUL'

# Handle NaN values in features (e.g., from charge cycles)
df[feature_columns] = df[feature_columns].fillna(method='ffill')  # Forward-fill
df = df.dropna(subset=feature_columns + [target_column])  # Drop any remaining NaN

In [9]:
scaler_X = MinMaxScaler()
scaler_y = MinMaxScaler()

# Scale features
X_scaled = scaler_X.fit_transform(df[feature_columns])

# Scale target (RUL)
y_scaled = scaler_y.fit_transform(df[[target_column]])

In [10]:
def create_sequences(data, target, window_size=10):
    X, y = [], []
    for i in range(window_size, len(data)):
        X.append(data[i-window_size:i])  # Sequence of past `window_size` cycles
        y.append(target[i])              # RUL at current cycle
    return np.array(X), np.array(y)

# Define window size (e.g., last 10 cycles)
window_size = 15

# Create sequences
X_seq, y_seq = create_sequences(X_scaled, y_scaled, window_size)

print(f"Sequence shape: {X_seq.shape}")  # Should be (num_sequences, window_size, num_features)
print(f"Target shape: {y_seq.shape}")    # Should be (num_sequences, 1)

Sequence shape: (4855, 15, 5)
Target shape: (4855, 1)


In [11]:
from sklearn.model_selection import train_test_split

# Get unique battery IDs
battery_ids = df['battery_id'].unique()

# Split batteries into train and test
train_batteries, test_batteries = train_test_split(battery_ids, test_size=0.2, random_state=42)

# Create train/test masks
train_mask = df['battery_id'].isin(train_batteries)
test_mask = df['battery_id'].isin(test_batteries)

# Create sequences for train and test
X_train, y_train = create_sequences(X_scaled[train_mask], y_scaled[train_mask], window_size)
X_test, y_test = create_sequences(X_scaled[test_mask], y_scaled[test_mask], window_size)

print(f"Train sequences: {X_train.shape[0]}")
print(f"Test sequences: {X_test.shape[0]}")

Train sequences: 4116
Test sequences: 724


In [13]:
np.save('../data/processed/X_train.npy', X_train)
np.save('../data/processed/y_train.npy', y_train)
np.save('../data/processed/X_test.npy', X_test)
np.save('../data/processed/y_test.npy', y_test)

# Save scalers for later use
import joblib
joblib.dump(scaler_X, '../saved_models/scaler_X.pkl')
joblib.dump(scaler_y, '../saved_models/scaler_y.pkl')

print("✅ Processed data saved for modeling.")

✅ Processed data saved for modeling.
