# Train Random Forest Model (Point Prediction)

This notebook trains a Random Forest model to predict battery remaining useful life (RUL), measured in cycles.

**Phase 1**: Point prediction only - no uncertainty quantification.


In [None]:
import os
import sys
from pathlib import Path

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Resolve the project root. The BATTERY_RUL_ROOT environment variable, when
# set, overrides the hard-coded default below so the notebook can run from a
# different checkout location without editing this cell.
project_root = Path(
    os.environ.get("BATTERY_RUL_ROOT", "/Users/siddhantaggarwal/Desktop/Battery_RUL")
).expanduser().resolve()

# Add project root to path; skip the append when this cell is re-run so
# sys.path does not accumulate duplicate entries.
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

# Set style
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")


In [None]:
# Read the EMD-augmented feature table from <project_root>/data/processed
processed_dir = project_root.joinpath("data", "processed")
df = pd.read_parquet(processed_dir.joinpath("rul_features_with_emd.parquet"))

print("Dataset shape:", df.shape)
print("\nSplits distribution:")
print(df['split'].value_counts().sort_index())

# Keep only rows that have an RUL label; rows with NaN RUL are dropped
# (batteries that didn't reach EOL)
df_clean = df.dropna(subset=['RUL']).copy()
print(f"\nAfter removing NaN RUL: {len(df_clean)} rows")
rul_min = df_clean['RUL'].min()
rul_max = df_clean['RUL'].max()
print(f"RUL range: [{rul_min:.1f}, {rul_max:.1f}] cycles")


In [None]:
# Build the feature matrix and the RUL target.
# Every column listed here is left out of the model inputs.
exclude_cols = [
    'battery_id', 'filename', 'type', 'start_time', 'test_id', 'uid',
    'split', 'cycle_index', 'EOL_cycle', 'RUL', 'SOH', 'Capacity',
    'Re', 'Rct', 'ambient_temperature',
]

# Remaining columns, in their original DataFrame order, are the features
excluded = set(exclude_cols)
feature_cols = [col for col in df_clean.columns if col not in excluded]

# Partition the features by name: anything containing "_imf"
# (case-insensitive) is counted as an EMD feature, the rest as statistical.
emd_features, stat_features = [], []
for col in feature_cols:
    if '_imf' in col.lower():
        emd_features.append(col)
    else:
        stat_features.append(col)

print(f"Total features: {len(feature_cols)}")
print("\nFeature categories:")
print(f"  - Statistical features: {len(stat_features)}")
print(f"  - EMD features: {len(emd_features)}")

# Feature matrix (missing values replaced by 0) and target vector
X = df_clean.loc[:, feature_cols].fillna(0)
y = df_clean['RUL'].values

# Boolean row masks taken from the precomputed 'split' column
train_idx = df_clean['split'].eq('train')
val_idx = df_clean['split'].eq('val')
test_idx = df_clean['split'].eq('test')

X_train = X.loc[train_idx]
y_train = y[train_idx]
X_val = X.loc[val_idx]
y_val = y[val_idx]
X_test = X.loc[test_idx]
y_test = y[test_idx]

print(f"\nTrain: {len(X_train)}, Val: {len(X_val)}, Test: {len(X_test)}")
print(f"Train RUL stats: mean={y_train.mean():.2f}, std={y_train.std():.2f}")


In [None]:
# Train Random Forest for point prediction
print("Training Random Forest model...")
rf_model = RandomForestRegressor(
    n_estimators=100,
    max_depth=20,
    min_samples_split=5,
    min_samples_leaf=2,
    random_state=42,  # fixed seed so repeated runs build the same forest
    n_jobs=-1,        # fit and predict in parallel on all available cores
    verbose=1,
)

rf_model.fit(X_train, y_train)
print("✅ Model trained!")

# Point predictions
y_train_pred = rf_model.predict(X_train)
y_test_pred = rf_model.predict(X_test)

# Handle validation set (may be empty after filtering): predict() is only
# called when there are rows, otherwise an empty array stands in for the
# predictions so later code can still reference y_val_pred.
if len(X_val) > 0:
    y_val_pred = rf_model.predict(X_val)
    print(f"✅ Validation set has {len(X_val)} samples")
else:
    print("⚠️  Validation set is empty after filtering NaN RUL values")
    print("   This happens when validation batteries didn't reach EOL (SOH <= 0.8)")
    print("   This is acceptable - we'll use train/test splits only")
    y_val_pred = np.array([])

# Metrics calculation function
def calculate_metrics(y_true, y_pred, name):
    """Print and return point-prediction regression metrics for one split.

    Args:
        y_true: Array-like of actual RUL values.
        y_pred: Array-like of predicted RUL values, aligned with ``y_true``.
        name: Label for the split; used only in the printed header.

    Returns:
        Dict with keys ``'mae'``, ``'rmse'``, ``'r2'`` and ``'mape'``.
        Every value is NaN when ``y_pred`` is empty. ``'mape'`` (in percent)
        is computed over samples whose actual value is non-zero and is NaN
        when there are no such samples.
    """
    if len(y_pred) == 0:
        print(f"\n{name} Metrics: No data available (skipped)")
        return {'mae': np.nan, 'rmse': np.nan, 'r2': np.nan, 'mape': np.nan}

    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    mae = mean_absolute_error(y_true, y_pred)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    r2 = r2_score(y_true, y_pred)

    # Percentage error is undefined where the actual value is 0. Dividing by
    # a tiny epsilon instead would turn each such sample into an error of
    # order 1e8 % and swamp the mean, so those samples are left out.
    nonzero = y_true != 0
    if nonzero.any():
        relative_error = (y_true[nonzero] - y_pred[nonzero]) / y_true[nonzero]
        mape = np.mean(np.abs(relative_error)) * 100
    else:
        mape = np.nan

    print(f"\n{name} Metrics:")
    print(f"  MAE:  {mae:.2f} cycles")
    print(f"  RMSE: {rmse:.2f} cycles")
    print(f"  R²:   {r2:.3f}")
    print(f"  MAPE: {mape:.2f}%")
    return {'mae': mae, 'rmse': rmse, 'r2': r2, 'mape': mape}

# Calculate metrics for each split
train_metrics = calculate_metrics(y_train, y_train_pred, "Train")
if len(X_val) > 0:
    val_metrics = calculate_metrics(y_val, y_val_pred, "Validation")
else:
    # No validation rows: fill in NaN placeholders under the same four keys
    # so val_metrics can be indexed like the train/test dicts.
    val_metrics = {'mae': np.nan, 'rmse': np.nan, 'r2': np.nan, 'mape': np.nan}
    print("\n⚠️  Skipping validation metrics (empty set)")
test_metrics = calculate_metrics(y_test, y_test_pred, "Test")


In [None]:
# Rank features by the fitted forest's importance scores, highest first
feature_importance = pd.DataFrame({
    'feature': feature_cols,
    'importance': rf_model.feature_importances_,
})
feature_importance = feature_importance.sort_values('importance', ascending=False)

print("Top 15 Most Important Features:")
print(feature_importance.head(15).to_string(index=False))

# Horizontal bar chart of the top-ranked features
top_n = 20
top_features = feature_importance.head(top_n)
bar_positions = range(len(top_features))

plt.figure(figsize=(10, 8))
ax = plt.gca()
ax.barh(bar_positions, top_features['importance'])
ax.set_yticks(bar_positions)
ax.set_yticklabels(top_features['feature'])
ax.set_xlabel('Importance')
ax.set_title(f'Top {top_n} Feature Importances (Random Forest)')
ax.invert_yaxis()  # put the most important feature at the top
plt.tight_layout()
plt.show()


In [None]:
# Visualize predictions vs actual
fig, axes = plt.subplots(1, 2, figsize=(15, 5))

# Plot 1: Predictions vs Actual on a random subsample of at most 200 test
# points. A seeded generator keeps the figure identical across re-runs,
# in line with the fixed random_state used for the model.
rng = np.random.default_rng(42)
sample_idx = rng.choice(len(y_test), size=min(200, len(y_test)), replace=False)
actual_sample = y_test[sample_idx]
predicted_sample = y_test_pred[sample_idx]

axes[0].scatter(actual_sample, predicted_sample,
                alpha=0.5, s=20, label='Predictions')
# Diagonal y = x spanning the sampled actual range: points on this line are
# predicted exactly.
diagonal = [actual_sample.min(), actual_sample.max()]
axes[0].plot(diagonal, diagonal,
             'r--', linewidth=2, label='Perfect Prediction')
axes[0].set_xlabel('Actual RUL')
axes[0].set_ylabel('Predicted RUL')
axes[0].set_title('Predictions vs Actual (Test Set)')
axes[0].legend()
axes[0].grid(True, alpha=0.3)

# Plot 2: Residuals plot over the full test set
residuals = y_test - y_test_pred
axes[1].scatter(y_test_pred, residuals, alpha=0.5)
axes[1].axhline(y=0, color='r', linestyle='--')
axes[1].set_xlabel('Predicted RUL')
axes[1].set_ylabel('Residual (Actual - Predicted)')
axes[1].set_title('Residuals Plot')
axes[1].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()


In [None]:
# Save model and results under <project_root>/results/models
models_dir = project_root / "results" / "models"
models_dir.mkdir(parents=True, exist_ok=True)

# Save the fitted model
model_path = models_dir / "random_forest_rul_point_model.pkl"
joblib.dump(rf_model, model_path)
print(f"✅ Saved model: {model_path}")

# Save test-set predictions next to their battery / cycle identifiers
results = pd.DataFrame({
    'battery_id': df_clean.loc[test_idx, 'battery_id'].values,
    'cycle_index': df_clean.loc[test_idx, 'cycle_index'].values,
    'actual_rul': y_test,
    'predicted_rul': y_test_pred,
})

results_path = models_dir / "rf_predictions_point.csv"
results.to_csv(results_path, index=False)
print(f"✅ Saved predictions: {results_path}")

# Save metrics: one row per metric, one column per split. metric_keys fixes
# the row order so the labels and the three value columns stay aligned.
metric_keys = ['mae', 'rmse', 'r2', 'mape']
metrics = pd.DataFrame({
    'metric': ['MAE', 'RMSE', 'R²', 'MAPE'],
    'train': [train_metrics[key] for key in metric_keys],
    'val': [val_metrics[key] for key in metric_keys],
    'test': [test_metrics[key] for key in metric_keys],
})

metrics_path = models_dir / "rf_metrics_point.csv"
metrics.to_csv(metrics_path, index=False)
print(f"✅ Saved metrics: {metrics_path}")

print("\n📊 Summary:")
print(f"   Test MAE: {test_metrics['mae']:.2f} cycles")
print(f"   Test RMSE: {test_metrics['rmse']:.2f} cycles")
print(f"   Test R²: {test_metrics['r2']:.3f}")
print("\n✅ Phase 1 Complete: Random Forest point prediction model saved!")
print("   Next: Train LSTM and Transformer models, then compare all 3.")
