# **Preprocessing**

In [7]:
# preprocess_rul_data.py
import pandas as pd
import numpy as np

# Step 1: Load and label columns
df = pd.read_csv("train_FD001.txt", sep="\s+", header=None)
cols = ['unit_number', 'time_in_cycles'] + \
       [f'operational_setting_{i}' for i in range(1, 4)] + \
       [f'sensor_measurement_{i}' for i in range(1, 22)]
df.columns = cols

# Step 2: Drop irrelevant or constant sensors
drop_sensors = ['sensor_measurement_1', 'sensor_measurement_5', 'sensor_measurement_6',
                'sensor_measurement_10', 'sensor_measurement_16', 'sensor_measurement_18',
                'sensor_measurement_19']
df.drop(columns=drop_sensors, inplace=True)

# Step 3: Feature engineering (only past and current info used)
for col in df.columns:
    if 'sensor_measurement' in col or 'operational_setting' in col:
        df[f'{col}_roll_mean'] = df.groupby("unit_number")[col].rolling(window=5, min_periods=1).mean().reset_index(level=0, drop=True)
        df[f'{col}_roll_std'] = df.groupby("unit_number")[col].rolling(window=5, min_periods=1).std().reset_index(level=0, drop=True).fillna(0)
        df[f'{col}_delta'] = df.groupby("unit_number")[col].diff().fillna(0)
        df[f'{col}_ema'] = df.groupby("unit_number")[col].ewm(span=5, adjust=False).mean().reset_index(level=0, drop=True)

# Step 4: Normalized cycle position within each unit
df['cycle_norm'] = df.groupby('unit_number')['time_in_cycles'].transform(lambda x: x / x.max())

# Step 5: Safe RUL calculation (based only on each unit's max cycle)
max_cycle = df.groupby('unit_number')['time_in_cycles'].transform('max')
df['RUL'] = max_cycle - df['time_in_cycles']

# Step 6: Reset index and save
df.reset_index(drop=True, inplace=True)
df.to_csv("preprocessed_rul_data.csv", index=False)
print("Leak-free preprocessing complete. Saved as 'preprocessed_rul_data.csv'")


Leak-free preprocessing complete. Saved as 'preprocessed_rul_data.csv'


# **LightGBM**

In [8]:
# train_lightgbm.py
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import joblib

# Load preprocessed data
df = pd.read_csv("preprocessed_rul_data.csv")

# Drop non-feature columns
X = df.drop(columns=["unit_number", "time_in_cycles", "RUL"])
y = df["RUL"]

# Step 1: 70% train_val, 30% holdout (unseen testing)
X_train_val, X_holdout, y_train_val, y_holdout = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Step 2: 80% train, 20% val from train_val
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, test_size=0.2, random_state=42
)

# Step 3: Scale using training set only
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_holdout_scaled = scaler.transform(X_holdout)

# Save scaler and scaled data
joblib.dump(scaler, "scaler.joblib")
joblib.dump(
    (X_train_scaled, X_val_scaled, X_holdout_scaled, y_train, y_val, y_holdout),
    "scaled_data.joblib"
)

# Step 4: Train LightGBM model
model = LGBMRegressor(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=7,
    num_leaves=40,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_lambda=1.0,
    random_state=42
)
model.fit(X_train_scaled, y_train)

# Step 5: Evaluation function
def evaluate(name, y_true, y_pred):
    print(f"\n{name} Evaluation:")
    mae = mean_absolute_error(y_true, y_pred)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    r2 = r2_score(y_true, y_pred)
    print("MAE :", mae)
    print("RMSE:", rmse)
    print("R²  :", r2)

# Evaluate on val and holdout
evaluate("Validation", y_val, model.predict(X_val_scaled))
evaluate("Holdout", y_holdout, model.predict(X_holdout_scaled))

# Save model
joblib.dump(model, "lightgbm_model.joblib")
print(" LightGBM training complete and model saved.")




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.011273 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 17841
[LightGBM] [Info] Number of data points in the train set: 11552, number of used features: 81
[LightGBM] [Info] Start training from score 107.365738





Validation Evaluation:
MAE : 16.367472579488762
RMSE: 23.719706161642765
R²  : 0.8851284919416782

Holdout Evaluation:
MAE : 16.705945777007535
RMSE: 23.734778819029227
R²  : 0.8770101238848215
 LightGBM training complete and model saved.


# **SVR**

In [9]:
# train_svr.py
import joblib
import numpy as np
from sklearn.svm import SVR
from sklearn.utils import resample
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Load scaled data
X_train_scaled, X_val_scaled, X_holdout_scaled, y_train, y_val, y_holdout = joblib.load("scaled_data.joblib")

#  Downsample training data for faster training
X_svr_train, y_svr_train = resample(
    X_train_scaled, y_train, n_samples=5000, random_state=42
)

# Train SVR model
svr_model = SVR(kernel='rbf', C=10, epsilon=0.5, gamma='scale')
svr_model.fit(X_svr_train, y_svr_train)

# Save trained model
joblib.dump(svr_model, "svr_model.joblib")
print("✅ Saved svr_model.joblib")

# Evaluation function
def evaluate(name, X, y):
    preds = svr_model.predict(X)
    mae = mean_absolute_error(y, preds)
    rmse = np.sqrt(mean_squared_error(y, preds))
    r2 = r2_score(y, preds)

    print(f"\n{name} Evaluation:")
    print("MAE :", mae)
    print("RMSE:", rmse)
    print("R²  :", r2)

# Run evaluations
evaluate("Validation", X_val_scaled, y_val)
evaluate("Holdout", X_holdout_scaled, y_holdout)


✅ Saved svr_model.joblib

Validation Evaluation:
MAE : 18.658455347484452
RMSE: 29.87076399050042
R²  : 0.8178260084178834

Holdout Evaluation:
MAE : 18.36718867842253
RMSE: 28.596825742186407
R²  : 0.8214603643808744


# **LSTM**

In [10]:
# train_lstm.py
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np
import pandas as pd
import joblib

# Load scaled and split data
X_train_scaled, X_val_scaled, X_holdout_scaled, y_train, y_val, y_holdout = joblib.load("scaled_data.joblib")

# LSTM Dataset class
class RULDataset(Dataset):
    def __init__(self, X, y, seq_len=30):
        self.X, self.y = self.create_sequences(X, y, seq_len)

    def create_sequences(self, X, y, seq_len):
        X_seq, y_seq = [], []
        for i in range(seq_len, len(X)):
            X_seq.append(X[i-seq_len:i])
            y_seq.append(y[i])
        return torch.tensor(np.array(X_seq), dtype=torch.float32), torch.tensor(np.array(y_seq), dtype=torch.float32)

    def __len__(self): return len(self.y)
    def __getitem__(self, idx): return self.X[idx], self.y[idx]

# LSTM Model definition
class LSTMModel(nn.Module):
    def __init__(self, input_size, hidden_size=100, num_layers=2, dropout=0.3):
        super().__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, dropout=dropout)
        self.fc = nn.Linear(hidden_size, 1)

    def forward(self, x):
        out, _ = self.lstm(x)
        return self.fc(out[:, -1, :]).squeeze()

# Parameters
seq_len = 30
batch_size = 64
input_size = X_train_scaled.shape[1]
epochs = 50
learning_rate = 0.001

# Create Datasets and DataLoaders
train_data = RULDataset(X_train_scaled, y_train.values, seq_len)
val_data = RULDataset(X_val_scaled, y_val.values, seq_len)
holdout_data = RULDataset(X_holdout_scaled, y_holdout.values, seq_len)

train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_data, batch_size=batch_size)
holdout_loader = DataLoader(holdout_data, batch_size=batch_size)

# Initialize model, optimizer, and loss
model = LSTMModel(input_size=input_size)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
criterion = nn.MSELoss()
clip = 1.0

# Train loop
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for X_batch, y_batch in train_loader:
        optimizer.zero_grad()
        y_pred = model(X_batch)
        loss = criterion(y_pred, y_batch)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=clip)
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1}/{epochs} - Loss: {total_loss:.4f}")

# Evaluation function
def evaluate_lstm(name, loader, true_targets):
    model.eval()
    preds, truths = [], []
    with torch.no_grad():
        for X_batch, y_batch in loader:
            y_pred = model(X_batch)
            preds.extend(y_pred.numpy())
            truths.extend(y_batch.numpy())

    # Align predictions with correct true values
    y_true_aligned = true_targets[seq_len:]
    preds = np.array(preds)

    mae = mean_absolute_error(y_true_aligned, preds)
    rmse = np.sqrt(mean_squared_error(y_true_aligned, preds))
    r2 = r2_score(y_true_aligned, preds)

    print(f"\n{name} Evaluation:")
    print("MAE :", mae)
    print("RMSE:", rmse)
    print("R²  :", r2)

    return y_true_aligned, preds

# Evaluate on holdout set
y_true_lstm, y_pred_lstm = evaluate_lstm("LSTM - Holdout", holdout_loader, y_holdout.values)

# Save predictions
df_lstm = pd.DataFrame({
    "actual_rul": y_true_lstm,
    "lstm_pred": y_pred_lstm
})
df_lstm.to_csv("lstm_predictions.csv", index=False)
print("Saved lstm_predictions.csv")


Epoch 1/50 - Loss: 2481134.6167
Epoch 2/50 - Loss: 1893833.7420
Epoch 3/50 - Loss: 1461428.9150
Epoch 4/50 - Loss: 1153478.8125
Epoch 5/50 - Loss: 964416.8005
Epoch 6/50 - Loss: 881626.8779
Epoch 7/50 - Loss: 862814.7745
Epoch 8/50 - Loss: 865057.9695
Epoch 9/50 - Loss: 863732.3225
Epoch 10/50 - Loss: 863816.6177
Epoch 11/50 - Loss: 870302.0479
Epoch 12/50 - Loss: 869233.9849
Epoch 13/50 - Loss: 863483.8407
Epoch 14/50 - Loss: 865564.9456
Epoch 15/50 - Loss: 866980.6169
Epoch 16/50 - Loss: 864093.2809
Epoch 17/50 - Loss: 864183.5002
Epoch 18/50 - Loss: 867052.7122
Epoch 19/50 - Loss: 885862.0837
Epoch 20/50 - Loss: 863890.3835
Epoch 21/50 - Loss: 868563.4490
Epoch 22/50 - Loss: 867370.6729
Epoch 23/50 - Loss: 863647.0927
Epoch 24/50 - Loss: 867700.5894
Epoch 25/50 - Loss: 869276.9131
Epoch 26/50 - Loss: 877620.8743
Epoch 27/50 - Loss: 867364.9268
Epoch 28/50 - Loss: 882456.0007
Epoch 29/50 - Loss: 865543.8765
Epoch 30/50 - Loss: 863228.5024
Epoch 31/50 - Loss: 864441.1157
Epoch 32/50 -

# **Model comparison**

In [13]:
# compare_models.py
import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib

# Load scaled and split data
X_train_scaled, X_val_scaled, X_holdout_scaled, y_train, y_val, y_holdout = joblib.load("scaled_data.joblib")

# Load trained models
lgb_model = joblib.load("lightgbm_model.joblib")
svr_model = joblib.load("svr_model.joblib")

# Predictions from LightGBM and SVM on the full holdout set
y_pred_lgb = lgb_model.predict(X_holdout_scaled)
y_pred_svr = svr_model.predict(X_holdout_scaled)

# Load LSTM predictions — already aligned and stored in CSV
df_lstm = pd.read_csv("lstm_predictions.csv")
y_true_lstm = df_lstm["actual_rul"].values
y_pred_lstm = df_lstm["lstm_pred"].values

# Create evaluation function
def get_metrics(y_true, y_pred):
    return {
        "mae": mean_absolute_error(y_true, y_pred),
        "rmse": np.sqrt(mean_squared_error(y_true, y_pred)),
        "r2": r2_score(y_true, y_pred)
    }

# Collect metrics into a list of dicts
metrics = [
    {"model": "LightGBM", **get_metrics(y_holdout, y_pred_lgb)},
    {"model": "SVM",      **get_metrics(y_holdout, y_pred_svr)},
    {"model": "LSTM",     **get_metrics(y_true_lstm, y_pred_lstm)}
]

# Save results as a CSV
df_compare = pd.DataFrame(metrics)
df_compare.to_csv("model_comparison2.csv", index=False)
print(" Saved model_comparison.csv with performance of LightGBM, SVM, and LSTM.")




✅ Saved model_comparison.csv with performance of LightGBM, SVM, and LSTM.


# **Actual vs Predicted**

In [14]:
# actual_vs_predicted.py
import pandas as pd
import numpy as np
import joblib

# Load scaled holdout set and true RUL values
_, _, X_holdout_scaled, _, _, y_holdout = joblib.load("scaled_data.joblib")

# Load LightGBM and SVM models
lgb_model = joblib.load("lightgbm_model.joblib")
svr_model = joblib.load("svr_model.joblib")

# Generate predictions on the holdout set
y_pred_lgb = lgb_model.predict(X_holdout_scaled)
y_pred_svr = svr_model.predict(X_holdout_scaled)

# Load LSTM predictions from CSV
df_lstm = pd.read_csv("lstm_predictions.csv")
y_true_lstm = df_lstm["actual_rul"].values
y_pred_lstm = df_lstm["lstm_pred"].values

# Align other model predictions to LSTM
seq_len = 30
y_pred_lgb_aligned = y_pred_lgb[seq_len:]
y_pred_svr_aligned = y_pred_svr[seq_len:]

# Create final DataFrame for side-by-side comparison
df_compare = pd.DataFrame({
    "actual_rul": y_true_lstm,
    "lightgbm_pred": y_pred_lgb_aligned,
    "svm_pred": y_pred_svr_aligned,
    "lstm_pred": y_pred_lstm
})

# Save to CSV for visualization
df_compare.to_csv("actual_vs_predicted2.csv", index=False)
print(" Saved actual_vs_predicted.csv for model comparison.")




✅ Saved actual_vs_predicted.csv for model comparison.
