# **Preprocessing**

In [None]:
# preprocess_rul_data.py
import pandas as pd
import numpy as np

# Step 1: Load and label columns
# The raw file is whitespace-delimited with no header row. The separator is a
# regular expression, so it is written as a raw string: in a normal string
# literal "\s" is an invalid escape sequence that Python warns about.
df = pd.read_csv("train_FD001.txt", sep=r"\s+", header=None)
# 26 names in total: unit id, cycle counter, 3 operational settings, 21 sensors.
cols = ['unit_number', 'time_in_cycles'] + \
       [f'operational_setting_{i}' for i in range(1, 4)] + \
       [f'sensor_measurement_{i}' for i in range(1, 22)]
df.columns = cols

# Step 2: RUL calculation
# RUL for a row = last recorded cycle of its unit minus the row's own cycle.
rul = (
    df.groupby('unit_number', as_index=False)['time_in_cycles']
      .max()
      .rename(columns={'time_in_cycles': 'max_cycle'})
)
last_cycle_by_unit = rul.set_index('unit_number')['max_cycle']
df['RUL'] = df['unit_number'].map(last_cycle_by_unit) - df['time_in_cycles']

# Step 3: Drop irrelevant sensors
# Seven sensor channels are removed before any feature engineering happens.
drop_sensors = [f'sensor_measurement_{idx}' for idx in (1, 5, 6, 10, 16, 18, 19)]
df = df.drop(columns=drop_sensors)

# Step 4: Feature engineering
# For each remaining sensor column, computed separately per unit:
#   *_roll_mean / *_roll_std : 5-row rolling mean / std (std of one row -> 0)
#   *_delta                  : difference to the previous row (first row -> 0)
#   *_ema                    : exponential moving average with span 5
# The column list is fixed up front so the derived columns are not revisited.
sensor_cols = [name for name in df.columns if 'sensor_measurement' in name]
for col in sensor_cols:
    per_unit = df.groupby("unit_number")[col]
    window = per_unit.rolling(window=5, min_periods=1)

    # Grouped rolling/ewm results carry a (unit, row) MultiIndex; dropping the
    # unit level restores the original row index so assignment aligns.
    roll_mean = window.mean().reset_index(level=0, drop=True)
    roll_std = window.std().reset_index(level=0, drop=True).fillna(0)
    delta = per_unit.diff().fillna(0)
    ema = per_unit.ewm(span=5, adjust=False).mean().reset_index(level=0, drop=True)

    df[f'{col}_roll_mean'] = roll_mean
    df[f'{col}_roll_std'] = roll_std
    df[f'{col}_delta'] = delta
    df[f'{col}_ema'] = ema

# Step 5: Normalized cycle
# Fraction of the unit's recorded life that has elapsed at each row.
# NOTE(review): the divisor is the unit's last recorded cycle, the same
# quantity RUL is derived from; confirm this column should be a model feature.
unit_last_cycle = df.groupby('unit_number')['time_in_cycles'].transform('max')
df['cycle_norm'] = df['time_in_cycles'] / unit_last_cycle

# Step 6: Min-max normalization (per unit)
# Every column except the identifiers and the target is rescaled using the
# min and max observed within its own unit. The 1e-8 term keeps the
# denominator non-zero when a column is constant inside a unit.
excluded = ('unit_number', 'time_in_cycles', 'RUL')
features_to_normalize = [name for name in df.columns if name not in excluded]
for col in features_to_normalize:
    per_unit = df.groupby('unit_number')[col]
    low = per_unit.transform('min')
    high = per_unit.transform('max')
    df[col] = (df[col] - low) / (high - low + 1e-8)

# Save
df = df.reset_index(drop=True)
df.to_csv("preprocessed_rul_data.csv", index=False)
print("Preprocessing complete. Saved as 'preprocessed_rul_data.csv'")


Preprocessing complete. Saved as 'preprocessed_rul_data.csv'


# **LightGBM**

In [None]:
# train_lightgbm.py
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error, r2_score
import joblib

# Load
# Identifiers and the target are removed from the feature matrix.
df = pd.read_csv("preprocessed_rul_data.csv")
non_feature_cols = ["unit_number", "time_in_cycles", "RUL"]
X = df.drop(columns=non_feature_cols)
y = df["RUL"]

# Split
# 30% of the rows become the holdout set; 20% of the remainder is validation.
# NOTE(review): the split is over individual rows, not units, so cycles of one
# unit can land in several partitions; confirm this is intended.
X_train_val, X_holdout, y_train_val, y_holdout = train_test_split(
    X, y, test_size=0.3, random_state=42
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, test_size=0.2, random_state=42
)

# Scale
# The scaler is fitted on the training rows only and then applied to all three.
scaler = MinMaxScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_val_scaled, X_holdout_scaled = (
    scaler.transform(part) for part in (X_val, X_holdout)
)

# Persist the scaler and the scaled splits for the other model cells.
joblib.dump(scaler, "scaler.joblib")
joblib.dump(
    (X_train_scaled, X_val_scaled, X_holdout_scaled, y_train, y_val, y_holdout),
    "scaled_data.joblib",
)

# Train
# Hyperparameters are collected in one mapping and unpacked into the estimator.
lgbm_params = {
    "n_estimators": 300,
    "learning_rate": 0.05,
    "max_depth": 7,
    "num_leaves": 40,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "reg_lambda": 1.0,
    "random_state": 42,
}
model = LGBMRegressor(**lgbm_params)
model.fit(X_train_scaled, y_train)

# Evaluate
def evaluate(name, y_true, y_pred):
    """Print MAE, RMSE and R² for one set of predictions.

    Args:
        name: Label printed in the heading (e.g. "Validation").
        y_true: Ground-truth target values.
        y_pred: Predicted values, same length as ``y_true``.
    """
    # Imported here because this cell's import list does not include numpy;
    # without it the function only works if an earlier cell defined `np`.
    import numpy as np

    print(f"\n{name} Evaluation:")
    mae = mean_absolute_error(y_true, y_pred)
    # Square root of the MSE, so it works with every sklearn version.
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    r2 = r2_score(y_true, y_pred)

    print("MAE :", mae)
    print("RMSE:", rmse)
    print("R²  :", r2)

# Report metrics on the validation split first, then on the holdout split.
for split_name, split_X, split_y in (
    ("Validation", X_val_scaled, y_val),
    ("Holdout", X_holdout_scaled, y_holdout),
):
    evaluate(split_name, split_y, model.predict(split_X))
# Persist the fitted model for the comparison cells.
joblib.dump(model, "lightgbm_model.joblib")




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010616 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 17958
[LightGBM] [Info] Number of data points in the train set: 11552, number of used features: 73
[LightGBM] [Info] Start training from score 107.365738

Validation Evaluation:
MAE : 14.58639311208911
RMSE: 21.080658378678788
R²  : 0.9092676720293423





Holdout Evaluation:
MAE : 14.391989940150955
RMSE: 20.63226347497278
R²  : 0.9070621253876804


['lightgbm_model.joblib']

# **SVM**

In [None]:
import joblib
from sklearn.svm import SVR
from sklearn.utils import resample
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Load scaled data
X_train_scaled, X_val_scaled, X_holdout_scaled, y_train, y_val, y_holdout = joblib.load("scaled_data.joblib")

# Downsample
# Fit the SVR on a subset of at most 5000 distinct training rows.
# replace=False is required for a true subsample: resample() draws with
# replacement by default, which would put duplicate rows in the subset.
# Without replacement n_samples cannot exceed the number of available rows,
# hence the cap.
n_svr_samples = min(5000, len(X_train_scaled))
X_svr_train, y_svr_train = resample(
    X_train_scaled, y_train,
    replace=False, n_samples=n_svr_samples, random_state=42,
)

# Train SVR
svr_model = SVR(kernel='rbf', C=10, epsilon=0.5, gamma='scale')
svr_model.fit(X_svr_train, y_svr_train)

# Save the trained SVR model
joblib.dump(svr_model, "svr_model.joblib")
print("Saved svr_model.joblib")

# Evaluation function
def evaluate(name, X, y):
    """Predict with the module-level ``svr_model`` and print MAE, RMSE and R².

    Args:
        name: Label printed in the heading.
        X: Feature matrix passed to ``svr_model.predict``.
        y: Ground-truth targets for the rows of ``X``.
    """
    predictions = svr_model.predict(X)
    report = (
        ("MAE :", mean_absolute_error(y, predictions)),
        ("RMSE:", np.sqrt(mean_squared_error(y, predictions))),
        ("R²  :", r2_score(y, predictions)),
    )

    print(f"\n{name} Evaluation:")
    for label, value in report:
        print(label, value)

# Run evaluations
# Validation is reported first, then holdout.
for split_name, split_X, split_y in (
    ("Validation", X_val_scaled, y_val),
    ("Holdout", X_holdout_scaled, y_holdout),
):
    evaluate(split_name, split_X, split_y)


Saved svr_model.joblib

Validation Evaluation:
MAE : 17.714910497902352
RMSE: 27.635183880667245
R²  : 0.8440740385854216

Holdout Evaluation:
MAE : 17.40973210222984
RMSE: 26.362914023201732
R²  : 0.8482649847198593


# **LSTM**

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np
import pandas as pd
import joblib

# Load scaled data
# The file holds a 6-tuple: three scaled feature arrays followed by the three
# matching target series (train, validation, holdout).
(
    X_train_scaled,
    X_val_scaled,
    X_holdout_scaled,
    y_train,
    y_val,
    y_holdout,
) = joblib.load("scaled_data.joblib")

# LSTM Dataset
class RULDataset(Dataset):
    """Sliding-window dataset: ``seq_len`` consecutive rows -> target of the next row.

    Sample ``k`` pairs rows ``k .. k + seq_len - 1`` of ``X`` with
    ``y[k + seq_len]``, so the dataset holds ``len(X) - seq_len`` samples
    (zero when the input is not longer than one window).
    """

    def __init__(self, X, y, seq_len=30):
        """Build all windows eagerly and keep them as float32 tensors.

        Args:
            X: Row-ordered feature array; the first axis is the row axis.
            y: Targets, one per row of ``X`` (array-like or pandas Series).
            seq_len: Number of rows in each input window.
        """
        self.X, self.y = self.create_sequences(X, y, seq_len)

    def create_sequences(self, X, y, seq_len):
        """Return ``(X_seq, y_seq)`` tensors for the windows described on the class.

        Raises:
            ValueError: If ``y`` has fewer entries than ``X`` has rows.
        """
        # np.asarray makes target lookup positional; indexing a pandas Series
        # with y[i] would be label-based and could pick the wrong rows.
        X = np.asarray(X, dtype=np.float32)
        y = np.asarray(y, dtype=np.float32)
        if len(y) < len(X):
            raise ValueError(
                f"y has {len(y)} entries but X has {len(X)} rows"
            )

        n_windows = max(len(X) - seq_len, 0)
        if n_windows == 0:
            # Keep the (samples, seq_len, features...) layout when empty so
            # downstream shape handling does not need a special case.
            X_seq = np.empty((0, seq_len) + X.shape[1:], dtype=np.float32)
        else:
            X_seq = np.stack([X[i - seq_len:i] for i in range(seq_len, len(X))])
        y_seq = y[seq_len:seq_len + n_windows]
        return torch.tensor(X_seq, dtype=torch.float32), torch.tensor(y_seq, dtype=torch.float32)

    def __len__(self): return len(self.y)
    def __getitem__(self, idx): return self.X[idx], self.y[idx]

# LSTM Model
class LSTMModel(nn.Module):
    """Stacked LSTM followed by a linear head that emits one value per sequence."""

    def __init__(self, input_size, hidden_size=100, num_layers=2, dropout=0.3):
        """
        Args:
            input_size: Number of features per time step.
            hidden_size: Width of each LSTM layer.
            num_layers: Number of stacked LSTM layers.
            dropout: Dropout passed to ``nn.LSTM``.
        """
        super().__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers,
                            batch_first=True, dropout=dropout)
        self.fc = nn.Linear(hidden_size, 1)

    def forward(self, x):
        """Map a ``(batch, seq, input_size)`` tensor to a ``(batch,)`` tensor."""
        out, _ = self.lstm(x)
        # Only the last time step feeds the head. squeeze(-1) removes just the
        # trailing size-1 axis; a bare squeeze() would also drop the batch axis
        # for a single-sample batch and return a 0-d tensor.
        return self.fc(out[:, -1, :]).squeeze(-1)

#  Parameters
seq_len, batch_size = 30, 64
epochs, learning_rate = 50, 0.001
input_size = X_train_scaled.shape[1]  # features per row
clip = 1.0  # max gradient norm used during training

# Prepare Datasets
# NOTE(review): windows are taken over the stored row order of each split;
# confirm consecutive rows are consecutive cycles of a single unit.
train_data, val_data, holdout_data = (
    RULDataset(split_X, split_y.values, seq_len)
    for split_X, split_y in (
        (X_train_scaled, y_train),
        (X_val_scaled, y_val),
        (X_holdout_scaled, y_holdout),
    )
)

# Only the training loader shuffles; the evaluation loaders keep dataset order.
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
val_loader, holdout_loader = (
    DataLoader(split_data, batch_size=batch_size)
    for split_data in (val_data, holdout_data)
)

# Build Model
model = LSTMModel(input_size=input_size)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Train LSTM
# One pass over train_loader per epoch; prints the loss accumulated in that epoch.
for epoch in range(epochs):
    model.train()  # training mode
    # Running sum of the per-batch loss values; it is not divided by the
    # number of batches, so the printed figure scales with the dataset size.
    total_loss = 0
    for X_batch, y_batch in train_loader:
        optimizer.zero_grad()  # clear gradients left from the previous step
        y_pred = model(X_batch)
        loss = criterion(y_pred, y_batch)
        loss.backward()
        # Rescale gradients so their overall norm does not exceed `clip`.
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=clip)
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1}/{epochs} - Loss: {total_loss:.4f}")

# Evaluation Function
def evaluate_lstm(name, loader, true_targets):
    """Predict over ``loader`` with the module-level ``model`` and print metrics.

    Args:
        name: Label printed in the heading.
        loader: Iterable of ``(X_batch, y_batch)`` pairs in dataset order;
            only the inputs are used.
        true_targets: Full target array for the split. Its first ``seq_len``
            entries have no prediction and are skipped.

    Returns:
        Tuple ``(y_true_aligned, preds)``: the targets from index ``seq_len``
        onward and the predictions as a NumPy array.
    """
    model.eval()
    preds = []
    with torch.no_grad():
        for X_batch, _ in loader:
            # atleast_1d: the prediction for a single-sample batch can arrive
            # as a 0-d array, which list.extend cannot iterate.
            preds.extend(np.atleast_1d(model(X_batch).numpy()))

    # Align predictions to true targets
    y_true_aligned = true_targets[seq_len:]
    preds = np.array(preds)

    mae = mean_absolute_error(y_true_aligned, preds)
    rmse = np.sqrt(mean_squared_error(y_true_aligned, preds))
    r2 = r2_score(y_true_aligned, preds)

    print(f"\n {name} Evaluation:")
    print("MAE :", mae)
    print("RMSE:", rmse)
    print("R²  :", r2)

    return y_true_aligned, preds

# Run Evaluation and Save CSV
y_true_lstm, y_pred_lstm = evaluate_lstm(
    "LSTM - Holdout", holdout_loader, y_holdout.values
)

# Save predictions for Power BI
# Two columns, one row per holdout window: the true RUL and the LSTM estimate.
df_lstm = pd.DataFrame(dict(actual_rul=y_true_lstm, lstm_pred=y_pred_lstm))
df_lstm.to_csv("lstm_predictions.csv", index=False)
print(" Saved lstm_predictions.csv")


Epoch 1/50 - Loss: 2474248.2617
Epoch 2/50 - Loss: 1901031.3096
Epoch 3/50 - Loss: 1457949.1943
Epoch 4/50 - Loss: 1142920.3247
Epoch 5/50 - Loss: 951939.3899
Epoch 6/50 - Loss: 877941.7024
Epoch 7/50 - Loss: 864120.3459
Epoch 8/50 - Loss: 880158.2400
Epoch 9/50 - Loss: 867580.3274
Epoch 10/50 - Loss: 867099.2705
Epoch 11/50 - Loss: 869063.9634
Epoch 12/50 - Loss: 862924.4471
Epoch 13/50 - Loss: 869156.1682
Epoch 14/50 - Loss: 869696.7908
Epoch 15/50 - Loss: 862957.2123
Epoch 16/50 - Loss: 863012.6577
Epoch 17/50 - Loss: 862372.5296
Epoch 18/50 - Loss: 863473.9355
Epoch 19/50 - Loss: 864039.7098
Epoch 20/50 - Loss: 868217.1006
Epoch 21/50 - Loss: 864309.9468
Epoch 22/50 - Loss: 866898.4612
Epoch 23/50 - Loss: 877279.9971
Epoch 24/50 - Loss: 889617.5510
Epoch 25/50 - Loss: 880243.4407
Epoch 26/50 - Loss: 864098.2778
Epoch 27/50 - Loss: 862432.8520
Epoch 28/50 - Loss: 868503.6812
Epoch 29/50 - Loss: 867716.1101
Epoch 30/50 - Loss: 882332.6167
Epoch 31/50 - Loss: 865613.6228
Epoch 32/50 -

# **Model Comparison**

In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib

# Load the scaled splits written by the LightGBM cell
(
    X_train_scaled,
    X_val_scaled,
    X_holdout_scaled,
    y_train,
    y_val,
    y_holdout,
) = joblib.load("scaled_data.joblib")

# Load the fitted LightGBM and SVR models and predict on the holdout features
lgb_model, svr_model = (
    joblib.load(model_path)
    for model_path in ("lightgbm_model.joblib", "svr_model.joblib")
)
y_pred_lgb, y_pred_svr = (
    fitted.predict(X_holdout_scaled) for fitted in (lgb_model, svr_model)
)

# Load LSTM predictions together with the targets they were scored against
df_lstm = pd.read_csv("lstm_predictions.csv")
y_true_lstm = df_lstm["actual_rul"].to_numpy()
y_pred_lstm = df_lstm["lstm_pred"].to_numpy()

# Create Metrics Table
def get_metrics(y_true, y_pred):
    """Return MAE, RMSE and R² for one model as a dict keyed mae / rmse / r2."""
    mae = mean_absolute_error(y_true, y_pred)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    r2 = r2_score(y_true, y_pred)
    return dict(mae=mae, rmse=rmse, r2=r2)

# Collect metrics
# One row per model. The LSTM row uses the targets stored with its own
# predictions rather than y_holdout.
scored = (
    ("LightGBM", y_holdout, y_pred_lgb),
    ("SVM", y_holdout, y_pred_svr),
    ("LSTM", y_true_lstm, y_pred_lstm),
)
metrics = [
    {"model": label, **get_metrics(truth, pred)}
    for label, truth, pred in scored
]

df_compare = pd.DataFrame(metrics)
df_compare.to_csv("model_comparison.csv", index=False)
print(" Saved model_comparison.csv")




 Saved model_comparison.csv


# **Actual vs. Predicted**



In [None]:
import pandas as pd
import numpy as np
import joblib

# Load holdout features and true values
_, _, X_holdout_scaled, _, _, y_holdout = joblib.load("scaled_data.joblib")

# Load the fitted models and predict on the full holdout set
lgb_model = joblib.load("lightgbm_model.joblib")
svr_model = joblib.load("svr_model.joblib")
y_pred_lgb = lgb_model.predict(X_holdout_scaled)
y_pred_svr = svr_model.predict(X_holdout_scaled)

# Load LSTM predictions (already aligned with their own targets)
df_lstm = pd.read_csv("lstm_predictions.csv")
y_true_lstm = df_lstm["actual_rul"].values
y_pred_lstm = df_lstm["lstm_pred"].values

# The LSTM has no prediction for the leading rows consumed by its first input
# window. The number of such rows is derived from the lengths instead of being
# hard-coded, so this cell stays correct if the window length changes.
offset = len(y_holdout) - len(y_true_lstm)
if offset < 0:
    raise ValueError(
        "lstm_predictions.csv has more rows than the holdout set; "
        "regenerate it from the current scaled_data.joblib"
    )

# Align all predictions by trimming the rows the LSTM could not score
df_compare = pd.DataFrame({
    "actual_rul": y_true_lstm,
    "lightgbm_pred": y_pred_lgb[offset:],
    "svm_pred": y_pred_svr[offset:],
    "lstm_pred": y_pred_lstm
})

df_compare.to_csv("actual_vs_predicted.csv", index=False)
print(" Saved actual_vs_predicted.csv")




 Saved actual_vs_predicted.csv
