In [3]:
# This notebook demonstrates the development of an AI-based predictive maintenance system to estimate the Remaining Useful Life (RUL) of industrial machinery.
# We will use the NASA CMAPSS dataset, applying deep learning techniques (specifically, an LSTM model) to predict equipment failure.

In [4]:
!pip install tensorflow

Collecting tensorflow
  Downloading tensorflow-2.20.0-cp313-cp313-win_amd64.whl.metadata (4.6 kB)
Collecting absl-py>=1.0.0 (from tensorflow)
  Using cached absl_py-2.3.1-py3-none-any.whl.metadata (3.3 kB)
Collecting astunparse>=1.6.0 (from tensorflow)
  Using cached astunparse-1.6.3-py2.py3-none-any.whl.metadata (4.4 kB)
Collecting flatbuffers>=24.3.25 (from tensorflow)
  Using cached flatbuffers-25.9.23-py2.py3-none-any.whl.metadata (875 bytes)
Collecting gast!=0.5.0,!=0.5.1,!=0.5.2,>=0.2.1 (from tensorflow)
  Using cached gast-0.6.0-py3-none-any.whl.metadata (1.3 kB)
Collecting google_pasta>=0.1.1 (from tensorflow)
  Using cached google_pasta-0.2.0-py3-none-any.whl.metadata (814 bytes)
Collecting libclang>=13.0.0 (from tensorflow)
  Using cached libclang-18.1.1-py2.py3-none-win_amd64.whl.metadata (5.3 kB)
Collecting opt_einsum>=2.3.2 (from tensorflow)
  Using cached opt_einsum-3.4.0-py3-none-any.whl.metadata (6.3 kB)
Collecting termcolor>=1.1.0 (from tensorflow)
  Using cached termc

In [6]:
import pandas as pd
import numpy as np
import os
import joblib
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import seaborn as sns


In [12]:
# Robust load_data()

def load_data(fd_number, base_path='./CMAPSS'):
    """
    Loads CMAPSS files for fd_number (1..4).

    Parameters
    ----------
    fd_number : int or str
        Sub-dataset suffix; it is interpolated into the file names
        ``train_FD00{fd_number}.txt``, ``test_FD00{fd_number}.txt`` and
        ``RUL_FD00{fd_number}.txt``.
    base_path : str
        Directory that contains the three text files.

    Returns
    -------
    train_df, test_df : pandas.DataFrame
        26 columns: engine_id, cycle, setting1..setting3, sensor1..sensor21.
    rul_df : pandas.DataFrame
        Single column ``rul``; the index runs 1..len(rul_df) so that
        ``rul_df.loc[engine_id, 'rul']`` works as a lookup.

    Raises
    ------
    FileNotFoundError
        If any of the three files is missing.
    ValueError
        If a file does not have the expected number of columns.
    """
    column_names = ['engine_id', 'cycle', 'setting1', 'setting2', 'setting3'] + [f'sensor{i}' for i in range(1, 22)]

    fn_train = os.path.join(base_path, f'train_FD00{fd_number}.txt')
    fn_test  = os.path.join(base_path, f'test_FD00{fd_number}.txt')
    fn_rul   = os.path.join(base_path, f'RUL_FD00{fd_number}.txt')

    # Fail early with a hint that names the directory actually being searched.
    for f in (fn_train, fn_test, fn_rul):
        if not os.path.exists(f):
            raise FileNotFoundError(
                f"{f} not found. Put CMAPSS files into {base_path} "
                "(relative paths are resolved from the notebook's working directory)."
            )

    def _read(path):
        # sep=r'\s+' is the supported replacement for the deprecated
        # delim_whitespace=True. Fully-empty columns are dropped as a safety net
        # without mutating a frame in place.
        frame = pd.read_csv(path, sep=r'\s+', header=None)
        return frame.dropna(axis=1, how='all')

    train_df = _read(fn_train)
    test_df  = _read(fn_test)
    rul_df   = _read(fn_rul)

    # Validate the layout before renaming so a malformed file produces a clear message.
    for path, frame, expected in (
        (fn_train, train_df, len(column_names)),
        (fn_test, test_df, len(column_names)),
        (fn_rul, rul_df, 1),
    ):
        if frame.shape[1] != expected:
            raise ValueError(
                f"{path}: expected {expected} column(s) but found {frame.shape[1]}."
            )

    train_df.columns = column_names
    test_df.columns  = column_names
    rul_df.columns   = ['rul']

    # set rul_df index to engine_id (1-based). This makes lookup easy:
    rul_df.index = np.arange(1, len(rul_df) + 1)

    return train_df, test_df, rul_df




In [13]:
# Cell D: smoke-test the loader on every CMAPSS subset (FD001–FD004).
# For each subset we print the frame shapes and preview a few rows; a missing
# file is reported (with a listing of ./CMAPSS, if present) instead of aborting.
for fd in range(1, 5):
    print("\n==============================")
    print(f" Loading CMAPSS Dataset FD00{fd}")
    print("==============================")
    try:
        train_df, test_df, rul_df = load_data(fd)
    except FileNotFoundError as err:
        print(f" Error for FD00{fd}: {err}")
        if not os.path.exists('./CMAPSS'):
            print("No ./CMAPSS folder found.")
        else:
            print("Files in ./CMAPSS:", sorted(os.listdir('./CMAPSS')))
    else:
        print(f" FD00{fd} loaded successfully!")
        print(f"Train shape: {train_df.shape}")
        print(f"Test  shape: {test_df.shape}")
        print(f"RUL   shape: {rul_df.shape}")

        # Preview the first rows of the training and RUL tables
        print("\nSample training data:")
        display(train_df.head(3))
        print("Sample RUL data:")
        display(rul_df.head())



 Loading CMAPSS Dataset FD001


  train_df = pd.read_csv(fn_train, delim_whitespace=True, header=None)
  test_df  = pd.read_csv(fn_test,  delim_whitespace=True, header=None)


 FD001 loaded successfully!
Train shape: (20631, 26)
Test  shape: (13096, 26)
RUL   shape: (100, 1)

Sample training data:


  rul_df   = pd.read_csv(fn_rul,   delim_whitespace=True, header=None)


Unnamed: 0,engine_id,cycle,setting1,setting2,setting3,sensor1,sensor2,sensor3,sensor4,sensor5,...,sensor12,sensor13,sensor14,sensor15,sensor16,sensor17,sensor18,sensor19,sensor20,sensor21
0,1,1,-0.0007,-0.0004,100.0,518.67,641.82,1589.7,1400.6,14.62,...,521.66,2388.02,8138.62,8.4195,0.03,392,2388,100.0,39.06,23.419
1,1,2,0.0019,-0.0003,100.0,518.67,642.15,1591.82,1403.14,14.62,...,522.28,2388.07,8131.49,8.4318,0.03,392,2388,100.0,39.0,23.4236
2,1,3,-0.0043,0.0003,100.0,518.67,642.35,1587.99,1404.2,14.62,...,522.42,2388.03,8133.23,8.4178,0.03,390,2388,100.0,38.95,23.3442


Sample RUL data:


Unnamed: 0,rul
1,112
2,98
3,69
4,82
5,91


  train_df = pd.read_csv(fn_train, delim_whitespace=True, header=None)



 Loading CMAPSS Dataset FD002
 FD002 loaded successfully!
Train shape: (53759, 26)
Test  shape: (33991, 26)
RUL   shape: (259, 1)

Sample training data:


  test_df  = pd.read_csv(fn_test,  delim_whitespace=True, header=None)
  rul_df   = pd.read_csv(fn_rul,   delim_whitespace=True, header=None)


Unnamed: 0,engine_id,cycle,setting1,setting2,setting3,sensor1,sensor2,sensor3,sensor4,sensor5,...,sensor12,sensor13,sensor14,sensor15,sensor16,sensor17,sensor18,sensor19,sensor20,sensor21
0,1,1,34.9983,0.84,100.0,449.44,555.32,1358.61,1137.23,5.48,...,183.06,2387.72,8048.56,9.3461,0.02,334,2223,100.0,14.73,8.8071
1,1,2,41.9982,0.8408,100.0,445.0,549.9,1353.22,1125.78,3.91,...,130.42,2387.66,8072.3,9.3774,0.02,330,2212,100.0,10.41,6.2665
2,1,3,24.9988,0.6218,60.0,462.54,537.31,1256.76,1047.45,7.05,...,164.22,2028.03,7864.87,10.8941,0.02,309,1915,84.93,14.08,8.6723


Sample RUL data:


Unnamed: 0,rul
1,18
2,79
3,106
4,110
5,15



 Loading CMAPSS Dataset FD003


  train_df = pd.read_csv(fn_train, delim_whitespace=True, header=None)
  test_df  = pd.read_csv(fn_test,  delim_whitespace=True, header=None)
  rul_df   = pd.read_csv(fn_rul,   delim_whitespace=True, header=None)


 FD003 loaded successfully!
Train shape: (24720, 26)
Test  shape: (16596, 26)
RUL   shape: (100, 1)

Sample training data:


Unnamed: 0,engine_id,cycle,setting1,setting2,setting3,sensor1,sensor2,sensor3,sensor4,sensor5,...,sensor12,sensor13,sensor14,sensor15,sensor16,sensor17,sensor18,sensor19,sensor20,sensor21
0,1,1,-0.0005,0.0004,100.0,518.67,642.36,1583.23,1396.84,14.62,...,522.31,2388.01,8145.32,8.4246,0.03,391,2388,100.0,39.11,23.3537
1,1,2,0.0008,-0.0003,100.0,518.67,642.5,1584.69,1396.89,14.62,...,522.42,2388.03,8152.85,8.4403,0.03,392,2388,100.0,38.99,23.4491
2,1,3,-0.0014,-0.0002,100.0,518.67,642.18,1582.35,1405.61,14.62,...,522.03,2388.0,8150.17,8.3901,0.03,391,2388,100.0,38.85,23.3669


Sample RUL data:


Unnamed: 0,rul
1,44
2,51
3,27
4,120
5,101



 Loading CMAPSS Dataset FD004


  train_df = pd.read_csv(fn_train, delim_whitespace=True, header=None)
  test_df  = pd.read_csv(fn_test,  delim_whitespace=True, header=None)


 FD004 loaded successfully!
Train shape: (61249, 26)
Test  shape: (41214, 26)
RUL   shape: (248, 1)

Sample training data:


  rul_df   = pd.read_csv(fn_rul,   delim_whitespace=True, header=None)


Unnamed: 0,engine_id,cycle,setting1,setting2,setting3,sensor1,sensor2,sensor3,sensor4,sensor5,...,sensor12,sensor13,sensor14,sensor15,sensor16,sensor17,sensor18,sensor19,sensor20,sensor21
0,1,1,42.0049,0.84,100.0,445.0,549.68,1343.43,1112.93,3.91,...,129.78,2387.99,8074.83,9.3335,0.02,330,2212,100.0,10.62,6.367
1,1,2,20.002,0.7002,100.0,491.19,606.07,1477.61,1237.5,9.35,...,312.59,2387.73,8046.13,9.1913,0.02,361,2324,100.0,24.37,14.6552
2,1,3,42.0038,0.8409,100.0,445.0,548.95,1343.12,1117.05,3.91,...,129.62,2387.97,8066.62,9.4007,0.02,329,2212,100.0,10.48,6.4213


Sample RUL data:


Unnamed: 0,rul
1,22
2,39
3,107
4,75
5,149


In [14]:
# Preprocessing : Prepare and Combine Data (FD001–FD004)
from sklearn.preprocessing import MinMaxScaler
import numpy as np
import pandas as pd

# Prepare combined data containers
all_train_dfs = []   # one labelled training frame per subset, in FD001..FD004 order
all_test_dfs = {}    # fd_number -> labelled (and, further below, scaled) test frame

RUL_CAP = 125  # cap for training RUL (optional but improves training stability)

for fd_number in range(1, 5):
    print(f"\n🔹 Processing FD00{fd_number} dataset...")
    train_data, test_data, rul_data = load_data(fd_number)

    # --- Calculate RUL for training data ---
    # Training engines run to failure, so RUL = (last cycle of the engine) - (current cycle).
    max_cycles_train = train_data.groupby('engine_id')['cycle'].transform('max')
    train_data['rul'] = max_cycles_train - train_data['cycle']

    # Apply RUL cap (piecewise linear degradation)
    # NOTE(review): only the training target is capped; test targets below stay
    # uncapped, so errors on early-life test cycles are measured against values
    # the model was never trained to output. Confirm this is intended.
    train_data['rul'] = np.minimum(train_data['rul'], RUL_CAP)

    # --- Calculate RUL for test data ---
    # Test engines stop before failure: RUL = RUL at the last recorded cycle (from
    # the RUL file) + (last recorded cycle - current cycle).
    # Vectorised lookups replace the former row-by-row DataFrame.apply.
    unknown = ~test_data['engine_id'].isin(rul_data.index)
    if unknown.any():
        missing_ids = sorted(test_data.loc[unknown, 'engine_id'].unique())
        raise KeyError(f"No RUL entry for test engine_id(s) {missing_ids} in FD00{fd_number}")

    max_cycles_test = test_data.groupby('engine_id')['cycle'].max()
    test_data['rul'] = (
        test_data['engine_id'].map(rul_data['rul'])
        + test_data['engine_id'].map(max_cycles_test)
        - test_data['cycle']
    ).astype('float64')  # the row-wise version produced float64; keep the dtype

    # Store results
    all_train_dfs.append(train_data)
    all_test_dfs[fd_number] = test_data

# --- Combine all training data ---
# NOTE: engine_id restarts at 1 in every subset, so after concatenation the same
# engine_id refers to several different engines. Anything that groups this frame
# by engine_id alone will merge unrelated trajectories.
combined_train_data = pd.concat(all_train_dfs, ignore_index=True)

# --- Normalize features ---
# Every column except the identifiers and the target is scaled (settings + sensors).
cols_to_normalize = combined_train_data.columns.difference(['engine_id', 'cycle', 'rul'])

# Fit on training data only, then reuse the same scaler for the test sets.
scaler = MinMaxScaler()
scaler.fit(combined_train_data[cols_to_normalize])

combined_train_data[cols_to_normalize] = scaler.transform(combined_train_data[cols_to_normalize])

for fd_number, test_df in all_test_dfs.items():
    test_df[cols_to_normalize] = scaler.transform(test_df[cols_to_normalize])

print("\n Preprocessing complete.")
print(f"Combined training data shape: {combined_train_data.shape}")
print("Scaled test datasets ready for FD001–FD004.")



🔹 Processing FD001 dataset...


  train_df = pd.read_csv(fn_train, delim_whitespace=True, header=None)
  test_df  = pd.read_csv(fn_test,  delim_whitespace=True, header=None)
  rul_df   = pd.read_csv(fn_rul,   delim_whitespace=True, header=None)



🔹 Processing FD002 dataset...


  train_df = pd.read_csv(fn_train, delim_whitespace=True, header=None)
  test_df  = pd.read_csv(fn_test,  delim_whitespace=True, header=None)
  rul_df   = pd.read_csv(fn_rul,   delim_whitespace=True, header=None)



🔹 Processing FD003 dataset...


  train_df = pd.read_csv(fn_train, delim_whitespace=True, header=None)
  test_df  = pd.read_csv(fn_test,  delim_whitespace=True, header=None)
  rul_df   = pd.read_csv(fn_rul,   delim_whitespace=True, header=None)



🔹 Processing FD004 dataset...


  train_df = pd.read_csv(fn_train, delim_whitespace=True, header=None)
  test_df  = pd.read_csv(fn_test,  delim_whitespace=True, header=None)
  rul_df   = pd.read_csv(fn_rul,   delim_whitespace=True, header=None)



 Preprocessing complete.
Combined training data shape: (160359, 27)
Scaled test datasets ready for FD001–FD004.


In [None]:
# Feature Engineering & Generate Sequences
import numpy as np

def create_sequences(df, sequence_length, cols_to_use):
    """
    Generates rolling window sequences (samples, sequence_length, features)
    and corresponding RUL targets for each engine trajectory.

    Rows are grouped by ``engine_id`` in order of first appearance. If the frame
    has a ``cycle`` column, a group is split further wherever ``cycle`` fails to
    increase. This matters when several datasets that reuse the same engine ids
    are stacked in one frame: without the split, windows would slide across the
    boundary between two unrelated engines.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``engine_id``, ``rul`` and every column in ``cols_to_use``;
        rows of one trajectory are expected in chronological order.
    sequence_length : int
        Window size (>= 1). Trajectories shorter than this yield no samples.
    cols_to_use : sequence of column labels
        Feature columns copied into each window.

    Returns
    -------
    X : np.ndarray, float32, shape (samples, sequence_length, len(cols_to_use))
    y : np.ndarray, float32, shape (samples,)
        ``rul`` of the last row of each window.

    Raises
    ------
    ValueError
        If ``sequence_length`` is smaller than 1.
    """
    if sequence_length < 1:
        raise ValueError(f"sequence_length must be >= 1, got {sequence_length}")

    sequences = []
    rul_targets = []
    has_cycle = 'cycle' in df.columns

    # sort=False keeps the order of first appearance of each engine_id.
    for _, engine_df in df.groupby('engine_id', sort=False):
        values = engine_df[cols_to_use].to_numpy()
        rul_values = engine_df['rul'].to_numpy()
        n_rows = len(engine_df)

        # A non-increasing cycle counter marks the start of a different trajectory
        # that happens to share this engine_id.
        if has_cycle and n_rows > 1:
            restarts = np.flatnonzero(np.diff(engine_df['cycle'].to_numpy()) <= 0) + 1
            bounds = [0, *restarts.tolist(), n_rows]
        else:
            bounds = [0, n_rows]

        for start, stop in zip(bounds[:-1], bounds[1:]):
            # Only create sequences if the trajectory has enough cycles
            for i in range(start, stop - sequence_length + 1):
                sequences.append(values[i:i + sequence_length])
                rul_targets.append(rul_values[i + sequence_length - 1])

    if not sequences:
        # Keep the documented 3-D layout even when no window could be built.
        empty_x = np.empty((0, sequence_length, len(cols_to_use)), dtype=np.float32)
        return empty_x, np.empty((0,), dtype=np.float32)

    return np.array(sequences, dtype=np.float32), np.array(rul_targets, dtype=np.float32)


# -----------------------
# Generate Sequences
# -----------------------
sequence_length = 50  # number of consecutive cycles fed to the model per sample
print(f"🔹 Creating sequences using window size = {sequence_length}...\n")

# Windows over the combined (FD001–FD004) training frame
X_train, y_train = create_sequences(combined_train_data, sequence_length, cols_to_normalize)
print(f" Combined training sequences created: {len(X_train)} samples")
print(f"Shape of X_train: {X_train.shape}")
print(f"Shape of y_train: {y_train.shape}\n")

# Windows for each test subset, keyed by FD number
X_test_dict, y_test_dict = {}, {}

for fd_number, test_df in all_test_dfs.items():
    X_test, y_test = create_sequences(test_df, sequence_length, cols_to_normalize)
    X_test_dict[fd_number], y_test_dict[fd_number] = X_test, y_test
    print(f"FD00{fd_number} → X_test: {X_test.shape}, y_test: {y_test.shape}")

print("\n Sequence generation complete for all datasets.")
