In [10]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import math
import os
from sklearn.preprocessing import LabelEncoder, StandardScaler
from imblearn.over_sampling import RandomOverSampler
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from pathlib import Path


In [11]:
import kagglehub

# Optional download of the dataset through kagglehub (currently disabled).
# Uncomment the two lines below to fetch the latest version and print the
# location reported by kagglehub.
#path = kagglehub.dataset_download("behrad3d/nasa-cmaps")
#print("Path to dataset files:", path)

# Directory (relative to the notebook's working directory) that
# load_fd_dataset() reads the train_/test_/RUL_ text files from.
DATA_PATH = Path("datasets/CMaps/")

# FD00X dataset prep

In [12]:
# Column names for the header-less FD00X text files, in file order:
# two identifier columns, three operating settings, then sensors s_1..s_21.
indexes = ["unit_number", "time_cycles"]
settings = ["setting_1", "setting_2", "setting_3"]
sensors = [f"s_{n}" for n in range(1, 22)]
COLS = [*indexes, *settings, *sensors]

In [13]:
def load_fd_dataset(dataset_id):
    """
    Read the three text files that make up one FD00X subset.

    The files are looked up under DATA_PATH as train_FD00<id>.txt,
    test_FD00<id>.txt and RUL_FD00<id>.txt and parsed as header-less,
    whitespace-delimited tables.

    :param dataset_id: value substituted into the file names
                       (e.g. 1 for FD001, 4 for FD004)
    :return: (df_train, df_test, df_rul) as pandas DataFrames; train and test
             use the COLS column names, df_rul has a single "RUL" column
    """
    file_suffix = f"FD00{dataset_id}.txt"

    # Parsing options shared by all three files.
    read_opts = dict(sep=r"\s+", header=None, index_col=False)

    df_train = pd.read_csv(DATA_PATH / f"train_{file_suffix}", names=COLS, **read_opts)
    df_test = pd.read_csv(DATA_PATH / f"test_{file_suffix}", names=COLS, **read_opts)
    df_rul = pd.read_csv(DATA_PATH / f"RUL_{file_suffix}", names=["RUL"], **read_opts)

    return df_train, df_test, df_rul

def add_train_rul(df_train):
    """
    Return the training data with a per-row RUL (remaining useful life) column.

    NASA's train data runs each engine to failure, so for every row:
      RUL = (last cycle for that engine) - (current cycle).

    The input frame is NOT modified: a new DataFrame carrying the extra "RUL"
    column is returned, so the caller's raw frame stays raw and re-running the
    cell cannot leave stale state behind.

    :param df_train: DataFrame with "unit_number" and "time_cycles" columns
    :return: new DataFrame equal to df_train plus an added (or replaced)
             "RUL" column
    """
    # Broadcast each engine's final cycle back onto all of that engine's rows.
    max_cycle = df_train.groupby("unit_number")["time_cycles"].transform("max")
    # assign() builds a new frame instead of writing into the caller's object.
    return df_train.assign(RUL=max_cycle - df_train["time_cycles"])

def add_test_rul(df_test, df_rul):
    """
    Build the labelled evaluation frame for a test set: one row per engine
    (its last recorded cycle) with the ground-truth RUL attached.

    For the test set, each engine is truncated before failure and df_rul gives
    a single RUL for the *last* row of each engine, listed in ascending engine
    order. Only that final snapshot is labelled here; row-level RUL for the
    entire partial test run (less common) needs a different approach.

    :param df_test: DataFrame with "unit_number" and "time_cycles" columns;
                    it is not modified
    :param df_rul: DataFrame with a "RUL" column holding one row per engine,
                   ordered by ascending unit_number
    :return: new DataFrame with a fresh 0..N-1 index, sorted by unit_number,
             containing the last-cycle row of every engine plus a "RUL" column
    :raises ValueError: if df_rul does not have exactly one row per engine
                        found in df_test
    """
    # Sorting puts engines in ascending order regardless of how df_test is
    # ordered, and makes the last row inside each engine its maximum cycle
    # (NaN cycles are pushed to the front so they are never picked).
    # keep="last" then yields exactly one row per engine, even if the maximum
    # cycle happens to be duplicated.
    final_test_rows = (
        df_test
        .sort_values(["unit_number", "time_cycles"], na_position="first")
        .drop_duplicates(subset="unit_number", keep="last")
        .reset_index(drop=True)
    )

    # A count mismatch would otherwise surface as silent NaN / shifted labels.
    if len(final_test_rows) != len(df_rul):
        raise ValueError(
            "df_rul has {} rows but df_test contains {} engines; "
            "expected exactly one RUL value per engine".format(
                len(df_rul), len(final_test_rows)
            )
        )

    # Assign by position (k-th RUL value -> k-th engine) rather than by index
    # label, so a df_rul whose index is not 0..N-1 is still attached correctly.
    final_test_rows["RUL"] = df_rul["RUL"].to_numpy()

    return final_test_rows


In [14]:
# Load every FD00X subset once and keep its raw and derived frames together,
# keyed by subset name ("FD001" .. "FD004").
datasets = {}

for i in range(1, 5):
    df_train_raw, df_test_raw, df_rul = load_fd_dataset(i)

    # Row-level RUL for training; last-cycle rows with ground-truth RUL for test.
    df_train = add_train_rul(df_train_raw)
    df_test_final = add_test_rul(df_test_raw, df_rul)

    key = f"FD00{i}"
    datasets[key] = dict(
        train=df_train,
        test=df_test_raw,
        rul=df_rul,
        test_final=df_test_final,
    )

In [15]:
# Sanity check: print the shape of every frame stored for each subset.
# Each entry is (printed label, key into the per-subset dict, trailing note).
shape_report = (
    ("  train shape:", "train", ("(includes computed RUL)",)),
    ("  test shape: ", "test", ()),
    ("  rul shape:  ", "rul", ("(one row per engine in test)",)),
    ("  final test shape (with RUL):", "test_final", ()),
)

for ds_name, ds_dict in datasets.items():
    print(ds_name)
    for line_label, frame_key, line_note in shape_report:
        print(line_label, ds_dict[frame_key].shape, *line_note)
    print()

FD001
  train shape: (20631, 27) (includes computed RUL)
  test shape:  (13096, 26)
  rul shape:   (100, 1) (one row per engine in test)
  final test shape (with RUL): (100, 27)

FD002
  train shape: (53759, 27) (includes computed RUL)
  test shape:  (33991, 26)
  rul shape:   (259, 1) (one row per engine in test)
  final test shape (with RUL): (259, 27)

FD003
  train shape: (24720, 27) (includes computed RUL)
  test shape:  (16596, 26)
  rul shape:   (100, 1) (one row per engine in test)
  final test shape (with RUL): (100, 27)

FD004
  train shape: (61249, 27) (includes computed RUL)
  test shape:  (41214, 26)
  rul shape:   (248, 1) (one row per engine in test)
  final test shape (with RUL): (248, 27)



In [23]:
# Preview the first rows of the FD004 training frame stored in `datasets`.
datasets["FD004"]["train"].head()

Unnamed: 0,unit_number,time_cycles,setting_1,setting_2,setting_3,s_1,s_2,s_3,s_4,s_5,...,s_13,s_14,s_15,s_16,s_17,s_18,s_19,s_20,s_21,RUL
0,1,1,42.0049,0.84,100.0,445.0,549.68,1343.43,1112.93,3.91,...,2387.99,8074.83,9.3335,0.02,330,2212,100.0,10.62,6.367,320
1,1,2,20.002,0.7002,100.0,491.19,606.07,1477.61,1237.5,9.35,...,2387.73,8046.13,9.1913,0.02,361,2324,100.0,24.37,14.6552,319
2,1,3,42.0038,0.8409,100.0,445.0,548.95,1343.12,1117.05,3.91,...,2387.97,8066.62,9.4007,0.02,329,2212,100.0,10.48,6.4213,318
3,1,4,42.0,0.84,100.0,445.0,548.7,1341.24,1118.03,3.91,...,2388.02,8076.05,9.3369,0.02,328,2212,100.0,10.54,6.4176,317
4,1,5,25.0063,0.6207,60.0,462.54,536.1,1255.23,1033.59,7.05,...,2028.08,7865.8,10.8366,0.02,305,1915,84.93,14.03,8.6754,316
