In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import math
import os
from sklearn.preprocessing import LabelEncoder, StandardScaler
from imblearn.over_sampling import RandomOverSampler
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from pathlib import Path


In [4]:
import kagglehub

# Download latest version
#path = kagglehub.dataset_download("behrad3d/nasa-cmaps")
#print("Path to dataset files:", path)

DATA_PATH = Path("datasets/CMaps/")

# FD00X dataset prep

In [5]:
indexes = ['unit_number', 'time_cycles']
settings = ['setting_1', 'setting_2', 'setting_3']
sensors = ['s_{}'.format(i+1) for i in range(0,21)]
COLS = indexes + settings + sensors

In [None]:
def load_fd_dataset(dataset_id):
    """
    Load train/test/RUL files for a single FD dataset (e.g., FD001, FD002, etc.)
    
    :param dataset_id: integer 1..4, e.g. for FD001 use dataset_id=1
    :return: df_train, df_test, df_rul (pandas DataFrames)
    """

    train_file = DATA_PATH / f"train_FD00{dataset_id}.txt"
    test_file  = DATA_PATH / f"test_FD00{dataset_id}.txt"
    rul_file   = DATA_PATH / f"RUL_FD00{dataset_id}.txt"

    df_train = pd.read_csv(
        train_file,
        sep=r"\s+",        
        header=None,
        names=COLS,
        index_col=False
    )

    df_test = pd.read_csv(
        test_file,
        sep=r"\s+",
        header=None,
        names=COLS,
        index_col=False
    )

    df_rul = pd.read_csv(
        rul_file,
        sep=r"\s+",
        header=None,
        names=["RUL"],
        index_col=False
    )
    
    return df_train, df_test, df_rul

def add_train_rul(df_train):
    """
    For the training set, calculate RUL for every row.
    NASAâ€™s train data runs each engine to failure, so:
      RUL = (last cycle for that engine) - (current cycle).
    """
    # Group by unit and get the max cycle of each engine
    max_cycle = df_train.groupby("unit_number")["time_cycles"].transform(max)
    # RUL = distance to max cycle
    df_train["RUL"] = max_cycle - df_train["time_cycles"]
    return df_train

def add_test_rul(df_test, df_rul):
    """
    For the test set, each engine is truncated before failure. 
    NASA gives a single RUL for the *last* row of each engine in df_rul.
    
    Typically, we only need that final row to evaluate or predict RUL. 
    So we can 'merge' that RUL onto the final snapshot of each engine.
    
    If you want row-level RUL for the entire partial test run (less common),
    you need a different approach. Usually, we label only the last row.
    """
    # Identify the final row for each engine in the test set
    # i.e., the row with the maximum 'time_cycles' for that unit_number
    idx = df_test.groupby("unit_number")["time_cycles"].transform(max) == df_test["time_cycles"]
    final_test_rows = df_test[idx].copy().reset_index(drop=True)
    
    # Attach RUL from df_rul, which is one row per engine
    # RUL rows match by index => engine 1 => df_rul.loc[0], engine 2 => df_rul.loc[1], etc.
    # final_test_rows are also in ascending engine order, so we can do direct assignment
    final_test_rows["RUL"] = df_rul["RUL"]
    
    return final_test_rows


In [8]:
datasets = {}  

for i in range(1, 5):
    
    df_train_raw, df_test_raw, df_rul = load_fd_dataset(i)
    df_train = add_train_rul(df_train_raw)
    df_test_final = add_test_rul(df_test_raw, df_rul)
    key = f"FD00{i}"
    datasets[key] = {
        "train":       df_train,   
        "test":        df_test_raw,
        "rul":         df_rul,
        "test_final":  df_test_final,
    }

  max_cycle = df_train.groupby("unit_number")["time_cycles"].transform(max)
  idx = df_test.groupby("unit_number")["time_cycles"].transform(max) == df_test["time_cycles"]
  max_cycle = df_train.groupby("unit_number")["time_cycles"].transform(max)
  idx = df_test.groupby("unit_number")["time_cycles"].transform(max) == df_test["time_cycles"]
  max_cycle = df_train.groupby("unit_number")["time_cycles"].transform(max)
  idx = df_test.groupby("unit_number")["time_cycles"].transform(max) == df_test["time_cycles"]
  max_cycle = df_train.groupby("unit_number")["time_cycles"].transform(max)
  idx = df_test.groupby("unit_number")["time_cycles"].transform(max) == df_test["time_cycles"]
