In [1]:
# ==========================================
# PrognosAI – CMAPSS Preprocessing for All Datasets
# ==========================================

import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import os

In [2]:
# Folder containing all CMAPSS train files
dataset_folder = r"C:\Users\Nithin G J\Desktop\PragnosAI\archive (1)\CMaps"


In [3]:
# List of datasets
datasets = ['FD001', 'FD002', 'FD003', 'FD004']


In [23]:
# Rolling window size for sequences
WINDOW_SIZE = 30


In [5]:
# Columns for sensors and operational settings
sensor_cols = [f'sensor_{i}' for i in range(1, 22)]
op_cols = ['operational_setting_1', 'operational_setting_2', 'operational_setting_3']

In [6]:
# Function to create sequences
def create_sequences(df, sensor_cols, window_size):
    sequences, rul = [], []
    for unit in df['unit_number'].unique():
        unit_data = df[df['unit_number']==unit].sort_values('time_in_cycles')
        sensor_data = unit_data[sensor_cols].values
        rul_data = unit_data['RUL'].values
        for i in range(len(sensor_data)-window_size+1):
            sequences.append(sensor_data[i:i+window_size])
            rul.append(rul_data[i+window_size-1])
    return np.array(sequences), np.array(rul)

In [9]:
# Loop through all datasets
for dataset_name in datasets:
    print(f"\nProcessing {dataset_name}...")

    # Load data
    file_path = os.path.join(dataset_folder, f'train_{dataset_name}.txt')

    columns = ['unit_number', 'time_in_cycles',
               'operational_setting_1', 'operational_setting_2', 'operational_setting_3'] + sensor_cols

    data = pd.read_csv(file_path, sep=' ', header=None, names=columns, engine='python')
    data = data.drop(columns=[col for col in data.columns if 'Unnamed' in col])


Processing FD001...

Processing FD002...

Processing FD003...

Processing FD004...


In [11]:
# Compute RUL
max_cycle = data.groupby('unit_number')['time_in_cycles'].max().reset_index()
max_cycle.columns = ['unit_number', 'max_cycle']
data = data.merge(max_cycle, on='unit_number', how='left')
data['RUL'] = data['max_cycle'] - data['time_in_cycles']
data = data.drop(columns=['max_cycle'])

In [13]:
# Normalize sensor data
if 'operational_setting_1' in data.columns:
    # Create condition ID for multiple operating conditions
    condition_cols = [col for col in data.columns if 'operational_setting' in col]
    data['condition_id'] = data[condition_cols].astype(str).agg('-'.join, axis=1)
    scaler = MinMaxScaler()
    for cond in data['condition_id'].unique():
         cond_data = data[data['condition_id'] == cond]
         data.loc[data['condition_id'] == cond, sensor_cols] = scaler.fit_transform(cond_data[sensor_cols])
else:
    scaler = MinMaxScaler()
    data[sensor_cols] = scaler.fit_transform(data[sensor_cols])

  return xp.asarray(numpy.nanmin(X, axis=axis))
  return xp.asarray(numpy.nanmax(X, axis=axis))
 0.42857143 0.57142857 0.42857143 0.42857143 0.57142857 0.42857143
 0.14285714 0.42857143 0.28571429 0.28571429 0.42857143 0.14285714
 0.42857143 0.28571429 0.28571429 0.42857143 1.         0.85714286
 0.42857143 0.42857143 0.57142857 0.71428571 0.57142857 0.28571429
 0.42857143 0.42857143 0.14285714 0.57142857 0.28571429 0.42857143
 0.28571429 0.42857143 0.14285714 0.85714286 0.42857143 0.42857143
 0.28571429 0.28571429 0.42857143 0.71428571 0.14285714 0.85714286
 0.         0.42857143 0.57142857 0.28571429 0.42857143 0.
 0.71428571 0.14285714 0.14285714 0.42857143 0.28571429 0.28571429
 0.14285714 0.28571429 0.57142857 0.42857143 0.71428571 0.57142857
 0.57142857 0.71428571 0.42857143 0.71428571 0.57142857 0.14285714
 0.57142857 0.28571429 0.42857143 0.71428571 0.42857143 0.42857143
 0.14285714 0.42857143 0.28571429 0.42857143 0.28571429 0.14285714
 0.14285714 0.14285714 0.28571429 0.28571

In [15]:
# Create sequences
X, y = create_sequences(data, sensor_cols, WINDOW_SIZE)
print(f"{dataset_name} sequences shape: {X.shape}, RUL shape: {y.shape}")

FD004 sequences shape: (45705, 30, 21), RUL shape: (45705,)


In [17]:
# Save preprocessed data
np.save(os.path.join(dataset_folder, f'X_{dataset_name}.npy'), X)
np.save(os.path.join(dataset_folder, f'y_{dataset_name}.npy'), y)
print(f"{dataset_name} preprocessing completed and saved.")

FD004 preprocessing completed and saved.


In [18]:
# ==========================================
# PrognosAI – CMAPSS Preprocessing Script
# Generates: preprocessed .npy sequences + train CSV
# ==========================================

import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import os

# Folder containing CMAPSS files
dataset_folder = r"C:\Users\Nithin G J\Desktop\PragnosAI\archive (1)\CMaps"

# Datasets to process
datasets = ['FD002', 'FD003', 'FD004']

# Rolling window size for sequences
WINDOW_SIZE = 30

# Sensor & operational columns
sensor_cols = [f'sensor_{i}' for i in range(1, 22)]
op_cols = ['operational_setting_1', 'operational_setting_2', 'operational_setting_3']

# Function to create sequences
def create_sequences(df, sensor_cols, window_size):
    sequences, rul = [], []
    for unit in df['unit_number'].unique():
        unit_data = df[df['unit_number']==unit].sort_values('time_in_cycles')
        sensor_data = unit_data[sensor_cols].values
        rul_data = unit_data['RUL'].values
        for i in range(len(sensor_data)-window_size+1):
            sequences.append(sensor_data[i:i+window_size])
            rul.append(rul_data[i+window_size-1])
    return np.array(sequences), np.array(rul)

# Process each dataset
for dataset_name in datasets:
    print(f"\nProcessing {dataset_name}...")
    
    # Load dataset
    file_path = os.path.join(dataset_folder, f'train_{dataset_name}.txt')
    columns = ['unit_number', 'time_in_cycles', 
               'operational_setting_1', 'operational_setting_2', 'operational_setting_3'] + sensor_cols
    data = pd.read_csv(file_path, sep=' ', header=None, names=columns, engine='python')
    data = data.drop(columns=[col for col in data.columns if 'Unnamed' in col])
    
    # Compute RUL
    max_cycle = data.groupby('unit_number')['time_in_cycles'].max().reset_index()
    max_cycle.columns = ['unit_number', 'max_cycle']
    data = data.merge(max_cycle, on='unit_number', how='left')
    data['RUL'] = data['max_cycle'] - data['time_in_cycles']
    data = data.drop(columns=['max_cycle'])
    
    # Normalize sensor data
    if 'operational_setting_1' in data.columns:
        condition_cols = [col for col in data.columns if 'operational_setting' in col]
        data['condition_id'] = data[condition_cols].astype(str).agg('-'.join, axis=1)
        scaler = MinMaxScaler()
        for cond in data['condition_id'].unique():
            cond_data = data[data['condition_id']==cond]
            data.loc[data['condition_id']==cond, sensor_cols] = scaler.fit_transform(cond_data[sensor_cols])
    else:
        scaler = MinMaxScaler()
        data[sensor_cols] = scaler.fit_transform(data[sensor_cols])
    
    # Save full processed dataset as CSV
    csv_path = os.path.join(dataset_folder, f'train_{dataset_name}_processed.csv')
    data.to_csv(csv_path, index=False)
    print(f"Saved processed CSV: {csv_path}")
    
    # Create rolling window sequences
    X, y = create_sequences(data, sensor_cols, WINDOW_SIZE)
    print(f"{dataset_name} sequences shape: {X.shape}, RUL shape: {y.shape}")
    
    # Save sequences as .npy
    np.save(os.path.join(dataset_folder, f'X_{dataset_name}.npy'), X)
    np.save(os.path.join(dataset_folder, f'y_{dataset_name}.npy'), y)
    print(f"Preprocessed .npy files saved for {dataset_name}.")



Processing FD002...


  return xp.asarray(numpy.nanmin(X, axis=axis))
  return xp.asarray(numpy.nanmax(X, axis=axis))
 0.75 0.5  0.5  0.   0.75 0.75 0.25 0.25 0.75 0.75 0.5  0.75 0.5  0.5
 0.25 0.25 0.25 0.5  0.   0.25 0.5  0.   1.   0.25 0.5  0.25 0.   0.
 0.5  0.   0.5  0.75 0.5  0.25 0.25 0.75 0.   0.25 0.75 0.25 0.25]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  data.loc[data['condition_id']==cond, sensor_cols] = scaler.fit_transform(cond_data[sensor_cols])
  return xp.asarray(numpy.nanmin(X, axis=axis))
  return xp.asarray(numpy.nanmax(X, axis=axis))
  return xp.asarray(numpy.nanmin(X, axis=axis))
  return xp.asarray(numpy.nanmax(X, axis=axis))
  return xp.asarray(numpy.nanmin(X, axis=axis))
  return xp.asarray(numpy.nanmax(X, axis=axis))
  return xp.asarray(numpy.nanmin(X, axis=axis))
  return xp.asarray(numpy.nanmax(X, axis=axis))
  return xp.asarray(numpy.nanmin(X, axis=axis))
  return xp.asarray(numpy.nanmax(X, axis=axis))
  return xp.asarray(numpy.nanmi

Saved processed CSV: C:\Users\Nithin G J\Desktop\PragnosAI\archive (1)\CMaps\train_FD002_processed.csv
FD002 sequences shape: (38215, 30, 21), RUL shape: (38215,)
Preprocessed .npy files saved for FD002.

Processing FD003...


  return xp.asarray(numpy.nanmin(X, axis=axis))
  return xp.asarray(numpy.nanmax(X, axis=axis))
 0.5   0.75  0.625 0.625 0.625 0.5   0.5   0.75  0.25  0.625 0.375 0.625
 0.375 0.375 0.625 0.75  0.25  0.375 0.5   0.25  0.5   0.625 0.625 0.5
 0.375 0.5   0.375 0.5   0.375 0.625 0.5   0.375 0.5   0.75  0.75  0.75
 0.375 0.5   0.625 0.5   0.375 0.25  0.375 0.625 0.375 0.75  0.5   0.375
 0.5   0.625 0.75  0.625 0.125 0.5   0.625 0.375 0.625 0.5   0.75  0.5
 0.5   0.375 0.125 0.5   0.5   0.625 0.375 0.125 0.25  0.375 0.5   0.5
 0.625 0.5   0.5   0.5   0.25  0.375 0.5   0.5   0.75  0.75  1.    0.375
 0.625 0.625 0.75  0.25  0.375 0.375 0.5   0.125 0.625 0.625 0.5   0.5
 0.625 0.75  0.5   0.625 0.5   0.375 0.375 0.5   0.75  0.625 0.625 0.625
 0.375 0.5   0.375 0.5   0.375 0.125 0.625 0.25  0.5   0.5   0.25  0.
 0.625 0.625 0.5   0.75  0.625 0.625 0.375 0.625 0.625 0.625 0.75  0.5
 0.75  0.5   0.5   0.5   0.625 0.25  0.    0.375 0.25  0.375 0.625 0.5
 0.375 0.5   0.25  0.75  0.75  0.875 0.75  0

Saved processed CSV: C:\Users\Nithin G J\Desktop\PragnosAI\archive (1)\CMaps\train_FD003_processed.csv
FD003 sequences shape: (21278, 30, 21), RUL shape: (21278,)
Preprocessed .npy files saved for FD003.

Processing FD004...


  return xp.asarray(numpy.nanmin(X, axis=axis))
  return xp.asarray(numpy.nanmax(X, axis=axis))
 0.42857143 0.57142857 0.42857143 0.42857143 0.57142857 0.42857143
 0.14285714 0.42857143 0.28571429 0.28571429 0.42857143 0.14285714
 0.42857143 0.28571429 0.28571429 0.42857143 1.         0.85714286
 0.42857143 0.42857143 0.57142857 0.71428571 0.57142857 0.28571429
 0.42857143 0.42857143 0.14285714 0.57142857 0.28571429 0.42857143
 0.28571429 0.42857143 0.14285714 0.85714286 0.42857143 0.42857143
 0.28571429 0.28571429 0.42857143 0.71428571 0.14285714 0.85714286
 0.         0.42857143 0.57142857 0.28571429 0.42857143 0.
 0.71428571 0.14285714 0.14285714 0.42857143 0.28571429 0.28571429
 0.14285714 0.28571429 0.57142857 0.42857143 0.71428571 0.57142857
 0.57142857 0.71428571 0.42857143 0.71428571 0.57142857 0.14285714
 0.57142857 0.28571429 0.42857143 0.71428571 0.42857143 0.42857143
 0.14285714 0.42857143 0.28571429 0.42857143 0.28571429 0.14285714
 0.14285714 0.14285714 0.28571429 0.28571

Saved processed CSV: C:\Users\Nithin G J\Desktop\PragnosAI\archive (1)\CMaps\train_FD004_processed.csv
FD004 sequences shape: (45705, 30, 21), RUL shape: (45705,)
Preprocessed .npy files saved for FD004.


In [22]:
# ==========================================
# PrognosAI – CMAPSS Preprocessing Script
# Generates: preprocessed CSVs + LSTM sequences (.npy)
# Works for FD001–FD004
# ==========================================

import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import os

# ================== SETTINGS ==================
DATASETS = ["FD001", "FD002", "FD003", "FD004"]
RAW_PATH = r"C:\Users\Nithin G J\Desktop\PragnosAI\archive (1)\CMaps"
SAVE_PATH = r"C:\Users\Nithin G J\Desktop\PragnosAI\processed_data"  # folder to save outputs
WINDOW_SIZE = 30

os.makedirs(SAVE_PATH, exist_ok=True)

# Function to create rolling sequences
def create_sequences(df, sensor_cols, window_size):
    sequences, rul = [], []
    for unit in df['unit_number'].unique():
        unit_data = df[df['unit_number']==unit].sort_values('time_in_cycles')
        sensor_data = unit_data[sensor_cols].values
        rul_data = unit_data['RUL'].values
        for i in range(len(sensor_data) - window_size + 1):
            sequences.append(sensor_data[i:i+window_size])
            rul.append(rul_data[i+window_size-1])
    return np.array(sequences), np.array(rul)

# ================== PROCESS DATASETS ==================
for ds in DATASETS:
    print(f"\n--- Preprocessing {ds} ---")
    
    try:
        # ======= LOAD TRAIN =======
        train_file = os.path.join(RAW_PATH, f"train_{ds}.txt")
        col_names = ['unit_number', 'time_in_cycles'] + [f'operational_setting_{i}' for i in range(1, 4)] + [f'sensor_{i}' for i in range(1, 22)]
        train = pd.read_csv(train_file, sep=' ', header=None, names=col_names, engine='python')
        train = train.drop(columns=[c for c in train.columns if 'Unnamed' in c]).reset_index(drop=True)
        
        # ======= LOAD TEST =======
        test_file = os.path.join(RAW_PATH, f"test_{ds}.txt")
        test = pd.read_csv(test_file, sep=' ', header=None, names=col_names, engine='python')
        test = test.drop(columns=[c for c in test.columns if 'Unnamed' in c]).reset_index(drop=True)
        
        # ======= LOAD RUL =======
        rul_file = os.path.join(RAW_PATH, f"RUL_{ds}.txt")
        rul = pd.read_csv(rul_file, header=None)
        rul.columns = ['RUL']
        
        # ======= COMPUTE RUL FOR TRAIN =======
        max_cycle = train.groupby('unit_number')['time_in_cycles'].transform('max')
        train['RUL'] = max_cycle - train['time_in_cycles']
        
        # ======= NORMALIZE SENSOR COLUMNS =======
        sensor_cols = [c for c in train.columns if 'sensor' in c]
        scaler = MinMaxScaler()
        train[sensor_cols] = scaler.fit_transform(train[sensor_cols])
        test[sensor_cols] = scaler.transform(test[sensor_cols])  # use same scaler
        
        # ======= SAVE PROCESSED CSV =======
        train.to_csv(os.path.join(SAVE_PATH, f"train_{ds}_preprocessed.csv"), index=False)
        test.to_csv(os.path.join(SAVE_PATH, f"test_{ds}_preprocessed.csv"), index=False)
        rul.to_csv(os.path.join(SAVE_PATH, f"RUL_{ds}.csv"), index=False)
        print(f"{ds} CSVs saved successfully.")
        
        # ======= CREATE LSTM SEQUENCES =======
        X, y = create_sequences(train, sensor_cols, WINDOW_SIZE)
        np.save(os.path.join(SAVE_PATH, f"X_{ds}.npy"), X)
        np.save(os.path.join(SAVE_PATH, f"y_{ds}.npy"), y)
        print(f"{ds} LSTM sequences saved. Shape: X={X.shape}, y={y.shape}")
        
    except Exception as e:
        print(f"Error processing {ds}: {e}")



--- Preprocessing FD001 ---


  return xp.asarray(numpy.nanmin(X, axis=axis))
  return xp.asarray(numpy.nanmax(X, axis=axis))


FD001 CSVs saved successfully.
FD001 LSTM sequences saved. Shape: X=(17302, 30, 21), y=(17302,)

--- Preprocessing FD002 ---


  return xp.asarray(numpy.nanmin(X, axis=axis))
  return xp.asarray(numpy.nanmax(X, axis=axis))


FD002 CSVs saved successfully.
FD002 LSTM sequences saved. Shape: X=(38215, 30, 21), y=(38215,)

--- Preprocessing FD003 ---


  return xp.asarray(numpy.nanmin(X, axis=axis))
  return xp.asarray(numpy.nanmax(X, axis=axis))


FD003 CSVs saved successfully.
FD003 LSTM sequences saved. Shape: X=(21278, 30, 21), y=(21278,)

--- Preprocessing FD004 ---


  return xp.asarray(numpy.nanmin(X, axis=axis))
  return xp.asarray(numpy.nanmax(X, axis=axis))


FD004 CSVs saved successfully.
FD004 LSTM sequences saved. Shape: X=(45705, 30, 21), y=(45705,)
