In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Load the raw FD001 training file (whitespace-delimited, no header row).
# The path is relative so the notebook is not tied to one machine; it assumes
# the notebook is run from a directory that sits next to `data/`, which is the
# same assumption the '../data/...' export paths in this notebook already make.
RAW_TRAIN_PATH = '../data/CMaps/train_FD001.txt'

col_names = [
    'unit_number', 'time_in_cycles',
    'op_setting_1', 'op_setting_2', 'op_setting_3',
    'sensor_1', 'sensor_2', 'sensor_3', 'sensor_4', 'sensor_5',
    'sensor_6', 'sensor_7', 'sensor_8', 'sensor_9', 'sensor_10',
    'sensor_11', 'sensor_12', 'sensor_13', 'sensor_14', 'sensor_15',
    'sensor_16', 'sensor_17', 'sensor_18', 'sensor_19', 'sensor_20',
    'sensor_21'
]

# Splitting on runs of whitespace (rather than a single space) means trailing
# blanks on each line do not create phantom all-NaN columns, so no
# `dropna(axis=1, how='all')` is needed -- a step that would also silently
# remove a genuinely empty data column and shift every name after it.
train_df = pd.read_csv(RAW_TRAIN_PATH, sep=r'\s+', header=None)

# Fail loudly if the file layout is not the one the names above describe.
if train_df.shape[1] != len(col_names):
    raise ValueError(
        f"Expected {len(col_names)} columns in {RAW_TRAIN_PATH}, found {train_df.shape[1]}"
    )
train_df.columns = col_names


In [2]:
# Remaining useful life (RUL) label: for each row, the number of cycles left
# until the last cycle recorded for that engine.

# Last recorded cycle per engine, as a two-column lookup table.
rul_df = (
    train_df.groupby('unit_number')['time_in_cycles']
    .max()
    .rename('max_cycle')
    .reset_index()
)

# Attach each engine's final cycle to all of its rows, derive the label, and
# discard the helper column again.
train_df = train_df.merge(rul_df, on='unit_number', how='left')
train_df['RUL'] = train_df['max_cycle'] - train_df['time_in_cycles']
train_df = train_df.drop(columns='max_cycle')


In [3]:
# Identify sensor columns whose sample variance falls below 0.01 and drop them.
# NOTE(review): the cutoff is applied to the raw, unscaled readings, so it
# depends on each sensor's numeric range -- confirm that sensors with small
# absolute ranges are really meant to be discarded.
sensor_cols = [name for name in train_df.columns if 'sensor' in name]

low_var = []
for name in sensor_cols:
    if train_df[name].var() < 0.01:
        low_var.append(name)
print("Low variance sensors:", low_var)

# Continue with the remaining columns only.
train_df = train_df.drop(columns=low_var)


Low variance sensors: ['sensor_1', 'sensor_5', 'sensor_6', 'sensor_8', 'sensor_10', 'sensor_13', 'sensor_15', 'sensor_16', 'sensor_18', 'sensor_19']


In [4]:
# Rolling statistics are order-dependent, so put every engine's rows in cycle order first.
train_df = train_df.sort_values(by=['unit_number', 'time_in_cycles'])

# Rolling window features
window_size = 5

# Use only the original sensor channels as inputs. Excluding the derived
# '*_avg' / '*_std' columns keeps the cell idempotent: re-running it recomputes
# the same features instead of stacking '*_avg_avg', '*_std_avg', ... on top.
base_sensors = [
    col for col in train_df.columns
    if 'sensor' in col and not col.endswith(('_avg', '_std'))
]

for sensor in base_sensors:
    # One per-engine rolling object serves both statistics; windows never
    # span two engines because they are built inside the groupby.
    rolled = train_df.groupby('unit_number')[sensor].rolling(window_size, min_periods=1)

    # The result is indexed by (unit_number, original index); dropping level 0
    # restores the original index so the assignment below aligns row by row.
    rolling_avg = rolled.mean().reset_index(level=0, drop=True)
    rolling_std = rolled.std().reset_index(level=0, drop=True)

    train_df[f'{sensor}_avg'] = rolling_avg
    # The sample standard deviation of a single observation is undefined, so
    # the first cycle of every engine comes back as NaN. Record "no spread yet"
    # as 0.0 so the exported feature table contains no missing values.
    train_df[f'{sensor}_std'] = rolling_std.fillna(0.0)


In [5]:
# Define columns to scale
feature_cols = [col for col in train_df.columns if 'sensor' in col or 'op_setting' in col]

# Learn the min/max from training engines only. Fitting on every row would let
# the held-out engines' value ranges leak into the features the model trains on.
# The engine-level split in this notebook is deterministic, so it is reproduced
# here; `test_size` and `random_state` MUST stay identical to the values used
# in the split cell, otherwise the two partitions will disagree.
fit_engines = train_test_split(
    train_df['unit_number'].unique(), test_size=0.2, random_state=42
)[0]
fit_rows = train_df['unit_number'].isin(fit_engines)

scaler = MinMaxScaler()
scaler.fit(train_df.loc[fit_rows, feature_cols])

# Apply the training-derived scaling to all rows. Values for held-out engines
# may fall slightly outside [0, 1]; that is expected and not an error.
train_df[feature_cols] = scaler.transform(train_df[feature_cols])


In [6]:
from sklearn.model_selection import train_test_split

# Split by engine rather than by row, so that all cycles of a given engine end
# up on the same side of the partition.
unique_engines = train_df['unit_number'].unique()
train_engines, test_engines = train_test_split(
    unique_engines,
    test_size=0.2,
    random_state=42,
)

# Select the rows belonging to each group of engines.
engine_ids = train_df['unit_number']
train_data = train_df.loc[engine_ids.isin(train_engines)]
test_data = train_df.loc[engine_ids.isin(test_engines)]

print("Train engines:", len(train_engines))
print("Test engines:", len(test_engines))


Train engines: 80
Test engines: 20


In [7]:
# Write both partitions to CSV, omitting the DataFrame index from the files.
for frame, destination in (
    (train_data, '../data/processed_train.csv'),
    (test_data, '../data/processed_test.csv'),
):
    frame.to_csv(destination, index=False)
