In [11]:
#Importing Libraries

import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import MinMaxScaler

In [12]:
#Define File Paths

# Relative path to the raw data file
RAW_FILE_PATH = '../data/raw/train_FD001.txt'
# Path for the processed output file
PROCESSED_FOLDER = '../data/processed'
PROCESSED_FILE_PATH = os.path.join(PROCESSED_FOLDER, 'train_FD001_processed.csv')

In [13]:
#Define Column Names

#5 operational columns + 21 sensor columns (s1-s26 are listed, but only ~21 are dynamic)
op_cols = ['op_setting_1', 'op_setting_2', 'op_setting_3']
sensor_cols_full = [f's_{i}' for i in range(1, 27)]
column_names = ['unit_number', 'time_in_cycles'] + op_cols + sensor_cols_full

print(f"Loading data from: {RAW_FILE_PATH}")

Loading data from: ../data/raw/train_FD001.txt


In [14]:
#Loading Data

try:
    df_raw = pd.read_csv(RAW_FILE_PATH, sep='\s+', header=None, names=column_names, index_col=False)
    print(f"Raw data loaded successfully. Shape: {df_raw.shape}")
except FileNotFoundError:
    print(f"Error: The file {RAW_FILE_PATH} was not found. Please check your folder structure.")
    raise

Raw data loaded successfully. Shape: (20631, 31)


  df_raw = pd.read_csv(RAW_FILE_PATH, sep='\s+', header=None, names=column_names, index_col=False)


In [15]:
#Initial Preprocessing and Cleaning

#Drop columns that are entirely NaN (often an artifact of the TXT parsing)
df_raw.dropna(axis=1, how='all', inplace=True)

In [16]:
#Drop Static/Constant Features

static_cols = [col for col in df_raw.columns if df_raw[col].std() == 0 and col.startswith('s_')]
df_cleaned = df_raw.drop(columns=static_cols)

print(f"\nData shape after dropping all-NaN columns: {df_raw.shape}")
print(f"Static Sensor Columns Removed (STD=0): {static_cols}")
print(f"Data shape after cleaning: {df_cleaned.shape}")


Data shape after dropping all-NaN columns: (20631, 26)
Static Sensor Columns Removed (STD=0): ['s_1', 's_10', 's_18', 's_19']
Data shape after cleaning: (20631, 22)


In [17]:
df_cleaned.head()

Unnamed: 0,unit_number,time_in_cycles,op_setting_1,op_setting_2,op_setting_3,s_2,s_3,s_4,s_5,s_6,...,s_9,s_11,s_12,s_13,s_14,s_15,s_16,s_17,s_20,s_21
0,1,1,-0.0007,-0.0004,100.0,641.82,1589.7,1400.6,14.62,21.61,...,9046.19,47.47,521.66,2388.02,8138.62,8.4195,0.03,392,39.06,23.419
1,1,2,0.0019,-0.0003,100.0,642.15,1591.82,1403.14,14.62,21.61,...,9044.07,47.49,522.28,2388.07,8131.49,8.4318,0.03,392,39.0,23.4236
2,1,3,-0.0043,0.0003,100.0,642.35,1587.99,1404.2,14.62,21.61,...,9052.94,47.27,522.42,2388.03,8133.23,8.4178,0.03,390,38.95,23.3442
3,1,4,0.0007,0.0,100.0,642.35,1582.79,1401.87,14.62,21.61,...,9049.48,47.13,522.86,2388.08,8133.83,8.3682,0.03,392,38.88,23.3739
4,1,5,-0.0019,-0.0002,100.0,642.37,1582.85,1406.22,14.62,21.61,...,9055.15,47.28,522.19,2388.04,8133.8,8.4294,0.03,393,38.9,23.4044


In [18]:
#Feature Engineering (Remaining Useful Life - RUL)

print("\nFeature Engineering: Calculating RUL Target Variable")

#Calculate RUL

max_cycles_df = df_cleaned.groupby('unit_number')['time_in_cycles'].max().reset_index()
max_cycles_df.columns = ['unit_number', 'max_time_in_cycles']

df_cleaned = df_cleaned.merge(max_cycles_df, on='unit_number', how='left')

df_cleaned['RUL'] = df_cleaned['max_time_in_cycles'] - df_cleaned['time_in_cycles']

df_cleaned.drop(columns=['max_time_in_cycles'], inplace=True)

print("RUL calculation complete.")
print(df_cleaned[['unit_number', 'time_in_cycles', 'RUL']].tail())



Feature Engineering: Calculating RUL Target Variable
RUL calculation complete.
       unit_number  time_in_cycles  RUL
20626          100             196    4
20627          100             197    3
20628          100             198    2
20629          100             199    1
20630          100             200    0


In [19]:
#Feature Scaling

print("\nScaling Sensor and Operational Features")

#Identify Features to Scale

cols_to_exclude = ['unit_number', 'time_in_cycles', 'RUL']
features_to_scale = df_cleaned.columns.drop(cols_to_exclude, errors='ignore').tolist()

#Apply Min-Max Scaling (0 to 1)
scaler = MinMaxScaler()

#Perform scaling on a copy
df_scaled = df_cleaned.copy()
df_scaled[features_to_scale] = scaler.fit_transform(df_scaled[features_to_scale])

print("Features scaled successfully (MinMaxScaler).")


Scaling Sensor and Operational Features
Features scaled successfully (MinMaxScaler).


In [20]:
#Final Output and Saving

#Prepare Output Directory and Save File
if not os.path.exists(PROCESSED_FOLDER):
    os.makedirs(PROCESSED_FOLDER)
    print(f"\nCreated directory: {PROCESSED_FOLDER}")

#Save the final cleaned, RUL-calculated, and scaled DataFrame to CSV
df_scaled.to_csv(PROCESSED_FILE_PATH, index=False)

print(f"\nMilestone 1 Complete!")
print(f"Final processed dataset saved to: {PROCESSED_FILE_PATH}")
print(f"Final shape of the processed data: {df_scaled.shape}")


Milestone 1 Complete!
Final processed dataset saved to: ../data/processed\train_FD001_processed.csv
Final shape of the processed data: (20631, 23)
