In [33]:
import os

import joblib
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

In [167]:
# Directory holding the raw FD004 CSV files.
# The default is the original author's local folder; set the CMAPS_RAW_DIR
# environment variable to run the notebook on another machine without
# editing this cell.
RAW_DATA_DIR = os.environ.get(
    "CMAPS_RAW_DIR",
    "C:/Users/chsai/OneDrive/Desktop/infosys internship/dataset/CMaps/raw data",
)

train_path = f"{RAW_DATA_DIR}/train_FD004.csv"
test_path = f"{RAW_DATA_DIR}/test_FD004.csv"
rul_path = f"{RAW_DATA_DIR}/RUL_FD004.csv"

# Load the three raw tables: training rows, test rows and the RUL table
train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)
rul_df = pd.read_csv(rul_path)

print("Train shape:", train_df.shape)
print("Test shape:", test_df.shape)
print("RUL shape:", rul_df.shape)

Train shape: (61249, 26)
Test shape: (41214, 26)
RUL shape: (248, 1)


In [169]:
# Last recorded cycle of every unit, as a two-column frame (unit_number, max_cycle)
max_cycles = (
    train_df.groupby("unit_number", as_index=False)["time_in_cycles"]
    .max()
    .rename(columns={"time_in_cycles": "max_cycle"})
)

# RUL = cycles remaining until the unit's last recorded cycle (0 on its final row).
# max_cycle is only needed for the subtraction, so it is dropped again right away.
train_df = (
    train_df.merge(max_cycles, on="unit_number", how="left")
    .assign(RUL=lambda frame: frame["max_cycle"] - frame["time_in_cycles"])
    .drop(columns=["max_cycle"])
)

train_df.head()

Unnamed: 0,unit_number,time_in_cycles,operational_setting_1,operational_setting_2,operational_setting_3,sensor_1,sensor_2,sensor_3,sensor_4,sensor_5,...,sensor_13,sensor_14,sensor_15,sensor_16,sensor_17,sensor_18,sensor_19,sensor_20,sensor_21,RUL
0,1,1,42.0049,0.84,100.0,445.0,549.68,1343.43,1112.93,3.91,...,2387.99,8074.83,9.3335,0.02,330,2212,100.0,10.62,6.367,320
1,1,2,20.002,0.7002,100.0,491.19,606.07,1477.61,1237.5,9.35,...,2387.73,8046.13,9.1913,0.02,361,2324,100.0,24.37,14.6552,319
2,1,3,42.0038,0.8409,100.0,445.0,548.95,1343.12,1117.05,3.91,...,2387.97,8066.62,9.4007,0.02,329,2212,100.0,10.48,6.4213,318
3,1,4,42.0,0.84,100.0,445.0,548.7,1341.24,1118.03,3.91,...,2388.02,8076.05,9.3369,0.02,328,2212,100.0,10.54,6.4176,317
4,1,5,25.0063,0.6207,60.0,462.54,536.1,1255.23,1033.59,7.05,...,2028.08,7865.8,10.8366,0.02,305,1915,84.93,14.03,8.6754,316


In [171]:
# Attach a per-row RUL to the test set:
#   RUL(row) = final_rul(unit) + (last_cycle(unit) - time_in_cycles(row))
# where final_rul(unit) is the value from RUL_FD004.csv, i.e. the RUL at the
# unit's last recorded cycle.

# Final RUL per unit from RUL_FD004.csv; row i is paired with the i-th smallest unit_number
unit_final_rul = rul_df["RUL"].values
units = sorted(test_df["unit_number"].unique())

# Fail loudly on a length mismatch instead of silently mis-pairing units and RUL values
if len(unit_final_rul) != len(units):
    raise ValueError(
        f"RUL file has {len(unit_final_rul)} entries but the test data has {len(units)} units"
    )
final_rul_by_unit = pd.Series(unit_final_rul, index=units)

# Last recorded cycle of each unit, broadcast to every row of that unit
last_cycle = test_df.groupby("unit_number")["time_in_cycles"].transform("max")

# Vectorised equivalent of the per-unit loop: look up each row's final RUL
# and add the cycles still to come within the recorded trajectory
test_df["RUL"] = test_df["unit_number"].map(final_rul_by_unit) + (
    last_cycle - test_df["time_in_cycles"]
)

test_df.head()

Unnamed: 0,unit_number,time_in_cycles,operational_setting_1,operational_setting_2,operational_setting_3,sensor_1,sensor_2,sensor_3,sensor_4,sensor_5,...,sensor_13,sensor_14,sensor_15,sensor_16,sensor_17,sensor_18,sensor_19,sensor_20,sensor_21,RUL
0,1,1,20.0072,0.7,100.0,491.19,606.67,1481.04,1227.81,9.35,...,2387.78,8048.98,9.2229,0.02,362,2324,100.0,24.31,14.7007,251
1,1,2,24.9984,0.62,60.0,462.54,536.22,1256.17,1031.48,7.05,...,2028.09,7863.46,10.8632,0.02,306,1915,84.93,14.36,8.5748,250
2,1,3,42.0,0.842,100.0,445.0,549.23,1340.13,1105.88,3.91,...,2387.95,8071.13,9.396,0.02,328,2212,100.0,10.39,6.4365,249
3,1,4,42.0035,0.8402,100.0,445.0,549.19,1339.7,1107.26,3.91,...,2387.9,8078.89,9.3594,0.02,328,2212,100.0,10.56,6.2367,248
4,1,5,35.0079,0.84,100.0,449.44,555.1,1353.04,1117.8,5.48,...,2387.87,8057.83,9.303,0.02,333,2223,100.0,14.85,8.9326,247


In [173]:
# Columns whose name contains "sensor"
sensor_cols = [name for name in train_df.columns if "sensor" in name]

# Sensors that take at most one distinct value in the training data
distinct_counts = train_df[sensor_cols].nunique()
constant_sensors = distinct_counts[distinct_counts <= 1].index.tolist()

# Remove them from both splits so train and test keep the same columns
train_df = train_df.drop(columns=constant_sensors)
test_df = test_df.drop(columns=constant_sensors)

# Refresh the list so it only names the sensors that remain
sensor_cols = [name for name in train_df.columns if "sensor" in name]

print("Removed constant sensors:", constant_sensors)

Removed constant sensors: []


In [175]:
# Report the current dimensions of both splits, one per line
print(f"Train shape: {train_df.shape}", f"Test shape: {test_df.shape}", sep="\n")

Train shape: (61249, 27)
Test shape: (41214, 27)


In [177]:
# Min-max scaler whose ranges are learned from the training sensors only
# NOTE(review): only columns in sensor_cols are scaled; operational_setting_*
# columns do not match the "sensor" filter and stay in raw units — confirm intended.
scaler = MinMaxScaler()
scaler.fit(train_df[sensor_cols])

# Apply the training-derived scaling to both splits
for split_df in (train_df, test_df):
    split_df[sensor_cols] = scaler.transform(split_df[sensor_cols])

In [179]:
window_size = 30  # number of consecutive rows in each sliding window
feature_cols = [
    name
    for name in train_df.columns
    if name not in ["unit_number", "time_in_cycles", "RUL"]
]

train_sequences, train_rul = [], []

# sort=False keeps units in order of first appearance, like Series.unique()
for _, engine_rows in train_df.groupby("unit_number", sort=False):
    features = engine_rows[feature_cols].to_numpy()
    targets = engine_rows["RUL"].to_numpy()

    # `stop` is the exclusive end of each window; the label is the RUL of the
    # window's last row. Units shorter than window_size yield no windows.
    for stop in range(window_size, len(features) + 1):
        train_sequences.append(features[stop - window_size:stop])
        train_rul.append(targets[stop - 1])

train_sequences = np.array(train_sequences)
train_rul = np.array(train_rul)

print("Train sequences shape:", train_sequences.shape)
print("Train RUL shape:", train_rul.shape)

Train sequences shape: (54028, 30, 24)
Train RUL shape: (54028,)


In [181]:
# Build sliding windows over the test set, one unit at a time.
# Each sample is window_size consecutive rows of feature_cols; its label is the
# RUL of the window's last row.
test_sequences = []
test_rul_seq = []
skipped_test_units = []  # units with fewer than window_size rows (no window possible)

# sort=False keeps units in order of first appearance, like Series.unique()
for unit, unit_data in test_df.groupby("unit_number", sort=False):
    n_windows = len(unit_data) - window_size + 1
    if n_windows <= 0:
        # Too short for even one window: record it instead of dropping it silently
        skipped_test_units.append(unit)
        continue

    unit_array = unit_data[feature_cols].values
    unit_rul_values = unit_data["RUL"].values

    for i in range(n_windows):
        test_sequences.append(unit_array[i:i+window_size])
        test_rul_seq.append(unit_rul_values[i+window_size-1])

test_sequences = np.array(test_sequences)
test_rul_seq = np.array(test_rul_seq)

print("Test sequences shape:", test_sequences.shape)
print("Test RUL shape:", test_rul_seq.shape)

# Make the omission visible: these units contribute no samples to the arrays above
if skipped_test_units:
    print(
        f"Warning: {len(skipped_test_units)} test unit(s) have fewer than "
        f"{window_size} cycles and produced no sequences:",
        np.asarray(skipped_test_units).tolist(),
    )

Test sequences shape: (34081, 30, 24)
Test RUL shape: (34081,)


In [183]:
import os

# Destination directory for all preprocessed artefacts
save_folder = "C:/Users/chsai/OneDrive/Desktop/infosys internship/dataset/CMaps/processed data"

# exist_ok=True creates the folder (and any missing parents) or does nothing if
# it is already there, avoiding the check-then-create race of exists() + makedirs()
os.makedirs(save_folder, exist_ok=True)

print(f"Folder '{save_folder}' is ready.")

Folder 'C:/Users/chsai/OneDrive/Desktop/infosys internship/dataset/CMaps/processed data' is ready.


In [185]:
# Destination files for the preprocessed, scaled tables
train_csv_path = os.path.join(save_folder, "train_preprocessed_scaled_FD004.csv")
test_csv_path = os.path.join(save_folder, "test_preprocessed_scaled_FD004.csv")

# Write train first, then test; the row index is not written
for frame, destination in ((train_df, train_csv_path), (test_df, test_csv_path)):
    frame.to_csv(destination, index=False)

print("df saved")

df saved


In [187]:
# File name -> array, written in insertion order into save_folder
array_outputs = {
    "train_sequences_FD004.npy": train_sequences,
    "train_rul_FD004.npy": train_rul,
    "test_sequences_FD004.npy": test_sequences,
    "test_rul_FD004.npy": test_rul_seq,
}
for filename, array in array_outputs.items():
    np.save(os.path.join(save_folder, filename), array)

print("Sequences and RUL files saved")

Sequences and RUL files saved
