In [2]:
import pandas as pd
import numpy as np


In [3]:
# Column labels applied to the header-less CMAPSS text files: two identifier
# columns, three operating settings, then sensor channels s1 through s21.
col_names = [
    "engine_id",
    "cycle",
    "op_setting_1",
    "op_setting_2",
    "op_setting_3",
    *(f"s{i}" for i in range(1, 22)),
]


In [4]:
# Load the FD001 training file. It is read as whitespace-separated text with
# header=None, so the column labels are supplied from col_names.
train = pd.read_csv(r"C:\infosys_project\archive (3)\CMaps\train_FD001.txt",
                    sep=r"\s+", header=None, names=col_names)

# Remaining useful life per row = the largest cycle recorded for that engine
# minus the row's own cycle. The aggregation is named by string ("max")
# instead of passing the builtin max, which pandas deprecates and warns about.
train["RUL"] = train.groupby("engine_id")["cycle"].transform("max") - train["cycle"]

# Save cleaned CSV (choose a safe folder)
train.to_csv(r"C:\infosys_project\train_FD001.csv", index=False)
print("Train data saved successfully!")


  train["RUL"] = train.groupby("engine_id")["cycle"].transform(max) - train["cycle"]


Train data saved successfully!


In [5]:
# Parse the FD001 test file as whitespace-separated text. header=None means
# no line is treated as a header; labels are taken from col_names instead.
test = pd.read_csv(
    r"C:\infosys_project\archive (3)\CMaps\test_FD001.txt",
    names=col_names,
    header=None,
    sep=r"\s+",
)

# Write the parsed frame back out as CSV, omitting the index column.
test.to_csv(r"C:\infosys_project\test_FD001.csv", index=False)
print("Test data saved successfully!")


Test data saved successfully!


In [6]:
# Read the FD001 RUL file with header=None and label its single column "RUL".
rul = pd.read_csv(
    r"C:\infosys_project\archive (3)\CMaps\RUL_FD001.txt",
    names=["RUL"],
    header=None,
)

# Write the frame out as CSV, omitting the index column.
rul.to_csv(r"C:\infosys_project\RUL_FD001.csv", index=False)
print("RUL data saved successfully!")


RUL data saved successfully!


In [7]:
# Training frame: print the per-column count of missing values, then remove
# any fully duplicated rows in place.
print("Missing values in train:\n", train.isna().sum())
train.drop_duplicates(inplace=True)

# Test frame: same missing-value report and in-place de-duplication.
print("Missing values in test:\n", test.isna().sum())
test.drop_duplicates(inplace=True)


Missing values in train:
 engine_id       0
cycle           0
op_setting_1    0
op_setting_2    0
op_setting_3    0
s1              0
s2              0
s3              0
s4              0
s5              0
s6              0
s7              0
s8              0
s9              0
s10             0
s11             0
s12             0
s13             0
s14             0
s15             0
s16             0
s17             0
s18             0
s19             0
s20             0
s21             0
RUL             0
dtype: int64
Missing values in test:
 engine_id       0
cycle           0
op_setting_1    0
op_setting_2    0
op_setting_3    0
s1              0
s2              0
s3              0
s4              0
s5              0
s6              0
s7              0
s8              0
s9              0
s10             0
s11             0
s12             0
s13             0
s14             0
s15             0
s16             0
s17             0
s18             0
s19             0
s20             0


In [8]:
# Remaining useful life for every training row: the largest cycle recorded for
# the row's engine minus the row's own cycle. "max" is passed by name because
# handing the builtin max to transform triggers a pandas FutureWarning.
rul_train = train.groupby("engine_id")["cycle"].transform("max") - train["cycle"]
train["RUL"] = rul_train
train.head()


  rul_train = train.groupby("engine_id")["cycle"].transform(max) - train["cycle"]


Unnamed: 0,engine_id,cycle,op_setting_1,op_setting_2,op_setting_3,s1,s2,s3,s4,s5,...,s13,s14,s15,s16,s17,s18,s19,s20,s21,RUL
0,1,1,-0.0007,-0.0004,100.0,518.67,641.82,1589.7,1400.6,14.62,...,2388.02,8138.62,8.4195,0.03,392,2388,100.0,39.06,23.419,191
1,1,2,0.0019,-0.0003,100.0,518.67,642.15,1591.82,1403.14,14.62,...,2388.07,8131.49,8.4318,0.03,392,2388,100.0,39.0,23.4236,190
2,1,3,-0.0043,0.0003,100.0,518.67,642.35,1587.99,1404.2,14.62,...,2388.03,8133.23,8.4178,0.03,390,2388,100.0,38.95,23.3442,189
3,1,4,0.0007,0.0,100.0,518.67,642.35,1582.79,1401.87,14.62,...,2388.08,8133.83,8.3682,0.03,392,2388,100.0,38.88,23.3739,188
4,1,5,-0.0019,-0.0002,100.0,518.67,642.37,1582.85,1406.22,14.62,...,2388.04,8133.8,8.4294,0.03,393,2388,100.0,38.9,23.4044,187


In [9]:
def create_sequences(df, window_size=30):
    """Cut each engine's rows into overlapping fixed-length feature windows.

    For every engine (in order of first appearance in ``df``) a window of
    ``window_size`` consecutive rows is slid one row at a time. Each window
    contributes the feature values of its rows and, as its target, the
    ``RUL`` value of the window's last row. Engines with fewer than
    ``window_size`` rows contribute nothing.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``engine_id``, ``cycle`` and ``RUL`` columns; every
        other column is treated as a feature.
    window_size : int, default 30
        Number of consecutive rows per window. Must be at least 1.

    Returns
    -------
    tuple of numpy.ndarray
        ``(sequences, targets)`` where ``sequences`` has shape
        ``(n_windows, window_size, n_features)`` and ``targets`` has shape
        ``(n_windows,)``. When no engine is long enough, correctly shaped
        empty arrays are returned.

    Raises
    ------
    ValueError
        If ``window_size`` is smaller than 1.
    KeyError
        If ``engine_id``, ``cycle`` or ``RUL`` is missing from ``df``.
    """
    # A non-positive window would yield empty slices and wrap-around targets.
    if window_size < 1:
        raise ValueError("window_size must be a positive integer")

    # Everything except the identifier columns and the target is a feature.
    feature_cols = df.columns.drop(["engine_id", "cycle", "RUL"])
    sequences, targets = [], []

    # One pass over the frame; sort=False keeps engines in first-seen order.
    for _, engine_data in df.groupby("engine_id", sort=False):
        features = engine_data[feature_cols].values
        target = engine_data["RUL"].values

        for start in range(len(engine_data) - window_size + 1):
            stop = start + window_size
            sequences.append(features[start:stop])
            # The label of a window is the RUL at its final row.
            targets.append(target[stop - 1])

    if not sequences:
        # np.array([]) would collapse to shape (0,); keep the documented rank.
        return (
            np.empty((0, window_size, len(feature_cols))),
            np.empty((0,), dtype=df["RUL"].dtype),
        )

    return np.array(sequences), np.array(targets)

# Window the training frame into 30-row sequences and report both array shapes.
X, y = create_sequences(train, 30)
print("Sequence shape:", X.shape)
print("Target shape:", y.shape)


Sequence shape: (17731, 30, 24)
Target shape: (17731,)


In [10]:
# Write the training frame, as it stands at this point, to CSV without the index column.
train.to_csv(
    path_or_buf=r"C:\infosys_project\archive (3)\CMaps\train_FD001_preprocessed.csv",
    index=False,
)
