## **PREPARE THE DATASETS FOR TRAINING DECISION TREES**

In [None]:
import pandas as pd
import numpy as np

In [None]:
## GENERAL SETTINGS
WINDOW_MAIN = 20      # long rolling window (20 cycles)
WINDOW_TREND = 5      # short rolling window (5 cycles)
RUL_THRESHOLD = 30    # threshold for "failing soon"

# DATAFRAME COLUMN NAMES
# Three operational settings and 21 sensors, preceded by the engine id and
# the cycle counter.
op_cols = ["op_setting_%d" % i for i in (1, 2, 3)]
sensor_cols = ["sensor_" + str(i) for i in range(1, 22)]
base_cols = ["id", "cycle", *op_cols, *sensor_cols]

In [None]:
## 1. READING THE .TXT FILES

# train_FD001: whitespace-separated values, no header row.
# sep=r"\s+" is the documented replacement for delim_whitespace=True,
# which pandas has deprecated.
df_train_raw = pd.read_csv("train_FD001.txt", sep=r"\s+", header=None)
df_train_raw.columns = base_cols

# test_FD001: read the same way and given the same column names.
df_test_raw = pd.read_csv("test_FD001.txt", sep=r"\s+", header=None)
df_test_raw.columns = base_cols

# RUL_FD001: a single column, loaded as "RUL_end".
df_rul_final = pd.read_csv("RUL_FD001.txt", header=None, names=["RUL_end"])
# Pair each RUL_end row with a test engine id, in ascending id order.
# NOTE(review): this assumes the file lists engines in ascending id order --
# confirm against the dataset description.
test_engine_ids_sorted = df_test_raw["id"].unique()
test_engine_ids_sorted.sort()
df_rul_final["id"] = test_engine_ids_sorted

df_rul_final.head()

In [None]:
## 2. CALCULATION OF RUL IN TRAINING
# RUL = highest cycle recorded for the engine - current cycle
max_cycle_per_id = (
    df_train_raw
    .groupby("id")["cycle"]
    .transform("max")
)
df_train_raw["RUL"] = max_cycle_per_id.sub(df_train_raw["cycle"])

In [None]:
## 3. CREATE BINARY LABEL fail_soon IN TRAINING
# fail_soon = 1 when RUL <= RUL_THRESHOLD, otherwise 0
df_train_raw["fail_soon"] = df_train_raw["RUL"].le(RUL_THRESHOLD).astype(int)

In [None]:
## 4. FUNCTION TO GENERATE WINDOW-BASED FEATURES
## Long window: rolling mean, standard deviation, min and max of each sensor
## Short window: current value minus the rolling mean of each sensor

def make_features_for_engine(df_engine):
    """Build rolling-window sensor features for the rows of one engine.

    Rows are ordered by "cycle" before any window is computed. Every window
    uses ``min_periods=1``, so the first rows are aggregated over however
    many cycles are available instead of producing NaN.

    Args:
        df_engine: DataFrame holding the rows of a single engine. It must
            contain "id", "cycle", "RUL", "fail_soon", the ``op_cols``
            columns and the ``sensor_cols`` columns.

    Returns:
        A DataFrame with the identifier, label and operational-setting
        columns followed by, for each sensor, the ``_mean20``, ``_std20``,
        ``_min20``, ``_max20`` and ``_trend5`` features.
    """
    ordered = df_engine.sort_values("cycle").copy()
    sensors = ordered[sensor_cols]

    # Long-window statistics. The suffixes are fixed strings: they do not
    # follow WINDOW_MAIN if that constant is changed.
    long_window = sensors.rolling(window=WINDOW_MAIN, min_periods=1)
    long_stats = {
        "_mean20": long_window.mean(),
        "_std20": long_window.std(ddof=0),  # population standard deviation
        "_min20": long_window.min(),
        "_max20": long_window.max(),
    }

    # Short-term trend: how far the current reading sits from the short
    # rolling mean (the window includes the current cycle).
    short_mean = sensors.rolling(window=WINDOW_TREND, min_periods=1).mean()
    trend = sensors.sub(short_mean).add_suffix("_trend5")

    # Assemble the pieces column-wise, passthrough columns first.
    pieces = [ordered[["id", "cycle", "RUL", "fail_soon"] + op_cols]]
    pieces.extend(frame.add_suffix(suffix) for suffix, frame in long_stats.items())
    pieces.append(trend)

    return pd.concat(pieces, axis=1)

In [None]:
## 5. APPLY THE FUNCTION TO ALL TRAINING ENGINES
# Engines are processed one by one instead of through groupby(...).apply(...):
# make_features_for_engine reads the "id" column, and pandas has deprecated
# handing the grouping column to the function passed to apply.
# The result is ordered by engine id, then by cycle, with a fresh 0..n-1 index.
df_train_feat = pd.concat(
    [
        make_features_for_engine(df_engine)
        for _, df_engine in df_train_raw.groupby("id")
    ],
    ignore_index=True,
)

In [None]:
## 6. RUL IN TEST AND fail_soon LABEL
# Per-row RUL is reconstructed backwards from RUL_end, taken as the RUL at
# the last observed cycle of each engine.

# Maximum observed cycle for each engine in the test set
max_cycle_test = df_test_raw.groupby("id")["cycle"].transform("max")
df_test_raw["cycle_max"] = max_cycle_test

rul_map = dict(zip(df_rul_final["id"], df_rul_final["RUL_end"]))

# Fail loudly if an engine has no RUL_end entry, as a direct dictionary
# lookup would.
unknown_ids = sorted(set(df_test_raw["id"].unique()) - set(rul_map))
if unknown_ids:
    raise KeyError(f"No RUL_end value for test engine id(s): {unknown_ids}")

# For each row of each engine in the test set:
# RUL(row) = engine_RUL_end + (engine_cycle_max - current_cycle)
# The lookup is vectorised with Series.map; a row-wise apply(axis=1) is slow
# and turns the integer columns into floats, which would make this RUL a
# float column unlike the one computed for training.
rul_end_per_row = df_test_raw["id"].map(rul_map)
df_test_raw["RUL"] = rul_end_per_row + (
    df_test_raw["cycle_max"] - df_test_raw["cycle"]
)

df_test_raw["fail_soon"] = (df_test_raw["RUL"] <= RUL_THRESHOLD).astype(int)

# Generate features per engine in the test set.
# Engines are iterated explicitly because make_features_for_engine reads the
# "id" column, and pandas has deprecated handing the grouping column to the
# function passed to groupby(...).apply(...).
df_test_feat = pd.concat(
    [
        make_features_for_engine(df_engine)
        for _, df_engine in df_test_raw.groupby("id")
    ],
    ignore_index=True,
)


In [None]:
## 7. SAVING DATA .CSV
# Write both feature tables without the index column.
for features, csv_path in (
    (df_train_feat, "train_features_fd001.csv"),
    (df_test_feat, "test_features_fd001.csv"),
):
    features.to_csv(csv_path, index=False)