# 1 - Information

In [None]:
# Author: Pierre Oreistein
# Description inspired by: https://towardsdatascience.com/predictive-maintenance-of-turbofan-engines-ec54a083127

# 2 - Packages

In [None]:
%reload_kedro

In [None]:
# Data Handling Packages
import pandas as pd

# Machine Learning Packages
from sklearn.preprocessing import StandardScaler

# Prevent unnecessary warnings
from warnings import filterwarnings
filterwarnings("ignore", ".*`should_run_async`.*")

# 3 - Feature Engineering

In [None]:
def drop_unecessary_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` without the columns configured in the params.

    The names of the columns to remove are read from
    ``context.params["columns_to_drop"]``.

    Args:
        df: Frame to remove the columns from. It is left untouched.
    Returns:
        A new DataFrame that no longer contains the configured columns.
    Raises:
        KeyError: If a configured column is not present in ``df``
            (raised by ``DataFrame.drop``).
    """
    # Load columns to drop defined in params
    columns_to_drop_l = context.params["columns_to_drop"]

    # Drop on a new frame instead of in place, so the DataFrame handed in by
    # the caller is never modified behind its back.
    return df.drop(columns=columns_to_drop_l)

In [None]:
def add_remaining_useful_life(df: pd.DataFrame) -> pd.DataFrame:
    """Return a new frame equal to ``df`` plus a ``RUL`` column.

    For every row, ``RUL`` is the largest ``time`` value found for the row's
    ``unit_nb`` minus the row's own ``time``.

    Args:
        df: Frame holding at least the ``unit_nb`` and ``time`` columns.
    Returns:
        The merged frame with the additional ``RUL`` column.
    """
    # Highest cycle reached by each unit, indexed by unit number
    last_cycle_per_unit_df = (
        df.groupby(by="unit_nb")["time"].max().to_frame(name="max_cycle")
    )

    # Attach the unit-level maximum to every row of that unit
    with_max_cycle_df = df.merge(
        last_cycle_per_unit_df, left_on="unit_nb", right_index=True
    )

    # Remaining useful life = cycles left until the unit's last cycle
    with_max_cycle_df["RUL"] = (
        with_max_cycle_df["max_cycle"] - with_max_cycle_df["time"]
    )

    # The helper column is only needed to compute RUL
    return with_max_cycle_df.drop(columns="max_cycle")

In [None]:
def preprocess_X_train(X_train_raw: pd.DataFrame) -> pd.DataFrame:
    """Preprocess the training data.

    Adds the Remaining Useful Life (``RUL``) column, removes the unnecessary
    columns, standardises every column except ``RUL`` and stores the fitted
    scaler in the catalog under the name ``"StandardScaler"``.

    Args:
        X_train_raw: Raw training data.
    Returns:
        The preprocessed training data, including the ``RUL`` column.
    """
    # Build the target first, then discard the unnecessary features
    prepared_df = drop_unecessary_columns(add_remaining_useful_life(X_train_raw))

    # Everything but the target gets scaled
    feature_columns = [name for name in prepared_df.columns if name != "RUL"]

    # Fit the scaler on the training features and overwrite them with the result
    feature_scaler = StandardScaler()
    prepared_df[feature_columns] = feature_scaler.fit_transform(
        prepared_df[feature_columns]
    )

    # Persist the fitted scaler so it can be loaded back from the catalog
    catalog.save(name="StandardScaler", data=feature_scaler)

    return prepared_df

In [None]:
# Load the raw training dataset from the catalog
X_train_df = catalog.load("X_train_raw")

# Preprocess the training dataset (this call also saves the fitted
# StandardScaler to the catalog)
train_df = preprocess_X_train(X_train_df)

# Save the preprocessed training dataset
catalog.save("train_preprocessed_df", train_df)

In [None]:
def preprocess_X_test(X_test_raw: pd.DataFrame) -> pd.DataFrame:
    """Preprocess the testing data.

    Keeps, for every ``unit_nb``, the row with the highest ``time``, removes
    the unnecessary columns and scales the remaining ones with the scaler
    loaded from the catalog entry ``"StandardScaler"``.

    Args:
        X_test_raw: Raw testing data. Must contain the ``unit_nb`` and
            ``time`` columns.
    Returns:
        One scaled row per unit, ordered by ``unit_nb`` and indexed from 0.
        Columns keep the order they have in ``X_test_raw``.
    Raises:
        KeyError: If ``unit_nb`` or ``time`` is missing from ``X_test_raw``.
    """
    # Keep only the last timestamp of every unit.
    # ``groupby(...).last()`` is deliberately avoided: it returns the last
    # *non-null* value of each column separately and follows the row order of
    # the input, so it can assemble a row that never existed when the data is
    # unsorted or has missing values. Sorting by ``time`` and taking the whole
    # last row of each unit has neither problem.
    last_cycle_df = (
        X_test_raw.sort_values(["unit_nb", "time"])
        .groupby("unit_nb")
        .tail(1)
        .reset_index(drop=True)
    )

    # Drop unnecessary features
    features_df = drop_unecessary_columns(last_cycle_df)

    # Scale the sensor values with the previously saved scaler
    scaler = catalog.load("StandardScaler")
    return pd.DataFrame(
        scaler.transform(features_df),
        columns=features_df.columns,
        index=features_df.index,
    )

In [None]:
# Load the raw testing dataset from the catalog
X_test_df = catalog.load("X_test_raw")

# Preprocess the testing dataset (preprocess_X_test loads the "StandardScaler"
# catalog entry, so the training preprocessing must have saved it beforehand)
X_test_df = preprocess_X_test(X_test_df)

# Save the preprocessed testing dataset
catalog.save("X_test_preprocessed_df", X_test_df)