In [45]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import math
import os
from sklearn.preprocessing import LabelEncoder, StandardScaler
from imblearn.over_sampling import RandomOverSampler
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from pathlib import Path

DATA_PATH = Path("datasets/CMaps/")
images_dir = "images"

indexes = ['unit_number', 'time_cycles']
settings = ['setting_1', 'setting_2', 'setting_3']
sensors = ['s_{}'.format(i+1) for i in range(0,21)]
COLS = indexes + settings + sensors

In [46]:
Sensor_dictionary = {}
dict_list = [
    "Fan intake temperature (°R)",
    "Low-Pressure Compressor outlet temperature (°R)",
    "High-Pressure Compressor outlet temperature (°R)",
    "Low-Pressure Turbine outlet temperature (°R)",
    "Fan intake pressure (psia)",
    "Bypass-duct pressure (psia)",
    "High-Pressure Compressor outlet pressure (psia)",
    "Physical fan RPM",
    "Physical core RPM",
    "Engine pressure ratio (P50/P2)",
    "High-Pressure Compressor outlet static pressure (psia)",
    "Fuel flow to Ps30 ratio (pps/psia)",
    "Corrected fan RPM",
    "Corrected core RPM",
    "Bypass ratio",
    "Burner fuel-air ratio",
    "Bleed enthalpy",
    "Required fan RPM",
    "Required fan conversion RPM",
    "High-pressure turbine cooling airflow",
    "Low-pressure turbine cooling airflow"
]

Sensor_dictionary = {f's_{i+1}': sensor for i, sensor in enumerate(dict_list)}
Sensor_dictionary

def load_fd_dataset(dataset_id):

    train_file = DATA_PATH / f"train_FD00{dataset_id}.txt"
    test_file  = DATA_PATH / f"test_FD00{dataset_id}.txt"
    rul_file   = DATA_PATH / f"RUL_FD00{dataset_id}.txt"

    df_train = pd.read_csv(
        train_file,
        sep=r"\s+",        
        header=None,
        names=COLS,
        index_col=False
    )

    df_test = pd.read_csv(
        test_file,
        sep=r"\s+",
        header=None,
        names=COLS,
        index_col=False
    )

    df_rul = pd.read_csv(
        rul_file,
        sep=r"\s+",
        header=None,
        names=["RUL"],
        index_col=False
    )
    
    return df_train, df_test, df_rul

def add_train_rul(df_train):
    # Group by unit and get the max cycle of each engine
    max_cycle = df_train.groupby("unit_number")["time_cycles"].transform("max")
    # RUL = distance to max cycle
    df_train["RUL"] = max_cycle - df_train["time_cycles"]
    return df_train

def add_test_rul(df_test, df_rul):

    idx = df_test.groupby("unit_number")["time_cycles"].transform("max") == df_test["time_cycles"]
    final_test_rows = df_test[idx].copy().reset_index(drop=True)
    final_test_rows["RUL"] = df_rul["RUL"]
    
    return final_test_rows

In [47]:
datasets = {}  

for i in range(1, 5):
    
    df_train_raw, df_test_raw, df_rul = load_fd_dataset(i)
    df_train = add_train_rul(df_train_raw)
    df_test_final = add_test_rul(df_test_raw, df_rul)
    key = f"FD00{i}"
    datasets[key] = {
        "train":       df_train,   
        "test":        df_test_raw,
        "rul":         df_rul,
        "test_final":  df_test_final,
    }

# Gradient Boosting

In [48]:
""" from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Prepare features and target
df_fd001 = datasets["FD001"]["train"]
X = df_fd001.drop(columns=["unit_number", "time_cycles", "RUL"])
y = df_fd001["RUL"]

# Split into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train Gradient Boosting Regressor
gbr = GradientBoostingRegressor(
    n_estimators=200,
    learning_rate=0.1,
    max_depth=5,
    random_state=42
)
gbr.fit(X_train, y_train)

# Make predictions
y_pred = gbr.predict(X_val)

# Evaluate
rmse = np.sqrt(mean_squared_error(y_val, y_pred))
mae = mean_absolute_error(y_val, y_pred)
r2 = r2_score(y_val, y_pred)

print("RMSE:", rmse)
print("MAE:", mae)
print("R²:", r2) """



' from sklearn.ensemble import GradientBoostingRegressor\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score\n\n# Prepare features and target\ndf_fd001 = datasets["FD001"]["train"]\nX = df_fd001.drop(columns=["unit_number", "time_cycles", "RUL"])\ny = df_fd001["RUL"]\n\n# Split into training and validation sets\nX_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)\n\n# Initialize and train Gradient Boosting Regressor\ngbr = GradientBoostingRegressor(\n    n_estimators=200,\n    learning_rate=0.1,\n    max_depth=5,\n    random_state=42\n)\ngbr.fit(X_train, y_train)\n\n# Make predictions\ny_pred = gbr.predict(X_val)\n\n# Evaluate\nrmse = np.sqrt(mean_squared_error(y_val, y_pred))\nmae = mean_absolute_error(y_val, y_pred)\nr2 = r2_score(y_val, y_pred)\n\nprint("RMSE:", rmse)\nprint("MAE:", mae)\nprint("R²:", r2) '

In [49]:
""" from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

# Prepare features and target
df_fd001 = datasets["FD001"]["train"]

# Drop unnecessary columns
drop_cols = ["unit_number", "time_cycles", "RUL", "s_1", "s_5", "s_6", "s_10", "s_16", "s_18", "s_19"]
feature_cols = [col for col in df_fd001.columns if col not in drop_cols]
X = df_fd001[feature_cols]
y = df_fd001["RUL"]

# Split into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train Gradient Boosting Regressor
gbr = GradientBoostingRegressor(
    n_estimators=200,
    learning_rate=0.1,
    max_depth=5,
    random_state=42
)
gbr.fit(X_train, y_train)

# Make predictions
y_pred = gbr.predict(X_val)

# Evaluate
rmse = np.sqrt(mean_squared_error(y_val, y_pred))
mae = mean_absolute_error(y_val, y_pred)
r2 = r2_score(y_val, y_pred)

print("RMSE:", rmse)
print("MAE:", mae)
print("R²:", r2) """

' from sklearn.ensemble import GradientBoostingRegressor\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score\nimport numpy as np\n\n# Prepare features and target\ndf_fd001 = datasets["FD001"]["train"]\n\n# Drop unnecessary columns\ndrop_cols = ["unit_number", "time_cycles", "RUL", "s_1", "s_5", "s_6", "s_10", "s_16", "s_18", "s_19"]\nfeature_cols = [col for col in df_fd001.columns if col not in drop_cols]\nX = df_fd001[feature_cols]\ny = df_fd001["RUL"]\n\n# Split into training and validation sets\nX_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)\n\n# Initialize and train Gradient Boosting Regressor\ngbr = GradientBoostingRegressor(\n    n_estimators=200,\n    learning_rate=0.1,\n    max_depth=5,\n    random_state=42\n)\ngbr.fit(X_train, y_train)\n\n# Make predictions\ny_pred = gbr.predict(X_val)\n\n# Evaluate\nrmse = np.sqrt(mean_squared_error(y_val, y_pred))\nmae = 

In [50]:
""" import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from scipy.stats import uniform, randint

# Prepare features and target
df_fd001 = datasets["FD001"]["train"]
X = df_fd001.drop(columns=["unit_number", "time_cycles", "RUL"])
y = df_fd001["RUL"]

# Split into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the Gradient Boosting Regressor
gbr = GradientBoostingRegressor(random_state=42)

# Set up the hyperparameter search space
param_dist = {
    'n_estimators': randint(100, 1000),  # number of trees
    'learning_rate': uniform(0.01, 0.2),  # learning rate
    'max_depth': randint(3, 10),  # max depth of the trees
    'min_samples_split': randint(2, 10),  # min samples to split
    'min_samples_leaf': randint(1, 10),  # min samples to be at a leaf node
    'subsample': uniform(0.5, 0.5)  # fraction of samples to use for fitting each tree
}

# Set up RandomizedSearchCV
random_search = RandomizedSearchCV(
    gbr, param_distributions=param_dist, 
    n_iter=50,  # number of different combinations to try
    scoring='neg_mean_squared_error',  # use negative MSE for scoring (as lower is better)
    cv=5,  # 5-fold cross-validation
    verbose=1,  # print progress
    random_state=42,
    n_jobs=-1  # use all available cores for computation
)

# Run the RandomizedSearchCV
random_search.fit(X_train, y_train)

# Get the best model from the random search
best_gbr = random_search.best_estimator_

# Make predictions with the best model
y_pred = best_gbr.predict(X_val)

# Evaluate the model
rmse = np.sqrt(mean_squared_error(y_val, y_pred))
mae = mean_absolute_error(y_val, y_pred)
r2 = r2_score(y_val, y_pred)

print("Best Hyperparameters:", random_search.best_params_)
print("RMSE:", rmse)
print("MAE:", mae)
print("R²:", r2)
 """

' import numpy as np\nimport pandas as pd\nfrom sklearn.ensemble import GradientBoostingRegressor\nfrom sklearn.model_selection import train_test_split, RandomizedSearchCV\nfrom sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score\nfrom scipy.stats import uniform, randint\n\n# Prepare features and target\ndf_fd001 = datasets["FD001"]["train"]\nX = df_fd001.drop(columns=["unit_number", "time_cycles", "RUL"])\ny = df_fd001["RUL"]\n\n# Split into training and validation sets\nX_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)\n\n# Initialize the Gradient Boosting Regressor\ngbr = GradientBoostingRegressor(random_state=42)\n\n# Set up the hyperparameter search space\nparam_dist = {\n    \'n_estimators\': randint(100, 1000),  # number of trees\n    \'learning_rate\': uniform(0.01, 0.2),  # learning rate\n    \'max_depth\': randint(3, 10),  # max depth of the trees\n    \'min_samples_split\': randint(2, 10),  # min samples to split\n  

In [51]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Load dataset
df_fd001 = datasets["FD001"]["train"]

# 1. Handle missing values
# Checking for missing values in the dataset
missing_data = df_fd001.isnull().sum()
print("Missing Values:\n", missing_data)

# If there are missing values, you can either drop them or fill them. 
# Let's fill missing values with the median (this can be changed depending on the context).
df_fd001 = df_fd001.fillna(df_fd001.median())

# 2. Outlier detection and removal
# Outliers can be detected using z-scores, IQR, or domain knowledge. 
# Here, we'll use IQR to remove extreme outliers for each feature.

# Calculate the Interquartile Range (IQR)
Q1 = df_fd001.quantile(0.25)
Q3 = df_fd001.quantile(0.75)
IQR = Q3 - Q1

# Remove outliers (values outside 1.5*IQR from the 25th and 75th percentiles)
df_fd001 = df_fd001[~((df_fd001 < (Q1 - 1.5 * IQR)) | (df_fd001 > (Q3 + 1.5 * IQR))).any(axis=1)]

# 3. Feature scaling
# If you're using models sensitive to scale, such as linear regression or k-NN, scaling is important.
# StandardScaler works well for most models. We will apply it here.

# Dropping non-numeric columns that shouldn't be scaled (like unit_number and time_cycles)
X = df_fd001.drop(columns=["unit_number", "time_cycles", "RUL"])

# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Target variable
y = df_fd001["RUL"]

# 4. Split into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Now you can proceed with training your model, e.g., Gradient Boosting or other models

# Example: Gradient Boosting Regressor
from sklearn.ensemble import GradientBoostingRegressor

# Initialize and train Gradient Boosting Regressor
gbr = GradientBoostingRegressor(
    n_estimators=200,
    learning_rate=0.1,
    max_depth=5,
    random_state=42
)
gbr.fit(X_train, y_train)

# Make predictions
y_pred = gbr.predict(X_val)

# Evaluate
rmse = np.sqrt(mean_squared_error(y_val, y_pred))
mae = mean_absolute_error(y_val, y_pred)
r2 = r2_score(y_val, y_pred)

print("RMSE:", rmse)
print("MAE:", mae)
print("R²:", r2)


Missing Values:
 unit_number    0
time_cycles    0
setting_1      0
setting_2      0
setting_3      0
s_1            0
s_2            0
s_3            0
s_4            0
s_5            0
s_6            0
s_7            0
s_8            0
s_9            0
s_10           0
s_11           0
s_12           0
s_13           0
s_14           0
s_15           0
s_16           0
s_17           0
s_18           0
s_19           0
s_20           0
s_21           0
RUL            0
dtype: int64
RMSE: 41.772032455442535
MAE: 31.736391922615862
R²: 0.5311458990633546
