In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import root_mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import joblib

# Load the data
train = pd.read_csv('train_FD004.csv', low_memory=False)

# Define a function to calculate Fisher score using first 50 and last 50 samples of each engine
def fisher_score_sensor(df, sensor, start_cycles=50, end_cycles=50):
    begin_life = df[df['time, in cycles'] <= start_cycles][sensor]
    end_life = df[df['time, in cycles'] >= (df['time, in cycles'].max() - end_cycles + 1)][sensor]
    mean_diff = abs(begin_life.mean() - end_life.mean())
    within_var = begin_life.var() + end_life.var()
    return mean_diff / within_var

# Apply Fisher score calculation across each sensor
sensor_columns = [col for col in train.columns if col.startswith('sensor')]
fisher_scores = {sensor: fisher_score_sensor(train, sensor) for sensor in sensor_columns}

# Select the top sensors based on Fisher scores
top_sensors = sorted(fisher_scores, key=fisher_scores.get, reverse=True)[:6]
print("Top sensors selected based on Fisher score:", top_sensors)

# Calculate RUL for each engine
train['RUL'] = train.groupby('unit number')['time, in cycles'].transform(lambda x: x.max() - x)

# Apply EMA for each top sensor
ema_span = 50
for sensor in top_sensors:
    train[f'{sensor}_EMA'] = train.groupby('unit number')[sensor].transform(lambda x: x.ewm(span=ema_span, adjust=False).mean())

# Prepare the data using EMA features
ema_features = [f'{sensor}_EMA' for sensor in top_sensors]
X = train[ema_features]
y = train['RUL']

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Save the scaler
joblib.dump(scaler, 'scaler_model.pkl')

# Initialize and train the Linear Regression model
linear_model = LinearRegression()
linear_model.fit(X_train, y_train)

# Initialize and train the Decision Tree Regressor model
dt_model = DecisionTreeRegressor(random_state=42)
dt_model.fit(X_train, y_train)

# Initialize and train the Random Forest Regressor model
rf_model = RandomForestRegressor(random_state=42, n_estimators=100)
rf_model.fit(X_train, y_train)

# Make predictions for each model
y_pred_linear = linear_model.predict(X_test)
y_pred_dt = dt_model.predict(X_test)
y_pred_rf = rf_model.predict(X_test)

# Calculate RMSE for each model
rmse_linear = (root_mean_squared_error(y_test, y_pred_linear))
rmse_dt = (root_mean_squared_error(y_test, y_pred_dt))
rmse_rf = (root_mean_squared_error(y_test, y_pred_rf))

print(f"RMSE on Validation Set (Linear Regression): {rmse_linear}")
print(f"RMSE on Validation Set (Decision Tree Regressor): {rmse_dt}")
print(f"RMSE on Validation Set (Random Forest Regressor): {rmse_rf}")

# Save the models
joblib.dump(linear_model, 'linear_regression_model_with_ema.h5')
joblib.dump(dt_model, 'decision_tree_model_with_ema.h5')
joblib.dump(rf_model, 'random_forest_model_with_ema.h5')

# Load the test data and actual RUL values
test = pd.read_csv('test_FD004.csv', low_memory=False)
RUL = pd.read_csv('RUL_FD004.csv', low_memory=False)

# Load the pre-trained scaler
scaler = joblib.load('scaler_model.pkl')

# Apply EMA calculation to each sensor in the test dataset for each engine
for sensor in top_sensors:
    test[f'{sensor}_EMA'] = test.groupby('unit number')[sensor].transform(lambda x: x.ewm(span=ema_span, adjust=False).mean())

# Extract the unique unit numbers from the test dataset
unit_numbers = test['unit number'].unique()

# Insert the unique unit numbers into the RUL DataFrame
RUL['unit number'] = unit_numbers
RUL = RUL[['unit number', 'RUL']]

# Function to evaluate model performance on test data
def evaluate_model_on_test(model, model_name):
    engine_rul_predictions = []
    
    for engine_id, group in test.groupby('unit number'):
        # Get the last calculated EMA values for each sensor of this engine
        X_test_engine = pd.DataFrame([group[ema_features].iloc[-1]], columns=ema_features)

        # Scale the EMA features
        X_test_engine_scaled = scaler.transform(X_test_engine)

        # Predict RUL for the engine
        predicted_rul = model.predict(X_test_engine_scaled)[0]

        # Get the actual RUL for this engine from the RUL dataset
        actual_rul = RUL.loc[RUL['unit number'] == engine_id, 'RUL'].values[0]

        # Append results
        engine_rul_predictions.append((engine_id, actual_rul, predicted_rul))

    # Convert predictions to a DataFrame for easier analysis
    engine_rul_predictions_df = pd.DataFrame(engine_rul_predictions, columns=['Engine ID', 'Actual RUL', 'Predicted RUL'])

    # Calculate RMSE for the predictions
    rmse_test = (root_mean_squared_error(engine_rul_predictions_df['Actual RUL'], engine_rul_predictions_df['Predicted RUL']))
    print(f"RMSE on Test Set ({model_name}): {rmse_test}")
    
    # Display the predictions DataFrame
    print(engine_rul_predictions_df.head())
    # print(engine_rul_predictions_df)

# Evaluate each model on the test set
evaluate_model_on_test(linear_model, 'Linear Regression')
evaluate_model_on_test(dt_model, 'Decision Tree Regressor')
evaluate_model_on_test(rf_model, 'Random Forest Regressor')

Top sensors selected based on Fisher score: ['sensor measurement 16', 'sensor measurement 10', 'sensor measurement 15', 'sensor measurement 11', 'sensor measurement 19', 'sensor measurement 14']
RMSE on Validation Set (Linear Regression): 67.06375075138918
RMSE on Validation Set (Decision Tree Regressor): 75.3512569129895
RMSE on Validation Set (Random Forest Regressor): 54.46309157053434
RMSE on Test Set (Linear Regression): 52.63370936833324
   Engine ID  Actual RUL  Predicted RUL
0          1          22     109.164963
1          2          39     123.137876
2          3         107     161.603232
3          4          75     162.577208
4          5         149      99.442572
RMSE on Test Set (Decision Tree Regressor): 69.35084947266792
   Engine ID  Actual RUL  Predicted RUL
0          1          22           92.0
1          2          39           82.0
2          3         107          109.0
3          4          75           97.0
4          5         149          131.0
RMSE on Te