# In [None]:
# Enhanced Predictive Maintenance Pipeline

import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegressionCV
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import (
    accuracy_score, roc_auc_score, classification_report, confusion_matrix,
    precision_recall_curve, auc, matthews_corrcoef, roc_curve
)
from sklearn.calibration import calibration_curve
from imblearn.combine import SMOTETomek

import shap
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping

# ========== Setup ==========
# Every figure saved further down is written below this directory.
output_dir = "result"
os.makedirs(output_dir, exist_ok=True)

# ========== Load and Preprocess Dataset ==========
file_path = 'ai4i2020.csv'  # <--- UPDATE this path to your dataset location
df = pd.read_csv(file_path)

# Drop 'UDI', 'Product ID' and 'Type' when they exist; missing columns are
# ignored. (Presumably identifiers / a categorical label that the mean
# imputer below could not process -- confirm against the dataset.)
df = df.drop(columns=['UDI', 'Product ID', 'Type'], errors='ignore')

# Replace each run of characters outside [A-Za-z0-9_] in the column names
# with a single underscore, e.g. 'Machine failure' -> 'Machine_failure'.
# NOTE(review): after that substitution no whitespace is left, so the
# trailing strip() has nothing to remove.
df.columns = (
    df.columns
    .str.replace('[^A-Za-z0-9_]+', '_', regex=True)
    .str.strip()
)

# Fill missing values with the column mean. The imputer is fitted on every
# remaining column, which includes the target column selected below.
imputer = SimpleImputer(strategy='mean')
df_imputed = pd.DataFrame(imputer.fit_transform(df), columns=df.columns)

# Remove the columns listed as highly correlated (per the variable name),
# skipping any that are not present in the frame.
high_corr_features = ['HDF', 'OSF', 'PWF', 'TWF']
df_cleaned = df_imputed.drop(columns=high_corr_features, errors='ignore')

# Everything except the target column is used as a model input.
target = 'Machine_failure'
features = df_cleaned.columns[df_cleaned.columns != target].tolist()

X = df_cleaned[features]
y = df_cleaned[target]

# ========== Train/Test Split ==========
# The split happens BEFORE resampling so that the held-out set contains only
# real observations at the original class ratio. Resampling the full dataset
# first would place SMOTE-synthesised points (interpolated from rows that end
# up in training) into the test set and inflate every reported metric.
X_train_raw, X_test, y_train_raw, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42
)

# ========== Hybrid Resampling (SMOTE + Tomek) ==========
# Only the training portion is rebalanced; X_test / y_test stay untouched.
print("Applying SMOTE + Tomek Links...")
smote_tomek = SMOTETomek(random_state=42)
X_train, y_train = smote_tomek.fit_resample(X_train_raw, y_train_raw)
print("New class distribution:\n", pd.Series(y_train).value_counts())

# ========== Cost-Sensitive Models ==========
# Each grid holds a single candidate, so GridSearchCV effectively performs a
# 3-fold cross-validated fit followed by a refit on the full training set.
param_grid_rf = {'n_estimators': [100], 'max_depth': [10]}
grid_search_rf = GridSearchCV(
    RandomForestClassifier(class_weight='balanced', random_state=42),
    param_grid_rf,
    cv=3,
)
grid_search_rf.fit(X_train, y_train)
best_rf = grid_search_rf.best_estimator_

# Derive the positive-class weight from the labels the booster is actually
# fitted on (the resampled training set), not from the full dataset. After
# rebalancing this ratio is close to 1, so the classes are not re-weighted a
# second time on top of the resampling.
scale_pos_weight = (y_train == 0).sum() / (y_train == 1).sum()
param_grid_xgb = {'n_estimators': [100], 'max_depth': [5], 'learning_rate': [0.1]}
grid_search_xgb = GridSearchCV(
    XGBClassifier(
        scale_pos_weight=scale_pos_weight,
        use_label_encoder=False,
        eval_metric='logloss',
    ),
    param_grid_xgb,
    cv=3,
)
grid_search_xgb.fit(X_train, y_train)
best_xgb = grid_search_xgb.best_estimator_

log_model = LogisticRegressionCV(class_weight='balanced', cv=3, penalty='l2', solver='liblinear')
log_model.fit(X_train, y_train)

# ========== Deep Learning Model (MLP) ==========
# Fit the scaler on the training data only and reuse its statistics for the
# test set, so no information about held-out rows reaches the network.
scaler = StandardScaler()
X_train_dl = scaler.fit_transform(X_train)
X_test_dl = scaler.transform(X_test)
y_train_dl, y_test_dl = y_train, y_test
# Retained for any later code that still refers to X_scaled: the full,
# un-resampled feature matrix transformed with the training-set statistics.
X_scaled = scaler.transform(X)

# Keras' validation_split takes the LAST rows of the training arrays, and the
# resampler output is not shuffled (synthetic rows are typically appended
# after the originals). An explicit stratified split keeps both classes in
# the early-stopping validation set.
X_fit_dl, X_val_dl, y_fit_dl, y_val_dl = train_test_split(
    X_train_dl,
    np.asarray(y_train_dl),
    stratify=np.asarray(y_train_dl),
    test_size=0.2,
    random_state=42,
)

mlp = Sequential([
    Dense(64, activation='relu', input_shape=(X_train_dl.shape[1],)),
    Dropout(0.3),
    Dense(32, activation='relu'),
    Dropout(0.2),
    Dense(1, activation='sigmoid')
])
mlp.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# Stop once val_loss has not improved for 5 epochs and roll back to the best
# weights seen so far.
early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
mlp.fit(
    X_fit_dl,
    y_fit_dl,
    validation_data=(X_val_dl, y_val_dl),
    epochs=100,
    batch_size=64,
    callbacks=[early_stop],
)
# Sigmoid outputs flattened to a 1-D vector, then thresholded at 0.5.
y_prob_dl = mlp.predict(X_test_dl).ravel()
y_pred_dl = (y_prob_dl > 0.5).astype(int)

# ========== Evaluation Function ==========
from sklearn.metrics import precision_score, recall_score, f1_score

def evaluate_advanced(y_true, y_prob, model_name):
    """Print and return evaluation metrics for one binary classifier.

    Args:
        y_true: Array-like of ground-truth binary labels.
        y_prob: Array-like of predicted scores for the positive class. Any
            array-like is accepted (list, Series, column vector); it is
            flattened to one dimension and thresholded at 0.5 to obtain
            hard predictions.
        model_name: Label used in the printed report header.

    Returns:
        dict: The computed values under the keys ``'accuracy'``,
        ``'roc_auc'``, ``'pr_auc'`` and ``'mcc'``.
    """
    # Coerce up front: comparing a plain list with 0.5 raises TypeError, and
    # an (n, 1) column vector would not line up with the 1-D labels.
    y_prob = np.asarray(y_prob, dtype=float).ravel()
    y_pred = (y_prob > 0.5).astype(int)

    # Area under the precision-recall curve, integrated over recall.
    precision, recall, _ = precision_recall_curve(y_true, y_prob)
    pr_auc = auc(recall, precision)

    mcc = matthews_corrcoef(y_true, y_pred)
    acc = accuracy_score(y_true, y_pred)
    roc = roc_auc_score(y_true, y_prob)

    print(f"\n{model_name} Results:")
    print(f"Accuracy: {acc:.4f}, ROC-AUC: {roc:.4f}")
    print(f"PR-AUC: {pr_auc:.4f}, MCC: {mcc:.4f}")
    print(classification_report(y_true, y_pred))

    # Returned so callers can tabulate or compare models; callers that
    # ignore the return value behave exactly as before.
    return {'accuracy': acc, 'roc_auc': roc, 'pr_auc': pr_auc, 'mcc': mcc}

# ========== Evaluate Models ==========
# Maps a display name to (fitted model, feature matrix to score it on).
# The MLP entry uses the scaled test matrix; the others use X_test.
models = {
    'Random Forest': (best_rf, X_test),
    'XGBoost': (best_xgb, X_test),
    'Logistic Regression': (log_model, X_test),
    'MLP': (mlp, X_test_dl),
}

# Positive-class scores per model, reused by the calibration plots.
probs = {}
for name, (model, X_eval) in models.items():
    # The MLP is scored through predict() with the output flattened; the
    # other models through predict_proba(), keeping column 1.
    y_prob = (
        model.predict(X_eval).ravel()
        if name == 'MLP'
        else model.predict_proba(X_eval)[:, 1]
    )
    evaluate_advanced(y_test, y_prob, name)
    probs[name] = y_prob

# ========== Calibration Curve ==========
def plot_calibration_curve(y_true, y_prob, model_name):
    """Plot, save and display the calibration curve of one model.

    The curve is computed with 10 bins and drawn against the diagonal
    reference line. The figure is written to
    ``<output_dir>/calibration_<model_name>.png`` (``output_dir`` is the
    module-level setting), shown, and then closed.

    Args:
        y_true: Array-like of ground-truth binary labels.
        y_prob: Array-like of predicted positive-class probabilities.
        model_name: Label used in the legend, the title and the file name.
    """
    prob_true, prob_pred = calibration_curve(y_true, y_prob, n_bins=10)

    fig, ax = plt.subplots()
    ax.plot(prob_pred, prob_true, marker='o', label=f'{model_name}')
    # Diagonal reference: predicted probability equals observed frequency.
    ax.plot([0, 1], [0, 1], linestyle='--', color='gray')
    ax.set_xlabel('Predicted Probability')
    ax.set_ylabel('Observed Frequency')
    ax.set_title(f'Calibration Curve - {model_name}')
    ax.legend()
    ax.grid(True)

    # Save before showing so the file is written from the live figure.
    fig.savefig(f"{output_dir}/calibration_{model_name}.png", dpi=300)
    plt.show()
    # Close explicitly; otherwise one open figure per call accumulates.
    plt.close(fig)

# One calibration plot per model scored above.
for name, y_prob in probs.items():
    plot_calibration_curve(y_test, y_prob, name)

# ========== SHAP Explainability (Random Forest) ==========
explainer = shap.TreeExplainer(best_rf)
shap_values = explainer.shap_values(X_train)

# Select the attributions for the class at index 1. Depending on the SHAP
# version, a classifier yields either a list with one (samples, features)
# array per class or a single array with the class axis last; indexing the
# latter with [1] would pick a sample instead of a class.
if isinstance(shap_values, list):
    shap_values_class1 = shap_values[1]
else:
    shap_values_class1 = np.asarray(shap_values)
    if shap_values_class1.ndim == 3:
        shap_values_class1 = shap_values_class1[:, :, 1]

# show=False leaves the figure open so that it can be laid out and saved;
# with the default show=True the plot is displayed first and the file
# written afterwards can be empty.
shap.summary_plot(shap_values_class1, X_train, show=False)
plt.tight_layout()
plt.savefig(f"{output_dir}/shap_summary_rf.png", dpi=300)
# Still display the plot, as the original call did, now that it is on disk.
plt.show()
plt.close()
