In [18]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import (
    accuracy_score, roc_auc_score, classification_report, confusion_matrix,
    precision_recall_curve, roc_curve
)
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt
import seaborn as sns
import os
import shap
import pennylane as qml
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay


# Create directories for saving outputs
output_dir = "your_path"
os.makedirs(output_dir, exist_ok=True)
results_file = os.path.join(output_dir, "results.xlsx")

# Step 1: Load the AI4I 2020 dataset
file_path = 'your_path'
df = pd.read_csv(file_path)

print("Dataset loaded successfully!")

# Step 2: Preprocess the dataset
# Drop the identifier/categorical columns, then rewrite every run of
# characters outside [A-Za-z0-9_] in the column names as a single underscore.
df = df.drop(columns=['UDI', 'Product ID', 'Type'], errors='ignore')
df.columns = df.columns.str.replace('[^A-Za-z0-9_]+', '_', regex=True)
df.columns = df.columns.str.strip()

# Replace missing values with the column mean.
# NOTE(review): the imputer is fit on every remaining column (target included)
# and on all rows, i.e. before the train/test split below. That is harmless
# when the file has no missing values -- confirm for your data.
imputer = SimpleImputer(strategy='mean')
df_imputed = pd.DataFrame(imputer.fit_transform(df), columns=df.columns)

# Presumably the per-failure-mode flags of the dataset; they are removed so
# they cannot be used as predictors of the overall failure label.
high_corr_features = ['HDF', 'OSF', 'PWF', 'TWF']
df_cleaned = df_imputed.drop(columns=[col for col in high_corr_features if col in df_imputed.columns])

target = 'Machine_failure'
features = [col for col in df_cleaned.columns if col != target]

X = df_cleaned[features]
y = df_cleaned[target]

# Split BEFORE oversampling. Running SMOTE on the full dataset would place
# synthetic points interpolated from test rows into the training set (and
# synthetic rows into the test set), inflating every reported metric.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Oversample the minority class only inside the training partition, and only
# when the positive/negative ratio is below 10%. Counting with boolean masks
# avoids a KeyError when one of the classes is absent.
n_positive = int((y_train == 1).sum())
n_negative = int((y_train == 0).sum())
if n_negative > 0 and 0 < n_positive < 0.1 * n_negative:
    smote = SMOTE(random_state=42)
    X_train, y_train = smote.fit_resample(X_train, y_train)

# Hyperparameter optimization (classical Nelder-Mead search via SciPy; PennyLane is imported but not used in this section)
def quantum_optimizer(params, model_type):
    """Objective function for the hyperparameter search.

    Builds the classifier selected by ``model_type`` from the continuous
    parameter vector ``params`` and returns the negated mean 3-fold
    cross-validated accuracy on the training data, so that a minimizer
    maximizes accuracy. Despite the name, no quantum computation happens
    here.

    The score is computed on ``X_train``/``y_train`` only. Scoring candidates
    on the test split would leak test data into model selection and bias the
    metrics reported afterwards.

    Args:
        params: Sequence of floats. For ``'rf'``: n_estimators, max_depth,
            min_samples_split. For ``'xgb'``: n_estimators, max_depth,
            learning_rate. For ``'log'``: only ``params[0]`` (C) is read.
            Integer-valued settings are truncated with ``int()`` and clamped
            to their smallest valid value.
        model_type: One of ``'rf'``, ``'xgb'`` or ``'log'``.

    Returns:
        Negative mean cross-validated accuracy (lower is better).

    Raises:
        ValueError: If ``model_type`` is not one of the supported names.
    """
    # Local import keeps this block self-contained; scikit-learn is already a
    # dependency of the file.
    from sklearn.model_selection import cross_val_score

    if model_type == 'rf':
        max_depth = max(1, int(params[1]))
        model = RandomForestClassifier(n_estimators=max(1, int(params[0])), max_depth=max_depth, min_samples_split=max(2, int(params[2])), random_state=42)
    elif model_type == 'xgb':
        max_depth = max(1, int(params[1]))
        model = XGBClassifier(n_estimators=max(1, int(params[0])), max_depth=max_depth, learning_rate=max(0.01, params[2]), eval_metric='logloss')
    elif model_type == 'log':
        model = LogisticRegressionCV(cv=3, penalty='l2', solver='liblinear', Cs=[max(0.01, params[0])])
    else:
        # Previously an unknown name fell through and failed later with an
        # UnboundLocalError on `model`.
        raise ValueError(f"Unknown model_type: {model_type!r}")

    scores = cross_val_score(model, X_train, y_train, cv=3, scoring='accuracy')
    return -float(scores.mean())

def optimize_hyperparams(model_type, max_evals=None):
    """Search hyperparameters for ``model_type`` with Nelder-Mead.

    The objective (``quantum_optimizer``) truncates most parameters with
    ``int()``, so it is piecewise constant. Starting from a random point in
    [0, 1) with SciPy's default 5% simplex therefore never leaves the region
    where every integer setting is clamped to its minimum. This function
    starts instead from a sensible, deterministic point per model and uses an
    initial simplex whose edges are large enough to cross integer boundaries.

    Args:
        model_type: One of ``'rf'``, ``'xgb'`` or ``'log'``.
        max_evals: Optional cap on objective evaluations (each one trains a
            model). ``None`` keeps SciPy's default budget.

    Returns:
        ``numpy.ndarray`` of length 3 holding the best parameter vector found,
        in the order expected by ``quantum_optimizer``.

    Raises:
        ValueError: If ``model_type`` is not one of the supported names.
    """
    from scipy.optimize import minimize

    # model_type -> (starting point, per-dimension step of the first simplex)
    search_setup = {
        'rf': ([100.0, 10.0, 2.0], [50.0, 5.0, 2.0]),
        'xgb': ([100.0, 6.0, 0.1], [50.0, 3.0, 0.1]),
        # Only the first entry (C) is read for logistic regression; the other
        # two keep the returned vector three elements long.
        'log': ([1.0, 1.0, 1.0], [1.0, 1.0, 1.0]),
    }
    if model_type not in search_setup:
        raise ValueError(f"Unknown model_type: {model_type!r}")

    start, steps = search_setup[model_type]
    init_params = np.asarray(start, dtype=float)
    # Shape (N + 1, N): the start point plus one vertex displaced along each axis.
    initial_simplex = np.vstack([init_params, init_params + np.diag(steps)])

    result = minimize(
        lambda p: quantum_optimizer(p, model_type),
        init_params,
        method='Nelder-Mead',
        options={'initial_simplex': initial_simplex, 'maxfev': max_evals},
    )
    return result.x

# --- Random Forest: search, then refit on the training split ---
print("Optimizing Random Forest...")
best_rf_params = optimize_hyperparams('rf')
rf_settings = {
    'n_estimators': max(1, int(float(best_rf_params[0]))),
    'max_depth': max(1, int(float(best_rf_params[1]))),
    'min_samples_split': max(2, int(float(best_rf_params[2]))),
    'random_state': 42,
}
best_rf_model = RandomForestClassifier(**rf_settings)
best_rf_model.fit(X_train, y_train)

# --- XGBoost: search, then refit on the training split ---
print("Optimizing XGBoost...")
best_xgb_params = optimize_hyperparams('xgb')
xgb_settings = {
    'n_estimators': max(1, int(float(best_xgb_params[0]))),
    'max_depth': max(1, int(float(best_xgb_params[1]))),
    'learning_rate': max(0.01, float(best_xgb_params[2])),
    'eval_metric': 'logloss',
}
best_xgb_model = XGBClassifier(**xgb_settings)
best_xgb_model.fit(X_train, y_train)

# --- Logistic Regression: only the first optimized value (C) is used ---
print("Optimizing Logistic Regression...")
best_log_params = optimize_hyperparams('log')
log_regularization = max(0.01, float(best_log_params[0]))
log_model = LogisticRegressionCV(
    cv=3,
    penalty='l2',
    solver='liblinear',
    Cs=[log_regularization],
)
log_model.fit(X_train, y_train)

def evaluate_model(model, name):
    """Score a fitted classifier on the held-out test split.

    Args:
        model: Fitted estimator exposing ``predict`` and ``predict_proba``.
        name: Display name of the model (currently unused).

    Returns:
        Tuple ``(accuracy, roc_auc, report)`` where ``report`` is the
        dictionary form of scikit-learn's classification report.
    """
    predicted_labels = model.predict(X_test)
    # Column 1 of predict_proba holds the score for the second class.
    positive_scores = model.predict_proba(X_test)[:, 1]
    return (
        accuracy_score(y_test, predicted_labels),
        roc_auc_score(y_test, positive_scores),
        classification_report(y_test, predicted_labels, output_dict=True),
    )

# Evaluate the three fitted models on the test split
rf_accuracy, rf_roc, rf_report = evaluate_model(best_rf_model, "Random Forest")
xgb_accuracy, xgb_roc, xgb_report = evaluate_model(best_xgb_model, "XGBoost")
log_accuracy, log_roc, log_report = evaluate_model(log_model, "Logistic Regression")

# Side-by-side headline metrics, one column per model
comparison_table = pd.DataFrame({
    'Metric': ['Accuracy', 'ROC-AUC'],
    'Random Forest': [rf_accuracy, rf_roc],
    'XGBoost': [xgb_accuracy, xgb_roc],
    'Logistic Regression': [log_accuracy, log_roc],
})

# (sheet name, classification report) pairs, written in this order
report_sheets = (
    ("RandomForest_Report", rf_report),
    ("XGBoost_Report", xgb_report),
    ("LogisticRegression_Report", log_report),
)

# Save results to Excel
with pd.ExcelWriter(results_file, engine='openpyxl') as writer:
    comparison_table.to_excel(writer, sheet_name="Model_Comparison", index=False)
    for sheet_title, report_dict in report_sheets:
        pd.DataFrame(report_dict).T.to_excel(writer, sheet_name=sheet_title)

# SHAP analysis
explainer = shap.TreeExplainer(best_rf_model)  # Use TreeExplainer for tree-based models
shap_values = explainer.shap_values(X_test)

# Ensure X_test is a DataFrame with correct feature names
X_test_df = pd.DataFrame(X_test, columns=features)

# Reduce the SHAP output to the contributions for the second class (index 1).
# Depending on the SHAP version, a classifier yields either a list with one
# (n_samples, n_features) array per class, or a single array with the classes
# on the last axis. Handle both so summary_plot always receives a 2-D matrix.
if isinstance(shap_values, list):
    shap_values = shap_values[1]
elif np.ndim(shap_values) == 3:
    shap_values = shap_values[:, :, 1]

# Generate SHAP summary plot (show=False keeps the figure open for saving)
shap.summary_plot(shap_values, X_test_df, show=False)

# Save the figure
plt.savefig(os.path.join(output_dir, "shap_summary_plot.png"))
plt.close()



print("Results and plots saved successfully!")

Dataset loaded successfully!
Optimizing Random Forest...
Optimizing XGBoost...
Optimizing Logistic Regression...
All additional plots generated and saved successfully!
Results and plots saved successfully!
