In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score, roc_auc_score, classification_report, confusion_matrix,
    precision_recall_curve, roc_curve
)
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt
import seaborn as sns
import os

# Make sure the folder that will receive generated outputs is present
output_dir = "figures"
os.makedirs(output_dir, exist_ok=True)

# Step 1: Read the AI4I 2020 dataset into a DataFrame
file_path = 'ai4i2020.csv'  # Update with your dataset path
df = pd.read_csv(file_path)

# Confirm the load and preview the first rows; a newline separator keeps
# each item on its own line
print(
    "Dataset loaded successfully!",
    "First 5 rows of the dataset:",
    df.head(),
    sep="\n",
)

# Step 2: Preprocess the dataset
# Remove columns that are not wanted downstream (absent ones are skipped)
df = df.drop(['UDI', 'Product ID', 'Type'], axis=1, errors='ignore')

# Collapse every run of characters other than letters, digits and
# underscores into a single underscore, then trim surrounding whitespace
df.columns = (
    df.columns
    .str.replace('[^A-Za-z0-9_]+', '_', regex=True)
    .str.strip()
)

# Report the per-column count of missing entries
print("\nMissing values in the dataset:")
print(df.isna().sum())

# Fill any gaps with the column mean; the imputer returns a plain array,
# so the column labels are re-attached when rebuilding the DataFrame
imputer = SimpleImputer(strategy='mean')
df_imputed = pd.DataFrame(imputer.fit_transform(df), columns=df.columns)

# Step 3: Check correlations and remove high-correlation features
target = 'Machine_failure'

# Correlation of every column with the target, strongest positive first
print("\nFeature-Target Correlation:")
correlation = df_imputed.corr()[target].sort_values(ascending=False)
print(correlation)

# Discard the listed columns; any that are not present are skipped
high_corr_features = ['HDF', 'OSF', 'PWF', 'TWF']
df_cleaned = df_imputed.drop(columns=high_corr_features, errors='ignore')

# Show which columns survived the clean-up
print("\nCleaned Column Names:")
print(df_cleaned.columns)

# Step 4: Dynamically define features and target
# Every remaining column other than the target is used as a feature
features = df_cleaned.columns[df_cleaned.columns != target].tolist()
print(f"\nFeatures selected: {features}")

# Feature matrix and label vector
X = df_cleaned[features]
y = df_cleaned[target]

# Step 5: Split the data
# The split happens BEFORE any resampling so the held-out set contains only
# real observations. Oversampling first would let synthetic points that were
# interpolated from test rows leak into training and inflate the metrics.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Step 6: Handle class imbalance using SMOTE (training split only)
# Count classes with boolean masks so an absent class gives 0, not a KeyError.
minority_count = int((y_train == 1).sum())
majority_count = int((y_train == 0).sum())

# Resample only when both classes are present and the positive class is
# less than 10% the size of the negative class. X and y keep the original,
# un-resampled data; only the training split is rebalanced.
if minority_count > 0 and majority_count > 0 and minority_count / majority_count < 0.1:
    print("Class imbalance detected! Applying SMOTE...")
    smote = SMOTE(random_state=42)
    X_train, y_train = smote.fit_resample(X_train, y_train)
    print("After SMOTE Resampling:\n", y_train.value_counts())

# Step 7: Train Random Forest Classifier
print("\nTraining Random Forest Classifier...")
# Candidate hyper-parameters; every combination is tried by the grid search
param_grid_rf = dict(
    n_estimators=[50, 100],
    max_depth=[5, 10],
    min_samples_split=[5, 10],
)
# 3-fold cross-validated search over the grid above
grid_search_rf = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid_rf,
    cv=3,
)
grid_search_rf.fit(X_train, y_train)
best_rf_model = grid_search_rf.best_estimator_

# Step 8: Train XGBoost Classifier
print("\nTraining XGBoost Classifier...")
param_grid_xgb = dict(
    n_estimators=[50, 100],
    max_depth=[5, 10],
    learning_rate=[0.01, 0.1],
)
# Same 3-fold search, this time around an XGBoost classifier
grid_search_xgb = GridSearchCV(
    estimator=XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
    param_grid=param_grid_xgb,
    cv=3,
)
grid_search_xgb.fit(X_train, y_train)
best_xgb_model = grid_search_xgb.best_estimator_

# Step 9: Train Logistic Regression
# No hyper-parameter search here; only the iteration cap is set explicitly
print("\nTraining Logistic Regression...")
log_model = LogisticRegression(max_iter=1000)
log_model.fit(X_train, y_train)

# Step 10: Evaluate All Models
def evaluate_model(model, name, X_eval=None, y_eval=None):
    """Score a fitted classifier and print a short performance report.

    Args:
        model: Fitted classifier exposing ``predict`` and ``predict_proba``.
        name: Label used in the printed report header.
        X_eval: Feature matrix to evaluate on. Defaults to the module-level
            ``X_test`` when omitted.
        y_eval: True labels matching ``X_eval``. Defaults to the
            module-level ``y_test`` when omitted.

    Returns:
        A tuple ``(accuracy, roc_auc, y_pred, y_prob)`` where ``y_pred`` are
        the predicted labels and ``y_prob`` the predicted probabilities of
        the second class (column index 1 of ``predict_proba``).
    """
    # Fall back to the held-out split so existing two-argument calls
    # behave exactly as before.
    if X_eval is None:
        X_eval = X_test
    if y_eval is None:
        y_eval = y_test

    y_pred = model.predict(X_eval)
    # ROC-AUC needs a score per sample, so use the second-class probability
    y_prob = model.predict_proba(X_eval)[:, 1]

    accuracy = accuracy_score(y_eval, y_pred)
    roc_auc = roc_auc_score(y_eval, y_prob)

    print(f"\n{name} Performance:")
    print(f"Accuracy: {accuracy:.2f}")
    print(f"ROC-AUC: {roc_auc:.2f}")
    print("\nClassification Report:")
    print(classification_report(y_eval, y_pred))
    return accuracy, roc_auc, y_pred, y_prob

# Score each trained model in turn, keeping its accuracy, ROC-AUC,
# predicted labels and predicted probabilities under per-model names
rf_accuracy, rf_roc, rf_pred, rf_prob = evaluate_model(
    model=best_rf_model, name="Random Forest"
)
xgb_accuracy, xgb_roc, xgb_pred, xgb_prob = evaluate_model(
    model=best_xgb_model, name="XGBoost"
)
log_accuracy, log_roc, log_pred, log_prob = evaluate_model(
    model=log_model, name="Logistic Regression"
)






Dataset loaded successfully!
First 5 rows of the dataset:
   UDI Product ID Type  Air temperature [K]  Process temperature [K]  \
0    1     M14860    M                298.1                    308.6   
1    2     L47181    L                298.2                    308.7   
2    3     L47182    L                298.1                    308.5   
3    4     L47183    L                298.2                    308.6   
4    5     L47184    L                298.2                    308.7   

   Rotational speed [rpm]  Torque [Nm]  Tool wear [min]  Machine failure  TWF  \
0                    1551         42.8                0                0    0   
1                    1408         46.3                3                0    0   
2                    1498         49.4                5                0    0   
3                    1433         39.5                7                0    0   
4                    1408         40.0                9                0    0   

   HDF  PWF  OSF  RNF 