In [2]:
import joblib
import pandas as pd
import numpy as np

data = joblib.load("../data/train_test.pkl")

X_train = data["X_train"]
X_test = data["X_test"]
X_train_scaled = data["X_train_scaled"]
X_test_scaled = data["X_test_scaled"]
y_train = data["y_train"]
y_test = data["y_test"]
class_weights = data["class_weights"]

print(X_train.shape, X_test.shape)

(8000, 8) (2000, 8)


In [1]:
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

def evaluate(model, X_tr, X_te, y_tr, y_te, name="Model"):
    model.fit(X_tr, y_tr)
    pred = model.predict(X_te)
    prob = model.predict_proba(X_te)[:,1]

    print("\n======", name, "======")
    print("ROC-AUC:", roc_auc_score(y_te, prob))
    print("\nConfusion Matrix:\n", confusion_matrix(y_te, pred))
    print("\nReport:\n", classification_report(y_te, pred))

In [3]:
import joblib
import pandas as pd
import numpy as np

data = joblib.load("../data/train_test.pkl")

X_train = data["X_train"]
X_test = data["X_test"]
X_train_scaled = data["X_train_scaled"]
X_test_scaled = data["X_test_scaled"]
y_train = data["y_train"]
y_test = data["y_test"]
class_weights = data["class_weights"]

print("Loaded successfully")
print(class_weights)

Loaded successfully
{np.int64(0): np.float64(0.51753137533963), np.int64(1): np.float64(14.760147601476016)}


In [4]:
from sklearn.linear_model import LogisticRegression

log_model = LogisticRegression(class_weight=class_weights, max_iter=1000)

evaluate(
    log_model,
    X_train_scaled,
    X_test_scaled,
    y_train,
    y_test,
    "Logistic Regression"
)


ROC-AUC: 0.9069312507611741

Confusion Matrix:
 [[1596  336]
 [  12   56]]

Report:
               precision    recall  f1-score   support

           0       0.99      0.83      0.90      1932
           1       0.14      0.82      0.24        68

    accuracy                           0.83      2000
   macro avg       0.57      0.82      0.57      2000
weighted avg       0.96      0.83      0.88      2000



In [5]:
from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier(
    n_estimators=300,
    max_depth=None,
    class_weight=class_weights,
    random_state=42,
    n_jobs=-1
)

evaluate(
    rf_model,
    X_train,
    X_test,
    y_train,
    y_test,
    "Random Forest"
)


ROC-AUC: 0.9687005236877361

Confusion Matrix:
 [[1929    3]
 [  28   40]]

Report:
               precision    recall  f1-score   support

           0       0.99      1.00      0.99      1932
           1       0.93      0.59      0.72        68

    accuracy                           0.98      2000
   macro avg       0.96      0.79      0.86      2000
weighted avg       0.98      0.98      0.98      2000



In [7]:
def clean_columns(df):
    df = df.copy()
    df.columns = (
        df.columns
        .str.replace('[', '', regex=False)
        .str.replace(']', '', regex=False)
        .str.replace(' ', '_')
        .str.replace('/', '')
    )
    return df

X_train_xgb = clean_columns(X_train)
X_test_xgb = clean_columns(X_test)

print(X_train_xgb.columns)

Index(['Air_temperature_K', 'Process_temperature_K', 'Rotational_speed_rpm',
       'Torque_Nm', 'Tool_wear_min', 'temp_diff', 'Type_L', 'Type_M'],
      dtype='str')


In [8]:
from xgboost import XGBClassifier

xgb_model = XGBClassifier(
    n_estimators=400,
    max_depth=5,
    learning_rate=0.05,
    scale_pos_weight=class_weights[1],
    eval_metric='logloss',
    random_state=42
)

evaluate(
    xgb_model,
    X_train_xgb,
    X_test_xgb,
    y_train,
    y_test,
    "XGBoost"
)


ROC-AUC: 0.9794483010595542

Confusion Matrix:
 [[1914   18]
 [  14   54]]

Report:
               precision    recall  f1-score   support

           0       0.99      0.99      0.99      1932
           1       0.75      0.79      0.77        68

    accuracy                           0.98      2000
   macro avg       0.87      0.89      0.88      2000
weighted avg       0.98      0.98      0.98      2000



In [9]:
import numpy as np

probs = xgb_model.predict_proba(X_test_xgb)[:,1]

# default threshold
pred_default = (probs >= 0.5).astype(int)

from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_test, pred_default))

[[1914   18]
 [  14   54]]


In [10]:
threshold = 0.30
pred_safe = (probs >= threshold).astype(int)

print("Threshold:", threshold)
print(confusion_matrix(y_test, pred_safe))

Threshold: 0.3
[[1891   41]
 [  13   55]]


In [11]:
threshold = 0.70
pred_strict = (probs >= threshold).astype(int)

print("Threshold:", threshold)
print(confusion_matrix(y_test, pred_strict))

Threshold: 0.7
[[1920   12]
 [  16   52]]


In [12]:
import joblib

joblib.dump({
    "model": xgb_model,
    "threshold": 0.3,
    "columns": X_train_xgb.columns.tolist()
}, "../models/predictive_maintenance_model.pkl")

['../models/predictive_maintenance_model.pkl']