# Objective

To illustrate a supervised learning workflow for classification tasks.


# Setup

In [4]:
import sklearn
import joblib

from sklearn.datasets import fetch_openml

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import make_column_transformer

from sklearn.pipeline import make_pipeline

from sklearn.model_selection import train_test_split, RandomizedSearchCV

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

In [5]:
# Render estimators and pipelines as interactive diagrams when a cell displays them.
sklearn.set_config(display='diagram')

# Business Context

Consider the case of predicting machinery failure based on the quality of the machinery and its wear and tear. Such predictions often require expert intervention on the shop floor and create a heavy dependency on that expert. Predicting machine failure in advance, based on the current state of the machinery, can avoid the large costs incurred due to unplanned maintenance breakdowns.

The dataset used in this session is reflective of actual production data and is hosted on Open ML.


# Data

In [6]:
# Download OpenML dataset 42890 with pandas-backed data (needs network access on first run).
dataset = fetch_openml(data_id=42890, as_frame=True, parser="auto")

In [7]:
# Working DataFrame; later cells read both the feature columns and the
# 'Machine failure' column from it.
data_df = dataset.data

In [8]:
# Label column to predict.
target = 'Machine failure'

# Continuous process measurements (standardized by the preprocessing step).
numeric_features = [
    'Air temperature [K]', 'Process temperature [K]', 'Rotational speed [rpm]',
    'Torque [Nm]', 'Tool wear [min]',
]

# Product quality variant (one-hot encoded by the preprocessing step).
categorical_features = ['Type']

## Attribute Information:

The dataset consists of 10 000 data points stored as rows with 14 features in columns

- UID: unique identifier ranging from 1 to 10000
- product ID: consisting of a letter L, M, or H for low (50% of all products), medium (30%) and high (20%) as product quality variants and a variant-specific serial number
- air temperature [K]: generated using a random walk process later normalized to a standard deviation of 2 K around 300 K
- process temperature [K]: generated using a random walk process normalized to a standard deviation of 1 K, added to the air temperature plus 10 K.
- rotational speed [rpm]: calculated from a power of 2860 W, overlaid with a normally distributed noise
- torque [Nm]: torque values are normally distributed around 40 Nm with σ = 10 Nm and no negative values.
- tool wear [min]: The quality variants H/M/L add 5/3/2 minutes of tool wear to the used tool in the process.
- 'machine failure': label that indicates whether the machine has failed at this particular data point because any of the following failure modes is true.

The machine failure consists of five independent failure modes:

- tool wear failure (TWF): the tool will be replaced or fail at a randomly selected tool wear time between 200-240 mins (120 times in our dataset). At this point in time, the tool is replaced 69 times, and fails 51 times (randomly assigned).

- heat dissipation failure (HDF): heat dissipation causes a process failure if the difference between air and process temperature is below 8.6 K and the tool's rotational speed is below 1380 rpm. This is the case for 115 data points.
- power failure (PWF): the product of torque and rotational speed (in rad/s) equals the power required for the process. If this power is below 3500 W or above 9000 W, the process fails, which is the case 95 times in our dataset.
- overstrain failure (OSF): if the product of tool wear and torque exceeds 11,000 minNm for the L product variant (12,000 M, 13,000 H), the process fails due to overstrain. This is true for 98 datapoints.
- random failures (RNF): each process has a chance of 0.1 % to fail regardless of its process parameters. This is the case for only 5 data points, fewer than could be expected for 10,000 data points in our dataset.

If at least one of the above failure modes is true, the process fails and the 'machine failure' label is set to 1.

# EDA

In [9]:
# Summary statistics for the numeric feature columns.
data_df[numeric_features].describe()

Unnamed: 0,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min]
count,10000.0,10000.0,10000.0,10000.0,10000.0
mean,300.00493,310.00556,1538.7761,39.98691,107.951
std,2.000259,1.483734,179.284096,9.968934,63.654147
min,295.3,305.7,1168.0,3.8,0.0
25%,298.3,308.8,1423.0,33.2,53.0
50%,300.1,310.1,1503.0,40.1,108.0
75%,301.5,311.1,1612.0,46.8,162.0
max,304.5,313.8,2886.0,76.6,253.0


In [10]:
# Cardinality and most frequent level of the categorical feature.
data_df[categorical_features].describe()

Unnamed: 0,Type
count,10000
unique,3
top,L
freq,6000


In [11]:
# Class balance of the label; check this before trusting accuracy as a metric.
data_df[target].value_counts()

Machine failure
0    9661
1     339
Name: count, dtype: int64

# Model Estimation

In [12]:
# Feature matrix: numeric columns first, then the categorical column.
X = data_df[[*numeric_features, *categorical_features]]
# Label vector.
y = data_df[target]

In [13]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
# NOTE(review): the split is not stratified although only 339 of 10000 rows are
# failures -- consider `stratify=y` so both splits keep the same failure rate.
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.2, random_state=42)

In [14]:
# Standardize the numeric columns; one-hot encode 'Type', ignoring categories
# that were not present when the encoder was fitted.
scaled_numeric = (StandardScaler(), numeric_features)
encoded_categorical = (OneHotEncoder(handle_unknown='ignore'), categorical_features)

preprocessor = make_column_transformer(scaled_numeric, encoded_categorical)

In [15]:
# Baseline classifier with default hyperparameters; n_jobs=-1 requests all CPU cores.
model_logistic_regression = LogisticRegression(n_jobs=-1)

In [16]:
# Chain preprocessing and the classifier so both are fitted and applied together.
model_pipeline = make_pipeline(preprocessor, model_logistic_regression)

In [17]:
# Fit the preprocessing steps and the classifier on the training split only.
model_pipeline.fit(Xtrain, ytrain)

# Model Evaluation

In [18]:
# Predicted labels for the held-out split.
model_pipeline.predict(Xtest)

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [19]:
# Overall accuracy on the held-out split. With 1939 of the 2000 test rows in
# class 0, always predicting "no failure" would already score 0.9695, so read
# this together with the per-class report in the next cell.
accuracy_score(ytest, model_pipeline.predict(Xtest))

0.9735

In [20]:
# Per-class precision / recall / F1 on the held-out split; the class-1 (failure) row is the one that matters.
print(classification_report(ytest, model_pipeline.predict(Xtest)))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99      1939
           1       0.67      0.26      0.38        61

    accuracy                           0.97      2000
   macro avg       0.82      0.63      0.68      2000
weighted avg       0.97      0.97      0.97      2000



# Hyperparameter Tuning

In [21]:
# Fresh, unfitted preprocessing step for the tuning run (same configuration as
# in the Model Estimation section).
scaled_numeric = (StandardScaler(), numeric_features)
encoded_categorical = (OneHotEncoder(handle_unknown='ignore'), categorical_features)

preprocessor = make_column_transformer(scaled_numeric, encoded_categorical)

In [22]:
# Fresh, unfitted classifier; its C value is overridden by the search below.
model_logistic_regression = LogisticRegression(n_jobs=-1)

In [23]:
# Unfitted pipeline that serves as the template estimator for the search.
model_pipeline = make_pipeline(preprocessor, model_logistic_regression)

In [24]:
# Inspect the auto-generated step names; they form the prefix of search
# parameter names such as "logisticregression__C".
model_pipeline.named_steps

{'columntransformer': ColumnTransformer(transformers=[('standardscaler', StandardScaler(),
                                  ['Air temperature [K]',
                                   'Process temperature [K]',
                                   'Rotational speed [rpm]', 'Torque [Nm]',
                                   'Tool wear [min]']),
                                 ('onehotencoder',
                                  OneHotEncoder(handle_unknown='ignore'),
                                  ['Type'])]),
 'logisticregression': LogisticRegression(n_jobs=-1)}

In [25]:
# Candidate values for the classifier's C (inverse regularization strength),
# addressed through the pipeline step name.
param_distribution = {"logisticregression__C": [0.001, 0.01, 0.1, 0.5, 1, 5, 10]}

In [26]:
# Try 3 randomly drawn C candidates, each scored with 3-fold cross-validation.
# No `scoring` is given, so the pipeline's default score (accuracy) ranks them.
rand_search_cv = RandomizedSearchCV(
    estimator=model_pipeline,
    param_distributions=param_distribution,
    n_iter=3, cv=3, random_state=42,
)

In [27]:
# Run the search on the training split only; the held-out split stays untouched.
rand_search_cv.fit(Xtrain, ytrain)

In [28]:
# Pipeline configured with the best-scoring candidate.
rand_search_cv.best_estimator_

In [29]:
# Mean cross-validated score of the best candidate (accuracy, since no `scoring` was set).
rand_search_cv.best_score_

0.9691250146619894

In [30]:
# Per-class metrics of the tuned pipeline on the held-out split, for comparison with the baseline report.
print(classification_report(ytest, rand_search_cv.best_estimator_.predict(Xtest)))

              precision    recall  f1-score   support

           0       0.98      0.99      0.99      1939
           1       0.63      0.28      0.39        61

    accuracy                           0.97      2000
   macro avg       0.80      0.64      0.69      2000
weighted avg       0.97      0.97      0.97      2000



# Serialization

In [31]:
# Print the installed scikit-learn package metadata, including its version.
!pip show scikit-learn

Name: scikit-learn
Version: 1.5.2
Summary: A set of python modules for machine learning and data mining
Home-page: https://scikit-learn.org
Author: 
Author-email: 
License: BSD 3-Clause License

Copyright (c) 2007-2024 The scikit-learn developers.
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

* Redistributions of source code must retain the above copyright notice, this
  list of conditions and the following disclaimer.

* Redistributions in binary form must reproduce the above copyright notice,
  this list of conditions and the following disclaimer in the documentation
  and/or other materials provided with the distribution.

* Neither the name of the copyright holder nor the names of its
  contributors may be used to endorse or promote products derived from
  this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS 

In [32]:
%%writefile requirements.txt
# Must match the scikit-learn version that trained and serialized model.joblib
# (1.5.2 in the training environment); unpickling an estimator under a
# different scikit-learn version is unsupported and may fail or misbehave.
scikit-learn==1.5.2

Writing requirements.txt


In [33]:
%%writefile train.py

import joblib

from sklearn.datasets import fetch_openml

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import make_column_transformer

from sklearn.pipeline import make_pipeline

from sklearn.model_selection import train_test_split, RandomizedSearchCV

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# --- Data --------------------------------------------------------------------
# Download OpenML dataset 42890 with pandas-backed data.
dataset = fetch_openml(data_id=42890, as_frame=True, parser="auto")
data_df = dataset.data

# Label column, numeric inputs and the single categorical input.
target = 'Machine failure'
numeric_features = [
    'Air temperature [K]', 'Process temperature [K]', 'Rotational speed [rpm]',
    'Torque [Nm]', 'Tool wear [min]',
]
categorical_features = ['Type']

print("Creating data subsets")

X = data_df[[*numeric_features, *categorical_features]]
y = data_df[target]

# 80/20 split with a fixed seed; only the training part is used below.
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.2, random_state=42)

# --- Model -------------------------------------------------------------------
# Standardize numeric columns and one-hot encode 'Type' (unseen categories are ignored).
preprocessor = make_column_transformer(
    (StandardScaler(), numeric_features),
    (OneHotEncoder(handle_unknown='ignore'), categorical_features),
)
model_logistic_regression = LogisticRegression(n_jobs=-1)

print("Estimating Best Model Pipeline")

model_pipeline = make_pipeline(preprocessor, model_logistic_regression)

# Candidate C values, addressed through the auto-generated pipeline step name.
param_distribution = {"logisticregression__C": [0.001, 0.01, 0.1, 0.5, 1, 5, 10]}

# 3 randomly drawn candidates, each scored with 3-fold cross-validation.
rand_search_cv = RandomizedSearchCV(
    estimator=model_pipeline,
    param_distributions=param_distribution,
    n_iter=3, cv=3, random_state=42,
)
rand_search_cv.fit(Xtrain, ytrain)

# --- Report and persist ------------------------------------------------------
print("Logging Metrics")
# best_score_ is the mean cross-validated score of the selected candidate.
print(f"Accuracy: {rand_search_cv.best_score_}")

print("Serializing Model")

# Persist the best pipeline (preprocessing + classifier) as one artifact.
saved_model_path = "model.joblib"
joblib.dump(rand_search_cv.best_estimator_, saved_model_path)

Writing train.py


In [34]:
# Run the generated training script in a subprocess; it writes model.joblib
# to the current working directory.
!python train.py

# Test Predictions

In [36]:
# Reload the serialized pipeline. joblib.load unpickles the file, so only load
# artifacts from a trusted source.
saved_model = joblib.load("model.joblib")

In [37]:
# Display the reloaded pipeline to confirm it has the expected steps.
saved_model

In [38]:
# Smoke test: the reloaded pipeline can predict on the held-out features.
saved_model.predict(Xtest)

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [40]:
!pip install huggingface-hub

Collecting huggingface-hub
  Downloading huggingface_hub-0.26.1-py3-none-any.whl.metadata (13 kB)
Collecting filelock (from huggingface-hub)
  Downloading filelock-3.16.1-py3-none-any.whl.metadata (2.9 kB)
Collecting fsspec>=2023.5.0 (from huggingface-hub)
  Downloading fsspec-2024.10.0-py3-none-any.whl.metadata (11 kB)
Collecting pyyaml>=5.1 (from huggingface-hub)
  Using cached PyYAML-6.0.2-cp311-cp311-win_amd64.whl.metadata (2.1 kB)
Downloading huggingface_hub-0.26.1-py3-none-any.whl (447 kB)
Downloading fsspec-2024.10.0-py3-none-any.whl (179 kB)
Using cached PyYAML-6.0.2-cp311-cp311-win_amd64.whl (161 kB)
Downloading filelock-3.16.1-py3-none-any.whl (16 kB)
Installing collected packages: pyyaml, fsspec, filelock, huggingface-hub
Successfully installed filelock-3.16.1 fsspec-2024.10.0 huggingface-hub-0.26.1 pyyaml-6.0.2


In [39]:
%%writefile app.py
import os
import uuid
import joblib
import json

import gradio as gr
import pandas as pd

from huggingface_hub import CommitScheduler
from pathlib import Path

# Prediction logs go to a uniquely named JSON-lines file under logs/, so each
# app process writes to its own file.
log_folder = Path("logs/")
log_file = log_folder / f"data_{uuid.uuid4()}.json"

# Background scheduler that periodically commits the log folder to a Hugging
# Face dataset repo. NOTE(review): `every` is in minutes per the huggingface_hub
# docs, and the repo_id has no namespace -- confirm it resolves to the intended account.
scheduler = CommitScheduler(
    repo_id="machine-failure-logs",
    repo_type="dataset",
    folder_path=log_folder,
    path_in_repo="data",
    every=2,
)

# Serialized scikit-learn pipeline (preprocessing + classifier) from train.py.
machine_failure_predictor = joblib.load('model.joblib')

# Input widgets; labels mirror the training column names.
air_temperature_input = gr.Number(label='Air temperature [K]')
process_temperature_input = gr.Number(label='Process temperature [K]')
rotational_speed_input = gr.Number(label='Rotational speed [rpm]')
torque_input = gr.Number(label='Torque [Nm]')
tool_wear_input = gr.Number(label='Tool wear [min]')
type_input = gr.Dropdown(['L', 'M', 'H'], label='Type')

# Output widget showing the predicted label.
model_output = gr.Label(label="Machine failure")

def predict_machine_failure(air_temperature, process_temperature, rotational_speed, torque, tool_wear, type):
    """Predict machine failure for one reading and log the request.

    The six arguments are assembled into a one-row DataFrame whose column
    names match those used at training time, passed to the loaded pipeline,
    and the inputs plus the prediction are appended as one JSON line to the
    module-level log file.

    Args:
        air_temperature: Value for the 'Air temperature [K]' column.
        process_temperature: Value for the 'Process temperature [K]' column.
        rotational_speed: Value for the 'Rotational speed [rpm]' column.
        torque: Value for the 'Torque [Nm]' column.
        tool_wear: Value for the 'Tool wear [min]' column.
        type: Value for the 'Type' column (shadows the builtin; the name is
            kept so the signature is unchanged).

    Returns:
        The first (and only) element of the pipeline's prediction for the row.
    """
    # Keys must match the column names the pipeline was fitted on.
    sample = {
        'Air temperature [K]': air_temperature,
        'Process temperature [K]': process_temperature,
        'Rotational speed [rpm]': rotational_speed,
        'Torque [Nm]': torque,
        'Tool wear [min]': tool_wear,
        'Type': type
    }
    # .tolist() turns the prediction into plain Python values that json can serialize.
    predicted_label = machine_failure_predictor.predict(pd.DataFrame([sample])).tolist()[0]

    # Logged record = the inputs followed by the prediction.
    log_record = {**sample, 'prediction': predicted_label}

    # Append under the scheduler's lock (presumably so a scheduled commit
    # never uploads a partially written line -- confirm against the
    # huggingface_hub docs).
    with scheduler.lock, log_file.open("a") as f:
        f.write(json.dumps(log_record) + "\n")

    return predicted_label

# Wire the prediction function to the widgets; the order of `inputs` must
# match the positional parameters of predict_machine_failure.
demo = gr.Interface(
    fn=predict_machine_failure,
    inputs=[
        air_temperature_input,
        process_temperature_input,
        rotational_speed_input,
        torque_input,
        tool_wear_input,
        type_input,
    ],
    outputs=model_output,
    title="Machine Failure Predictor",
    description="This API allows you to predict the machine failure status of an equipment",
    # NOTE(review): newer Gradio releases rename this option to `flagging_mode`;
    # confirm against the Gradio version used for deployment.
    allow_flagging="auto",
    concurrency_limit=8,
)

# Enable request queuing, then start the server without a public share link.
demo.queue()
demo.launch(share=False)

Writing app.py
