In [176]:
# Machine Learning to improve factory availability

In [177]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, accuracy_score
from sklearn import tree
from sklearn.metrics import f1_score
import random

I first imported our data to a ``panda`` dataframe. This data was mostly generated by a script, ``generate_mock_up_data.py`` that created data for every hour of the year 2023, in order to have a sizeable amount to train our model.
I didn't engage in data cleaning, because, as the data we are working with is simulated, it doesn't contain the errors that usually occur with normal data.

In [178]:
mock_data = pd.read_csv('csv_data/mock_sensor_data.csv')
sensor_data = pd.read_csv('csv_data/sensor.csv')

sensor_data.pop("status")

mock_data.head(5)


Unnamed: 0,timestamp,sensor_id,temperature,status
0,2023-01-01 00:00:00,1,35.921273,Normal
1,2023-01-01 00:01:00,1,66.13869,Normal
2,2023-01-01 00:02:00,1,46.842345,Failed
3,2023-01-01 00:03:00,1,48.080549,Normal
4,2023-01-01 00:04:00,1,40.268974,Normal


In [179]:
sensor_data.head(5)

Unnamed: 0,id,expansion_id,sensor_model_id
0,1,1,1
1,2,1,2
2,3,2,3
3,4,2,1
4,5,3,1


I then joined our different datasets, in order to train the model

In [180]:
data = mock_data.merge(sensor_data, left_on="sensor_id", right_on="sensor_model_id")

data.head(5)

Unnamed: 0,timestamp,sensor_id,temperature,status,id,expansion_id,sensor_model_id
0,2023-01-01 00:00:00,1,35.921273,Normal,1,1,1
1,2023-01-01 00:00:00,1,35.921273,Normal,4,2,1
2,2023-01-01 00:00:00,1,35.921273,Normal,5,3,1
3,2023-01-01 00:00:00,1,35.921273,Normal,8,5,1
4,2023-01-01 00:00:00,1,35.921273,Normal,9,6,1


I changed the status of the machine to be binary, in order to be compatible with the model.

In [181]:
data['status'] = data['status'].replace({'Normal': 1,  'Failed': 0})
print(data['status'].unique())

[1 0]


Did the same for the ``timestamp`` column

In [182]:
data['timestamp'] = pd.to_datetime(mock_data['timestamp']).dt.hour

print(data['timestamp'].head(5))

0    0
1    0
2    0
3    0
4    0
Name: timestamp, dtype: int32


I also checked for skewness, in order to detect abnormal behaviour and deal with it.

## Splitting

I then proceeded to split the data to train and test the model. We used stratified sampling to guarantee an equal distribution of sensor types.

In [183]:
X = data.drop(columns=["status"])
Y = data["status"]

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.7, stratify=data["status"], random_state=42)

def categorize_temperature(temperature):
    if temperature < 35:
        return 'Low'
    elif 35 <= temperature < 50:
        return 'Medium'
    else:
        return 'High'
    
Y_train = Y_train.apply(categorize_temperature)
Y_test = Y_test.apply(categorize_temperature)


## Machine Learning

### K-Nearest Neighbors

This classifier stores all the available data and classifies a new data point based on proximity or similarity.

In [184]:

knn = KNeighborsClassifier()

knn.fit(X_train, Y_train)
knn_predictions = knn.predict(X_test)

knn_report = classification_report(Y_test, knn_predictions)
knn_cm = confusion_matrix(Y_test, knn_predictions)

print("Classification Report:\n\n", knn_report, "\n\nConfusion Matrix:\n\n", knn_cm)

Classification Report:

               precision    recall  f1-score   support

         Low       1.00      1.00      1.00    843898

    accuracy                           1.00    843898
   macro avg       1.00      1.00      1.00    843898
weighted avg       1.00      1.00      1.00    843898
 

Confusion Matrix:

 [[843898]]


### Decision Trees

This classifier follows a tree like model of decisons. It splits the data recursively based on the most significant feature of each node.

In [185]:
dt_classifier = DecisionTreeClassifier()

dt_classifier.fit(X_train, Y_train)
dt_predictions = dt_classifier.predict(X_test)

dt_report = classification_report(Y_test, dt_predictions)
dt_cm = confusion_matrix(Y_test, dt_predictions)

print("Classification Report:\n\n", dt_report, "\n\nConfusion Matrix:\n\n", dt_cm)

Classification Report:

               precision    recall  f1-score   support

         Low       1.00      1.00      1.00    843898

    accuracy                           1.00    843898
   macro avg       1.00      1.00      1.00    843898
weighted avg       1.00      1.00      1.00    843898
 

Confusion Matrix:

 [[843898]]
