In [1]:
import pandas as pd
import numpy as np

In [2]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split

In [3]:
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [5]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [6]:
from imblearn.over_sampling import SMOTE

In [7]:
df = pd.read_csv('predictive_maintenance.csv')
print(df.head())

   UDI Product ID Type  Air temperature [K]  Process temperature [K]  \
0    1     M14860    M                298.1                    308.6   
1    2     L47181    L                298.2                    308.7   
2    3     L47182    L                298.1                    308.5   
3    4     L47183    L                298.2                    308.6   
4    5     L47184    L                298.2                    308.7   

   Rotational speed [rpm]  Torque [Nm]  Tool wear [min]  Target Failure Type  
0                    1551         42.8                0       0   No Failure  
1                    1408         46.3                3       0   No Failure  
2                    1498         49.4                5       0   No Failure  
3                    1433         39.5                7       0   No Failure  
4                    1408         40.0                9       0   No Failure  


In [8]:
print(df.isnull().values.any())

False


In [9]:
no_fail_rows = df[df['Target'] == 0]

print(no_fail_rows['Failure Type'].unique())

['No Failure' 'Random Failures']


In [10]:
# Create Target from Failure Type
df['Target'] = df['Failure Type'].apply(lambda x: 0 if x == 'No Failure' else 1)


In [11]:
# Check unique Failure Types where Target = 0
print(df[df['Target'] == 0]['Failure Type'].unique())  # should only show ['No Failure']

# Check unique Failure Types where Target = 1
print(df[df['Target'] == 1]['Failure Type'].unique())  # should show all failure types


['No Failure']
['Power Failure' 'Tool Wear Failure' 'Overstrain Failure'
 'Random Failures' 'Heat Dissipation Failure']


In [12]:
failure_counts = df['Failure Type'].value_counts()
print(failure_counts)

Failure Type
No Failure                  9652
Heat Dissipation Failure     112
Power Failure                 95
Overstrain Failure            78
Tool Wear Failure             45
Random Failures               18
Name: count, dtype: int64


In [13]:
# highly imbalanced data

In [14]:
df_dropped = df.drop(['UDI', 'Product ID', 'Target'], axis=1)

In [15]:
# Custom label encoding for the 'Type' column
type_mapping = {'L': 0, 'M': 1, 'H': 2}
df_dropped['Type'] = df_dropped['Type'].map(type_mapping)
display(df_dropped.head())

Unnamed: 0,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Failure Type
0,1,298.1,308.6,1551,42.8,0,No Failure
1,0,298.2,308.7,1408,46.3,3,No Failure
2,0,298.1,308.5,1498,49.4,5,No Failure
3,0,298.2,308.6,1433,39.5,7,No Failure
4,0,298.2,308.7,1408,40.0,9,No Failure


In [16]:
type_mapping = {'No Failure': 0, 'Heat Dissipation Failure': 1, 'Power Failure': 2, 'Overstrain Failure': 3, 'Tool Wear Failure': 4, 'Random Failures': 5}
df_dropped['Failure Type'] = df_dropped['Failure Type'].map(type_mapping)
display(df_dropped.head())

Unnamed: 0,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Failure Type
0,1,298.1,308.6,1551,42.8,0,0
1,0,298.2,308.7,1408,46.3,3,0
2,0,298.1,308.5,1498,49.4,5,0
3,0,298.2,308.6,1433,39.5,7,0
4,0,298.2,308.7,1408,40.0,9,0


In [17]:
X = df_dropped.drop('Failure Type', axis=1)
y = df_dropped['Failure Type']

In [18]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)

X_train shape: (8000, 6)
X_test shape: (2000, 6)
y_train shape: (8000,)
y_test shape: (2000,)


In [19]:
from sklearn.preprocessing import StandardScaler

columns_to_scale = ['Air temperature [K]', 'Process temperature [K]',
                    'Rotational speed [rpm]', 'Torque [Nm]', 'Tool wear [min]']

scaler = StandardScaler()
X_train_scaled = X_train.copy()
X_train_scaled[columns_to_scale] = scaler.fit_transform(X_train[columns_to_scale])

X_test_scaled = X_test.copy()
X_test_scaled[columns_to_scale] = scaler.transform(X_test[columns_to_scale])

In [20]:
from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train_scaled, y_train)

# Check new class balance
from collections import Counter
print(Counter(y_train_res))

Counter({0: 7717, 2: 7717, 1: 7717, 4: 7717, 3: 7717, 5: 7717})


In [21]:
# LR
# Initialize and train the Logistic Regression model
model = LogisticRegression()
model.fit(X_train_res, y_train_res)

In [22]:
# Make predictions on the test data
y_pred = model.predict(X_test_scaled)

# Evaluate the model performance
print("Classification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nAccuracy Score:", accuracy_score(y_test, y_pred))

Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.63      0.77      1935
           1       0.23      1.00      0.37        15
           2       0.47      0.90      0.62        20
           3       0.29      0.92      0.44        13
           4       0.06      0.82      0.11        11
           5       0.00      0.17      0.00         6

    accuracy                           0.64      2000
   macro avg       0.34      0.74      0.39      2000
weighted avg       0.97      0.64      0.76      2000


Confusion Matrix:
 [[1227   49   20   29  135  475]
 [   0   15    0    0    0    0]
 [   0    2   18    0    0    0]
 [   0    0    0   12    1    0]
 [   1    0    0    1    9    0]
 [   4    0    0    0    1    1]]

Accuracy Score: 0.641


In [23]:
# Get the weights (coefficients) for each feature
feature_weights = pd.DataFrame({'Feature': X.columns, 'Weight': model.coef_[0]})
print("Feature Weights:")
print(feature_weights)

Feature Weights:
                   Feature    Weight
0                     Type  0.960410
1      Air temperature [K] -2.530466
2  Process temperature [K]  1.986141
3   Rotational speed [rpm] -0.570589
4              Torque [Nm] -3.614551
5          Tool wear [min] -3.754042
