# <h1 style="font-size: 18px;"> Info</h1>

https://www.kaggle.com/datasets/dnkumars/industrial-equipment-monitoring-dataset?resource=download<br>
<span style="font-size: 16px;">
This dataset contains simulated data representing real-time monitoring of various industrial equipment, including turbines,<br> compressors, and pumps. Each row in the dataset corresponds to a unique observation capturing key parameters such as temperature,<br> pressure, vibration, and humidity. The dataset also includes information about the equipment type, location, and whether the equipment is classified as faulty.
</span>
<br>

<span style="font-size: 16px;">
- Columns Description:<br>
- temperature: Temperature reading at the time of observation (in °C).<br>
- pressure: Pressure reading at the time of observation (in bar).<br>
- vibration: Vibration level reading (normalized units).<br>
- humidity: Humidity percentage recorded at the location of the equipment.<br>
- equipment: Type of industrial equipment being monitored (e.g., Turbine, Compressor, Pump).<br>
- location: Location of the equipment (city name).<br>
- faulty: Binary indicator (0 = Not Faulty, 1 = Faulty) to specify whether the equipment is functioning properly or requires maintenance.<br>
</span>

# <h1 style="font-size: 18px;">Equipment Analysis</h1>

In [2]:
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
from matplotlib.font_manager import FontProperties
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px

# Models
from sklearn.ensemble import IsolationForest
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, GridSearchCV

# Metrics
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, precision_score,recall_score

# Export model
import joblib

In [7]:
# Load the equipment monitoring dataset (CSV expected in the working directory)
df = pd.read_csv(filepath_or_buffer="equipment_anomaly_data.csv")

In [None]:
# Percentage share of every distinct value in the selected columns
for col in ("location", "faulty"):
  percentages = df[col].value_counts(normalize=True) * 100
  print(f"{percentages}\n")

In [27]:
# One dataframe per equipment type; the two text columns are dropped so that
# only the sensor readings and the `faulty` flag remain
turbine = df.loc[df["equipment"].eq("Turbine")].drop(columns=["equipment","location"])
compressor = df.loc[df["equipment"].eq("Compressor")].drop(columns=["equipment","location"])
pump = df.loc[df["equipment"].eq("Pump")].drop(columns=["equipment","location"])

In [230]:
# Print scatter 3d text structure
# https://miabellaai.net/index.html to visualize online
def scatter3d_text(dataframe):
  """Print `dataframe` as text for the online 3D scatter viewer linked above.

  The output starts with two header lines, followed by one line per row with
  the row label (index + 1), temperature, pressure and vibration rounded to
  5 decimals, the `faulty` flag without decimals and a fixed suffix.

  Parameters
  ----------
  dataframe : pandas.DataFrame
    Must contain the columns "temperature", "pressure", "vibration" and
    "faulty", and have an index that supports `+ 1`.
  """
  lines = [
    "Interactive 3D Scatter Plot;",
    "::temperature::pressure::vibration;",
  ]

  # Collect the lines in a list and join once: repeated `+=` on a string is
  # quadratic in the number of rows. Single quotes inside the f-strings keep
  # the function valid on Python versions older than 3.12.
  for idx, row in dataframe.iterrows():
    lines.append(
      f"#{idx + 1}"
      f"::{round(row['temperature'], 5)}"
      f"::{round(row['pressure'], 5)}"
      f"::{round(row['vibration'], 5)}"
      f"::{row['faulty']:.0f}"
      "::10::A::1::0::0::0::0;"
    )

  # Every line (including the last) is newline-terminated, as before
  print("\n".join(lines) + "\n")

# <h1 style="font-size: 18px;">All Equipment</h1>

In [12]:
# Features and target used in the analysis of the full dataset

# Independent variables (sensor readings)
X = df.loc[:, ["temperature", "pressure", "vibration", "humidity"]]

# Dependent variable (fault flag)
y = df.loc[:, "faulty"]

## <h1 style="font-size: 18px;">Scatter Plot 3D</h1>

In [None]:
# 3D Scatter Plot of the original (labelled) observations

# Split the observations by their actual label (0 = not faulty, 1 = faulty)
class_0 = X[y == 0]
class_1 = X[y == 1]

# Trace for the observations that are NOT faulty
no_faulty = go.Scatter3d(
  x=class_0["temperature"],
  y=class_0["pressure"],
  z=class_0["vibration"],
  mode="markers",
  marker=dict(
    size=4,
    color="blue",
    symbol="circle"
  ),
  name="Not Faulty"
)

# Trace for the observations that ARE faulty
yes_faulty = go.Scatter3d(
  x=class_1["temperature"],
  y=class_1["pressure"],
  z=class_1["vibration"],
  mode="markers",
  marker=dict(
    size=4,
    color="red",
    symbol="circle"
  ),
  name="Faulty"
)

# Configure the layout once, with the values that were effectively in use
# (the figure size and marker size used to be set and then overridden)
layout = go.Layout(
  title=dict(
    text="<b>Original Values</b>",
    font=dict(family="Arial Black", size=30, color="black"),
    x=0.5,
    xanchor="center"
  ),
  scene=dict(
    xaxis_title="Temperature",
    yaxis_title="Pressure",
    zaxis_title="Vibration"
  ),
  legend=dict(
    x=0,
    y=1,
    font=dict(family="Calibri", size=16, color="black"),
    itemsizing="constant" # Increase legend marker size
  ),
  width=1500,
  height=1500
)

# Create a figure and add the traces
fig = go.Figure(
  data=[no_faulty, yes_faulty],
  layout=layout)

# Show graph
fig.show()

## <h1 style="font-size: 18px;">Outliers</h1>

In [None]:
# IQR rule: a value is an outlier when it lies more than 1.5 * IQR below the
# first quartile or above the third quartile of its column
outliers = {}
for col in X.columns:
  q1 = X[col].quantile(0.25)
  q3 = X[col].quantile(0.75)
  iqr = q3 - q1

  too_low = X[col] < q1 - (iqr * 1.5)
  too_high = X[col] > q3 + (iqr * 1.5)

  # Keep the row labels of the outliers found in this column
  outliers[col] = X[too_low | too_high].index.tolist()

for name, rows in outliers.items():
  print(f"{name}: {len(rows)} outliers = {round(len(rows) / X.shape[0]*100, 2)}%")

## <h1 style="font-size: 18px;">Density Curve</h1>

In [None]:
# Density curve of every feature, with its median and mean marked
for col in X.columns:
  values = X[col]
  median_value = values.median()
  mean_value = values.mean()

  plt.figure(figsize=(10,4))
  sns.kdeplot(values, fill=True)
  plt.axvline(median_value, color="red", linestyle="dashed",
              label=f"Median: {round(median_value, 2)}")
  plt.axvline(mean_value, color="green", linestyle="dashed",
              label=f"Mean: {round(mean_value, 2)}")
  plt.legend(loc=0)
  plt.show()

## <h1 style="font-size: 18px;">Correlation Plot</h1>

In [None]:
# Correlation plot for ALL equipment.
# The text columns are dropped exactly like for the per-equipment frames, so
# the matrix covers the sensor readings and the `faulty` flag of the whole
# dataset (this cell used to plot the compressor subset by mistake).
plt.figure(figsize=(12, 6))
sns.heatmap(df.drop(columns=["equipment","location"]).corr(), annot=True)
plt.title(
  "Correlation".upper(),
  fontdict={'family':'calibri', 'fontsize':18, 'weight':'bold', 'color':'red'},
  pad=20
)
plt.show()

## <h1 style="font-size: 18px;">Scatter Plot Matrix</h1>

In [None]:
# Pairwise scatter plots of the features, coloured by the fault flag
# (the flag is turned into text so it is treated as a category)
fig = px.scatter_matrix(
  data_frame=X,
  dimensions=["temperature", "pressure", "vibration", "humidity"],
  color=y.map(str)
)

fig.update_layout(width=3000, height=3000)

fig.show()

## <h1 style="font-size: 18px;">ML Model</h1>

In [259]:
# Hold out 20% of the observations for testing (plain NumPy arrays)
X_train, X_test, y_train, y_test = train_test_split(
  X.to_numpy(),
  y.to_numpy(),
  test_size=0.2,
  random_state=42,
)

In [260]:
# Search space explored by the grid search
param_grid = dict(
  criterion=['gini', 'entropy', 'log_loss'],
  splitter=['best', 'random'],
  min_samples_split=[2, 5, 10],
  min_samples_leaf=[1, 2, 5],
  max_depth=[None, 5, 10],
  min_impurity_decrease=[0.001, 0.01, 0.05],
)

# Base estimator
clf = DecisionTreeClassifier(random_state=42)

# 5-fold cross-validated grid search using every available core
grid_search = GridSearchCV(estimator=clf, param_grid=param_grid, cv=5, n_jobs=-1)

# Fit on the training split
grid_search.fit(X_train, y_train)

# Predictions of the refitted best estimator on the held-out split
y_pred = grid_search.predict(X_test)

# Evaluation artefacts reused by the following cells
accuracy = accuracy_score(y_test, y_pred)
conf_matrix_pred = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

In [None]:
# Evaluating the Model
print("Model Accuracy:")
print(f"{round(accuracy * 100,2)}%")
print()
print(f"Best Params: {grid_search.best_params_}")
print()
# The tuned estimator is a DecisionTreeClassifier, so the heading says so
print("Decision Tree Classification Report:")
print(classification_report(y_test, y_pred, zero_division=0))

In [None]:
# Plot Confusion Matrix to evaluate model
font_ticklabels = FontProperties(
  family='calibri', size=10, weight='bold')

# confusion_matrix orders its rows/columns by sorted label, whereas a set has
# no guaranteed iteration order: sort explicitly so ticks match the cells
class_labels = sorted(set(y_train))

sns.heatmap(
  conf_matrix_pred,
  annot=True,
  fmt='d',
  cmap='Blues',
  xticklabels=class_labels,
  yticklabels=class_labels,
  cbar=False # Remove color bar
)

plt.xticks(fontproperties=font_ticklabels)
plt.yticks(fontproperties=font_ticklabels)

plt.xlabel('Predicted', fontdict={'fontsize':14}, labelpad=20)
plt.ylabel('Test', fontdict={'fontsize':14}, labelpad=20)
plt.title('Confusion Matrix', fontdict={'fontsize':16})
plt.show()

In [None]:
# Values of contamination to be tested
contamination_values = [0.0001, 0.001, 0.01, 0.05, 0.1]

# Ground truth as integers (0 = not faulty, 1 = faulty)
y_true_iso = np.asarray(y_test).astype(int)

# Evaluating model performance for each contamination value
for contamination in contamination_values:
  model = IsolationForest(contamination=contamination, random_state=42)
  model.fit(X_train)
  # IsolationForest.predict returns -1 for outliers and +1 for inliers; map
  # that to the dataset encoding (outlier -> 1 = faulty, inlier -> 0) before
  # scoring. A dedicated name is used so the decision-tree `y_pred` computed
  # by the grid-search cell is not overwritten.
  y_pred_iso = np.where(model.predict(X_test) == -1, 1, 0)
  precision = precision_score(y_true_iso, y_pred_iso, average='macro', zero_division=0)
  recall = recall_score(y_true_iso, y_pred_iso, average='macro', zero_division=0)
  print(f'Contamination: {contamination}, Precision: {precision}, Recall: {recall}')

In [None]:
# Persist the fitted grid-search object to disk
joblib.dump(value=grid_search, filename='model_ML.pkl')

# <h1 style="font-size: 18px;">Turbine</h1>

In [193]:
# Split columns that will be part of analysis

# Independent
X = turbine[["temperature", "pressure", "vibration", "humidity"]]

# Dependent
# Kept numeric (0/1) like in the other sections: later cells of this section
# compare the labels with the integers 0 and 1, which never matches strings
y = turbine["faulty"]

In [None]:
# IQR rule: a value is an outlier when it lies more than 1.5 * IQR below the
# first quartile or above the third quartile of its column
outliers = {}
for col in X.columns:
  q1 = X[col].quantile(0.25)
  q3 = X[col].quantile(0.75)
  iqr = q3 - q1

  too_low = X[col] < q1 - (iqr * 1.5)
  too_high = X[col] > q3 + (iqr * 1.5)

  # Keep the row labels of the outliers found in this column
  outliers[col] = X[too_low | too_high].index.tolist()

for name, rows in outliers.items():
  print(f"{name}: {len(rows)} outliers = {round(len(rows) / X.shape[0]*100, 2)}%")

In [None]:
# Density curve of every feature, with its median and mean marked
for col in X.columns:
  values = X[col]
  median_value = values.median()
  mean_value = values.mean()

  plt.figure(figsize=(10,4))
  sns.kdeplot(values, fill=True)
  plt.axvline(median_value, color="red", linestyle="dashed",
              label=f"Median: {round(median_value, 2)}")
  plt.axvline(mean_value, color="green", linestyle="dashed",
              label=f"Mean: {round(mean_value, 2)}")
  plt.legend(loc=0)
  plt.show()

In [None]:
# Heatmap of the pairwise correlations of the turbine frame
title_font = {'family':'calibri', 'fontsize':18, 'weight':'bold', 'color':'red'}

plt.figure(figsize=(12, 6))
sns.heatmap(data=turbine.corr(), annot=True)
plt.title("Correlation".upper(), fontdict=title_font, pad=20)
plt.show()

In [None]:
# Scatter Plot
# The target is converted to text, as in the other sections, so the colouring
# is always categorical regardless of how `y` is encoded
px.scatter_matrix(
  X,
  dimensions=["temperature", "pressure", "vibration", "humidity"],
  color=y.apply(str)
)

In [195]:
# Hold out 20% of the turbine observations for testing
X_train, X_test, y_train, y_test = train_test_split(
  X,
  y,
  test_size=0.2,
  random_state=42,
)

In [196]:
# Search space explored by the grid search
param_grid = dict(
  criterion=['gini', 'entropy', 'log_loss'],
  splitter=['best', 'random'],
  min_samples_split=[2, 5, 10],
  min_samples_leaf=[1, 2, 5],
  max_depth=[None, 5, 10],
  min_impurity_decrease=[0.001, 0.01, 0.05],
)

# Base estimator
clf = DecisionTreeClassifier(random_state=42)

# 5-fold cross-validated grid search using every available core
grid_search = GridSearchCV(estimator=clf, param_grid=param_grid, cv=5, n_jobs=-1)

# Fit on the training split
grid_search.fit(X_train, y_train)

# Predictions of the refitted best estimator on the held-out split
y_pred = grid_search.predict(X_test)

# Evaluation artefacts reused by the following cells
accuracy = accuracy_score(y_test, y_pred)
conf_matrix_pred = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

In [None]:
# Evaluating the Model
print("Model Accuracy:")
print(f"{round(accuracy * 100,2)}%")
print()
print(f"Best Params: {grid_search.best_params_}")
print()
# The tuned estimator is a DecisionTreeClassifier, so the heading says so
print("Decision Tree Classification Report:")
print(classification_report(y_test, y_pred, zero_division=0))

In [None]:
# Plot Confusion Matrix to evaluate model
font_ticklabels = FontProperties(
  family='calibri', size=10, weight='bold')

# confusion_matrix orders its rows/columns by sorted label, whereas a set has
# no guaranteed iteration order (notably for string labels): sort explicitly
# so ticks match the cells
class_labels = sorted(set(y_train))

sns.heatmap(
  conf_matrix_pred,
  annot=True,
  fmt='d',
  cmap='Blues',
  xticklabels=class_labels,
  yticklabels=class_labels,
  cbar=False # Remove color bar
)

plt.xticks(fontproperties=font_ticklabels)
plt.yticks(fontproperties=font_ticklabels)

plt.xlabel('Predicted', fontdict={'fontsize':14}, labelpad=20)
plt.ylabel('Test', fontdict={'fontsize':14}, labelpad=20)
plt.title('Confusion Matrix', fontdict={'fontsize':16})
plt.show()

In [None]:
# Values of contamination to be tested
contamination_values = [0.0001, 0.001, 0.01, 0.05, 0.1]

# Ground truth as integers (0 = not faulty, 1 = faulty); the cast also
# handles labels stored as the strings "0"/"1"
y_true_iso = np.asarray(y_test).astype(int)

# Evaluating model performance for each contamination value
for contamination in contamination_values:
  model = IsolationForest(contamination=contamination, random_state=42)
  model.fit(X_train)
  # IsolationForest.predict returns -1 for outliers and +1 for inliers; map
  # that to the dataset encoding (outlier -> 1 = faulty, inlier -> 0) before
  # scoring. A dedicated name is used so the decision-tree `y_pred` computed
  # by the grid-search cell is not overwritten.
  y_pred_iso = np.where(model.predict(X_test) == -1, 1, 0)
  precision = precision_score(y_true_iso, y_pred_iso, average='macro', zero_division=0)
  recall = recall_score(y_true_iso, y_pred_iso, average='macro', zero_division=0)
  print(f'Contamination: {contamination}, Precision: {precision}, Recall: {recall}')

In [None]:
# 3D Scatter Plot of the decision-tree predictions on the test split

# Recompute the tree predictions instead of relying on the notebook-global
# `y_pred`, which other cells (e.g. the IsolationForest sweep) may rebind.
# Both label vectors are cast to int so the 0/1 comparisons below also work
# when the labels are stored as the strings "0"/"1".
actual = y_test.astype(int)
predicted = pd.Series(grid_search.predict(X_test), index=X_test.index).astype(int)

# (legend name, actual label, predicted label, marker color)
outcomes = [
  ("True Positive", 1, 1, "blue"),
  ("True Negative", 0, 0, "green"),
  ("False Positive", 0, 1, "red"),
  ("False Negative", 1, 0, "orange"),
]

# One trace per outcome, using the first three feature columns as axes
traces = []
for outcome_name, actual_label, predicted_label, color in outcomes:
  subset = X_test[(actual == actual_label) & (predicted == predicted_label)]
  traces.append(
    go.Scatter3d(
      x=subset.iloc[:, 0],
      y=subset.iloc[:, 1],
      z=subset.iloc[:, 2],
      mode="markers",
      marker=dict(size=5, color=color, symbol="circle"),
      name=outcome_name
    )
  )

# Configure graph layout
layout = go.Layout(
  title=dict(
    text="<b>Predict Values</b>",
    font=dict(family="Arial Black", size=30, color="black"),
    x=0.5,
    xanchor="center"
  ),
  scene=dict(
    xaxis_title="Temperature",
    yaxis_title="Pressure",
    zaxis_title="Vibration"
  ),
  legend=dict(
    x=0,
    y=1,
    font=dict(family="Calibri", size=16, color="black"),
    itemsizing="constant" # Increase legend marker size
  ),
  width=1000,
  height=800
)

# Create a figure and add the traces
fig = go.Figure(data=traces, layout=layout)

# Show graph
fig.show()

# <h1 style="font-size: 18px;">Compressor</h1>

In [184]:
# Features and target used in the compressor analysis

# Independent variables (sensor readings)
X = compressor.loc[:, ["temperature", "pressure", "vibration", "humidity"]]

# Dependent variable (fault flag)
y = compressor.loc[:, "faulty"]

In [None]:
# IQR rule: a value is an outlier when it lies more than 1.5 * IQR below the
# first quartile or above the third quartile of its column
outliers = {}
for col in X.columns:
  q1 = X[col].quantile(0.25)
  q3 = X[col].quantile(0.75)
  iqr = q3 - q1

  too_low = X[col] < q1 - (iqr * 1.5)
  too_high = X[col] > q3 + (iqr * 1.5)

  # Keep the row labels of the outliers found in this column
  outliers[col] = X[too_low | too_high].index.tolist()

for name, rows in outliers.items():
  print(f"{name}: {len(rows)} outliers = {round(len(rows) / X.shape[0]*100, 2)}%")

In [None]:
# Density curve of every feature, with its median and mean marked
for col in X.columns:
  values = X[col]
  median_value = values.median()
  mean_value = values.mean()

  plt.figure(figsize=(10,4))
  sns.kdeplot(values, fill=True)
  plt.axvline(median_value, color="red", linestyle="dashed",
              label=f"Median: {round(median_value, 2)}")
  plt.axvline(mean_value, color="green", linestyle="dashed",
              label=f"Mean: {round(mean_value, 2)}")
  plt.legend(loc=0)
  plt.show()

In [None]:
# Heatmap of the pairwise correlations of the compressor frame
title_font = {'family':'calibri', 'fontsize':18, 'weight':'bold', 'color':'red'}

plt.figure(figsize=(12, 6))
sns.heatmap(data=compressor.corr(), annot=True)
plt.title("Correlation".upper(), fontdict=title_font, pad=20)
plt.show()

In [None]:
# Pairwise scatter plots of the features, coloured by the fault flag
# (the flag is turned into text so it is treated as a category)
px.scatter_matrix(
  data_frame=X,
  dimensions=["temperature", "pressure", "vibration", "humidity"],
  color=y.map(str)
)

In [162]:
# Hold out 20% of the compressor observations for testing
X_train, X_test, y_train, y_test = train_test_split(
  X,
  y,
  test_size=0.2,
  random_state=42,
)

In [163]:
# Search space explored by the grid search
param_grid = dict(
  criterion=['gini', 'entropy', 'log_loss'],
  splitter=['best', 'random'],
  min_samples_split=[2, 5, 10],
  min_samples_leaf=[1, 2, 5],
  max_depth=[None, 5, 10],
  min_impurity_decrease=[0.001, 0.01, 0.05],
)

# Base estimator
clf = DecisionTreeClassifier(random_state=42)

# 5-fold cross-validated grid search using every available core, fitted on
# the training split
grid_search = GridSearchCV(estimator=clf, param_grid=param_grid, cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Predictions of the refitted best estimator on the held-out split
y_pred = grid_search.predict(X_test)

# Evaluation artefacts reused by the following cells
accuracy = accuracy_score(y_test, y_pred)
conf_matrix_pred = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

In [None]:
# Evaluating the Model
print("Model Accuracy:")
print(f"{round(accuracy * 100,2)}%")
print()
print(f"Best Params: {grid_search.best_params_}")
print()
# The tuned estimator is a DecisionTreeClassifier, so the heading says so
print("Decision Tree Classification Report:")
print(classification_report(y_test, y_pred, zero_division=0))

In [None]:
# Plot Confusion Matrix to evaluate model
font_ticklabels = FontProperties(
  family='calibri', size=10, weight='bold')

# confusion_matrix orders its rows/columns by sorted label, whereas a set has
# no guaranteed iteration order: sort explicitly so ticks match the cells
class_labels = sorted(set(y_train))

sns.heatmap(
  conf_matrix_pred,
  annot=True,
  fmt='d',
  cmap='Blues',
  xticklabels=class_labels,
  yticklabels=class_labels,
  cbar=False # Remove color bar
)

plt.xticks(fontproperties=font_ticklabels)
plt.yticks(fontproperties=font_ticklabels)

plt.xlabel('Predicted', fontdict={'fontsize':14}, labelpad=20)
plt.ylabel('Test', fontdict={'fontsize':14}, labelpad=20)
plt.title('Confusion Matrix', fontdict={'fontsize':16})
plt.show()

In [None]:
# Values of contamination to be tested
contamination_values = [0.0001, 0.001, 0.01, 0.05, 0.1]

# Ground truth as integers (0 = not faulty, 1 = faulty)
y_true_iso = np.asarray(y_test).astype(int)

# Evaluating model performance for each contamination value
for contamination in contamination_values:
  model = IsolationForest(contamination=contamination, random_state=42)
  model.fit(X_train)
  # IsolationForest.predict returns -1 for outliers and +1 for inliers; map
  # that to the dataset encoding (outlier -> 1 = faulty, inlier -> 0) before
  # scoring. A dedicated name is used so the decision-tree `y_pred` computed
  # by the grid-search cell is not overwritten.
  y_pred_iso = np.where(model.predict(X_test) == -1, 1, 0)
  precision = precision_score(y_true_iso, y_pred_iso, average='macro', zero_division=0)
  recall = recall_score(y_true_iso, y_pred_iso, average='macro', zero_division=0)
  print(f'Contamination: {contamination}, Precision: {precision}, Recall: {recall}')

In [None]:
# 3D Scatter Plot of the decision-tree predictions on the test split

# Recompute the tree predictions instead of relying on the notebook-global
# `y_pred`, which other cells (e.g. the IsolationForest sweep) may rebind.
# Both label vectors are cast to int so the 0/1 comparisons below are
# independent of how the labels are stored.
actual = y_test.astype(int)
predicted = pd.Series(grid_search.predict(X_test), index=X_test.index).astype(int)

# (legend name, actual label, predicted label, marker color)
outcomes = [
  ("True Positive", 1, 1, "blue"),
  ("True Negative", 0, 0, "green"),
  ("False Positive", 0, 1, "red"),
  ("False Negative", 1, 0, "orange"),
]

# One trace per outcome, using the first three feature columns as axes
traces = []
for outcome_name, actual_label, predicted_label, color in outcomes:
  subset = X_test[(actual == actual_label) & (predicted == predicted_label)]
  traces.append(
    go.Scatter3d(
      x=subset.iloc[:, 0],
      y=subset.iloc[:, 1],
      z=subset.iloc[:, 2],
      mode="markers",
      marker=dict(size=5, color=color, symbol="circle"),
      name=outcome_name
    )
  )

# Configure graph layout
layout = go.Layout(
  title=dict(
    text="<b>Predict Values</b>",
    font=dict(family="Arial Black", size=30, color="black"),
    x=0.5,
    xanchor="center"
  ),
  scene=dict(
    xaxis_title="Temperature",
    yaxis_title="Pressure",
    zaxis_title="Vibration"
  ),
  legend=dict(
    x=0,
    y=1,
    font=dict(family="Calibri", size=16, color="black"),
    itemsizing="constant" # Increase legend marker size
  ),
  width=1000,
  height=800
)

# Create a figure and add the traces
fig = go.Figure(data=traces, layout=layout)

# Show graph
fig.show()

# <h1 style="font-size: 18px;">Pump</h1>

In [208]:
# Features and target used in the pump analysis

# Independent variables (sensor readings)
X = pump.loc[:, ["temperature", "pressure", "vibration", "humidity"]]

# Dependent variable (fault flag)
y = pump.loc[:, "faulty"]

In [None]:
# IQR rule: a value is an outlier when it lies more than 1.5 * IQR below the
# first quartile or above the third quartile of its column
outliers = {}
for col in X.columns:
  q1 = X[col].quantile(0.25)
  q3 = X[col].quantile(0.75)
  iqr = q3 - q1

  too_low = X[col] < q1 - (iqr * 1.5)
  too_high = X[col] > q3 + (iqr * 1.5)

  # Keep the row labels of the outliers found in this column
  outliers[col] = X[too_low | too_high].index.tolist()

for name, rows in outliers.items():
  print(f"{name}: {len(rows)} outliers = {round(len(rows) / X.shape[0]*100, 2)}%")

In [None]:
# Density curve of every feature, with its median and mean marked
for col in X.columns:
  values = X[col]
  median_value = values.median()
  mean_value = values.mean()

  plt.figure(figsize=(10,4))
  sns.kdeplot(values, fill=True)
  plt.axvline(median_value, color="red", linestyle="dashed",
              label=f"Median: {round(median_value, 2)}")
  plt.axvline(mean_value, color="green", linestyle="dashed",
              label=f"Mean: {round(mean_value, 2)}")
  plt.legend(loc=0)
  plt.show()

In [None]:
# Correlation plot for the PUMP frame
# (this cell used to plot the compressor correlations by mistake)
plt.figure(figsize=(12, 6))
sns.heatmap(pump.corr(), annot=True)
plt.title(
  "Correlation".upper(),
  fontdict={'family':'calibri', 'fontsize':18, 'weight':'bold', 'color':'red'},
  pad=20
)
plt.show()

In [None]:
# Pairwise scatter plots of the features, coloured by the fault flag
# (the flag is turned into text so it is treated as a category)
px.scatter_matrix(
  data_frame=X,
  dimensions=["temperature", "pressure", "vibration", "humidity"],
  color=y.map(str)
)

In [210]:
# Hold out 20% of the pump observations for testing
X_train, X_test, y_train, y_test = train_test_split(
  X,
  y,
  test_size=0.2,
  random_state=42,
)

In [217]:
# Search space explored by the grid search
param_grid = dict(
  criterion=['gini', 'entropy', 'log_loss'],
  splitter=['best', 'random'],
  min_samples_split=[2, 5, 10],
  min_samples_leaf=[1, 2, 5],
  max_depth=[None, 5, 10],
  min_impurity_decrease=[0.00001, 0.0001, 0.001, 0.01, 0.1],
)

# Base estimator
clf = DecisionTreeClassifier(random_state=42)

# 5-fold cross-validated grid search using every available core, fitted on
# the training split
grid_search = GridSearchCV(estimator=clf, param_grid=param_grid, cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Predictions of the refitted best estimator on the held-out split
y_pred = grid_search.predict(X_test)

# Evaluation artefacts reused by the following cells
accuracy = accuracy_score(y_test, y_pred)
conf_matrix_pred = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

In [None]:
# Evaluating the Model
print("Model Accuracy:")
print(f"{round(accuracy * 100,2)}%")
print()
print(f"Best Params: {grid_search.best_params_}")
print()
# The tuned estimator is a DecisionTreeClassifier, so the heading says so
print("Decision Tree Classification Report:")
print(classification_report(y_test, y_pred, zero_division=0))

In [None]:
# Plot Confusion Matrix to evaluate model
font_ticklabels = FontProperties(
  family='calibri', size=10, weight='bold')

# confusion_matrix orders its rows/columns by sorted label, whereas a set has
# no guaranteed iteration order: sort explicitly so ticks match the cells
class_labels = sorted(set(y_train))

sns.heatmap(
  conf_matrix_pred,
  annot=True,
  fmt='d',
  cmap='Blues',
  xticklabels=class_labels,
  yticklabels=class_labels,
  cbar=False # Remove color bar
)

plt.xticks(fontproperties=font_ticklabels)
plt.yticks(fontproperties=font_ticklabels)

plt.xlabel('Predicted', fontdict={'fontsize':14}, labelpad=20)
plt.ylabel('Test', fontdict={'fontsize':14}, labelpad=20)
plt.title('Confusion Matrix', fontdict={'fontsize':16})
plt.show()

In [None]:
# Values of contamination to be tested
contamination_values = [0.000001, 0.00001, 0.0001, 0.001, 0.01, 0.05, 0.1]

# Ground truth as integers (0 = not faulty, 1 = faulty)
y_true_iso = np.asarray(y_test).astype(int)

# Evaluating model performance for each contamination value
for contamination in contamination_values:
  model = IsolationForest(contamination=contamination, random_state=42)
  model.fit(X_train)
  # IsolationForest.predict returns -1 for outliers and +1 for inliers; map
  # that to the dataset encoding (outlier -> 1 = faulty, inlier -> 0) before
  # scoring. A dedicated name is used so the decision-tree `y_pred` computed
  # by the grid-search cell is not overwritten.
  y_pred_iso = np.where(model.predict(X_test) == -1, 1, 0)
  precision = precision_score(y_true_iso, y_pred_iso, average='macro', zero_division=0)
  recall = recall_score(y_true_iso, y_pred_iso, average='macro', zero_division=0)
  print(f'Contamination: {contamination}, Precision: {precision}, Recall: {recall}')

In [None]:
# 3D Scatter Plot of the decision-tree predictions on the test split

# Recompute the tree predictions instead of relying on the notebook-global
# `y_pred`, which other cells (e.g. the IsolationForest sweep) may rebind.
# Both label vectors are cast to int so the 0/1 comparisons below are
# independent of how the labels are stored.
actual = y_test.astype(int)
predicted = pd.Series(grid_search.predict(X_test), index=X_test.index).astype(int)

# (legend name, actual label, predicted label, marker color)
outcomes = [
  ("True Positive", 1, 1, "blue"),
  ("True Negative", 0, 0, "green"),
  ("False Positive", 0, 1, "red"),
  ("False Negative", 1, 0, "orange"),
]

# One trace per outcome, using the first three feature columns as axes
traces = []
for outcome_name, actual_label, predicted_label, color in outcomes:
  subset = X_test[(actual == actual_label) & (predicted == predicted_label)]
  traces.append(
    go.Scatter3d(
      x=subset.iloc[:, 0],
      y=subset.iloc[:, 1],
      z=subset.iloc[:, 2],
      mode="markers",
      marker=dict(size=5, color=color, symbol="circle"),
      name=outcome_name
    )
  )

# Configure graph layout
layout = go.Layout(
  title=dict(
    text="<b>Predict Values</b>",
    font=dict(family="Arial Black", size=30, color="black"),
    x=0.5,
    xanchor="center"
  ),
  scene=dict(
    xaxis_title="Temperature",
    yaxis_title="Pressure",
    zaxis_title="Vibration"
  ),
  legend=dict(
    x=0,
    y=1,
    font=dict(family="Calibri", size=16, color="black"),
    itemsizing="constant" # Increase legend marker size
  ),
  width=1000,
  height=800
)

# Create a figure and add the traces
fig = go.Figure(data=traces, layout=layout)

# Show graph
fig.show()