# <h1 style="font-size: 18px;"> Info</h1>

https://www.kaggle.com/datasets/dnkumars/industrial-equipment-monitoring-dataset?resource=download<br>
<span style="font-size: 16px;">
This dataset contains simulated data representing real-time monitoring of various industrial equipment, including turbines,<br> compressors, and pumps. Each row in the dataset corresponds to a unique observation capturing key parameters such as temperature,<br> pressure, vibration, and humidity. The dataset also includes information about the equipment type, location, and whether the equipment is classified as faulty.
</span>
<br>

<span style="font-size: 16px;">
Column descriptions:<br>
- temperature: Temperature reading at the time of observation (in °C).<br>
- pressure: Pressure reading at the time of observation (in bar).<br>
- vibration: Vibration level reading (normalized units).<br>
- humidity: Humidity percentage recorded at the location of the equipment.<br>
- equipment: Type of industrial equipment being monitored (e.g., Turbine, Compressor, Pump).<br>
- location: Location of the equipment (city name).<br>
- faulty: Binary indicator (0 = Not Faulty, 1 = Faulty) to specify whether the equipment is functioning properly or requires maintenance.<br>
</span>

# <h1 style="font-size: 18px;"> Industrial Analysis</h1>

In [4]:
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px

# Models
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

# Pipeline
from imblearn.pipeline import Pipeline as ImbPipeline

# Metrics
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Preprocessing
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from imblearn.under_sampling import ClusterCentroids
from imblearn.over_sampling import SMOTE

In [None]:
# Load the equipment monitoring dataset. The path is relative, so the CSV
# must be in the notebook's working directory.
df = pd.read_csv("equipment_anomaly_data.csv")

In [65]:
# Create a class to reduce the quantity of code
class Analysis:
  """Exploratory-analysis helpers bound to one dataframe.

  The dataframe must contain the categorical columns ``equipment`` and
  ``location`` and the binary target ``faulty``; every other column is
  treated as a numeric feature (the plots that name columns explicitly use
  ``temperature``, ``pressure``, ``vibration`` and ``humidity``).

  Properties (evaluated on attribute access, written WITHOUT parentheses):
  - proportion: Print the proportion of each categorical column and of the target
  - plot_kde: Plot the density curve of each numeric column
  - plot_boxplot: Plot the box plot of each numeric column
  - outliers: Return a dict with the outlier row labels of each numeric column
  - correlation: Print the correlation between each numeric column and the target

  Methods (must be CALLED, e.g. ``analysis.corr_heatmap()``):
  - corr_heatmap(): Plot a heat map of the correlation matrix
  - logistic_plot(): Scatter plot of each numeric column against the target
  - scatter_plot(): Pair plot of the numeric columns coloured by the target
  - scatter3d(): 3D scatter of temperature/pressure/vibration split by the target
  """

  # Columns that are not numeric features: the two categoricals and the target.
  _NON_FEATURE_COLUMNS = ["equipment", "location", "faulty"]

  def __init__(self, dataframe):
    self.data = dataframe

  def _feature_columns(self):
    """Return the column labels left after dropping the categoricals and the target."""
    return list(self.data.drop(columns=self._NON_FEATURE_COLUMNS).columns)

  # Proportion
  @property
  def proportion(self):
    """Print the normalized value counts of equipment, location and faulty."""
    data = self.data
    for col in ["equipment", "location", "faulty"]:
      print(f"{data[col].value_counts(normalize=True)}\n")

  # Plot Density Curve
  @property
  def plot_kde(self):
    """Plot one density curve per numeric column, marking median and mean.

    Columns:
    - temperature
    - pressure
    - vibration
    - humidity
    """
    data = self.data
    for col in ["temperature", "pressure", "vibration", "humidity"]:
      # Compute each statistic once; it is used for both the line and its label.
      median = data[col].median()
      mean = data[col].mean()

      plt.figure(figsize=(10, 4))
      sns.kdeplot(data[col], fill=True)
      plt.axvline(
        median,
        color="red",
        label=f"Median: {round(median, 2)}",
        linestyle="dashed"
      )
      plt.axvline(
        mean,
        color="green",
        label=f"Mean: {round(mean, 2)}",
        linestyle="dashed"
      )
      plt.legend(loc=0)
      plt.show()

  # Plot Boxplot
  @property
  def plot_boxplot(self):
    """Plot one box plot per numeric column."""
    data = self.data
    for col in self._feature_columns():
      fig = px.box(data, y=col)
      # Only the figure size is set: the 3D ``scene`` axis titles that used to
      # be passed here do not apply to a 2D box plot.
      fig.update_layout(width=1000, height=800)
      fig.show()

  # Quartile (IQR) method.
  @property
  def outliers(self):
    """Return the outliers of each numeric column using the 1.5 * IQR rule.

    A row is an outlier for a column when its value is below
    ``Q1 - 1.5 * IQR`` or above ``Q3 + 1.5 * IQR``.

    Returns:
        dict: column name -> list of index labels of the outlier rows
    """
    data = self.data
    outliers = {}
    for col in self._feature_columns():
      q1 = data[col].quantile(0.25)
      q3 = data[col].quantile(0.75)
      iqr = q3 - q1

      is_outlier = (data[col] < q1 - (iqr * 1.5)) | (data[col] > q3 + (iqr * 1.5))
      outliers[col] = data[is_outlier].index.tolist()

    return outliers

  # Correlation with Faulty
  @property
  def correlation(self):
    """Print the correlation coefficient between each numeric column and faulty."""
    data = self.data
    for col in self._feature_columns():
      # Computed outside the f-string so the code also parses on Python < 3.12.
      coefficient = round(np.corrcoef(data[col], data["faulty"])[0][1], 3)
      print(f"Correlation {col} x faulty: {coefficient}")

  # Correlation HeatMap
  def corr_heatmap(self):
    """Plot a heat map of the correlation between the numeric columns and faulty."""
    # Use this instance's dataframe (not the notebook-level ``df``) so that
    # per-equipment analyses show their own correlations.
    numeric = self.data.drop(columns=["equipment", "location"])

    plt.figure(figsize=(12, 6))
    sns.heatmap(numeric.corr(), annot=True)
    plt.title(
      "Correlation".upper(),
      fontdict={'family':'calibri', 'fontsize':18, 'weight':'bold', 'color':'red'},
      pad=20
    )
    plt.show()

  # Plot Logistic Plot
  def logistic_plot(self):
    """Scatter plot of every numeric column (x) against faulty (y)."""
    data = self.data
    for col in self._feature_columns():
      fig = go.Figure()
      fig.add_trace(go.Scatter(
        x=data[col],
        y=data["faulty"],
        mode='markers',
        marker_color=data["faulty"]
        )
      )
      fig.show()

  def scatter_plot(self):
    """Pair plot of the four sensor columns, coloured by faulty."""
    data = self.data
    sns.pairplot(
      data=data[["temperature","pressure","vibration","humidity", "faulty"]],
      hue="faulty")
    plt.show()

  @staticmethod
  def _scatter3d_trace(subset, color, name):
    """Build one 3D marker trace (temperature, pressure, vibration) for ``subset``."""
    return go.Scatter3d(
      x=subset["temperature"],
      y=subset["pressure"],
      z=subset["vibration"],
      mode="markers",
      marker=dict(
        size=3,
        color=color,
        symbol="circle"
      ),
      name=name
    )

  def scatter3d(self):
    """3D scatter of temperature, pressure and vibration, one trace per faulty value."""
    data = self.data

    # One trace per class so each gets its own colour and legend entry.
    faulty_0 = self._scatter3d_trace(data[data["faulty"] == 0], "blue", "Faulty False")
    faulty_1 = self._scatter3d_trace(data[data["faulty"] == 1], "red", "Faulty True")

    # Configure graph layout
    layout = go.Layout(
      scene=dict(
        xaxis_title="Temperature",
        yaxis_title="Pressure",
        zaxis_title="Vibration"
      ),
      legend=dict(
        x=0,
        y=1
      ),
      width=1000,
      height=800
    )

    # Create a figure and add the traces
    fig = go.Figure(
      data=[faulty_0, faulty_1],
      layout=layout)

    fig.update_layout(
      title={
        'text':"<b>Equipment Anomaly</b>",
        'font': {
          'family':'Arial Black',
          'size': 30,
          'color': 'black'
        },
        'x': 0.5,
        'xanchor': 'center'
      },
      legend={
        'font': {
          'family':'Calibri',
          'size': 16,
          'color': 'black'
      },
        'itemsizing':'constant'} # Increase legend marker size
    )
    # Show graph
    fig.show()

In [1]:
# Create a class to reduce the quantity of code
class TrainedModel:
  """Split the data, grid-search a classifier pipeline and store the evaluation.

  All the work happens in ``__init__``: creating an instance runs the
  train/test split and the full grid search, which can take a while.

  The pipeline is StandardScaler -> SMOTE -> ClusterCentroids -> classifier,
  and the grid search chooses among KNeighborsClassifier,
  RandomForestClassifier and LogisticRegression.

  Args:
    dataframe: Data with the ``equipment``, ``location``, ``humidity`` and
      ``faulty`` columns; the remaining columns are used as features.
    random_state: Seed shared by the split, the resamplers and the
      stochastic classifiers so that results are reproducible.

  Attributes:
  - X_train: 80% of the independent variables
  - X_test: 20% of the independent variables
  - Y_train: 80% of the dependent variable
  - Y_test: 20% of the dependent variable
  - model: Fitted GridSearchCV (predicts with the best pipeline found)
  - y_pred: Predicted values for X_test
  - accuracy: Accuracy on the test split
  - conf_matrix: 2x2 confusion matrix of the predicted values (labels 0, 1)
  - class_report: Text report that shows the main metrics
  - plot_conf_matrix(): Plot the confusion matrix
  - scatter_3d_test_data(): Plot a 3D scatter of the predicted values
  """
  def __init__(self, dataframe, random_state=42):
    self.data = dataframe
    self.random_state = random_state
    self.X_train, self.X_test, self.Y_train, self.Y_test = self.__train_data
    (
      self.model,
      self.y_pred,
      self.accuracy,
      self.conf_matrix,
      self.class_report,
    ) = self.__best_model

  @property
  def __train_data(self):
    """Return the 80/20 train/test split as ``(X_train, X_test, y_train, y_test)``."""
    data = self.data
    # Features are whatever is left after dropping the categoricals, the
    # target and humidity.
    features = data.drop(columns=["equipment", "location", "faulty", "humidity"]).values
    target = data["faulty"].values

    return train_test_split(
      features,
      target,
      test_size=0.2,
      random_state=self.random_state
    )

  @property
  def __best_model(self):
    """Grid-search the pipeline on the training split and evaluate on the test split.

    Returns:
      5 variables:
        - Fitted GridSearchCV
        - Prediction for X_test
        - Accuracy
        - Confusion Matrix
        - Classification Report
    """
    seed = self.random_state

    # Define the pipeline. The resampling steps come from imblearn's Pipeline.
    # NOTE(review): SMOTE runs before ClusterCentroids, so the classes are
    # presumably already balanced when the undersampler sees them - confirm
    # that the second step is still wanted.
    pipeline = ImbPipeline([
      ('scaler', StandardScaler()),
      ('sampling', SMOTE(random_state=seed)),  # SMOTE oversampling
      ('cluster_under', ClusterCentroids(random_state=seed)),  # Cluster Centroids undersampling
      ('classifier', KNeighborsClassifier())
    ])

    # Hyperparameters for the grid search: one sub-grid per candidate classifier.
    param_grid = [
      {
        'classifier': [KNeighborsClassifier()],
        'classifier__n_neighbors': [3, 5, 7]
      },
      {
        'classifier': [RandomForestClassifier(random_state=seed)],
        'classifier__n_estimators': [50, 100, 200],
        'classifier__max_depth': [None, 10, 20]
      },
      {
        'classifier': [LogisticRegression(max_iter=1000, random_state=seed)],
        'classifier__C': [0.1, 1, 10],
        'classifier__solver': ['liblinear', 'saga']
      }
    ]

    # Run the grid search
    grid_search = GridSearchCV(pipeline, param_grid, cv=5, n_jobs=-1)
    grid_search.fit(self.X_train, self.Y_train)

    y_test = self.Y_test
    y_pred = grid_search.predict(self.X_test)
    accuracy = accuracy_score(y_test, y_pred)
    # Fix the label order so the matrix is always 2x2, matching the tick
    # labels used in plot_conf_matrix even if a class is absent from the split.
    conf_matrix = confusion_matrix(y_test, y_pred, labels=[0, 1])
    class_report = classification_report(y_test, y_pred)

    return grid_search, y_pred, accuracy, conf_matrix, class_report

  # Plotting the Confusion Matrix
  def plot_conf_matrix(self):
    """Plot ``self.conf_matrix`` as an annotated heat map (rows: actual, columns: predicted)."""
    plt.figure(figsize=(10, 7))
    sns.heatmap(
      self.conf_matrix,
      annot=True,
      fmt='d',
      cmap='Blues',
      xticklabels=[0,1],
      yticklabels=[0,1]
    )

    plt.xlabel('Predicted', fontdict={'fontsize':14}, labelpad=20)
    plt.ylabel('Actual', fontdict={'fontsize':14}, labelpad=20)
    plt.title("Confusion Matrix", fontdict={'fontsize':16})
    plt.show()

  @staticmethod
  def _prediction_trace(points, color, name):
    """Build one 3D marker trace from the first three columns of ``points``."""
    return go.Scatter3d(
      x=points[:, 0],
      y=points[:, 1],
      z=points[:, 2],
      mode="markers",
      marker=dict(
        size=5,
        color=color,
        symbol="circle"
      ),
      name=name
    )

  def scatter_3d_test_data(self):
    """Plot the test points in 3D, coloured by prediction outcome.

    Uses ``self.X_test``, ``self.Y_test`` and ``self.y_pred``. The first three
    feature columns are plotted; the axis titles assume they are temperature,
    pressure and vibration, in that order (this depends on the column order
    of the input dataframe).
    """
    X_test = self.X_test
    y_test = self.Y_test
    y_pred = self.y_pred

    # Split the test points by (actual, predicted) outcome.
    hits = X_test[(y_test == 1) & (y_pred == 1)]          # true positives
    rejections = X_test[(y_test == 0) & (y_pred == 0)]    # true negatives
    false_alarms = X_test[(y_test == 0) & (y_pred == 1)]  # false positives
    misses = X_test[(y_test == 1) & (y_pred == 0)]        # false negatives

    traces = [
      self._prediction_trace(hits, "blue", "True Positive"),
      self._prediction_trace(rejections, "green", "True Negative"),
      self._prediction_trace(false_alarms, "red", "False Positive"),
      self._prediction_trace(misses, "orange", "False Negative"),
    ]

    # Configure graph layout
    layout = go.Layout(
      scene=dict(
        xaxis_title="Temperature",
        yaxis_title="Pressure",
        zaxis_title="Vibration"
      ),
      legend=dict(
        x=0,
        y=1
      ),
      width=1000,
      height=800
    )

    # Create a figure and add the traces
    fig = go.Figure(
      data=traces,
      layout=layout)

    fig.update_layout(
      title={
        'text':"<b>Predict Values</b>",
        'font': {
          'family':'Arial Black',
          'size': 30,
          'color': 'black'
        },
        'x': 0.5,
        'xanchor': 'center'
      },
      legend={
        'font': {
          'family':'Calibri',
          'size': 16,
          'color': 'black'
      },
        'itemsizing':'constant'} # Increase legend marker size
    )
    # Show graph
    fig.show()

In [28]:
# Print scatter 3d text structure
# https://miabellaai.net/index.html to visualize online
def scatter3d_text(dataframe):
  """Print the dataframe in the text format of the online 3D scatter viewer.

  The output starts with a two-line header and then has one line per row:
  ``#<index + 1>::<temperature>::<pressure>::<vibration>::<faulty>::10::A::1::0::0::0::0;``
  with the three readings rounded to 5 decimals and ``faulty`` printed
  without decimals.

  Args:
    dataframe: Data with ``temperature``, ``pressure``, ``vibration`` and
      ``faulty`` columns and an integer index (``idx + 1`` is computed).
  """
  header = "Interactive 3D Scatter Plot;\n::temperature::pressure::vibration;\n"

  # Collect the lines and join once instead of growing a string in the loop.
  lines = []
  for idx, row in dataframe.iterrows():
    temperature = round(row["temperature"], 5)
    pressure = round(row["pressure"], 5)
    vibration = round(row["vibration"], 5)
    faulty = row["faulty"]
    # Values are bound to names first so the f-string has no nested quotes
    # and also parses on Python < 3.12.
    lines.append(
      f"#{idx + 1}::{temperature}::{pressure}::{vibration}::{faulty:.0f}::10::A::1::0::0::0::0;\n"
    )

  print(header + "".join(lines))

# <h1 style="font-size:20px">All Equipment</h1>

## <h1 style="font-size:20px">Analysis</h1>

In [None]:
# Stats
df.describe()

In [None]:
# Presence null values
df.info()

In [66]:
analysis = Analysis(df)

In [None]:
analysis.scatter_plot()

In [None]:
analysis.plot_kde

In [None]:
analysis.plot_boxplot

In [None]:
analysis.proportion

In [None]:
analysis.outliers["temperature"]

In [None]:
analysis.correlation

In [None]:
# corr_heatmap is a regular method (not a property), so it must be called.
analysis.corr_heatmap()

In [None]:
# logistic_plot is a regular method (not a property), so it must be called.
analysis.logistic_plot()

In [None]:
analysis.scatter3d()

In [None]:
# https://miabellaai.net/index.html to visualize online
scatter3d_text(df)

## <h1 style="font-size:20px">Machine Learning</h1>

In [None]:
all_equip = TrainedModel(df)

In [None]:
print(f"Accuracy: {round(all_equip.accuracy * 100, 2)}%")

In [None]:
print("Classification Report".upper().center(50))
print(all_equip.class_report)

In [None]:
all_equip.plot_conf_matrix()

In [None]:
all_equip.scatter_3d_test_data()

# <h1 style="font-size:20px">Each equipment</h1>

In [20]:
# One sub-dataframe per equipment type, selected by the "equipment" column.
turbine, compressor, pump = (
  df[df["equipment"] == kind] for kind in ("Turbine", "Compressor", "Pump")
)

## <h1 style="font-size:20px">Turbine</h1>

### <h1 style="font-size:20px">Analysis</h1>

In [None]:
turbine.describe()

In [30]:
turb_analysis = Analysis(turbine)

In [None]:
turb_analysis.plot_kde

In [None]:
turb_analysis.plot_boxplot

In [None]:
turb_analysis.proportion

In [None]:
turb_analysis.outliers["temperature"]

In [None]:
turb_analysis.correlation

In [None]:
# corr_heatmap is a regular method (not a property), so it must be called.
turb_analysis.corr_heatmap()

In [None]:
# logistic_plot is a regular method (not a property), so it must be called.
turb_analysis.logistic_plot()

### <h1 style="font-size:20px">Machine Learning</h1>

In [38]:
turbine_model = TrainedModel(turbine)

In [None]:
print(f"Accuracy: {turbine_model.accuracy:.3f}")
print()

# Precision has the best accuracy
print("Classification Report:".upper().center(50))
print(turbine_model.class_report)

In [None]:
turbine_model.plot_conf_matrix()

In [None]:
turbine_model.scatter_3d_test_data()

## <h1 style="font-size:20px">Compressor</h1>

### <h1 style="font-size:20px">Analysis</h1>

In [None]:
compressor.describe()

In [43]:
compr_analysis = Analysis(compressor)

In [None]:
compr_analysis.plot_kde

In [None]:
compr_analysis.plot_boxplot

In [None]:
compr_analysis.proportion

In [None]:
compr_analysis.outliers["temperature"]

In [None]:
compr_analysis.correlation

In [None]:
# corr_heatmap is a regular method (not a property), so it must be called.
compr_analysis.corr_heatmap()

In [None]:
# logistic_plot is a regular method (not a property), so it must be called.
compr_analysis.logistic_plot()

### <h1 style="font-size:20px">Machine Learning</h1>

In [51]:
compressor_model = TrainedModel(compressor)

In [None]:
print(f"Accuracy: {compressor_model.accuracy:.3f}")
print()

# Precision has the best accuracy
print("Classification Report:")
print(compressor_model.class_report)

In [None]:
compressor_model.plot_conf_matrix()

In [None]:
compressor_model.scatter_3d_test_data()

## <h1 style="font-size:20px">Pump</h1>

### <h1 style="font-size:20px">Analysis</h1>

In [None]:
pump.describe()

In [56]:
pump_analysis = Analysis(pump)

In [None]:
pump_analysis.plot_kde

In [None]:
pump_analysis.plot_boxplot

In [None]:
pump_analysis.proportion

In [None]:
pump_analysis.outliers["temperature"]

In [None]:
pump_analysis.correlation

In [None]:
# corr_heatmap is a regular method (not a property), so it must be called.
pump_analysis.corr_heatmap()

In [None]:
# logistic_plot is a regular method (not a property), so it must be called.
pump_analysis.logistic_plot()

### <h1 style="font-size:20px">Machine Learning</h1>

In [21]:
pump_model = TrainedModel(pump)

In [None]:
print(f"Accuracy: {(pump_model.accuracy * 100):.2f}%")

In [None]:
# Precision has the best accuracy
print("Classification Report:".upper().center(50))
print(pump_model.class_report)

In [None]:
pump_model.plot_conf_matrix()

In [None]:
pump_model.scatter_3d_test_data()