In [1]:
from sklearn.model_selection import train_test_split, GridSearchCV
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import ClusterCentroids
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
import pandas as pd
import numpy as np
from scipy.stats import norm
from tabpy.tabpy_tools.client import Client

In [None]:
# Deploy to Tableau
def deploy_tableau(title, function, description, endpoint='http://localhost:9004/'):
  """Deploy a Python function to a TabPy server so Tableau can call it.

  Args:
      title (str): Name under which the function is deployed.
      function (callable): The function to deploy.
      description (str): Description stored alongside the deployment.
      endpoint (str): URL of the TabPy server. Defaults to the local
          instance on port 9004, which is the address that used to be
          hard-coded here.
  """
  client = Client(endpoint)
  # override=True asks TabPy to replace an existing deployment with the same
  # name, so re-running the notebook re-deploys instead of failing.
  client.deploy(
    title,
    function,
    description,
    override=True
  )

In [2]:
# Read the equipment anomaly dataset (path is relative to the working directory)
df = pd.read_csv(filepath_or_buffer="equipment_anomaly_data.csv")

In [3]:
# Separate the features (X) from the target (Y). The "equipment" and
# "location" columns and the label itself are left out of the feature matrix;
# every remaining column becomes a feature, in its original order.
X = df.drop(["equipment", "location", "faulty"], axis="columns").values
Y = df.loc[:, "faulty"].values

In [4]:
# Split train and test data (80% train / 20% test, fixed seed).
# Stratify on the label so both splits keep the same faulty / non-faulty
# ratio; with an imbalanced target an unstratified split can leave the test
# set with too few positive examples to evaluate on.
X_train, X_test, y_train, y_test = train_test_split(
  X,
  Y,
  test_size=0.2,
  random_state=42,
  stratify=Y
)

In [5]:
# Define pipeline: scale -> oversample -> undersample -> classify.
# Both resamplers are seeded so the generated samples (and therefore every
# cross-validation score computed downstream) are reproducible across runs.
# NOTE(review): with their default sampling strategies SMOTE presumably
# already balances the classes, which would leave ClusterCentroids little to
# do -- confirm whether both resampling steps are intended.
pipeline = ImbPipeline([
    ('scaler', StandardScaler()),
    ('sampling', SMOTE(random_state=42)),  # Apply SMOTE for oversampling
    ('cluster_under', ClusterCentroids(random_state=42)),  # Apply Cluster Centroids for undersampling
    ('classifier', KNeighborsClassifier())  # Default estimator for the 'classifier' step
])

In [6]:
# Define hyperparameters for GridSearch.
# Each dict is an independent sub-grid: it sets the pipeline's 'classifier'
# step to one estimator and lists the values to try for that estimator's own
# parameters. The stochastic estimators are seeded so repeated searches give
# the same scores and the same selected model.
param_grid = [
  {
      'classifier': [KNeighborsClassifier()],
      'classifier__n_neighbors': [3, 5, 7]
  },
  {
      'classifier': [RandomForestClassifier(random_state=42)],
      'classifier__n_estimators': [50, 100, 200],
      'classifier__max_depth': [None, 10, 20]
  },
  {
      # random_state matters here because the 'liblinear' and 'saga' solvers
      # shuffle the data internally.
      'classifier': [LogisticRegression(max_iter=1000, random_state=42)],
      'classifier__C': [0.1, 1, 10],
      'classifier__solver': ['liblinear', 'saga']
  }
]

In [None]:
# Execute GridSearch and Train Model.
# Candidates are ranked by balanced accuracy (mean per-class recall) rather
# than the default plain accuracy: on an imbalanced label, plain accuracy
# rewards a model that mostly predicts the majority class.
grid_search = GridSearchCV(
  pipeline,
  param_grid,
  scoring='balanced_accuracy',
  cv=5,      # 5-fold cross-validation on the training split
  n_jobs=-1  # use all available CPU cores
)
grid_search.fit(X_train, y_train)

In [9]:
# Function to calculate the probability to classify given values as 'Faulty'
def prob_faulty(_arg1,_arg2,_arg3,_arg4):
  """Calculate the probability of classifying the given readings as 'Faulty'.

  Each argument is either a single value or a list. When lists are passed,
  the i-th entries of the four arguments together form one observation, and
  one probability is returned per observation.

  Args:
      _arg1 (float, list): Temperature
      _arg2 (float, list): Pressure
      _arg3 (float, list): Vibration
      _arg4 (float, list): Humidity

  Returns:
      float or list of float: a single float when exactly one observation is
      given, otherwise a list with one probability per observation, in input
      order.
  """
  # One row per observation, one column per argument.
  # NOTE(review): the column order must match the feature order grid_search
  # was fitted with -- confirm against the training columns.
  features = np.column_stack([_arg1, _arg2, _arg3, _arg4])
  # Column 1 of predict_proba belongs to the second entry of the fitted
  # model's classes_.
  # NOTE(review): assumed to be the 'faulty' label -- confirm.
  # Take that column for EVERY row; indexing with [0][1] returned only the
  # first observation's probability, whatever the number of rows passed in.
  probabilities = grid_search.predict_proba(features)[:, 1].tolist()
  if len(probabilities) == 1:
    # Keep returning a bare float for single-observation calls.
    return probabilities[0]
  return probabilities

In [11]:
# Deploy prob_faulty to Tableau under the name "Probability_Faulty"
deploy_tableau(
  title="Probability_Faulty",
  function=prob_faulty,
  description="Return the probability to classify given values as Faulty",
)

In [13]:
def norm_cdf(_arg1, _arg2, _arg3):
  """Evaluate the normal Cumulative Distribution Function (CDF).

  Args:
      _arg1 (float): Pressure value at which the CDF is evaluated
      _arg2 (float): Average Pressure (mean of the distribution)
      _arg3 (float): Std Pressure (standard deviation of the distribution)

  Returns:
      The CDF value(s), converted to plain Python types with tolist().
  """
  distribution = {"loc": _arg2, "scale": _arg3}
  cumulative = norm.cdf(_arg1, **distribution)
  return cumulative.tolist()

In [15]:
# Deploy norm_cdf to Tableau under the name 'Probability_CDF'
deploy_tableau(
  title='Probability_CDF',
  function=norm_cdf,
  description='Returns the probability CDF',
)

In [16]:
def norm_range(_arg1, _arg2, _arg3, _arg4):
  """Return the normal probability mass between a lower and an upper value.

  Computed as CDF(upper) - CDF(lower) for a normal distribution with the
  given mean and standard deviation.

  Args:
      _arg1 (float): Lower Pressure value
      _arg2 (float): Upper Pressure value
      _arg3 (float): Average Pressure (mean of the distribution)
      _arg4 (float): Std Pressure (standard deviation of the distribution)

  Returns:
      The probability value(s), converted to plain Python types with tolist().
  """
  def cdf_at(value):
    # Both bounds are evaluated against the same distribution.
    return norm.cdf(value, loc=_arg3, scale=_arg4)

  below_lower = cdf_at(_arg1)
  below_upper = cdf_at(_arg2)
  return (below_upper - below_lower).tolist()

In [18]:
def norm_pdf(_arg1, _arg2, _arg3):
  """Evaluate the normal Probability Density Function (PDF).

  Args:
      _arg1 (float): Pressure value at which the PDF is evaluated
      _arg2 (float): Average Pressure (mean of the distribution)
      _arg3 (float): Std Pressure (standard deviation of the distribution)

  Returns:
      The density value(s), converted to plain Python types with tolist().
  """
  distribution = {"loc": _arg2, "scale": _arg3}
  density = norm.pdf(_arg1, **distribution)
  return density.tolist()