In [2]:
# Import Libraries
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, roc_curve, classification_report, precision_recall_curve, auc, accuracy_score, precision_score, recall_score, f1_score
import ipywidgets as widgets
from ipywidgets import interact
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import StratifiedShuffleSplit, GridSearchCV
from sklearn.preprocessing import LabelEncoder
import warnings
import shap
import numpy as np
from art.attacks.poisoning import PoisoningAttackBackdoor, FeatureCollisionAttack
from art.estimators.classification import SklearnClassifier
from art.utils import to_categorical

# Use seaborn's "whitegrid" style for every figure drawn later in the notebook
sns.set(style="whitegrid")
# Silence all warnings for the rest of the session.
# NOTE(review): this also hides deprecation and convergence warnings raised by the
# models trained below - consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

In [3]:
# Read the preprocessed dataset from disk
data = pd.read_csv('Preprocessed_Data.csv')

# Bracketed headers -> identifier-style names without special characters
column_renames = {
    'Air temperature [K]': 'Air_temperature_K',
    'Process temperature [K]': 'Process_temperature_K',
    'Rotational speed [rpm]': 'Rotational_speed_rpm',
    'Torque [Nm]': 'Torque_Nm',
    'Tool wear [min]': 'Tool_wear_min',
}
data = data.rename(columns=column_renames)

# Feature matrix (six predictor columns) and binary target
feature_columns = [
    'Type',
    'Air_temperature_K',
    'Process_temperature_K',
    'Rotational_speed_rpm',
    'Torque_Nm',
    'Tool_wear_min',
]
X = data[feature_columns]
y = data['Machine failure']

In [4]:
# Single stratified 70/30 train/test split with a fixed seed
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.3, random_state=42)

# n_splits=1, so the splitter yields exactly one (train, test) pair of positional indices
train_index, test_index = next(sss.split(X, y))
X_train = X.iloc[train_index]
X_test = X.iloc[test_index]
y_train = y.iloc[train_index]
y_test = y.iloc[test_index]

# Oversample the training portion only; the test set is left untouched.
# NOTE(review): SMOTE interpolates every feature numerically - if 'Type' is an encoded
# categorical column, confirm that interpolated values are acceptable here.
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

In [5]:
# Classifiers under test, each configured with its chosen hyperparameters.
# Insertion order is kept (Random Forest, XGBoost, Neural Network) because the
# experiment iterates over this dict and reports rows in that order.
random_forest = RandomForestClassifier(
    n_estimators=350,
    max_depth=20,
    min_samples_split=2,
    random_state=42,
)

xgboost_classifier = XGBClassifier(
    n_estimators=350,
    max_depth=7,
    learning_rate=0.2,
    subsample=0.9,
    random_state=42,
)

neural_network = MLPClassifier(
    hidden_layer_sizes=(50, 50),
    activation='tanh',
    solver='adam',
    max_iter=400,
    random_state=42,
)

models = {
    'Random Forest': random_forest,
    'XGBoost': xgboost_classifier,
    'Neural Network': neural_network,
}

In [6]:
# Define the poisoning function for binary classification (flipping 0 to 1 or 1 to 0)
def label_flip_poisoning(X_train, y_train, poison_percentage, target_class, random_state=None):
    """Flip the labels of a random fraction of one class in a binary label vector.

    Parameters
    ----------
    X_train : any
        Training features. Returned unchanged (same object); only labels are poisoned.
    y_train : pandas.Series or numpy.ndarray
        Binary labels (0/1). Never modified; a poisoned copy is returned.
    poison_percentage : float
        Fraction, in [0, 1], of the samples of ``target_class`` whose label is flipped.
        The count is truncated towards zero.
    target_class : int
        Class (0 or 1) whose labels are flipped to ``1 - target_class``.
    random_state : int or None, optional
        Seed for a private RNG that makes the selection reproducible. With ``None``
        (the default) NumPy's global RNG is used, as before.

    Returns
    -------
    tuple
        ``(X_train, y_train_poisoned)``.

    Raises
    ------
    ValueError
        If ``poison_percentage`` is outside [0, 1].
    """
    if not 0 <= poison_percentage <= 1:
        raise ValueError(
            f"poison_percentage must be between 0 and 1, got {poison_percentage}"
        )

    # Work on a copy so the caller's labels stay clean
    y_train_poisoned = y_train.copy()

    # Positions (not index labels) of the samples that belong to the target class
    class_indices = np.flatnonzero(np.asarray(y_train) == target_class)

    # Determine the number of labels to flip based on the poison percentage
    num_to_flip = int(poison_percentage * len(class_indices))
    if num_to_flip == 0:
        # Nothing to flip (0% requested, or the class is absent/too small)
        return X_train, y_train_poisoned

    # Randomly select positions to flip, without replacement
    sampler = np.random if random_state is None else np.random.RandomState(random_state)
    flip_indices = sampler.choice(class_indices, size=num_to_flip, replace=False)

    flipped_label = 1 - target_class
    if hasattr(y_train_poisoned, 'iloc'):
        # pandas: plain [] assignment is label-based on integer indexes, which would
        # hit the wrong rows (or raise KeyError) unless the index is 0..n-1
        y_train_poisoned.iloc[flip_indices] = flipped_label
    else:
        y_train_poisoned[flip_indices] = flipped_label

    return X_train, y_train_poisoned

In [7]:
# Define the poisoning percentages to test (0 is the clean baseline)
poison_percentages = [0, 0.1, 0.2, 0.3, 0.4, 0.5]
target_classes = [0, 1]  # 0 for no failure, 1 for failure

# label_flip_poisoning draws from NumPy's global RNG; seed it so the tables below
# are identical on every Restart & Run All
np.random.seed(42)


def _percent_label(poison_percentage):
    """Return the results-table column header for a poisoning fraction, e.g. 0.1 -> '10%'."""
    # round() instead of int() so float noise (e.g. 0.29 * 100 == 28.999...) cannot
    # produce an off-by-one header
    return f'{round(poison_percentage * 100)}%'


def _weighted_scores(y_true, y_pred):
    """Return accuracy and support-weighted precision/recall/F1 as a dict keyed by metric."""
    # NOTE: support-weighted recall is algebraically equal to accuracy, which is why
    # the Recall table always matches the Accuracy table.
    return {
        'accuracy': accuracy_score(y_true, y_pred),
        'precision': precision_score(y_true, y_pred, average='weighted'),
        'recall': recall_score(y_true, y_pred, average='weighted'),
        'f1': f1_score(y_true, y_pred, average='weighted'),
    }


percent_labels = [_percent_label(p) for p in poison_percentages]
metric_names = ('accuracy', 'precision', 'recall', 'f1')

# One column-oriented results dict per metric: 'Model', 'Target', then one list per
# poisoning percentage
results_by_metric = {
    metric: {'Model': [], 'Target': [], **{label: [] for label in percent_labels}}
    for metric in metric_names
}
accuracy_results = results_by_metric['accuracy']
precision_results = results_by_metric['precision']
recall_results = results_by_metric['recall']
f1_results = results_by_metric['f1']

# Train and evaluate each model for both target classes
for name, model in models.items():
    print(f'Model: {name}')

    # The clean (0% poisoning) baseline does not depend on the target class,
    # so fit and score it once per model
    model.fit(X_train_res, y_train_res)
    clean_scores = _weighted_scores(y_test, model.predict(X_test))

    # Loop through each target class (0 and 1)
    for target_class in target_classes:
        print(f'Target Class: {target_class}')

        for metric in metric_names:
            results_by_metric[metric]['Model'].append(name)
            results_by_metric[metric]['Target'].append(target_class)

        for poison_percentage in poison_percentages:
            if poison_percentage == 0:
                scores = clean_scores
            else:
                print(f'Poison percentage: {poison_percentage}')

                # Flip a fraction of the current target class's training labels
                X_train_poisoned, y_train_poisoned = label_flip_poisoning(
                    X_train_res, y_train_res, poison_percentage, target_class
                )

                # Retrain on the poisoned labels, then score on the clean test set
                model.fit(X_train_poisoned, y_train_poisoned)
                scores = _weighted_scores(y_test, model.predict(X_test))

            label = _percent_label(poison_percentage)
            for metric in metric_names:
                results_by_metric[metric][label].append(scores[metric])

# Convert the results dictionaries to DataFrames
accuracy_df = pd.DataFrame(accuracy_results)
precision_df = pd.DataFrame(precision_results)
recall_df = pd.DataFrame(recall_results)
f1_df = pd.DataFrame(f1_results)

# Display the results
for heading, frame in (
    ("Accuracy Results:", accuracy_df),
    ("Precision Results:", precision_df),
    ("Recall Results:", recall_df),
    ("F1 Score Results:", f1_df),
):
    print(heading)
    display(frame)


Model: Random Forest
Target Class: 0
Poison percentage: 0.1
Poison percentage: 0.2
Poison percentage: 0.3
Poison percentage: 0.4
Poison percentage: 0.5
Target Class: 1
Poison percentage: 0.1
Poison percentage: 0.2
Poison percentage: 0.3
Poison percentage: 0.4
Poison percentage: 0.5
Model: XGBoost
Target Class: 0
Poison percentage: 0.1
Poison percentage: 0.2
Poison percentage: 0.3
Poison percentage: 0.4
Poison percentage: 0.5
Target Class: 1
Poison percentage: 0.1
Poison percentage: 0.2
Poison percentage: 0.3
Poison percentage: 0.4
Poison percentage: 0.5
Model: Neural Network
Target Class: 0
Poison percentage: 0.1
Poison percentage: 0.2
Poison percentage: 0.3
Poison percentage: 0.4
Poison percentage: 0.5
Target Class: 1
Poison percentage: 0.1
Poison percentage: 0.2
Poison percentage: 0.3
Poison percentage: 0.4
Poison percentage: 0.5
Accuracy Results:


Unnamed: 0,Model,Target,0%,10%,20%,30%,40%,50%
0,Random Forest,0,0.964,0.955333,0.944667,0.899333,0.794,0.467333
1,Random Forest,1,0.964,0.97,0.970667,0.975667,0.976333,0.968333
2,XGBoost,0,0.976667,0.955333,0.888333,0.812333,0.661,0.491333
3,XGBoost,1,0.976667,0.974,0.974333,0.971,0.969333,0.971667
4,Neural Network,0,0.97,0.940333,0.915667,0.861667,0.761333,0.576667
5,Neural Network,1,0.97,0.966667,0.969,0.968667,0.971,0.974667


Precision Results:


Unnamed: 0,Model,Target,0%,10%,20%,30%,40%,50%
0,Random Forest,0,0.972834,0.97223,0.970101,0.968803,0.967627,0.965015
1,Random Forest,1,0.972834,0.974135,0.97208,0.973366,0.97346,0.961484
2,XGBoost,0,0.979587,0.975541,0.968928,0.967192,0.963765,0.963122
3,XGBoost,1,0.979587,0.974753,0.972842,0.966921,0.962228,0.968259
4,Neural Network,0,0.975675,0.970566,0.97009,0.96886,0.966938,0.965473
5,Neural Network,1,0.975675,0.972446,0.971544,0.971696,0.968521,0.970856


Recall Results:


Unnamed: 0,Model,Target,0%,10%,20%,30%,40%,50%
0,Random Forest,0,0.964,0.955333,0.944667,0.899333,0.794,0.467333
1,Random Forest,1,0.964,0.97,0.970667,0.975667,0.976333,0.968333
2,XGBoost,0,0.976667,0.955333,0.888333,0.812333,0.661,0.491333
3,XGBoost,1,0.976667,0.974,0.974333,0.971,0.969333,0.971667
4,Neural Network,0,0.97,0.940333,0.915667,0.861667,0.761333,0.576667
5,Neural Network,1,0.97,0.966667,0.969,0.968667,0.971,0.974667


F1 Score Results:


Unnamed: 0,Model,Target,0%,10%,20%,30%,40%,50%
0,Random Forest,0,0.967493,0.96172,0.954329,0.925688,0.859013,0.602699
1,Random Forest,1,0.967493,0.971738,0.971327,0.974169,0.972287,0.956794
2,XGBoost,0,0.977846,0.962554,0.918884,0.870871,0.766495,0.625667
3,XGBoost,1,0.977846,0.974358,0.973489,0.968284,0.963102,0.963263
4,Neural Network,0,0.972256,0.95169,0.936079,0.90227,0.837352,0.700043
5,Neural Network,1,0.972256,0.969063,0.970147,0.970011,0.969558,0.970662


In [8]:
# Function to plot a metric for all models across poisoning percentages
def plot_metric(metric_df, metric_name, target_class):
    """Draw one line per model showing a metric against the poisoning percentage.

    Parameters
    ----------
    metric_df : pandas.DataFrame
        Results table whose first two columns are 'Model' and 'Target' and whose
        remaining column headers are percentages such as '10%'.
    metric_name : str
        Used for the plot title and the y-axis label.
    target_class : int
        Only rows whose 'Target' equals this value are plotted.
    """
    fig, ax = plt.subplots(figsize=(10, 6))

    # Keep only the rows for the requested target class
    rows_for_target = metric_df[metric_df['Target'] == target_class]

    # Everything after 'Model' and 'Target' is a percentage column; '10%' -> 10
    x_values = [int(header.strip('%')) for header in rows_for_target.columns[2:]]

    # One line per model, in table order
    for position, model_name in enumerate(rows_for_target['Model']):
        y_values = rows_for_target.iloc[position, 2:].values
        ax.plot(x_values, y_values, label=model_name, marker='o')

    ax.set_title(
        f'{metric_name} Across Poisoning Percentages (Target Class: {target_class})',
        fontsize=16,
    )
    ax.set_xlabel('Poisoning Percentage (%)', fontsize=12)
    ax.set_ylabel(metric_name, fontsize=12)

    # Legend sits outside the axes, to the right
    ax.legend(title='Model', bbox_to_anchor=(1.05, 1), loc='upper left')

    fig.tight_layout()
    plt.show()

# Define a function that will update the plot based on user input
def interactive_plot(metric_name, target_class):
    """Plot the results table matching ``metric_name`` for the given target class.

    Unrecognised metric names are ignored (nothing is drawn).
    """
    # Lambdas defer the lookup so only the selected table has to exist at call time
    table_for_metric = {
        'Accuracy': lambda: accuracy_df,
        'Precision': lambda: precision_df,
        'Recall': lambda: recall_df,
        'F1 Score': lambda: f1_df,
    }

    get_table = table_for_metric.get(metric_name)
    if get_table is None:
        return

    # The plot title is the metric name itself
    plot_metric(get_table(), metric_name, target_class)

# Dropdown that selects which metric table to plot
metric_choices = ['Accuracy', 'Precision', 'Recall', 'F1 Score']
metric_dropdown = widgets.Dropdown(
    description='Metric:',
    options=metric_choices,
    value='Accuracy',
    disabled=False,
)

# Dropdown that selects the poisoned target class (0 or 1)
target_dropdown = widgets.Dropdown(
    description='Target Class:',
    options=[0, 1],
    value=0,
    disabled=False,
)

# Re-draw the plot whenever either dropdown changes
interact(
    interactive_plot,
    metric_name=metric_dropdown,
    target_class=target_dropdown,
)


interactive(children=(Dropdown(description='Metric:', options=('Accuracy', 'Precision', 'Recall', 'F1 Score'),…

<function __main__.interactive_plot(metric_name, target_class)>

In [9]:
# Count the failure-labelled samples in the resampled training labels
is_failure = y_train_res == 1
num_failures_resampled = int(is_failure.sum())

# Size of the resampled training set
total_resampled = len(y_train_res)

# Share of failures, expressed as a percentage
failure_percentage_resampled = (num_failures_resampled / total_resampled) * 100

# Report both figures
print(f"Number of failures in resampled data: {num_failures_resampled}")
print(f"Percentage of failures in resampled data: {failure_percentage_resampled:.2f}%")

Number of failures in resampled data: 6763
Percentage of failures in resampled data: 50.00%


In [10]:
# Per-class sample counts in the held-out test labels
test_class_counts = y_test.value_counts()
print(f"Test set class distribution: {test_class_counts}")

Test set class distribution: Machine failure
0    2898
1     102
Name: count, dtype: int64
