# Multi-Label Defect Prediction using Perceptron

This notebook implements a Perceptron model for multi-label defect prediction, including an online learning mode.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import Perceptron
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MultiLabelBinarizer
from sklearn.metrics import accuracy_score, hamming_loss, classification_report, f1_score, precision_score, recall_score
import joblib
import sys
sys.path.append('..')
from utils.defect_utils import preprocess_data, get_defect_types

## Load and Explore Dataset

In [None]:
# Read the raw defect dataset from disk into a DataFrame
df = pd.read_csv('../data/dataset.csv')

# Report the dataset dimensions, then preview the leading rows
print(f"Dataset shape: {df.shape}")
df.head(5)

In [None]:
# Count null entries per column and report only the columns that have any
missing_values = df.isnull().sum()
print("Missing values:")
if (missing_values > 0).any():
    print(missing_values[missing_values > 0])
else:
    print("No missing values")

In [None]:
# Tally how often each defect label occurs; a row may list several
# comma-separated labels in its 'defects' field.
defect_counts = {}
for label_string in df['defects']:
    for label in label_string.split(','):
        defect_counts.setdefault(label, 0)
        defect_counts[label] += 1

# Tabulate the tallies, most frequent label first, for plotting
defect_df = pd.DataFrame(
    list(defect_counts.items()), columns=['Defect', 'Count']
).sort_values('Count', ascending=False)

# Bar chart of label frequencies
plt.figure(figsize=(12, 6))
sns.barplot(x='Defect', y='Count', data=defect_df)
plt.title('Defect Distribution')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# Print each label's count and the share of rows it appears in, to expose
# class imbalance
n_rows = len(df)
print("Defect distribution:")
for label, occurrences in defect_counts.items():
    print(f"{label}: {occurrences} ({occurrences/n_rows:.2%})")

## Preprocess Data

In [None]:
# Build the feature matrix from every column except the label column
feature_cols = df.columns[df.columns != 'defects'].tolist()
X = df[feature_cols].to_numpy()

# Turn each comma-separated label string into a list of labels
defects = df['defects'].str.split(',').tolist()

# Multi-hot encode the label lists (one column per defect class)
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(defects)

# Column order of `y` follows mlb.classes_
defect_classes = mlb.classes_
print(f"Defect classes: {defect_classes}")
print(f"Number of classes: {len(defect_classes)}")

# Standardize the features.
# NOTE: the scaler is fit on the full feature matrix here.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

print(f"Feature matrix shape: {X_scaled.shape}")
print(f"Label matrix shape: {y.shape}")

## Split Data

In [None]:
# Split data into train (70%), validation (15%), and test (15%) sets.
#
# The split is done on the *raw* feature matrix `X`, and the scaler is then
# re-fit on the training rows only. Splitting an already-standardized matrix
# (fit on all rows) would leak validation/test statistics into training and
# make the held-out metrics optimistic. The same random_state values are used,
# so the row assignment to each split is unchanged.
X_train_raw, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val_raw, X_test_raw, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Fit the scaler on training data only, then apply it to every split. The
# `scaler` object now holds training-set statistics.
X_train = scaler.fit_transform(X_train_raw)
X_val = scaler.transform(X_val_raw)
X_test = scaler.transform(X_test_raw)

print(f"Training set: {X_train.shape[0]} samples")
print(f"Validation set: {X_val.shape[0]} samples")
print(f"Test set: {X_test.shape[0]} samples")

## Train Perceptron Model (Batch Learning)

In [None]:
# One binary Perceptron per defect class, combined through one-vs-rest so the
# ensemble can emit several labels per sample
base_perceptron = Perceptron(
    max_iter=1000,
    eta0=0.1,
    random_state=42,
)
perceptron_model = OneVsRestClassifier(base_perceptron)

# Fit on the training split
print("Training Perceptron model (batch learning)...")
perceptron_model.fit(X_train, y_train)
print("Training completed.")

## Evaluate Batch Learning Model on Validation Set

In [None]:
# Label predictions of the batch model on the validation split
y_val_pred = perceptron_model.predict(X_val)

# Hamming loss plus micro/macro-averaged F1, precision and recall
hamming = hamming_loss(y_val, y_val_pred)
micro_f1, macro_f1 = (
    f1_score(y_val, y_val_pred, average=avg) for avg in ('micro', 'macro')
)
micro_precision, macro_precision = (
    precision_score(y_val, y_val_pred, average=avg) for avg in ('micro', 'macro')
)
micro_recall, macro_recall = (
    recall_score(y_val, y_val_pred, average=avg) for avg in ('micro', 'macro')
)

# Emit the report as one block of lines
print("\n".join([
    "Validation Metrics (Batch Learning):",
    f"Hamming Loss: {hamming:.4f}",
    f"Micro-F1 Score: {micro_f1:.4f}",
    f"Macro-F1 Score: {macro_f1:.4f}",
    f"Micro-Precision: {micro_precision:.4f}",
    f"Macro-Precision: {macro_precision:.4f}",
    f"Micro-Recall: {micro_recall:.4f}",
    f"Macro-Recall: {macro_recall:.4f}",
]))

## Train Perceptron Model (Online Learning)

In online learning, the model is updated after each sample.

In [None]:
# Define a custom online learning function for multi-label classification
def train_perceptron_online(X, y, n_classes, learning_rate=0.1, n_epochs=5, random_state=None):
    """Train one perceptron per label, updating after every single sample.

    Parameters
    ----------
    X : ndarray of shape (n_samples, n_features)
        Feature matrix.
    y : ndarray of shape (n_samples, >= n_classes)
        Multi-hot label matrix; only the first ``n_classes`` columns are used.
    n_classes : int
        Number of independent binary perceptrons to train.
    learning_rate : float, default 0.1
        Step size applied to each mistake-driven update.
    n_epochs : int, default 5
        Number of shuffled passes over the data.
    random_state : int or None, default None
        Seed for the per-epoch shuffling. With ``None`` the global NumPy
        random state is used, so results depend on ``np.random.seed``.

    Returns
    -------
    weights : ndarray of shape (n_classes, n_features)
    bias : ndarray of shape (n_classes,)
    epoch_metrics : list of float
        Per-epoch fraction of (sample, class) decisions that were already
        correct *before* the corresponding update. Reported as 0.0 when there
        is nothing to predict (no samples or no classes).
    """
    n_samples, n_features = X.shape

    # One weight vector and one bias per class, all starting at zero
    weights = np.zeros((n_classes, n_features))
    bias = np.zeros(n_classes)

    epoch_metrics = []

    # Use a private generator only when a seed is given, so callers that seed
    # the global NumPy state keep getting the same shuffles as before.
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    total_predictions = n_samples * n_classes

    for epoch in range(n_epochs):
        correct_predictions = 0

        # Visit the samples in a fresh random order each epoch
        for idx in rng.permutation(n_samples):
            x_i = X[idx]
            y_i = y[idx][:n_classes]

            # Step activation for all classes at once; the classes are
            # independent, so this matches predicting them one by one.
            y_pred = (weights @ x_i + bias >= 0).astype(float)

            # error is 0 for correct classes and +/-1 for mistakes, so only
            # the misclassified perceptrons are actually moved.
            error = y_i - y_pred
            if error.any():
                step = learning_rate * error
                weights += np.outer(step, x_i)
                bias += step

            correct_predictions += n_classes - int(np.count_nonzero(error))

        # Guard against empty input instead of dividing by zero
        epoch_accuracy = correct_predictions / total_predictions if total_predictions else 0.0
        epoch_metrics.append(epoch_accuracy)

        print(f"Epoch {epoch+1}/{n_epochs} - Accuracy: {epoch_accuracy:.4f}")

    return weights, bias, epoch_metrics

# Inference helper for the weights produced by the online trainer
def predict_perceptron_online(X, weights, bias):
    """Return a multi-hot prediction matrix for ``X``.

    Entry ``[i, j]`` is 1 when ``dot(weights[j], X[i]) + bias[j]`` is
    non-negative and 0 otherwise. The result is a float array of shape
    ``(X.shape[0], weights.shape[0])``.
    """
    sample_count = X.shape[0]
    class_count = weights.shape[0]

    # Start from "no label predicted" and switch on the cells that fire
    predictions = np.zeros((sample_count, class_count))

    for row in range(sample_count):
        features = X[row]
        for cls in range(class_count):
            # Step function: fire when the linear score reaches zero
            score = np.dot(weights[cls], features) + bias[cls]
            if score >= 0:
                predictions[row, cls] = 1

    return predictions

In [None]:
# Fit the hand-written online perceptron on the training split
print("Training Perceptron model (online learning)...")
n_classes = y_train.shape[1]
weights, bias, epoch_metrics = train_perceptron_online(
    X_train,
    y_train,
    n_classes,
    learning_rate=0.01,
    n_epochs=10,
)
print("Training completed.")

# Per-epoch training accuracy curve (epochs numbered from 1)
epoch_numbers = range(1, len(epoch_metrics) + 1)
plt.figure(figsize=(10, 6))
plt.plot(epoch_numbers, epoch_metrics, marker='o')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.title('Online Perceptron Training Progress')
plt.grid(True, alpha=0.3)
plt.show()

## Evaluate Online Learning Model on Validation Set

In [None]:
# Label predictions of the online perceptron on the validation split
y_val_pred_online = predict_perceptron_online(X_val, weights, bias)

# Hamming loss plus micro/macro-averaged F1, precision and recall
hamming = hamming_loss(y_val, y_val_pred_online)
micro_f1, macro_f1 = (
    f1_score(y_val, y_val_pred_online, average=avg) for avg in ('micro', 'macro')
)
micro_precision, macro_precision = (
    precision_score(y_val, y_val_pred_online, average=avg) for avg in ('micro', 'macro')
)
micro_recall, macro_recall = (
    recall_score(y_val, y_val_pred_online, average=avg) for avg in ('micro', 'macro')
)

# Emit the report as one block of lines
print("\n".join([
    "Validation Metrics (Online Learning):",
    f"Hamming Loss: {hamming:.4f}",
    f"Micro-F1 Score: {micro_f1:.4f}",
    f"Macro-F1 Score: {macro_f1:.4f}",
    f"Micro-Precision: {micro_precision:.4f}",
    f"Macro-Precision: {macro_precision:.4f}",
    f"Micro-Recall: {micro_recall:.4f}",
    f"Macro-Recall: {macro_recall:.4f}",
]))

## Compare Batch and Online Learning

In [None]:
# Score both models on the held-out test split
y_test_pred_batch = perceptron_model.predict(X_test)
y_test_pred_online = predict_perceptron_online(X_test, weights, bias)

# Test metrics for the batch model
batch_hamming = hamming_loss(y_test, y_test_pred_batch)
batch_micro_f1, batch_macro_f1 = (
    f1_score(y_test, y_test_pred_batch, average=avg) for avg in ('micro', 'macro')
)

# Test metrics for the online model
online_hamming = hamming_loss(y_test, y_test_pred_online)
online_micro_f1, online_macro_f1 = (
    f1_score(y_test, y_test_pred_online, average=avg) for avg in ('micro', 'macro')
)

# Parallel lists drive both the text table and the bar chart below
metrics = ['Hamming Loss', 'Micro-F1', 'Macro-F1']
batch_values = [batch_hamming, batch_micro_f1, batch_macro_f1]
online_values = [online_hamming, online_micro_f1, online_macro_f1]

# Side-by-side text table
print("Comparison of Batch vs. Online Learning on Test Set:")
print(f"{'Metric':<15} {'Batch':<10} {'Online':<10}")
for metric_name, batch_value, online_value in zip(metrics, batch_values, online_values):
    print(f"{metric_name:<15} {batch_value:<10.4f} {online_value:<10.4f}")

# Grouped bar chart: one pair of bars per metric
x = np.arange(len(metrics))
width = 0.35

fig, ax = plt.subplots(figsize=(10, 6))
rects1 = ax.bar(x - width/2, batch_values, width, label='Batch Learning')
rects2 = ax.bar(x + width/2, online_values, width, label='Online Learning')

ax.set_title('Comparison of Batch vs. Online Learning')
ax.set_ylabel('Score')
ax.set_xticks(x)
ax.set_xticklabels(metrics)
ax.legend()


def autolabel(rects):
    """Write each bar's height (4 decimals) just above the bar on ``ax``."""
    for bar in rects:
        bar_height = bar.get_height()
        bar_center = bar.get_x() + bar.get_width() / 2
        ax.annotate(
            f'{bar_height:.4f}',
            xy=(bar_center, bar_height),
            xytext=(0, 3),  # nudge the text 3 points upward
            textcoords="offset points",
            ha='center',
            va='bottom',
        )


for bar_group in (rects1, rects2):
    autolabel(bar_group)

fig.tight_layout()
plt.show()

## Detailed Evaluation of the Best Model

In [None]:
# Choose the best model based on the comparison (ties go to the batch model).
# NOTE(review): the comparison uses test-set micro-F1, so the metrics reported
# below are measured on the same data that picked the model.
if batch_micro_f1 >= online_micro_f1:
    print("Batch learning model performs better. Using batch learning model for detailed evaluation.")
    best_model_pred = y_test_pred_batch
    best_model_name = "Batch Learning"
else:
    print("Online learning model performs better. Using online learning model for detailed evaluation.")
    best_model_pred = y_test_pred_online
    best_model_name = "Online Learning"

# Calculate per-class metrics (rows: one per class, followed by average rows)
class_report = classification_report(y_test, best_model_pred, target_names=defect_classes, output_dict=True)
class_metrics = pd.DataFrame(class_report).transpose()
class_metrics = class_metrics.drop('accuracy', errors='ignore')

# Display per-class metrics
print(f"Per-class metrics for {best_model_name}:")
display(class_metrics)

# Visualize per-class F1 scores.
# Select the rows by class name rather than by position: for multi-label
# targets the report appends four average rows (micro, macro, weighted,
# samples), so dropping the last three would leave "micro avg" in the plot.
per_class_f1 = class_metrics.loc[list(defect_classes), 'f1-score']

plt.figure(figsize=(12, 6))
sns.barplot(x=per_class_f1.index, y=per_class_f1)
plt.title(f'F1 Score per Defect Class ({best_model_name})')
plt.xticks(rotation=45)
plt.ylim(0, 1)
plt.tight_layout()
plt.show()

## Calculate Precision@k for the Best Model

In [None]:
# Function to calculate Precision@k
def precision_at_k(y_true, y_score, k):
    """Calculate mean Precision@k for multi-label classification.

    For every sample the ``k`` highest-scoring classes are selected and the
    fraction of them that are true labels is computed; the result is the
    average of that fraction over all samples.

    Parameters
    ----------
    y_true : array-like of shape (n_samples, n_classes)
        Multi-hot ground truth; any non-zero entry counts as a positive label.
    y_score : array-like of shape (n_samples, n_classes)
        Ranking scores. Any real-valued score works (probabilities, decision
        values); integer and float dtypes are both accepted.
    k : int
        Number of top-ranked classes to keep per sample; expected to be a
        positive integer. If it exceeds the number of classes, all classes
        are selected.

    Returns
    -------
    float
        Mean Precision@k, or 0.0 when there are no samples or nothing is
        selected.
    """
    y_true = np.asarray(y_true)
    y_score = np.asarray(y_score)

    n_samples = y_true.shape[0]
    if n_samples == 0:
        return 0.0

    # Column indices of the k largest scores in each row, best first
    top_k_indices = np.argsort(y_score, axis=1)[:, ::-1][:, :k]
    n_selected = top_k_indices.shape[1]
    if n_selected == 0:  # Avoid division by zero
        return 0.0

    # Look up the true labels at the selected positions. Comparing against
    # zero instead of using a bitwise AND keeps this valid for float inputs.
    row_indices = np.arange(n_samples)[:, None]
    hits_per_sample = (y_true[row_indices, top_k_indices] != 0).sum(axis=1)

    return float(np.mean(hits_per_sample / n_selected))

# Precision@k only needs a per-class ranking score, not a calibrated
# probability, so both models can be evaluated.
if best_model_name == "Batch Learning":
    # Perceptron does not implement predict_proba, so the one-vs-rest wrapper
    # cannot provide it either; use the signed decision values instead.
    y_test_score = perceptron_model.decision_function(X_test)
else:
    # Raw linear activations of the online perceptron, one column per class
    y_test_score = X_test @ weights.T + bias

# Calculate Precision@k for different values of k
k_values = [1, 2, 3]
for k in k_values:
    p_at_k = precision_at_k(y_test, y_test_score, k)
    print(f"Precision@{k}: {p_at_k:.4f}")

## Save Model

In [None]:
import os

# Save the batch learning model, scaler, and label binarizer
model_path = '../models/perceptron_defect.pkl'
scaler_path = '../models/perceptron_defect_scaler.pkl'
mlb_path = '../models/perceptron_defect_mlb.pkl'

# All artifacts go to the same directory; create it first so the dumps below
# do not fail on a fresh checkout where it does not exist yet.
os.makedirs(os.path.dirname(model_path), exist_ok=True)

joblib.dump(perceptron_model, model_path)
joblib.dump(scaler, scaler_path)
joblib.dump(mlb, mlb_path)

print(f"Batch learning model saved to {model_path}")
print(f"Scaler saved to {scaler_path}")
print(f"MultiLabelBinarizer saved to {mlb_path}")

# Save the online learning model weights and bias
online_model_path = '../models/perceptron_online_defect.npz'
np.savez(online_model_path, weights=weights, bias=bias)
print(f"Online learning model saved to {online_model_path}")