# Multi-Label Defect Prediction using Logistic Regression

This notebook implements a Logistic Regression model for multi-label defect prediction.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, MultiLabelBinarizer
from sklearn.metrics import accuracy_score, hamming_loss, classification_report, f1_score, precision_score, recall_score
import joblib
import sys
sys.path.append('..')
from utils.defect_utils import preprocess_data, get_defect_types

## Load and Explore Dataset

In [None]:
# Read the defect dataset (CSV, path relative to the notebook directory)
df = pd.read_csv(filepath_or_buffer='../data/dataset.csv')

# Report the dimensions, then preview the leading rows as the cell output
print("Dataset shape:", df.shape)
df.head(n=5)

In [None]:
# Count null entries per column and keep only the columns that have any
missing_values = df.isnull().sum()
columns_with_missing = missing_values[missing_values > 0]

print("Missing values:")
if columns_with_missing.empty:
    print("No missing values")
else:
    print(columns_with_missing)

In [None]:
# Tally how often each defect label occurs. Each row of 'defects' is a
# comma-separated string of labels; the dict keeps first-seen order.
defect_counts = {}
flattened_labels = (label for row in df['defects'] for label in row.split(','))
for defect in flattened_labels:
    defect_counts[defect] = defect_counts.get(defect, 0) + 1

# Tabulate the tallies, most frequent label first, for plotting
defect_df = pd.DataFrame(
    list(defect_counts.items()),
    columns=['Defect', 'Count'],
).sort_values('Count', ascending=False)

# Bar chart of label frequencies
plt.figure(figsize=(12, 6))
sns.barplot(data=defect_df, x='Defect', y='Count')
plt.title('Defect Distribution')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# Print each label's count and its share of all rows to expose class imbalance
print("Defect distribution:")
for defect, count in defect_counts.items():
    print(f"{defect}: {count} ({count/len(df):.2%})")

## Preprocess Data

In [None]:
# Build the feature matrix from every column other than the label column
feature_cols = df.columns[df.columns != 'defects'].tolist()
X = df[feature_cols].to_numpy()

# Turn each comma-separated label string into a list of labels
defects = list(df['defects'].str.split(','))

# Multi-hot encode the label lists (one indicator column per defect class)
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(defects)

# Column order of y follows mlb.classes_
defect_classes = mlb.classes_
print(f"Defect classes: {defect_classes}")
print(f"Number of classes: {len(defect_classes)}")

# Standardize every feature column with StandardScaler
scaler = StandardScaler()
X_scaled = scaler.fit(X).transform(X)

print(f"Feature matrix shape: {X_scaled.shape}")
print(f"Label matrix shape: {y.shape}")

## Split Data

In [None]:
# Split data into train (70%), validation (15%), and test (15%) sets.
#
# The *unscaled* feature matrix is split so that the scaler can be fitted on
# the training rows only. Fitting StandardScaler on the full dataset lets the
# mean/variance of validation and test rows leak into the training features
# and makes the reported metrics optimistic. The same random_state values are
# used, so the row partition is unchanged.
X_train_raw, X_temp_raw, y_train, y_temp = train_test_split(
    X, y, test_size=0.3, random_state=42
)
X_val_raw, X_test_raw, y_val, y_test = train_test_split(
    X_temp_raw, y_temp, test_size=0.5, random_state=42
)

# Refit the scaler on training data only and apply it to every split; this is
# also the scaler object that gets persisted with the model.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_val = scaler.transform(X_val_raw)
X_test = scaler.transform(X_test_raw)

print(f"Training set: {X_train.shape[0]} samples")
print(f"Validation set: {X_val.shape[0]} samples")
print(f"Test set: {X_test.shape[0]} samples")

## Hyperparameter Tuning

In [None]:
# One binary logistic regression per defect class (one-vs-rest) gives a
# multi-label classifier; liblinear supports both penalties searched below.
base_lr = LogisticRegression(random_state=42, solver='liblinear')
lr_model = OneVsRestClassifier(estimator=base_lr)

# Search space; the 'estimator__' prefix routes each parameter to the
# wrapped LogisticRegression.
param_grid = dict(
    estimator__C=[0.1, 1.0, 10.0],
    estimator__penalty=['l1', 'l2'],
)

# 3-fold cross-validated grid search, scored by micro-averaged F1,
# using all available cores
grid_search = GridSearchCV(
    estimator=lr_model,
    param_grid=param_grid,
    scoring='f1_micro',
    cv=3,
    n_jobs=-1,
    verbose=1,
)

# Run the search on the training split
grid_search.fit(X_train, y_train)

# Report the winning configuration and its mean cross-validation score
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best cross-validation score: {grid_search.best_score_:.4f}")

# Keep the estimator refitted with the best parameters
best_lr_model = grid_search.best_estimator_

## Evaluate Model on Validation Set

In [None]:
# Hard label predictions for the validation split
y_val_pred = best_lr_model.predict(X_val)

# Hamming loss plus micro/macro averaged F1, precision and recall
hamming = hamming_loss(y_val, y_val_pred)
micro_f1, macro_f1 = (
    f1_score(y_val, y_val_pred, average=avg) for avg in ('micro', 'macro')
)
micro_precision, macro_precision = (
    precision_score(y_val, y_val_pred, average=avg) for avg in ('micro', 'macro')
)
micro_recall, macro_recall = (
    recall_score(y_val, y_val_pred, average=avg) for avg in ('micro', 'macro')
)

# Report every metric with four decimals
print("Validation Metrics:")
for metric_name, metric_value in (
    ("Hamming Loss", hamming),
    ("Micro-F1 Score", micro_f1),
    ("Macro-F1 Score", macro_f1),
    ("Micro-Precision", micro_precision),
    ("Macro-Precision", macro_precision),
    ("Micro-Recall", micro_recall),
    ("Macro-Recall", macro_recall),
):
    print(f"{metric_name}: {metric_value:.4f}")

## Calculate Precision@k

In [None]:
# Function to calculate Precision@k
def precision_at_k(y_true, y_score, k):
    """Calculate mean Precision@k for multi-label classification.

    For every sample the ``k`` highest-scoring labels are selected and the
    fraction of them that are true labels is computed; the result is the
    average of that fraction over all samples.

    Args:
        y_true: 2-D array-like of shape (n_samples, n_labels); non-zero
            entries mark the true labels.
        y_score: 2-D array-like of the same shape holding per-label scores
            (e.g. probabilities); higher means more confident.
        k: Number of top-scoring labels to consider per sample. Values larger
            than the number of labels are clamped to the number of labels.

    Returns:
        The mean Precision@k as a float. Returns 0.0 when there are no
        samples, no labels, or ``k`` is 0.

    Raises:
        ValueError: If ``k`` is negative.
    """
    if k < 0:
        raise ValueError(f"k must be non-negative, got {k}")

    # Compare as booleans: the previous implementation applied `&` between
    # the integer labels and a float mask, which raises TypeError in NumPy.
    relevant = np.asarray(y_true) != 0
    scores = np.asarray(y_score)

    n_samples = relevant.shape[0]
    k_eff = min(k, scores.shape[1])  # cannot select more labels than exist
    if n_samples == 0 or k_eff == 0:
        return 0.0

    # Indices of the k highest scores per row (descending order)
    top_k_indices = np.argsort(scores, axis=1)[:, ::-1][:, :k_eff]

    # Number of true labels among each row's top-k selections
    hits = np.take_along_axis(relevant, top_k_indices, axis=1).sum(axis=1)

    return float(hits.mean() / k_eff)

# Per-class probability scores for the validation split
y_val_score = best_lr_model.predict_proba(X_val)

# Report Precision@k for the first few cut-offs
k_values = [1, 2, 3]
for k in k_values:
    p_at_k = precision_at_k(y_true=y_val, y_score=y_val_score, k=k)
    print(f"Precision@{k}: {p_at_k:.4f}")

## Evaluate Model on Test Set

In [None]:
# Hard label predictions for the held-out test split
y_test_pred = best_lr_model.predict(X_test)

# Same metric suite as for validation: Hamming loss plus micro/macro
# averaged F1, precision and recall
hamming = hamming_loss(y_test, y_test_pred)
micro_f1, macro_f1 = (
    f1_score(y_test, y_test_pred, average=avg) for avg in ('micro', 'macro')
)
micro_precision, macro_precision = (
    precision_score(y_test, y_test_pred, average=avg) for avg in ('micro', 'macro')
)
micro_recall, macro_recall = (
    recall_score(y_test, y_test_pred, average=avg) for avg in ('micro', 'macro')
)

# Report every metric with four decimals
print("Test Metrics:")
test_metric_rows = [
    ("Hamming Loss", hamming),
    ("Micro-F1 Score", micro_f1),
    ("Macro-F1 Score", macro_f1),
    ("Micro-Precision", micro_precision),
    ("Macro-Precision", macro_precision),
    ("Micro-Recall", micro_recall),
    ("Macro-Recall", macro_recall),
]
for metric_name, metric_value in test_metric_rows:
    print(f"{metric_name}: {metric_value:.4f}")

In [None]:
# Calculate per-class metrics on the test split as a table
# (rows = classes followed by the averaged summaries)
class_report = classification_report(y_test, y_test_pred, target_names=defect_classes, output_dict=True)
class_metrics = pd.DataFrame(class_report).transpose()
class_metrics = class_metrics.drop('accuracy', errors='ignore')

# Display per-class metrics
print("Per-class metrics:")
display(class_metrics)

# Select the class rows by name instead of slicing off a fixed number of
# trailing rows: for multi-label targets the report ends with FOUR average
# rows (micro / macro / weighted / samples), so `[:-3]` would plot
# 'micro avg' as if it were a defect class.
per_class_metrics = class_metrics.loc[list(defect_classes)]

# Visualize per-class F1 scores
plt.figure(figsize=(12, 6))
sns.barplot(x=per_class_metrics.index, y=per_class_metrics['f1-score'])
plt.title('F1 Score per Defect Class')
plt.xticks(rotation=45)
plt.ylim(0, 1)
plt.tight_layout()
plt.show()

## Analyze Feature Importance

In [None]:
# Collect the coefficient vector of each per-class binary estimator.
# estimators_[i] is the model for defect_classes[i].
feature_names = feature_cols
coefficients = [
    best_lr_model.estimators_[class_idx].coef_[0]
    for class_idx in range(len(defect_classes))
]

# Rows = defect classes, columns = features
coef_df = pd.DataFrame(coefficients, index=defect_classes, columns=feature_names)

# Heatmap of signed coefficients, colour scale centred at zero
plt.figure(figsize=(14, 10))
sns.heatmap(coef_df, cmap='coolwarm', center=0, annot=False)
plt.title('Logistic Regression Coefficients per Defect Class')
plt.tight_layout()
plt.show()

# For each class, list the three features with the largest absolute
# coefficient, printing the signed value
print("Top 3 most important features for each defect class:")
for defect_class in defect_classes:
    class_coefs = coef_df.loc[defect_class]
    top_features = class_coefs.abs().nlargest(3).index.tolist()
    top_values = class_coefs.loc[top_features].tolist()
    print(f"{defect_class}:")
    for feature, value in zip(top_features, top_values):
        print(f"  {feature}: {value:.4f}")

## Save Model

In [None]:
# Save model, scaler, and label binarizer
import os

model_path = '../models/logistic_defect.pkl'
scaler_path = '../models/logistic_defect_scaler.pkl'
mlb_path = '../models/logistic_defect_mlb.pkl'

# joblib.dump does not create missing parent directories, so make sure the
# output directory exists before writing (no-op when it already does).
for artifact_path in (model_path, scaler_path, mlb_path):
    os.makedirs(os.path.dirname(artifact_path), exist_ok=True)

# NOTE: these files are pickles; only load them back from trusted locations.
joblib.dump(best_lr_model, model_path)
joblib.dump(scaler, scaler_path)
joblib.dump(mlb, mlb_path)

print(f"Model saved to {model_path}")
print(f"Scaler saved to {scaler_path}")
print(f"MultiLabelBinarizer saved to {mlb_path}")