# SMOTE + Logistic Regression Model
## Mammography Dataset - Imbalanced-Learn Pipeline with SMOTE Oversampling


In [None]:
import sys
from pathlib import Path
sys.path.append(str(Path().absolute().parent / 'src'))

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    confusion_matrix,
    classification_report,
    ConfusionMatrixDisplay,
    recall_score,
    f1_score,
    precision_score
)
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE

from data_processing.data_loader import load_mammography_data

# Notebook-wide plotting defaults: seaborn's 'whitegrid' theme and a
# default figure size of (10, 6) for every matplotlib figure created later.
sns.set_style(style='whitegrid')
plt.rcParams.update({'figure.figsize': (10, 6)})


## 1. Load and Split Data


In [None]:
# Pull the feature matrix and labels from the project's data loader.
X, y = load_mammography_data()

# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions of the full dataset in both partitions, and the fixed seed
# makes the split repeatable.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Build the summary line by line, then emit it with a single print call.
banner = "=" * 70
report_lines = [
    banner,
    "DATA LOADED AND SPLIT",
    banner,
    f"Training set: {X_train.shape[0]:,} samples",
    f"Test set:     {X_test.shape[0]:,} samples",
    f"Features:     {X_train.shape[1]} features",
    "\nTraining set class distribution:",
]

# Count how many training rows carry each distinct label.
unique, counts = np.unique(y_train, return_counts=True)
n_train = len(y_train)
for cls, count in zip(unique, counts):
    share = count / n_train * 100
    report_lines.append(f"  Class {cls}: {count:,} samples ({share:.2f}%)")

report_lines.append(banner)
print("\n".join(report_lines))


## 2. Create Imbalanced-Learn Pipeline with SMOTE


In [None]:
# Assemble the imbalanced-learn pipeline. Steps, in execution order:
#   scaler     -> StandardScaler       (feature standardisation)
#   smote      -> SMOTE                (minority-class oversampling)
#   classifier -> LogisticRegression   (final estimator)
#
# NOTE(review): with SMOTE's default sampling strategy the resampled classes
# should already be the same size, so class_weight='balanced' is probably a
# no-op here -- confirm before relying on either mechanism alone.
classifier = LogisticRegression(
    class_weight='balanced',
    max_iter=1000,
    random_state=42,
)
pipeline = ImbPipeline(steps=[
    ('scaler', StandardScaler()),
    ('smote', SMOTE(random_state=42)),
    ('classifier', classifier),
])

# Echo the configured steps so the cell output documents the pipeline.
banner = "=" * 70
print(
    banner,
    "IMBALANCED-LEARN PIPELINE CREATED",
    banner,
    "Pipeline steps:",
    sep="\n",
)
for step_name, step in pipeline.steps:
    print(f"  - {step_name}: {step.__class__.__name__}")
print("\nNote: SMOTE will oversample the minority class during training")
print(banner)


## 3. Train the Model


In [None]:
# Fit the whole pipeline (scaler -> SMOTE -> classifier) on the training split.
print(
    "Training model with SMOTE oversampling...",
    "This may take a moment as SMOTE generates synthetic samples...",
    sep="\n",
)

pipeline.fit(X_train, y_train)

# The pipeline does not expose the data it resampled internally, so repeat the
# scaler + SMOTE steps by hand purely to report the post-oversampling sizes.
# NOTE(review): this is a second fit_resample call on the pipeline's own SMOTE
# step; with the fixed random_state it is expected to reproduce the same
# synthetic samples -- confirm if exact equality matters.
fitted_steps = pipeline.named_steps
scaled_train = fitted_steps['scaler'].transform(X_train)
X_resampled, y_resampled = fitted_steps['smote'].fit_resample(scaled_train, y_train)

n_resampled = len(y_resampled)
print(
    "\n✅ Model training completed!",
    f"\nOriginal training set size: {len(y_train):,} samples",
    f"After SMOTE resampling:      {n_resampled:,} samples",
    "\nResampled class distribution:",
    sep="\n",
)

# Per-class row counts after oversampling.
unique, counts = np.unique(y_resampled, return_counts=True)
for cls, count in zip(unique, counts):
    share = count / n_resampled * 100
    print(f"  Class {cls}: {count:,} samples ({share:.2f}%)")


## 4. Make Predictions


In [None]:
# Score the held-out test split with the fitted pipeline: hard class labels
# first, then the per-class probability estimates.
y_pred = pipeline.predict(X_test)
y_pred_proba = pipeline.predict_proba(X_test)

# Quick sanity output: which labels were predicted and the probability shape.
print(
    "Predictions generated for test set.",
    f"Predicted classes: {np.unique(y_pred)}",
    f"Prediction probabilities shape: {y_pred_proba.shape}",
    sep="\n",
)


## 5. Confusion Matrix


In [None]:
# Confusion matrix of the test-set predictions (rows = actual class,
# columns = predicted class). The indexing below assumes a 2x2 matrix, i.e.
# both classes are present in y_test / y_pred.
cm = confusion_matrix(y_test, y_pred)

banner = "=" * 70
divider = "-" * 70
table_header = f"{'':<20s} {'Predicted Benign':<20s} {'Predicted Malignant':<20s}"
row_labels = ('Actual Benign', 'Actual Malignant')

# --- Text table: raw counts -------------------------------------------------
print(banner)
print("CONFUSION MATRIX")
print(banner)
print("\nConfusion Matrix (Raw Counts):")
print(table_header)
print(divider)
print(f"{'Actual Benign':<20s} {cm[0, 0]:<20d} {cm[0, 1]:<20d}")
print(f"{'Actual Malignant':<20s} {cm[1, 0]:<20d} {cm[1, 1]:<20d}")
print(banner)

# Row-normalise: each row is divided by its own total, so every row sums to
# 100% and the diagonal holds the per-class recall.
cm_percent = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis] * 100

# --- Text table: percentages ------------------------------------------------
print("\nConfusion Matrix (Percentages):")
print(table_header)
print(divider)
for row_label, pct_row in zip(row_labels, cm_percent):
    # Attach the '%' sign to the number *before* padding; padding the number
    # first would leave the sign stranded after the column's trailing spaces
    # and make the row wider than the header.
    pct_cells = [f"{pct:.2f}%" for pct in pct_row]
    print(f"{row_label:<20s} {pct_cells[0]:<20s} {pct_cells[1]:<20s}")
print(banner)

# --- Plots: counts (left) and row percentages (right) -----------------------
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Plot 1: raw counts, rendered as integers
disp1 = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['Benign', 'Malignant'])
disp1.plot(ax=axes[0], cmap='Blues', values_format='d')
axes[0].set_title('Confusion Matrix (Counts)', fontsize=14, fontweight='bold', pad=10)

# Plot 2: row-normalised percentages, rendered with two decimals
disp2 = ConfusionMatrixDisplay(confusion_matrix=cm_percent, display_labels=['Benign', 'Malignant'])
disp2.plot(ax=axes[1], cmap='Blues', values_format='.2f')
axes[1].set_title('Confusion Matrix (Percentages)', fontsize=14, fontweight='bold', pad=10)

plt.suptitle('SMOTE + Logistic Regression - Confusion Matrix',
             fontsize=16, fontweight='bold', y=1.02)
plt.tight_layout()
plt.show()

# --- Individual cells of the matrix -----------------------------------------
# ravel() flattens the 2x2 matrix row by row: [TN, FP, FN, TP].
tn, fp, fn, tp = cm.ravel()
print("\n" + banner)
print("CONFUSION MATRIX METRICS")
print(banner)
# Every label below is padded to the same 22-character width so the counts
# line up in one column.
print(f"True Negatives (TN):  {tn:>6d}  (Correctly predicted Benign)")
print(f"False Positives (FP): {fp:>6d}  (Predicted Malignant, actually Benign)")
print(f"False Negatives (FN): {fn:>6d}  (Predicted Benign, actually Malignant)")
print(f"True Positives (TP):  {tp:>6d}  (Correctly predicted Malignant)")
print(banner)


In [None]:
# Per-class and averaged metrics for the test-set predictions, requested as a
# nested dict so the individual numbers can be laid out in a custom table.
class_names = ['Benign', 'Malignant']
class_report = classification_report(
    y_test,
    y_pred,
    target_names=class_names,
    output_dict=True,
)

banner = "=" * 70
divider = "-" * 70


def _metric_row(label, stats):
    """Format one table row from a report entry.

    `stats` is one entry of the report dict and must provide the keys
    'precision', 'recall', 'f1-score' and 'support'.
    """
    return (
        f"{label:<15s} {stats['precision']:<12.4f} {stats['recall']:<12.4f} "
        f"{stats['f1-score']:<12.4f} {int(stats['support']):<12d}"
    )


print(banner)
print("CLASSIFICATION REPORT")
print(banner)
print("\nPer-Class Metrics:")
print(divider)
print(f"{'Class':<15s} {'Precision':<12s} {'Recall':<12s} {'F1-Score':<12s} {'Support':<12s}")
print(divider)

# One row per class, in the order given by class_names.
for class_name in class_names:
    metrics = class_report[class_name]
    print(_metric_row(class_name, metrics))

print(divider)

# Accuracy is a single scalar in the report dict; it is printed in the
# F1-Score column, with the support taken from the macro-average entry.
macro_avg = class_report['macro avg']
weighted_avg = class_report['weighted avg']
print(
    f"{'Accuracy':<15s} {'':<12s} {'':<12s} {class_report['accuracy']:<12.4f} "
    f"{int(macro_avg['support']):<12d}"
)
print(_metric_row('Macro Avg', macro_avg))
print(_metric_row('Weighted Avg', weighted_avg))
print(banner)

# Also show scikit-learn's own text rendering of the same report.
print("\n" + banner)
print("DETAILED CLASSIFICATION REPORT")
print(banner)
print(classification_report(y_test, y_pred, target_names=class_names))
print(banner)


In [None]:
# Headline metrics for the positive class (label 1, reported as "malignant").
malignant_recall = recall_score(y_test, y_pred, pos_label=1)
malignant_f1 = f1_score(y_test, y_pred, pos_label=1)
malignant_precision = precision_score(y_test, y_pred, pos_label=1)

banner = "=" * 70
divider = "-" * 70

# --- Metric table -----------------------------------------------------------
# Each entry: (row label, score, human-readable interpretation).
metric_rows = [
    (
        'Recall (Sensitivity)',
        malignant_recall,
        f"({malignant_recall*100:.2f}% of malignant cases correctly identified)",
    ),
    (
        'Precision',
        malignant_precision,
        f"({malignant_precision*100:.2f}% of predicted malignant are actually malignant)",
    ),
    (
        'F1-Score',
        malignant_f1,
        "(Harmonic mean of precision and recall)",
    ),
]

print(banner)
print("MALIGNANT CLASS (Class 1) METRICS")
print(banner)
print(f"\n{'Metric':<20s} {'Value':<15s} {'Interpretation'}")
print(divider)
for row_label, score, note in metric_rows:
    print(f"{row_label:<20s} {score:<15.4f} {note}")
print(banner)

# --- Breakdown in terms of confusion-matrix cells ---------------------------
# Relies on `cm`, the confusion matrix computed in an earlier cell; ravel()
# on the 2x2 matrix yields [TN, FP, FN, TP].
tn, fp, fn, tp = cm.ravel()
print(
    "\n" + banner,
    "MALIGNANT CLASS PERFORMANCE BREAKDOWN",
    banner,
    f"True Positives (TP):  {tp:>6d}  - Correctly identified malignant cases",
    f"False Negatives (FN): {fn:>6d}  - Missed malignant cases (Type II error)",
    f"False Positives (FP): {fp:>6d}  - Incorrectly flagged as malignant",
    f"\nRecall = TP / (TP + FN) = {tp} / ({tp} + {fn}) = {malignant_recall:.4f}",
    f"F1-Score = 2 × (Precision × Recall) / (Precision + Recall) = {malignant_f1:.4f}",
    banner,
    sep="\n",
)

# --- Bar chart of the three scores ------------------------------------------
metrics_names = ['Recall', 'Precision', 'F1-Score']
metrics_values = [malignant_recall, malignant_precision, malignant_f1]
colors = ['#e74c3c', '#3498db', '#2ecc71']

fig, ax = plt.subplots(figsize=(8, 6))
bars = ax.bar(metrics_names, metrics_values, color=colors, edgecolor='black', linewidth=1.5)
ax.set_title('Malignant Class Performance Metrics', fontsize=14, fontweight='bold', pad=15)
ax.set_ylabel('Score', fontsize=12, fontweight='bold')
# Upper limit above 1.0 leaves headroom for the value labels over the bars.
ax.set_ylim(0, 1.1)
ax.grid(axis='y', alpha=0.3, linestyle='--')

# Annotate every bar with its score and the same score as a percentage,
# centred horizontally just above the bar top.
for bar, value in zip(bars, metrics_values):
    height = bar.get_height()
    x_center = bar.get_x() + bar.get_width() / 2.
    ax.text(
        x_center,
        height + 0.02,
        f'{value:.4f}\n({value*100:.2f}%)',
        ha='center',
        va='bottom',
        fontsize=11,
        fontweight='bold',
    )

plt.tight_layout()
plt.show()


## 8. Summary

**SMOTE + Logistic Regression Model Performance:**
- **Pipeline**: StandardScaler → SMOTE → LogisticRegression
- **SMOTE**: Oversamples the minority class (malignant) with synthetic samples to balance the training set
- **Training**: SMOTE applied only during training, not on test set
- **Evaluation**: Tested on original (unresampled) test set

**Key Metrics for Malignant Class:**
- **Recall**: Fraction of actual malignant cases that the model correctly identified (TP / (TP + FN))
- **F1-Score**: Harmonic mean of precision and recall, balancing the two in a single number
- **Precision**: Fraction of cases predicted malignant that are actually malignant (TP / (TP + FP))

**Advantages of SMOTE:**
- Addresses class imbalance by creating synthetic minority samples
- Helps the model learn better decision boundaries
- Can improve recall for the minority class
- Works well with Logistic Regression

**Next Steps:**
- Compare with baseline model (without SMOTE)
- Try ADASYN as an alternative to SMOTE
- Experiment with different SMOTE parameters (k_neighbors)
- Use cross-validation for more robust evaluation
