# Baseline Logistic Regression Model
## Mammography Dataset - Baseline Model with StandardScaler Pipeline


In [None]:
import sys
from pathlib import Path
sys.path.append(str(Path().absolute().parent / 'src'))

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    classification_report,
    ConfusionMatrixDisplay
)

from data_processing.data_loader import load_mammography_data

# Plot appearance: seaborn's 'whitegrid' theme and a 10 x 6 default figure size
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (10, 6)})


## 1. Load and Split Data


In [None]:
# Fetch the feature matrix and the labels through the project's data loader
X, y = load_mammography_data()

# Hold out 20% of the samples for testing. stratify=y keeps the class
# proportions of y in both partitions; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Report the size of each partition and the number of feature columns
print("\n".join([
    "=" * 70,
    "DATA LOADED AND SPLIT",
    "=" * 70,
    f"Training set: {X_train.shape[0]:,} samples",
    f"Test set:     {X_test.shape[0]:,} samples",
    f"Features:     {X_train.shape[1]} features",
    "=" * 70,
]))


## 2. Create Pipeline with StandardScaler and Logistic Regression


In [None]:
# Baseline estimator, built as a two-step pipeline:
#   1. 'scaler'     - StandardScaler rescales each feature to mean 0 / std 1
#   2. 'classifier' - LogisticRegression; class_weight='balanced' reweights the
#                     classes to compensate for the imbalanced labels
pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('classifier', LogisticRegression(
        class_weight='balanced',
        max_iter=1000,
        random_state=42,
    )),
])

# List each step's name next to the class that implements it
print("\n".join([
    "=" * 70,
    "PIPELINE CREATED",
    "=" * 70,
    "Pipeline steps:",
]))
for step_name, step in pipeline.steps:
    print("  - " + step_name + ": " + step.__class__.__name__)
print("=" * 70)


## 3. Train the Model


In [None]:
# Fit the whole pipeline (scaler first, then classifier) on the training split
# only, so the test split stays unseen until evaluation.
print("Training baseline Logistic Regression model...")
pipeline.fit(X_train, y_train)
print("✅ Model training completed!")


## 4. Make Predictions


In [None]:
# Score the held-out test split: hard class labels plus per-class probabilities
y_pred = pipeline.predict(X_test)
y_pred_proba = pipeline.predict_proba(X_test)

# Sanity check: which classes were actually predicted, and the probability array shape
print("\n".join([
    "Predictions generated for test set.",
    f"Predicted classes: {np.unique(y_pred)}",
    f"Prediction probabilities shape: {y_pred_proba.shape}",
]))


## 5. Model Evaluation


In [None]:
# Fraction of test samples whose predicted label matches the true label
accuracy = accuracy_score(y_test, y_pred)

# Two-column METRIC / VALUE table with a single accuracy row
print("\n".join([
    "=" * 70,
    "MODEL EVALUATION RESULTS",
    "=" * 70,
    "\n" + "METRIC".ljust(30) + " " + "VALUE".ljust(20),
    "-" * 70,
    "Accuracy".ljust(30) + f" {accuracy:.4f} ({accuracy*100:.2f}%)",
    "=" * 70,
]))


## 6. Confusion Matrix


In [None]:
# Confusion matrix on the test split: rows are actual classes, columns are
# predicted classes.
# NOTE(review): the 'Benign' / 'Malignant' names below assume the first row and
# column correspond to the benign class - confirm against the labels returned
# by load_mammography_data().
cm = confusion_matrix(y_test, y_pred)

# Print confusion matrix in a formatted way
print("=" * 70)
print("CONFUSION MATRIX")
print("=" * 70)
print("\nConfusion Matrix (Raw Counts):")
print(f"{'':<20s} {'Predicted Benign':<20s} {'Predicted Malignant':<20s}")
print("-" * 70)
print(f"{'Actual Benign':<20s} {cm[0, 0]:<20d} {cm[0, 1]:<20d}")
print(f"{'Actual Malignant':<20s} {cm[1, 0]:<20d} {cm[1, 1]:<20d}")
print("=" * 70)

# Row-normalize to percentages: each row (actual class) sums to 100
cm_percent = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis] * 100

# Attach the '%' sign to the number BEFORE padding the cell. Padding first and
# appending '%' afterwards leaves the sign detached from its value and pushes
# the following column out of alignment with the header.
cm_percent_text = [[f"{value:.2f}%" for value in row] for row in cm_percent]

print("\nConfusion Matrix (Percentages):")
print(f"{'':<20s} {'Predicted Benign':<20s} {'Predicted Malignant':<20s}")
print("-" * 70)
print(f"{'Actual Benign':<20s} {cm_percent_text[0][0]:<20s} {cm_percent_text[0][1]:<20s}")
print(f"{'Actual Malignant':<20s} {cm_percent_text[1][0]:<20s} {cm_percent_text[1][1]:<20s}")
print("=" * 70)

# Visualize confusion matrix: counts on the left, row percentages on the right
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Plot 1: Raw counts
disp1 = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['Benign', 'Malignant'])
disp1.plot(ax=axes[0], cmap='Blues', values_format='d')
axes[0].set_title('Confusion Matrix (Counts)', fontsize=14, fontweight='bold', pad=10)

# Plot 2: Normalized percentages
disp2 = ConfusionMatrixDisplay(confusion_matrix=cm_percent, display_labels=['Benign', 'Malignant'])
disp2.plot(ax=axes[1], cmap='Blues', values_format='.2f')
axes[1].set_title('Confusion Matrix (Percentages)', fontsize=14, fontweight='bold', pad=10)

plt.suptitle('Baseline Logistic Regression - Confusion Matrix',
             fontsize=16, fontweight='bold', y=1.02)
plt.tight_layout()
plt.show()

# Flatten the 2x2 matrix in row-major order: [0,0], [0,1], [1,0], [1,1]
tn, fp, fn, tp = cm.ravel()
print("\n" + "=" * 70)
print("CONFUSION MATRIX METRICS")
print("=" * 70)
# Each label is padded to the same width so the four counts line up in one column
print(f"True Negatives (TN):  {tn:>6d}  (Correctly predicted Benign)")
print(f"False Positives (FP): {fp:>6d}  (Predicted Malignant, actually Benign)")
print(f"False Negatives (FN): {fn:>6d}  (Predicted Benign, actually Malignant)")
print(f"True Positives (TP):  {tp:>6d}  (Correctly predicted Malignant)")
print("=" * 70)


In [None]:
# Per-class precision / recall / F1 / support as a nested dict, so individual
# values can be laid out in a custom table below
class_report = classification_report(
    y_test, y_pred, output_dict=True, target_names=['Benign', 'Malignant']
)

# Shared row layouts: a 15-wide label column followed by four 12-wide columns
header_row = "{:<15s} {:<12s} {:<12s} {:<12s} {:<12s}"
metric_row = "{:<15s} {:<12.4f} {:<12.4f} {:<12.4f} {:<12d}"
accuracy_row = "{:<15s} {:<12s} {:<12s} {:<12.4f} {:<12d}"

print("\n".join([
    "=" * 70,
    "CLASSIFICATION REPORT",
    "=" * 70,
    "\nPer-Class Metrics:",
    "-" * 70,
    header_row.format('Class', 'Precision', 'Recall', 'F1-Score', 'Support'),
    "-" * 70,
]))

# One row per class
for class_name in ['Benign', 'Malignant']:
    metrics = class_report[class_name]
    print(metric_row.format(
        class_name,
        metrics['precision'],
        metrics['recall'],
        metrics['f1-score'],
        int(metrics['support']),
    ))

print("-" * 70)

# Accuracy is a single number: it is printed under the F1-Score column, with
# the macro-average support as the sample count
print(accuracy_row.format(
    'Accuracy', '', '',
    class_report['accuracy'],
    int(class_report['macro avg']['support']),
))

# Macro and weighted averages share the per-class row layout
for avg_label, avg_key in (('Macro Avg', 'macro avg'), ('Weighted Avg', 'weighted avg')):
    avg_metrics = class_report[avg_key]
    print(metric_row.format(
        avg_label,
        avg_metrics['precision'],
        avg_metrics['recall'],
        avg_metrics['f1-score'],
        int(avg_metrics['support']),
    ))
print("=" * 70)

# scikit-learn's own plain-text rendering of the same report
print("\n".join([
    "\n" + "=" * 70,
    "DETAILED CLASSIFICATION REPORT",
    "=" * 70,
    classification_report(y_test, y_pred, target_names=['Benign', 'Malignant']),
    "=" * 70,
]))


## 7. Summary

**Baseline Model Performance:**
- **Model**: Logistic Regression with StandardScaler pipeline
- **Training**: Trained on 80% of the data with stratified sampling
- **Evaluation**: Tested on 20% of the data

**Key Observations:**
- This baseline model provides a starting point for comparison
- The model uses `class_weight='balanced'` to handle class imbalance
- StandardScaler standardizes each feature to zero mean and unit variance, so all features enter the logistic regression on a comparable scale
- Further improvements can be made using techniques like SMOTE, ADASYN, or other algorithms

**Next Steps:**
- Try different algorithms (Random Forest, SVM, etc.)
- Apply resampling techniques (SMOTE, ADASYN)
- Optimize hyperparameters
- Use cross-validation for more robust evaluation
