In [9]:
# Cell 1: Import required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
accuracy_score, precision_score, recall_score, f1_score, 
roc_auc_score, confusion_matrix, classification_report
 )
from imblearn.over_sampling import SMOTE
import os
import warnings
warnings.filterwarnings('ignore')
print("="*80)
print("STEP 3: SMOTE BALANCING IMPLEMENTATION")
print("="*80)

STEP 3: SMOTE BALANCING IMPLEMENTATION


In [12]:
 # Cell 2: Load data from Step 1
# Fix: use consistent relative path to the processed data directory
X_train_scaled = np.load('../data/processed/X_train_scaled.npy')
X_test_scaled = np.load('../data/processed/X_test_scaled.npy')
y_train = np.load('../data/processed/y_train.npy')
y_test = np.load('../data/processed/y_test.npy')
print(f"Original training data shape: {X_train_scaled.shape}")
print(f"Original class distribution: {np.bincount(y_train)}")
print(f"  Malignant (0): {np.sum(y_train == 0)} samples")
print(f"  Benign (1): {np.sum(y_train == 1)} samples")
print(f"  Imbalance ratio: {np.sum(y_train == 1) / np.sum(y_train == 0):.2f}:1")

FileNotFoundError: [Errno 2] No such file or directory: '../data/processed/X_train_scaled.npy'

In [8]:
# Cell 3: Apply SMOTE balancing
print("\n" + "="*60)
print("APPLYING SMOTE")
print("="*60)
 # Initialize SMOTE with k_neighbors=5 (default)
smote = SMOTE(random_state=42, k_neighbors=5)
 # Apply SMOTE to training data
X_train_smote, y_train_smote = smote.fit_resample(X_train_scaled, y_train)
print(f"\n✅ SMOTE applied successfully!")
print(f"\nBalanced training data shape: {X_train_smote.shape}")
print(f"Balanced class distribution: {np.bincount(y_train_smote)}")
print(f"  Malignant (0): {np.sum(y_train_smote == 0)} samples")
print(f"  Benign (1): {np.sum(y_train_smote == 1)} samples")
print(f"  New balance ratio: {np.sum(y_train_smote == 1) / np.sum(y_train_smote == 0):.2f}:1")
# Calculate oversampling statistics
original_minority = np.sum(y_train == 0)
new_minority = np.sum(y_train_smote == 0)
synthetic_samples = new_minority - original_minority
print(f"\n SMOTE Statistics:")
print(f"Original minority samples: {original_minority}")
print(f"New minority samples: {new_minority}")
print(f"   Synthetic samples created: {synthetic_samples}")
print(f"   Total training samples: {len(y_train_smote)} (was {len(y_train)})")


APPLYING SMOTE


NameError: name 'SMOTE' is not defined

In [None]:
 # Cell 4: Define evaluation function (reuse from Step 2)
def evaluate_model_with_balancing(model, X_train, y_train, X_test, y_test, 
                                 model_name, method_name):
    """
    Comprehensive model evaluation for balanced data.
    """
    # Train model
    model.fit(X_train, y_train)
    
    # Predictions
    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)[:, 1]
    
    # Calculate metrics
    metrics = {
        'Model': model_name,
        'Method': method_name,
        'Accuracy': accuracy_score(y_test, y_pred),
        'Precision': precision_score(y_test, y_pred, average='weighted'),
        'Recall': recall_score(y_test, y_pred, average='weighted'),
        'F1_Score': f1_score(y_test, y_pred, average='weighted'),
        'ROC_AUC': roc_auc_score(y_test, y_pred_proba),
        
        # Class-specific metrics (CRITICAL for imbalanced data)
        'Precision_Malignant': precision_score(y_test, y_pred, pos_label=0),
        'Recall_Malignant': recall_score(y_test, y_pred, pos_label=0),
        'F1_Malignant': f1_score(y_test, y_pred, pos_label=0),
        'Precision_Benign': precision_score(y_test, y_pred, pos_label=1),
        'Recall_Benign': recall_score(y_test, y_pred, pos_label=1),
        'F1_Benign': f1_score(y_test, y_pred, pos_label=1),
    }
    
    # Print results
    print(f"\n{'='*60}")
    print(f"Model: {model_name} | Method: {method_name}")
    print(f"{'='*60}")
    print(f"Overall: Acc={metrics['Accuracy']:.4f}, "
          f"F1={metrics['F1_Score']:.4f}, AUC={metrics['ROC_AUC']:.4f}")
    print(f"Malignant: P={metrics['Precision_Malignant']:.4f}, "
          f"R={metrics['Recall_Malignant']:.4f}, F1={metrics['F1_Malignant']:.4f}")
    print(f"Benign: P={metrics['Precision_Benign']:.4f}, "
          f"R={metrics['Recall_Benign']:.4f}, F1={metrics['F1_Benign']:.4f}")
    
    return metrics, y_pred, y_pred_proba


In [13]:
# Cell 5: Train all models with SMOTE-balanced data
models = {
    'Logistic Regression': LogisticRegression(random_state=42),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42)
}
smote_results = []
smote_predictions = {}
print("\n" + "="*80)
print("TRAINING MODELS WITH SMOTE-BALANCED DATA")
print("="*80)
for name, model in models.items():
    print(f"\n Training {name}...")
    metrics, y_pred, y_pred_proba = evaluate_model_with_balancing(
        model, X_train_smote, y_train_smote, X_test_scaled, y_test,
        name, "SMOTE"
    )
    smote_results.append(metrics)
    smote_predictions[name] = y_pred
 # Create results DataFrame
smote_results_df = pd.DataFrame(smote_results)


TRAINING MODELS WITH SMOTE-BALANCED DATA

 Training Logistic Regression...


NameError: name 'X_train_smote' is not defined