# In [None]:
# Breast Cancer Detection - AutoML Training
# Dataset: Breast Histopathology Images
# Target Accuracy: 90%+

import os
import pickle
import shutil
from pathlib import Path

import cv2
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve
from sklearn.model_selection import GroupShuffleSplit, train_test_split
from tensorflow import keras
from tensorflow.keras import layers, models
from tensorflow.keras.applications import EfficientNetB0
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tqdm import tqdm

# Seed the NumPy and TensorFlow random generators with one shared value
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)

# Report the runtime: library version first, then the visible GPU devices
print("TensorFlow version:", tf.__version__)
gpu_devices = tf.config.list_physical_devices('GPU')
print("GPU Available:", gpu_devices)

# ============================================================================
# STEP 1: DATASET PREPARATION
# ============================================================================

# Input dataset location and the writable output locations
DATASET_PATH = '/kaggle/input/breast-histopathology-images'
BASE_DIR = '/kaggle/working'
MODEL_DIR = os.path.join(BASE_DIR, 'models')
os.makedirs(MODEL_DIR, exist_ok=True)

# Print an indented outline of the dataset tree. Every directory is listed;
# file names are only sampled (first three) for directories above depth 2.
print("Dataset structure:")
for current_dir, _subdirs, file_names in os.walk(DATASET_PATH):
    depth = current_dir.replace(DATASET_PATH, '').count(os.sep)
    dir_indent = ' ' * 2 * depth
    print(f'{dir_indent}{os.path.basename(current_dir)}/')
    if depth >= 2:
        continue
    file_indent = ' ' * 2 * (depth + 1)
    for file_name in file_names[:3]:
        print(f'{file_indent}{file_name}')
    remaining = len(file_names) - 3
    if remaining > 0:
        print(f'{file_indent}... and {remaining} more files')

# ============================================================================
# STEP 2: LOAD AND ORGANIZE DATA
# ============================================================================

def load_dataset_info(dataset_path):
    """Collect image paths and integer labels from a per-patient directory tree.

    The layout read is ``<dataset_path>/<patient_id>/<class>/<file>.png`` with
    ``<class>`` being ``0`` or ``1``. Directories and files are visited in
    sorted order: ``os.listdir`` returns entries in arbitrary order, which
    would make any seeded split of the returned lists depend on the
    filesystem instead of only on the seed.

    Args:
        dataset_path: Root directory holding one sub-directory per patient.

    Returns:
        A tuple ``(image_paths, labels)`` of two equal-length lists; the
        label at index ``i`` (0 or 1) belongs to the path at index ``i``.
    """
    image_paths = []
    labels = []
    
    patient_dirs = sorted(
        d for d in os.listdir(dataset_path)
        if os.path.isdir(os.path.join(dataset_path, d))
    )
    
    print(f"\nFound {len(patient_dirs)} patient directories")
    
    for patient_id in tqdm(patient_dirs, desc="Loading dataset"):
        patient_path = os.path.join(dataset_path, patient_id)
        
        for img_class in ('0', '1'):
            class_path = os.path.join(patient_path, img_class)
            # A patient directory may lack one of the class folders; a plain
            # file with that name is skipped as well instead of crashing.
            if not os.path.isdir(class_path):
                continue
            for img_file in sorted(os.listdir(class_path)):
                # Case-insensitive so '.PNG' files are not silently dropped
                if img_file.lower().endswith('.png'):
                    image_paths.append(os.path.join(class_path, img_file))
                    labels.append(int(img_class))
    
    return image_paths, labels

print("Loading dataset information...")
image_paths, labels = load_dataset_info(DATASET_PATH)

# Without this guard an empty result only surfaces as a ZeroDivisionError in
# the class-balance line below, which hides the real cause (wrong path or
# unexpected directory layout).
if not image_paths:
    raise FileNotFoundError(
        f"No .png images found under {DATASET_PATH!r}; expected "
        "<patient_id>/<0|1>/*.png sub-directories"
    )

n_total = len(labels)
n_positive = sum(labels)

print(f"\nTotal images: {len(image_paths)}")
print(f"Positive samples (IDC): {n_positive}")
print(f"Negative samples (No IDC): {n_total - n_positive}")
print(f"Class balance: {n_positive/n_total*100:.2f}% positive")

# One row per image. Labels are stored as strings to match the
# classes=['0', '1'] mapping used by the Keras generators later in this file.
df = pd.DataFrame({
    'image_path': image_paths,
    'label': [str(label) for label in labels]  # Convert to strings
})

# Save dataset info
df.to_csv(os.path.join(BASE_DIR, 'dataset_info.csv'), index=False)
print(f"\nDataset info saved to dataset_info.csv")

# ============================================================================
# STEP 3: DATA SPLITTING
# ============================================================================

# Split data by patient: roughly 70% train, 15% validation, 15% test.
#
# A row-level random split places patches of the same patient in train,
# validation and test at once, so the held-out scores would be measured on
# patients the model has already seen. Grouping by patient keeps every
# patient in exactly one subset.
#
# NOTE: GroupShuffleSplit applies test_size to the number of patients, so the
# per-image proportions (and class ratios) are approximate rather than exact.

# Paths are built as <dataset>/<patient_id>/<class>/<file>, so the patient id
# is the name of the grandparent directory.
patient_ids = df['image_path'].map(lambda p: Path(p).parent.parent.name).to_numpy()

outer_split = GroupShuffleSplit(n_splits=1, test_size=0.3, random_state=42)
train_idx, temp_idx = next(outer_split.split(df, groups=patient_ids))
train_df, temp_df = df.iloc[train_idx], df.iloc[temp_idx]

# Halve the held-out patients into validation and test
inner_split = GroupShuffleSplit(n_splits=1, test_size=0.5, random_state=42)
val_idx, test_idx = next(inner_split.split(temp_df, groups=patient_ids[temp_idx]))
val_df, test_df = temp_df.iloc[val_idx], temp_df.iloc[test_idx]

print(f"\nTrain samples: {len(train_df)}")
print(f"Validation samples: {len(val_df)}")
print(f"Test samples: {len(test_df)}")

# ============================================================================
# STEP 4: DATA GENERATORS WITH AUGMENTATION
# ============================================================================

IMG_SIZE = 96  # Images are resized to 96x96 (scaled up from 50x50)
BATCH_SIZE = 64

# Augmentation applied to training batches only
train_datagen = ImageDataGenerator(
    rescale=1./255,
    rotation_range=20,
    width_shift_range=0.2,
    height_shift_range=0.2,
    horizontal_flip=True,
    vertical_flip=True,
    zoom_range=0.2,
    shear_range=0.1,
    fill_mode='nearest',
)

# Validation and test images are rescaled but never augmented
val_test_datagen = ImageDataGenerator(rescale=1./255)


def _make_flow(datagen, frame, shuffle):
    """Create a binary-label dataframe iterator with the shared settings."""
    return datagen.flow_from_dataframe(
        frame,
        x_col='image_path',
        y_col='label',
        target_size=(IMG_SIZE, IMG_SIZE),
        batch_size=BATCH_SIZE,
        class_mode='binary',
        classes=['0', '1'],  # Explicit mapping: '0' -> 0, '1' -> 1
        shuffle=shuffle,
    )


# Only the training stream is shuffled; the evaluation streams keep the
# dataframe order.
train_generator = _make_flow(train_datagen, train_df, True)
val_generator = _make_flow(val_test_datagen, val_df, False)
test_generator = _make_flow(val_test_datagen, test_df, False)

# ============================================================================
# STEP 5: BUILD AUTOML MODEL (Transfer Learning with EfficientNetB0)
# ============================================================================

def build_model(img_size=96, use_pretrained=True):
    """Build the binary classifier: a convolutional base plus a dense head.

    With ``use_pretrained=True`` the base is an ImageNet-initialised
    EfficientNetB0, frozen for the first training phase. If its weights cannot
    be loaded, the function falls back to a four-block CNN trained from
    scratch.

    Keras' EfficientNet already contains its own rescaling/normalisation
    layers and expects raw pixel values in [0, 255], while the data generators
    in this file feed pixels rescaled to [0, 1]. For the pre-trained base the
    model therefore starts with a ``Rescaling(255.0)`` layer that restores the
    expected range. Inputs to the returned model stay in [0, 1] for both
    variants.

    Args:
        img_size: Height and width of the square RGB input.
        use_pretrained: Try the pre-trained EfficientNetB0 base first.

    Returns:
        A tuple ``(model, base_model)``: the full uncompiled model and its
        convolutional base.
    """
    
    if use_pretrained:
        try:
            # Try to load pre-trained EfficientNetB0
            print("Attempting to download pre-trained EfficientNetB0 weights...")
            base_model = EfficientNetB0(
                include_top=False,
                weights='imagenet',
                input_shape=(img_size, img_size, 3)
            )
            print("Pre-trained weights loaded successfully!")
        except Exception as e:
            # Deliberately broad: any failure to obtain the weights (for
            # example no network access) switches to the fallback below.
            print(f"Failed to download pre-trained weights: {e}")
            print("Falling back to custom CNN architecture...")
            use_pretrained = False
    
    if not use_pretrained:
        # Build custom CNN from scratch (no internet required)
        base_model = models.Sequential([
            # First Conv Block
            layers.Conv2D(32, (3, 3), activation='relu', padding='same', 
                         input_shape=(img_size, img_size, 3)),
            layers.BatchNormalization(),
            layers.Conv2D(32, (3, 3), activation='relu', padding='same'),
            layers.BatchNormalization(),
            layers.MaxPooling2D((2, 2)),
            layers.Dropout(0.25),
            
            # Second Conv Block
            layers.Conv2D(64, (3, 3), activation='relu', padding='same'),
            layers.BatchNormalization(),
            layers.Conv2D(64, (3, 3), activation='relu', padding='same'),
            layers.BatchNormalization(),
            layers.MaxPooling2D((2, 2)),
            layers.Dropout(0.25),
            
            # Third Conv Block
            layers.Conv2D(128, (3, 3), activation='relu', padding='same'),
            layers.BatchNormalization(),
            layers.Conv2D(128, (3, 3), activation='relu', padding='same'),
            layers.BatchNormalization(),
            layers.MaxPooling2D((2, 2)),
            layers.Dropout(0.25),
            
            # Fourth Conv Block
            layers.Conv2D(256, (3, 3), activation='relu', padding='same'),
            layers.BatchNormalization(),
            layers.Conv2D(256, (3, 3), activation='relu', padding='same'),
            layers.BatchNormalization(),
            layers.MaxPooling2D((2, 2)),
            layers.Dropout(0.25),
        ], name='custom_cnn_base')
    
    if use_pretrained:
        # Freeze base model initially; it is unfrozen later for fine-tuning
        base_model.trainable = False
        # Explicit Input so the Sequential model is built immediately (the
        # Rescaling layer alone carries no input shape), then undo the
        # generators' 1/255 scaling before the EfficientNet base.
        stem = [
            layers.Input(shape=(img_size, img_size, 3)),
            layers.Rescaling(255.0, name='restore_pixel_range'),
            base_model,
        ]
    else:
        stem = [base_model]
    
    # Classification head shared by both variants
    head = [
        layers.GlobalAveragePooling2D(),
        layers.BatchNormalization(),
        layers.Dropout(0.5),
        layers.Dense(512, activation='relu'),
        layers.BatchNormalization(),
        layers.Dropout(0.4),
        layers.Dense(256, activation='relu'),
        layers.BatchNormalization(),
        layers.Dropout(0.3),
        layers.Dense(128, activation='relu'),
        layers.Dropout(0.2),
        layers.Dense(1, activation='sigmoid')
    ]
    
    # Build complete model
    model = models.Sequential(stem + head)
    
    return model, base_model

print("\nBuilding AutoML model...")
model, base_model = build_model(IMG_SIZE)

# Phase-1 optimiser and the metrics tracked during training/evaluation
phase1_optimizer = keras.optimizers.Adam(learning_rate=0.001)
tracked_metrics = [
    'accuracy',
    keras.metrics.AUC(name='auc'),
    keras.metrics.Precision(name='precision'),
    keras.metrics.Recall(name='recall'),
]

model.compile(
    optimizer=phase1_optimizer,
    loss='binary_crossentropy',
    metrics=tracked_metrics,
)

model.summary()

# ============================================================================
# STEP 6: CALLBACKS
# ============================================================================

# Stop when validation accuracy has not improved for 7 epochs and roll the
# weights back to the best epoch.
early_stopping = EarlyStopping(
    monitor='val_accuracy',
    patience=7,
    restore_best_weights=True,
    verbose=1,
)

# Halve the learning rate after 3 epochs without validation-loss improvement
lr_schedule = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.5,
    patience=3,
    min_lr=1e-7,
    verbose=1,
)

# Keep only the checkpoint with the best validation accuracy on disk
best_model_path = os.path.join(MODEL_DIR, 'best_model.h5')
checkpoint = ModelCheckpoint(
    best_model_path,
    monitor='val_accuracy',
    save_best_only=True,
    verbose=1,
)

callbacks = [early_stopping, lr_schedule, checkpoint]

# ============================================================================
# STEP 7: TRAIN MODEL (PHASE 1 - Frozen Base)
# ============================================================================

phase1_banner = "=" * 70
print(f"\n{phase1_banner}")
print("PHASE 1: Training with frozen base model")
print(phase1_banner)

# Up to 15 epochs; the callbacks may stop the run earlier
phase1_fit_options = {
    'validation_data': val_generator,
    'epochs': 15,
    'callbacks': callbacks,
    'verbose': 1,
}
history_phase1 = model.fit(train_generator, **phase1_fit_options)

# ============================================================================
# STEP 8: FINE-TUNING (PHASE 2 - Unfrozen Base)
# ============================================================================

print("\n" + "="*70)
print("PHASE 2: Fine-tuning")
print("="*70)

# A base with more than 100 layers is taken to be the pre-trained network;
# the custom fallback CNN built in this file is much shallower.
if hasattr(base_model, 'layers') and len(base_model.layers) > 100:
    # Unfreeze base model (for transfer learning)
    base_model.trainable = True
    
    # Freeze early layers, unfreeze later layers
    for layer in base_model.layers[:100]:
        layer.trainable = False
    
    # Keep BatchNormalization layers frozen in the unfrozen part as well.
    # A frozen BatchNormalization layer runs in inference mode, so the
    # pre-trained moving statistics are not overwritten while fine-tuning.
    for layer in base_model.layers[100:]:
        if isinstance(layer, layers.BatchNormalization):
            layer.trainable = False
    
    print("Fine-tuning with unfrozen base model layers")
else:
    # Custom CNN - all layers already trainable
    print("Continuing training with all layers trainable")

# Recompile with lower learning rate; this is also required for the
# trainable changes above to take effect.
model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=0.0001),
    loss='binary_crossentropy',
    metrics=['accuracy', keras.metrics.AUC(name='auc'),
             keras.metrics.Precision(name='precision'),
             keras.metrics.Recall(name='recall')]
)

history_phase2 = model.fit(
    train_generator,
    validation_data=val_generator,
    epochs=20,
    callbacks=callbacks,
    verbose=1
)

# ============================================================================
# STEP 9: EVALUATE MODEL
# ============================================================================

print("\n" + "="*70)
print("EVALUATING MODEL ON TEST SET")
print("="*70)

# Load best model (the checkpoint written by the ModelCheckpoint callback)
model = keras.models.load_model(os.path.join(MODEL_DIR, 'best_model.h5'))

# Evaluate on test set; values come back in the order loss, then the
# compiled metrics (accuracy, auc, precision, recall).
test_loss, test_acc, test_auc, test_precision, test_recall = model.evaluate(test_generator)

# F1 is undefined when precision and recall are both zero (for example when
# the model predicts no positives); report 0.0 instead of dividing by zero.
precision_recall_sum = test_precision + test_recall
if precision_recall_sum > 0:
    test_f1 = 2*(test_precision*test_recall)/precision_recall_sum
else:
    test_f1 = 0.0

print(f"\nTest Accuracy: {test_acc*100:.2f}%")
print(f"Test AUC: {test_auc:.4f}")
print(f"Test Precision: {test_precision:.4f}")
print(f"Test Recall: {test_recall:.4f}")
print(f"Test F1-Score: {test_f1:.4f}")

# Get predictions
test_generator.reset()
# Flatten the (n, 1) sigmoid output to a 1-D probability vector
y_pred_proba = model.predict(test_generator).ravel()
y_pred = (y_pred_proba > 0.5).astype(int)
# Take the ground truth from the generator itself rather than from test_df:
# the generator drops rows whose image file fails validation, so its labels
# are the ones guaranteed to line up with the predictions.
y_true = np.asarray(test_generator.classes, dtype=int)

# Classification report
print("\nClassification Report:")
print(classification_report(y_true, y_pred, target_names=['No IDC (0)', 'IDC (1)']))

# Confusion matrix drawn as an annotated heatmap (rows: truth, columns: prediction)
cm = confusion_matrix(y_true, y_pred)
class_ticks = ['No IDC', 'IDC']

cm_fig, cm_ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_ticks,
    yticklabels=class_ticks,
    ax=cm_ax,
)
cm_ax.set_title('Confusion Matrix')
cm_ax.set_ylabel('True Label')
cm_ax.set_xlabel('Predicted Label')
cm_fig.savefig(os.path.join(BASE_DIR, 'confusion_matrix.png'), dpi=300, bbox_inches='tight')
plt.show()

# ROC curve computed from the predicted probabilities, with the chance diagonal
fpr, tpr, thresholds = roc_curve(y_true, y_pred_proba)
roc_auc = roc_auc_score(y_true, y_pred_proba)

roc_fig, roc_ax = plt.subplots(figsize=(8, 6))
roc_ax.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (AUC = {roc_auc:.4f})')
roc_ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
roc_ax.set_xlim([0.0, 1.0])
roc_ax.set_ylim([0.0, 1.05])
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_title('Receiver Operating Characteristic (ROC) Curve')
roc_ax.legend(loc="lower right")
roc_fig.savefig(os.path.join(BASE_DIR, 'roc_curve.png'), dpi=300, bbox_inches='tight')
plt.show()

# Training curves: accuracy and loss over both phases, side by side
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# Append the phase-2 values to the phase-1 values for each tracked series
all_history = {
    key: history_phase1.history[key] + history_phase2.history[key]
    for key in ('accuracy', 'val_accuracy', 'loss', 'val_loss')
}

# Top row: one panel per metric, train vs. validation
for panel, metric, label in (
    (axes[0, 0], 'accuracy', 'Accuracy'),
    (axes[0, 1], 'loss', 'Loss'),
):
    panel.plot(all_history[metric], label='Train')
    panel.plot(all_history['val_' + metric], label='Validation')
    panel.set_title('Model ' + label)
    panel.set_xlabel('Epoch')
    panel.set_ylabel(label)
    panel.legend()
    panel.grid(True)

# Bottom row is left empty
for blank_panel in (axes[1, 0], axes[1, 1]):
    blank_panel.axis('off')

plt.tight_layout()
plt.savefig(os.path.join(BASE_DIR, 'training_history.png'), dpi=300, bbox_inches='tight')
plt.show()

# ============================================================================
# STEP 10: SAVE MODEL FOR DEPLOYMENT
# ============================================================================

save_banner = "=" * 70
print(f"\n{save_banner}")
print("SAVING MODEL FOR DEPLOYMENT")
print(save_banner)

# Persist the evaluated model in HDF5 format
deploy_model_path = os.path.join(MODEL_DIR, 'breast_cancer_model.h5')
model.save(deploy_model_path)
print(f"Model saved as: {deploy_model_path}")

# Input size, test metrics and class names, pickled next to the model
model_config = {
    'img_size': IMG_SIZE,
    'test_accuracy': float(test_acc),
    'test_auc': float(test_auc),
    'test_precision': float(test_precision),
    'test_recall': float(test_recall),
    'class_names': ['No IDC (Negative)', 'IDC Positive'],
}

deploy_config_path = os.path.join(MODEL_DIR, 'model_config.pkl')
with open(deploy_config_path, 'wb') as config_file:
    pickle.dump(model_config, config_file)

print(f"Model config saved as: {deploy_config_path}")

# Save results summary
# F1 is undefined when precision and recall are both zero; store 0.0 in that
# case instead of raising ZeroDivisionError after training has finished.
summary_pr_sum = test_precision + test_recall
summary_f1 = 2*(test_precision*test_recall)/summary_pr_sum if summary_pr_sum > 0 else 0.0

results_summary = {
    'total_images': len(df),
    'train_samples': len(train_df),
    'val_samples': len(val_df),
    'test_samples': len(test_df),
    'test_accuracy': float(test_acc),
    'test_auc': float(test_auc),
    'test_precision': float(test_precision),
    'test_recall': float(test_recall),
    'test_f1': float(summary_f1)
}

# Single-row CSV with the split sizes and test metrics
results_df = pd.DataFrame([results_summary])
results_df.to_csv(os.path.join(BASE_DIR, 'model_results.csv'), index=False)

print("\n" + "="*70)
print("TRAINING COMPLETE!")
print("="*70)
print(f"\nFinal Test Accuracy: {test_acc*100:.2f}%")
print(f"\nFiles to download:")
print(f"1. {os.path.join(MODEL_DIR, 'breast_cancer_model.h5')}")
print(f"2. {os.path.join(MODEL_DIR, 'model_config.pkl')}")
print(f"3. {os.path.join(BASE_DIR, 'model_results.csv')}")
print(f"4. {os.path.join(BASE_DIR, 'confusion_matrix.png')}")
print(f"5. {os.path.join(BASE_DIR, 'roc_curve.png')}")
print(f"6. {os.path.join(BASE_DIR, 'training_history.png')}")
print("\nDownload these files and place them in your Flask app directory!")
print("="*70)