In [None]:
# ============================================================================
# BREAST CANCER DETECTION - GOOGLE COLAB TRAINING (TENSORFLOW 2.19+ FIX)
# Dataset: Breast Histopathology Images (277,524+ images)
# Target Accuracy: 90%+
# FIX: Compatible with TensorFlow 2.19.0+ - Resolves class_weight with generator issue
# ============================================================================

# ============================================================================
# STEP 1: MOUNT GOOGLE DRIVE AND SETUP
# ============================================================================

# Colab-only: attach Google Drive so models and results outlive the runtime.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, models
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.applications import EfficientNetB0, MobileNetV2
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve
from sklearn.utils import class_weight
import cv2
from tqdm import tqdm
import pickle
import json

# Seed NumPy and TensorFlow with the same value so repeated runs draw the
# same random numbers from both libraries.
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)

# Environment banner: TensorFlow version and the GPUs visible to it.
rule = "=" * 70
gpu_devices = tf.config.list_physical_devices('GPU')
print(rule)
print("BREAST CANCER DETECTION - GOOGLE COLAB (TF 2.19+ COMPATIBLE)")
print(rule)
print(f"TensorFlow version: {tf.__version__}")
print(f"GPU Available: {gpu_devices}")
print(rule)

# ============================================================================
# STEP 2: SETUP KAGGLE API AND DOWNLOAD DATASET
# ============================================================================

# Create Kaggle directory
!mkdir -p ~/.kaggle

# Create kaggle.json with your credentials
kaggle_credentials = {
    "username": "professorraimal",
    "key": "KGAT_24f6310a3bc8ef58c1c084f178820236"
}

import json
with open('/root/.kaggle/kaggle.json', 'w') as f:
    json.dump(kaggle_credentials, f)

# Set permissions
!chmod 600 ~/.kaggle/kaggle.json

print("\n✓ Kaggle API configured successfully!")

# Download dataset
print("\nDownloading Breast Histopathology Images dataset...")
print("This may take 5-10 minutes depending on your internet speed...")

!kaggle datasets download -d paultimothymooney/breast-histopathology-images

print("\n✓ Dataset downloaded!")

# Unzip dataset
print("\nExtracting dataset...")
!unzip -q breast-histopathology-images.zip -d /content/dataset

print("✓ Dataset extracted successfully!")

# ============================================================================
# STEP 3: CREATE GOOGLE DRIVE FOLDER STRUCTURE
# ============================================================================

# Project layout on Google Drive: one root with a folder for saved models
# and one for result artefacts.
DRIVE_BASE = '/content/drive/MyDrive/BreastCancerDetection'
MODEL_DIR, RESULTS_DIR = (
    os.path.join(DRIVE_BASE, subfolder) for subfolder in ('models', 'results')
)

# exist_ok=True keeps this step re-runnable when the folders already exist.
for folder in (MODEL_DIR, RESULTS_DIR):
    os.makedirs(folder, exist_ok=True)

print(f"\n✓ Created folder structure in Google Drive:")
for folder in (DRIVE_BASE, MODEL_DIR, RESULTS_DIR):
    print(f"  - {folder}")

# ============================================================================
# STEP 4: EXPLORE DATASET
# ============================================================================

# Directory the Kaggle archive is extracted into.
DATASET_PATH = '/content/dataset'

# Section banner
print(f"\n{'=' * 70}")
print("EXPLORING DATASET")
print("=" * 70)

def count_images(dataset_path):
    """Count PNG images under ``dataset_path`` and break the count down by class.

    The class of an image is taken from the name of the directory that directly
    contains it: ``1`` counts as positive, ``0`` as negative. PNG files in any
    other directory are included in the total only.

    Args:
        dataset_path: Root directory to walk recursively.

    Returns:
        Tuple ``(total_images, positive_count, negative_count)``.
    """
    total_images = 0
    positive_count = 0
    negative_count = 0

    for root, _dirs, files in os.walk(dataset_path):
        png_count = sum(1 for name in files if name.endswith('.png'))
        if png_count == 0:
            continue
        total_images += png_count

        # os.walk yields directory paths without a trailing separator
        # (".../<patient>/1"), so a substring test for "/1/" never matches.
        # Compare the last path component instead.
        class_dir = os.path.basename(os.path.normpath(root))
        if class_dir == '1':
            positive_count += png_count
        elif class_dir == '0':
            negative_count += png_count

    return total_images, positive_count, negative_count

# Walk the extracted archive once and report the class distribution.
total, positive, negative = count_images(DATASET_PATH)
positive_pct = positive / total * 100
negative_pct = negative / total * 100

print(f"\nTotal Images: {total:,}")
print(f"IDC Positive (Class 1): {positive:,} ({positive_pct:.2f}%)")
print(f"No IDC (Class 0): {negative:,} ({negative_pct:.2f}%)")
print(f"\n✓ Dataset has {total:,} images (Requirement: 20,000+)")

# ============================================================================
# STEP 5: LOAD AND ORGANIZE DATA
# ============================================================================

# Section banner
print(f"\n{'=' * 70}")
print("LOADING DATASET")
print("=" * 70)

def load_dataset_info(dataset_path, sample_fraction=1.0):
    """Collect image paths and integer labels from a patient-structured dataset.

    Expected layout below the base directory: ``<patient>/<class>/<file>.png``
    with ``<class>`` being ``0`` or ``1``. If ``dataset_path`` contains a
    sub-directory whose name includes ``IDC`` or that holds more than 100
    entries, that sub-directory is used as the base directory instead.

    Directory listings are sorted so the returned order, the patient sample
    drawn for ``sample_fraction < 1.0`` and any seeded split performed on the
    result do not depend on the filesystem's arbitrary listing order.

    Args:
        dataset_path: Directory the dataset was extracted to.
        sample_fraction: 1.0 for the full dataset; a smaller value (e.g. 0.3)
            keeps that fraction of the patient directories, chosen with a
            fixed seed.

    Returns:
        Tuple ``(image_paths, labels)`` of equal-length lists; each label is
        the integer value of the class directory name.
    """
    image_paths = []
    labels = []

    # Find base path
    for item in sorted(os.listdir(dataset_path)):
        item_path = os.path.join(dataset_path, item)
        if os.path.isdir(item_path):
            if 'IDC' in item or len(os.listdir(item_path)) > 100:
                dataset_path = item_path
                break

    # Every sub-directory of the base path is treated as one patient.
    patient_dirs = []
    for item in sorted(os.listdir(dataset_path)):
        item_path = os.path.join(dataset_path, item)
        if os.path.isdir(item_path):
            patient_dirs.append(item_path)

    print(f"Found {len(patient_dirs)} patient directories")

    # Sample if needed (whole patients, not individual images)
    if sample_fraction < 1.0:
        import random
        random.seed(42)
        n_sample = int(len(patient_dirs) * sample_fraction)
        patient_dirs = random.sample(patient_dirs, n_sample)
        print(f"Sampling {sample_fraction*100}%: {len(patient_dirs)} directories")

    # Load image paths and labels
    for patient_path in tqdm(patient_dirs, desc="Loading images"):
        for img_class in ['0', '1']:
            class_path = os.path.join(patient_path, img_class)
            if os.path.exists(class_path):
                for img_file in sorted(os.listdir(class_path)):
                    if img_file.endswith('.png'):
                        image_paths.append(os.path.join(class_path, img_file))
                        labels.append(int(img_class))

    return image_paths, labels

# Index the complete dataset (sample_fraction=1.0 keeps every patient).
print("Loading dataset information...")
image_paths, labels = load_dataset_info(DATASET_PATH, sample_fraction=1.0)

# Labels are 0/1, so their sum is the number of positive samples.
n_loaded = len(labels)
n_positive = sum(labels)

print(f"\n✓ Loaded Images: {len(image_paths):,}")
print(f"  Positive samples (IDC): {n_positive:,}")
print(f"  Negative samples (No IDC): {n_loaded - n_positive:,}")
print(f"  Class balance: {n_positive/n_loaded*100:.2f}% positive")

# One row per image: file path plus integer label.
df = pd.DataFrame({'image_path': image_paths, 'label': labels})

# ============================================================================
# STEP 6: SPLIT DATA
# ============================================================================

print(f"\n{'=' * 70}")
print("SPLITTING DATA")
print("=" * 70)

# 70% train / 15% validation / 15% test, each split stratified on the label.
# NOTE(review): rows are split per image, so patches from one patient
# directory can end up in more than one subset — confirm this is acceptable
# for the reported test metrics.
train_df, temp_df = train_test_split(
    df,
    test_size=0.3,
    random_state=42,
    stratify=df['label'],
)
# The 30% hold-out is halved into validation and test.
val_df, test_df = train_test_split(
    temp_df,
    test_size=0.5,
    random_state=42,
    stratify=temp_df['label'],
)

print(f"\nDataset Split:")
for subset_name, subset_df in (('Training', train_df), ('Validation', val_df), ('Test', test_df)):
    print(f"  {subset_name}: {len(subset_df):,} images")

# ============================================================================
# STEP 7: SETUP DATA GENERATORS (WITH CLASS WEIGHT FIX)
# ============================================================================

print(f"\n{'=' * 70}")
print("CONFIGURING DATA AUGMENTATION")
print("=" * 70)

IMG_SIZE = 96      # side length (pixels) images are resized to
BATCH_SIZE = 64

# Training pipeline: rescale to [0, 1] plus random geometric augmentation.
train_augmentation = dict(
    rescale=1./255,
    rotation_range=20,
    width_shift_range=0.2,
    height_shift_range=0.2,
    horizontal_flip=True,
    vertical_flip=True,
    zoom_range=0.15,
    shear_range=0.15,
    fill_mode='nearest',
)
train_datagen = ImageDataGenerator(**train_augmentation)

# Validation/Test pipeline: rescaling only, no augmentation.
val_test_datagen = ImageDataGenerator(rescale=1./255)

def create_generator(df, datagen, batch_size, shuffle=True, class_weights=None):
    """Build a factory for an endless batch generator over ``df``.

    Args:
        df: DataFrame with ``image_path`` and ``label`` columns.
        datagen: Object providing ``random_transform`` and ``standardize``;
            both are applied to every batch.
        batch_size: Maximum number of rows read per batch.
        shuffle: Reshuffle the rows at the start of every pass over ``df``.
        class_weights: Optional mapping from integer label to weight. When
            given, each batch carries a per-sample weight array (weights are
            applied here rather than through ``fit(class_weight=...)``).

    Returns:
        A zero-argument function; calling it creates a generator that yields
        ``(images, labels)`` or ``(images, labels, sample_weights)`` forever.
        Images that cannot be read are skipped; a batch in which no image
        could be read is not yielded.
    """
    def _read_rgb(path):
        # cv2 loads BGR; convert to RGB and resize to the model input size.
        bgr = cv2.imread(path)
        if bgr is None:
            return None
        rgb = cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)
        return cv2.resize(rgb, (IMG_SIZE, IMG_SIZE))

    def generator():
        while True:
            epoch_df = df.sample(frac=1).reset_index(drop=True) if shuffle else df

            for start in range(0, len(epoch_df), batch_size):
                chunk = epoch_df.iloc[start:start + batch_size]

                pairs = [
                    (_read_rgb(path), label)
                    for path, label in zip(chunk['image_path'], chunk['label'])
                ]
                readable = [(img, label) for img, label in pairs if img is not None]
                if not readable:
                    continue

                images = np.array([img for img, _ in readable], dtype=np.float32)
                labels = np.array([label for _, label in readable], dtype=np.float32)

                # Apply augmentation image by image, then standardize the batch
                for idx, img in enumerate(images):
                    images[idx] = datagen.random_transform(img)
                images = datagen.standardize(images)

                if class_weights is None:
                    yield images, labels
                else:
                    # Per-sample weights looked up from the label (TF 2.19+ compatible)
                    sample_weights = np.array([class_weights[int(label)] for label in labels])
                    yield images, labels, sample_weights

    return generator

# Calculate class weights
class_labels = np.unique(train_df['label'])
class_weights_array = class_weight.compute_class_weight(
    'balanced',
    classes=class_labels,
    y=train_df['label']
)
# Key each weight by its actual label value (not by its position in the
# array) because the generator looks weights up with int(label).
class_weights_dict = {
    int(label): float(weight)
    for label, weight in zip(class_labels, class_weights_array)
}

print(f"\nClass weights: {class_weights_dict}")
print("✓ Class weights will be applied inside generator (TF 2.19+ compatible)")

# Create generators WITH class weights for training
train_gen = create_generator(train_df, train_datagen, BATCH_SIZE, shuffle=True, class_weights=class_weights_dict)
val_gen = create_generator(val_df, val_test_datagen, BATCH_SIZE, shuffle=False, class_weights=None)
test_gen = create_generator(test_df, val_test_datagen, BATCH_SIZE, shuffle=False, class_weights=None)

# Calculate steps
# Round up so the final partial batch is included: one epoch then equals one
# full pass, every validation/test sample is scored, and the endless
# validation generator restarts at the same row on every epoch.
train_steps = int(np.ceil(len(train_df) / BATCH_SIZE))
val_steps = int(np.ceil(len(val_df) / BATCH_SIZE))
test_steps = int(np.ceil(len(test_df) / BATCH_SIZE))

print(f"\nData Generators Created:")
print(f"  Training steps per epoch: {train_steps}")
print(f"  Validation steps per epoch: {val_steps}")
print(f"  Test steps: {test_steps}")

# ============================================================================
# STEP 8: BUILD MODEL (TENSORFLOW 2.19+ COMPATIBLE)
# ============================================================================

# Section banner
print(f"\n{'=' * 70}")
print("BUILDING MODEL (TF 2.19+ COMPATIBLE)")
print("=" * 70)

def build_model(model_type='efficientnet'):
    """Build a binary classifier on top of a frozen ImageNet backbone.

    The model takes RGB images of shape ``(IMG_SIZE, IMG_SIZE, 3)`` with pixel
    values in ``[0, 1]``, which is what this notebook's generators produce
    (``rescale=1./255``). A ``Rescaling`` layer maps that range to the one the
    selected backbone was trained with, so callers keep feeding ``[0, 1]``:

    * EfficientNetB0 has its preprocessing built in and expects ``[0, 255]``.
    * MobileNetV2 expects ``[-1, 1]``.

    No ``include_preprocessing`` argument is passed to the backbone
    constructors (not available in TF 2.19).

    Args:
        model_type: ``'efficientnet'`` selects EfficientNetB0; any other value
            selects MobileNetV2.

    Returns:
        Tuple ``(model, base_model)``: the full Keras model and the backbone,
        which is returned separately so it can be unfrozen for fine-tuning.
    """
    if model_type == 'efficientnet':
        base_model = EfficientNetB0(
            include_top=False,
            weights='imagenet',
            input_shape=(IMG_SIZE, IMG_SIZE, 3)
        )
        # [0, 1] -> [0, 255]
        input_rescaling = layers.Rescaling(255.0)
        print("Using EfficientNetB0 as base model")
    else:
        base_model = MobileNetV2(
            include_top=False,
            weights='imagenet',
            input_shape=(IMG_SIZE, IMG_SIZE, 3)
        )
        # [0, 1] -> [-1, 1]
        input_rescaling = layers.Rescaling(2.0, offset=-1.0)
        print("Using MobileNetV2 as base model")

    # Freeze base model initially
    base_model.trainable = False

    # Build model using Functional API for better compatibility
    inputs = layers.Input(shape=(IMG_SIZE, IMG_SIZE, 3))
    x = input_rescaling(inputs)
    # training=False keeps the backbone's BatchNorm layers in inference mode,
    # also after some of its layers are unfrozen later.
    x = base_model(x, training=False)
    x = layers.GlobalAveragePooling2D()(x)
    x = layers.BatchNormalization()(x)
    x = layers.Dropout(0.5)(x)
    x = layers.Dense(256, activation='relu')(x)
    x = layers.BatchNormalization()(x)
    x = layers.Dropout(0.3)(x)
    x = layers.Dense(128, activation='relu')(x)
    x = layers.Dropout(0.2)(x)
    # Single sigmoid unit: probability of the positive class (label 1)
    outputs = layers.Dense(1, activation='sigmoid')(x)

    model = models.Model(inputs=inputs, outputs=outputs)

    return model, base_model

# Preferred backbone is EfficientNetB0; if building it raises, report the
# error and build MobileNetV2 instead.
try:
    built = build_model('efficientnet')
except Exception as e:
    print(f"⚠ EfficientNet failed: {e}")
    print("Falling back to MobileNetV2...")
    built = build_model('mobilenet')
    model_type = 'mobilenet'
else:
    model_type = 'efficientnet'
model, base_model = built

print(f"\n✓ Model built successfully ({model_type.upper()})")
print(f"Total parameters: {model.count_params():,}")

# ============================================================================
# STEP 9: COMPILE MODEL
# ============================================================================

# Phase-1 training setup: Adam at 1e-3, binary cross-entropy, and
# accuracy + AUC as tracked metrics (AUC is reported under the name 'auc').
phase1_optimizer = keras.optimizers.Adam(learning_rate=0.001)
tracked_metrics = ['accuracy', keras.metrics.AUC(name='auc')]

model.compile(
    optimizer=phase1_optimizer,
    loss='binary_crossentropy',
    metrics=tracked_metrics,
)

print("✓ Model compiled")

# ============================================================================
# STEP 10: SETUP CALLBACKS
# ============================================================================

# Checkpoint file (.keras format) holding the best model seen so far.
checkpoint_path = os.path.join(MODEL_DIR, 'best_model.keras')

# Write the checkpoint only when validation AUC improves.
checkpoint_callback = ModelCheckpoint(
    checkpoint_path, monitor='val_auc', mode='max', save_best_only=True, verbose=1
)

# Stop after 5 epochs without a validation-AUC improvement and roll the
# weights back to the best epoch.
early_stopping = EarlyStopping(
    monitor='val_auc', patience=5, mode='max', restore_best_weights=True, verbose=1
)

# Halve the learning rate after 3 epochs without a validation-loss
# improvement, never going below 1e-7.
reduce_lr = ReduceLROnPlateau(
    monitor='val_loss', factor=0.5, patience=3, min_lr=1e-7, verbose=1
)

callbacks = [checkpoint_callback, early_stopping, reduce_lr]

# ============================================================================
# STEP 11: TRAIN MODEL (TWO-PHASE TRAINING) - FIXED FOR TF 2.19+
# ============================================================================

rule = "=" * 70

# Arguments shared by both training phases. Class weights are not passed
# here: the training generator already yields per-sample weights.
shared_fit_args = dict(
    steps_per_epoch=train_steps,
    validation_steps=val_steps,
    callbacks=callbacks,
    verbose=1,
)

print("\n" + rule)
print("PHASE 1: TRAINING WITH FROZEN BASE MODEL")
print(rule)

epochs_phase1 = 6

history_phase1 = model.fit(
    train_gen(),
    validation_data=val_gen(),
    epochs=epochs_phase1,
    **shared_fit_args,
)

print("\n" + rule)
print("PHASE 2: FINE-TUNING")
print(rule)

# Make the backbone trainable, then re-freeze everything except its last
# 30 layers.
base_model.trainable = True
for frozen_layer in base_model.layers[:-30]:
    frozen_layer.trainable = False

print(f"Unfrozen {sum(layer.trainable for layer in base_model.layers)} layers")

# Recompile so the trainable change takes effect, with a 10x lower learning rate.
model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=0.0001),
    loss='binary_crossentropy',
    metrics=['accuracy', keras.metrics.AUC(name='auc')],
)

epochs_phase2 = 5

history_phase2 = model.fit(
    train_gen(),
    validation_data=val_gen(),
    epochs=epochs_phase2,
    **shared_fit_args,
)

# ============================================================================
# STEP 12: EVALUATE MODEL
# ============================================================================

print("\n" + "="*70)
print("EVALUATING MODEL ON TEST SET")
print("="*70)

# Load best model (the checkpoint with the highest validation AUC)
print(f"\nLoading best model from: {checkpoint_path}")
model = keras.models.load_model(checkpoint_path)

# Evaluate on test set; the values come back in the order loss, accuracy, AUC
# (the metrics the model was compiled with).
test_loss, test_acc, test_auc = model.evaluate(
    test_gen(),
    steps=test_steps,
    verbose=1
)

print(f"\n{'='*70}")
print("TEST SET RESULTS")
print(f"{'='*70}")
print(f"Test Loss: {test_loss:.4f}")
print(f"Test Accuracy: {test_acc*100:.2f}%")
print(f"Test AUC: {test_auc:.4f}")

# Get predictions for detailed metrics
print("\nGenerating predictions for detailed metrics...")
y_pred_proba = []
y_true = []

# Create the generator ONCE and keep pulling from it. Calling test_gen()
# inside the loop would start a fresh generator each time, so every step
# would return the first batch again.
test_batches = test_gen()
for _ in range(test_steps):
    batch_data = next(test_batches)
    X_batch = batch_data[0]
    y_batch = batch_data[1]
    y_pred_proba.extend(model.predict(X_batch, verbose=0).flatten())
    y_true.extend(y_batch)

y_pred_proba = np.array(y_pred_proba)
# The generator yields float labels; cast to int to match y_pred.
y_true = np.array(y_true).astype(int)
# Decision threshold of 0.5 on the sigmoid output
y_pred = (y_pred_proba > 0.5).astype(int)

# Classification report
print("\nClassification Report:")
print(classification_report(y_true, y_pred, target_names=['No IDC', 'IDC Positive']))

# Additional metrics (computed for the positive class, label 1)
from sklearn.metrics import precision_score, recall_score, f1_score

test_precision = precision_score(y_true, y_pred)
test_recall = recall_score(y_true, y_pred)
test_f1 = f1_score(y_true, y_pred)

print(f"\nAdditional Metrics:")
print(f"  Precision: {test_precision:.4f}")
print(f"  Recall: {test_recall:.4f}")
print(f"  F1-Score: {test_f1:.4f}")

# ============================================================================
# STEP 13: VISUALIZE RESULTS
# ============================================================================

print(f"\n{'=' * 70}")
print("CREATING VISUALIZATIONS")
print("=" * 70)

class_tick_labels = ['No IDC', 'IDC Positive']
savefig_options = dict(dpi=300, bbox_inches='tight')

# --- Confusion matrix (rows: true label, columns: predicted label) ---
cm = confusion_matrix(y_true, y_pred)

plt.figure(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    cbar=True,
    xticklabels=class_tick_labels,
    yticklabels=class_tick_labels,
)
plt.title('Confusion Matrix', fontsize=16, fontweight='bold')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.savefig(os.path.join(RESULTS_DIR, 'confusion_matrix.png'), **savefig_options)
plt.show()

# --- ROC curve with a dashed diagonal for reference ---
fpr, tpr, _ = roc_curve(y_true, y_pred_proba)
roc_auc = roc_auc_score(y_true, y_pred_proba)

plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (AUC = {roc_auc:.4f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend(loc="lower right")
plt.grid(alpha=0.3)
plt.savefig(os.path.join(RESULTS_DIR, 'roc_curve.png'), **savefig_options)
plt.show()

# --- Training history: phase-2 epochs appended after phase-1 epochs ---
fig, axes = plt.subplots(1, 2, figsize=(15, 5))

all_history = {
    key: history_phase1.history[key] + history_phase2.history[key]
    for key in ('accuracy', 'val_accuracy', 'loss', 'val_loss')
}

# One panel per metric, each showing the train and validation curves.
panels = (
    (axes[0], 'accuracy', 'Model Accuracy', 'Accuracy'),
    (axes[1], 'loss', 'Model Loss', 'Loss'),
)
for panel_ax, metric, panel_title, y_label in panels:
    panel_ax.plot(all_history[metric], label='Train', linewidth=2)
    panel_ax.plot(all_history[f'val_{metric}'], label='Validation', linewidth=2)
    panel_ax.set_title(panel_title, fontsize=14, fontweight='bold')
    panel_ax.set_xlabel('Epoch')
    panel_ax.set_ylabel(y_label)
    panel_ax.legend()
    panel_ax.grid(alpha=0.3)

plt.tight_layout()
plt.savefig(os.path.join(RESULTS_DIR, 'training_history.png'), **savefig_options)
plt.show()

# ============================================================================
# STEP 14: SAVE MODEL FOR DEPLOYMENT (TF 2.19+ COMPATIBLE)
# ============================================================================

print("\n" + "="*70)
print("SAVING MODEL FOR DEPLOYMENT (TF 2.19+ COMPATIBLE)")
print("="*70)

# Save in both .keras (native) and .h5 (compatibility) formats
keras_model_path = os.path.join(MODEL_DIR, 'breast_cancer_model.keras')
h5_model_path = os.path.join(MODEL_DIR, 'breast_cancer_model.h5')

# Save in native Keras format (recommended for TF 2.19+)
print("Saving model in .keras format...")
model.save(keras_model_path)
print(f"✓ Model saved (Keras format): {keras_model_path}")

# Save in H5 format for backward compatibility.
# The format is inferred from the ".h5" suffix; the explicit `save_format`
# argument is deprecated in Keras 3 and is therefore not passed.
# The H5 copy is optional, so a failure is reported rather than raised.
print("\nSaving model in .h5 format...")
try:
    model.save(h5_model_path)
    print(f"✓ Model saved (H5 format): {h5_model_path}")
except Exception as e:
    print(f"⚠ Warning: H5 save failed: {e}")
    print("  Using .keras format only (recommended for TF 2.19+)")

# Verify file sizes
print("\nVerifying saved files...")
if os.path.exists(keras_model_path):
    size_mb = os.path.getsize(keras_model_path) / (1024 * 1024)
    print(f"✓ .keras file: {size_mb:.2f} MB")
if os.path.exists(h5_model_path):
    size_mb = os.path.getsize(h5_model_path) / (1024 * 1024)
    print(f"✓ .h5 file: {size_mb:.2f} MB")

# Save configuration: input size, class names and test metrics, stored next
# to the model. Metrics are converted to plain floats before pickling.
model_config = {
    'img_size': IMG_SIZE,
    'test_accuracy': float(test_acc),
    'test_auc': float(test_auc),
    'test_precision': float(test_precision),
    'test_recall': float(test_recall),
    'test_f1': float(test_f1),
    'class_names': ['No IDC (Negative)', 'IDC Positive'],
    'model_type': model_type,
    'tensorflow_version': tf.__version__
}

config_path = os.path.join(MODEL_DIR, 'model_config.pkl')
with open(config_path, 'wb') as f:
    pickle.dump(model_config, f)
print(f"✓ Config saved: {config_path}")

# Verify config file
if os.path.exists(config_path):
    size_kb = os.path.getsize(config_path) / 1024
    print(f"✓ Config file: {size_kb:.2f} KB")

# Save results: a one-row CSV with dataset sizes and test metrics
results = {
    'total_images': len(df),
    'train_samples': len(train_df),
    'val_samples': len(val_df),
    'test_samples': len(test_df),
    'test_accuracy': float(test_acc),
    'test_auc': float(test_auc),
    'test_precision': float(test_precision),
    'test_recall': float(test_recall),
    'test_f1': float(test_f1),
    'model_type': model_type,
    'tensorflow_version': tf.__version__
}

results_df = pd.DataFrame([results])
results_df.to_csv(os.path.join(RESULTS_DIR, 'model_results.csv'), index=False)
print(f"✓ Results saved: {os.path.join(RESULTS_DIR, 'model_results.csv')}")

# ============================================================================
# FINAL SUMMARY
# ============================================================================

print("\n" + "="*70)
print("✓ TRAINING COMPLETE!")
print("="*70)
print(f"\nFinal Test Accuracy: {test_acc*100:.2f}%")
print(f"Test AUC: {test_auc:.4f}")
print(f"Model Type: {model_type.upper()}")
print(f"TensorFlow Version: {tf.__version__}")
print(f"\n{'='*70}")
print("FILES SAVED TO GOOGLE DRIVE:")
print(f"{'='*70}")

# Build the file lists first so the numbering stays consecutive even when
# the optional .h5 copy was not written.
saved_model_files = [keras_model_path]
if os.path.exists(h5_model_path):
    saved_model_files.append(h5_model_path)
saved_model_files.append(config_path)

saved_result_files = [
    os.path.join(RESULTS_DIR, 'model_results.csv'),
    os.path.join(RESULTS_DIR, 'confusion_matrix.png'),
    os.path.join(RESULTS_DIR, 'roc_curve.png'),
    os.path.join(RESULTS_DIR, 'training_history.png'),
]

print(f"\nModel Files:")
for file_number, file_path in enumerate(saved_model_files, start=1):
    print(f"  {file_number}. {file_path}")
print(f"\nVisualization Files:")
for file_number, file_path in enumerate(saved_result_files, start=len(saved_model_files) + 1):
    print(f"  {file_number}. {file_path}")

print(f"\n{'='*70}")
print("NEXT STEPS:")
print(f"{'='*70}")
print("1. Go to Google Drive: MyDrive/BreastCancerDetection/models/")
print("2. Download:")
print("   ✓ breast_cancer_model.keras (RECOMMENDED, ~50MB)")
print("   ✓ model_config.pkl (~1KB)")
print("   (Optional: breast_cancer_model.h5 if available)")
print("3. Place them in your Flask app's 'models/' folder")
print("4. Run the Flask application locally")
print(f"{'='*70}")

# Point to the files. Drive folder URLs are built from folder IDs, not from
# path names, so link to "My Drive" and name the folder to open there.
print(f"\n✓ Access your files here:")
print("https://drive.google.com/drive/my-drive  (open the 'BreastCancerDetection' folder)")
print(f"Mounted in this runtime at: {DRIVE_BASE}")
print("\n" + "="*70)
print("SUCCESS! Model training completed successfully!")
print("="*70)

In [1]:
# Show which TensorFlow release this runtime has installed.
import tensorflow as tf

installed_tf_version = tf.__version__
print(installed_tf_version)

2.20.0
