# Train Breast Cancer Detection on Full CBIS-DDSM (152GB)
Training with full dataset on Colab Pro - assumes data already downloaded to /content/CBIS-DDSM-Full

In [None]:
# Cell 1: Check GPU and verify downloaded data
!nvidia-smi
import os
print(f"\nData available: {os.path.exists('/content/CBIS-DDSM-Full')}")
!du -sh /content/CBIS-DDSM-Full

In [None]:
# Cell 2: Clone your breast cancer detection repository
!git clone https://github.com/SaiRam-Peruri/breast-cancer-detection.git
%cd breast-cancer-detection

In [None]:
# Cell 3: Install dependencies
!pip install torch torchvision
!pip install detectron2 -f https://dl.fbaipublicfiles.com/detectron2/wheels/cu118/torch2.0/index.html
!pip install pydicom xmltodict opencv-python pandas scikit-learn cloudpickle pyyaml tqdm pillow

In [None]:
# Cell 4: Setup datasets and download CSV files
import os
import shutil
import urllib.request

# Reset script to clean state
!git checkout convert_dataset.py

# Create datasets directory
os.makedirs('datasets/CBIS-DDSM', exist_ok=True)

# Link downloaded data to 'dicom' directory
if os.path.exists('datasets/CBIS-DDSM/dicom'):
    os.remove('datasets/CBIS-DDSM/dicom')
    
print("Linking dataset...")
!ln -s /content/CBIS-DDSM-Full datasets/CBIS-DDSM/dicom
print("‚úì Dataset linked")

# Download CSV metadata files from Kaggle (official CBIS-DDSM annotations)
print("\nDownloading CSV metadata files from Kaggle...")
csv_dir = 'datasets/CBIS-DDSM/csv'
os.makedirs(csv_dir, exist_ok=True)

# Install kaggle if needed
!pip install -q kaggle

# Setup Kaggle API (you'll need your kaggle.json in ~/.kaggle/)
# If not authenticated, you'll need to upload kaggle.json
import os
kaggle_dir = os.path.expanduser('~/.kaggle')
os.makedirs(kaggle_dir, exist_ok=True)

# Check if kaggle.json exists
if not os.path.exists(os.path.join(kaggle_dir, 'kaggle.json')):
    print("‚ö†Ô∏è Kaggle API key not found!")
    print("Please upload your kaggle.json file:")
    print("1. Go to https://www.kaggle.com/settings")
    print("2. Click 'Create New API Token'")
    print("3. Upload the downloaded kaggle.json")
    from google.colab import files
    uploaded = files.upload()
    if 'kaggle.json' in uploaded:
        with open(os.path.join(kaggle_dir, 'kaggle.json'), 'wb') as f:
            f.write(uploaded['kaggle.json'])
        !chmod 600 ~/.kaggle/kaggle.json
        print("‚úì Kaggle credentials configured")

# Download CSV files from CBIS-DDSM Kaggle dataset
!kaggle datasets download -d awsaf49/cbis-ddsm-breast-cancer-image-dataset --force -p /tmp/cbis_csv

# Extract CSVs
import zipfile
zip_path = '/tmp/cbis_csv/cbis-ddsm-breast-cancer-image-dataset.zip'
if os.path.exists(zip_path):
    print("Inspecting zip contents...")
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        # List all files to see structure
        all_files = zip_ref.namelist()
        csv_files = [f for f in all_files if f.endswith('.csv')]
        
        print(f"Found {len(csv_files)} CSV files in zip")
        
        # Extract all CSV files regardless of path
        for csv_file in csv_files:
            # Get just the filename (remove any directory path)
            filename = os.path.basename(csv_file)
            
            # Extract to temporary location
            zip_ref.extract(csv_file, '/tmp/cbis_csv_extract')
            
            # Move to our csv_dir with clean filename
            src = os.path.join('/tmp/cbis_csv_extract', csv_file)
            dst = os.path.join(csv_dir, filename)
            shutil.move(src, dst)
            print(f"  ‚úì {filename}")
    
    print("\n‚úì CSV files extracted")
    !ls -lh {csv_dir}
    
    # Cleanup
    !rm -rf /tmp/cbis_csv /tmp/cbis_csv_extract
else:
    print("‚ö†Ô∏è Download failed - trying manual CSV creation...")
    # Fallback: Create minimal CSV structure for testing
    print("Creating minimal CSV files for testing...")

# Configure convert_dataset.py
print("\nConfiguring convert_dataset.py...")
with open('convert_dataset.py', 'r') as f:
    content = f.read()

# Update settings for proper train/val/test split (70/15/15)
content = content.replace(
    "chosen_datasets = ['inbreast', 'cbis-ddsm', 'mias']",
    "chosen_datasets = ['cbis-ddsm']"
)
content = content.replace(
    "split_ratio = [0.8, 0.1, 0.1]",
    "split_ratio = [0.7, 0.15, 0.15]"
)

with open('convert_dataset.py', 'w') as f:
    f.write(content)

print("‚úì Configuration complete")
print("\nReady for conversion in next cell!")

In [None]:
# Cell 4.5: Convert all DICOM files to JPEG (like original CBIS-DDSM distribution)
# This allows us to use the existing convert_dataset.py without modifications
#
# Every *.dcm under datasets/CBIS-DDSM/dicom is written to the same relative
# path under datasets/CBIS-DDSM/jpeg with a .jpg extension. Failures are
# counted and the first ten are printed; they do not stop the run.
import os
import pydicom
from tqdm import tqdm
from utils import read_dicom
from pathlib import Path

print("Converting DICOM files to JPEG format...")
print("This preserves the directory structure for annotation matching")

dicom_dir = 'datasets/CBIS-DDSM/dicom'

# Create jpeg output directory
jpeg_dir = 'datasets/CBIS-DDSM/jpeg'
os.makedirs(jpeg_dir, exist_ok=True)

# Scan all DICOM files
dicom_files = [
    os.path.join(root, file)
    for root, dirs, files in os.walk(dicom_dir)
    for file in files
    if file.endswith('.dcm')
]

print(f"Found {len(dicom_files)} DICOM files")
print(f"Converting to JPEG...\n")

converted = 0
failed = 0

for dcm_path in tqdm(dicom_files, desc="Converting DICOM→JPEG"):
    try:
        # Read DICOM and convert to image.
        # NOTE(review): read_dicom comes from the repo's utils module; the
        # .convert('RGB') call below assumes it returns a PIL image - confirm.
        img = read_dicom(dcm_path)

        # Recreate directory structure. Only the final extension is swapped;
        # a blanket str.replace would also rewrite '.dcm' inside directory names.
        rel_path = os.path.relpath(dcm_path, dicom_dir)
        jpg_path = os.path.join(jpeg_dir, os.path.splitext(rel_path)[0] + '.jpg')

        # Create parent directories
        os.makedirs(os.path.dirname(jpg_path), exist_ok=True)

        # Save as JPEG
        img.convert('RGB').save(jpg_path, 'JPEG', quality=95)
        converted += 1

    except Exception as e:
        # Deliberate per-file best effort: one unreadable file must not abort
        # a multi-hour conversion.
        failed += 1
        if failed <= 10:  # Show first 10 errors only
            print(f"\n⚠️ Failed to convert {os.path.basename(dcm_path)}: {str(e)}")

print(f"\n✓ Conversion complete!")
print(f"  Successfully converted: {converted} files")
print(f"  Failed: {failed} files")
print(f"  Output directory: {jpeg_dir}")

# Update convert_dataset.py to use 'jpeg' instead of 'dicom'
print("\nUpdating convert_dataset.py to use JPEG directory...")
with open('convert_dataset.py', 'r') as f:
    content = f.read()

# Change back to jpeg if it was changed to dicom
dicom_setting = "cbis_jpeg = os.path.join(cbis_path, 'dicom')"
jpeg_setting = "cbis_jpeg = os.path.join(cbis_path, 'jpeg')"
if dicom_setting in content:
    content = content.replace(dicom_setting, jpeg_setting)
    with open('convert_dataset.py', 'w') as f:
        f.write(content)
    print("✓ convert_dataset.py now reads from the 'jpeg' directory")
elif jpeg_setting in content:
    print("✓ convert_dataset.py already reads from the 'jpeg' directory")
else:
    print("⚠️ Could not find the cbis_jpeg setting in convert_dataset.py - check it by hand")

print("✓ Ready for COCO conversion in next cell!")

In [None]:
# Cell 4.6: Reorganize existing JPEGs to match convert_dataset.py expectations
# FASTER: Use existing 10K+ JPEGs instead of converting again (saves 2 hours!)
#
# For every non-cropped row of dicom_info.csv the expected location is
# <jpeg_base>/<uid>/<filename>, where uid and filename are the last two
# components of the row's image_path. JPEGs found elsewhere under jpeg_base
# are copied (not moved) to that location.

import os
from collections import defaultdict
from tqdm import tqdm
from pathlib import Path
from csv import DictReader
import shutil

print("🚀 Reorganizing existing JPEG files to match convert_dataset.py structure...")
print("⏱️ This takes ~5-10 minutes instead of 2 hours reconversion!\n")

# Load dicom_info.csv
dicom_info_path = 'datasets/CBIS-DDSM/csv/dicom_info.csv'
with open(dicom_info_path, newline='') as f:
    dicom_info = list(DictReader(f))

# Filter non-cropped only
dicom_info = [item for item in dicom_info if 'crop' not in item['SeriesDescription']]
print(f"✓ {len(dicom_info)} non-cropped images to organize\n")

jpeg_base = 'datasets/CBIS-DDSM/jpeg'

# Build a COMPLETE lookup of ALL existing JPEGs with multiple indexing strategies
print("📂 Scanning existing JPEG files...")
existing_jpegs_by_uid = defaultdict(list)
all_jpegs_by_filename = defaultdict(list)
total_files = 0

for root, dirs, files in os.walk(jpeg_base):
    # Index under EVERY UID-looking directory in the path, not just the first,
    # so a file nested below two UID directories is found via either one.
    uid_parts = {part for part in root.split(os.sep) if part.startswith('1.3.6.1')}
    for file in files:
        if not file.endswith('.jpg'):
            continue
        full_path = os.path.join(root, file)
        total_files += 1

        for uid_part in uid_parts:
            existing_jpegs_by_uid[uid_part].append(full_path)

        # Also index by just filename for better matching
        all_jpegs_by_filename[file].append(full_path)

print(f"✓ Found {total_files} existing JPEG files")
print(f"✓ Covering {len(existing_jpegs_by_uid)} unique UIDs")
print(f"✓ Indexed {len(all_jpegs_by_filename)} unique filenames\n")

# Reorganize based on CSV.
# NOTE(review): the path construction is meant to mirror convert_dataset.py
# (per the original author's note); that script is not visible here - confirm.
moved = 0
already_correct = 0
missing = 0
missing_list = []

for item in tqdm(dicom_info, desc="Reorganizing JPEGs"):
    # Validate BEFORE joining: os.path.join(*parts) raises on an empty tuple.
    parts = Path(item['image_path'].strip()).parts[-2:]
    if len(parts) < 2:
        missing += 1
        continue

    uid, filename = parts
    expected_path = os.path.join(jpeg_base, uid, filename)

    # Check if already in correct location
    if os.path.exists(expected_path):
        already_correct += 1
        continue

    # Strategy 1: same UID directory and exactly the same filename.
    # Compare basenames; a suffix test would let '1-1.jpg' match '11-1.jpg'.
    src_path = next(
        (candidate for candidate in existing_jpegs_by_uid.get(uid, [])
         if os.path.basename(candidate) == filename),
        None,
    )

    # Strategy 2: filename only - accepted solely when it is unambiguous.
    # Taking an arbitrary same-named file from another UID would place the
    # wrong image at the expected path and silently mislabel training data.
    same_name = all_jpegs_by_filename.get(filename, [])
    if src_path is None and len(same_name) == 1:
        src_path = same_name[0]

    if src_path is None:
        missing += 1
        if len(missing_list) < 5:
            if same_name:
                missing_list.append(
                    f"{uid}/{filename}: {len(same_name)} same-named files under other UIDs (ambiguous)"
                )
            else:
                missing_list.append(f"{uid}/{filename}")
        continue

    # Create destination directory and copy
    os.makedirs(os.path.dirname(expected_path), exist_ok=True)
    try:
        shutil.copy2(src_path, expected_path)
    except OSError as e:
        missing += 1
        if len(missing_list) < 5:
            missing_list.append(f"{uid}/{filename}: {str(e)[:50]}")
    else:
        moved += 1

print(f"\n✅ Reorganization complete!")
print(f"  📦 Copied to correct location: {moved}")
print(f"  ✓ Already in correct location: {already_correct}")
print(f"  ⚠️ Missing from source: {missing}")

if missing_list:
    print(f"\n⚠️ Sample missing files:")
    for miss in missing_list:
        print(f"    {miss}")

# Verify a sample of rows using the same path construction as above
print("\n🔍 Verifying reorganized files (using convert_dataset.py logic)...")
sample = dicom_info[:100]
sample_count = 0
verified_missing = []
for item in sample:
    parts = Path(item['image_path'].strip()).parts[-2:]
    if len(parts) == 2 and os.path.exists(os.path.join(jpeg_base, *parts)):
        sample_count += 1
    elif len(verified_missing) < 3:
        verified_missing.append(os.path.join(*parts) if parts else '(empty image_path)')

# Report against the real sample size; the CSV may hold fewer than 100 rows.
print(f"✓ {sample_count}/{len(sample)} sample files verified at convert_dataset.py expected paths")

if verified_missing:
    print(f"\n⚠️ Still missing after reorganization:")
    for vm in verified_missing:
        print(f"    {vm}")
elif sample:
    print(f"\n🎯 All verified! Ready for COCO conversion in Cell 5!")
else:
    print("\n⚠️ dicom_info.csv has no non-cropped rows - nothing to verify")

In [None]:
# Cell 4.7: Debug CSV-JPEG matching
# Check if CSV files correctly map to JPEG structure
#
# Read-only diagnostics: prints sample rows of dicom_info.csv, builds the
# DICOM-directory -> JPEG-path mapping, checks a sample of JPEG paths on disk,
# then looks up the first row of each mass CSV in that mapping.

print("Debugging CSV-JPEG path matching...\n")

import pandas as pd
from pathlib import Path
from csv import DictReader
import os

# Defined up front so the mass-CSV section below still runs (and reports
# "NOT in dcm_jpeg_dict") when dicom_info.csv is missing.
dcm_jpeg_dict = {}

# Check dicom_info.csv
dicom_info_path = 'datasets/CBIS-DDSM/csv/dicom_info.csv'
if os.path.exists(dicom_info_path):
    with open(dicom_info_path, newline='') as f:
        list_of_dict = list(DictReader(f))

    print(f"✓ dicom_info.csv loaded: {len(list_of_dict)} entries")

    # Show first few entries
    print("\n📋 Sample dicom_info.csv entries:")
    for i, item in enumerate(list_of_dict[:3]):
        print(f"\nEntry {i+1}:")
        print(f"  file_path: {item.get('file_path', 'N/A')[:80]}")
        print(f"  image_path: {item.get('image_path', 'N/A')[:80]}")
        print(f"  SeriesDescription: {item.get('SeriesDescription', 'N/A')}")

    # Check how many non-crop entries
    non_crop = [item for item in list_of_dict if 'crop' not in item['SeriesDescription']]
    print(f"\n✓ Non-cropped images: {len(non_crop)}")

    # Check dcm_jpeg_dict creation: key is the directory holding the DICOM
    # file, value is the last two components of the JPEG path.
    for item in non_crop:
        dcm_path = Path(item['file_path'].strip()).parent.parts[-1]
        jpeg_path = os.path.join(*Path(item['image_path'].strip()).parts[-2:])
        dcm_jpeg_dict[dcm_path] = jpeg_path

    print(f"\n✓ dcm_jpeg_dict created: {len(dcm_jpeg_dict)} mappings")

    # Show sample mappings
    print("\n📋 Sample DICOM→JPEG mappings:")
    for dcm, jpg in list(dcm_jpeg_dict.items())[:3]:
        print(f"  {dcm} → {jpg}")

    # Check if JPEG files actually exist
    jpeg_dir = 'datasets/CBIS-DDSM/jpeg'
    sample_paths = list(dcm_jpeg_dict.values())[:100]
    existing_count = sum(
        1 for jpg_path in sample_paths if os.path.exists(os.path.join(jpeg_dir, jpg_path))
    )

    # Report against the real sample size; there may be fewer than 100 mappings.
    print(f"\n✓ Sample check: {existing_count}/{len(sample_paths)} JPEG files exist")

else:
    print("⚠️ dicom_info.csv not found!")

# Check mass CSV files
print("\n" + "="*60)
print("Checking mass annotation CSVs...")

csv_dir = 'datasets/CBIS-DDSM/csv'
mass_csvs = [
    'mass_case_description_train_set.csv',
    'mass_case_description_test_set.csv'
]

for csv_name in mass_csvs:
    csv_path = os.path.join(csv_dir, csv_name)
    if os.path.exists(csv_path):
        with open(csv_path, newline='') as f:
            mass_data = list(DictReader(f))

        print(f"\n✓ {csv_name}: {len(mass_data)} entries")

        # Show first entry
        if mass_data:
            item = mass_data[0]
            print(f"\n  Sample entry:")
            print(f"    image file path: {item.get('image file path', 'N/A')[:60]}")
            print(f"    ROI mask file path: {item.get('ROI mask file path', 'N/A')[:60]}")
            print(f"    assessment: {item.get('assessment', 'N/A')}")

            # Check patient_dir extraction
            patient_dir = Path(item['image file path'].strip()).parent.parts[-1]
            print(f"    extracted patient_dir: {patient_dir}")

            # Check if it's in dcm_jpeg_dict
            if patient_dir in dcm_jpeg_dict:
                print(f"    ✓ Found in dcm_jpeg_dict → {dcm_jpeg_dict[patient_dir]}")
            else:
                print(f"    ✗ NOT in dcm_jpeg_dict")
    else:
        print(f"\n⚠️ {csv_name} not found!")

print("\n" + "="*60)

In [None]:
# Cell 4.8: Cleanup duplicates and broken directories
# Run this to free up storage before running Cell 5 again
#
# Deletes, relative to the current directory: train/ val/ test/ images/ labels/,
# train.json val.json test.json dataset.yaml, and empty directories under
# datasets/CBIS-DDSM/jpeg. The JPEG files themselves are kept.

import os
import shutil


def tree_size_bytes(top, suffix=None):
    """Return the total size in bytes of the files below *top*.

    Only files whose name ends with *suffix* are counted when it is given.
    A missing *top* yields 0 because os.walk produces nothing for it. Files
    whose size cannot be read (for example a dangling symlink) are skipped.
    """
    total = 0
    for dirpath, _dirnames, filenames in os.walk(top):
        for name in filenames:
            if suffix is not None and not name.endswith(suffix):
                continue
            try:
                total += os.path.getsize(os.path.join(dirpath, name))
            except OSError:
                continue
    return total


print("🧹 Cleaning up storage...\n")

# 1. Remove old train/val/test directories
# 3. Remove old images/labels directories (will be recreated)
# Steps 1 and 3 are independent of step 2, so they share one loop here.
for dir_name in ['train', 'val', 'test', 'images', 'labels']:
    if os.path.exists(dir_name):
        size_mb = tree_size_bytes(dir_name) / (1024*1024)
        print(f"Removing {dir_name}/ ({size_mb:.1f} MB)...")
        shutil.rmtree(dir_name)

# 2. Remove old JSON files
for file_name in ['train.json', 'val.json', 'test.json', 'dataset.yaml']:
    if os.path.exists(file_name):
        print(f"Removing {file_name}...")
        os.remove(file_name)

# 4. Find and remove empty directories in jpeg/
jpeg_dir = 'datasets/CBIS-DDSM/jpeg'
removed_dirs = 0
if os.path.exists(jpeg_dir):
    # Bottom-up walk: a parent emptied by removing its children is itself
    # checked later in the same pass.
    for root, dirs, files in os.walk(jpeg_dir, topdown=False):
        for dir_name in dirs:
            dir_path = os.path.join(root, dir_name)
            try:
                if not os.listdir(dir_path):  # Empty directory
                    os.rmdir(dir_path)
                    removed_dirs += 1
            except OSError as e:
                # Best effort: report and carry on instead of hiding the failure.
                print(f"  Could not remove {dir_path}: {e}")

if removed_dirs > 0:
    print(f"Removed {removed_dirs} empty directories from jpeg/\n")

# 5. Check current storage usage
jpeg_size = tree_size_bytes(jpeg_dir, suffix='.jpg') / (1024*1024*1024)

print(f"\n✅ Cleanup complete!")
print(f"📊 Current JPEG storage: {jpeg_size:.2f} GB")
print(f"🎯 Ready for fresh conversion in Cell 5!")

In [None]:
# Cell 4.9: Re-configure for 70/15/15 split (if you already ran Cell 4 with old config)
# Only run this if you need to update from 70/0/30 to 70/15/15 split

print("Updating convert_dataset.py to 70/15/15 split...")
with open('convert_dataset.py', 'r') as f:
    content = f.read()

target_ratio = "split_ratio = [0.7, 0.15, 0.15]"
# Ratios an earlier run (or the committed script) may have left behind.
stale_ratios = (
    "split_ratio = [0.7, 0.0, 0.3]",
    "split_ratio = [0.8, 0.1, 0.1]",
)

updated = content
for stale in stale_ratios:
    updated = updated.replace(stale, target_ratio)

# Report what actually happened; str.replace is silent when nothing matches.
if updated != content:
    with open('convert_dataset.py', 'w') as f:
        f.write(updated)
    print("✓ Split ratio updated to [0.7, 0.15, 0.15]")
elif target_ratio in content:
    print("✓ Split ratio is already [0.7, 0.15, 0.15] - nothing to change")
else:
    print("⚠️ No known split_ratio setting found in convert_dataset.py - edit it by hand")

print("\nNow run Cell 5 to regenerate JSON files with correct split!")

In [None]:
# Cell 5: Convert JPEG images + CSV annotations to COCO format (ALL IMAGES: Mass + Calcifications)
# ‚ö†Ô∏è Run Cell 4.8 first to cleanup and free storage!
# This will take ~20-30 minutes since images are already converted to JPEG
import time
import json
import shutil

print("Converting CBIS-DDSM to COCO format...")
print("Dataset: ~3,100 annotated images (Mass + Calcification)")
print("Classes: Mass AND Calcification (all abnormalities)")
print("Split: 70% train, 15% val, 15% test")
print("This will take 20-30 minutes...\n")

# Enable BOTH mass and calcification classes
print("Configuring for ALL classes (mass + calcification)...")
with open('convert_dataset.py', 'r') as f:
    content = f.read()
content = content.replace(
    "chosen_classes = ['mass']",
    "chosen_classes = ['mass', 'calcification']"
)
# Disable offline augmentation (faster)
content = content.replace(
    'offline_augmentation_enabled = True',
    'offline_augmentation_enabled = False'
)
with open('convert_dataset.py', 'w') as f:
    f.write(content)
print("‚úì Both mass and calcification enabled\n")

start = time.time()

# Run conversion - answer 'y' for mass_low/mass_high classes
!echo "y" | python convert_dataset.py

elapsed = (time.time() - start) / 60
print(f"\n‚úì Conversion complete in {elapsed:.1f} minutes!")

# Verify output - Check all three splits
if os.path.exists('train.json') and os.path.exists('val.json') and os.path.exists('test.json'):
    with open('train.json') as f:
        train = json.load(f)
    with open('val.json') as f:
        val = json.load(f)
    with open('test.json') as f:
        test = json.load(f)
    
    print("\nüìä Dataset Statistics:")
    print(f"  Train: {len(train['images'])} images, {len(train['annotations'])} annotations")
    print(f"  Val:   {len(val['images'])} images, {len(val['annotations'])} annotations")
    print(f"  Test:  {len(test['images'])} images, {len(test['annotations'])} annotations")
    print(f"  Categories: {train['categories']}")
    
    # Count annotations per class
    train_class_counts = {}
    for ann in train['annotations']:
        cat_id = ann['category_id']
        cat_name = next(c['name'] for c in train['categories'] if c['id'] == cat_id)
        train_class_counts[cat_name] = train_class_counts.get(cat_name, 0) + 1
    
    print(f"\nüìä Class distribution (train):")
    for cls, count in sorted(train_class_counts.items()):
        print(f"  {cls}: {count} annotations")
    
    # Verify split ratios
    total = len(train['images']) + len(val['images']) + len(test['images'])
    if total > 0:
        print(f"\n‚úì Split verification:")
        print(f"  Train: {len(train['images'])/total*100:.1f}% ({len(train['images'])} images)")
        print(f"  Val:   {len(val['images'])/total*100:.1f}% ({len(val['images'])} images)")
        print(f"  Test:  {len(test['images'])/total*100:.1f}% ({len(test['images'])} images)")
        print(f"\nüéØ Total images processed: {total}")
        print(f"‚ÑπÔ∏è  This is correct - only annotated images are used for training")
        print(f"‚ÑπÔ∏è  The remaining ~3,500 files are mask images (not for training)")
    else:
        print("\n‚ö†Ô∏è ERROR: No images were processed!")
else:
    print("\n‚ö†Ô∏è Conversion failed - JSON files not found")
    print("Missing files:")
    if not os.path.exists('train.json'): print("  - train.json")
    if not os.path.exists('val.json'): print("  - val.json")
    if not os.path.exists('test.json'): print("  - test.json")

In [None]:
# Cell 6: Configure detectron2 for Colab Pro GPU
# Optimize batch size and workers for better GPU utilization
#
# Rewrites three `name = value` settings in detectron.py and inserts a
# checkpoint period ahead of the MAX_ITER assignment if none is configured.
import re


def replace_setting(text, name, old, new):
    """Rewrite ``name = old`` to ``name = new`` wherever it appears in *text*.

    The match is anchored on both sides, so ``batch_size = 1`` does not match
    inside ``batch_size = 16`` or ``mini_batch_size = 1`` (a plain substring
    replace would turn the former into ``batch_size = 86``).

    Returns a ``(new_text, number_of_replacements)`` tuple.
    """
    pattern = rf'(?<!\w){re.escape(name)} = {re.escape(old)}(?![\w.])'
    return re.subn(pattern, f'{name} = {new}', text)


def has_setting(text, name, value):
    """Return True when *text* already contains exactly ``name = value``."""
    pattern = rf'(?<!\w){re.escape(name)} = {re.escape(value)}(?![\w.])'
    return re.search(pattern, text) is not None


with open('detectron.py', 'r') as f:
    content = f.read()

# Update for Colab Pro (potentially A100 or V100)
# (label, setting name, value expected in the file, new value)
settings = [
    ('Batch size', 'batch_size', '1', '8'),    # Larger batch for better GPU
    ('Workers', 'num_workers', '2', '4'),
    ('Epochs', 'epochs', '100', '150'),        # More epochs for full dataset
]
report = []
for label, name, old, new in settings:
    content, count = replace_setting(content, name, old, new)
    if count:
        report.append(f"  - {label}: {new}")
    elif has_setting(content, name, new):
        report.append(f"  - {label}: {new} (already set)")
    else:
        report.append(f"  - {label}: ⚠️ '{name} = {old}' not found, left unchanged")

# Enable checkpointing every 5000 iterations
if 'cfg.SOLVER.CHECKPOINT_PERIOD' in content:
    report.append("  - Checkpoint period: already configured, left unchanged")
else:
    # Insert before the first line that ASSIGNS cfg.SOLVER.MAX_ITER, reusing
    # that line's indentation. Replacing every textual occurrence would also
    # rewrite places where MAX_ITER is merely read inside an expression.
    content, inserted = re.subn(
        r'^([ \t]*)(cfg\.SOLVER\.MAX_ITER\s*=(?!=))',
        r'\1cfg.SOLVER.CHECKPOINT_PERIOD = 5000  # Save every 5000 iterations\n\1\2',
        content,
        count=1,
        flags=re.MULTILINE,
    )
    if inserted:
        report.append("  - Checkpoint every 5000 iterations")
    else:
        report.append("  - Checkpoint period: ⚠️ no cfg.SOLVER.MAX_ITER assignment found, not added")

# Write updated content
with open('detectron.py', 'w') as f:
    f.write(content)

print("✓ detectron.py configured:")
for line in report:
    print(line)

In [None]:
# Cell 7: Start training!
# This will run for ~6-10 hours with A100 GPU
# Model checkpoints saved in output/ directory every 5000 iterations

import time

print("=" * 60)
print("STARTING TRAINING ON FULL CBIS-DDSM DATASET")
print("=" * 60)
print(f"Dataset: 152GB, ~6,750 series")
print(f"Model: Faster R-CNN with ResNet-50-FPN")
print(f"Expected duration: 6-10 hours")
print(f"Checkpoints: output/model_XXXX.pth (every 5000 iterations)")
print("=" * 60)
print()

start = time.time()

!python detectron.py -c train

elapsed_hours = (time.time() - start) / 3600
print(f"\n‚úì Training complete in {elapsed_hours:.1f} hours!")
print("\nSaved models:")
!ls -lh output/

In [None]:
# Cell 8: Evaluate model on test set
!python detectron.py -c test

print("\nTest results saved!")
!cat output/test_results.txt

In [None]:
# Cell 9: Download trained model
# Zips the output/ directory under a timestamped name and hands the archive
# to the browser through Colab's download helper.
from google.colab import files
import shutil
import datetime

# Create timestamp
timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M")
model_name = f"breast_cancer_full_dataset_{timestamp}.zip"

# Zip output folder (make_archive appends '.zip' itself, so pass the bare name)
print(f"Creating {model_name}...")
shutil.make_archive(model_name.replace('.zip', ''), 'zip', 'output')

print(f"\nDownloading model...")
files.download(model_name)

print("\n✓ Model downloaded!")
print("Files included:")
print("  - model_final.pth (trained weights)")
print("  - detectron.cfg.pkl (config)")
print("  - metrics.json (training metrics)")
print("  - All checkpoints")

In [None]:
# Cell 12: Download trained model
# Zips the output/ directory under a timestamped name, downloads the archive,
# and lists what output/ actually contained.
from google.colab import files
import os
import shutil
import datetime

output_dir = 'output'
if not os.path.isdir(output_dir):
    # Fail with a clear message instead of an obscure archive error.
    raise FileNotFoundError(
        f"'{output_dir}/' not found - run the training cell first (from the repository directory)"
    )

# Create timestamp
timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M")
archive_base = f"breast_cancer_full_dataset_{timestamp}"
model_name = f"{archive_base}.zip"

# Zip output folder (make_archive appends '.zip' and returns the archive path)
print(f"Creating {model_name}...")
archive_path = shutil.make_archive(archive_base, 'zip', output_dir)

print(f"\nDownloading model...")
files.download(archive_path)

print("\n✓ Model downloaded!")
# List the real contents rather than an assumed set of filenames.
print("Files included:")
for entry in sorted(os.listdir(output_dir)):
    print(f"  - {entry}")

## Training Complete! 🎉

Your model has been trained on the full 152GB CBIS-DDSM dataset with:
- 70% of the annotated images for training
- 15% for validation and 15% for testing (the exact image counts are printed by Cell 5)
- 150 epochs
- Faster R-CNN architecture

The model is now ready to use for breast cancer detection in mammograms!

## ⚡ Performance Optimization Tips (For Next Training Run)

Your A100 GPU is powerful but currently **~80% bottlenecked by disk I/O** (8 sec data loading vs 10 sec total).

### Why Training Takes 8-9 Hours:
- **Data loading**: 8 seconds/iteration (disk bottleneck)
- **GPU compute**: Only 2 seconds/iteration (A100 is fast!)
- **Total iterations**: ~40,000+ iterations for 150 epochs
- **Object detection**: 4 loss functions (more complex than classification)

### Speed Improvements (Could reduce to 4-5 hours):

1. **Increase batch_size** (A100 has 80GB VRAM!)
   ```python
   batch_size = 16  # or even 24-32 with A100
   ```
   Currently using only 48GB/80GB

2. **More workers** for data loading
   ```python
   num_workers = 8  # or 12
   ```

3. **Reduce image resolution** (if acceptable)
   ```python
   cfg.INPUT.MIN_SIZE_TRAIN = (600,)  # instead of default 800
   ```

4. **Enable caching** (load all images to RAM once)
   ```python
   # In convert_dataset.py: use in-memory dataset
   ```

5. **Use SSD storage** instead of network storage

### Current Training is Normal:
✅ Loss decreasing (0.358 is good!)  
✅ GPU memory usage healthy (48/80 GB)  
✅ A100 working correctly  

The **disk I/O bottleneck is typical for Colab** with large medical imaging datasets.

## 🚀 Alternative: Mount Kaggle Dataset in Colab (Faster I/O)

Instead of downloading 152GB to Colab storage, mount Kaggle dataset directly:

**Pros:**
- Faster I/O (Kaggle's optimized storage)
- Keep A100 GPU speed
- Could reduce data loading from 8 sec → 4-5 sec
- Training might finish in **6-7 hours** instead of 8-9

**How to do it (for next run):**
1. Install Kaggle API in Colab
2. Use `kagglehub` to mount dataset
3. Point training to mounted path

This gives you **best of both worlds**: A100 speed + Kaggle storage speed!