# Train Breast Cancer Detection on Full CBIS-DDSM (152GB)
Training with full dataset on Colab Pro - assumes data already downloaded to /content/CBIS-DDSM-Full

In [None]:
# Cell 1: Check GPU and verify downloaded data
!nvidia-smi
import os
print(f"\nData available: {os.path.exists('/content/CBIS-DDSM-Full')}")
!du -sh /content/CBIS-DDSM-Full

In [None]:
# Cell 2: Clone your breast cancer detection repository
# NOTE(review): neither command is guarded. Re-running this cell while already
# inside the checkout would attempt a second, nested clone - change back to the
# parent directory (or restart the runtime) before re-running.
!git clone https://github.com/monajemi-arman/breast_cancer_detection.git
# Later cells open convert_dataset.py and detectron.py by relative path, so the
# working directory has to be the repository root from here on.
%cd breast_cancer_detection

In [None]:
# Cell 3: Install dependencies
!pip install torch torchvision
!pip install detectron2 -f https://dl.fbaipublicfiles.com/detectron2/wheels/cu118/torch2.0/index.html
!pip install pydicom xmltodict opencv-python pandas scikit-learn cloudpickle pyyaml tqdm pillow

In [None]:
# Cell 4: Setup datasets directory structure
import os
import shutil

# Create datasets directory
os.makedirs('datasets/CBIS-DDSM', exist_ok=True)

# Move downloaded data to expected location
print("Moving CBIS-DDSM data to datasets folder...")
print("This creates symlink to avoid copying 152GB")

# Create symlink instead of copy to save space
!ln -s /content/CBIS-DDSM-Full datasets/CBIS-DDSM/dicom

print("âœ“ Dataset linked successfully")
!ls -lh datasets/CBIS-DDSM/

In [None]:
# Cell 5: Download CBIS-DDSM CSV metadata files
# These are needed for convert_dataset.py to work
import urllib.request
import os

csv_dir = 'datasets/CBIS-DDSM/csv'
os.makedirs(csv_dir, exist_ok=True)

csv_files = [
    'calc_case_description_test_set.csv',
    'calc_case_description_train_set.csv',
    'mass_case_description_test_set.csv',
    'mass_case_description_train_set.csv',
    'dicom_info.csv'
]

base_url = 'https://raw.githubusercontent.com/monajemi-arman/breast_cancer_detection/master/datasets/CBIS-DDSM/csv/'

for csv_file in csv_files:
    url = base_url + csv_file
    dest = os.path.join(csv_dir, csv_file)
    if not os.path.exists(dest):
        print(f"Downloading {csv_file}...")
        urllib.request.urlretrieve(url, dest)
        print(f"  âœ“ {csv_file}")

print("\nâœ“ All CSV files ready")
!ls -lh {csv_dir}

In [None]:
# Cell 6: Modify convert_dataset.py for Colab environment
# Rewrites three settings in the script so it works with our data structure.
# Each edit is an exact-text substitution, so it is verified: if the upstream
# line has changed, the cell says so instead of silently doing nothing.

with open('convert_dataset.py', 'r', encoding='utf-8') as f:
    content = f.read()

# (description, text expected in the script, text to put in its place)
patches = [
    (
        # Update CBIS-DDSM path to point to our linked data
        "CBIS-DDSM image folder",
        "cbis_jpeg = os.path.join(cbis_path, 'jpeg')",
        "cbis_jpeg = os.path.join(cbis_path, 'dicom')  # Changed from jpeg to dicom",
    ),
    (
        # Force only CBIS-DDSM dataset (skip INbreast and MIAS for now)
        "dataset selection",
        "chosen_datasets = ['inbreast', 'cbis-ddsm', 'mias']",
        "chosen_datasets = ['cbis-ddsm']  # Only CBIS-DDSM for full dataset training",
    ),
    (
        # Set train/test split to 70/30 (no validation for now)
        "split ratio",
        "split_ratio = [0.8, 0.1, 0.1]",
        "split_ratio = [0.7, 0.0, 0.3]  # 70% train, 30% test",
    ),
]

not_applied = []
for label, old_text, new_text in patches:
    if old_text in content:
        content = content.replace(old_text, new_text)
    elif new_text not in content:
        # Neither the original nor the patched line is present.
        not_applied.append(label)

with open('convert_dataset.py', 'w', encoding='utf-8') as f:
    f.write(content)

if not_applied:
    print("WARNING: convert_dataset.py does not contain the expected line for:")
    for label in not_applied:
        print(f"  - {label}")
    print("Check the script and adjust it manually before converting.")
else:
    print("✓ convert_dataset.py configured for Colab")

In [None]:
# Cell 7: Convert DICOM to COCO format
# This will take ~30-60 minutes for the full 152GB dataset
# Creates train.json, test.json with 70/30 split

import sys
import time

print("Converting CBIS-DDSM to COCO format...")
print("Dataset: 152GB, ~6,750 series")
print("Split: 70% train, 30% test")
print("This will take 30-60 minutes...\n")

start = time.time()

# Run conversion - answer 'y' for mass_low/mass_high classes
!echo "y" | python convert_dataset.py

elapsed = (time.time() - start) / 60
print(f"\nâœ“ Conversion complete in {elapsed:.1f} minutes!")
print("\nGenerated files:")
!ls -lh *.json

In [None]:
# Cell 8: Verify dataset split
# Loads the two COCO files written by the conversion step and prints image and
# annotation counts plus the resulting train/test percentages.
import json

with open('train.json', 'r') as f:
    train_data = json.load(f)

with open('test.json', 'r') as f:
    test_data = json.load(f)

n_train = len(train_data['images'])
n_test = len(test_data['images'])
n_total = n_train + n_test

print("Dataset Statistics:")
print(f"  Train images: {n_train}")
print(f"  Train annotations: {len(train_data['annotations'])}")
print(f"  Test images: {n_test}")
print(f"  Test annotations: {len(test_data['annotations'])}")
print(f"\n  Total images: {n_total}")
if n_total:
    train_pct = n_train / n_total * 100
    test_pct = n_test / n_total * 100
    print(f"  Train/Test ratio: {train_pct:.1f}% / {test_pct:.1f}%")
else:
    # Both files are empty: report it instead of dividing by zero.
    print("  Train/Test ratio: n/a (no images were converted)")
print(f"\n  Categories: {train_data['categories']}")

In [None]:
# Cell 9: Configure detectron2 for Colab Pro GPU
# Optimize batch size and workers for better GPU utilization by rewriting
# settings inside detectron.py.
# NOTE(review): batch size 8 assumes a large-memory GPU (A100/V100 class);
# lower it if training runs out of GPU memory.
import re

with open('detectron.py', 'r', encoding='utf-8') as f:
    content = f.read()

# (setting name, value expected in the script, value to write)
settings = [
    ('batch_size', 1, 8),     # Larger batch for better GPU
    ('num_workers', 2, 4),
    ('epochs', 100, 150),     # More epochs for full dataset
]

problems = []
for name, old_value, new_value in settings:
    # The trailing guard stops "batch_size = 1" from matching inside
    # "batch_size = 16" (or "epochs = 100" inside "epochs = 1000"), which a
    # plain substring replace would corrupt.
    content, hits = re.subn(
        rf'{name} = {old_value}(?![\w.])',
        f'{name} = {new_value}',
        content,
    )
    already_set = re.search(rf'{name} = {new_value}(?![\w.])', content)
    if not hits and not already_set:
        problems.append(f"'{name} = {old_value}' not found")

# Enable checkpointing every 5000 iterations
if 'cfg.SOLVER.CHECKPOINT_PERIOD' not in content:
    # Insert only in front of *assignment* lines to MAX_ITER, reusing that
    # line's own indentation and object prefix, so reads of MAX_ITER inside
    # other expressions are left untouched.
    content, inserted = re.subn(
        r'^([ \t]*)((?:\w+\.)*cfg\.SOLVER\.)(MAX_ITER[ \t]*=(?!=))',
        r'\1\2CHECKPOINT_PERIOD = 5000  # Save every 5000 iterations\n\1\2\3',
        content,
        flags=re.MULTILINE,
    )
    if not inserted:
        problems.append("no 'cfg.SOLVER.MAX_ITER = ...' line found for checkpointing")

with open('detectron.py', 'w', encoding='utf-8') as f:
    f.write(content)

if problems:
    print("WARNING: detectron.py was only partially configured:")
    for problem in problems:
        print(f"  - {problem}")
else:
    print("✓ detectron.py configured:")
    print("  - Batch size: 8")
    print("  - Workers: 4")
    print("  - Epochs: 150")
    print("  - Checkpoint every 5000 iterations")

In [None]:
# Cell 10: Start training!
# This will run for ~12-18 hours
# Model checkpoints saved in output/ directory every 5000 iterations

import time

print("=" * 60)
print("STARTING TRAINING ON FULL CBIS-DDSM DATASET")
print("=" * 60)
print(f"Dataset: 152GB, ~6,750 series")
print(f"Model: Faster R-CNN with ResNet-50-FPN")
print(f"Expected duration: 12-18 hours")
print(f"Checkpoints: output/model_XXXX.pth (every 5000 iterations)")
print("=" * 60)
print()

start = time.time()

!python detectron.py -c train

elapsed_hours = (time.time() - start) / 3600
print(f"\nâœ“ Training complete in {elapsed_hours:.1f} hours!")
print("\nSaved models:")
!ls -lh output/

In [None]:
# Cell 11: Evaluate model on test set
!python detectron.py -c test

print("\nTest results saved!")
!cat output/test_results.txt

In [None]:
# Cell 12: Download trained model
# Zips the whole output/ directory under a timestamped name, starts a browser
# download of it, and lists what the archive actually contains.
from google.colab import files
import datetime
import os
import shutil
import zipfile

if not os.path.isdir('output'):
    raise FileNotFoundError("No 'output' directory found - run training first")

# Create timestamp
timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M")
model_name = f"breast_cancer_full_dataset_{timestamp}.zip"

# Zip output folder
print(f"Creating {model_name}...")
# make_archive appends ".zip" itself, so pass the name without the extension,
# and use the path it returns so the download cannot point at a different file.
archive_path = shutil.make_archive(os.path.splitext(model_name)[0], 'zip', 'output')

print("\nDownloading model...")
files.download(archive_path)

print("\n✓ Model downloaded!")
print("Files included:")
# List the real archive members rather than an assumed set of file names.
with zipfile.ZipFile(archive_path) as archive:
    for member in archive.namelist():
        print(f"  - {member}")

## Training Complete! 🎉

Your model has been trained on the full 152GB CBIS-DDSM dataset with:
- ~4,725 training images (70%)
- ~2,025 test images (30%)
- 150 epochs
- Faster R-CNN architecture

The model is now ready to use for breast cancer detection in mammograms!