# Breast Cancer Detection Training - Full CBIS-DDSM Dataset (163GB)
Trains a Faster R-CNN detector on the complete dataset using Kaggle's GPU T4 x2 accelerator.

In [None]:
# Cell 1: Check GPU
# Shells out to `nvidia-smi` so the GPUs visible to this session (and their
# memory) are shown in the cell output before any long-running work starts.
!nvidia-smi

In [None]:
# Cell 2: Install dependencies
!pip install detectron2 -f https://dl.fbaipublicfiles.com/detectron2/wheels/cu118/torch2.0/index.html
!pip install cloudpickle pydicom xmltodict opencv-python tqdm pandas scikit-learn matplotlib

In [None]:
# Cell 3: Clone repository
!git clone https://github.com/monajemi-arman/breast_cancer_detection
%cd breast_cancer_detection

In [None]:
# Cell 4: Setup datasets directory
# Add these datasets to your Kaggle notebook:
# 1. Your uploaded: cbis-ddsm-full-part1
# 2. Your uploaded: cbis-ddsm-full-part2  
# 3. Existing: ramanathansp20/inbreast-dataset
# 4. Existing: kmader/mias-mammography

!mkdir -p datasets/CBIS-DDSM
!mkdir -p datasets/INbreast
!mkdir -p datasets/MIAS

In [None]:
# Cell 5: Copy datasets from Kaggle input
import os
import shutil

# Copy CBIS-DDSM Part 1
print("Copying CBIS-DDSM Part 1...")
!cp -r /kaggle/input/cbis-ddsm-full-part1/* datasets/CBIS-DDSM/

# Copy CBIS-DDSM Part 2
print("Copying CBIS-DDSM Part 2...")
!cp -r /kaggle/input/cbis-ddsm-full-part2/* datasets/CBIS-DDSM/

# Copy INbreast
print("Copying INbreast...")
!cp -r /kaggle/input/inbreast-dataset/* datasets/

# Copy MIAS
print("Copying MIAS...")
!cp -r /kaggle/input/mias-mammography/* datasets/

print("Dataset copy complete!")

In [None]:
# Cell 6: Convert datasets to COCO/YOLO format
# This will process all 163GB + other datasets
# Runs the repository's convert_dataset.py with no arguments from the current
# working directory.
# NOTE(review): a `!` shell call does not raise when the script exits with an
# error, so the completion message below is printed either way - check the
# script's own output before moving on.
!python convert_dataset.py
print("\nDataset conversion complete!")

In [None]:
# Cell 7: Check dataset statistics
import json

# For each split, load `<split>.json` from the current directory and report the
# sizes of its 'images', 'annotations' and 'categories' entries.
# NOTE(review): presumably these files are produced by convert_dataset.py -
# confirm their location if this cell raises FileNotFoundError.
for split in ('train', 'val', 'test'):
    with open(f'{split}.json') as split_file:
        data = json.load(split_file)

    print(f"\n{split.upper()} Dataset:")
    for label, key in (
        ('Images', 'images'),
        ('Annotations', 'annotations'),
        ('Categories', 'categories'),
    ):
        print(f"  {label}: {len(data[key])}")

In [None]:
# Cell 8: Update training parameters for 16GB GPU
# Patches the hard-coded loader settings in detectron.py in place, then checks
# that every setting really ended up at its target value. A plain
# str.replace() would silently do nothing if the expected text were missing
# and the cell would still claim success.
import re

# (setting name, value expected in the fresh checkout, value to train with)
overrides = [
    ('batch_size', 1, 8),   # Increase batch size from 1 to 8 (you have 16GB GPU now!)
    ('num_workers', 2, 8),
]

with open('detectron.py', 'r') as f:
    original = f.read()

content = original
missing = []
for name, old, new in overrides:
    # (?!\d) stops a prefix match from corrupting e.g. 'batch_size = 16'
    # into 'batch_size = 86'.
    content = re.sub(rf'{name} = {old}(?!\d)', f'{name} = {new}', content)
    # Present after substitution => patched now, or already patched by an
    # earlier run of this cell (so re-running is harmless).
    if not re.search(rf'{name} = {new}(?!\d)', content):
        missing.append(f'{name} = {old}')

# Only rewrite the file when something actually changed.
if content != original:
    with open('detectron.py', 'w') as f:
        f.write(content)

if missing:
    print("WARNING: could not find " + ", ".join(repr(m) for m in missing)
          + " in detectron.py; those settings were left unchanged.")
else:
    print("Updated batch_size=8 and num_workers=8 for faster training!")

In [None]:
# Cell 9: Start training
# This will take 10-15 hours on full dataset
# Invokes the repository's detectron.py with `-c train` from the current
# working directory.
# NOTE(review): presumably checkpoints are written under output/ (the
# evaluation cell reads output/model_final.pth) - confirm in detectron.py.
!python detectron.py -c train

In [None]:
# Cell 10: Evaluate model
# Invokes detectron.py with `-c evaluate`, passing output/model_final.pth via
# `-w` as the weights file to evaluate.
!python detectron.py -c evaluate -w output/model_final.pth

In [None]:
# Cell 11: Download trained model
# Stage the trained artifacts in /kaggle/working so they can be downloaded.
from IPython.display import FileLink
import shutil

# (source inside the repo checkout, destination in the Kaggle working dir)
export_pairs = (
    ('output/model_final.pth', '/kaggle/working/model_final.pth'),
    ('detectron.cfg.pkl', '/kaggle/working/detectron.cfg.pkl'),
)

# Copy the weights first, then the pickled config; a missing source raises.
for source_path, target_path in export_pairs:
    shutil.copy(source_path, target_path)

print("Model saved! Download from Kaggle Output section")
print("Files: model_final.pth, detectron.cfg.pkl")