# Download CBIS-DDSM Directly to Kaggle (163GB)
This notebook downloads the dataset directly into Kaggle storage, so nothing has to pass through your local machine.

In [None]:
# Cell 1: Install TCIA Utils
!pip install tcia-utils requests

In [None]:
# Cell 2: Import libraries
from tcia_utils import nbia
import os

# Destination folder for the collection. It is created up front (missing
# parents included) and left as-is when it already exists.
download_dir = '/kaggle/working/CBIS-DDSM-Full'
os.makedirs(name=download_dir, exist_ok=True)
print(f"Will download to: {download_dir}")

In [None]:
# Cell 3: Download CBIS-DDSM in SMALL BATCHES and monitor space
# Strategy: Download 500 series at a time (about 12GB) to stay under 57GB limit
#
# The loop stops early (without raising) when free disk space runs low or a
# batch raises, and the closing message reports which outcome actually happened.
import subprocess
import shutil

collection = "CBIS-DDSM"
print(f"Downloading collection: {collection}")

# Get series data.
# NOTE(review): getSeries appears to return None rather than a list when the
# query fails or matches nothing - confirm against the installed tcia_utils.
# Falling back to an empty list keeps len() below from raising TypeError.
series_data = nbia.getSeries(collection=collection) or []
total_series = len(series_data)
print(f"Found {total_series} series to download")

# Extract SeriesInstanceUIDs
series_uids = [item['SeriesInstanceUID'] for item in series_data]

# Download in SMALLER batches to fit in /kaggle/working/ 57GB limit
batch_size = 500  # Reduced from 1000 - approximately 12GB per batch
min_free_gb = 15  # A batch is only started while at least this much is free
total_batches = (total_series + batch_size - 1) // batch_size  # ceiling division

print(f"\nDownloading in {total_batches} batches of {batch_size} series each")
print("Each batch ~12GB to stay under Kaggle's 57GB working directory limit")
print("⚠️ IMPORTANT: After this completes, IMMEDIATELY save as dataset!\n")

series_done = 0     # series covered by batches whose download call returned normally
stop_reason = None  # stays None unless the loop ends early

for batch_num in range(total_batches):
    start_idx = batch_num * batch_size
    end_idx = min(start_idx + batch_size, total_series)
    batch_uids = series_uids[start_idx:end_idx]

    # Check disk space before downloading, on the filesystem that holds the
    # download directory.
    disk_usage = shutil.disk_usage(download_dir)
    available_gb = disk_usage.free / (1024**3)
    used_gb = disk_usage.used / (1024**3)

    print(f"Batch {batch_num + 1}/{total_batches}: Downloading series {start_idx + 1} to {end_idx}")
    print(f"  Disk: {used_gb:.1f}GB used, {available_gb:.1f}GB available")

    if available_gb < min_free_gb:  # Need at least 15GB free for safety
        print(f"  ⚠️ WARNING: Low disk space! Stopping at batch {batch_num + 1}")
        print(f"  Downloaded {series_done} of {total_series} series so far")
        print(f"  You need to SAVE THIS AS DATASET NOW, then continue in a new notebook")
        stop_reason = "low disk space"
        break

    try:
        nbia.downloadSeries(
            series_data=batch_uids,
            input_type="list",
            path=download_dir
        )
        # Reached only when downloadSeries returned without raising.
        series_done = end_idx
        print(f"  ✓ Batch {batch_num + 1} complete\n")
    except Exception as e:
        print(f"  ❌ Error in batch {batch_num + 1}: {e}")
        print(f"  Downloaded {series_done} of {total_series} series")
        print(f"  SAVE AS DATASET NOW before continuing!")
        stop_reason = f"error in batch {batch_num + 1}"
        break

# Report what actually happened instead of always claiming success.
if total_series == 0:
    print("\n⚠️ No series found for this collection - nothing was downloaded.")
elif stop_reason is None:
    print("\n✓ Download phase complete!")
    print(f"⚠️ NEXT: Immediately save this as a Kaggle Dataset (see Cell 5)")
else:
    print(f"\n⚠️ Download stopped early ({stop_reason}): {series_done} of {total_series} series downloaded")
    print(f"⚠️ NEXT: Immediately save this as a Kaggle Dataset (see Cell 5)")

In [None]:
# Cell 4: Check download size
import subprocess

# Run `du -sh` on the download directory and capture both output streams, so a
# failure is reported instead of showing up as an empty size.
result = subprocess.run(['du', '-sh', download_dir], capture_output=True, text=True)

size_report = result.stdout.strip()
if size_report:
    print(f"Downloaded size: {size_report}")
if result.returncode != 0:
    # du can exit non-zero with or without a size on stdout; surface its error
    # text either way.
    print(f"du reported a problem (exit code {result.returncode}): {result.stderr.strip()}")

In [None]:
# Cell 5: Create Kaggle Dataset from downloaded files
# The downloaded files only become reusable once the notebook output has been
# saved as a Kaggle Dataset:
#   File > Save Version > Save & Run All
#   File > Create Dataset from Notebook Output

# Reminder text shown to whoever runs the notebook.
next_steps_message = """\n
NEXT STEPS:
1. Click 'File' > 'Save Version' > 'Save & Run All'
2. Wait for notebook to finish running
3. Click 'File' > 'Create Dataset from Notebook Output'
4. Name it: 'CBIS-DDSM Full Collection'
5. This creates a reusable dataset from the downloaded files
"""
print(next_steps_message)

## After Creating Dataset Above, Use This Training Notebook:

In [None]:
# Cell 6: Clone breast cancer detection repo
# Fetches the repository, then moves the notebook's working directory into it.
# %cd (unlike !cd) keeps the new directory for later cells, which refer to
# datasets/, convert_dataset.py, detectron.py and output/ by relative path.
!git clone https://github.com/monajemi-arman/breast_cancer_detection
%cd breast_cancer_detection

In [None]:
# Cell 7: Install detectron2
!pip install detectron2 -f https://dl.fbaipublicfiles.com/detectron2/wheels/cu118/torch2.0/index.html
!pip install cloudpickle pydicom xmltodict opencv-python pandas scikit-learn

In [None]:
# Cell 8: Setup datasets
# Add these as data sources to your notebook:
# 1. Your created dataset: cbis-ddsm-full-collection
# 2. ramanathansp20/inbreast-dataset
# 3. kmader/mias-mammography
#
# Each source is read from /kaggle/input/<name> and its contents are copied
# into a local datasets/ folder (created if missing).
# NOTE(review): all three sources land in the same datasets/ folder, so entries
# with the same name would overwrite each other - confirm the layout that
# convert_dataset.py expects.

!mkdir -p datasets
!cp -r /kaggle/input/cbis-ddsm-full-collection/* datasets/
!cp -r /kaggle/input/inbreast-dataset/* datasets/
!cp -r /kaggle/input/mias-mammography/* datasets/

In [None]:
# Cell 9: Convert datasets
# Runs the repository's conversion script from the current working directory.
# NOTE(review): presumably it reads the files copied into datasets/ - verify
# against the script.
!python convert_dataset.py

In [None]:
# Cell 10: Update batch size for 16GB GPU
# Rewrites two settings inside detectron.py in place and reports what changed.
import fileinput
import re

# setting name -> (value expected in detectron.py, value to write instead)
overrides = {
    'batch_size': ('1', '8'),
    'num_workers': ('2', '8'),
}

with open('detectron.py', 'r') as f:
    content = f.read()

total_hits = 0
for setting, (old_value, new_value) in overrides.items():
    # The lookahead stops "batch_size = 1" from matching the start of a longer
    # value such as "batch_size = 16", which a plain str.replace would turn
    # into "batch_size = 86".
    pattern = re.escape(f'{setting} = {old_value}') + r'(?![\w.])'
    content, hits = re.subn(pattern, f'{setting} = {new_value}', content)
    print(f"{setting}: replaced {hits} occurrence(s) of '{old_value}' with '{new_value}'")
    total_hits += hits

if total_hits:
    with open('detectron.py', 'w') as f:
        f.write(content)
    print("Updated for faster training!")
else:
    # Nothing matched: either the file was already updated or its layout
    # differs, so it is left untouched rather than reporting a false success.
    print("⚠️ No matching settings found in detectron.py - file left unchanged")

In [None]:
# Cell 11: Start training
# Launches detectron.py with "-c train" from the current working directory.
# NOTE(review): a failing !command does not raise in the notebook, so check
# this cell's output before running the cells that follow.
!python detectron.py -c train

In [None]:
# Cell 12: Download trained model
# Copies the training artifacts into /kaggle/working/ so they can be fetched
# from the notebook's Output section.
import shutil

for artifact in ('output/model_final.pth', 'detectron.cfg.pkl'):
    shutil.copy(artifact, '/kaggle/working/')
print("Download model from Output section!")