# Download Datasets to Google Drive

This notebook downloads TB segmentation datasets in Google Colab and stores them in Google Drive, so they do not take up space on your local disk.

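## Steps:
1. Install dependencies
2. Mount Google Drive
3. Set up Kaggle API
4. Accept dataset terms on Kaggle
5. Download TBX11K Simplified
6. Download Shenzhen & Montgomery
7. Verify downloads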


## Step 1: Install Dependencies


In [None]:
!pip install kaggle -q


## Step 2: Mount Google Drive


In [None]:
import os
from pathlib import Path

from google.colab import drive

# Attach Google Drive to the Colab filesystem (prompts for authorization).
drive.mount('/content/drive')

# Every dataset in this notebook is stored under a single folder in "MyDrive".
drive_root = Path('/content/drive', 'MyDrive')
data_dir = drive_root.joinpath('TB_Segmentation_Data')
data_dir.mkdir(exist_ok=True)

print(f"✓ Google Drive mounted")
print(f"✓ Data directory: {data_dir}")


Mounted at /content/drive
✓ Google Drive mounted
✓ Data directory: /content/drive/MyDrive/TB_Segmentation_Data


## Step 3: Set Up Kaggle API

1. Go to https://www.kaggle.com/settings
2. Click 'Create New API Token'
3. Download kaggle.json
4. Run the cell below and upload `kaggle.json` when prompted


In [None]:
from google.colab import files
import shutil

# Open the browser upload prompt; the result is keyed by uploaded filename.
uploaded = files.upload()

# Destination for the API token: ~/.kaggle/kaggle.json
kaggle_dir = Path.home() / '.kaggle'
kaggle_dir.mkdir(exist_ok=True)
token_path = kaggle_dir / 'kaggle.json'

if 'kaggle.json' in uploaded.keys():
    # The uploaded file sits in the working directory; relocate it and
    # restrict it to owner read/write because it contains a credential.
    shutil.move('kaggle.json', str(token_path))
    os.chmod(token_path, 0o600)
    print(f"✓ Kaggle API token saved")
else:
    print("⚠️  kaggle.json not found in upload")


Saving kaggle.json to kaggle.json
✓ Kaggle API token saved


## Step 4: Accept Dataset Terms

**IMPORTANT**: Before downloading, accept terms on Kaggle:

1. **TBX11K Simplified**: https://www.kaggle.com/datasets/vbookshelf/tbx11k-simplified
   - Click "Download" → Accept terms

2. **Shenzhen & Montgomery**: https://www.kaggle.com/datasets/kmader/pulmonary-chest-xray-abnormalities
   - Click "Download" → Accept terms


## Step 5: Download TBX11K Simplified


In [None]:
from kaggle.api.kaggle_api_extended import KaggleApi
import shutil

# Authenticate with the token stored in ~/.kaggle/kaggle.json (Step 3).
api = KaggleApi()
api.authenticate()

# Download TBX11K Simplified
print("Downloading TBX11K Simplified...")
print("This may take 30-60 minutes depending on your connection...")

target_dir = data_dir / 'tbx11k'
target_dir.mkdir(parents=True, exist_ok=True)

# Stage the download on the Colab VM's local disk, then copy it to Drive.
# Start from an empty staging directory so leftovers from an earlier,
# interrupted run are never copied to Drive.
temp_dir = Path('/content/temp_tbx11k')
shutil.rmtree(temp_dir, ignore_errors=True)
temp_dir.mkdir(parents=True, exist_ok=True)

try:
    api.dataset_download_files(
        'vbookshelf/tbx11k-simplified',
        path=str(temp_dir),
        unzip=True
    )

    # Copy to Drive: files are overwritten, directories are replaced wholesale.
    print("Moving to Google Drive...")
    for item in temp_dir.iterdir():
        dest = target_dir / item.name
        if item.is_file():
            shutil.copy2(item, dest)
        else:
            if dest.exists():
                shutil.rmtree(dest)
            shutil.copytree(item, dest)

    print(f"✓ TBX11K Simplified downloaded to: {target_dir}")

except Exception as e:
    # Best-effort: report the problem and let the notebook continue.
    print(f"✗ Download failed: {e}")
    print("Make sure you accepted the dataset terms on Kaggle")

finally:
    # Always free the staging space, including after a failed download or
    # copy; otherwise gigabytes of partial data stay on the Colab disk.
    shutil.rmtree(temp_dir, ignore_errors=True)


Downloading TBX11K Simplified...
This may take 30-60 minutes depending on your connection...
Dataset URL: https://www.kaggle.com/datasets/vbookshelf/tbx11k-simplified
Moving to Google Drive...
✓ TBX11K Simplified downloaded to: /content/drive/MyDrive/TB_Segmentation_Data/tbx11k


## Step 6: Download Shenzhen & Montgomery


In [None]:
# Uses `api`, `shutil`, `Path` and `data_dir` defined by the earlier cells.
print("Downloading Shenzhen & Montgomery...")

target_dir = data_dir / 'shenzhen_montgomery'
target_dir.mkdir(parents=True, exist_ok=True)

# Stage the download on the Colab VM's local disk, then copy it to Drive.
# Start from an empty staging directory so leftovers from an earlier,
# interrupted run are never copied to Drive.
temp_dir = Path('/content/temp_shenzhen_montgomery')
shutil.rmtree(temp_dir, ignore_errors=True)
temp_dir.mkdir(parents=True, exist_ok=True)

try:
    api.dataset_download_files(
        'kmader/pulmonary-chest-xray-abnormalities',
        path=str(temp_dir),
        unzip=True
    )

    # Copy to Drive: files are overwritten, directories are replaced wholesale.
    print("Moving to Google Drive...")
    for item in temp_dir.iterdir():
        dest = target_dir / item.name
        if item.is_file():
            shutil.copy2(item, dest)
        else:
            if dest.exists():
                shutil.rmtree(dest)
            shutil.copytree(item, dest)

    print(f"✓ Shenzhen & Montgomery downloaded to: {target_dir}")

except Exception as e:
    # Best-effort: report the problem and let the notebook continue.
    print(f"✗ Download failed: {e}")
    print("Make sure you accepted the dataset terms on Kaggle")

finally:
    # Always free the staging space, including after a failed download or
    # copy; otherwise gigabytes of partial data stay on the Colab disk.
    shutil.rmtree(temp_dir, ignore_errors=True)


Downloading Shenzhen & Montgomery...
Dataset URL: https://www.kaggle.com/datasets/kmader/pulmonary-chest-xray-abnormalities
Moving to Google Drive...
✓ Shenzhen & Montgomery downloaded to: /content/drive/MyDrive/TB_Segmentation_Data/shenzhen_montgomery


## Step 7: Verify Downloads


In [None]:
import os

def count_files(directory):
    """Return the number of files under `directory`, including all subdirectories.

    A path that does not exist yields 0, because `os.walk` produces no entries.
    """
    return sum(len(filenames) for _, _, filenames in os.walk(directory))

def get_size(directory):
    """Return the combined size of every file under `directory`.

    The byte total is divided by 1024**3, so the result is a float in
    gibibytes (labelled "GB" by the caller's output).
    """
    total_bytes = sum(
        os.path.getsize(os.path.join(dirpath, name))
        for dirpath, _, filenames in os.walk(directory)
        for name in filenames
    )
    return total_bytes / (1024**3)  # GB

print("Download Verification:")
print("="*60)

tbx11k_dir = data_dir / 'tbx11k'
shenzhen_montgomery_dir = data_dir / 'shenzhen_montgomery'

# The download cells create each dataset folder *before* downloading, so a
# folder that merely exists proves nothing. Require at least one file.
for label, dataset_dir in [
    ("TBX11K Simplified", tbx11k_dir),
    ("Shenzhen & Montgomery", shenzhen_montgomery_dir),
]:
    if not dataset_dir.exists():
        print(f"✗ {label} not found")
        continue

    file_count = count_files(dataset_dir)
    if file_count == 0:
        print(f"✗ {label} folder exists but is empty (download likely failed)")
        continue

    size_gb = get_size(dataset_dir)
    print(f"✓ {label}:")
    print(f"  Files: {file_count}")
    print(f"  Size: {size_gb:.2f} GB")

print("="*60)
print(f"\nAll datasets saved to: {data_dir}")
print("\nNext steps:")
print("1. Update config files to point to Google Drive location")
print("2. Or mount Drive locally to access from your project")


Download Verification:
✓ TBX11K Simplified:
  Files: 11703
  Size: 3.79 GB
✓ Shenzhen & Montgomery:
  Files: 1888
  Size: 4.09 GB

All datasets saved to: /content/drive/MyDrive/TB_Segmentation_Data

Next steps:
1. Update config files to point to Google Drive location
2. Or mount Drive locally to access from your project
