In [1]:
!pip install -q pydicom nibabel

In [None]:
import os
import shutil
from pathlib import Path
import zipfile
import json
from tqdm import tqdm
import SimpleITK as sitk
import numpy as np
from kaggle.api.kaggle_api_extended import KaggleApi
import nibabel as nib

In [2]:
# Where the Kaggle API token comes from (attached input dataset) and where it
# gets installed (the kaggle config directory).
kaggle_config_dir = "/root/.config/kaggle"
kaggle_token_source = "/kaggle/input/kaggle-json/kaggle.json"
kaggle_token_target = "/root/.config/kaggle/kaggle.json"

# Create the config directory if it is missing
os.makedirs(kaggle_config_dir, exist_ok=True)

# Install the token, then restrict it to owner read/write
shutil.copy(kaggle_token_source, kaggle_token_target)
os.chmod(kaggle_token_target, 0o600)

# Import and authenticate
# NOTE(review): this import sits after the token copy, presumably because
# importing the kaggle package already looks for credentials -- confirm before
# moving it into the import cell.
from kaggle.api.kaggle_api_extended import KaggleApi

api = KaggleApi()
api.authenticate()

In [3]:
# --- CONFIGURATION ---
# Target Kaggle dataset, combined below into "<owner>/<slug>". Both parts must
# be filled in before running: left empty, dataset_id evaluates to "/".
dataset_owner = "" # kaggle username
dataset_slug = "" # dataset
dataset_id = f"{dataset_owner}/{dataset_slug}"
# Size threshold in GB: the upload loop closes the current batch once adding
# the next NIfTI file would push the batch above this value.
max_batch_size_gb = 12.0 

# Slice of the study list handled by this run: all_study_dirs[start_idx:end_idx],
# so end_idx is exclusive. Adjust both values per run.
start_idx = 1900 # original note: "0-54, 54-106" -- presumably ranges of earlier runs
end_idx =  2000 # exclusive upper bound (adjust as needed)
# Number used in the ZIP file name (batch_<batch_index>.zip).
batch_index = 1  

In [4]:
# Authenticated Kaggle client used by the upload steps
api = KaggleApi()
api.authenticate()

# Scratch locations under the Kaggle working directory
working_dir = Path("/kaggle/working")
nifti_output_dir = working_dir.joinpath("nifti_temp")
zip_dir = working_dir.joinpath("nifti_zips")
for scratch_dir in (nifti_output_dir, zip_dir):
    scratch_dir.mkdir(exist_ok=True)

# Read the JSON list of DICOM study directories (adjust path as needed)
study_list_file = Path("/kaggle/input/kaggle-json/study_paths.json")
all_study_dirs = json.loads(study_list_file.read_text())

print(len(all_study_dirs))
# Keep only the studies selected for this run (end_idx is exclusive)
study_dirs = all_study_dirs[start_idx:end_idx]
print(len(study_dirs))

4711
100


In [5]:
def load_dicom_series(dicom_folder):
    """Read the DICOM series found in ``dicom_folder`` with SimpleITK.

    The folder is handed to GDCM to obtain the series' file names, and those
    files are then read by an ``ImageSeriesReader``.

    Args:
        dicom_folder: Directory containing the DICOM files (str or Path).

    Returns:
        The image produced by the reader's ``Execute()`` call.
    """
    series_reader = sitk.ImageSeriesReader()
    series_reader.SetFileNames(
        series_reader.GetGDCMSeriesFileNames(str(dicom_folder))
    )
    return series_reader.Execute()

def save_compressed_nifti(sitk_image, output_path, compress=True):
    """Save a SimpleITK image as NIfTI using NiBabel, keeping its geometry.

    For a 3D scalar image the voxel array is reordered from SimpleITK's
    (z, y, x) numpy layout to the (x, y, z) layout NIfTI expects, and the
    affine is built from the image's spacing, origin and direction so the
    file carries the real voxel size and orientation. Any other image
    (non-3D, or multi-component) is written as before: array untouched,
    identity affine.

    Args:
        sitk_image: SimpleITK image to write.
        output_path: Destination path (str or Path).
        compress: When True, ``.nii.gz`` is appended to ``output_path`` unless
            it already ends with it. When False the path is used as given;
            NiBabel decides on gzip compression from the file suffix either
            way.

    Returns:
        None.
    """
    voxels = sitk.GetArrayFromImage(sitk_image)

    if sitk_image.GetDimension() == 3 and voxels.ndim == 3:
        # numpy view of a SimpleITK image is indexed (z, y, x); NIfTI wants (x, y, z).
        voxels = np.transpose(voxels, (2, 1, 0))

        spacing = np.asarray(sitk_image.GetSpacing(), dtype=float)
        origin = np.asarray(sitk_image.GetOrigin(), dtype=float)
        # GetDirection() is the 3x3 direction-cosine matrix flattened row by row.
        direction = np.asarray(sitk_image.GetDirection(), dtype=float).reshape(3, 3)

        # world = origin + direction @ (spacing * index)
        affine = np.eye(4)
        affine[:3, :3] = direction * spacing  # scales column j by spacing[j]
        affine[:3, 3] = origin
        # SimpleITK/DICOM world coordinates are LPS, NIfTI uses RAS:
        # negate the x and y world axes (rotation part and translation).
        affine[:2, :] *= -1
    else:
        # Geometry mapping above is only defined for 3D scalar volumes.
        affine = np.eye(4)

    nii = nib.Nifti1Image(voxels, affine)

    # Ensure path ends with .nii.gz for compression
    if compress and not str(output_path).endswith('.nii.gz'):
        output_path = str(output_path) + '.nii.gz'

    nib.save(nii, str(output_path))  # Compression is automatic with .nii.gz suffix


def convert_to_nifti(dicom_folder, output_dir):
    """Convert one DICOM series folder into a ``.nii.gz`` file.

    The output file is named after the last two components of
    ``dicom_folder``: ``<parent>_<folder>.nii.gz``.

    Args:
        dicom_folder: Directory holding the DICOM series (str or Path).
        output_dir: Directory the NIfTI file is written to (str or Path).

    Returns:
        Path of the written file, or None when the series cannot be read or
        the resulting image has a zero-length axis, so the caller can skip it.
    """
    try:
        sitk_image = load_dicom_series(dicom_folder)
    except RuntimeError:
        # SimpleITK reports reader failures (for instance a folder without any
        # DICOM series) as RuntimeError. Signal "skip" instead of aborting the
        # whole conversion run.
        return None

    # An image with any empty axis holds no voxels.
    if 0 in sitk_image.GetSize():
        return None

    # Generate filename
    parts = Path(dicom_folder).parts
    name = f"{parts[-2]}_{parts[-1]}.nii.gz"  # Explicit .nii.gz suffix
    out_path = Path(output_dir) / name  # Path() so a plain string also works

    save_compressed_nifti(sitk_image, out_path)
    return out_path

In [6]:
# Directory handed to the Kaggle upload; it holds the batch_*.zip archives
batch_dir = Path("/kaggle/working/nifti_zips")
# Dataset metadata file, stored next to the archives
metadata_path = batch_dir.joinpath("dataset-metadata.json")

In [None]:
def _write_batch_zip(nifti_files, index):
    """Bundle ``nifti_files`` into ``zip_dir / batch_<index>.zip``; return the ZIP path."""
    zip_path = zip_dir / f"batch_{index}.zip"
    with zipfile.ZipFile(zip_path, "w") as zipf:
        for nifti_file in nifti_files:
            zipf.write(nifti_file, arcname=nifti_file.name)
    print(f"Created ZIP {zip_path.name}")
    return zip_path


def _write_dataset_metadata():
    """Write (or overwrite) the dataset metadata file at ``metadata_path``."""
    metadata = {
        "title": "Nifti Data",
        "id": dataset_id,
        "licenses": [{"name": "CC0-1.0"}]
    }
    with open(metadata_path, "w") as metadata_file:
        json.dump(metadata, metadata_file, indent=2)


def _upload_new_version(version_notes):
    """Publish the contents of ``batch_dir`` as a new version of the dataset."""
    api.dataset_create_version(
        folder=str(batch_dir),
        version_notes=version_notes,
        delete_old_versions=False,
        convert_to_csv=False,
    )


converted = []
batch_size = 0
# Local counter so every ZIP written during this run gets its own file name;
# the configured batch_index itself is left untouched.
current_batch_index = batch_index
# Absolute index (into all_study_dirs) of the first study in the open batch.
batch_first_study = start_idx

for study_number, study_path in enumerate(tqdm(study_dirs), start=start_idx):
    nifti_path = convert_to_nifti(study_path, nifti_output_dir)
    if nifti_path is None:
        print(f"Skipping {study_path} (empty or invalid series)")
        continue

    size_gb = nifti_path.stat().st_size / (1024 ** 3)
    # "converted and ..." keeps a single oversized file from triggering an
    # upload of an empty ZIP.
    if converted and batch_size + size_gb > max_batch_size_gb:
        print(f"Batch size limit reached ({batch_size:.2f} GB). Creating ZIP and uploading...")

        _write_batch_zip(converted, current_batch_index)
        # The Kaggle client reads dataset-metadata.json from the upload folder,
        # so it has to exist before every upload, not only the last one.
        _write_dataset_metadata()
        _upload_new_version(
            f"Upload batch {current_batch_index} "
            f"(Studies {batch_first_study} to {study_number - 1})"
        )
        print(f"Uploaded batch {current_batch_index} to Kaggle.")

        # Clean up the loose NIfTI files; they now live inside the ZIP.
        # NOTE(review): the ZIP itself stays in the upload folder so later
        # versions still contain it -- confirm the working disk can hold all
        # ZIPs produced by one run.
        for nifti_file in converted:
            nifti_file.unlink()
        converted = []
        batch_size = 0
        current_batch_index += 1
        batch_first_study = study_number

    converted.append(nifti_path)
    batch_size += size_gb

# Final leftover batch
if converted:
    _write_batch_zip(converted, current_batch_index)

    # STEP 1: Create metadata file (overwrite if needed)
    _write_dataset_metadata()

    # STEP 2: Confirm folder structure
    print("Files in upload folder:")
    for entry in batch_dir.iterdir():
        print(" -", entry.name)

    # STEP 3: Upload to Kaggle
    print(f"\nUploading to Kaggle dataset: {dataset_id}")
    _upload_new_version("Added NIfTI zip batch uploads")
    print("Upload complete!")

print("All batches processed and uploaded.")

ImageSeriesReader (0x10900d70): Non uniform sampling or missing slices detected,  maximum nonuniformity:0.000994872

ImageSeriesReader (0x10900d70): Non uniform sampling or missing slices detected,  maximum nonuniformity:0.000696162

ImageSeriesReader (0x10900d70): Non uniform sampling or missing slices detected,  maximum nonuniformity:0.000998117

ImageSeriesReader (0x10900d70): Non uniform sampling or missing slices detected,  maximum nonuniformity:0.000576884

 76%|███████▌  | 76/100 [29:57<09:10, 22.93s/it]