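Create a patient-level data split. Splitting by patient ID rather than by random slice keeps every slice from a given patient in a single split, so the test set measures generalization to unseen patients. Note: the code below does not read a real patient identifier; it treats each run of 50 consecutive rows as one patient, so the split is only truly patient-level if the dataset is ordered that way.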

In [1]:
from datasets import load_dataset
import json
import os
from collections import defaultdict
import numpy as np

# Load the dataset; with the "numpy" format, row and column access return NumPy values
ds = load_dataset("Alwaly/Kidney_Cancer-cancer")
train_data = ds['train']
train_data.set_format("numpy")

# Approximate patient IDs.
# WARNING: no patient identifier is read from the dataset here. Every run of
# SLICES_PER_PATIENT consecutive rows is treated as one pseudo-patient. That is
# only a real patient-level grouping if the rows are stored patient by patient
# with exactly that many slices each.
# TODO(review): confirm against the dataset's metadata/filenames and replace
# this positional proxy if a real patient ID is available.
SLICES_PER_PATIENT = 50

# Read only the label column: indexing train_data[i] returns the whole row, so
# looping over rows would also fetch every image just to obtain one label.
labels = list(train_data['label'])
patient_ids = [i // SLICES_PER_PATIENT for i in range(len(train_data))]

# Group row indices by pseudo-patient
patient_groups = defaultdict(list)
for idx, pid in enumerate(patient_ids):
    patient_groups[pid].append(idx)

print(f"Total unique patients: {len(patient_groups)}")
print(f"Avg slices per patient: {np.mean([len(v) for v in patient_groups.values()]):.1f}")

# Shuffle the patient IDs with a fixed seed, then cut them 70/15/15 into
# train/val/test. Splitting happens on patient IDs, never on individual rows.
np.random.seed(42)
unique_patients = sorted(patient_groups.keys())
np.random.shuffle(unique_patients)

n_patients = len(unique_patients)
split_1 = int(0.7 * n_patients)
split_2 = int(0.85 * n_patients)

train_patients, val_patients, test_patients = (
    unique_patients[:split_1],
    unique_patients[split_1:split_2],
    unique_patients[split_2:],
)

# Expand each patient list into the row indices that belong to those patients
train_indices = [row for patient in train_patients for row in patient_groups[patient]]
val_indices = [row for patient in val_patients for row in patient_groups[patient]]
test_indices = [row for patient in test_patients for row in patient_groups[patient]]

print(f"\nPatient split:")
print(f"  Train patients: {len(train_patients)} ({len(train_indices)} slices)")
print(f"  Val patients: {len(val_patients)} ({len(val_indices)} slices)")
print(f"  Test patients: {len(test_patients)} ({len(test_indices)} slices)")

# Persist the split indices together with the patient -> row-index mapping.
# Patient IDs are written as string keys in the JSON output.
serializable_groups = {}
for patient_key, row_indices in patient_groups.items():
    serializable_groups[str(patient_key)] = row_indices

splits_dict = {
    'train_indices': train_indices,
    'val_indices': val_indices,
    'test_indices': test_indices,
}
splits_dict['patient_groups'] = serializable_groups

os.makedirs('data', exist_ok=True)
with open('data/splits.json', 'w') as splits_file:
    splits_file.write(json.dumps(splits_dict))

print("\nSplits saved to data/splits.json")

  from .autonotebook import tqdm as notebook_tqdm


Total unique patients: 200
Avg slices per patient: 50.0

Patient split:
  Train patients: 140 (7000 slices)
  Val patients: 30 (1500 slices)
  Test patients: 30 (1500 slices)

Splits saved to data/splits.json


In [2]:
import numpy as np
from PIL import Image
import os
import json
from tqdm import tqdm

# Read back the index lists written to data/splits.json
with open('data/splits.json', 'r') as splits_file:
    splits = json.load(splits_file)

train_indices, val_indices, test_indices = (
    splits[key] for key in ('train_indices', 'val_indices', 'test_indices')
)

# One output directory per split
for split_dir in ('data/train', 'data/val', 'data/test'):
    os.makedirs(split_dir, exist_ok=True)

# Edge length, in pixels, of the square images written to disk
target_size = 224

def process_and_save(indices, split_name):
    """Resize the selected samples and write them, plus a label manifest, to disk.

    Each sample is read from the notebook-level ``train_data``. Its image is
    resized to ``target_size`` x ``target_size`` with Lanczos resampling and
    saved as ``data/<split_name>/<split_name>_<idx>.png``. A list of
    ``{'filename', 'label'}`` records is then written to
    ``data/<split_name>_labels.json``.

    Args:
        indices: Row indices into ``train_data`` that belong to this split.
        split_name: Split identifier (e.g. ``'train'``); used for the output
            directory, the file-name prefix and the manifest file name.

    Returns:
        None. Images and the manifest are written as side effects.
    """
    split_dir = f'data/{split_name}'
    # Create the target directory here so the function does not depend on
    # another cell having made it already.
    os.makedirs(split_dir, exist_ok=True)

    labels_list = []

    for idx in tqdm(indices, desc=f"Processing {split_name}"):
        sample = train_data[idx]
        img_array = sample['image']
        label = sample['label']

        # With the "numpy" dataset format the label may come back as a NumPy
        # scalar or array, which json cannot serialize; convert it to the
        # equivalent native Python value. Native labels pass through unchanged.
        if isinstance(label, (np.generic, np.ndarray)):
            label = label.tolist()

        # Convert numpy array to PIL Image (cast to 8-bit first)
        img = Image.fromarray(img_array.astype(np.uint8))

        # Resize to target_size x target_size
        img_resized = img.resize((target_size, target_size), Image.Resampling.LANCZOS)

        # Save image; the row index in the name keeps files traceable to the dataset
        filename = f'{split_name}_{idx}.png'
        filepath = f'{split_dir}/{filename}'
        img_resized.save(filepath)

        labels_list.append({'filename': filename, 'label': label})

    # Save labels
    with open(f'data/{split_name}_labels.json', 'w') as f:
        json.dump(labels_list, f)

    print(f"{split_name.upper()} - Saved {len(labels_list)} images")

# Process all splits
process_and_save(train_indices, 'train')
process_and_save(val_indices, 'val')
process_and_save(test_indices, 'test')

# Report the actual split sizes instead of hard-coded numbers, so the summary
# stays correct if the dataset or the split ratios change.
print("\nAll data prepared! Structure:")
print("data/")
print(f"  ├── train/        ({len(train_indices)} images)")
print(f"  ├── val/          ({len(val_indices)} images)")
print(f"  ├── test/         ({len(test_indices)} images)")
print("  ├── train_labels.json")
print("  ├── val_labels.json")
print("  ├── test_labels.json")
print("  └── splits.json")

Processing train: 100%|██████████| 7000/7000 [00:37<00:00, 184.86it/s]


TRAIN - Saved 7000 images


Processing val: 100%|██████████| 1500/1500 [00:08<00:00, 184.72it/s]


VAL - Saved 1500 images


Processing test: 100%|██████████| 1500/1500 [00:08<00:00, 186.47it/s]

TEST - Saved 1500 images

All data prepared! Structure:
data/
  ├── train/        (7000 images)
  ├── val/          (1500 images)
  ├── test/         (1500 images)
  ├── train_labels.json
  ├── val_labels.json
  ├── test_labels.json
  └── splits.json



