In [19]:
import pandas as pd
import numpy as np
import random
from sklearn.model_selection import GroupKFold
from collections import defaultdict

In [26]:
def generate_data(num_samples_per_group=20, num_groups=10):
    names = [f'patient_{i+1}' for i in range(num_groups)]
    data = []
    for name in names:
        for i in range(num_samples_per_group):
            data.append({
                'Name': name,
                'ImageId': f'{name}_{i+1:03d}',
                'CancerType': random.choice(['benign', 'malignant'])
            })
    return pd.DataFrame(data)

In [27]:
def stratified_group_k_fold(X, y, groups, k, random_state=None):
    np.random.seed(random_state)
    labels_num = np.unique(y)
    y_integers = pd.Series(y).map({n: i for i, n in enumerate(labels_num)}).values

    group_k_fold = GroupKFold(n_splits=k)
    folds = defaultdict(lambda: ([], []))
    
    for fold_index, (train_indices, test_indices) in enumerate(group_k_fold.split(X, y, groups)):
        group_train_labels = pd.Series(y).iloc[train_indices].values
        group_train_indices = np.arange(len(train_indices))

        np.random.shuffle(group_train_indices)
        group_train_labels = group_train_labels[group_train_indices]

        label_distribution = {label: np.sum(group_train_labels == label) for label in np.unique(group_train_labels)}

        label_folds = {label: [] for label in np.unique(group_train_labels)}

        for label, count in label_distribution.items():
            indices = np.where(group_train_labels == label)[0]
            np.random.shuffle(indices)
            fold_sizes = [count // k] * k
            for i in range(count % k):
                fold_sizes[i] += 1

            current = 0
            for fold_index in range(k):
                label_folds[label].append(train_indices[group_train_indices[indices[current:current + fold_sizes[fold_index]]]])
                current += fold_sizes[fold_index]

        for fold_index in range(k):
            fold_indices = []
            for label in label_folds:
                fold_indices.extend(label_folds[label][fold_index])
            np.random.shuffle(fold_indices)
            folds[fold_index][0].extend(fold_indices)
            folds[fold_index][1].extend(test_indices)

    return [(np.array(train), np.array(test)) for train, test in folds.values()]

In [28]:
df = generate_data(num_samples_per_group=20, num_groups=10)
df.to_csv('../data/CancerPrediction.csv', index=False)
X = df[['Name', 'ImageId']]
y = df['CancerType']
groups = df['Name']

In [29]:
folds = stratified_group_k_fold(X, y, groups, k=5, random_state=42)

for fold_index, (train_idx, test_idx) in enumerate(folds):
    train_groups = groups.iloc[train_idx].unique()
    test_groups = groups.iloc[test_idx].unique()

    train_classes = y.iloc[train_idx]
    test_classes = y.iloc[test_idx]

    print(f"Fold {fold_index + 1}")
    print("TRAIN groups:", train_groups)
    print("TRAIN class distribution:", train_classes.value_counts(normalize=True))
    print("TEST groups:", test_groups)
    print("TEST class distribution:", test_classes.value_counts(normalize=True))
    print()


Fold 1
TRAIN groups: ['patient_7' 'patient_6' 'patient_8' 'patient_1' 'patient_10' 'patient_2'
 'patient_5' 'patient_3' 'patient_9' 'patient_4']
TRAIN class distribution: CancerType
benign       0.506098
malignant    0.493902
Name: proportion, dtype: float64
TEST groups: ['patient_4' 'patient_9' 'patient_3' 'patient_8' 'patient_2' 'patient_7'
 'patient_6' 'patient_10' 'patient_1' 'patient_5']
TEST class distribution: CancerType
benign       0.505
malignant    0.495
Name: proportion, dtype: float64

Fold 2
TRAIN groups: ['patient_8' 'patient_7' 'patient_6' 'patient_3' 'patient_10' 'patient_5'
 'patient_1' 'patient_2' 'patient_4' 'patient_9']
TRAIN class distribution: CancerType
benign       0.506173
malignant    0.493827
Name: proportion, dtype: float64
TEST groups: ['patient_4' 'patient_9' 'patient_3' 'patient_8' 'patient_2' 'patient_7'
 'patient_6' 'patient_10' 'patient_1' 'patient_5']
TEST class distribution: CancerType
benign       0.505
malignant    0.495
Name: proportion, dtype: f