どうやら以下のことが言える模様。
- `shuffle` を False にすると決定的になる (`random_state` 指定するとエラー)
- `shuffle` を True にし `random_state` にシードを設定しないと非決定的
- `shuffle` を True にし `random_state` にシードを設定すると決定的

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedGroupKFold

In [13]:
df = pd.DataFrame(
    {'y': [0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3],
     'groups': [0, 0, 1, 1, 2, 3, 3, 3, 4, 5, 5, 5, 6, 7, 7, 7]
    }
)
df[['x1', 'x2', 'x3']] = np.random.rand(df.shape[0], 3)
df = df.sample(frac=1.0).reset_index(drop=True)
df

Unnamed: 0,y,groups,x1,x2,x3
0,1,3,0.754404,0.106129,0.701739
1,3,7,0.326706,0.368672,0.257826
2,1,3,0.975612,0.908864,0.95197
3,1,2,0.50047,0.434404,0.305197
4,0,0,0.361858,0.802651,0.304794
5,3,7,0.09907,0.759912,0.153225
6,1,3,0.560227,0.482793,0.670281
7,0,0,0.431473,0.962507,0.650751
8,1,4,0.614938,0.789203,0.416457
9,2,5,0.380186,0.014565,0.550679


In [44]:
df['y'].value_counts()

1    5
0    4
2    4
3    3
Name: y, dtype: int64

In [16]:
X = df.drop(columns=['x1', 'x2', 'x3'])
y = df['y']
groups = df['groups']

In [36]:
for i in range(100):
    cv = StratifiedGroupKFold(shuffle=False, n_splits=3)
    indice = [(train_idx, test_idx) for (train_idx, test_idx) in cv.split(X, y, groups)]
    # If it is not first loop, compare indice with previous ones then raise ValueError if they are not same
    if i == 0:
        previous_indice = indice
    else:
        for (train_idx, test_idx), (previous_train_idx, previous_test_idx) in zip(indice, previous_indice):
            if (train_idx != previous_train_idx).sum() > 0 or (test_idx != previous_test_idx).sum() > 0:
                raise ValueError(i, (train_idx, test_idx), (previous_train_idx, previous_test_idx))

In [37]:
for i in range(100):
    cv = StratifiedGroupKFold(shuffle=True, n_splits=3)
    indice = [(train_idx, test_idx) for (train_idx, test_idx) in cv.split(X, y, groups)]
    if i == 0:
        previous_indice = indice
    else:
        for (train_idx, test_idx), (previous_train_idx, previous_test_idx) in zip(indice, previous_indice):
            if (train_idx != previous_train_idx).sum() > 0 or (test_idx != previous_test_idx).sum() > 0:
                raise ValueError(i, (train_idx, test_idx), (previous_train_idx, previous_test_idx))

  if (train_idx != previous_train_idx).sum() > 0 or (test_idx != previous_test_idx).sum() > 0:


AttributeError: 'bool' object has no attribute 'sum'

In [39]:
train_idx

array([ 1,  3,  5,  8, 10, 13, 14, 15])

In [40]:
previous_train_idx

array([ 1,  3,  5,  8,  9, 10, 11, 12, 14, 15])

多分 `shuffle` したせいで区切り位置が変わり fold 毎の件数が変わっている

In [41]:
for i in range(100):
    cv = StratifiedGroupKFold(shuffle=True, n_splits=3, random_state=0)
    indice = [(train_idx, test_idx) for (train_idx, test_idx) in cv.split(X, y, groups)]
    if i == 0:
        previous_indice = indice
    else:
        for (train_idx, test_idx), (previous_train_idx, previous_test_idx) in zip(indice, previous_indice):
            if (train_idx != previous_train_idx).sum() > 0 or (test_idx != previous_test_idx).sum() > 0:
                raise ValueError(i, (train_idx, test_idx), (previous_train_idx, previous_test_idx))

In [42]:
for i in range(100):
    cv = StratifiedGroupKFold(shuffle=True, n_splits=3, random_state=1)
    indice = [(train_idx, test_idx) for (train_idx, test_idx) in cv.split(X, y, groups)]
    if i == 0:
        previous_indice = indice
    else:
        for (train_idx, test_idx), (previous_train_idx, previous_test_idx) in zip(indice, previous_indice):
            if (train_idx != previous_train_idx).sum() > 0 or (test_idx != previous_test_idx).sum() > 0:
                raise ValueError(i, (train_idx, test_idx), (previous_train_idx, previous_test_idx))

In [43]:
for i in range(100):
    cv = StratifiedGroupKFold(shuffle=True, n_splits=3, random_state=2)
    indice = [(train_idx, test_idx) for (train_idx, test_idx) in cv.split(X, y, groups)]
    if i == 0:
        previous_indice = indice
    else:
        for (train_idx, test_idx), (previous_train_idx, previous_test_idx) in zip(indice, previous_indice):
            if (train_idx != previous_train_idx).sum() > 0 or (test_idx != previous_test_idx).sum() > 0:
                raise ValueError(i, (train_idx, test_idx), (previous_train_idx, previous_test_idx))