In [1]:
import numpy as np
import pandas as pd

from sklearn import datasets
from sklearn import model_selection

In [2]:
def create_folds(data, n_splits=5, random_state=None):
    """Shuffle a regression dataset and assign a stratified fold id to each row.

    The continuous ``target`` column is discretised into equal-width bins
    (bin count chosen with Sturges' rule, ``floor(1 + log2(n_rows))``) and
    those bin ids are used as the stratification labels for
    ``StratifiedKFold``.

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame; must contain a ``target`` column. The frame passed in
        is not modified.
    n_splits : int, default 5
        Number of folds to create.
    random_state : int or None, default None
        Seed forwarded to ``DataFrame.sample`` for the row shuffle. ``None``
        keeps the shuffle non-deterministic.

    Returns
    -------
    pandas.DataFrame
        A shuffled copy of ``data`` with a fresh 0..n-1 index and an integer
        ``kfold`` column giving the validation fold each row belongs to.
    """
    # Shuffle first: sample()/reset_index() return a new frame, so every
    # assignment below touches the copy and never the caller's DataFrame.
    data = data.sample(frac=1, random_state=random_state).reset_index(drop=True)
    data['kfold'] = -1

    # Sturges' rule for the number of bins.
    num_bins = int(np.floor(1 + np.log2(len(data))))

    # Keep the bin ids in a local Series rather than a temporary column, so a
    # user column that happens to be called 'bins' is not overwritten/dropped.
    bins = pd.cut(data['target'], bins=num_bins, labels=False)

    kf = model_selection.StratifiedKFold(n_splits=n_splits)

    # split() yields positional indices; they are valid .loc labels here
    # because the index was reset to 0..n-1 above.
    for fold, (_, valid_idx) in enumerate(kf.split(X=data, y=bins.values)):
        data.loc[valid_idx, 'kfold'] = fold

    return data

In [3]:
if __name__ == '__main__':
    # Fixed seed so the demo prints the same rows and fold counts on every
    # Restart & Run All.
    SEED = 42
    # create_folds shuffles with DataFrame.sample, which draws from NumPy's
    # global RNG when no random_state is supplied, so seed that as well.
    np.random.seed(SEED)

    # Synthetic regression problem: 15000 rows, 100 features, one target.
    X, y = datasets.make_regression(
        n_samples=15000, n_features=100, n_targets=1, random_state=SEED
    )

    # Wrap the feature matrix in a frame with columns f_0 .. f_99.
    df = pd.DataFrame(
        X,
        columns=[f"f_{i}" for i in range(X.shape[1])]
    )
    df.loc[:, 'target'] = y

    df = create_folds(df)
    print(df.head())
    # Number of rows assigned to each validation fold.
    print(df.kfold.value_counts())

        f_0       f_1       f_2       f_3       f_4       f_5       f_6  \
0  1.441875  0.990529 -0.089841 -0.761138 -2.162943 -0.643078  0.815571   
1  0.675871  1.219361 -1.103350 -0.757866 -0.871710  0.371006  0.966393   
2  0.669559  0.216508  0.399234  0.828717  0.155218 -1.630312 -0.628623   
3  2.665926  1.644843  0.364060  1.247800  0.618123 -0.111047  1.337880   
4  0.246123  1.209636 -1.379645  1.545741  1.053880  0.312332  0.562111   

        f_7       f_8       f_9  ...      f_92      f_93      f_94      f_95  \
0 -1.543337  1.337392  1.396891  ...  0.107236  0.864341 -1.145703  0.820905   
1  0.269284  2.076509 -0.273692  ... -0.417114 -0.067364  0.582168  0.252092   
2  0.013395 -0.209450  0.054814  ...  0.492073 -0.481061  0.887755  0.861871   
3 -0.432684 -0.286515  0.641890  ...  0.345779 -2.347182  0.857488 -0.262953   
4  0.195176 -1.353635  0.997533  ... -1.812276  1.819104 -1.503290 -0.457684   

       f_96      f_97      f_98      f_99      target  kfold  
0  0.