In [1]:
import os
import numpy as np
import h5py
from sklearn.model_selection import KFold
from dataloader import read_file
import gc
from optparse import OptionParser

## I have written my explanations as double-hash (##) comments inside the function

In [2]:
def process_and_save(folder_path, n_splits, name):
    """Merge the files found in ``folder_path`` into ``n_splits`` HDF5 files.

    The regular files directly inside ``folder_path`` are partitioned into
    ``n_splits`` groups of file indices with ``KFold`` (only the held-out
    indices of each fold are used). Every file of a group is read with
    ``read_file``; the per-file arrays are concatenated along axis 0 and
    written to ``<folder_path>/<name>/JetClass_<i>.h5`` under the dataset
    names ``data``, ``jet`` and ``pid``.

    Parameters
    ----------
    folder_path : str
        Directory containing the input files. Sub-directories (including the
        output directory created here) are ignored.
    n_splits : int
        Number of groups, i.e. number of output files. ``KFold`` rejects
        values below 2 or above the number of files with a ``ValueError``.
    name : str
        Name of the output sub-directory created inside ``folder_path``.

    Notes
    -----
    ``os.listdir`` returns entries in arbitrary order, so which files end up
    in which output file can differ between machines/runs.
    """
    # List all regular files in the folder
    ## all_files (flags.sample=train) = ['/u/phebbar/Work/Datasets/JetClass/train_100M/TTBarLep_078.root', ...]
    all_files = [
        os.path.join(folder_path, f)
        for f in os.listdir(folder_path)
        if os.path.isfile(os.path.join(folder_path, f))
    ]

    # exist_ok avoids the check-then-create race of the exists()/makedirs() pair
    output_dir = os.path.join(folder_path, name)
    os.makedirs(output_dir, exist_ok=True)

    # Split the file indices into n_splits groups
    kf = KFold(n_splits=n_splits)

    for i, (_, split_indices) in enumerate(kf.split(all_files)):
        ## i is the fold number; split_indices holds the indices (into all_files)
        ## of the files belonging to this fold; _ holds all remaining indices (unused)
        print("Running Fold {}".format(i))

        # Per-file arrays collected for this fold
        concat_X = []
        concat_y = []
        concat_j = []
        for idx in split_indices:
            ## read_file() outputs : particles_features (13) , jet_features (4) , y_labels (10)
            X, j, y = read_file(all_files[idx])

            concat_X.append(X)
            concat_y.append(y)
            concat_j.append(j)

        # Concatenate all X, j and y along the first axis
        final_X = np.concatenate(concat_X, axis=0)
        final_y = np.concatenate(concat_y, axis=0)
        final_j = np.concatenate(concat_j, axis=0)

        # The per-file arrays are no longer needed; drop them before writing
        # so only one copy of the fold's data is held during the save.
        del concat_X, concat_y, concat_j, X, j, y

        # Save to h5py file
        out_file = os.path.join(output_dir, 'JetClass_{}.h5'.format(i))
        with h5py.File(out_file, 'w') as h5f:
            h5f.create_dataset('data', data=final_X)
            h5f.create_dataset('jet', data=final_j)
            h5f.create_dataset('pid', data=final_y)

        # Release every array of this fold (including the jet features)
        # before the next fold is loaded.
        del final_X, final_y, final_j
        gc.collect()
        
# Usage

#if flags.sample == 'train':
 #   process_and_save(os.path.join(flags.folder,'train_100M/'), n_splits=1000,name='train')


In [45]:
# Location of the JetClass dataset and of its training sub-folder
data_path = "/u/phebbar/Work/Datasets/JetClass"
folder_path = os.path.join(data_path, 'train_100M')

# Full paths of every regular file directly inside the training folder
all_files = list(
    filter(
        os.path.isfile,
        (os.path.join(folder_path, f) for f in os.listdir(folder_path)),
    )
)
all_files

['/u/phebbar/Work/Datasets/JetClass/train_100M/TTBarLep_078.root',
 '/u/phebbar/Work/Datasets/JetClass/train_100M/HToWW2Q1L_081.root',
 '/u/phebbar/Work/Datasets/JetClass/train_100M/ZJetsToNuNu_097.root',
 '/u/phebbar/Work/Datasets/JetClass/train_100M/HToBB_010.root',
 '/u/phebbar/Work/Datasets/JetClass/train_100M/HToCC_021.root',
 '/u/phebbar/Work/Datasets/JetClass/train_100M/HToGG_096.root',
 '/u/phebbar/Work/Datasets/JetClass/train_100M/WToQQ_077.root',
 '/u/phebbar/Work/Datasets/JetClass/train_100M/TTBarLep_072.root',
 '/u/phebbar/Work/Datasets/JetClass/train_100M/HToWW2Q1L_035.root',
 '/u/phebbar/Work/Datasets/JetClass/train_100M/TTBar_077.root',
 '/u/phebbar/Work/Datasets/JetClass/train_100M/TTBarLep_075.root',
 '/u/phebbar/Work/Datasets/JetClass/train_100M/TTBar_076.root',
 '/u/phebbar/Work/Datasets/JetClass/train_100M/HToCC_074.root',
 '/u/phebbar/Work/Datasets/JetClass/train_100M/TTBar_099.root',
 '/u/phebbar/Work/Datasets/JetClass/train_100M/HToBB_087.root',
 '/u/phebbar/Work

In [8]:
# K-fold splitter configured for 1000 splits; the cells below use it to
# group the indices of `all_files`.
kf = KFold(n_splits=1000)

In [36]:
# Keep the held-out index arrays of the first six folds only.
split_indices_list = []
count = 0
for i, (_, split_indices) in enumerate(kf.split(all_files)):
    count += 1
    if count >= 7:
        # Six folds collected; stop iterating over the remaining ones.
        break
    split_indices_list.append(split_indices)

In [37]:
# Step through the splitter and stop at the second fold, leaving `_` and
# `split_indices` bound to that fold for interactive inspection.
count = 0
for i, (_, split_indices) in enumerate(kf.split(all_files)):
    count += 1
    if count >= 2:
        break

In [38]:
# First file index of every collected fold. Indexing each fold separately
# (instead of stacking the folds into a 2-D array first) also works when the
# folds do not all have the same length.
np.array([fold[0] for fold in split_indices_list])

array([0, 1, 2, 3, 4, 5])

In [39]:
# Read the first file of each collected fold and stack the results, mirroring
# what process_and_save() does for a single fold.
concat_X = []
concat_y = []
concat_j = []
# Index each fold separately so folds of unequal length do not break the
# lookup (stacking ragged folds into one array fails on recent NumPy).
for idx in np.array([fold[0] for fold in split_indices_list]):
    file_path = all_files[idx]
    X, j, y = read_file(file_path)

    concat_X.append(X)
    concat_y.append(y)
    concat_j.append(j)

# Concatenate all X, y and j along the first axis
final_X = np.concatenate(concat_X, axis=0)
final_y = np.concatenate(concat_y, axis=0)
final_j = np.concatenate(concat_j, axis=0)

In [40]:
# Check which container type np.concatenate produced for the particle features
type(final_X)

numpy.ndarray

In [41]:
# 6 files were loaded, one per index in [0,1,2,3,4,5]
print(
    "13 particle features : ", final_X.shape, "\n",
    "4 jet features : ", final_j.shape, "\n",
    "10 labels : ", final_y.shape,
)

13 particle features :  (600000, 150, 13) 
 4 jet features :  (600000, 4) 
 10 labels :  (600000, 10)


In [46]:
# Preview the output file name process_and_save() builds for fold 0
name = "train"
i = 0
print(folder_path)
'{}/{}/JetClass_{}.h5'.format(folder_path, name, i)

/u/phebbar/Work/Datasets/JetClass/train_100M


'/u/phebbar/Work/Datasets/JetClass/train_100M/train/JetClass_0.h5'