# Creating two txt files, one for the training - validation dataset and one for the test dataset.

The final outputs are two txt files: **'train_files.txt'** and **'test_files.txt'**. In each file, each line corresponds to an event: plane 0, plane 1, plane 2, label (0 or 1). Plane 0, 1, and 2 are the npz filenames needed to create the images. The elements are separated by ', '.  
The structure is as follows: filename_plane0.extracted.npz, filename_plane1.extracted.npz, filename_plane2.extracted.npz, class

This notebook works with npz files organized by plane and then by class, e.g., path_to_npz/0/ATMO.

The two inputs required from the user are: the main directory where these files are located and the fraction of events (e.g. 0.7) assigned to the training set; the remaining events go into the test set.

In [None]:
import os
import random

In [None]:
# Root directory of the npz files; the later cells look for the sub-directories
# <npz_dir>/<plane>/ATMO and <npz_dir>/<plane>/PDK for planes 0, 1 and 2.
npz_dir = input("Please, write the path to npz files.")

In [None]:
# Fraction of the events assigned to the training set. The answer is kept as the
# raw string typed by the user and converted to a float when the split is computed.
train_set_size = input("Please, specify the size of the training set. Eg. if answer is 0.7 => 70% of the files will go into the training set and 30% of the files will go into the testing set\n")

In [None]:
print('Creating lists of filenames and labels from the directories')

# Containers for the event filename tuples and for their class labels
files = []
labels = []

# Directories for AtmoNu (class 0) and PDK (class 1), one per plane (0, 1, 2)
class_0_dirs = [f"{npz_dir}/{plane}/ATMO" for plane in range(3)]
class_1_dirs = [f"{npz_dir}/{plane}/PDK" for plane in range(3)]

# Function to collect event tuples and their labels from multiple directories
def event_in_tuple(directories, label):
    """Build the list of events stored in three per-plane directories.

    The filenames of each directory are sorted and paired by position, so the
    i-th file of every plane is taken to belong to the same event.

    Fresh lists are created on every call: results of different calls never
    share state, so the events of one class cannot leak into another class.

    Args:
        directories: Sequence with the three directory paths, in the order
            plane 0, plane 1, plane 2.
        label: Class label assigned to every event found in these directories.

    Returns:
        A ``(files, labels)`` pair. ``files`` is a list of tuples, one tuple
        per event, each holding the three filenames (plane 0, 1, 2) of that
        event. ``labels`` contains ``label`` once per event.

    Raises:
        ValueError: If the three directories do not contain the same number
            of entries. Pairing by position would then silently drop or
            mis-pair events.
    """
    plane0_names = sorted(os.listdir(directories[0]))
    plane1_names = sorted(os.listdir(directories[1]))
    plane2_names = sorted(os.listdir(directories[2]))

    # zip() stops at the shortest input, so a missing or stray file in one
    # plane would go unnoticed and shift every following event: fail loudly.
    if not len(plane0_names) == len(plane1_names) == len(plane2_names):
        raise ValueError(
            "The plane directories contain a different number of files: "
            f"{len(plane0_names)}, {len(plane1_names)}, {len(plane2_names)} "
            f"(directories: {list(directories)})"
        )

    # One tuple per event; each tuple has three elements because each event has three planes.
    events = list(zip(plane0_names, plane1_names, plane2_names))
    event_labels = [label] * len(events)
    return events, event_labels

# Get the event filename tuples and the labels for each class:
# AtmoNu is class 0, PDK is class 1
files0, labels0 = event_in_tuple(class_0_dirs, 0)
files1, labels1 = event_in_tuple(class_1_dirs, 1)

In [None]:
print('Splitting and shuffling the files')

# Convert the user's answer to a float: the fraction of each class that goes to training
train_size = float(train_set_size)

# Reject fractions outside [0, 1]. Without this check a value such as 70
# (instead of 0.7) silently leaves the test set empty, and a negative value
# splits the lists from the wrong end.
if not 0.0 <= train_size <= 1.0:
    raise ValueError(
        f"The training set size must be between 0 and 1, got {train_set_size!r}"
    )

# Uncomment to set a random seed and make the shuffling reproducible
# random.seed(667)

# Pair every ATMO (class 0) and PDK (class 1) file tuple with its label.
# The label filter guarantees that each list holds a single class,
# whatever the input lists contain.
atmo_files = [(event, label) for event, label in zip(files0, labels0) if label == 0]
pdk_files = [(event, label) for event, label in zip(files1, labels1) if label == 1]

# Shuffle the file lists to randomize the order
random.shuffle(atmo_files)
random.shuffle(pdk_files)

# Calculate the number of ATMO and PDK files to include in the training set.
# The split is done per class, so both sets keep the original class proportions.
train_atmo = int(len(atmo_files) * train_size)
train_pdk = int(len(pdk_files) * train_size)

# Split the files into training and testing sets
all_train_files = atmo_files[:train_atmo] + pdk_files[:train_pdk]
all_test_files = atmo_files[train_atmo:] + pdk_files[train_pdk:]

# Shuffle the final training and testing sets so the two classes are interleaved
random.shuffle(all_train_files)   # With both AtmoNu and PDK
random.shuffle(all_test_files)    # With both AtmoNu and PDK

In [None]:
# Progress message for the step that writes the txt files
print('Creating the txt files')

# Function to save the list of files and labels to a text file
def save_file_list(files, txt_filename):
    """Write one text line per event to ``txt_filename``.

    Args:
        files: Iterable of ``(event_tuple, label)`` pairs, where
            ``event_tuple`` holds the filenames (strings) of one event.
        txt_filename: Path of the text file to create; an existing file is
            overwritten.

    Each line has the filenames followed by the label, all separated by
    ', ' (no parentheses or quotes), and ends with a newline.
    """
    with open(txt_filename, 'w') as out:
        out.writelines(
            f"{', '.join(planes)}, {label}\n" for planes, label in files
        )

# Save the training and testing file lists to text files
# (written to the current working directory, overwriting existing files)
save_file_list(all_train_files, 'train_files.txt')
save_file_list(all_test_files, 'test_files.txt')

# Final status messages
print('Files saved')
print('Execution completed')