In [1]:
import os

# Absolute path of the "pool" directory located in the current working directory.
pool_path = os.path.join(os.getcwd(), "pool")

In [2]:
# Fail with a clear message when the pool is missing: os.walk silently yields
# nothing for a nonexistent directory, which would leave `filenames` undefined
# (or stale from an earlier run of this cell).
if not os.path.isdir(pool_path):
    raise FileNotFoundError("Pool directory not found: {}".format(pool_path))

# We need only the filenames at the top level of the pool, i.e. the first
# entry produced by os.walk. The default covers a directory that cannot be
# listed, for which os.walk yields nothing.
dirpath, dirnames, filenames = next(os.walk(pool_path), (pool_path, [], []))
# Directory listing order is platform dependent; sort it so that the seeded
# split performed later picks the same files on every machine.
filenames = sorted(filenames)
len(filenames)

260

# Instance Classes

In [3]:
import collections

# Count the files per instance class. The class key is the first two
# underscore-separated fields of the filename, followed by a trailing "_".
instance_classes = collections.Counter(
    "_".join(filename.split("_")[:2]) + "_" for filename in filenames
)
instance_classes, len(instance_classes)

(Counter({'1000_ep0.0075_': 10,
          '1000_ep0.0125_': 10,
          '1000_ep0.01_': 10,
          '1000_r0.058_': 10,
          '1000_r0.05_': 10,
          '1000_r0.065_': 10,
          '100_ep0.05_': 10,
          '100_ep0.075_': 10,
          '100_ep0.125_': 10,
          '100_ep0.1_': 10,
          '100_r0.169_': 10,
          '100_r0.195_': 10,
          '100_r0.219_': 10,
          '500_ep0.015_': 10,
          '500_ep0.025_': 10,
          '500_ep0.02_': 10,
          '500_r0.071_': 10,
          '500_r0.083_': 10,
          '500_r0.093_': 10,
          '50_ep0.15_': 10,
          '50_ep0.1_': 10,
          '50_ep0.25_': 10,
          '50_ep0.2_': 10,
          '50_r0.259_': 10,
          '50_r0.299_': 10,
          '50_r0.334_': 10}),
 26)

# Group files by class

In [4]:
# Map every instance class to the list of files that belong to it.
group = dict()
for clazz in sorted(instance_classes):
    # A class key is the leading part of a filename, so match it as a prefix.
    # A plain substring test would also accept, for example, a file named
    # "1100_ep0.05_x" for the class "100_ep0.05_".
    group[clazz] = [name for name in filenames if name.startswith(clazz)]
    print(" {:<15}: {}".format(clazz,len(group[clazz])))
len(group)

 1000_ep0.0075_ : 10
 1000_ep0.0125_ : 10
 1000_ep0.01_   : 10
 1000_r0.058_   : 10
 1000_r0.05_    : 10
 1000_r0.065_   : 10
 100_ep0.05_    : 10
 100_ep0.075_   : 10
 100_ep0.125_   : 10
 100_ep0.1_     : 10
 100_r0.169_    : 10
 100_r0.195_    : 10
 100_r0.219_    : 10
 500_ep0.015_   : 10
 500_ep0.025_   : 10
 500_ep0.02_    : 10
 500_r0.071_    : 10
 500_r0.083_    : 10
 500_r0.093_    : 10
 50_ep0.15_     : 10
 50_ep0.1_      : 10
 50_ep0.25_     : 10
 50_ep0.2_      : 10
 50_r0.259_     : 10
 50_r0.299_     : 10
 50_r0.334_     : 10


26

# Split into training and testing sets

Each `seed` value produces its own pair of training and testing folders, named with the zero-padded seed (for example `01_training` and `01_testing` for `seed = 1`).

In [5]:
# Tunable parameters of the split.
seed = 1                # seeds numpy's random generator and names the output folders
training_portion = 0.7  # fraction of each class that goes to the training set

In [6]:
import numpy

# Seed the global numpy generator so a given `seed` always yields the same split.
numpy.random.seed(seed)

training = list()
testing = list()

for clazz in sorted(group):
    # Sort the candidates first: numpy.random.choice selects by position and
    # the directory listing order is platform dependent, so without a fixed
    # order the same seed could select different files on another machine.
    candidates = sorted(group[clazz])
    random_for_training = numpy.random.choice(
        candidates,
        size=int(training_portion * len(candidates)),
        replace=False,
    )
    # Convert the numpy array to plain Python strings before storing them.
    picked = random_for_training.tolist()
    picked_lookup = set(picked)
    training.extend(picked)
    # Keep the remaining files in sorted order instead of iterating a set
    # difference, whose order can change from one interpreter run to the next.
    testing.extend(name for name in candidates if name not in picked_lookup)

# Sanity check: actual training share and the overlap between both sets
# (the overlap must be empty).
len(training)/len((training+testing)),set(training)&set(testing),set(testing)&set(training)

(0.7, set(), set())

# Create folders

In [7]:
import shutil

# Output folder names start with the seed, zero-padded to two digits.
training_folder_name = format(seed, "02") + "_training"
testing_folder_name = format(seed, "02") + "_testing"

In [8]:
def _copy_from_pool(folder_name, names):
    """Create `folder_name` in the working directory and fill it from the pool.

    Parameters:
        folder_name: name of the folder to create inside the current working
            directory.
        names: filenames (relative to `pool_path`) to copy into the folder.

    Returns:
        The absolute path of the folder.

    If the folder already exists nothing is copied and a notice is printed.
    If copying fails, the partially filled folder is removed before the error
    is re-raised, so that a later run does not mistake it for a complete one.
    """
    folder_path = os.path.join(os.getcwd(), folder_name)
    if os.path.exists(folder_path):
        print(folder_name, "already exists.")
        return folder_path

    os.mkdir(folder_path)
    try:
        for name in names:
            shutil.copy(
                os.path.join(pool_path, name),
                os.path.join(folder_path, name),
            )
    except BaseException:
        # The folder was created just above, so removing it is safe.
        shutil.rmtree(folder_path)
        raise
    print(folder_name, "created.")
    return folder_path


training_path = _copy_from_pool(training_folder_name, training)
testing_path = _copy_from_pool(testing_folder_name, testing)

01_training created.
01_testing created.
