In [214]:
import os
import shutil
import random

In [215]:
def make_dir(path):
    """Create the single directory ``path`` and print the outcome.

    Any ``OSError`` raised by ``os.mkdir`` (e.g. the directory already
    exists or its parent is missing) is caught and reported as "Failed"
    rather than propagated, so callers can invoke this repeatedly.
    """
    try:
        os.mkdir(path)
    except OSError:
        # Best-effort: report and carry on instead of aborting the cell.
        print(f'"{path}" Failed')
        return
    print(f'"{path}" Created')

In [216]:
def make_test_set(df_type, test_train_split=0.3, n_actors=62):
    """Move the videos of the first actors out of a dataset variant into a test folder.

    Videos are taken from ``./Celeb-DF-v2/Celeb-{df_type}`` and moved (not
    copied) into ``./Celeb-DF-v2/Celeb-{df_type}-test``, keeping the
    ``Celeb-real`` / ``Celeb-synthesis`` sub-folder layout. After the call the
    source folder only holds the remaining (training) videos.

    Args:
        df_type: Variant suffix of the source folder, e.g. ``'rnd'``.
        test_train_split: Fraction of actor ids assigned to the test set.
        n_actors: Size of the actor-id range the fraction is applied to.
            NOTE(review): the original inline comments mention 61 ids while
            the code used 62; the default keeps the code's value.

    Actor ids ``0 .. int(n_actors * test_train_split) - 1`` are selected.
    A real video is selected when its file name contains ``id<N>_``; a fake
    video when its name contains ``id<N>_i``. This is a substring test, not a
    prefix test, and is kept as-is because the exact file naming of the
    variants is not visible here.

    Files already present in the destination folder are skipped, so the
    function can be re-run safely.
    """
    source_dir = f'./Celeb-DF-v2/Celeb-{df_type}'
    source_real_dir = f'{source_dir}/Celeb-real'
    source_fake_dir = f'{source_dir}/Celeb-synthesis'

    dest_folder_name = f'Celeb-{df_type}-test'
    dest_dir = f'./Celeb-DF-v2/{dest_folder_name}'
    dest_real_dir = f'{dest_dir}/Celeb-real'
    dest_fake_dir = f'{dest_dir}/Celeb-synthesis'

    make_dir(dest_dir)
    make_dir(dest_real_dir)
    make_dir(dest_fake_dir)

    n_test_actors = int(n_actors * test_train_split)
    real_markers = [f'id{id_number}_' for id_number in range(n_test_actors)]
    fake_markers = [f'id{id_number}_i' for id_number in range(n_test_actors)]

    # Snapshot both source folders before anything is moved.
    real_selected = _select_videos(source_real_dir, real_markers)
    fake_selected = _select_videos(source_fake_dir, fake_markers)

    _move_videos(real_selected, source_real_dir, dest_real_dir)
    _move_videos(fake_selected, source_fake_dir, dest_fake_dir)


def _select_videos(source, markers):
    """Return the set of file names in ``source`` containing any of ``markers``."""
    return {
        name for name in os.listdir(source)
        if any(marker in name for marker in markers)
    }


def _move_videos(names, source, dest):
    """Move ``names`` from ``source`` into ``dest``, skipping files already there."""
    # List the destination once; names are unique, so the snapshot stays valid.
    already_moved = set(os.listdir(dest))
    for name in names:
        if name not in already_moved:
            shutil.move(f'{source}/{name}', dest)

In [217]:
# Carve the held-out test split out of every dataset variant.
# NOTE: this moves files on disk, so the source folders shrink after running.
for df in ('rnd', 'diff', 'avg'):
    make_test_set(df)

"./Celeb-DF-v2/Celeb-rnd-test" Created
"./Celeb-DF-v2/Celeb-rnd-test/Celeb-real" Created
"./Celeb-DF-v2/Celeb-rnd-test/Celeb-synthesis" Created
"./Celeb-DF-v2/Celeb-diff-test" Created
"./Celeb-DF-v2/Celeb-diff-test/Celeb-real" Created
"./Celeb-DF-v2/Celeb-diff-test/Celeb-synthesis" Created
"./Celeb-DF-v2/Celeb-avg-test" Created
"./Celeb-DF-v2/Celeb-avg-test/Celeb-real" Created
"./Celeb-DF-v2/Celeb-avg-test/Celeb-synthesis" Created


In [222]:
# Clone each dataset variant into a "-bal" working copy that a later cell
# prunes in place. Pure Python replaces the three pasted `mkdir` / `cp -r`
# pairs so the cell does not depend on a POSIX shell.
for df in ['rnd', 'diff', 'avg']:
    variant_dir = f'./Celeb-DF-v2/Celeb-{df}'
    variant_bal_dir = f'./Celeb-DF-v2/Celeb-{df}-bal'

    if os.path.isdir(variant_bal_dir):
        # copytree refuses to write into an existing directory; keep the
        # existing copy so re-running this cell is harmless.
        print(f'"{variant_bal_dir}" already exists, skipping copy')
        continue

    # Creates variant_bal_dir and copies the whole tree below variant_dir.
    shutil.copytree(variant_dir, variant_bal_dir)
    print(f'"{variant_dir}" copied to "{variant_bal_dir}"')

In [None]:
# Balance every "-bal" copy by deleting randomly chosen fake videos until the
# number of fakes equals the number of reals.
# WARNING: files are permanently removed from the "-bal" folders.
seed = 0
for df in ['rnd', 'diff', 'avg']:
    balance_dir = f'./Celeb-DF-v2/Celeb-{df}-bal'
    balance_real_dir = f'{balance_dir}/Celeb-real'
    balance_fake_dir = f'{balance_dir}/Celeb-synthesis'

    videos_in_real_bal = os.listdir(balance_real_dir)
    # os.listdir returns entries in arbitrary order, so sort first; otherwise
    # the seeded shuffle would pick different files on different machines.
    videos_in_fake_bal = sorted(os.listdir(balance_fake_dir))

    # A fresh generator per variant: every variant is shuffled with the same seed.
    random.Random(seed).shuffle(videos_in_fake_bal)

    # Keep the first len(real) fakes of the shuffled list and delete the rest.
    # If there are already fewer fakes than reals the slice is empty and
    # nothing is removed.
    for fake_video in videos_in_fake_bal[len(videos_in_real_bal):]:
        os.remove(f'{balance_fake_dir}/{fake_video}')

    # Sanity check: both counts should now match.
    print(len(os.listdir(balance_real_dir)), len(os.listdir(balance_fake_dir)))

422 422
422 422
