# Preprocess

In [18]:
import os
import random
import pandas as pd
from IPython.display import display
from tqdm import tqdm
from pydub import AudioSegment

In [19]:
def count_files(folder_path):
    count = 0
    for root, dirs, files in os.walk(folder_path):
        count += len(files)
    return count

def find_folders(folder_path,n):
    folders = []
    for root, dirs, files in os.walk(folder_path):
        for dir in dirs:
            dir_path = os.path.join(root, dir)
            file_count = count_files(dir_path)
            if file_count > n:
                folders.append((dir, file_count))
    return folders

def find_folders_condition(folder_path,n,condition):
    folders = []
    for root, dirs, files in os.walk(folder_path):
        for dir in (filt_dirs for filt_dirs in dirs if filt_dirs in condition.keys()):
            dir_path = os.path.join(root, dir)
            file_count = count_files(dir_path)
            if file_count > n:
                folders.append((dir,condition[dir], file_count))
    return folders

In [20]:
# Set the root folder path
root_folder_path = './clips/'

# Find folders with more than 2000 files
folders = find_folders(root_folder_path,n=2000)

# Print the dataframe
print(folders)
display(pd.DataFrame(folders, columns=['Folder Title', 'Number of Elements']))

# Print the total number of folders
print(f"Total number of folders: {len(folders)}")

[('sound', 2769), ('movement', 2109), ('trying', 2215), ('friend', 2816), ('everyone', 3270), ('our', 6522), ('set', 3291), ('hey', 2482), ('tried', 2005), ('group', 3763), ('into', 9439), ('first', 15360), ('career', 2110), ('south', 3741), ('father', 3492), ('boy', 25100), ('currently', 2459), ('less', 2278), ('home', 5877), ('does', 5425), ('named', 5044), ('city', 7818), ('people', 13426), ('line', 3153), ('heard', 3422), ('answered', 3114), ('looking', 3402), ('eyes', 2750), ('university', 3862), ('wife', 2374), ('children', 4306), ('popular', 2639), ('anything', 4460), ('with', 23238), ('far', 2577), ('due', 2244), ('station', 4076), ('hear', 2161), ('the', 17671), ('number', 3607), ('seemed', 3555), ('those', 5688), ('things', 5738), ('after', 15292), ('never', 11274), ('part', 6425), ('gold', 2861), ('some', 17713), ('team', 2778), ('please', 4445), ('written', 2942), ('and', 116658), ('since', 3480), ('working', 2108), ('form', 2608), ('under', 4461), ('fire', 3314), ('table',

Unnamed: 0,Folder Title,Number of Elements
0,sound,2769
1,movement,2109
2,trying,2215
3,friend,2816
4,everyone,3270
...,...,...
454,blue,2983
455,top,3086
456,like,16353
457,has,7314


Total number of folders: 459


In [21]:
phoneme_list={
    "party"    :  "/p/" , 
    "books"    :  "/b/" , 
    "trying"   :  "/t/" , 
    "different":  "/d/" , 
    "county"   :  "/k/" , 
    "game"     :  "/g/" ,
    "felt"     :  "/f/" , 
    "love"     :  "/v/" , 
    "thing"    :  "/θ/" ,  
    "then"     :  "/ð/" ,  
    "sun"      :  "/s/" , 
    "has"      :  "/z/" , 
    "sure"     :  "/ʃ/" ,  
    "treasure" : " /ʒ/" ,
    "church"   : "/tʃ/" ,
    "general"  : "/dʒ/" ,
    "who"      :  "/h/" ,  
    "man"      :  "/m/" ,  
    "need"     :  "/n/" , 
    "song"     :  "/ŋ/" ,  
    "live"     :  "/l/" ,  
    "run"      :  "/r/" ,  
    "why"      :  "/w/" ,  
    "you"      :  "/j/" ,  
    "people"   : "/iː/" , 
    "river"    :  "/ɪ/" ,  
    "end"      :  "/e/" ,  
    "friend"   :  "/ɛ/" ,
    "back"     :  "/æ/" , 
    "does"     :  "/ʌ/" ,  
    "about"    :  "/ə/" ,  
    "car"      : "/ɑː/" , 
    "door"     : "/ɔː/" , 
    "boy"      : "/ɔɪ/" , 
    "side"     : "/aɪ/" , 
    "now"      : "/aʊ/" , 
    "day"      : "/eɪ/" , 
    "here"     : "/ɪə/" , 
    "our"      : "/ʊə/" , 
    "service"  : "/ər/" ,  
    "words"    : "/ɜː/" ,  
    "open"     : "/oʊ/" , 
    "north"    :"/ɔːr/" ,
    "put"      :  "/ʊ/" 
}
random_words=[
    "tried",
    "hey",
    "career",
    "south",
    "please",
    "working",
    "building",
    "old",
    "around",
    "company",
    "himself",
    "language",
    "album",
    "family",
    "young",
    "returned",
    "important",
    "throughout",
    "understand",
    "include",
    "business",
    "daughter",
    "everything",
    "englishman",
    "between",
    "outside",
]

In [22]:
# Find folders with more than 2000 files
phoneme_folders = find_folders_condition(root_folder_path,n=1,condition=phoneme_list)

# Create the dataframe
df2 = pd.DataFrame(phoneme_folders, columns=['Folder Title','Phoneme', 'Number of Elements'])
display(df2)

Unnamed: 0,Folder Title,Phoneme,Number of Elements
0,trying,/t/,2215
1,friend,/ɛ/,2816
2,our,/ʊə/,6522
3,boy,/ɔɪ/,25100
4,does,/ʌ/,5425
5,people,/iː/,13426
6,different,/d/,3478
7,door,/ɔː/,2034
8,books,/b/,2423
9,north,/ɔːr/,3700


In [23]:
def copy_random_files(source_dir, dest_dir,dir_list, num_files_per_folder):
    if not os.path.exists(dest_dir):
        os.makedirs(dest_dir)
    
    for root, dirs, files in os.walk(source_dir):
        for folder in (filt_dirs for filt_dirs in dirs if filt_dirs in dir_list):
            folder_path = os.path.join(root, folder)
            files_in_folder = os.listdir(folder_path)
            random_files = random.sample(files_in_folder, num_files_per_folder)

            for file_name in tqdm(random_files,desc="Copying and downsampling \'"+folder+"\' files"):
                source_file_path = os.path.join(folder_path, file_name)
                if not os.path.exists(os.path.join(dest_dir,folder)):
                    os.makedirs(os.path.join(dest_dir,folder))
                dest_file_path = os.path.join(dest_dir,folder,file_name)
                audio = AudioSegment.from_file(source_file_path)
                audio = audio.set_frame_rate(16000)
                audio.export(dest_file_path[:-4]+"wav", format='wav')


    print(f"{num_files_per_folder} files in each of the {len(dir_list)} folders copied successfully!")

copy_random_files('./clips/', './subset/',list(phoneme_list.keys())+random_words, 2000)

Copying and downsampling 'trying' files: 100%|██████████| 2000/2000 [00:46<00:00, 43.02it/s]
Copying and downsampling 'friend' files: 100%|██████████| 2000/2000 [00:49<00:00, 40.68it/s]
Copying and downsampling 'our' files: 100%|██████████| 2000/2000 [00:43<00:00, 45.55it/s]
Copying and downsampling 'hey' files: 100%|██████████| 2000/2000 [00:49<00:00, 40.63it/s]
Copying and downsampling 'tried' files: 100%|██████████| 2000/2000 [00:44<00:00, 45.10it/s]
Copying and downsampling 'career' files: 100%|██████████| 2000/2000 [00:45<00:00, 43.79it/s]
Copying and downsampling 'south' files: 100%|██████████| 2000/2000 [00:45<00:00, 43.51it/s]
Copying and downsampling 'boy' files: 100%|██████████| 2000/2000 [00:43<00:00, 46.16it/s]
Copying and downsampling 'does' files: 100%|██████████| 2000/2000 [00:43<00:00, 46.03it/s]
Copying and downsampling 'people' files: 100%|██████████| 2000/2000 [00:43<00:00, 46.25it/s]
Copying and downsampling 'please' files: 100%|██████████| 2000/2000 [00:44<00:00, 4

2000 files in each of the 70 folders copied successfully!
