In [52]:
import wget
import urllib.parse
import os
import requests
import tarfile
import warnings
import regex
import yaml

import pandas as pd

from tqdm import tqdm
from lxml import html

pd.set_option('display.max_colwidth', -1)
pd.set_option('display.max_columns', 500)

# Webscraping

## Get URLS

In [12]:
def webscrapping_tar(page="http://www.repository.voxforge1.org/downloads/SpeechCorpus/Trunk/Audio/Main/16kHz_16bit/"):
    """Web scraping
    1. Get url to all href elements
    2. Check if it is a .tgz file
    3  Return list with the urls
    """
    page = requests.get(page)
    webpage = html.fromstring(page.content)
    files=[]
    for element in webpage.xpath('//a/@href'):
        if ".tgz" in element:
            files.append(element)
    return files
files = webscrapping_tar()

In [13]:
print("The number of files is {}".format(len(files)))

The number of files is 6244


In [17]:
# create the data folder
#! mkdir data

# Download files

In [19]:
def download_tar(save_in_folder="data",list_urls=None,base_url="http://www.repository.voxforge1.org/downloads/SpeechCorpus/Trunk/Audio/Main/16kHz_16bit/"):
    """ Function to download files
    1. Gets list with the URLS
    2. Downloads all the files
    3. Stores the .tgz files in the folder data
    """
    for url in tqdm(list_urls):
        url_file = url
        url=urllib.parse.urljoin(base_url, url_file)
        path_file = os.path.join("data",url_file)
        wget.download(url, path_file)
    
download_tar(list_urls=files)

100%|██████████| 6244/6244 [1:59:24<00:00,  1.15s/it]


## Unzip files

In [20]:
def extract_file(fname=None, destination="extracted_data/"):
    """ Function to unzip a folder
    1. Get folder name and checks type
    2. Returns warning if the format is not supported
    3. The unzipe data is passed to the folder extracted_data
    """
    if (fname.endswith("tar.gz") or fname.endswith("tgz")):
        tar = tarfile.open(fname, "r:gz")
        tar.extractall(path=destination)
        tar.close()
    elif (fname.endswith("tar")):
        # dont use in this case but keept it
        tar = tarfile.open(fname, "r:")
        tar.extractall(path=destination)
        tar.close()
    else:
        warnings.warn("The file was not extracted, problem with file format for file {}".format(fname))
#extract_file("data/1028-20100710-hne.tgz")

In [21]:
def extract_all_files(folder="data"):
    """ Function that loops over all folder in a folder and unzips them
    1. Checks for folder in a folder
    2. Unzips all the folders
    """
    dirs= os.listdir(folder)
    for tar in tqdm(dirs):
        path = os.path.join(folder,tar)
        extract_file(path)
extract_all_files()

100%|██████████| 6244/6244 [05:37<00:00, 18.52it/s]


## Validate the number of files

In [85]:
!(cd extracted_data && find . -type f -printf '%f\n' | sed -r -n 's/.+(\..*)$/\1/p' | sort | uniq -c | sort -bn)


      1 .prompt
      1 .prompts
      1 .wav-bak
      1 .wav-omit
      1 .wav-skip
      2 .text
      2 .wav-split
      7 .wav-original
   5704 .txt
   8825 .flac
  86656 .wav


Based upon the state information in the assignment it was assumed to be 62,440 audio samples. However there are at least more recodrings than that avilable. Further investigateion is needed later on. 

# Extract the text

In [39]:

def extract_text_info(path):
    """ Function to extract the infomration from the text files
    1. Opens text file
    2. Using regex(fuzzy) checks for the features of interest: gender,age_group,dialect,language
    3. Returns the extracted data as a dict
    """
    text_file = open(path, "r")
    lines = text_file.readlines()
    regex_expressions = yaml.load(open('extract.yaml'))
    respons=dict()
    # If I cant find it in the text i want to return None for the key
    respons['Gender']=None
    respons['Age_Range']=None
    respons['Language']=None
    respons['Dialect']=None
    for field, value in regex_expressions.items():
        for line in lines: 
            match=regex.search(regex_expressions.get(field).get('key'),line)
            if match:      
                respons[field]=match.group(1)
    return respons
#path = "/Users/Niklas/Documents/test/extracted_data/1028-20100710-hne/etc/README"
#extract_text_info(path)

# Extract info from all the unziped files


In [43]:
def label_and_path(base_folder = "extracted_data",output_file="data.csv"):
    """ Function that loops over all the unziped folders and extracts the information
    1. Gets the folder where the extracted data is
    2. Loops over all the folders in the folder
    3. Foreach of the folders the information is extracted also the path to the sound files
    4. Stores the information in a pandas dataframe that is then saved as a csv
    """
    dirs= os.listdir(base_folder)
    df = pd.DataFrame()
    paths = []
    age_range = []
    language = []
    dialect = []
    gender = []

    for folder in tqdm(dirs):
        if not folder.endswith(".DS_Store"):
            path = os.path.join(base_folder,folder)
            path = os.path.join(os.getcwd(),path)
            complete_path = os.path.join(path,"etc/README")
            if os.path.isfile(complete_path):
                ans = extract_text_info(complete_path)
                sound_path = os.path.join(path,"wav")
                if os.path.isdir(sound_path):
                    for file in os.listdir(sound_path):
                        sound_file_path = os.path.join(sound_path,file)
                        paths.append(sound_file_path)
                        age_range.append(ans['Age_Range'])
                        language.append(ans['Language'])
                        dialect.append(ans['Dialect'])
                        gender.append(ans['Gender'])
                        # print(path)
                        # I know here that i should go into wav and add the sound files 
                        # I also know that i should go into etc to get the information about the files
                        # DO the magic here
                else:
                    print("Missing sound files for {}".format(sound_path))
            else:
                print("Missing label for file {}".format(complete_path))
    df['path'] = paths
    df['gender'] = gender
    df['language'] = language
    df['age_range'] = age_range
    df['dialect'] = dialect
    df.to_csv(output_file)
label_and_path()

  1%|▏         | 86/6244 [00:00<00:14, 423.64it/s]

Missing sound files for /home/niklas_sven_hansson/test/extracted_data/ralfherzog-20080203-en51/wav
Missing sound files for /home/niklas_sven_hansson/test/extracted_data/bjb-20120204/wav
Missing sound files for /home/niklas_sven_hansson/test/extracted_data/ralfherzog-20080312-en59/wav


  3%|▎         | 218/6244 [00:00<00:14, 429.25it/s]

Missing sound files for /home/niklas_sven_hansson/test/extracted_data/ralfherzog-20070923-en8/wav
Missing label for file /home/niklas_sven_hansson/test/extracted_data/jaiger-12032006-6/etc/README


  5%|▌         | 342/6244 [00:00<00:14, 420.76it/s]

Missing sound files for /home/niklas_sven_hansson/test/extracted_data/ralfherzog-20071120-en18/wav
Missing sound files for /home/niklas_sven_hansson/test/extracted_data/ralfherzog-20071201-en27/wav


  7%|▋         | 427/6244 [00:01<00:13, 420.82it/s]

Missing sound files for /home/niklas_sven_hansson/test/extracted_data/ralfherzog-20071215-en39/wav
Missing sound files for /home/niklas_sven_hansson/test/extracted_data/bjb-20120318/wav


 11%|█         | 689/6244 [00:01<00:13, 424.46it/s]

Missing sound files for /home/niklas_sven_hansson/test/extracted_data/propagationofsound-20100111-cc/wav
Missing sound files for /home/niklas_sven_hansson/test/extracted_data/ralfherzog-20071118-en17/wav
Missing sound files for /home/niklas_sven_hansson/test/extracted_data/devin122-20080522-ar/wav


 13%|█▎        | 821/6244 [00:01<00:12, 424.67it/s]

Missing sound files for /home/niklas_sven_hansson/test/extracted_data/bjb-20120309/wav
Missing sound files for /home/niklas_sven_hansson/test/extracted_data/bjb-20120205/wav
Missing sound files for /home/niklas_sven_hansson/test/extracted_data/ralfherzog-20071208-en33/wav
Missing sound files for /home/niklas_sven_hansson/test/extracted_data/ralfherzog-20080108-en43/wav


 15%|█▌        | 953/6244 [00:02<00:12, 425.70it/s]

Missing sound files for /home/niklas_sven_hansson/test/extracted_data/ralfherzog-20070914-en5/wav
Missing sound files for /home/niklas_sven_hansson/test/extracted_data/propagationofsound-20100112-ar/wav
Missing sound files for /home/niklas_sven_hansson/test/extracted_data/ralfherzog-20070823_vf20/wav
Missing sound files for /home/niklas_sven_hansson/test/extracted_data/ralfherog-20070918-en7/wav


 17%|█▋        | 1041/6244 [00:02<00:12, 426.18it/s]

Missing sound files for /home/niklas_sven_hansson/test/extracted_data/ralfherzog-20080308-en55/wav
Missing sound files for /home/niklas_sven_hansson/test/extracted_data/ralfherzog-20071126-en22/wav
Missing sound files for /home/niklas_sven_hansson/test/extracted_data/ralfherzog-20071207-en32/wav
Missing sound files for /home/niklas_sven_hansson/test/extracted_data/LunaTick-20080410-vf6/wav
Missing sound files for /home/niklas_sven_hansson/test/extracted_data/ralfherzog-20071206-en31/wav


 18%|█▊        | 1129/6244 [00:02<00:11, 426.45it/s]

Missing sound files for /home/niklas_sven_hansson/test/extracted_data/bjb-20120226/wav
Missing sound files for /home/niklas_sven_hansson/test/extracted_data/ralfherzog-20080111-en46/wav
Missing sound files for /home/niklas_sven_hansson/test/extracted_data/ralfherzog-20071126-en25/wav
Missing sound files for /home/niklas_sven_hansson/test/extracted_data/ralfherzog-20070826_vf23/wav


 20%|██        | 1262/6244 [00:02<00:11, 427.06it/s]

Missing sound files for /home/niklas_sven_hansson/test/extracted_data/ralfherzog-20071210-en35/wav
Missing sound files for /home/niklas_sven_hansson/test/extracted_data/bjb-20120409-2/wav
Missing label for file /home/niklas_sven_hansson/test/extracted_data/cmu_com_kal_ldom/etc/README
Missing sound files for /home/niklas_sven_hansson/test/extracted_data/ralfherzog-20070913-vf27/wav


 22%|██▏       | 1395/6244 [00:03<00:11, 427.93it/s]

Missing sound files for /home/niklas_sven_hansson/test/extracted_data/bjb-20120304/wav
Missing sound files for /home/niklas_sven_hansson/test/extracted_data/bjb-20120210/wav


 24%|██▍       | 1526/6244 [00:03<00:11, 427.43it/s]

Missing sound files for /home/niklas_sven_hansson/test/extracted_data/bjb-20120127/wav
Missing sound files for /home/niklas_sven_hansson/test/extracted_data/ralfherzog-20080313-en60/wav
Missing sound files for /home/niklas_sven_hansson/test/extracted_data/ralfherzog-20080315-en61/wav


 27%|██▋       | 1698/6244 [00:03<00:10, 427.13it/s]

Missing sound files for /home/niklas_sven_hansson/test/extracted_data/bjb-20120219/wav
Missing sound files for /home/niklas_sven_hansson/test/extracted_data/ralfherzog-20071212-en37/wav
Missing sound files for /home/niklas_sven_hansson/test/extracted_data/ralfherzog-20071206-en30/wav


 30%|██▉       | 1873/6244 [00:04<00:10, 427.44it/s]

Missing sound files for /home/niklas_sven_hansson/test/extracted_data/ralfherzog-20071124-en20/wav
Missing label for file /home/niklas_sven_hansson/test/extracted_data/cmu_us_slt_arctic/etc/README
Missing sound files for /home/niklas_sven_hansson/test/extracted_data/bjb-20120324/wav


 34%|███▍      | 2134/6244 [00:04<00:09, 427.05it/s]

Missing sound files for /home/niklas_sven_hansson/test/extracted_data/bjb-20120225/wav
Missing sound files for /home/niklas_sven_hansson/test/extracted_data/LunaTick-20080329-vf2/wav
Missing sound files for /home/niklas_sven_hansson/test/extracted_data/ralfherzog-20071217-en41/wav


 36%|███▌      | 2220/6244 [00:05<00:09, 425.79it/s]

Missing sound files for /home/niklas_sven_hansson/test/extracted_data/benkay-20090111-ar/wav
Missing sound files for /home/niklas_sven_hansson/test/extracted_data/bjb-20120311/wav
Missing sound files for /home/niklas_sven_hansson/test/extracted_data/ralfherzog-20071028-en15/wav
Missing sound files for /home/niklas_sven_hansson/test/extracted_data/bjb-20120122/wav


 38%|███▊      | 2392/6244 [00:05<00:09, 425.63it/s]

Missing sound files for /home/niklas_sven_hansson/test/extracted_data/bjb-20120330/wav
Missing sound files for /home/niklas_sven_hansson/test/extracted_data/propagationofsound-20100112-rp/wav


 40%|████      | 2521/6244 [00:05<00:08, 425.50it/s]

Missing sound files for /home/niklas_sven_hansson/test/extracted_data/LunaTick-20080402-vf3/wav
Missing label for file /home/niklas_sven_hansson/test/extracted_data/jaiger-12032006-5/etc/README
Missing sound files for /home/niklas_sven_hansson/test/extracted_data/bjb-20120302/wav
Missing sound files for /home/niklas_sven_hansson/test/extracted_data/ralfherzog-20071010-en12/wav


 42%|████▏     | 2606/6244 [00:06<00:08, 425.10it/s]

Missing sound files for /home/niklas_sven_hansson/test/extracted_data/ralfherzog-20071125-en21/wav
Missing sound files for /home/niklas_sven_hansson/test/extracted_data/ralfherzog-20080110-en45/wav
Missing sound files for /home/niklas_sven_hansson/test/extracted_data/ralfherzog-20071126-en23/wav
Missing sound files for /home/niklas_sven_hansson/test/extracted_data/ralfherzog-20080113-en48/wav


 44%|████▍     | 2735/6244 [00:06<00:08, 425.09it/s]

Missing sound files for /home/niklas_sven_hansson/test/extracted_data/ralfherzog-20080310-en57/wav


 45%|████▌     | 2823/6244 [00:06<00:08, 425.30it/s]

Missing sound files for /home/niklas_sven_hansson/test/extracted_data/ralfherzog-20070824_vf21/wav


 47%|████▋     | 2953/6244 [00:06<00:07, 425.42it/s]

Missing sound files for /home/niklas_sven_hansson/test/extracted_data/LunaTick-20080410-vf4/wav
Missing sound files for /home/niklas_sven_hansson/test/extracted_data/ralfherzog-20071007-en11/wav


 49%|████▉     | 3084/6244 [00:07<00:07, 425.57it/s]

Missing sound files for /home/niklas_sven_hansson/test/extracted_data/ralfherzog-20070911-en4/wav
Missing sound files for /home/niklas_sven_hansson/test/extracted_data/ralfherzog-20071014-en14/wav
Missing sound files for /home/niklas_sven_hansson/test/extracted_data/bjb-20120317/wav
Missing sound files for /home/niklas_sven_hansson/test/extracted_data/ralfherzog-20080109-en44/wav


 51%|█████     | 3171/6244 [00:07<00:07, 425.00it/s]

Missing sound files for /home/niklas_sven_hansson/test/extracted_data/bjb-20120217/wav
Missing sound files for /home/niklas_sven_hansson/test/extracted_data/ralfherzog-20071216-en40/wav
Missing sound files for /home/niklas_sven_hansson/test/extracted_data/bjb-20120323/wav
Missing sound files for /home/niklas_sven_hansson/test/extracted_data/ralfherzog-20080112-en47/wav


 53%|█████▎    | 3303/6244 [00:07<00:06, 425.01it/s]

Missing sound files for /home/niklas_sven_hansson/test/extracted_data/ralfherzog-20080204-en54/wav


 54%|█████▍    | 3390/6244 [00:07<00:06, 425.10it/s]

Missing sound files for /home/niklas_sven_hansson/test/extracted_data/ralfherzog-20071013-en13/wav
Missing sound files for /home/niklas_sven_hansson/test/extracted_data/ralfherzog-20070924-en9/wav


 57%|█████▋    | 3569/6244 [00:08<00:06, 425.82it/s]

Missing sound files for /home/niklas_sven_hansson/test/extracted_data/jooles-20080826/wav


 59%|█████▊    | 3656/6244 [00:08<00:06, 425.63it/s]

Missing sound files for /home/niklas_sven_hansson/test/extracted_data/ralfherzog-20080316-en62/wav
Missing sound files for /home/niklas_sven_hansson/test/extracted_data/ralfherzog-20070825_vf22/wav
Missing sound files for /home/niklas_sven_hansson/test/extracted_data/ralfherzog-20070910-en3/wav
Missing sound files for /home/niklas_sven_hansson/test/extracted_data/benkay-20090111-rp/wav


 63%|██████▎   | 3918/6244 [00:09<00:05, 424.77it/s]

Missing sound files for /home/niklas_sven_hansson/test/extracted_data/bjb-20120212/wav
Missing sound files for /home/niklas_sven_hansson/test/extracted_data/ralfherzog-20080204-en53/wav
Missing sound files for /home/niklas_sven_hansson/test/extracted_data/benkay-20090111-cc/wav
Missing sound files for /home/niklas_sven_hansson/test/extracted_data/ralfherzog-20071107-en16/wav


 68%|██████▊   | 4227/6244 [00:09<00:04, 425.59it/s]

Missing sound files for /home/niklas_sven_hansson/test/extracted_data/ralfherzog-20080311-en58/wav
Missing sound files for /home/niklas_sven_hansson/test/extracted_data/ralfherzog-20071205-en29/wav


 69%|██████▉   | 4316/6244 [00:10<00:04, 425.53it/s]

Missing sound files for /home/niklas_sven_hansson/test/extracted_data/bjb-20120218/wav
Missing sound files for /home/niklas_sven_hansson/test/extracted_data/ralfherzog-20071205-en28/wav


 71%|███████▏  | 4449/6244 [00:10<00:04, 425.87it/s]

Missing sound files for /home/niklas_sven_hansson/test/extracted_data/ralfherzog-20071001-en10/wav
Missing sound files for /home/niklas_sven_hansson/test/extracted_data/ralfherzog-20080204-en52/wav
Missing sound files for /home/niklas_sven_hansson/test/extracted_data/ralfherzog-20071211-en36/wav
Missing sound files for /home/niklas_sven_hansson/test/extracted_data/ralfherzog-20080317-en63/wav


 73%|███████▎  | 4581/6244 [00:10<00:03, 416.57it/s]

Missing sound files for /home/niklas_sven_hansson/test/extracted_data/bjb-20120331/wav
Missing sound files for /home/niklas_sven_hansson/test/extracted_data/bjb-20120211/wav
Missing sound files for /home/niklas_sven_hansson/test/extracted_data/ralfherzog-20071126-en26/wav


 75%|███████▍  | 4669/6244 [00:11<00:03, 416.90it/s]

Missing label for file /home/niklas_sven_hansson/test/extracted_data/crhylove-10252006/etc/README


 79%|███████▉  | 4937/6244 [00:11<00:03, 418.14it/s]

Missing sound files for /home/niklas_sven_hansson/test/extracted_data/ralfherzog-20070827_vf24/wav
Missing sound files for /home/niklas_sven_hansson/test/extracted_data/ralfherzog-20070903-en2/wav


 81%|████████  | 5069/6244 [00:12<00:02, 418.26it/s]

Missing label for file /home/niklas_sven_hansson/test/extracted_data/jaiger-12032006-4/etc/README
Missing sound files for /home/niklas_sven_hansson/test/extracted_data/LunaTick-20080329-vf1/wav


 85%|████████▌ | 5337/6244 [00:12<00:02, 419.28it/s]

Missing sound files for /home/niklas_sven_hansson/test/extracted_data/LunaTick-20080410-vf10/wav
Missing sound files for /home/niklas_sven_hansson/test/extracted_data/ralfherzog-20071209-en34/wav
Missing sound files for /home/niklas_sven_hansson/test/extracted_data/ralfherzog-20070822_vf19/wav


 88%|████████▊ | 5519/6244 [00:13<00:01, 420.16it/s]

Missing sound files for /home/niklas_sven_hansson/test/extracted_data/ralfherzog-20071123-en19/wav
Missing label for file /home/niklas_sven_hansson/test/extracted_data/jaiger-12032006-3/etc/README
Missing sound files for /home/niklas_sven_hansson/test/extracted_data/bjb-20120325/wav


 91%|█████████ | 5652/6244 [00:13<00:01, 420.37it/s]

Missing sound files for /home/niklas_sven_hansson/test/extracted_data/LunaTick-20080410-vf5/wav
Missing label for file /home/niklas_sven_hansson/test/extracted_data/cmu_us_bdl_arctic/etc/README
Missing sound files for /home/niklas_sven_hansson/test/extracted_data/LunaTick-20080410-vf7/wav
Missing sound files for /home/niklas_sven_hansson/test/extracted_data/ralfherzog-20070912-vf26/wav


 92%|█████████▏| 5741/6244 [00:13<00:01, 420.72it/s]

Missing sound files for /home/niklas_sven_hansson/test/extracted_data/ralfherzog-20070917-en6/wav
Missing sound files for /home/niklas_sven_hansson/test/extracted_data/ralfherzog-20080203-en49/wav


 94%|█████████▍| 5877/6244 [00:13<00:00, 421.18it/s]

Missing sound files for /home/niklas_sven_hansson/test/extracted_data/ralfherzog-20071218-en42/wav


 96%|█████████▋| 6010/6244 [00:14<00:00, 421.38it/s]

Missing sound files for /home/niklas_sven_hansson/test/extracted_data/bjb-20120124/wav
Missing sound files for /home/niklas_sven_hansson/test/extracted_data/ralfherzog-20080309-en56/wav
Missing sound files for /home/niklas_sven_hansson/test/extracted_data/bjb-20120409/wav
Missing sound files for /home/niklas_sven_hansson/test/extracted_data/ralfherzog-20080203-en50/wav
Missing sound files for /home/niklas_sven_hansson/test/extracted_data/ralfherzog-20071214-en38/wav
Missing sound files for /home/niklas_sven_hansson/test/extracted_data/ralfherzog-20070831-en1/wav


100%|█████████▉| 6233/6244 [00:14<00:00, 422.03it/s]

Missing sound files for /home/niklas_sven_hansson/test/extracted_data/ralfherzog-20071126-en24/wav
Missing sound files for /home/niklas_sven_hansson/test/extracted_data/bjb-20120310/wav
Missing sound files for /home/niklas_sven_hansson/test/extracted_data/ralfherzog-20070830_vf25/wav


100%|██████████| 6244/6244 [00:14<00:00, 421.95it/s]


In [45]:
df = pd.read_csv("data.csv",index_col=0)
df.shape

(82893, 5)

In [53]:
df

Unnamed: 0,path,gender,language,age_range,dialect
0,/home/niklas_sven_hansson/test/extracted_data/phr0663r-20090922-jqj/wav/a0492.wav,Male,EN,Adult,British English
1,/home/niklas_sven_hansson/test/extracted_data/phr0663r-20090922-jqj/wav/a0493.wav,Male,EN,Adult,British English
2,/home/niklas_sven_hansson/test/extracted_data/phr0663r-20090922-jqj/wav/a0487.wav,Male,EN,Adult,British English
3,/home/niklas_sven_hansson/test/extracted_data/phr0663r-20090922-jqj/wav/a0488.wav,Male,EN,Adult,British English
4,/home/niklas_sven_hansson/test/extracted_data/phr0663r-20090922-jqj/wav/a0495.wav,Male,EN,Adult,British English
5,/home/niklas_sven_hansson/test/extracted_data/phr0663r-20090922-jqj/wav/a0490.wav,Male,EN,Adult,British English
6,/home/niklas_sven_hansson/test/extracted_data/phr0663r-20090922-jqj/wav/a0496.wav,Male,EN,Adult,British English
7,/home/niklas_sven_hansson/test/extracted_data/phr0663r-20090922-jqj/wav/a0494.wav,Male,EN,Adult,British English
8,/home/niklas_sven_hansson/test/extracted_data/phr0663r-20090922-jqj/wav/a0489.wav,Male,EN,Adult,British English
9,/home/niklas_sven_hansson/test/extracted_data/phr0663r-20090922-jqj/wav/a0491.wav,Male,EN,Adult,British English


## Check the file information

In [47]:
# Observations where the language is stated to be english
df[df["language"]=="EN"].shape

(59027, 5)

In [49]:
# Count of the unique values of english
df['language'].value_counts()

EN    59027
Name: language, dtype: int64

In [75]:
# Sum of all dialects
df['dialect'].value_counts()

American English      32187
British English       9167 
European English      8320 
Other                 4203 
Canadian English      3128 
Indian English        2645 
Australian English    2360 
General American      1822 
New Zealand           819  
South African         783  
other                 628  
New York              551  
Irish English         329  
unknown               173  
Please Select         158  
Northern Irish        155  
Mid                   150  
California            94   
British Received      70   
West                  64   
european english      49   
English English       39   
Western               31   
European              24   
non                   10   
Westdeutschland       10   
Asian accent          10   
America English       10   
Español España        10   
Name: dialect, dtype: int64

In [67]:
df["gender"].value_counts()

Male         52760
male         6837 
Female       3824 
unknown      193  
Please       189  
female       157  
adult        10   
make         10   
Masculino    10   
Weiblich     10   
Name: gender, dtype: int64

In [70]:
sum(df["gender"].value_counts())-189-193

63618

## Only select the once that actually have a label of interest

In [86]:
test = df[((df["gender"] =='Male' )|(df["gender"]=='male')|
          (df["gender"]=='Female')|(df["gender"]=='female'))  & (df["language"]=="EN")]
test.shape

(58615, 5)

## Validat the paths

In [65]:
def test_files_exsist(df=None):
    for file in tqdm(df['path']):
        if not os.path.isfile(file):
            warning.warn("This path is wrong {}".format(file))

def test_files_unique():
    if not df.shape[0]==len(df['path'].unique()):
        print("Some rows are duplicates")
test_files_exsist(df)
test_files_unique()

100%|██████████| 82893/82893 [00:00<00:00, 226527.18it/s]


data	  extracted_data  ouput.csv	     sound_feature.R
data.csv  extract.yaml	  sound_feature_2.R  Untitled.ipynb
