The goal of this file is:
* to download a CSV audio file locally (excerpt of Wiktionary data)
* to download audio samples from Wikimedia Commons
* to write each audio (wav) file locally in an *audio* dir
* to convert it into an MFCC file and also save it locally in an *mfcc* dir

Around 80200 wav files should be downloaded. Retrieving all the files should take approximately 33 hours, at a rate of approximately 2600 audio files downloaded per hour.

The needed disk space is approximately 13G
* 10G for the *audio* dir
* 3G for the *mfcc* dir

In [None]:
from datetime import datetime
import os
import re
import requests
import time
import sys

import librosa
import librosa.display
import numpy as np
import pandas as pd
import urllib.parse
import wget

* C.f.
https://commons.wikimedia.org/wiki/Commons:Village_pump/Technical#Best_way_to_download_a_batch_of_files
* Therefore, in the CONFIG, one can set *sleep_time = 0 sec*

In [None]:
CONFIG = {
    'csv_server': 'https://fonetik.fr',
    'dictionary_source_file': 'fr_wiktionary_excerpt.csv',    
    'max_samples': 85000,
    'n_max_phonemes': 20,
    'root_dir': '.',
    'sleep_time': 0, # interval in seconds between two .WAV downloads
    'speakers': [],
    'time_max': 2.0,    
    'mfcc': 40,
}

CONFIG['audio_dir'] = CONFIG['root_dir'] + '/' + 'audio'
config = CONFIG

In [None]:
def get_dataframe(config):
    """Load the dictionary excerpt, downloading it first if needed.

    The file named ``config['dictionary_source_file']`` is looked up in the
    current working directory. When it is absent it is fetched from
    ``config['csv_server']`` and stored under that name, so later calls
    reuse the local copy.

    Args:
        config: mapping providing the keys ``'csv_server'`` and
            ``'dictionary_source_file'``.

    Returns:
        A DataFrame parsed from the tab-separated file (``#`` starts a
        comment), with rows reordered by increasing length of the
        ``Pronunciation`` column.

    Raises:
        requests.HTTPError: if the server answers with an error status.
    """
    filename = config['dictionary_source_file']
    if not os.path.exists(filename):
        url_file = config['csv_server'] + '/' + filename
        # unfortunately wget does not work because of a redirection
        # with the server containing the CSV,
        # so wget.download(url_file) does not work
        r = requests.get(url_file, allow_redirects=True)
        # Fail loudly instead of caching an error page as the CSV: a bad
        # local file would otherwise be silently reused by every later run.
        r.raise_for_status()
        with open(filename, 'wb') as f:
            f.write(r.content)

    df = pd.read_csv(filename, sep='\t', comment='#')

    # sort the df by growing size of phonemes; a stable sort keeps rows of
    # equal length in file order, so repeated runs see the same ordering
    order = df['Pronunciation'].str.len().sort_values(
        ascending=True, kind='mergesort').index
    return df.reindex(order)

In [None]:
def download_wav(audio_ressource, verbose=False):
    """Download one audio sample from Wikimedia Commons.

    The Commons description page of ``audio_ressource`` is fetched first,
    the first ``https://upload...wav`` URL found in it is extracted, and
    that file is downloaded into ``config['audio_dir']`` unless a file with
    the same name is already there.

    Note: this function reads the module-level ``config`` (``'audio_dir'``
    and ``'sleep_time'``); it takes no config argument.

    Args:
        audio_ressource: file name of the sample on Commons, as used after
            ``File:`` in the description page URL.
        verbose: when True, print progress details.

    Returns:
        True when the wav file is available locally after the call,
        False when no wav URL could be found in the description page.
    """
    os.makedirs(config['audio_dir'], exist_ok=True)

    # first, download the META file
    # =============================
    website = 'https://commons.wikimedia.org/wiki/'
    filename = website + 'File:' + audio_ressource
    if verbose:
        print("meta_file ressource filename:%s" % filename)

    #https://commons.wikimedia.org/wiki/File:LL-Q150_(fra)-LoquaxFR-%2B.wav
    meta_file = wget.download(filename, bar=None)
    if verbose:
        print("downloaded ressource filename:%s" % meta_file)

    time.sleep(config['sleep_time'])

    # look for something like
    #https://upload.wikimedia.org/wikipedia/commons/c/cd/LL-Q150_%28fra%29-LoquaxFR-%2B.wav
    regexp_audio = re.compile(r'(https://upload.*?\.wav)')

    wav_filename = None
    try:
        # The URL being searched is plain ASCII, so undecodable bytes
        # elsewhere in the page are replaced rather than aborting the read.
        with open(meta_file, "r", encoding='utf-8', errors='replace') as mfile:
            for line in mfile:
                audio_found = regexp_audio.search(line)
                if audio_found is not None:
                    wav_filename = audio_found[1]
                    if verbose:
                        print('wav_filename=%s' % wav_filename)
                    break
    finally:
        # the description page is only a temporary file: always clean it up,
        # even when reading it failed
        os.remove(meta_file)

    # second, download the WAV file
    # =============================

    if wav_filename is None:
        print('wav_filename not found in %s' % meta_file)
        return False

    # NOTE(review): presumably drops the path segment of a transcoded
    # derivative so that the original upload is fetched -- confirm
    wav_file = wav_filename.replace('transcoded/', '')
    # transform chars like %28 and %29 in '(' and ')'
    wav_file = urllib.parse.unquote(wav_file)
    if verbose:
        print('wav_file:%s' % wav_file)

    #https://upload.wikimedia.org/wikipedia/commons/c/cd/LL-Q150_(fra)-LoquaxFR-+.wav
    # only download the file if it is not already on the local directory
    local_file = config['audio_dir'] + '/' + wav_file.split('/')[-1]
    if not os.path.exists(local_file):
        if verbose:
            print('downloading: %s' % wav_file)
        wget.download(wav_file, config['audio_dir'], bar=None)
    elif verbose:
        print('%s was already downloaded locally' % wav_file)
    return True

In [None]:
def analyze_wav(wave_file):
    """Measure a local wav file.

    Args:
        wave_file: path of the wav file, using '/' as separator.

    Returns:
        A tuple ``(name, size, duration, n_samples)``: the last path
        component, the size reported by ``os.stat``, the duration given by
        librosa truncated to 3 decimals, and the number of loaded samples.

    Raises:
        Whatever ``os.stat`` or librosa raised; the error is reported on
        stdout and then propagated unchanged.
    """
    try:
        n_bytes = os.stat(wave_file).st_size
        signal, rate = librosa.load(wave_file)

        # truncate (not round) to three decimals
        seconds = librosa.get_duration(y=signal, sr=rate)
        seconds = int(seconds * 1000) / 1000

        basename = wave_file.split('/')[-1]
        return (basename, n_bytes, seconds, len(signal))
    except BaseException:
        # report which file failed, then let the caller decide what to do
        print("analyze_wav: Unexpected error for wave_file:", wave_file)
        raise

In [None]:
def download_audio_files(config, df, verbose = False):
    """Download and measure the audio file of every eligible row of ``df``.

    Rows are visited in the order of ``df``. A row is skipped when its
    pronunciation is longer than ``config['n_max_phonemes']`` or, if
    ``config['speakers']`` is non-empty, when none of the speaker names
    occurs in its audio file name. Files already present in
    ``config['audio_dir']`` are not downloaded again. Rows whose file
    cannot be downloaded or analyzed are reported and skipped.

    Args:
        config: mapping providing ``'max_samples'``, ``'n_max_phonemes'``,
            ``'speakers'`` and ``'audio_dir'``.
        df: DataFrame with the columns ``'Word'``, ``'Pronunciation'`` and
            ``'Audio'``.
        verbose: when True, print details for every file.

    Returns:
        A tuple ``(n_words, duration_max)``: the word counter (see the
        note on the counter in the loop) and the longest duration, in
        seconds, among the analyzed files.
    """
    print('download_audio_files')
    # for information basically we have
    # * 100 files -> 10 MB
    # * 1 000 files -> 100 MB
    # * 10 000 files -> 1 GB
    # * 20 000 files -> 2 GB
    #
    # ls -l audio/ | grep ^- | wc -l
    # 10 (files)
    # du -hs audio/
    # > 1,2 M
    #

    duration_max = 0
    longest_word = ''
    current_n_max_phonemes = 0
    n_words = 0

    # Here, it is possible to only keep audio files of some speakers.
    # If this is not needed, leave speakers as an empty list.
    # speakers = ['WikiLucas00', 'Lyokoï', 'Lepticed7']
    speakers = config['speakers']

    for index, row in df.iterrows():

        # NOTE(review): n_words is reset each time the pronunciation length
        # grows (see below), so this limit applies to each length group and
        # not to the total number of files -- confirm this is intended.
        if n_words > config['max_samples'] - 1:
            print('early stopping at n:%d' %n_words)
            break

        word = row['Word']
        pronunciation = row['Pronunciation']
        wav_file = row['Audio']

        # visually display where we are in this potentially huge loop
        n_phonemes = len(pronunciation)

        if n_phonemes < current_n_max_phonemes:
            continue

        if n_phonemes > config['n_max_phonemes']:
            continue

        if n_phonemes > current_n_max_phonemes:
            print('current_n_max_phonemes:%d done (words:%s)' % (current_n_max_phonemes, n_words))
            current_n_max_phonemes = n_phonemes
            n_words = 0
            if current_n_max_phonemes > 100:
                break
        n_words += 1

        # skip files recorded by speakers that were not requested
        if speakers and not any(speaker in wav_file for speaker in speakers):
            continue

        # if the wav file has not already been downloaded,
        # then download it.
        wav_file = wav_file.replace(' ', '_')
        wav_filename = config['audio_dir'] + '/' + wav_file
        if os.path.isfile(wav_filename):
            if verbose:
                print('wav_file=%s already downloaded' % wav_filename)
        else:
            if verbose:
                print('wav_file=%s being downloaded' % wav_filename)
            # `except Exception` (not a bare except) so that Ctrl-C can
            # still interrupt this very long loop
            try:
                is_ok = download_wav(wav_file)
            except Exception as e:
                print(e)
                print('wav_file:%s could not be downloaded' % wav_file)
                continue
            if not is_ok:
                # download_wav already reported that no wav URL was found
                continue

        # analyse the wave file
        try:
            wave_file, size, duration, nb_samples = analyze_wav(wav_filename)
        except Exception:
            print('wave_file could not be ls-ed:%s' % wav_filename)
            continue
        if verbose:
            print("wave_file:%s, size:%d bytes, duration:%.3f s, nb_samples:%d" %
                    (wave_file, size, duration, nb_samples))

        # archive the result
        if duration > duration_max:
            duration_max = duration
            longest_word = word

    print('n_word=%d' % n_words)
    print('longest_word=%s, duration_max=%.3f s' % (longest_word, duration_max))
    return n_words, duration_max

In [None]:
def pad_and_convert_to_mfcc_files(config, df, verbose=False):
    """Convert the local wav files listed in ``df`` into padded MFCC files.

    For every row, the wav file is loaded from ``config['audio_dir']``,
    padded up to a fixed number of samples, converted to MFCC and saved as
    ``<stem>.npy`` in the directory ``mfcc_<time_max>_<mfcc>/``, where
    ``<stem>`` is the ``Audio`` value without its extension. Rows are
    skipped when the MFCC file already exists, when the wav file is absent
    locally, or when the audio lasts ``time_max`` seconds or more.

    Args:
        config: mapping providing ``'audio_dir'``, ``'time_max'``,
            ``'mfcc'`` and ``'max_samples'``.
        df: DataFrame with an ``'Audio'`` column holding wav file names.
        verbose: when True, print details for every file.

    Returns:
        A tuple ``(df, n)`` where ``df`` is the input DataFrame and ``n``
        counts the MFCC files written plus those that already existed.
    """
    audio_dir = config['audio_dir']
    time_max = config['time_max']
    mfcc_samples = config['mfcc']
    mfcc_dir = 'mfcc' + '_' + str(time_max) + '_' + str(mfcc_samples) + '/'

    n = 0

    # number of samples every signal is padded to
    # NOTE(review): 22050 is presumably librosa's default sampling rate,
    # which librosa.load is expected to resample to -- confirm
    shape_max = int((time_max+ 2/22050) *22050)

    # if not exists, create output directory
    os.makedirs(mfcc_dir, exist_ok=True)

    for index, row in df.iterrows():

        if n > config['max_samples']-1:
            print('early stopping at n:%d' %n)
            break

        if n % 1000 == 0 and verbose:
            print('n:%d' % n)

        # if mfcc file already exists, skip it
        # splitext only strips the extension, so stems that themselves
        # contain dots are kept whole
        name = os.path.splitext(row['Audio'])[0]
        mfcc_name = name + '.npy'
        if os.path.exists(mfcc_dir + mfcc_name):
            if verbose:
                print('mfcc %s already already downloaded' % mfcc_name)
            n += 1
            continue

        # spaces are replaced in the file name only (as the download step
        # does), never in the directory part
        wav_file = audio_dir + '/' + row['Audio'].replace(' ', '_')
        if not os.path.isfile(wav_file):
            # the download step may have skipped or failed on this row;
            # do not abort the whole conversion because of it
            if verbose:
                print('skipping %s (no local audio file: %s)' % (name, wav_file))
            continue

        y, sr = librosa.load(wav_file)

        # calculate duration of original file
        duration = librosa.get_duration(y=y, sr=sr)
        if verbose:
            print('shape_max:%d' %shape_max)

        # discard words whose duration is longer than time_max seconds
        if duration >= time_max:
            if verbose:
                print('skipping %s (time:%s > time_max:%s)' % (name, duration, time_max))
            continue

        # extend duration of the file up to time_max
        # (`size` is passed by keyword: recent librosa releases no longer
        # accept it positionally)
        y2 = librosa.util.fix_length(y, size=shape_max)

        mfcc = librosa.feature.mfcc(y=y2, sr=sr, n_mfcc = mfcc_samples)
        if verbose:
            print("mfccs.shape:", mfcc.shape)
        np.save(mfcc_dir + mfcc_name, mfcc)

        n += 1

    return df, n

In [None]:
def main():
    """Run the whole pipeline: load the CSV, download audio, build MFCC files.

    Each of the two long steps is bracketed by timestamps and followed by
    its elapsed time, all printed on stdout.
    """
    df = get_dataframe(CONFIG)

    print(df.head(3))

    # (label printed with the elapsed time, step to run), in execution order:
    # first download the audio files, then convert them into mfcc files
    steps = (
        ('download_audio_files_duration:', download_audio_files),
        ('convert_files_duration:', pad_and_convert_to_mfcc_files),
    )
    for label, step in steps:
        started = datetime.now()
        print(started)
        step(CONFIG, df, verbose=False)
        finished = datetime.now()
        print(finished)
        print(label, finished - started)

In [None]:
# Run the pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

In [None]:
# in order to execute this script in a regular .py file
# open a terminal in the local current directory and
# execute the following command:
# jupyter nbconvert --to script 'make_local_database.ipynb'