### Download the dataset

In [1]:
import glob
import os
import subprocess
import tarfile
import zipfile

import requests

# Change the working directory to the root of the project
# NOTE(review): this absolute path is machine-specific; adjust it when running elsewhere.
os.chdir(r'C:\Users\TuAhnDinh\Desktop\MediaanProjects\BachelorThesisST')

COVOST_DIR = 'data/CoVoST2'
# The archives are written directly into COVOST_DIR, so it has to exist first
os.makedirs(COVOST_DIR, exist_ok=True)


def _download_file(url, target_path, chunk_size=1024 * 1024):
    """Stream the content at `url` into the file `target_path`.

    The response body is written chunk by chunk instead of being held in
    memory as a whole, because the archives fetched here can be very large.

    Raises:
        requests.HTTPError: if the server answers with an error status, so
            that an error page is never saved and mistaken for an archive.
    """
    with requests.get(url, stream=True) as response:
        response.raise_for_status()
        with open(target_path, 'wb') as f:
            for chunk in response.iter_content(chunk_size=chunk_size):
                f.write(chunk)


# Downloads voice clips and transcripts
COMMON_VOICE_BASE_URL = 'https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-4-2019-12-10'
urls = {lang: f'{COMMON_VOICE_BASE_URL}/{lang}.tar.gz'
        for lang in ('en', 'fr', 'de', 'it', 'pt', 'es', 'nl')}

for lang, url in urls.items():
    lang_dir = COVOST_DIR + '/' + lang
    # An existing language directory is taken to mean "already downloaded"
    if not os.path.exists(lang_dir):
        print(f'Downloading {lang} audios')
        filename = url.rsplit('/', 1)[1]
        archive_path = COVOST_DIR + '/' + filename
        try:
            _download_file(url, archive_path)
            print(f'Extracting {lang} audios')
            with tarfile.open(archive_path) as tf:
                tf.extractall(lang_dir)
        finally:
            # Never leave a (possibly partial) archive behind, even on failure
            if os.path.exists(archive_path):
                os.remove(archive_path)

# Downloads CoVoST 2 translations, where validated.<lang>_en.en and validated.en_<lang>.<lang> 
# are matched with the transcripts in validated.tsv
if not os.path.exists(COVOST_DIR + '/covost2'):
    url = 'https://dl.fbaipublicfiles.com/covost/covost2.zip'
    filename = url.rsplit('/', 1)[1]
    archive_path = COVOST_DIR + '/' + filename
    try:
        _download_file(url, archive_path)
        with zipfile.ZipFile(archive_path, 'r') as zf:
            zf.extractall(COVOST_DIR)
    finally:
        if os.path.exists(archive_path):
            os.remove(archive_path)


### Explore the dataset

We take a look at Dutch --> English as an example

In [2]:
import pandas as pd

# Folder holding the extracted Dutch archive
DUTCH_DIR = f'{COVOST_DIR}/nl'
# validated.tsv is tab-separated; its columns include the clip path and the sentence
validated_nl_samples = pd.read_csv(f'{DUTCH_DIR}/validated.tsv', sep='\t')
validated_nl_samples.head()

Unnamed: 0,client_id,path,sentence,up_votes,down_votes,age,gender,accent
0,0d709133bf209da7f0164653b6e5f9aee9d059ffaf7686...,common_voice_nl_17699535.mp3,De Aboriginals zijn de oorspronkelijke bewoner...,2,0,fifties,male,netherlands
1,0fca93407be6d482019f2463e60fbafdf598a82517e63c...,common_voice_nl_17694848.mp3,Mijn toetsenbord zit vol stof.,2,1,,,
2,175d4117110538cc68a8a0157a7f0a681f3e74fbe37e62...,common_voice_nl_18441136.mp3,Ze had de bank beschadigd met haar skateboard.,2,0,,,
3,2bdc4ac33c994aad2f21339eb4b972e1bf1847a67a86c3...,common_voice_nl_19841421.mp3,Waarom belde je me niet even?,2,0,fourties,male,netherlands
4,3657f0eda48d14c9ae3f324124983c69eb1c973f6fdd34...,common_voice_nl_19573544.mp3,De kinderen moesten zuchten; ze moesten nog ee...,2,0,,,


In [3]:
# Folder created by extracting covost2.zip
TRANSLATIONS_DIR = f'{COVOST_DIR}/covost2'
# The translation file has no header row, so the single column is labelled 0
nl_en_translations = pd.read_csv(
    f'{TRANSLATIONS_DIR}/validated.nl_en.en',
    sep='\t',
    header=None,
)
nl_en_translations.head()

Unnamed: 0,0
0,Aboriginals are the original inhabitants of Au...
1,My keyboard is full of dust.
2,She had damaged the couch with her skateboard.
3,Why didn’t you just call me?
4,"The kids sighed, they had to wait another hour."


Show some samples

In [4]:
from IPython.display import Audio

# Play the first few clips next to their Dutch transcript and English translation
n_samples = 3
for sample_idx in range(n_samples):
    sample_row = validated_nl_samples.loc[sample_idx]
    print('- Audio:')
    audio_path = DUTCH_DIR + '/clips/' + sample_row['path']
    print(audio_path)
    display(Audio(filename=audio_path, autoplay=False))
    print('- Transcription - nl:')
    print(sample_row['sentence'])
    print('- Translated transcription - en:')
    # Row sample_idx of the translation file is paired with row sample_idx of validated.tsv
    print(nl_en_translations.loc[sample_idx][0])
    print('-------------------------------------------------')

- Audio:
data/CoVoST2/nl/clips/common_voice_nl_17699535.mp3


- Transcription - nl:
De Aboriginals zijn de oorspronkelijke bewoners van Australië.
- Translated transcription - en:
Aboriginals are the original inhabitants of Australia.
-------------------------------------------------
- Audio:
data/CoVoST2/nl/clips/common_voice_nl_17694848.mp3


- Transcription - nl:
Mijn toetsenbord zit vol stof.
- Translated transcription - en:
My keyboard is full of dust.
-------------------------------------------------
- Audio:
data/CoVoST2/nl/clips/common_voice_nl_18441136.mp3


- Transcription - nl:
Ze had de bank beschadigd met haar skateboard.
- Translated transcription - en:
She had damaged the couch with her skateboard.
-------------------------------------------------


### Preprocess the dataset

In [5]:
# Location to save the preprocessed data.
# This is a sub-path: the cells below prepend COVOST_DIR to it when building output paths.
preprocessed_dir = 'preprocessed/dummy'

#### Preprocess the audios

Since librosa does not deal with `.mp3` files, we need a wrapper function that first converts each `.mp3` audio to a temporary `.wav` file (using ffmpeg) and then loads that file.

In [6]:
import librosa
import numpy as np

def mp3_to_wav(mp3_path):
    """Create a .wav version of a .mp3 file in the same location.

    The conversion is done by the external `ffmpeg` program, which must be
    available on the PATH.

    Args:
        mp3_path: Path to the source audio file.

    Returns:
        The path of the .wav file (the input path with its extension
        replaced by `.wav`).

    Raises:
        subprocess.CalledProcessError: if ffmpeg exits with a non-zero status.
        FileNotFoundError: if the ffmpeg executable cannot be found.
    """
    wav_path = f"{os.path.splitext(mp3_path)[0]}.wav"
    # An argument list (no shell) keeps paths containing spaces or shell
    # metacharacters intact. `-y` overwrites a stale .wav left by an earlier
    # run instead of waiting for interactive confirmation.
    subprocess.run(["ffmpeg", "-y", "-i", mp3_path, wav_path], check=True)
    return wav_path
    
# Wrapper function to load .mp3 audio
def load_mp3(mp3_path, sr=22050, mono=True, offset=0.0, duration=None, dtype=np.float32, res_type='kaiser_best'):
    """Load a .mp3 file through a temporary .wav conversion.

    The file is converted with `mp3_to_wav`, loaded with `librosa.load`, and
    the intermediate .wav file is deleted afterwards.

    Args:
        mp3_path: Path to the .mp3 file.
        sr, mono, offset, duration, dtype, res_type: Forwarded unchanged to
            `librosa.load` under the same names.

    Returns:
        The `(signal, sample_rate)` pair returned by `librosa.load`.
    """
    wav_path = mp3_to_wav(mp3_path)
    try:
        # Keyword arguments: recent librosa releases accept only the path positionally
        signal, sample_rate = librosa.load(
            wav_path,
            sr=sr,
            mono=mono,
            offset=offset,
            duration=duration,
            dtype=dtype,
            res_type=res_type,
        )
    finally:
        # Remove the .wav file when we're done, even if loading failed
        if os.path.exists(wav_path):
            os.remove(wav_path)
    return signal, sample_rate

In [7]:
# Full paths of all validated Dutch clips, in the order they appear in validated.tsv
audiodir = DUTCH_DIR + '/clips'
clip_names = validated_nl_samples['path']
audios_list = [audiodir + '/' + clip_name for clip_name in clip_names]
# First 20 clips for training, the following 10 for validation
train_audios_list = audios_list[:20]
val_audios_list = audios_list[20:30]

Extract the log Mel filterbank features (together with their deltas) of the audios

In [8]:
from python_speech_features import logfbank, calculate_delta, normalize
from kaldiio import WriteHelper

def preprocess_audios(audio_paths, output_file_prefix):
    """Extract features from audio files and store them as a Kaldi ark/scp pair.

    For every audio the output of `logfbank` is concatenated with its
    `calculate_delta` result along axis 1, passed through `normalize`, and
    written under a key equal to the audio's position in `audio_paths`
    ("0", "1", ...).

    Args:
        audio_paths: Iterable of audio file paths. Paths ending in `.mp3` are
            loaded with `load_mp3`, all others directly with `librosa.load`;
            both at a sampling rate of 16000.
        output_file_prefix: Output path without extension; `.ark` and `.scp`
            are appended to it.

    Returns:
        Tuple `(out_ark, out_scp)` with the paths of the two written files.
    """
    out_ark = output_file_prefix + ".ark"
    out_scp = output_file_prefix + ".scp"

    # Create the output directory up front so that a fresh run does not fail
    # when the writer tries to open the files
    out_dir = os.path.dirname(out_ark)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    with WriteHelper('ark,scp:'+out_ark+','+out_scp) as writer:
        for utt_id, audio in enumerate(audio_paths):
            if audio.endswith('.mp3'):
                signal, sample_rate = load_mp3(audio, sr=16000)
            else:
                signal, sample_rate = librosa.load(audio, sr=16000)
            logmel = logfbank(signal, samplerate=sample_rate)
            delta = calculate_delta(logmel)
            features = np.concatenate([logmel, delta], axis=1)
            # Original author's note: features.shape gives (x, 80) — TODO confirm
            features = normalize(features)
            writer(str(utt_id), features)
    return out_ark, out_scp
            
# Shared prefix of the feature files; only the split suffix differs
audio_features_prefix = f'{COVOST_DIR}/{preprocessed_dir}/nl_audios'
preprocess_audios(train_audios_list, audio_features_prefix + '_train')
preprocess_audios(val_audios_list, audio_features_prefix + '_val')

('data/CoVoST2/preprocessed/dummy/nl_audios_val.ark',
 'data/CoVoST2/preprocessed/dummy/nl_audios_val.scp')

#### Preprocess the text

Create a text file with one-sentence-per-line from the data 

In [9]:
# Read the Dutch validated.tsv into validated_nl_samples (the same file and
# variable name as in the exploration cell above)
validated_nl_samples = pd.read_csv(DUTCH_DIR + '/validated.tsv', sep='\t')
def collect_transcription(info_df, audio_paths, output_file_prefix):
    """Write the transcripts of the given audios to a text file, one per line.

    Args:
        info_df: DataFrame with a `path` column (clip file name) and a
            `sentence` column (its transcript).
        audio_paths: Iterable of audio paths; only the base name of each path
            is used to look up the transcript, and lines are written in this
            order.
        output_file_prefix: Output path without extension; `.txt` is appended.

    Returns:
        The path of the written text file.
    """
    output_path = f"{output_file_prefix}.txt"
    sentences_by_clip = info_df.set_index('path')['sentence']
    # Write UTF-8 explicitly: the platform default (e.g. cp1252 on Windows)
    # would corrupt non-ASCII characters for tools that expect UTF-8 text,
    # such as SentencePiece
    with open(output_path, "w", encoding="utf-8") as out_file:
        for audio_path in audio_paths:
            audio_name = os.path.basename(audio_path)
            out_file.write(sentences_by_clip.loc[audio_name])
            out_file.write("\n")
    return output_path
            
    
# Shared prefix of the raw transcript files; only the split suffix differs
raw_text_prefix = f'{COVOST_DIR}/{preprocessed_dir}/nl_raw_text'
raw_text_train_path = collect_transcription(validated_nl_samples, train_audios_list, raw_text_prefix + '_train')
raw_text_val_path = collect_transcription(validated_nl_samples, val_audios_list, raw_text_prefix + '_val')

Use [subword units](https://github.com/google/sentencepiece) to preprocess the text.

In [10]:
import sentencepiece as spm

# Train the subword model on the training transcripts
input_file = raw_text_train_path  # raw corpus file, one sentence per line
model_prefix = f'{COVOST_DIR}/{preprocessed_dir}/nl_text'
vocab_size = 172  # other values noted by the author: 8000, 16000, or 32000
# Author's guidance: 0.9995 for languages with a rich character set like
# Japanese or Chinese, and 1.0 for other languages with a small character set
character_coverage = 1
model_type = 'unigram'
spm.SentencePieceTrainer.train(
    input=input_file,
    model_prefix=model_prefix,
    vocab_size=vocab_size,
    character_coverage=character_coverage,
    model_type=model_type,
)

In [11]:
# Use the model to preprocess the text
def preprocess_text(model_path, raw_text_file, output_file, output_type=int):
    """Encode a one-sentence-per-line text file with a SentencePiece model.

    Every input line is encoded with the model and written to `output_file`
    as its encoded elements joined by single spaces, one line per sentence.

    Args:
        model_path: Path to the SentencePiece model file.
        raw_text_file: Path to the raw text file (UTF-8) to encode.
        output_file: Path of the file to write (UTF-8).
        output_type: Passed to `encode` as `out_type`; defaults to `int`.
    """
    sp = spm.SentencePieceProcessor(model_file=model_path)
    # Read as UTF-8 explicitly (instead of the platform default) so that the
    # text is decoded with the same encoding that is used for the output
    with open(raw_text_file, 'r', encoding='utf-8') as f:
        raw_lines = f.readlines()
    # Encode everything before opening the output, so a failure while
    # encoding does not leave a truncated output file behind
    processed_lines = [
        ' '.join(str(elem) for elem in sp.encode(line, out_type=output_type))
        for line in raw_lines
    ]
    with open(output_file, 'w', encoding='utf-8') as f:
        for line in processed_lines:
            f.write(line)
            f.write('\n')
        
# Encode both splits with the trained model
for split_name, raw_text_path in (('train', raw_text_train_path), ('val', raw_text_val_path)):
    preprocess_text(f"{model_prefix}.model", raw_text_path, f'{COVOST_DIR}/{preprocessed_dir}/nl_text_{split_name}.txt')