### Download the dataset

In [1]:
import requests
import os
import tarfile
import zipfile
import glob
import pandas as pd
import csv

# Change the working directory to the root of the project
os.chdir(r'C:\Users\TuAhnDinh\Desktop\MediaanProjects\BachelorThesisST')

COVOST_DIR = 'data/CoVoST2'

# Downloads voice clips and transcripts
urls = {'en': 'https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-4-2019-12-10/en.tar.gz',
        'fr': 'https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-4-2019-12-10/fr.tar.gz',
        'de': 'https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-4-2019-12-10/de.tar.gz',
        'it': 'https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-4-2019-12-10/it.tar.gz',
        'pt': 'https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-4-2019-12-10/pt.tar.gz',
        'es': 'https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-4-2019-12-10/es.tar.gz',
        'nl': 'https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-4-2019-12-10/nl.tar.gz'}

for lang, url in urls.items():
    lang_dir = COVOST_DIR + '/' + lang
    if not os.path.exists(lang_dir):
        print(f'Downloading {lang} audios')
        filename = url.rsplit('/', 1)[1]
        r = requests.get(url)
        with open(COVOST_DIR + '/' + filename, 'wb') as f:
            f.write(r.content)
        print(f'Extracting {lang} audios')
        tf = tarfile.open(COVOST_DIR + '/' + filename)
        tf.extractall(lang_dir)
        tf.close()
        os.remove(COVOST_DIR + '/' + filename)

XX_EN_LANGUAGES  = ['fr', 'de', 'it', 'pt', 'es', 'nl']
EN_XX_LANGUAGES  = ['de', 'et']
# Download CoVoST 2 translations (covost_v2.<src_lang_code>_<tgt_lang_code>.tsv, 
# which matches the rows in validated.tsv from Common Voice)
if not os.path.exists(COVOST_DIR + '/covost2'):
    os.mkdir(COVOST_DIR + '/covost2')
for lang in XX_EN_LANGUAGES:
    if not os.path.exists(COVOST_DIR + '/covost2' + f'/{lang}_en'):
        os.mkdir(COVOST_DIR + '/covost2'+ f'/{lang}_en')
        # Download and extract .tsv file
        url = f'https://dl.fbaipublicfiles.com/covost/covost_v2.{lang}_en.tsv.tar.gz'
        filename = url.rsplit('/', 1)[1]
        print(f'Download and extracting {filename}')
        r = requests.get(url)
        with open(COVOST_DIR + '/covost2' + f'/{lang}_en' + f'/{filename}', 'wb') as f:
            f.write(r.content)
        tf = tarfile.open(COVOST_DIR + '/covost2' + f'/{lang}_en' + f'/{filename}')
        tf.extractall(COVOST_DIR + '/covost2' + f'/{lang}_en')
        tf.close()
        os.remove(COVOST_DIR + '/covost2' + f'/{lang}_en' + f'/{filename}')
        
        # Split .tsv file into train, dev and test set
        os.system(f"python get_covost_splits.py "
                  f"--version 2 --src-lang {lang} --tgt-lang en "
                  f"--root {COVOST_DIR + '/covost2' + f'/{lang}_en'} "
                  f"--cv-tsv {COVOST_DIR + '/' + lang + '/validated.tsv'}")
        
for lang in EN_XX_LANGUAGES:
    if not os.path.exists(COVOST_DIR + '/covost2' + f'/en_{lang}'):
        os.mkdir(COVOST_DIR + '/covost2'+ f'/en_{lang}')
        # Download and extract .tsv file
        url = f'https://dl.fbaipublicfiles.com/covost/covost_v2.en_{lang}.tsv.tar.gz'
        filename = url.rsplit('/', 1)[1]
        print(f'Download and extracting {filename}')
        r = requests.get(url)
        with open(COVOST_DIR + '/covost2' + f'/en_{lang}' + f'/{filename}', 'wb') as f:
            f.write(r.content)
        tf = tarfile.open(COVOST_DIR + '/covost2' + f'/en_{lang}' + f'/{filename}')
        tf.extractall(COVOST_DIR + '/covost2' + f'/en_{lang}')
        tf.close()
        os.remove(COVOST_DIR + '/covost2' + f'/en_{lang}' + f'/{filename}')
        
        # Split .tsv file into train, dev and test set
        os.system(f"python get_covost_splits.py "
                  f"--version 2 --src-lang en --tgt-lang {lang} "
                  f"--root {COVOST_DIR + '/covost2' + f'/en_{lang}'} "
                  f"--cv-tsv {COVOST_DIR + '/' + 'en' + '/validated.tsv'}")

**Note on data split**: 

- Use standard Common Voice dev/test splits (no duplicated sentences)
- Use extended Common Voice train split to improve data utilization (include all duplicated sentences with different speakers)

More info on the [Covost2 paper](https://arxiv.org/pdf/2007.10310.pdf).

#### A function to remove empty .mp3 audio lines from a split dataframe

In [2]:
def remove_empty_audio(split_df, audiodir):
    empty = []
    paths = split_df['path'].values
    for path in paths:
        if os.path.getsize(audiodir + '/' + path) == 0:
            print(f"found {path} to be empty")
            empty.append(path)
    new_df = split_df.set_index('path')
    new_df.drop(labels=empty, axis='index', inplace=True)
    new_df.reset_index(inplace=True)
    return new_df

def remove_empty_transcription(split_df):
    new_df = split_df.loc[(split_df['sentence'] != "") & (split_df['sentence'] != '""') & (split_df['translation'] != "") & (split_df['translation'] != '""')]
    return new_df
    
def read_tsv_split(translation_dir, src_lang, tgt_lang, split, audiodir):
    split_df = pd.read_csv(translation_dir + f'/covost_v2.{src_lang}_{tgt_lang}.{split}.tsv', sep='\t', header=0, encoding="utf-8", escapechar="\\", quoting=csv.QUOTE_NONE, na_filter=False)
    return remove_empty_transcription(remove_empty_audio(split_df, audiodir))

### Explore the dataset

We take a look at Dutch --> English as an example

In [None]:
DUTCH_AUDIO_DIR = COVOST_DIR + '/nl'
NL_EN_TRANSLATIONS_DIR = COVOST_DIR + '/covost2' + '/nl_en'
nl_en_translations_dev = pd.read_csv(NL_EN_TRANSLATIONS_DIR + '/covost_v2.nl_en.dev.tsv', sep='\t', header=0, encoding="utf-8", escapechar="\\", quoting=csv.QUOTE_NONE, na_filter=False)
nl_en_translations_dev.head()

Show some samples

### Some statistic

In [2]:
# import librosa
# import numpy as np

# X_en_stat = pd.DataFrame(columns=[['numb. of audios','numb. of audios','numb. of audios',
#                                    'numb. of unique sentence auios','numb. of unique sentence auios','numb. of unique sentence auios',
#                                    'avg audio length (s)','avg audio length (s)','avg audio length (s)',
#                                    'avg numb. of words per audio','avg numb. of words per audio','avg numb. of words per audio'],
#                                  ['train','dev','test',
#                                   'train','dev','test',
#                                   'train','dev','test',
#                                   'train', 'dev','test']])

# for lang in XX_EN_LANGUAGES:
#     SRC_AUDIO_DIR = COVOST_DIR + '/' + lang
#     audiodir = SRC_AUDIO_DIR + '/clips'
#     TRANSLATIONS_DIR =  COVOST_DIR + '/covost2' + f'/{lang}_en'
#     for split in ['train', 'dev', 'test']:
#         split_df = read_tsv_split(TRANSLATIONS_DIR, src_lang=lang, tgt_lang='en', split=split, audiodir=audiodir)
#         X_en_stat.at[lang, ('numb. of audios', split)] = len(split_df)
#         X_en_stat.at[lang, ('numb. of unique sentence auios', split)] = len(set(split_df['sentence'].values))
#         X_en_stat.at[lang, ('avg audio length (s)', split)] = np.mean(np.array([librosa.get_duration(filename=audiodir + '/' + path) for path in split_df['path'].values]))
#         X_en_stat.at[lang, ('avg numb. of words per audio', split)] = np.mean(np.array([len(s.split()) for s in split_df['sentence'].values]))

# X_en_stat.to_csv(COVOST_DIR + '/' + 'X_en_stat.csv')

X_en_stat = pd.read_csv(COVOST_DIR + '/' + 'X_en_stat.csv', index_col=0, header=[0,1])
X_en_stat

Unnamed: 0_level_0,numb. of audios,numb. of audios,numb. of audios,numb. of unique sentence auios,numb. of unique sentence auios,numb. of unique sentence auios,avg audio length (s),avg audio length (s),avg audio length (s),avg numb. of words per audio,avg numb. of words per audio,avg numb. of words per audio
Unnamed: 0_level_1,train,dev,test,train,dev,test,train,dev,test,train,dev,test
fr,207372,14760,14760,130602,14760,14760,4.544324,5.260833,5.639743,8.898622,9.348848,9.544986
de,127824,13511,13511,71831,13511,13511,5.146449,5.459307,5.697595,8.689651,8.885871,8.779143
it,31698,8940,8951,19387,8940,8951,4.976882,5.734083,6.148922,9.669443,9.718009,9.802257
pt,9158,3318,4023,6014,3318,4023,4.000819,4.72616,4.722993,7.348766,7.623568,7.890132
es,79013,13221,13221,64351,13221,13221,5.109949,5.896785,6.140504,9.458089,9.7982,9.863324
nl,7108,1699,1699,1893,1699,1699,3.661269,3.978693,4.264626,8.137029,8.331371,8.39847


In [3]:
# en_X_stat = pd.DataFrame(columns=[['numb. of audios','numb. of audios','numb. of audios',
#                                    'numb. of unique sentence auios','numb. of unique sentence auios','numb. of unique sentence auios',
#                                    'avg audio length (s)','avg audio length (s)','avg audio length (s)',
#                                    'avg numb. of words per audio','avg numb. of words per audio','avg numb. of words per audio'],
#                                  ['train','dev','test',
#                                   'train','dev','test',
#                                   'train','dev','test',
#                                   'train', 'dev','test']])

# for lang in EN_XX_LANGUAGES:
#     SRC_AUDIO_DIR = COVOST_DIR + '/' + 'en'
#     audiodir = SRC_AUDIO_DIR + '/clips'
#     TRANSLATIONS_DIR =  COVOST_DIR + '/covost2' + f'/en_{lang}'
#     for split in ['train', 'dev', 'test']:
#         split_df = read_tsv_split(TRANSLATIONS_DIR, src_lang='en', tgt_lang=lang, split=split, audiodir=audiodir)
#         en_X_stat.at[lang, ('numb. of audios', split)] = len(split_df)
#         en_X_stat.at[lang, ('numb. of unique sentence auios', split)] = len(set(split_df['sentence'].values))
#         en_X_stat.at[lang, ('avg audio length (s)', split)] = np.mean(np.array([librosa.get_duration(filename=audiodir + '/' + path) for path in split_df['path'].values]))
#         en_X_stat.at[lang, ('avg numb. of words per audio', split)] = np.mean(np.array([len(s.split()) for s in split_df['sentence'].values]))

# en_X_stat.to_csv(COVOST_DIR + '/' + 'en_X_stat.csv')        

en_X_stat = pd.read_csv(COVOST_DIR + '/' + 'en_X_stat.csv', index_col=0, header=[0,1])
en_X_stat

Unnamed: 0_level_0,numb. of audios,numb. of audios,numb. of audios,numb. of unique sentence auios,numb. of unique sentence auios,numb. of unique sentence auios,avg audio length (s),avg audio length (s),avg audio length (s),avg numb. of words per audio,avg numb. of words per audio,avg numb. of words per audio
Unnamed: 0_level_1,train,dev,test,train,dev,test,train,dev,test,train,dev,test
de,289413,15531,15531,232958,15531,15531,5.300589,6.006085,5.670569,9.801999,9.852295,9.100315


### Preprocess a sample dataset 

In [7]:
# Location to save the preprocessed data
src_lang = 'en'
tgt_lang = 'de'
preprocessed_dir = f'preprocessed/{src_lang}_{tgt_lang}'
os.mkdir(f'{COVOST_DIR}/{preprocessed_dir}')

SRC_AUDIO_DIR = COVOST_DIR + '/' + src_lang
audiodir = SRC_AUDIO_DIR + '/clips'

TRANSLATIONS_DIR =  COVOST_DIR + '/covost2' + f'/{src_lang}_{tgt_lang}'
train_df = read_tsv_split(TRANSLATIONS_DIR, src_lang=src_lang, tgt_lang=tgt_lang, split='train', audiodir=audiodir)
val_df = read_tsv_split(TRANSLATIONS_DIR, src_lang=src_lang, tgt_lang=tgt_lang, split='dev', audiodir=audiodir)
test_df = read_tsv_split(TRANSLATIONS_DIR, src_lang=src_lang, tgt_lang=tgt_lang, split='test', audiodir=audiodir)

train_audios_list = [audiodir + '/' + path for path in train_df['path']]
val_audios_list = [audiodir + '/' + path for path in val_df['path']]
test_audios_list = [audiodir + '/' + path for path in test_df['path']]

FileExistsError: [WinError 183] Cannot create a file when that file already exists: 'data/CoVoST2/preprocessed/dummy/en_de'

#### Preprocess the audios

Since librosa does not deal with `.mp3` files, we need a wrapper function to load `.mp3` audios

In [None]:
import librosa
import numpy as np

# Create a .wav verson of a .mp3 file in the same location, and return the path to the .wav file
def mp3_to_wav(mp3_path):
    wav_path = f"{mp3_path[:-4]}.wav"
    gf = os.system(f"""ffmpeg -i {mp3_path} {wav_path}""")
    return wav_path
    
# Wrapper function to load .mp3 audio
def load_mp3(mp3_path, sr=22050, mono=True, offset=0.0, duration=None, dtype=np.float32, res_type='kaiser_best'):
    wav_path = mp3_to_wav(mp3_path)
    signal, sample_rate = librosa.load(wav_path, sr, mono, offset, duration, dtype, res_type)
    # Remove the .wav file when we're done
    os.remove(wav_path)
    return signal, sample_rate

Extract the MFCC features of the audios

In [None]:
from python_speech_features import logfbank, calculate_delta, normalize
from kaldiio import WriteHelper

def preprocess_audios(audio_paths, output_file_prefix):
    out_ark = output_file_prefix + ".ark"
    out_scp = output_file_prefix + ".scp"
    count=0

    with WriteHelper('ark,scp:'+out_ark+','+out_scp) as writer:
        for audio in audio_paths:
            if audio.endswith('.mp3'):
                signal, sample_rate = load_mp3(audio, sr=16000)
            else:
                signal, sample_rate = librosa.load(audio, sr=16000)
            logmel = logfbank(signal, samplerate=sample_rate)
            delta = calculate_delta(logmel)
            features = np.concatenate([logmel, delta], axis=1)
            features = normalize(features) # features.shape gives (x, 80)
            writer(str(count), features)
            count = count + 1
    return out_ark, out_scp
            
preprocess_audios(train_audios_list, f'{COVOST_DIR}/{preprocessed_dir}/{src_lang}_audio_train')
preprocess_audios(val_audios_list, f'{COVOST_DIR}/{preprocessed_dir}/{src_lang}_audio_val')
preprocess_audios(test_audios_list, f'{COVOST_DIR}/{preprocessed_dir}/{src_lang}_audio_test')

#### Preprocess the text

Create a text file with one-sentence-per-line from the data 

In [None]:
def collect_transcription(info_df, audio_paths, output_file_prefix, transcription_type='translated'):
    """ transcription_type is either 'original' or 'translated'
    """
    info_df_re_indexed = info_df.set_index('path')
    with open(f"{output_file_prefix}.txt", "w", encoding="utf-8") as out_file:
        for audio_path in audio_paths:
            # write line to output file
            audio_name = os.path.basename(audio_path)
            if transcription_type == 'original':
                out_file.write(prepare_sentence(info_df_re_indexed.loc[audio_name]['sentence']))
            elif transcription_type == 'translated':
                out_file.write(prepare_sentence(info_df_re_indexed.loc[audio_name]['translation']))
            else:
                raise RuntimeError("transcription_type is either 'original' or 'translated'")
            out_file.write("\n")
    return f"{output_file_prefix}.txt"

def prepare_sentence(sentence):
    if sentence.startswith('"') and sentence.endswith('"'):
        return sentence[1:-1]
    return sentence
             
raw_text_train_path = collect_transcription(train_df, train_audios_list, f'{COVOST_DIR}/{preprocessed_dir}/{tgt_lang}_raw_text_train')
raw_text_val_path = collect_transcription(val_df, val_audios_list, f'{COVOST_DIR}/{preprocessed_dir}/{tgt_lang}_raw_text_val')
raw_text_test_path = collect_transcription(test_df, test_audios_list, f'{COVOST_DIR}/{preprocessed_dir}/{tgt_lang}_raw_text_test')


Use [subword units](https://github.com/google/sentencepiece) to preprocess the text.

In [16]:
def subword_unit(model_path, raw_text_file, output_file, output_type=str):
    """
    Use a Sentence Piece model to do subword unit on a text file
    """
    sp = spm.SentencePieceProcessor(model_file=model_path)
    with open(raw_text_file, 'r', encoding="utf-8") as f:
        raw_lines = f.readlines()
    processed_lines = [' '.join([str(elem) for elem in sp.encode(line, out_type=output_type)]) for line in raw_lines]
    with open(output_file, 'w', encoding='utf-8') as f:
        for line in processed_lines:
            f.write(line)
            f.write('\n')

In [4]:
def preprocess_transcription(transcription_type, train_df, val_df, test_df, train_audios_list, val_audios_list, test_audios_list,
                    save_location, lang):
    raw_text_train_path = collect_transcription(train_df, train_audios_list, f'{save_location}/{lang}_raw_text_train', transcription_type)
    raw_text_val_path = collect_transcription(val_df, val_audios_list, f'{save_location}/{lang}_raw_text_val', transcription_type)
    raw_text_test_path = collect_transcription(test_df, test_audios_list, f'{save_location}/{lang}_raw_text_test', transcription_type)

    # Train the model to do subword unit on the text
    input_file = raw_text_train_path  # one-sentence-per-line raw corpus file
    model_prefix = f'{save_location}/{lang}_text'
    vocab_size = 8000  # 8000, 16000, or 32000
    if lang == 'zh-CN' or lang == 'ja':
        character_coverage = 0.9995  # 0.9995 for languages with rich character set like Japanese or Chinese
    else:
        character_coverage = 1  # and 1.0 for other languages with small character set
    model_type = 'unigram'
    spm.SentencePieceTrainer.train(input=input_file, model_prefix=model_prefix, vocab_size=vocab_size,
                                   character_coverage=character_coverage, model_type=model_type)
    
    subword_unit(f"{model_prefix}.model", raw_text_train_path,
                    f'{save_location}/{lang}_text_train.txt')
    subword_unit(f"{model_prefix}.model", raw_text_val_path,
                    f'{save_location}/{lang}_text_val.txt')
    subword_unit(f"{model_prefix}.model", raw_text_test_path,
                    f'{save_location}/{lang}_text_test.txt')

In [None]:
import sentencepiece as spm

preprocess_transcription('original', train_df, val_df, test_df, train_audios_list, val_audios_list, test_audios_list, 
                         f'{COVOST_DIR}/{preprocessed_dir}', src_lang)
preprocess_transcription('translated', train_df, val_df, test_df, train_audios_list, val_audios_list, test_audios_list, 
                         f'{COVOST_DIR}/{preprocessed_dir}', tgt_lang)

### Confirm that the duplicated sentences are distributed equally over the dataset

In [3]:
df = pd.read_csv("../data/CoVoST2/covost2/en_de/covost_v2.en_de.train.tsv", sep='\t', header=0, encoding="utf-8", escapechar="\\", quoting=csv.QUOTE_NONE, na_filter=False)

In [5]:
print(df.shape)
df.head()

(289430, 4)


Unnamed: 0,path,sentence,translation,client_id
0,common_voice_en_19664034.mp3,"""These data components in turn serve as the """"...",Diese Datenkomponenten wiederum dienen als die...,4f29be8fe932d773576dd3df5e111929f4e22242232245...
1,common_voice_en_19664035.mp3,The church is unrelated to the Jewish politica...,Die Kirche ist nicht mit der jüdischen politis...,4f29be8fe932d773576dd3df5e111929f4e22242232245...
2,common_voice_en_19664037.mp3,The following represents architectures which h...,Die folgenden Architekturen sind stellvertrete...,4f29be8fe932d773576dd3df5e111929f4e22242232245...
3,common_voice_en_19664038.mp3,"Additionally, the pulse output can be directed...",Außerdem kann die Impulsausgabe durch eine von...,4f29be8fe932d773576dd3df5e111929f4e22242232245...
4,common_voice_en_19664040.mp3,The two are robbed by a pickpocket who is losi...,Die Zwei werden von einem Taschendieb ausgerau...,4f29be8fe932d773576dd3df5e111929f4e22242232245...


In [6]:
df['sentence'].nunique()

232975

In [13]:
dup = df[df.duplicated(subset=['sentence'], keep=False)]
dup.shape

(75563, 4)

In [15]:
dup.loc[:round(289430/10)]

Unnamed: 0,path,sentence,translation,client_id
10,common_voice_en_589230.mp3,What did you think of that trip.,Wie hat dir die Reise gefallen?,4f9d2db67e38d0513bf84559689e88b6e790b7ed82d930...
11,common_voice_en_589234.mp3,Two men spoke in Arabic while having a drink a...,"Zwei Männer, die an der Bar etwas tranken, spr...",4f9d2db67e38d0513bf84559689e88b6e790b7ed82d930...
28,common_voice_en_181712.mp3,Two children playing on a statue,Zwei Kinder spielen auf einer Statue.,4fdb7a7c4e6b6e03754daa018cb1af74b93024a58502b3...
42,common_voice_en_19643733.mp3,"He was still charming, venerable, and courteou...","Er war noch immer charmant, ehrwürdig, zuvorko...",4ffcdeb024dca515637e87338a67b9e9ea69483761a461...
46,common_voice_en_19643737.mp3,"""All five of the """"general electorates"""" were ...",Die General Voters Party gewann alle fünf der ...,4ffcdeb024dca515637e87338a67b9e9ea69483761a461...
...,...,...,...,...
28930,common_voice_en_18608834.mp3,"‘No, no!’ said the Queen.","Die Königin sagte: ‘Nein, nein!‘",fc2591bf6268fc853f1e782a156d49b6408449eef9588d...
28931,common_voice_en_18608835.mp3,"‘Yes,’ said Alice, ‘we learned French and music.’","‘Ja,‘ sagte Alice, ‘wir haben für Mathe und Fr...",fc2591bf6268fc853f1e782a156d49b6408449eef9588d...
28932,common_voice_en_18608836.mp3,Do you hear any sound?,Hörst du etwas?,fc2591bf6268fc853f1e782a156d49b6408449eef9588d...
28941,common_voice_en_19645214.mp3,"McGonigal was born in Melville, Saskatchewan.","McGonigal wurde in Melville, Saskatchewan gebo...",fcd1914648b65b2b9eb150d55ed6381df311512bf31d40...


**Observe**: the duplicated sentences are distributed equally throughout the dataset

### Create artificial language: reversed English

In [5]:
import string

def reverse_sentence(line):
    reverse_line = line[::-1]
    reverse_line = reverse_line.lower()
    reverse_line = reverse_line.translate(str.maketrans('', '', string.punctuation))
    if line[-1] in string.punctuation:
        # Put back the last punctuation of the sentence if any
        reverse_line = reverse_line + line[-1]
    # Capitalize the beginning of the sentence
    reverse_line = reverse_line[0].upper() + reverse_line[1:]
    return reverse_line
     
reverse_sentence("Hello World!")

'Dlrow olleh!'

In [13]:
def reverse_transcription(original_file, output_file):
    with open(original_file, 'r', encoding="utf-8") as f:
        original_sentences = f.readlines()
    original_sentences = [sentence.rstrip('\n') for sentence in original_sentences]
    print(original_sentences[:2])
    with open(output_file, "w", encoding="utf-8") as out_file:
        for sentence in original_sentences:
            out_file.write(reverse_sentence(sentence))
            out_file.write("\n")
    return output_file

reverse_transcription("en_raw_text_test.txt", "enr_raw_text_test.txt")

["She'll be all right.", "All's well that ends well."]


'enr_raw_text_test.txt'

In [None]:
def preprocess_reversed_transcription(raw_text_train_path, raw_text_val_path, raw_text_test_path, save_location, lang):
    # Train the model to do subword unit on the text
    input_file = raw_text_train_path  # one-sentence-per-line raw corpus file
    model_prefix = f'{save_location}/{lang}_text'
    # TODO
    vocab_size = 8000  # 8000, 16000, or 32000
    if lang == 'zh-CN' or lang == 'ja':
        character_coverage = 0.9995  # 0.9995 for languages with rich character set like Japanese or Chinese
    else:
        character_coverage = 1  # and 1.0 for other languages with small character set
    model_type = 'unigram'
    spm.SentencePieceTrainer.train(input=input_file, model_prefix=model_prefix, vocab_size=vocab_size,
                                   character_coverage=character_coverage, model_type=model_type)

    subword_unit(f"{model_prefix}.model", raw_text_train_path,
                 f'{save_location}/{lang}_text_train.txt')
    subword_unit(f"{model_prefix}.model", raw_text_val_path,
                 f'{save_location}/{lang}_text_val.txt')
    subword_unit(f"{model_prefix}.model", raw_text_test_path,
                 f'{save_location}/{lang}_text_test.txt')
    
def subword_unit(model_path, raw_text_file, output_file, output_type=str):
    """
    Use a Sentence Piece model to do subword unit on a text file
    """
    sp = spm.SentencePieceProcessor(model_file=model_path)
    with open(raw_text_file, 'r', encoding="utf-8") as f:
        raw_lines = f.readlines()
    processed_lines = [' '.join([str(elem) for elem in sp.encode(line, out_type=output_type)]) for line in raw_lines]
    with open(output_file, 'w', encoding='utf-8') as f:
        for line in processed_lines:
            f.write(line)
            f.write('\n')