In [1]:
import json

In [2]:
# Locations of the v1.1 dataset splits (.jsonl files), one directory above
# the notebook's working directory.
train_file = "../salt-train-v1.1.jsonl"
test_file = "../salt-test-v1.1.jsonl"
dev_file = "../salt-dev-v1.1.jsonl"

In [3]:
def get_data(filename):
    """Load a JSON Lines file into a list of parsed records.

    Args:
        filename: Path to a text file holding one JSON document per line.

    Returns:
        list: One parsed object per non-blank line, in file order. An empty
        file yields an empty list.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    data = []

    # Decode as UTF-8 explicitly instead of relying on the platform default
    # encoding, so non-ASCII text loads the same way on every machine.
    with open(filename, encoding='utf-8') as json_file:
        # Iterate the file lazily; no need to hold every raw line in memory.
        for json_str in json_file:
            # Blank lines (e.g. a trailing empty line at end of file) carry no
            # record, and json.loads would raise on them.
            if not json_str.strip():
                continue
            data.append(json.loads(json_str))

    return data

In [13]:
# Parse each split into a list of records via get_data.
train_data = get_data(train_file)
test_data = get_data(test_file)
dev_data = get_data(dev_file)

# Show the first dev record as the cell output to inspect the record layout.
dev_data[0]

{'text': {'eng': "It's the government's responsibility to teach its people about various diseases",
  'lug': "Buvunaanyizibwa bwa gavumenti okusomesa abantu baayo ku bulwadde obw'enjawulo",
  'ach': 'Obedo tic pa gamente me pwonyo lwak i kom two mapat pat',
  'teo': 'Erai aswam apugan aisisianakin itunga ke nuikamunitos adekasinei nu egelegela.',
  'lgg': "Eri azi gamete ni imbata fezu 'ba ivile 'diyini azo ndundu eyi ma dria.",
  'nyn': "N'obujunanizibwa bwa Gavumenti okwegyesa abantu baayo ebikwatirane n'endwara nyingi."}}

In [10]:
import csv

def text_to_audio_file(
    filename,
    base_url="https://salt-tts-data.s3.eu-west-1.amazonaws.com/data/",
):
    """Build a text -> audio URL lookup from a CSV file.

    The first row is treated as a header and skipped. For every following
    row, column index 2 is used as the text and column index 3 as the audio
    file name; spaces in the file name are replaced with '+' before it is
    appended to ``base_url``.

    Args:
        filename: Path to the CSV file (comma-delimited).
        base_url: Prefix placed in front of each audio file name. Defaults to
            the bucket URL this notebook has always used.

    Returns:
        dict: Maps each text to its audio URL. If the same text appears in
        several rows, the last row wins. An empty file yields an empty dict.

    Raises:
        OSError: If the file cannot be opened.
        IndexError: If a non-empty data row has fewer than four columns.
    """
    text_to_audio = dict()
    # newline='' is what the csv module expects so quoted fields containing
    # line breaks are parsed correctly; UTF-8 is set explicitly so decoding
    # does not depend on the platform default.
    with open(filename, newline='', encoding='utf-8') as csvfile:
        reader = csv.reader(csvfile, delimiter=',')
        # Skip the header; the None default keeps an empty file from raising
        # StopIteration.
        next(reader, None)
        for row in reader:
            # csv.reader yields [] for completely blank lines; ignore those.
            if not row:
                continue
            text = row[2]
            audio_url = base_url + row[3].replace(' ', '+')
            text_to_audio[text] = audio_url
    return text_to_audio

In [15]:
# Text -> audio URL lookups, one per language, read from CSV files located in
# the notebook's working directory.
luganda_tts = text_to_audio_file('luganda_studio_tts_dataset.csv')
english_tts = text_to_audio_file('english_studio_tts_dataset.csv')

In [19]:
# Count text/audio matches over all three splits. Luganda and English are
# checked independently, so a single record can contribute 0, 1 or 2.
matches = sum(
    (record['text']['lug'] in luganda_tts) + (record['text']['eng'] in english_tts)
    for record in dev_data + test_data + train_data
)

print(matches)

4851


In [23]:
def add_tts_data(prev_data, lug_audio=None, eng_audio=None):
    """Return copies of the records with matching audio URLs attached.

    For each record, the Luganda and English sentences (``record['text']['lug']``
    and ``record['text']['eng']``) are looked up in the audio mappings. Every
    hit is stored under ``record['tts-speech'][<language code>]``. Records
    without any hit are copied through unchanged.

    Args:
        prev_data: Iterable of record dicts, each with a ``'text'`` dict that
            has ``'lug'`` and ``'eng'`` keys.
        lug_audio: Mapping of Luganda text -> audio URL. Defaults to the
            notebook-level ``luganda_tts``.
        eng_audio: Mapping of English text -> audio URL. Defaults to the
            notebook-level ``english_tts``.

    Returns:
        list: One output record per input record, in the same order.

    Raises:
        KeyError: If a record lacks ``'text'``, ``'lug'`` or ``'eng'``.
    """
    # Fall back to the notebook-level lookups when none are passed explicitly.
    if lug_audio is None:
        lug_audio = luganda_tts
    if eng_audio is None:
        eng_audio = english_tts

    data_tts = []
    for line in prev_data:
        line_tts = line.copy()
        lug_text = line['text']['lug']
        en_text = line['text']['eng']

        found = {}
        if lug_text in lug_audio:
            found['lug'] = lug_audio[lug_text]
        if en_text in eng_audio:
            found['eng'] = eng_audio[en_text]

        if found:
            # line.copy() is shallow: an existing 'tts-speech' dict would be
            # shared with the input record. Build a fresh dict so the caller's
            # data is never modified.
            line_tts['tts-speech'] = {**line.get('tts-speech', {}), **found}
        data_tts.append(line_tts)
    return data_tts

# Run add_tts_data over every split; results are bound to new *_tts names.
train_data_tts = add_tts_data(train_data)
test_data_tts = add_tts_data(test_data)
dev_data_tts = add_tts_data(dev_data)

In [24]:
# Number of records in the training split returned by add_tts_data.
print(len(train_data_tts))

23947


In [26]:
# add_tts_data emits one record per input record, so each split must keep
# its original size.
assert len(dev_data) == len(dev_data_tts)
assert len(test_data) == len(test_data_tts)
assert len(train_data) == len(train_data_tts)

In [31]:
def write_to_jsonl(data, output_filename):
    """Serialize records to a file, one JSON document per line.

    Args:
        data: Iterable of JSON-serializable objects.
        output_filename: Destination path; an existing file is overwritten.
    """
    with open(output_filename, 'w') as sink:
        # Each record becomes a single line terminated by a newline.
        sink.writelines(json.dumps(entry) + '\n' for entry in data)


In [32]:
# Save the dev split (with any 'tts-speech' entries added by add_tts_data)
# as the v1.2 file in the current working directory.
write_to_jsonl(dev_data_tts, 'salt-dev-v1.2.jsonl')

In [33]:
# Save the test split (with any 'tts-speech' entries added by add_tts_data)
# as the v1.2 file in the current working directory.
write_to_jsonl(test_data_tts, 'salt-test-v1.2.jsonl')

In [34]:
# Save the train split (with any 'tts-speech' entries added by add_tts_data)
# as the v1.2 file in the current working directory.
write_to_jsonl(train_data_tts, 'salt-train-v1.2.jsonl')