# Conversion of machine translation datasets to SALT v2 format

This notebook converts the machine translation datasets that we have used so far, namely the first version of SALT plus third-party datasets, into the SALT v2 format.

The new format is .jsonl and looks like this:

    [
    {'text' : {
        'lug' : 'Oli otya?',
        'ach' : 'Itye maber?',
        'eng' : 'How are you?'}}
    {'text' : {
        'lug' : 'Weebale',
        'ach' : 'Apwoyo',
        'eng' : 'Thank you'}}
    ...
    ]
    
 The resulting files are compressed and stored in `s3://sunbird-translate`, downloadable [here](https://sunbird-translate.s3.us-east-2.amazonaws.com/salt-translation-plus-external-datasets.zip).

In [None]:
from IPython import display
import numpy as np
import os
import random
import json
import glob
import requests

In [None]:
# Directory that receives every converted .jsonl file. The trailing slash
# matters: the cells below build file paths by plain string concatenation.
OUTPUT_DIR = '../datasets/salt-translation-plus-external-datasets-15-3-23/'
# Pure-Python equivalent of `mkdir -p`, so the cell does not need a POSIX
# shell and is a no-op when the directory already exists.
os.makedirs(OUTPUT_DIR, exist_ok=True)

In [None]:
def file_to_list(path, encoding='utf-8'):
    """Read a text file and return its lines as a list of strings.

    Args:
        path: Path of the file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8
            so the result does not depend on the platform's locale setting.

    Returns:
        One string per line of the file, with trailing whitespace
        (including the line terminator) removed.
    """
    with open(path, encoding=encoding) as file:
        return [line.rstrip() for line in file]
    
def url_to_list(url, timeout=60):
    """Download a text resource and return its content split into lines.

    Args:
        url: Address of the text file to fetch.
        timeout: Seconds to wait for the server before giving up, passed
            straight to `requests.get`.

    Returns:
        The decoded response body as a list of lines without terminators.

    Raises:
        requests.HTTPError: If the server answers with an error status.
            Without this check an error page would be split into lines and
            treated as sentence data.
        requests.Timeout: If the server does not respond within `timeout`.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    return response.text.splitlines()

# MT560 datasets

MT560 data was already extracted for four languages [here](https://github.com/SunbirdAI/parallel-text-EDA/blob/main/Prepare_supplementary_translation_data_(MT560%2BFLORES101%2BAI4D).ipynb).

There are several other African languages not yet processed which can be added. We convert from our previous data files.

In [None]:
languages = ['lug', 'ach', 'nyn', 'luo']

DATA_DIR = 'v7-dataset/v7.0/supervised/'

if not os.path.exists('v7-dataset'):
    !wget https://sunbird-translate.s3.us-east-2.amazonaws.com/v7-dataset.zip
    !unzip v7-dataset.zip
    display.clear_output()
    
for language in languages:
    source = file_to_list(DATA_DIR + f'mul-en/train_mt560_{language}.src')
    target = file_to_list(DATA_DIR + f'mul-en/train_mt560_{language}.tgt')

    sentences = []
    for s, t in zip(source, target):
        sentences.append({'text': {language: s, 'eng': t}})

    with open(OUTPUT_DIR + f'mt560_{language}.jsonl', 'w') as outfile:
        for entry in sentences:
            json.dump(entry, outfile)
            outfile.write('\n')

# Makerere AI4D (Luganda to English)

15,000 good quality Luganda to English translations.

In [None]:
# DATA_DIR is defined in the MT560 cell; the AI4D files live in the same archive.
lug = file_to_list(DATA_DIR + 'mul-en/train_ai4d.src')
en = file_to_list(DATA_DIR + 'mul-en/train_ai4d.tgt')

# zip() stops at the shorter input, which would hide a misaligned pair of
# files; fail loudly instead of writing a truncated dataset.
if len(lug) != len(en):
    raise ValueError(
        f'AI4D: {len(lug)} source lines but {len(en)} target lines.')

# Line i of the .src file is paired with line i of the .tgt file.
sentences = [{'text': {'lug': s, 'eng': t}} for s, t in zip(lug, en)]

# One JSON object per line (.jsonl).
with open(OUTPUT_DIR + 'ai4d.jsonl', 'w') as outfile:
    for entry in sentences:
        json.dump(entry, outfile)
        outfile.write('\n')

# FLORES 200

This dataset contains 2000 sentences with translations in 43 languages (English plus 42 African languages). We combine the dev and devtest splits into a single set.

In [None]:
if not os.path.exists('flores200_dataset'):
    !wget --trust-server-names https://tinyurl.com/flores200dataset
    !tar xvzf flores200_dataset.tar.gz 
    display.clear_output()

languages = ['lug', 'eng', 'ibo', 'ewe', 'fon', 'hau', 'kam', 'kea', 'kik', 'kin',
             'kmb', 'kon', 'lin', 'lua', 'luo', 'nso', 'nya', 'gaz', 'run', 'sag',
             'sna', 'som', 'sot', 'ssw', 'swh', 'tir', 'tsn', 'tso', 'tum', 'twi',
             'umb', 'wol', 'xho', 'zul', 'aka', 'amh', 'aka', 'bam', 'bem', 'cjk',
             'dik', 'dyu', 'fuv', 'kbp']


source_sentences = {}

for language in languages:
    dev_path = glob.glob(f'flores200_dataset/dev/{language}*.dev')[0]
    devtest_path = glob.glob(f'flores200_dataset/devtest/{language}*.devtest')[0]
    source_sentences[language] = file_to_list(dev_path) + file_to_list(devtest_path)
    if not len(source_sentences[language]):
        raise ValueError(f'No text found for language {language}.')  

N = len(source_sentences['lug'])
sentences = []
for i in range(N):
    sentence = {'text': {}}
    for language in languages:
        sentence['text'][language] = source_sentences[language][i] 
    sentences.append(sentence)

with open(OUTPUT_DIR + f'flores200.jsonl', 'w') as outfile:
    for entry in sentences:
        json.dump(entry, outfile)
        outfile.write('\n')

# SALT v1 dataset

From the entire v1 dataset we create train, dev and test splits.

In [None]:
# Raw download link for the SALT v1 file.
SALT_URL = 'https://github.com/SunbirdAI/salt/blob/main/sunbird-ug-lang-v1.0.jsonl?raw=true'
response = requests.get(SALT_URL)
# Surface a failed download as an HTTP error instead of letting the error
# page reach json.loads and fail with a confusing parse error.
response.raise_for_status()
# Each non-blank line is parsed as one JSON record.
result = [json.loads(jline)
          for jline in response.text.splitlines()
          if jline.strip()]

In [None]:
# SALT v1 column name -> language code used as key in the v2 record.
# The order here is the key order of every generated record.
column_to_code = (
    ('English', 'eng'),
    ('Luganda', 'lug'),
    ('Acholi', 'ach'),
    ('Ateso', 'teo'),
    ('Lugbara', 'lgg'),
    ('Runyankole', 'nyn'),
)

unique_english_text = set()
sentences = []
for record in result:
    # Keep only the first record seen for each English sentence.
    if record['English'] in unique_english_text:
        continue
    sentences.append(
        {'text': {code: record[column] for column, code in column_to_code}})
    unique_english_text.add(record['English'])

In [None]:
# Fixed seed so the shuffle, and therefore the split, is reproducible.
np.random.seed(0)
order = np.random.permutation(np.arange(len(sentences)))

# First 500 shuffled indices -> dev, next 500 -> test, the rest -> train.
dev_indices, test_indices, train_indices = np.split(order, [500, 1000])
salt_dev = [sentences[index] for index in dev_indices]
salt_test = [sentences[index] for index in test_indices]
salt_train = [sentences[index] for index in train_indices]

In [None]:
# English side of each split, as sets, for the pairwise overlap checks.
eng_dev = {record['text']['eng'] for record in salt_dev}
eng_test = {record['text']['eng'] for record in salt_test}
eng_train = {record['text']['eng'] for record in salt_train}

# Any English sentence shared between two splits aborts the notebook.
for first, second, message in (
        (eng_dev, eng_test, 'Overlap between dev and test'),
        (eng_dev, eng_train, 'Overlap between dev and train'),
        (eng_train, eng_test, 'Overlap between test and train')):
    if not first.isdisjoint(second):
        raise ValueError(message)

In [None]:
# Write each split to its own .jsonl file: one JSON object per line.
for filename, split in (('salt-train.jsonl', salt_train),
                        ('salt-dev.jsonl', salt_dev),
                        ('salt-test.jsonl', salt_test)):
    with open(OUTPUT_DIR + filename, 'w') as outfile:
        outfile.writelines(json.dumps(entry) + '\n' for entry in split)

# Monolingual text (web scraped)

Data was scraped from the web using [this code](https://github.com/SunbirdAI/parallel-text-EDA/tree/main/back_translation).

In [None]:
# Base address of the scraped text files; also used by the cells below.
url_prefix = ('https://raw.githubusercontent.com/SunbirdAI/'
              'parallel-text-EDA/main/back_translation/data/')

# Fetch the English sources in order and wrap every line in a v2 record.
english_sentences = [
    {'text': {'eng': line}}
    for source in ('eng/daily-monitor.txt', 'eng/new-vision.txt')
    for line in url_to_list(url_prefix + source)
]

In [None]:
# Fetch the Luganda sources in order and wrap every line in a v2 record.
luganda_sentences = [
    {'text': {'lug': line}}
    for source in ('lug/bukedde.txt', 'lug/makerere.txt')
    for line in url_to_list(url_prefix + source)
]

In [None]:
# Fetch the Acholi sources in order and wrap every line in a v2 record.
acholi_sentences = [
    {'text': {'ach': line}}
    for source in ('ach/acholi-online.txt', 'ach/misc.txt', 'ach/rupiny.txt')
    for line in url_to_list(url_prefix + source)
]

In [None]:
# Record counts, in the order Acholi, Luganda, English.
tuple(map(len, (acholi_sentences, luganda_sentences, english_sentences)))

In [None]:
# Write each monolingual set to its own .jsonl file: one JSON object per line.
for filename, records in (('monolingual-eng.jsonl', english_sentences),
                          ('monolingual-lug.jsonl', luganda_sentences),
                          ('monolingual-ach.jsonl', acholi_sentences)):
    with open(OUTPUT_DIR + filename, 'w') as outfile:
        outfile.writelines(json.dumps(entry) + '\n' for entry in records)