In [None]:
!git clone https://github.com/jqug/salt.git
!pip install -q sacremoses
!pip install -q pandas
!pip install -q datasets

import gzip
import io
from IPython import display
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
import sacremoses
import salt.constants
import datasets

# Do not truncate long cell contents when DataFrames are displayed
# (None removes the column-width limit), so whole sentences stay readable.
pd.set_option('display.max_colwidth', None)

# Prepare MT560 data

This is a big dataset, around 35GB compressed. Only a small part of it is relevant to the languages we are interested in, though. So first we read the per-line language codes and find which lines belong to languages of interest, such as `lug` (Luganda), `ach` (Acholi), or `nyn` (Runyankore).

In [None]:
import pandas as pd
import numpy as np
import concurrent.futures
from tqdm.notebook import tqdm
import salt.constants
import time
from IPython import display
import os
import gzip 
import sacremoses
import collections

!wget -nc https://object.pouta.csc.fi/OPUS-MT560/train.v1.lang.gz

print("Reading languages list...")
languages = pd.read_csv('train.v1.lang.gz', engine='c', names=['code'])
language_codes = set(salt.constants.SALT_LANGUAGE_NAMES.keys())
print(f"Dataset loaded with {len(languages)} lines and {len(language_codes)} language codes")

print("Counting examples by language...")
lang_counts = languages['code'].value_counts().to_dict()

# Sort by count for better readability
print("Language distribution:")
for code, count in sorted(language_counts.items(), key=lambda x: -x[1]):
    if count > 0:  # Only show languages that have at least one line
        print(f'{count} lines of language {code}')

In [None]:
# Preview the first 10 collected sentence pairs for language code 'alz'.
# NOTE(review): `mt560_subsets` is only created by the extraction cell further
# down, so this cell raises NameError on a fresh top-to-bottom run; run it
# after that cell.
mt560_subsets['alz'][:10]

Now retrieve the actual sentences. This should take ~30 minutes to download and ~50 minutes to iterate over.

In [None]:
!wget https://object.pouta.csc.fi/OPUS-MT560/train.v1.eng.tok.gz
!wget https://object.pouta.csc.fi/OPUS-MT560/train.v1.src.tok.gz

In [None]:

# Language codes whose lines are kept from the corpus.
language_codes = set(salt.constants.SALT_LANGUAGE_NAMES.keys())
detokenizer = sacremoses.MosesDetokenizer(lang='en')

# Collected sentence-pair records, grouped by language code.
mt560_subsets = collections.defaultdict(list)

# Both files are read as bytes and decoded per line; they are walked in
# lockstep with the language-code column, so position i in each refers to the
# same sentence pair.
with gzip.open('train.v1.src.tok.gz', 'r') as src_file, \
        gzip.open('train.v1.eng.tok.gz', 'r') as eng_file:
    aligned_lines = zip(src_file, eng_file, languages['code'])
    for raw_src, raw_eng, language in tqdm(aligned_lines, total=len(languages)):
        if language not in language_codes:
            continue
        record = {
            f'{language}_target_text': detokenizer.detokenize([raw_src.decode('utf8')]),
            'eng_text': detokenizer.detokenize([raw_eng.decode('utf8')]),
        }
        mt560_subsets[language].append(record)

In [None]:
# A sentence pair is kept only if its longer side is shorter than this
# multiple of its shorter side (measured in characters).
MAX_LENGTH_RATIO = 2.5

for selected_language in ['swa', 'kin', 'ibo', 'lug', 'ach', 'nyn', 'koo', 'ttj']:
    print('###', selected_language, '###')
    records = mt560_subsets.get(selected_language, [])
    if not records:
        # An empty subset yields a DataFrame without the text columns, which
        # would raise KeyError in the length filter below.
        print(f'No MT560 sentences collected for {selected_language}; skipping.')
        continue
    df = pd.DataFrame(records)

    # Filter out rows where the text in one language is more than
    # MAX_LENGTH_RATIO times as long as the other.
    len_eng = df["eng_text"].str.len()
    len_src = df[f"{selected_language}_target_text"].str.len()
    df = df[
        np.maximum(len_eng, len_src) < MAX_LENGTH_RATIO * np.minimum(len_eng, len_src)
    ]

    # After filtering, the index is no longer a plain range; preserve_index=False
    # keeps it from being exported as an extra "__index_level_0__" column.
    ds = datasets.Dataset.from_pandas(df, preserve_index=False)
    ds.push_to_hub(
        "Sunbird/external-translation-data",
        config_name=f'mt560_{selected_language}_unidirectional',
        private=True, split='train')