## Compilation of `multilingual` training dataset for Sunbird language models

In [None]:
# Download Raw dataset
!wget https://transfer.sh/AvcWgi/sunbird-ug-lang-v4.0.jsonl

## Logic

### V1:
#### A model that creates anything to English

Source sentence: Any Language
Target: English


### V2:
#### Anything to Anything model

A model to translate into any of the Ugandan Languages: Specify Language tag


In [1]:
import json
import pandas as pd

In [2]:
with open("sunbird-ug-lang-v5.0.jsonl", "r") as f:
    data = list(f)

In [3]:
# Convert dataset to Dataframe
df = pd.DataFrame(data)

### Part 1: Create Multi-Lingual Dataset to English target

In [4]:
translated_sentence = json.loads(data[0])
translated_sentence.keys()

dict_keys(['English', 'Luganda', 'Runyankole', 'Ateso', 'Lugbara', 'Acholi'])

In [5]:
translated_sentence

{'English': 'Eggplants always grow best under warm conditions.',
 'Luganda': 'Bbiringanya lubeerera  asinga kukulira mu mbeera ya bugumu',
 'Runyankole': "Entonga buriijo zikurira omu mbeera y'obwire erikutagata",
 'Ateso': 'Epoloi ebirinyanyi ojok apakio nu emwanar akwap.',
 'Lugbara': 'Birinyanya eyi zo kililiru ndeni angu driza ma alia.',
 'Acholi': 'Bilinyanya pol kare dongo maber ka lyeto tye'}

In [8]:
# Function to generate multiple training examples from one translated sentence.
def training_examples_from_sentence(translated_sentence,
                                    target_language = 'English'):
  if target_language not in translated_sentence:
    raise ValueError(
        f'Target language {target_language} expected in translations, but '
        f'{translated_sentence.keys()} found')

  source_languages = set(translated_sentence.keys())
  source_languages.remove(target_language)

  if not source_languages:
    raise ValueError('There should be at least one language apart from the '
                    'target.')

  training_examples = [{'source': translated_sentence[lang], 
                        'target': translated_sentence[target_language]}
                        for lang in source_languages
                      ]

  return training_examples

In [9]:
training_examples = training_examples_from_sentence(translated_sentence)

In [10]:
training_examples

[{'source': 'Birinyanya eyi zo kililiru ndeni angu driza ma alia.',
  'target': 'Eggplants always grow best under warm conditions.'},
 {'source': 'Epoloi ebirinyanyi ojok apakio nu emwanar akwap.',
  'target': 'Eggplants always grow best under warm conditions.'},
 {'source': 'Bbiringanya lubeerera  asinga kukulira mu mbeera ya bugumu',
  'target': 'Eggplants always grow best under warm conditions.'},
 {'source': 'Bilinyanya pol kare dongo maber ka lyeto tye',
  'target': 'Eggplants always grow best under warm conditions.'},
 {'source': "Entonga buriijo zikurira omu mbeera y'obwire erikutagata",
  'target': 'Eggplants always grow best under warm conditions.'}]

In [11]:
len(df)

25007

In [12]:
c = []
for i in range(len(df)):
  c.append(training_examples_from_sentence(json.loads(data[i])))

In [13]:
from itertools import chain
dataset = pd.DataFrame(list(chain.from_iterable(c)))

In [14]:
# We have 125K language pairs
dataset.shape

(125035, 2)

In [15]:
# train/test/val split

from sklearn.model_selection import train_test_split
train_df, test_df = train_test_split(dataset, test_size=0.33, random_state=42)

In [16]:
test_df, val_df = train_test_split(test_df, test_size=0.5, random_state=42)

In [17]:
print(train_df.shape)
print(test_df.shape)
print(val_df.shape)

(83773, 2)
(20631, 2)
(20631, 2)


In [18]:
dataset[["source"]].to_csv(r'other.src', header=None, index=None, sep=' ', mode='a')
dataset[["target"]].to_csv(r'eng.tgt', header=None, index=None, sep=' ', mode='a')

**Create the .txt files needed for the training dataset**

In [19]:
language_list = list(dataset.columns)
language_codes = {
    "source": "src", "target": "tgt"
}

In [20]:
for language in language_list:
    train_df[language].to_csv(f"train.{language_codes[language]}", header=False, index=False, sep='\t', mode='a')
    test_df[language].to_csv(f"test.{language_codes[language]}", header=False, index=False, sep='\t', mode='a')
    val_df[language].to_csv(f"val.{language_codes[language]}", header=False, index=False, sep='\t', mode='a')

**Create initial dataset folder and add dataset files**


In [21]:
!mkdir multilingual-dataset


In [22]:
!mv {*.src,*.tgt} multilingual-dataset

In [23]:
!ls multilingual-dataset/


eng.tgt   other.src test.src  test.tgt  train.src train.tgt val.src   val.tgt


**Update dataset folder structure and create archive**


In [24]:
# !mkdir -p v6-dataset/v6.0/supervised/  # if this folder does not exist yet
!mkdir v6-dataset/v6.0/supervised/src-tgt

In [25]:
!cp -v multilingual-dataset/*.{src,tgt} v6-dataset/v6.0/supervised/src-tgt

multilingual-dataset/other.src -> v6-dataset/v6.0/supervised/src-tgt/other.src
multilingual-dataset/test.src -> v6-dataset/v6.0/supervised/src-tgt/test.src
multilingual-dataset/train.src -> v6-dataset/v6.0/supervised/src-tgt/train.src
multilingual-dataset/val.src -> v6-dataset/v6.0/supervised/src-tgt/val.src
multilingual-dataset/eng.tgt -> v6-dataset/v6.0/supervised/src-tgt/eng.tgt
multilingual-dataset/test.tgt -> v6-dataset/v6.0/supervised/src-tgt/test.tgt
multilingual-dataset/train.tgt -> v6-dataset/v6.0/supervised/src-tgt/train.tgt
multilingual-dataset/val.tgt -> v6-dataset/v6.0/supervised/src-tgt/val.tgt


In [27]:
# Zip Directory
!zip -r v6-dataset.zip v6-dataset/


  adding: v6-dataset/ (stored 0%)
  adding: v6-dataset/v6.0/ (stored 0%)
  adding: v6-dataset/v6.0/supervised/ (stored 0%)
  adding: v6-dataset/v6.0/supervised/en-run/ (stored 0%)
  adding: v6-dataset/v6.0/supervised/en-run/test.run (deflated 64%)
  adding: v6-dataset/v6.0/supervised/en-run/val.run (deflated 65%)
  adding: v6-dataset/v6.0/supervised/en-run/train.en (deflated 64%)
  adding: v6-dataset/v6.0/supervised/en-run/val.en (deflated 63%)
  adding: v6-dataset/v6.0/supervised/en-run/train.run (deflated 65%)
  adding: v6-dataset/v6.0/supervised/en-run/test.en (deflated 63%)
  adding: v6-dataset/v6.0/supervised/lug-teo/ (stored 0%)
  adding: v6-dataset/v6.0/supervised/lug-teo/train.lug (deflated 66%)
  adding: v6-dataset/v6.0/supervised/lug-teo/val.teo (deflated 66%)
  adding: v6-dataset/v6.0/supervised/lug-teo/test.teo (deflated 66%)
  adding: v6-dataset/v6.0/supervised/lug-teo/val.lug (deflated 65%)
  adding: v6-dataset/v6.0/supervised/lug-teo/test.lug (deflated 65%)
  adding: v6-

### Part2: Multi-Lingual all languages

In [29]:
# Multi-lingual case: generate all examples of source and target language
def training_examples_from_sentence(translated_sentence):

  languages = set(translated_sentence.keys())

  if len(languages) < 2:
    raise ValueError("There must be at least two different languages, "
                     f"found {languages})")

  training_examples = []
  for target_language in languages:

    source_languages = languages.copy()
    source_languages.remove(target_language)

    for source_language in source_languages:
      source_text = (f"<to_{target_language}> "
                     f"{translated_sentence[source_language]}")
      target_text = translated_sentence[target_language]

      training_examples.append({'source': source_text, 
                                'target': target_text})
      
  return training_examples

In [30]:
len(training_examples_from_sentence(translated_sentence))

30

In [31]:
training_examples_from_sentence(translated_sentence)

[{'source': '<to_Lugbara> Bilinyanya pol kare dongo maber ka lyeto tye',
  'target': 'Birinyanya eyi zo kililiru ndeni angu driza ma alia.'},
 {'source': '<to_Lugbara> Epoloi ebirinyanyi ojok apakio nu emwanar akwap.',
  'target': 'Birinyanya eyi zo kililiru ndeni angu driza ma alia.'},
 {'source': '<to_Lugbara> Bbiringanya lubeerera  asinga kukulira mu mbeera ya bugumu',
  'target': 'Birinyanya eyi zo kililiru ndeni angu driza ma alia.'},
 {'source': '<to_Lugbara> Eggplants always grow best under warm conditions.',
  'target': 'Birinyanya eyi zo kililiru ndeni angu driza ma alia.'},
 {'source': "<to_Lugbara> Entonga buriijo zikurira omu mbeera y'obwire erikutagata",
  'target': 'Birinyanya eyi zo kililiru ndeni angu driza ma alia.'},
 {'source': '<to_Ateso> Birinyanya eyi zo kililiru ndeni angu driza ma alia.',
  'target': 'Epoloi ebirinyanyi ojok apakio nu emwanar akwap.'},
 {'source': '<to_Ateso> Bilinyanya pol kare dongo maber ka lyeto tye',
  'target': 'Epoloi ebirinyanyi ojok apa

In [32]:
# Create all pairs from dataset

m = []
for i in range(len(df)):
  m.append(training_examples_from_sentence(json.loads(data[i])))

In [33]:
m[0]

[{'source': '<to_Lugbara> Bilinyanya pol kare dongo maber ka lyeto tye',
  'target': 'Birinyanya eyi zo kililiru ndeni angu driza ma alia.'},
 {'source': '<to_Lugbara> Epoloi ebirinyanyi ojok apakio nu emwanar akwap.',
  'target': 'Birinyanya eyi zo kililiru ndeni angu driza ma alia.'},
 {'source': '<to_Lugbara> Bbiringanya lubeerera  asinga kukulira mu mbeera ya bugumu',
  'target': 'Birinyanya eyi zo kililiru ndeni angu driza ma alia.'},
 {'source': '<to_Lugbara> Eggplants always grow best under warm conditions.',
  'target': 'Birinyanya eyi zo kililiru ndeni angu driza ma alia.'},
 {'source': "<to_Lugbara> Entonga buriijo zikurira omu mbeera y'obwire erikutagata",
  'target': 'Birinyanya eyi zo kililiru ndeni angu driza ma alia.'},
 {'source': '<to_Ateso> Birinyanya eyi zo kililiru ndeni angu driza ma alia.',
  'target': 'Epoloi ebirinyanyi ojok apakio nu emwanar akwap.'},
 {'source': '<to_Ateso> Bilinyanya pol kare dongo maber ka lyeto tye',
  'target': 'Epoloi ebirinyanyi ojok apa

In [34]:
len(m[0])

30

In [35]:
len(m)

25007

In [36]:
len(m)*len(m[0])

750210

In [37]:
from itertools import chain
multi_dataset = pd.DataFrame(list(chain.from_iterable(m)))

In [38]:
multi_dataset.tail(5).values

array([['<to_Runyankole> Gameteni silingi eza angiri eli vusi nzila siza ma dria',
        'Gavumenti neeshohoreza munonga omukwombeka enguuto eibara-mwaka.'],
       ['<to_Runyankole> Gamente tiyo ki cente ma dwong adada me gero ki roco gudu.',
        'Gavumenti neeshohoreza munonga omukwombeka enguuto eibara-mwaka.'],
       ['<to_Runyankole> Itosomai apugan ikapun luipu kanginikaru kotoma aiduk irotin.',
        'Gavumenti neeshohoreza munonga omukwombeka enguuto eibara-mwaka.'],
       ['<to_Runyankole> Gavumenti essaasaanya ssente nnyingi nnyo buli mwaka mu kuzimba amakubo.',
        'Gavumenti neeshohoreza munonga omukwombeka enguuto eibara-mwaka.'],
       ['<to_Runyankole> The government spends a lot of money every year on road construction.',
        'Gavumenti neeshohoreza munonga omukwombeka enguuto eibara-mwaka.']],
      dtype=object)