In [None]:
# Download the raw dataset (a JSON Lines file that later cells parse line by line).
# -nc (--no-clobber) skips the download when the file is already present, so
# re-running this cell does not leave a "...jsonl.1" copy next to the file the
# rest of the notebook actually reads.
# NOTE(review): transfer.sh links are usually temporary -- confirm the URL is
# still live, or mirror the file somewhere durable.
!wget -nc https://transfer.sh/AvcWgi/sunbird-ug-lang-v4.0.jsonl

--2021-10-29 12:51:54--  https://transfer.sh/AvcWgi/sunbird-ug-lang-v4.0.jsonl
Resolving transfer.sh (transfer.sh)... 144.76.136.153
Connecting to transfer.sh (transfer.sh)|144.76.136.153|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 9101418 (8.7M) []
Saving to: ‘sunbird-ug-lang-v4.0.jsonl’


2021-10-29 12:51:57 (6.32 MB/s) - ‘sunbird-ug-lang-v4.0.jsonl’ saved [9101418/9101418]



In [None]:
## Logic: design notes for the two dataset variants built in this notebook.
## The notes are kept as a bare string expression, so the cell echoes them as output.
"""

V1:


A model that creates anything to English

Source sentence: Any Language
Target: English


V2:
Anything to Anything model

A model to translate into any of the Ugandan Languages; Specify Language tag

"""

'\n\nV1:\n\n\nA model that creates anything to English\n\nSource sentence: Any Language\nTarget: English\n\n\nV2:\nAnything to Anything model\n\nA model to translate into any of the Ugandan Languages; Specify Language tag\n\n'

In [None]:
import json


# Read the JSON Lines file into a list of raw JSON strings, one per record.
# The encoding is pinned to UTF-8 so decoding does not depend on the platform's
# default locale, and blank lines are dropped because json.loads() would raise
# on them in the cells below.
with open('sunbird-ug-lang-v4.0.jsonl', 'r', encoding='utf-8') as f:
    data = [line for line in f if line.strip()]

In [None]:
# Wrap the raw lines in a DataFrame: a single column holding one unparsed JSON
# string per row. In the cells shown here it is only used for its row count
# (len(df)); the records themselves are parsed from `data` with json.loads().
import pandas as pd
df = pd.DataFrame(data)

Part 1: Create Multi-Lingual Dataset to English target

In [None]:

# Parse the first record as a worked example for the functions defined below.
translated_sentence = json.loads(data[0])

# The keys of a record are the language names.
translated_sentence.keys()

dict_keys(['English', 'Luganda', 'Runyankole', 'Acholi', 'Ateso', 'Lugbara'])

In [None]:
# Show the whole sample record: one rendering of the sentence per language.
translated_sentence

{'Acholi': 'Bilinyanya pol kare dongo maber ka lyeto tye',
 'Ateso': 'Epoloi ebirinyanyi ojok apakio nu emwanar akwap.',
 'English': 'Eggplants always grow best under warm conditions.',
 'Luganda': 'Bbiringanya lubeerera  asinga kukulira mu mbeera ya bugumu',
 'Lugbara': 'Birinyanya eyi zo kililiru ndeni angu driza ma alia.',
 'Runyankole': "Entonga buriijo zikurira omu mbeera y'obwire erikutagata"}

In [None]:
# Function to generate multiple training examples from one translated sentence.
# NOTE: a later cell (Part 2) defines another function with this same name; once
# that cell has run, this many-to-one version is shadowed.
def training_examples_from_sentence(translated_sentence,
                                    target_language = 'English'):
  """Build many-to-one training pairs from a single translated sentence.

  Args:
    translated_sentence: dict mapping a language name to the sentence text in
      that language.
    target_language: key of the language every example translates into.

  Returns:
    A list of {'source': ..., 'target': ...} dicts, one for each language
    other than `target_language`, in the key order of `translated_sentence`.

  Raises:
    ValueError: if `target_language` is not a key of `translated_sentence`,
      or if it is the only key.
  """
  if target_language not in translated_sentence:
    raise ValueError(
        f'Target language {target_language} expected in translations, but '
        f'{translated_sentence.keys()} found')

  # Follow the dict's own key order. Collecting the languages in a set made the
  # order of the generated examples depend on string hashing, which changes
  # from one interpreter session to the next, so the resulting corpus (and any
  # seeded split of it) was not reproducible.
  source_languages = [lang for lang in translated_sentence
                      if lang != target_language]

  if not source_languages:
    raise ValueError('There should be at least one language apart from the '
                    'target.')

  target_text = translated_sentence[target_language]
  training_examples = [{'source': translated_sentence[lang],
                        'target': target_text}
                        for lang in source_languages
                      ]

  return training_examples

In [None]:
# Many-to-one examples for the sample record, using the default target ('English').
training_examples = training_examples_from_sentence(translated_sentence)

In [None]:
# Inspect the generated pairs: every entry shares the same English target.
training_examples

[{'source': "Entonga buriijo zikurira omu mbeera y'obwire erikutagata",
  'target': 'Eggplants always grow best under warm conditions.'},
 {'source': 'Epoloi ebirinyanyi ojok apakio nu emwanar akwap.',
  'target': 'Eggplants always grow best under warm conditions.'},
 {'source': 'Bbiringanya lubeerera  asinga kukulira mu mbeera ya bugumu',
  'target': 'Eggplants always grow best under warm conditions.'},
 {'source': 'Bilinyanya pol kare dongo maber ka lyeto tye',
  'target': 'Eggplants always grow best under warm conditions.'},
 {'source': 'Birinyanya eyi zo kililiru ndeni angu driza ma alia.',
  'target': 'Eggplants always grow best under warm conditions.'}]

In [None]:
# Number of raw records (df has one row per line read from the file).
len(df)

21507

In [None]:
# Expand every record into its English-target examples; c[i] is the list of
# pairs generated from the i-th line of the file.
c = [training_examples_from_sentence(json.loads(data[row]))
     for row in range(len(df))]

In [None]:
from itertools import chain
# Flatten the per-sentence lists into one table with a row per training example.
dataset = pd.DataFrame([pair for sentence_pairs in c for pair in sentence_pairs])

In [None]:
## Roughly 107K (source, target) sentence pairs: one per non-English translation of each record

dataset.shape

(107535, 2)

In [None]:
## train/test/val split
# First cut: hold out 33% of the pairs; the seed is fixed for repeatability.
# NOTE(review): the split is made on individual pairs, after each sentence has
# been expanded into several pairs that share one English target. The same
# target sentence can therefore land in both the training set and the hold-out
# set. Consider splitting by sentence before expanding if that leakage matters.

from sklearn.model_selection import train_test_split
train_df, test_df = train_test_split(dataset, test_size=0.33, random_state=42)

In [None]:
# Second cut: halve the 33% hold-out into test and validation sets (same seed).
test_df, val_df = train_test_split(test_df, test_size=0.5, random_state=42)

In [None]:
# (rows, columns) of the train, test and validation splits, in that order.
for split in (train_df, test_df, val_df):
    print(split.shape)

(72048, 2)
(17743, 2)
(17744, 2)


In [None]:
# Write the complete (unsplit) corpus as two line-aligned plain-text files:
# line N of other.src is the source sentence for line N of eng.tgt.
#
# Plain file writes replace DataFrame.to_csv() here for two reasons:
#   * the CSV writer wraps any value containing the separator in double quotes
#     (with sep=' ' that is every multi-word sentence) and doubles embedded
#     quote characters, which corrupts the text;
#   * mode='a' appended to whatever an earlier run had written, so re-running
#     the cell silently duplicated the corpus.
for column, path in (("source", "other.src"), ("target", "eng.tgt")):
    with open(path, "w", encoding="utf-8") as out_file:
        for sentence in dataset[column]:
            # Missing values become empty lines (as to_csv did); embedded line
            # breaks are flattened so each sentence stays on exactly one line
            # and the two files remain aligned.
            text = "" if pd.isna(sentence) else str(sentence)
            out_file.write(" ".join(text.splitlines()) + "\n")

**Create the `.src` / `.tgt` text files needed for the training dataset**

In [None]:
# File-name suffix for each column of `dataset`. Despite the variable names,
# the "languages" iterated in the next cell are the columns "source" and "target".
language_codes = dict(source="src", target="tgt")
language_list = dataset.columns.tolist()

In [None]:
# Write each split as a pair of line-aligned plain-text files
# (train.src/train.tgt, test.src/test.tgt, val.src/val.tgt).
#
# Plain file writes replace Series.to_csv() here for two reasons:
#   * the CSV writer double-quotes any sentence that contains a tab, a double
#     quote or a line break (and doubles the embedded quotes), so the text on
#     disk no longer matches the sentence;
#   * mode='a' appended to the files left by an earlier run, so re-running the
#     cell silently duplicated every split.
for language in language_list:
    suffix = language_codes[language]
    for split_name, split_df in (("train", train_df),
                                 ("test", test_df),
                                 ("val", val_df)):
        with open(f"{split_name}.{suffix}", "w", encoding="utf-8") as out_file:
            for sentence in split_df[language]:
                # Missing values become empty lines (as to_csv did); embedded
                # line breaks are flattened so the .src and .tgt files keep
                # exactly one sentence per line and stay aligned.
                text = "" if pd.isna(sentence) else str(sentence)
                out_file.write(" ".join(text.splitlines()) + "\n")

**Create initial dataset folder and add dataset files**


In [None]:
# -p makes the cell safe to re-run: no error if the folder already exists.
!mkdir -p dataset


In [None]:
# Move every generated .src/.tgt file into the dataset folder.
# Plain globs are used instead of a {..} brace list: IPython treats {..} in a
# "!" command as a Python expression to interpolate, and brace expansion is
# also not available in every shell.
!mv *.src *.tgt dataset/

In [None]:
# Check which corpus files ended up in the dataset folder.
!ls dataset/


eng.tgt  other.src  test.src  test.tgt	train.src  train.tgt  val.src  val.tgt


**Update dataset folder structure and create archive**


In [None]:
# Create the versioned folder layout in one step. -p creates all missing
# parents and does not fail when the tree already exists (the separate
# "mkdir .../src-tgt" without -p raised an error on re-run).
!mkdir -p v5-dataset/v5.0/supervised/src-tgt

In [None]:
# Copy the corpus files into the versioned tree (-v lists each file copied).
# The two globs are spelled out instead of using *.{src,tgt}: IPython treats
# {..} in a "!" command as a Python expression, so the brace form would be
# substituted if variables named `src` and `tgt` were ever defined.
!cp -v dataset/*.src dataset/*.tgt v5-dataset/v5.0/supervised/src-tgt/

'dataset/other.src' -> 'v5-dataset/v5.0/supervised/src-tgt/other.src'
'dataset/test.src' -> 'v5-dataset/v5.0/supervised/src-tgt/test.src'
'dataset/train.src' -> 'v5-dataset/v5.0/supervised/src-tgt/train.src'
'dataset/val.src' -> 'v5-dataset/v5.0/supervised/src-tgt/val.src'
'dataset/eng.tgt' -> 'v5-dataset/v5.0/supervised/src-tgt/eng.tgt'
'dataset/test.tgt' -> 'v5-dataset/v5.0/supervised/src-tgt/test.tgt'
'dataset/train.tgt' -> 'v5-dataset/v5.0/supervised/src-tgt/train.tgt'
'dataset/val.tgt' -> 'v5-dataset/v5.0/supervised/src-tgt/val.tgt'


In [None]:
# Bundle the versioned dataset tree into one archive; -r recurses into sub-folders.
!zip -r v5-dataset.zip v5-dataset/


  adding: v5-dataset/ (stored 0%)
  adding: v5-dataset/v5.0/ (stored 0%)
  adding: v5-dataset/v5.0/supervised/ (stored 0%)
  adding: v5-dataset/v5.0/supervised/src-tgt/ (stored 0%)
  adding: v5-dataset/v5.0/supervised/src-tgt/eng.tgt (deflated 91%)
  adding: v5-dataset/v5.0/supervised/src-tgt/other.src (deflated 63%)
  adding: v5-dataset/v5.0/supervised/src-tgt/val.src (deflated 58%)
  adding: v5-dataset/v5.0/supervised/src-tgt/train.src (deflated 58%)
  adding: v5-dataset/v5.0/supervised/src-tgt/test.tgt (deflated 64%)
  adding: v5-dataset/v5.0/supervised/src-tgt/test.src (deflated 58%)
  adding: v5-dataset/v5.0/supervised/src-tgt/train.tgt (deflated 65%)
  adding: v5-dataset/v5.0/supervised/src-tgt/val.tgt (deflated 64%)


**Part 2: Multi-lingual dataset covering all language pairs**

In [None]:
# Multi-lingual case: generate all examples of source and target language
# NOTE: this re-uses the name of the Part 1 function and shadows it from here on.
def training_examples_from_sentence(translated_sentence):
  """Build every ordered (source, target) pair from one translated sentence.

  Each source text is prefixed with a "<to_{target language}> " tag naming the
  language of its target.

  Args:
    translated_sentence: dict mapping a language name to the sentence text in
      that language.

  Returns:
    A list of {'source': ..., 'target': ...} dicts with n * (n - 1) entries
    for n languages. Entries are grouped by target language, and both targets
    and sources follow the key order of `translated_sentence`.

  Raises:
    ValueError: if fewer than two languages are present.
  """
  # Follow the dict's own key order. Iterating a set made the order of the
  # generated examples depend on string hashing, which changes from one
  # interpreter session to the next, so the dataset was not reproducible.
  languages = list(translated_sentence)

  if len(languages) < 2:
    # The original message ended with a stray ')'.
    raise ValueError("There must be at least two different languages, "
                     f"found {languages}")

  training_examples = []
  for target_language in languages:
    target_text = translated_sentence[target_language]

    for source_language in languages:
      if source_language == target_language:
        continue
      source_text = (f"<to_{target_language}> "
                     f"{translated_sentence[source_language]}")

      training_examples.append({'source': source_text,
                                'target': target_text})

  return training_examples

In [None]:
# The sample record has 6 languages, so 6 * 5 = 30 ordered pairs are expected.
len(training_examples_from_sentence(translated_sentence))

30

In [None]:
# Inspect the tagged pairs: the "<to_...>" prefix on each source names the target language.
training_examples_from_sentence(translated_sentence)

[{'source': '<to_Runyankole> Bbiringanya lubeerera  asinga kukulira mu mbeera ya bugumu',
  'target': "Entonga buriijo zikurira omu mbeera y'obwire erikutagata"},
 {'source': '<to_Runyankole> Bilinyanya pol kare dongo maber ka lyeto tye',
  'target': "Entonga buriijo zikurira omu mbeera y'obwire erikutagata"},
 {'source': '<to_Runyankole> Eggplants always grow best under warm conditions.',
  'target': "Entonga buriijo zikurira omu mbeera y'obwire erikutagata"},
 {'source': '<to_Runyankole> Epoloi ebirinyanyi ojok apakio nu emwanar akwap.',
  'target': "Entonga buriijo zikurira omu mbeera y'obwire erikutagata"},
 {'source': '<to_Runyankole> Birinyanya eyi zo kililiru ndeni angu driza ma alia.',
  'target': "Entonga buriijo zikurira omu mbeera y'obwire erikutagata"},
 {'source': '<to_Ateso> Bbiringanya lubeerera  asinga kukulira mu mbeera ya bugumu',
  'target': 'Epoloi ebirinyanyi ojok apakio nu emwanar akwap.'},
 {'source': "<to_Ateso> Entonga buriijo zikurira omu mbeera y'obwire eriku

In [None]:
# Build the all-pairs examples for every record; m[i] is the list of tagged
# pairs generated from the i-th line of the file.
m = [training_examples_from_sentence(json.loads(data[row]))
     for row in range(len(df))]

In [None]:
# All tagged pairs generated from the first record.
m[0]

[{'source': '<to_Runyankole> Bbiringanya lubeerera  asinga kukulira mu mbeera ya bugumu',
  'target': "Entonga buriijo zikurira omu mbeera y'obwire erikutagata"},
 {'source': '<to_Runyankole> Bilinyanya pol kare dongo maber ka lyeto tye',
  'target': "Entonga buriijo zikurira omu mbeera y'obwire erikutagata"},
 {'source': '<to_Runyankole> Eggplants always grow best under warm conditions.',
  'target': "Entonga buriijo zikurira omu mbeera y'obwire erikutagata"},
 {'source': '<to_Runyankole> Epoloi ebirinyanyi ojok apakio nu emwanar akwap.',
  'target': "Entonga buriijo zikurira omu mbeera y'obwire erikutagata"},
 {'source': '<to_Runyankole> Birinyanya eyi zo kililiru ndeni angu driza ma alia.',
  'target': "Entonga buriijo zikurira omu mbeera y'obwire erikutagata"},
 {'source': '<to_Ateso> Bbiringanya lubeerera  asinga kukulira mu mbeera ya bugumu',
  'target': 'Epoloi ebirinyanyi ojok apakio nu emwanar akwap.'},
 {'source': "<to_Ateso> Entonga buriijo zikurira omu mbeera y'obwire eriku

In [None]:
# Number of pairs generated from the first record.
len(m[0])

30

In [None]:
# Number of records processed (m holds one list of pairs per record).
len(m)

21507

In [None]:
# Total number of training examples. Summing the actual list lengths avoids
# assuming that every record has the same number of languages as m[0], and it
# does not raise when m is empty.
sum(len(examples) for examples in m)

645210

In [None]:
from itertools import chain
# Flatten the per-record lists into one table with a row per tagged example.
multi_dataset = pd.DataFrame([pair for record_pairs in m for pair in record_pairs])

In [None]:
# Peek at the last five rows as a raw array of [source, target] pairs.
multi_dataset.tail(5).values

array([["<to_Lugbara> Kamera y'eby'okwerinda yabakwata nga bamenya okuyingira mu nnyuma eyo.",
        "Ba 'du yi afa ondredreani security cameraruri ma dria yini yi nyoria jo ma alia"],
       ['<to_Lugbara> Akatambi kaabo barikucwa okutaaha omunju kakakwatwa zaa kamera zaa kareebireebi.',
        "Ba 'du yi afa ondredreani security cameraruri ma dria yini yi nyoria jo ma alia"],
       ['<to_Lugbara> Kamera me gwoko ber bedo omako cal gi ma gitye ka turo ot.',
        "Ba 'du yi afa ondredreani security cameraruri ma dria yini yi nyoria jo ma alia"],
       ['<to_Lugbara> They were captured on the security camera breaking into the house.',
        "Ba 'du yi afa ondredreani security cameraruri ma dria yini yi nyoria jo ma alia"],
       ['<to_Lugbara> Aponi ikamai kesi keda akamera eutasi aibil etogo kanu akoko.',
        "Ba 'du yi afa ondredreani security cameraruri ma dria yini yi nyoria jo ma alia"]],
      dtype=object)