In [15]:
import sys
sys.path.append('../..')
import leb.dataset
import leb.utils
import yaml

## English text to Luganda and Acholi text (one-to-many translation)

In [37]:
# Dataset specification, written as YAML and parsed below.
#   huggingface_load: which dataset (path / split / name) to read; presumably
#                     forwarded to the Hugging Face loader - TODO confirm.
#   source:           English text, with the `prefix_target_language`
#                     preprocessing step applied.
#   target:           text in two languages (lug, ach). In the output below each
#                     English sentence appears once per target language, with a
#                     `>>lang<<` tag prepended to the source.
yaml_config = '''
huggingface_load:   
  path: Sunbird/salt
  split: train
  name: text-all
source:
  type: text
  language: eng
  preprocessing:
      - prefix_target_language
target:
  type: text
  language: [lug, ach]
'''

# Parse the YAML text into plain Python objects and build the dataset from it.
config = yaml.safe_load(yaml_config)
ds = leb.dataset.create(config)
# Materialise only the first five examples so the cell displays them.
list(ds.take(5))

[{'source': '>>lug<< Eggplants always grow best under warm conditions.',
  'target': 'Bbiringanya lubeerera  asinga kukulira mu mbeera ya bugumu'},
 {'source': '>>ach<< Eggplants always grow best under warm conditions.',
  'target': 'Bilinyanya pol kare dongo maber ka lyeto tye'},
 {'source': '>>lug<< Farmland is sometimes a challenge to farmers.',
  'target': "Ettaka ly'okulimirako n'okulundirako ebiseera ebimu kisoomooza abalimi"},
 {'source': '>>ach<< Farmland is sometimes a challenge to farmers.',
  'target': 'Ngom me pur i kare mukene obedo peko madit bot lupur'},
 {'source': '>>lug<< Farmers should be encouraged to grow more coffee.',
  'target': 'Abalimi balina okukubirizibwa okwongera okulima emmwanyi'}]

## Luganda speech to text

In [38]:
# Dataset specification for Luganda speech recognition: the source is speech
# and the target is text, both in the same language (lug), read from the
# `multispeaker-lug` configuration of Sunbird/salt.
yaml_config = '''
huggingface_load:
    path: Sunbird/salt
    split: train
    name: multispeaker-lug
source:
  type: speech
  language: lug
target:
  type: text
  language: lug
'''

# Parse the YAML text into plain Python objects and build the dataset from it.
config = yaml.safe_load(yaml_config)
ds = leb.dataset.create(config)

# Show the first five examples as a table; `audio_features` names the columns
# to be rendered as audio players (here the `source` column) rather than as
# raw values.
leb.utils.show_dataset(list(ds.take(5)), audio_features=['source'])

Unnamed: 0,source,target
0,Your browser does not support the audio element.,Disitulikiti erina okukendeeza ku ssente z'okwewandiisa.
1,Your browser does not support the audio element.,Tulina okukuuma obuyonjo okwewala endwadde.
2,Your browser does not support the audio element.,Bakunga abantu okugenda okwekebeza n'okubudaabudibwa..
3,Your browser does not support the audio element.,Kumpi amakomera gonna mu Uganda ga gavumenti.
4,Your browser does not support the audio element.,Abakungu bakungaanyizza obujulizi okukakasa okukozesa obubi obuyambi.


## Random augmentation

In [68]:
# Dataset specification for English -> {lug, ach, teo, ibo} text translation
# with character-level noise on the source side. The preprocessing steps are
# listed in order: `augment_characters` (with the keyword options given under
# it) comes first, then `prefix_target_language`.
#
# NOTE(review): the option values below are written as Python literals but are
# parsed as YAML. `include_numeric: False` becomes the boolean False, whereas
# `spec_char: None` becomes the *string* 'None' - YAML spells a null as `null`
# or `~`. Confirm what `augment_characters` expects for `spec_char` before
# relying on this value.
yaml_config = '''
huggingface_load:   
  path: Sunbird/salt
  split: train
  name: text-all
source:
  type: text
  language: eng
  preprocessing:
    - augment_characters:
          action: swap
          spec_char: None
          include_numeric: False
          aug_word_p: 0.1
          aug_word_min: 0
    - prefix_target_language
target:
  type: text
  language: [lug, ach, teo, ibo]
'''

# Parse the YAML text into plain Python objects and build the dataset from it.
# `ds` is displayed by the next cell.
config = yaml.safe_load(yaml_config)
ds = leb.dataset.create(config)

In [69]:
# Display the first eight examples (two English sentences x four target
# languages). The swapped characters in the source column come from the
# augmentation step; presumably they differ from run to run - TODO confirm
# whether the augmentation can be seeded.
leb.utils.show_dataset(list(ds.take(8)))

Unnamed: 0,source,target
0,>>lug<< Eggplants always grow best udnre warm conditions.,Bbiringanya lubeerera asinga kukulira mu mbeera ya bugumu
1,>>ach<< Eggplants always ogrw best under warm conditions.,Bilinyanya pol kare dongo maber ka lyeto tye
2,>>teo<< Gegplnats always grow best under warm conditions.,Epoloi ebirinyanyi ojok apakio nu emwanar akwap.
3,>>ibo<< Eggplants always grow best under warm conditions.,A na-eto eggplants mgbe nile n'ọnọdụ okpomọkụ
4,>>lug<< Farmland is sometimes a challenge to afrmrse.,Ettaka ly'okulimirako n'okulundirako ebiseera ebimu kisoomooza abalimi
5,>>ach<< Framland is sometimes a challenge to farmers.,Ngom me pur i kare mukene obedo peko madit bot lupur
6,>>teo<< Farmland is smoteiems a challenge to farmers.,Akiro nu alupok nes erai ationis kanejaas akoriok
7,>>ibo<< Farmland is sometimes a hcalleeng to farmers.,"Mgbe ụfọdụ, ihe ịma aka na-abịara ndị ọrụ ugbo bụ ala ha na-akọ ugbo"


## Reloading the module for debugging

Note that after updating the code, stale library references have to be refreshed (by reloading the modules) and some Hugging Face cache files also have to be deleted, to avoid unexpected behaviour.

In [63]:
from importlib import reload
reload(leb.dataset)
reload(leb.dataset.preprocessing)
!rm -rf ~/.cache/huggingface/datasets/generator/*