## Compilation of training dataset for Sunbird language models

In [1]:
import gdown
import pandas as pd
from decouple import config
from sklearn.model_selection import train_test_split

### Download data

In [25]:
id = config("RESOURCE_ID")
url = f"https://drive.google.com/uc?id={id}"
gdown.download(url)

Downloading...
From: https://drive.google.com/uc?id=1vRJsiBvySr9GKcqJr_kXW1hRbg_qIEPW
To: /Users/lydia/Desktop/sunbird/nlp-eda/sunbird-ug-lang-v4.0.jsonl
100%|██████████| 9.10M/9.10M [00:05<00:00, 1.66MB/s]


'sunbird-ug-lang-v4.0.jsonl'

In [2]:
data = pd.read_json("sunbird-ug-lang-v1.0.jsonl", lines=True)

### Split data into train, test and validation sets

In [3]:
train_df, test_df = train_test_split(data, test_size=0.33, random_state=42)

In [4]:
train_df.shape

(16754, 6)

In [5]:
test_df.shape

(8252, 6)

In [6]:
test_df, val_df = train_test_split(test_df, test_size=0.5, random_state=42)

In [7]:
val_df.shape

(4126, 6)

In [8]:
test_df.shape

(4126, 6)

### Create the .txt files needed for the training dataset

In [18]:
language_list = list(data.columns)
language_codes = {
    "English": "en", "Luganda": "lug", "Runyankole": "nyn", 
    "Acholi": "ach", "Ateso": "teo", "Lugbara": "lgg"
}

In [19]:
# Array of pairs to be used in later refactoring
pairs = [
    "en-lug", "en-nyn", "en-ach", "en-teo", "en-lgg", "lug-ach", "lug-nyn", "lug-lgg", 
    "lug-teo", "ach-nyn", "ach-lgg", "ach-teo", "teo-lgg", "teo-nyn", "nyn-lgg"
]

In [36]:
for language in language_list:
    train_df[language].to_csv(f"train.{language_codes[language]}", header=False, index=False, sep='\t', mode='a')
    test_df[language].to_csv(f"test.{language_codes[language]}", header=False, index=False, sep='\t', mode='a')
    val_df[language].to_csv(f"val.{language_codes[language]}", header=False, index=False, sep='\t', mode='a')

### Create initial dataset folder and add dataset files

In [37]:
!mkdir dataset

In [38]:
!mv {*.en,*.lug,*.nyn,*.ach,*.teo,*.lgg} dataset

In [39]:
!ls dataset/

test.ach  test.lug  train.ach train.lug val.ach   val.lug
test.en   test.nyn  train.en  train.nyn val.en    val.nyn
test.lgg  test.teo  train.lgg train.teo val.lgg   val.teo


### Update dataset folder structure

#### `TO DO: Refactor this dataset creation code using Python pathlib`

#### Create and update folders for English to other language pairs

In [40]:
!mkdir -p v7-dataset/v7.0/supervised/
!mkdir v7-dataset/v7.0/supervised/en-lug
!mkdir v7-dataset/v7.0/supervised/en-ach
!mkdir v7-dataset/v7.0/supervised/en-nyn
!mkdir v7-dataset/v7.0/supervised/en-lgg
!mkdir v7-dataset/v7.0/supervised/en-teo

In [41]:
!cp -v dataset/*.{en,lug} v7-dataset/v7.0/supervised/en-lug
!cp -v dataset/*.{en,ach} v7-dataset/v7.0/supervised/en-ach
!cp -v dataset/*.{en,nyn} v7-dataset/v7.0/supervised/en-nyn
!cp -v dataset/*.{en,lgg} v7-dataset/v7.0/supervised/en-lgg
!cp -v dataset/*.{en,teo} v7-dataset/v7.0/supervised/en-teo

dataset/test.en -> v7-dataset/v7.0/supervised/en-lug/test.en
dataset/train.en -> v7-dataset/v7.0/supervised/en-lug/train.en
dataset/val.en -> v7-dataset/v7.0/supervised/en-lug/val.en
dataset/test.lug -> v7-dataset/v7.0/supervised/en-lug/test.lug
dataset/train.lug -> v7-dataset/v7.0/supervised/en-lug/train.lug
dataset/val.lug -> v7-dataset/v7.0/supervised/en-lug/val.lug
dataset/test.en -> v7-dataset/v7.0/supervised/en-ach/test.en
dataset/train.en -> v7-dataset/v7.0/supervised/en-ach/train.en
dataset/val.en -> v7-dataset/v7.0/supervised/en-ach/val.en
dataset/test.ach -> v7-dataset/v7.0/supervised/en-ach/test.ach
dataset/train.ach -> v7-dataset/v7.0/supervised/en-ach/train.ach
dataset/val.ach -> v7-dataset/v7.0/supervised/en-ach/val.ach
dataset/test.en -> v7-dataset/v7.0/supervised/en-nyn/test.en
dataset/train.en -> v7-dataset/v7.0/supervised/en-nyn/train.en
dataset/val.en -> v7-dataset/v7.0/supervised/en-nyn/val.en
dataset/test.nyn -> v7-dataset/v7.0/supervised/en-nyn/test.nyn
dataset/tr

#### Create and update folders for inter-language pairs

In [42]:
!mkdir v7-dataset/v7.0/supervised/lug-ach
!mkdir v7-dataset/v7.0/supervised/lug-nyn
!mkdir v7-dataset/v7.0/supervised/lug-lgg
!mkdir v7-dataset/v7.0/supervised/lug-teo
!mkdir v7-dataset/v7.0/supervised/ach-nyn
!mkdir v7-dataset/v7.0/supervised/ach-lgg
!mkdir v7-dataset/v7.0/supervised/ach-teo
!mkdir v7-dataset/v7.0/supervised/teo-lgg
!mkdir v7-dataset/v7.0/supervised/teo-nyn
!mkdir v7-dataset/v7.0/supervised/nyn-lgg

In [43]:
!cp -v dataset/*.{lug,ach} v7-dataset/v7.0/supervised/lug-ach
!cp -v dataset/*.{lug,nyn} v7-dataset/v7.0/supervised/lug-nyn
!cp -v dataset/*.{lug,lgg} v7-dataset/v7.0/supervised/lug-lgg
!cp -v dataset/*.{lug,teo} v7-dataset/v7.0/supervised/lug-teo
!cp -v dataset/*.{ach,nyn} v7-dataset/v7.0/supervised/ach-nyn
!cp -v dataset/*.{ach,lgg} v7-dataset/v7.0/supervised/ach-lgg
!cp -v dataset/*.{ach,teo} v7-dataset/v7.0/supervised/ach-teo
!cp -v dataset/*.{teo,lgg} v7-dataset/v7.0/supervised/teo-lgg
!cp -v dataset/*.{teo,nyn} v7-dataset/v7.0/supervised/teo-nyn
!cp -v dataset/*.{nyn,lgg} v7-dataset/v7.0/supervised/nyn-lgg

dataset/test.lug -> v7-dataset/v7.0/supervised/lug-ach/test.lug
dataset/train.lug -> v7-dataset/v7.0/supervised/lug-ach/train.lug
dataset/val.lug -> v7-dataset/v7.0/supervised/lug-ach/val.lug
dataset/test.ach -> v7-dataset/v7.0/supervised/lug-ach/test.ach
dataset/train.ach -> v7-dataset/v7.0/supervised/lug-ach/train.ach
dataset/val.ach -> v7-dataset/v7.0/supervised/lug-ach/val.ach
dataset/test.lug -> v7-dataset/v7.0/supervised/lug-nyn/test.lug
dataset/train.lug -> v7-dataset/v7.0/supervised/lug-nyn/train.lug
dataset/val.lug -> v7-dataset/v7.0/supervised/lug-nyn/val.lug
dataset/test.nyn -> v7-dataset/v7.0/supervised/lug-nyn/test.nyn
dataset/train.nyn -> v7-dataset/v7.0/supervised/lug-nyn/train.nyn
dataset/val.nyn -> v7-dataset/v7.0/supervised/lug-nyn/val.nyn
dataset/test.lug -> v7-dataset/v7.0/supervised/lug-lgg/test.lug
dataset/train.lug -> v7-dataset/v7.0/supervised/lug-lgg/train.lug
dataset/val.lug -> v7-dataset/v7.0/supervised/lug-lgg/val.lug
dataset/test.lgg -> v7-dataset/v7.0/supe