# Compilation of training dataset for Sunbird language models

In [26]:
import gdown
import pandas as pd
from decouple import config
from sklearn.model_selection import train_test_split

## Download data

In [27]:
id = config("RESOURCE_ID")
url = f"https://drive.google.com/uc?id={id}"
gdown.download(url)

Downloading...
From: https://drive.google.com/uc?id=1vRJsiBvySr9GKcqJr_kXW1hRbg_qIEPW
To: /Users/lydia/Desktop/sunbird/nlp-eda/sunbird-ug-lang-v4.0.jsonl
100%|██████████| 9.10M/9.10M [00:05<00:00, 1.70MB/s]


'sunbird-ug-lang-v4.0.jsonl'

In [28]:
data = pd.read_json("sunbird-ug-lang-v4.0.jsonl", lines=True)

## Split data into train, test and validation sets

In [29]:
train_df, test_df = train_test_split(data, test_size=0.33, random_state=42)

In [30]:
train_df.shape

(14409, 6)

In [31]:
test_df.shape

(7098, 6)

In [32]:
test_df, val_df = train_test_split(test_df, test_size=0.5, random_state=42)

In [33]:
val_df.shape

(3549, 6)

In [34]:
test_df.shape

(3549, 6)

## Create the .txt files needed for the training dataset

In [35]:
language_list = list(data.columns)
language_codes = {
    "English": "en", "Luganda": "lug", "Runyankole": "run", 
    "Acholi": "ach", "Ateso": "teo", "Lugbara": "lgg"
}

In [36]:
# Array of pairs to be used in later refactoring
pairs = [
    "en-lug", "en-run", "en-ach", "en-teo", "en-lgg", "lug-ach", "lug-run", "lug-lgg", 
    "lug-teo", "ach-run", "ach-lgg", "ach-teo", "teo-lgg", "teo-run", "run-lgg"
]

In [37]:
for language in language_list:
    train_df[language].to_csv(f"train.{language_codes[language]}", header=False, index=False, sep='\t', mode='a')
    test_df[language].to_csv(f"test.{language_codes[language]}", header=False, index=False, sep='\t', mode='a')
    val_df[language].to_csv(f"val.{language_codes[language]}", header=False, index=False, sep='\t', mode='a')

## Create initial dataset folder and add dataset files

In [38]:
!mkdir dataset

mkdir: dataset: File exists


In [39]:
!mv {*.en,*.lug,*.run,*.ach,*.teo,*.lgg} dataset

In [40]:
!ls dataset/

test.ach  test.lug  train.ach train.lug val.ach   val.lug
test.en   test.run  train.en  train.run val.en    val.run
test.lgg  test.teo  train.lgg train.teo val.lgg   val.teo


## Update dataset folder structure and create archive

### `TO DO: Refactor this dataset creation code using Python pathlib`

#### Create and update folders for English to other language pairs

In [41]:
!mkdir -p v4-dataset/v4.0/supervised/
!mkdir v4-dataset/v4.0/supervised/en-lug
!mkdir v4-dataset/v4.0/supervised/en-ach
!mkdir v4-dataset/v4.0/supervised/en-run
!mkdir v4-dataset/v4.0/supervised/en-lgg
!mkdir v4-dataset/v4.0/supervised/en-teo

In [42]:
!cp -v dataset/*.{en,lug} v4-dataset/v4.0/supervised/en-lug
!cp -v dataset/*.{en,ach} v4-dataset/v4.0/supervised/en-ach
!cp -v dataset/*.{en,run} v4-dataset/v4.0/supervised/en-run
!cp -v dataset/*.{en,lgg} v4-dataset/v4.0/supervised/en-lgg
!cp -v dataset/*.{en,teo} v4-dataset/v4.0/supervised/en-teo

dataset/test.en -> v4-dataset/v4.0/supervised/en-lug/test.en
dataset/train.en -> v4-dataset/v4.0/supervised/en-lug/train.en
dataset/val.en -> v4-dataset/v4.0/supervised/en-lug/val.en
dataset/test.lug -> v4-dataset/v4.0/supervised/en-lug/test.lug
dataset/train.lug -> v4-dataset/v4.0/supervised/en-lug/train.lug
dataset/val.lug -> v4-dataset/v4.0/supervised/en-lug/val.lug
dataset/test.en -> v4-dataset/v4.0/supervised/en-ach/test.en
dataset/train.en -> v4-dataset/v4.0/supervised/en-ach/train.en
dataset/val.en -> v4-dataset/v4.0/supervised/en-ach/val.en
dataset/test.ach -> v4-dataset/v4.0/supervised/en-ach/test.ach
dataset/train.ach -> v4-dataset/v4.0/supervised/en-ach/train.ach
dataset/val.ach -> v4-dataset/v4.0/supervised/en-ach/val.ach
dataset/test.en -> v4-dataset/v4.0/supervised/en-run/test.en
dataset/train.en -> v4-dataset/v4.0/supervised/en-run/train.en
dataset/val.en -> v4-dataset/v4.0/supervised/en-run/val.en
dataset/test.run -> v4-dataset/v4.0/supervised/en-run/test.run
dataset/tr

#### Create and update folders for inter-language pairs

In [43]:
!mkdir v4-dataset/v4.0/supervised/lug-ach
!mkdir v4-dataset/v4.0/supervised/lug-run
!mkdir v4-dataset/v4.0/supervised/lug-lgg
!mkdir v4-dataset/v4.0/supervised/lug-teo
!mkdir v4-dataset/v4.0/supervised/ach-run
!mkdir v4-dataset/v4.0/supervised/ach-lgg
!mkdir v4-dataset/v4.0/supervised/ach-teo
!mkdir v4-dataset/v4.0/supervised/teo-lgg
!mkdir v4-dataset/v4.0/supervised/teo-run
!mkdir v4-dataset/v4.0/supervised/run-lgg

In [44]:
!cp -v dataset/*.{lug,ach} v4-dataset/v4.0/supervised/lug-ach
!cp -v dataset/*.{lug,run} v4-dataset/v4.0/supervised/lug-run
!cp -v dataset/*.{lug,lgg} v4-dataset/v4.0/supervised/lug-lgg
!cp -v dataset/*.{lug,teo} v4-dataset/v4.0/supervised/lug-teo
!cp -v dataset/*.{ach,run} v4-dataset/v4.0/supervised/ach-run
!cp -v dataset/*.{ach,lgg} v4-dataset/v4.0/supervised/ach-lgg
!cp -v dataset/*.{ach,teo} v4-dataset/v4.0/supervised/ach-teo
!cp -v dataset/*.{teo,lgg} v4-dataset/v4.0/supervised/teo-lgg
!cp -v dataset/*.{teo,run} v4-dataset/v4.0/supervised/teo-run
!cp -v dataset/*.{run,lgg} v4-dataset/v4.0/supervised/run-lgg

dataset/test.lug -> v4-dataset/v4.0/supervised/lug-ach/test.lug
dataset/train.lug -> v4-dataset/v4.0/supervised/lug-ach/train.lug
dataset/val.lug -> v4-dataset/v4.0/supervised/lug-ach/val.lug
dataset/test.ach -> v4-dataset/v4.0/supervised/lug-ach/test.ach
dataset/train.ach -> v4-dataset/v4.0/supervised/lug-ach/train.ach
dataset/val.ach -> v4-dataset/v4.0/supervised/lug-ach/val.ach
dataset/test.lug -> v4-dataset/v4.0/supervised/lug-run/test.lug
dataset/train.lug -> v4-dataset/v4.0/supervised/lug-run/train.lug
dataset/val.lug -> v4-dataset/v4.0/supervised/lug-run/val.lug
dataset/test.run -> v4-dataset/v4.0/supervised/lug-run/test.run
dataset/train.run -> v4-dataset/v4.0/supervised/lug-run/train.run
dataset/val.run -> v4-dataset/v4.0/supervised/lug-run/val.run
dataset/test.lug -> v4-dataset/v4.0/supervised/lug-lgg/test.lug
dataset/train.lug -> v4-dataset/v4.0/supervised/lug-lgg/train.lug
dataset/val.lug -> v4-dataset/v4.0/supervised/lug-lgg/val.lug
dataset/test.lgg -> v4-dataset/v4.0/supe

## Zip the dataset - ready for training

In [45]:
!zip -r v4-dataset.zip v4-dataset/

  adding: v4-dataset/ (stored 0%)
  adding: v4-dataset/v4.0/ (stored 0%)
  adding: v4-dataset/v4.0/supervised/ (stored 0%)
  adding: v4-dataset/v4.0/supervised/en-run/ (stored 0%)
  adding: v4-dataset/v4.0/supervised/en-run/test.run (deflated 65%)
  adding: v4-dataset/v4.0/supervised/en-run/val.run (deflated 64%)
  adding: v4-dataset/v4.0/supervised/en-run/train.en (deflated 64%)
  adding: v4-dataset/v4.0/supervised/en-run/val.en (deflated 63%)
  adding: v4-dataset/v4.0/supervised/en-run/train.run (deflated 65%)
  adding: v4-dataset/v4.0/supervised/en-run/test.en (deflated 63%)
  adding: v4-dataset/v4.0/supervised/lug-teo/ (stored 0%)
  adding: v4-dataset/v4.0/supervised/lug-teo/train.lug (deflated 66%)
  adding: v4-dataset/v4.0/supervised/lug-teo/val.teo (deflated 66%)
  adding: v4-dataset/v4.0/supervised/lug-teo/test.teo (deflated 66%)
  adding: v4-dataset/v4.0/supervised/lug-teo/val.lug (deflated 65%)
  adding: v4-dataset/v4.0/supervised/lug-teo/test.lug (deflated 65%)
  adding: v4-