# Compilation of training dataset for Sunbird language models

In [1]:
import gdown
import pandas as pd
from decouple import config
from sklearn.model_selection import train_test_split

## Download data

In [2]:
id = config("RESOURCE_ID")
url = f"https://drive.google.com/uc?id={id}"
gdown.download(url)

Downloading...
From: https://drive.google.com/uc?id=1vRJsiBvySr9GKcqJr_kXW1hRbg_qIEPW
To: /Users/lydia/Desktop/sunbird/nlp-eda/sunbird-ug-lang-v4.0.jsonl
100%|██████████| 9.10M/9.10M [00:03<00:00, 2.64MB/s]


'sunbird-ug-lang-v4.0.jsonl'

In [3]:
data = pd.read_json("sunbird-ug-lang-v4.0.jsonl", lines=True)

## Split data into train, test and validation sets

In [4]:
train_df, test_df = train_test_split(data, test_size=0.33, random_state=42)

In [5]:
train_df.shape

(14409, 6)

In [6]:
test_df.shape

(7098, 6)

In [7]:
test_df, val_df = train_test_split(test_df, test_size=0.5, random_state=42)

In [8]:
val_df.shape

(3549, 6)

In [9]:
test_df.shape

(3549, 6)

## Create the training texts

In [10]:
language_list = list(data.columns)
language_codes = {
    "English": "en", "Luganda": "lug", "Runyankole": "run", 
    "Acholi": "ach", "Ateso": "teo", "Lugbara": "lgg"
}

In [11]:
pairs = ["en-lug", "en-run", "en-ach", "en-teo", "en-lgg"]

In [12]:
for language in language_list:
    train_df[language].to_csv(f"train.{language_codes[language]}", header=False, index=False, sep='\t', mode='a')
    test_df[language].to_csv(f"test.{language_codes[language]}", header=False, index=False, sep='\t', mode='a')
    val_df[language].to_csv(f"val.{language_codes[language]}", header=False, index=False, sep='\t', mode='a')

## Create dataset folder and add dataset files

In [13]:
!mkdir dataset

mkdir: dataset: File exists


In [14]:
!mv {*.en,*.lug,*.run,*.ach,*.teo,*.lgg} dataset

In [18]:
!ls dataset/

test.ach  test.lug  train.ach train.lug val.ach   val.lug
test.en   test.run  train.en  train.run val.en    val.run
test.lgg  test.teo  train.lgg train.teo val.lgg   val.teo


## Update dataset folder structure and create archive

In [16]:
!tar -czvf dataset-v4.tar dataset 

a dataset
a dataset/train.ach
a dataset/train.lug
a dataset/test.run
a dataset/train.lgg
a dataset/val.run
a dataset/val.teo
a dataset/test.teo
a dataset/train.en
a dataset/val.lug
a dataset/val.en
a dataset/val.ach
a dataset/test.ach
a dataset/test.lug
a dataset/test.lgg
a dataset/train.run
a dataset/test.en
a dataset/val.lgg
a dataset/train.teo


In [17]:
!zip -r dataset-v4.zip dataset/

  adding: dataset/ (stored 0%)
  adding: dataset/train.ach (deflated 63%)
  adding: dataset/train.lug (deflated 66%)
  adding: dataset/test.run (deflated 65%)
  adding: dataset/train.lgg (deflated 66%)
  adding: dataset/val.run (deflated 64%)
  adding: dataset/val.teo (deflated 66%)
  adding: dataset/test.teo (deflated 66%)
  adding: dataset/train.en (deflated 64%)
  adding: dataset/val.lug (deflated 65%)
  adding: dataset/val.en (deflated 63%)
  adding: dataset/val.ach (deflated 63%)
  adding: dataset/test.ach (deflated 63%)
  adding: dataset/test.lug (deflated 65%)
  adding: dataset/test.lgg (deflated 65%)
  adding: dataset/train.run (deflated 65%)
  adding: dataset/test.en (deflated 63%)
  adding: dataset/val.lgg (deflated 65%)
  adding: dataset/train.teo (deflated 67%)
