# Compilation of training dataset for Sunbird language models

In [2]:
import gdown
import pandas as pd
from decouple import config
from sklearn.model_selection import train_test_split

## Download data

In [3]:
id = config("RESOURCE_ID")
url = f"https://drive.google.com/uc?id={id}"
gdown.download(url)

Downloading...
From: https://drive.google.com/uc?id=1vRJsiBvySr9GKcqJr_kXW1hRbg_qIEPW
To: /Users/lydia/Desktop/sunbird/nlp-eda/sunbird-ug-lang-v4.0.jsonl
100%|██████████| 9.10M/9.10M [00:05<00:00, 1.66MB/s]


'sunbird-ug-lang-v4.0.jsonl'

In [4]:
data = pd.read_json("sunbird-ug-lang-v4.0.jsonl", lines=True)

## Split data into train, test and validation sets

In [5]:
train_df, test_df = train_test_split(data, test_size=0.33, random_state=42)

In [6]:
train_df.shape

(14409, 6)

In [7]:
test_df.shape

(7098, 6)

In [8]:
test_df, val_df = train_test_split(test_df, test_size=0.5, random_state=42)

In [9]:
val_df.shape

(3549, 6)

In [10]:
test_df.shape

(3549, 6)

## Create the training texts

In [12]:
language_list = list(data.columns)
language_codes = {
    "English": "en", "Luganda": "lug", "Runyankole": "run", 
    "Acholi": "ach", "Ateso": "teo", "Lugbara": "lgg"
}

In [13]:
pairs = ["en-lug", "en-run", "en-ach", "en-teo", "en-lgg"]

In [14]:
for language in language_list:
    train_df[language].to_csv(f"train.{language_codes[language]}", header=False, index=False, sep='\t', mode='a')
    test_df[language].to_csv(f"test.{language_codes[language]}", header=False, index=False, sep='\t', mode='a')
    val_df[language].to_csv(f"val.{language_codes[language]}", header=False, index=False, sep='\t', mode='a')

## Create dataset folder and add dataset files

In [15]:
!mkdir dataset

mkdir: dataset: File exists


In [16]:
!mv {*.en,*.lug,*.run,*.ach,*.teo,*.lgg} dataset

In [17]:
!ls dataset/

test.ach  test.lug  train.ach train.lug val.ach   val.lug
test.en   test.run  train.en  train.run val.en    val.run
test.lgg  test.teo  train.lgg train.teo val.lgg   val.teo


## Update dataset folder structure and create archive

In [29]:
!mkdir -p v5-dataset/v5.0/supervised/
!mkdir v5-dataset/v5.0/supervised/en-lug
!mkdir v5-dataset/v5.0/supervised/en-ach
!mkdir v5-dataset/v5.0/supervised/en-run
!mkdir v5-dataset/v5.0/supervised/en-lgg
!mkdir v5-dataset/v5.0/supervised/en-teo

In [32]:
!cp -v dataset/*.{en,lug} v5-dataset/v5.0/supervised/en-lug
!cp -v dataset/*.{en,ach} v5-dataset/v5.0/supervised/en-ach
!cp -v dataset/*.{en,run} v5-dataset/v5.0/supervised/en-run
!cp -v dataset/*.{en,lgg} v5-dataset/v5.0/supervised/en-lgg
!cp -v dataset/*.{en,teo} v5-dataset/v5.0/supervised/en-teo

dataset/test.en -> v5-dataset/v5.0/supervised/en-lug/test.en
dataset/train.en -> v5-dataset/v5.0/supervised/en-lug/train.en
dataset/val.en -> v5-dataset/v5.0/supervised/en-lug/val.en
dataset/test.lug -> v5-dataset/v5.0/supervised/en-lug/test.lug
dataset/train.lug -> v5-dataset/v5.0/supervised/en-lug/train.lug
dataset/val.lug -> v5-dataset/v5.0/supervised/en-lug/val.lug
dataset/test.en -> v5-dataset/v5.0/supervised/en-ach/test.en
dataset/train.en -> v5-dataset/v5.0/supervised/en-ach/train.en
dataset/val.en -> v5-dataset/v5.0/supervised/en-ach/val.en
dataset/test.ach -> v5-dataset/v5.0/supervised/en-ach/test.ach
dataset/train.ach -> v5-dataset/v5.0/supervised/en-ach/train.ach
dataset/val.ach -> v5-dataset/v5.0/supervised/en-ach/val.ach
dataset/test.en -> v5-dataset/v5.0/supervised/en-run/test.en
dataset/train.en -> v5-dataset/v5.0/supervised/en-run/train.en
dataset/val.en -> v5-dataset/v5.0/supervised/en-run/val.en
dataset/test.run -> v5-dataset/v5.0/supervised/en-run/test.run
dataset/tr

In [33]:
!zip -r dataset-v5.zip v5-dataset/

  adding: v5-dataset/ (stored 0%)
  adding: v5-dataset/v5.0/ (stored 0%)
  adding: v5-dataset/v5.0/supervised/ (stored 0%)
  adding: v5-dataset/v5.0/supervised/en-run/ (stored 0%)
  adding: v5-dataset/v5.0/supervised/en-run/test.run (deflated 65%)
  adding: v5-dataset/v5.0/supervised/en-run/val.run (deflated 64%)
  adding: v5-dataset/v5.0/supervised/en-run/train.en (deflated 64%)
  adding: v5-dataset/v5.0/supervised/en-run/val.en (deflated 63%)
  adding: v5-dataset/v5.0/supervised/en-run/train.run (deflated 65%)
  adding: v5-dataset/v5.0/supervised/en-run/test.en (deflated 63%)
  adding: v5-dataset/v5.0/supervised/en-teo/ (stored 0%)
  adding: v5-dataset/v5.0/supervised/en-teo/val.teo (deflated 66%)
  adding: v5-dataset/v5.0/supervised/en-teo/test.teo (deflated 66%)
  adding: v5-dataset/v5.0/supervised/en-teo/train.en (deflated 64%)
  adding: v5-dataset/v5.0/supervised/en-teo/val.en (deflated 63%)
  adding: v5-dataset/v5.0/supervised/en-teo/test.en (deflated 63%)
  adding: v5-dataset/v