# Compilation of training dataset for Sunbird language models

In [68]:
import gdown
import pandas as pd
from decouple import config
from sklearn.model_selection import train_test_split

## Download data

In [69]:
id = config("RESOURCE_ID")
url = f"https://drive.google.com/uc?id={id}"
gdown.download(url)

Downloading...
From: https://drive.google.com/uc?id=1vRJsiBvySr9GKcqJr_kXW1hRbg_qIEPW
To: /Users/lydia/Desktop/sunbird/nlp-eda/sunbird-ug-lang-v4.0.jsonl
100%|██████████| 9.10M/9.10M [00:02<00:00, 4.13MB/s]


'sunbird-ug-lang-v4.0.jsonl'

In [70]:
data = pd.read_json("sunbird-ug-lang-v4.0.jsonl", lines=True)

## Split data into train, test and validation sets

In [71]:
train_df, test_df = train_test_split(data, test_size=0.33, random_state=42)

In [72]:
train_df.shape

(14409, 6)

In [73]:
test_df.shape

(7098, 6)

In [74]:
test_df, val_df = train_test_split(test_df, test_size=0.5, random_state=42)

In [75]:
val_df.shape

(3549, 6)

In [76]:
test_df.shape

(3549, 6)

## Create the training texts

In [77]:
language_list = list(data.columns)
language_codes = {
    "English": "en", "Luganda": "lug", "Runyankole": "run", 
    "Acholi": "ach", "Ateso": "teo", "Lugbara": "lgg"
}

In [78]:
pairs = ["en-lug", "en-run", "en-ach", "en-teo", "en-lgg"]

In [79]:
for language in language_list:
    train_df[language].to_csv(f"train.{language_codes[language]}", header=False, index=False, sep='\t', mode='a')
    test_df[language].to_csv(f"test.{language_codes[language]}", header=False, index=False, sep='\t', mode='a')
    val_df[language].to_csv(f"val.{language_codes[language]}", header=False, index=False, sep='\t', mode='a')

## Create dataset folder structure

In [39]:
!mkdir dataset

In [None]:
!mv *.* dataset

In [None]:
ls dataset/

sunbird-ug-lang-v4.0.jsonl  test.lgg   train.en   train.teo  val.lgg
test.ach                    test.run   train.lg   val.ach    val.run
test.en                     test.teo   train.lgg  val.en     val.teo
test.lg                     train.ach  train.run  val.lg


In [None]:
!tar -czvf dataset-v4.tar dataset 

dataset/
dataset/.ipynb_checkpoints/
dataset/train.ach
dataset/train.lg
dataset/train.en
dataset/test.run
dataset/train.run
dataset/val.en
dataset/val.lg
dataset/test.ach
dataset/val.ach
dataset/test.lgg
dataset/train.teo
dataset/val.run
dataset/test.en
dataset/val.teo
dataset/val.lgg
dataset/test.teo
dataset/test.lg
dataset/train.lgg


In [None]:
!zip -r dataset-v4.zip dataset/

  adding: dataset/ (stored 0%)
  adding: dataset/.ipynb_checkpoints/ (stored 0%)
  adding: dataset/train.ach (deflated 63%)
  adding: dataset/train.lg (deflated 66%)
  adding: dataset/train.en (deflated 64%)
  adding: dataset/test.run (deflated 65%)
  adding: dataset/train.run (deflated 65%)
  adding: dataset/val.en (deflated 63%)
  adding: dataset/val.lg (deflated 65%)
  adding: dataset/test.ach (deflated 63%)
  adding: dataset/val.ach (deflated 63%)
  adding: dataset/test.lgg (deflated 65%)
  adding: dataset/train.teo (deflated 67%)
  adding: dataset/val.run (deflated 64%)
  adding: dataset/test.en (deflated 63%)
  adding: dataset/val.teo (deflated 66%)
  adding: dataset/val.lgg (deflated 65%)
  adding: dataset/test.teo (deflated 66%)
  adding: dataset/test.lg (deflated 65%)
  adding: dataset/train.lgg (deflated 66%)
