In [24]:
import gdown
import pandas as pd
from decouple import config
from sklearn.model_selection import train_test_split

In [27]:
# Read the Google Drive file id from the environment / .env settings (via
# python-decouple) so it is not hard-coded in the notebook.
# Named `resource_id` rather than `id` so the builtin `id()` is not shadowed
# for the rest of the kernel session.
resource_id = config("RESOURCE_ID")
url = f"https://drive.google.com/uc?id={resource_id}"
# Left as the last expression so the cell displays gdown's return value.
gdown.download(url)

In [28]:
# Load the corpus: a JSON Lines file, i.e. one JSON record per line.
data = pd.read_json(
    "sunbird-ug-lang-v4.0.jsonl",
    lines=True,
)

In [29]:
# Hold out 33% of the rows for evaluation; the fixed random_state keeps the
# split reproducible from run to run.
train_df, test_df = train_test_split(
    data,
    test_size=0.33,
    random_state=42,
)

In [30]:
# Size of the training split as (rows, columns).
train_df.shape

(14409, 6)

In [31]:
# Size of the held-out split before it is divided into test and validation.
test_df.shape

(7098, 6)

In [32]:
# Divide the 33% hold-out portion evenly into test and validation sets.
# The hold-out frame is re-derived from `data` with the same test_size and
# random_state as the train/hold-out split above, so it holds the same rows in
# the same order. Reading `test_df` here instead would make the cell
# non-idempotent: each re-run would halve `test_df` again.
_, holdout_df = train_test_split(data, test_size=0.33, random_state=42)
test_df, val_df = train_test_split(holdout_df, test_size=0.5, random_state=42)

In [33]:
# Size of the validation split as (rows, columns).
val_df.shape

(3549, 6)

In [34]:
# Size of the final test split as (rows, columns).
test_df.shape

(3549, 6)

## Create the training, test and validation texts

In [35]:
# Preview the English column of the training split.
train_df['English']

15709    He survived after his son heard him wailing an...
11152                     It is good to grow your talent..
177      During the coronavirus outbreak, people began ...
17663                           What is your friend up to?
14329    Rwanda does not take lightly threats of invasi...
                               ...                        
11284                           The buildings have cracks.
11964                            They raised enough funds.
5390     That professor teaches physics at the university.
860      Instead of fighting rebels, Sudan opted for pe...
15795    Engaging in difficult cases requires hard nego...
Name: English, Length: 14409, dtype: object

In [36]:
# Write every language column of the training split to its own plain-text
# file, one sentence per line, so that line N of each train.* file belongs to
# the same example.
#  - Files are opened in "w" mode: the former mode='a' appended another full
#    copy of the split each time this cell was re-run.
#  - Lines are written directly instead of through to_csv, whose CSV quoting
#    wraps any sentence containing a double quote, tab or newline in quotes and
#    doubles the embedded quotes.
#  - Embedded line breaks are replaced by a space so a sentence can never spill
#    over several lines and shift the alignment between the files.
#  - Missing values become empty lines, which keeps the line counts equal.
for column, suffix in [
    ("English", "en"),
    ("Luganda", "lg"),
    ("Runyankole", "run"),
    ("Ateso", "teo"),
    ("Lugbara", "lgg"),
    ("Acholi", "ach"),
]:
    with open(f"train.{suffix}", "w", encoding="utf-8") as out_file:
        for sentence in train_df[column].fillna("").astype(str):
            out_file.write(" ".join(sentence.splitlines()) + "\n")


In [37]:
# Write every language column of the test split to its own plain-text file,
# one sentence per line, so that line N of each test.* file belongs to the
# same example.
#  - Files are opened in "w" mode: the former mode='a' appended another full
#    copy of the split each time this cell was re-run.
#  - Lines are written directly instead of through to_csv, whose CSV quoting
#    wraps any sentence containing a double quote, tab or newline in quotes and
#    doubles the embedded quotes.
#  - Embedded line breaks are replaced by a space so a sentence can never spill
#    over several lines and shift the alignment between the files.
#  - Missing values become empty lines, which keeps the line counts equal.
for column, suffix in [
    ("English", "en"),
    ("Luganda", "lg"),
    ("Runyankole", "run"),
    ("Ateso", "teo"),
    ("Lugbara", "lgg"),
    ("Acholi", "ach"),
]:
    with open(f"test.{suffix}", "w", encoding="utf-8") as out_file:
        for sentence in test_df[column].fillna("").astype(str):
            out_file.write(" ".join(sentence.splitlines()) + "\n")


In [38]:
# Write every language column of the validation split to its own plain-text
# file, one sentence per line, so that line N of each val.* file belongs to
# the same example.
#  - Files are opened in "w" mode: the former mode='a' appended another full
#    copy of the split each time this cell was re-run.
#  - Lines are written directly instead of through to_csv, whose CSV quoting
#    wraps any sentence containing a double quote, tab or newline in quotes and
#    doubles the embedded quotes.
#  - Embedded line breaks are replaced by a space so a sentence can never spill
#    over several lines and shift the alignment between the files.
#  - Missing values become empty lines, which keeps the line counts equal.
for column, suffix in [
    ("English", "en"),
    ("Luganda", "lg"),
    ("Runyankole", "run"),
    ("Ateso", "teo"),
    ("Lugbara", "lgg"),
    ("Acholi", "ach"),
]:
    with open(f"val.{suffix}", "w", encoding="utf-8") as out_file:
        for sentence in val_df[column].fillna("").astype(str):
            out_file.write(" ".join(sentence.splitlines()) + "\n")


In [39]:
!mkdir dataset

In [None]:
!mv *.* dataset

In [None]:
ls dataset/

sunbird-ug-lang-v4.0.jsonl  test.lgg   train.en   train.teo  val.lgg
test.ach                    test.run   train.lg   val.ach    val.run
test.en                     test.teo   train.lgg  val.en     val.teo
test.lg                     train.ach  train.run  val.lg


In [None]:
!tar -czvf dataset-v4.tar dataset 

dataset/
dataset/.ipynb_checkpoints/
dataset/train.ach
dataset/train.lg
dataset/train.en
dataset/test.run
dataset/train.run
dataset/val.en
dataset/val.lg
dataset/test.ach
dataset/val.ach
dataset/test.lgg
dataset/train.teo
dataset/val.run
dataset/test.en
dataset/val.teo
dataset/val.lgg
dataset/test.teo
dataset/test.lg
dataset/train.lgg


In [None]:
!zip -r dataset-v4.zip dataset/

  adding: dataset/ (stored 0%)
  adding: dataset/.ipynb_checkpoints/ (stored 0%)
  adding: dataset/train.ach (deflated 63%)
  adding: dataset/train.lg (deflated 66%)
  adding: dataset/train.en (deflated 64%)
  adding: dataset/test.run (deflated 65%)
  adding: dataset/train.run (deflated 65%)
  adding: dataset/val.en (deflated 63%)
  adding: dataset/val.lg (deflated 65%)
  adding: dataset/test.ach (deflated 63%)
  adding: dataset/val.ach (deflated 63%)
  adding: dataset/test.lgg (deflated 65%)
  adding: dataset/train.teo (deflated 67%)
  adding: dataset/val.run (deflated 64%)
  adding: dataset/test.en (deflated 63%)
  adding: dataset/val.teo (deflated 66%)
  adding: dataset/val.lgg (deflated 65%)
  adding: dataset/test.teo (deflated 66%)
  adding: dataset/test.lg (deflated 65%)
  adding: dataset/train.lgg (deflated 66%)
