In [1]:
import os, sys
import pandas as pd
import numpy as np
import torch
import datasets
from datasets import load_dataset

# Load Dataset

In [2]:
# Every corpus below is loaded into a plain dict mapping a language key to the
# object returned by `load_dataset` for that language's config.

# Language keys shared by all NusaTranslation / NusaX dictionaries.
NUSA_LANGS = ["btk", "sun", "jav", "mad", "mak", "min"]

# NusaX config names differ from the keys used in this notebook.
# NOTE(review): "btk" is loaded from the NusaX "bbc" config and "mak" from the
# "bug" config -- presumably the closest available NusaX languages; confirm
# that this substitution is intended.
NUSAX_CONFIG_BY_LANG = {
    "btk": "bbc",
    "sun": "sun",
    "jav": "jav",
    "mad": "mad",
    "mak": "bug",
    "min": "min",
}

# Language keys shared by MasakhaNews and MAFAND.
AFRICAN_LANGS = ["amh", "hau", "ibo", "lug", "pcm", "sna", "swa", "xho", "yor"]

# AmericasNLI config names (used as keys unchanged).
AMERICAS_LANGS = ["aym", "bzd", "cni", "gn", "hch", "nah", "oto", "quy", "shp", "tar"]

# NusaTranslation Senti
nt_senti_dset = {
    lang: load_dataset(
        "indonlp/nusatranslation_senti",
        name=f"nusatranslation_senti_{lang}_nusantara_text",
    )
    for lang in NUSA_LANGS
}

# NusaTranslation MT
nt_mt_dset = {
    lang: load_dataset(
        "indonlp/nusatranslation_mt",
        name=f"nusatranslation_mt_{lang}_ind_nusantara_t2t",
    )
    for lang in NUSA_LANGS
}

# NusaX Senti
nusax_senti_dset = {
    lang: load_dataset("indonlp/NusaX-senti", name=config)
    for lang, config in NUSAX_CONFIG_BY_LANG.items()
}

# NusaX MT ind
nusax_mt_ind_dset = {
    lang: load_dataset("indonlp/NusaX-MT", name=f"{config}-ind")
    for lang, config in NUSAX_CONFIG_BY_LANG.items()
}

# NusaX MT eng (Extended experiment)
nusax_mt_eng_dset = {
    lang: load_dataset("indonlp/NusaX-MT", name=f"{config}-eng")
    for lang, config in NUSAX_CONFIG_BY_LANG.items()
}

# MasakhaNews
masakhanews_dset = {
    lang: load_dataset("masakhane/masakhanews", name=lang)
    for lang in AFRICAN_LANGS
}

# MAFAND (configs are named "en-<lang>")
mafand_dset = {
    lang: load_dataset("masakhane/mafand", name=f"en-{lang}")
    for lang in AFRICAN_LANGS
}

# AmericasNLI
americasnli_dset = {
    lang: load_dataset("americas_nli", name=lang)
    for lang in AMERICAS_LANGS
}

# XNLI (es)
xnli_es_dset = load_dataset("xnli", name="es")

Found cached dataset nusatranslation_senti (/home/samuel/.cache/huggingface/datasets/indonlp___nusatranslation_senti/nusatranslation_senti_btk_nusantara_text/1.0.0/84380b9fe1509e8d8d8c7c3c6d475f39181456c63a2fad48e88ceb95a642c952)


  0%|          | 0/3 [00:00<?, ?it/s]

Found cached dataset nusatranslation_senti (/home/samuel/.cache/huggingface/datasets/indonlp___nusatranslation_senti/nusatranslation_senti_sun_nusantara_text/1.0.0/84380b9fe1509e8d8d8c7c3c6d475f39181456c63a2fad48e88ceb95a642c952)


  0%|          | 0/3 [00:00<?, ?it/s]

Found cached dataset nusatranslation_senti (/home/samuel/.cache/huggingface/datasets/indonlp___nusatranslation_senti/nusatranslation_senti_jav_nusantara_text/1.0.0/84380b9fe1509e8d8d8c7c3c6d475f39181456c63a2fad48e88ceb95a642c952)


  0%|          | 0/3 [00:00<?, ?it/s]

Found cached dataset nusatranslation_senti (/home/samuel/.cache/huggingface/datasets/indonlp___nusatranslation_senti/nusatranslation_senti_mad_nusantara_text/1.0.0/84380b9fe1509e8d8d8c7c3c6d475f39181456c63a2fad48e88ceb95a642c952)


  0%|          | 0/3 [00:00<?, ?it/s]

Found cached dataset nusatranslation_senti (/home/samuel/.cache/huggingface/datasets/indonlp___nusatranslation_senti/nusatranslation_senti_mak_nusantara_text/1.0.0/84380b9fe1509e8d8d8c7c3c6d475f39181456c63a2fad48e88ceb95a642c952)


  0%|          | 0/3 [00:00<?, ?it/s]

Found cached dataset nusatranslation_senti (/home/samuel/.cache/huggingface/datasets/indonlp___nusatranslation_senti/nusatranslation_senti_min_nusantara_text/1.0.0/84380b9fe1509e8d8d8c7c3c6d475f39181456c63a2fad48e88ceb95a642c952)


  0%|          | 0/3 [00:00<?, ?it/s]

Found cached dataset nusatranslation_mt (/home/samuel/.cache/huggingface/datasets/indonlp___nusatranslation_mt/nusatranslation_mt_btk_ind_nusantara_t2t/1.0.0/7bc55ca0431a2e37367c37eaadf3eb41305c25f22713b09932865f27d6b4c005)


  0%|          | 0/3 [00:00<?, ?it/s]

Found cached dataset nusatranslation_mt (/home/samuel/.cache/huggingface/datasets/indonlp___nusatranslation_mt/nusatranslation_mt_sun_ind_nusantara_t2t/1.0.0/7bc55ca0431a2e37367c37eaadf3eb41305c25f22713b09932865f27d6b4c005)


  0%|          | 0/3 [00:00<?, ?it/s]

Found cached dataset nusatranslation_mt (/home/samuel/.cache/huggingface/datasets/indonlp___nusatranslation_mt/nusatranslation_mt_jav_ind_nusantara_t2t/1.0.0/7bc55ca0431a2e37367c37eaadf3eb41305c25f22713b09932865f27d6b4c005)


  0%|          | 0/3 [00:00<?, ?it/s]

Found cached dataset nusatranslation_mt (/home/samuel/.cache/huggingface/datasets/indonlp___nusatranslation_mt/nusatranslation_mt_mad_ind_nusantara_t2t/1.0.0/7bc55ca0431a2e37367c37eaadf3eb41305c25f22713b09932865f27d6b4c005)


  0%|          | 0/3 [00:00<?, ?it/s]

Found cached dataset nusatranslation_mt (/home/samuel/.cache/huggingface/datasets/indonlp___nusatranslation_mt/nusatranslation_mt_mak_ind_nusantara_t2t/1.0.0/7bc55ca0431a2e37367c37eaadf3eb41305c25f22713b09932865f27d6b4c005)


  0%|          | 0/3 [00:00<?, ?it/s]

Found cached dataset nusatranslation_mt (/home/samuel/.cache/huggingface/datasets/indonlp___nusatranslation_mt/nusatranslation_mt_min_ind_nusantara_t2t/1.0.0/7bc55ca0431a2e37367c37eaadf3eb41305c25f22713b09932865f27d6b4c005)


  0%|          | 0/3 [00:00<?, ?it/s]

Found cached dataset nusa_x-senti (/home/samuel/.cache/huggingface/datasets/indonlp___nusa_x-senti/bbc/1.0.0/3477a395c5c7a09a74d897ceb96ebd2c3afbd1d7fad0c11d8b8026b8b08e3af5)


  0%|          | 0/3 [00:00<?, ?it/s]

Found cached dataset nusa_x-senti (/home/samuel/.cache/huggingface/datasets/indonlp___nusa_x-senti/sun/1.0.0/3477a395c5c7a09a74d897ceb96ebd2c3afbd1d7fad0c11d8b8026b8b08e3af5)


  0%|          | 0/3 [00:00<?, ?it/s]

Found cached dataset nusa_x-senti (/home/samuel/.cache/huggingface/datasets/indonlp___nusa_x-senti/jav/1.0.0/3477a395c5c7a09a74d897ceb96ebd2c3afbd1d7fad0c11d8b8026b8b08e3af5)


  0%|          | 0/3 [00:00<?, ?it/s]

Found cached dataset nusa_x-senti (/home/samuel/.cache/huggingface/datasets/indonlp___nusa_x-senti/mad/1.0.0/3477a395c5c7a09a74d897ceb96ebd2c3afbd1d7fad0c11d8b8026b8b08e3af5)


  0%|          | 0/3 [00:00<?, ?it/s]

Found cached dataset nusa_x-senti (/home/samuel/.cache/huggingface/datasets/indonlp___nusa_x-senti/bug/1.0.0/3477a395c5c7a09a74d897ceb96ebd2c3afbd1d7fad0c11d8b8026b8b08e3af5)


  0%|          | 0/3 [00:00<?, ?it/s]

Found cached dataset nusa_x-senti (/home/samuel/.cache/huggingface/datasets/indonlp___nusa_x-senti/min/1.0.0/3477a395c5c7a09a74d897ceb96ebd2c3afbd1d7fad0c11d8b8026b8b08e3af5)


  0%|          | 0/3 [00:00<?, ?it/s]

Found cached dataset nusa_x-mt (/home/samuel/.cache/huggingface/datasets/indonlp___nusa_x-mt/bbc-ind/1.0.0/875114470acfddba36384fe519329688cdf068ecbffed136c7d2a4c63f8e60d0)


  0%|          | 0/3 [00:00<?, ?it/s]

Found cached dataset nusa_x-mt (/home/samuel/.cache/huggingface/datasets/indonlp___nusa_x-mt/sun-ind/1.0.0/875114470acfddba36384fe519329688cdf068ecbffed136c7d2a4c63f8e60d0)


  0%|          | 0/3 [00:00<?, ?it/s]

Found cached dataset nusa_x-mt (/home/samuel/.cache/huggingface/datasets/indonlp___nusa_x-mt/jav-ind/1.0.0/875114470acfddba36384fe519329688cdf068ecbffed136c7d2a4c63f8e60d0)


  0%|          | 0/3 [00:00<?, ?it/s]

Found cached dataset nusa_x-mt (/home/samuel/.cache/huggingface/datasets/indonlp___nusa_x-mt/mad-ind/1.0.0/875114470acfddba36384fe519329688cdf068ecbffed136c7d2a4c63f8e60d0)


  0%|          | 0/3 [00:00<?, ?it/s]

Found cached dataset nusa_x-mt (/home/samuel/.cache/huggingface/datasets/indonlp___nusa_x-mt/bug-ind/1.0.0/875114470acfddba36384fe519329688cdf068ecbffed136c7d2a4c63f8e60d0)


  0%|          | 0/3 [00:00<?, ?it/s]

Found cached dataset nusa_x-mt (/home/samuel/.cache/huggingface/datasets/indonlp___nusa_x-mt/min-ind/1.0.0/875114470acfddba36384fe519329688cdf068ecbffed136c7d2a4c63f8e60d0)


  0%|          | 0/3 [00:00<?, ?it/s]

Found cached dataset nusa_x-mt (/home/samuel/.cache/huggingface/datasets/indonlp___nusa_x-mt/bbc-eng/1.0.0/875114470acfddba36384fe519329688cdf068ecbffed136c7d2a4c63f8e60d0)


  0%|          | 0/3 [00:00<?, ?it/s]

Found cached dataset nusa_x-mt (/home/samuel/.cache/huggingface/datasets/indonlp___nusa_x-mt/sun-eng/1.0.0/875114470acfddba36384fe519329688cdf068ecbffed136c7d2a4c63f8e60d0)


  0%|          | 0/3 [00:00<?, ?it/s]

Found cached dataset nusa_x-mt (/home/samuel/.cache/huggingface/datasets/indonlp___nusa_x-mt/jav-eng/1.0.0/875114470acfddba36384fe519329688cdf068ecbffed136c7d2a4c63f8e60d0)


  0%|          | 0/3 [00:00<?, ?it/s]

Found cached dataset nusa_x-mt (/home/samuel/.cache/huggingface/datasets/indonlp___nusa_x-mt/mad-eng/1.0.0/875114470acfddba36384fe519329688cdf068ecbffed136c7d2a4c63f8e60d0)


  0%|          | 0/3 [00:00<?, ?it/s]

Found cached dataset nusa_x-mt (/home/samuel/.cache/huggingface/datasets/indonlp___nusa_x-mt/bug-eng/1.0.0/875114470acfddba36384fe519329688cdf068ecbffed136c7d2a4c63f8e60d0)


  0%|          | 0/3 [00:00<?, ?it/s]

Found cached dataset nusa_x-mt (/home/samuel/.cache/huggingface/datasets/indonlp___nusa_x-mt/min-eng/1.0.0/875114470acfddba36384fe519329688cdf068ecbffed136c7d2a4c63f8e60d0)


  0%|          | 0/3 [00:00<?, ?it/s]

Found cached dataset masakhanews (/home/samuel/.cache/huggingface/datasets/masakhane___masakhanews/amh/1.0.0/c60334474cec1d13c0c3018e138cd8b60bc6443fa5e25258d65d990b2498403f)


  0%|          | 0/3 [00:00<?, ?it/s]

Found cached dataset masakhanews (/home/samuel/.cache/huggingface/datasets/masakhane___masakhanews/hau/1.0.0/c60334474cec1d13c0c3018e138cd8b60bc6443fa5e25258d65d990b2498403f)


  0%|          | 0/3 [00:00<?, ?it/s]

Found cached dataset masakhanews (/home/samuel/.cache/huggingface/datasets/masakhane___masakhanews/ibo/1.0.0/c60334474cec1d13c0c3018e138cd8b60bc6443fa5e25258d65d990b2498403f)


  0%|          | 0/3 [00:00<?, ?it/s]

Found cached dataset masakhanews (/home/samuel/.cache/huggingface/datasets/masakhane___masakhanews/lug/1.0.0/c60334474cec1d13c0c3018e138cd8b60bc6443fa5e25258d65d990b2498403f)


  0%|          | 0/3 [00:00<?, ?it/s]

Found cached dataset masakhanews (/home/samuel/.cache/huggingface/datasets/masakhane___masakhanews/pcm/1.0.0/c60334474cec1d13c0c3018e138cd8b60bc6443fa5e25258d65d990b2498403f)


  0%|          | 0/3 [00:00<?, ?it/s]

Found cached dataset masakhanews (/home/samuel/.cache/huggingface/datasets/masakhane___masakhanews/sna/1.0.0/c60334474cec1d13c0c3018e138cd8b60bc6443fa5e25258d65d990b2498403f)


  0%|          | 0/3 [00:00<?, ?it/s]

Found cached dataset masakhanews (/home/samuel/.cache/huggingface/datasets/masakhane___masakhanews/swa/1.0.0/c60334474cec1d13c0c3018e138cd8b60bc6443fa5e25258d65d990b2498403f)


  0%|          | 0/3 [00:00<?, ?it/s]

Found cached dataset masakhanews (/home/samuel/.cache/huggingface/datasets/masakhane___masakhanews/xho/1.0.0/c60334474cec1d13c0c3018e138cd8b60bc6443fa5e25258d65d990b2498403f)


  0%|          | 0/3 [00:00<?, ?it/s]

Found cached dataset masakhanews (/home/samuel/.cache/huggingface/datasets/masakhane___masakhanews/yor/1.0.0/c60334474cec1d13c0c3018e138cd8b60bc6443fa5e25258d65d990b2498403f)


  0%|          | 0/3 [00:00<?, ?it/s]

Found cached dataset mafand (/home/samuel/.cache/huggingface/datasets/masakhane___mafand/en-amh/1.0.0/9d9e4635f1deeddd2d72e959a6bcd39d9450d019c262e4c5d9a3d6f24387e937)


  0%|          | 0/2 [00:00<?, ?it/s]

Found cached dataset mafand (/home/samuel/.cache/huggingface/datasets/masakhane___mafand/en-hau/1.0.0/9d9e4635f1deeddd2d72e959a6bcd39d9450d019c262e4c5d9a3d6f24387e937)


  0%|          | 0/3 [00:00<?, ?it/s]

Found cached dataset mafand (/home/samuel/.cache/huggingface/datasets/masakhane___mafand/en-ibo/1.0.0/9d9e4635f1deeddd2d72e959a6bcd39d9450d019c262e4c5d9a3d6f24387e937)


  0%|          | 0/3 [00:00<?, ?it/s]

Found cached dataset mafand (/home/samuel/.cache/huggingface/datasets/masakhane___mafand/en-lug/1.0.0/9d9e4635f1deeddd2d72e959a6bcd39d9450d019c262e4c5d9a3d6f24387e937)


  0%|          | 0/3 [00:00<?, ?it/s]

Found cached dataset mafand (/home/samuel/.cache/huggingface/datasets/masakhane___mafand/en-pcm/1.0.0/9d9e4635f1deeddd2d72e959a6bcd39d9450d019c262e4c5d9a3d6f24387e937)


  0%|          | 0/3 [00:00<?, ?it/s]

Found cached dataset mafand (/home/samuel/.cache/huggingface/datasets/masakhane___mafand/en-sna/1.0.0/9d9e4635f1deeddd2d72e959a6bcd39d9450d019c262e4c5d9a3d6f24387e937)


  0%|          | 0/2 [00:00<?, ?it/s]

Found cached dataset mafand (/home/samuel/.cache/huggingface/datasets/masakhane___mafand/en-swa/1.0.0/9d9e4635f1deeddd2d72e959a6bcd39d9450d019c262e4c5d9a3d6f24387e937)


  0%|          | 0/3 [00:00<?, ?it/s]

Found cached dataset mafand (/home/samuel/.cache/huggingface/datasets/masakhane___mafand/en-xho/1.0.0/9d9e4635f1deeddd2d72e959a6bcd39d9450d019c262e4c5d9a3d6f24387e937)


  0%|          | 0/2 [00:00<?, ?it/s]

Found cached dataset mafand (/home/samuel/.cache/huggingface/datasets/masakhane___mafand/en-yor/1.0.0/9d9e4635f1deeddd2d72e959a6bcd39d9450d019c262e4c5d9a3d6f24387e937)


  0%|          | 0/3 [00:00<?, ?it/s]

Found cached dataset americas_nli (/home/samuel/.cache/huggingface/datasets/americas_nli/aym/1.0.0/2f4cd00b49c982fa8827cb332b2a2dbb692aa8a7cd34432fc476efeb7fee375a)


  0%|          | 0/2 [00:00<?, ?it/s]

Found cached dataset americas_nli (/home/samuel/.cache/huggingface/datasets/americas_nli/bzd/1.0.0/2f4cd00b49c982fa8827cb332b2a2dbb692aa8a7cd34432fc476efeb7fee375a)


  0%|          | 0/2 [00:00<?, ?it/s]

Found cached dataset americas_nli (/home/samuel/.cache/huggingface/datasets/americas_nli/cni/1.0.0/2f4cd00b49c982fa8827cb332b2a2dbb692aa8a7cd34432fc476efeb7fee375a)


  0%|          | 0/2 [00:00<?, ?it/s]

Found cached dataset americas_nli (/home/samuel/.cache/huggingface/datasets/americas_nli/gn/1.0.0/2f4cd00b49c982fa8827cb332b2a2dbb692aa8a7cd34432fc476efeb7fee375a)


  0%|          | 0/2 [00:00<?, ?it/s]

Found cached dataset americas_nli (/home/samuel/.cache/huggingface/datasets/americas_nli/hch/1.0.0/2f4cd00b49c982fa8827cb332b2a2dbb692aa8a7cd34432fc476efeb7fee375a)


  0%|          | 0/2 [00:00<?, ?it/s]

Found cached dataset americas_nli (/home/samuel/.cache/huggingface/datasets/americas_nli/nah/1.0.0/2f4cd00b49c982fa8827cb332b2a2dbb692aa8a7cd34432fc476efeb7fee375a)


  0%|          | 0/2 [00:00<?, ?it/s]

Found cached dataset americas_nli (/home/samuel/.cache/huggingface/datasets/americas_nli/oto/1.0.0/2f4cd00b49c982fa8827cb332b2a2dbb692aa8a7cd34432fc476efeb7fee375a)


  0%|          | 0/2 [00:00<?, ?it/s]

Found cached dataset americas_nli (/home/samuel/.cache/huggingface/datasets/americas_nli/quy/1.0.0/2f4cd00b49c982fa8827cb332b2a2dbb692aa8a7cd34432fc476efeb7fee375a)


  0%|          | 0/2 [00:00<?, ?it/s]

Found cached dataset americas_nli (/home/samuel/.cache/huggingface/datasets/americas_nli/shp/1.0.0/2f4cd00b49c982fa8827cb332b2a2dbb692aa8a7cd34432fc476efeb7fee375a)


  0%|          | 0/2 [00:00<?, ?it/s]

Found cached dataset americas_nli (/home/samuel/.cache/huggingface/datasets/americas_nli/tar/1.0.0/2f4cd00b49c982fa8827cb332b2a2dbb692aa8a7cd34432fc476efeb7fee375a)


  0%|          | 0/2 [00:00<?, ?it/s]

Found cached dataset xnli (/home/samuel/.cache/huggingface/datasets/xnli/es/1.1.0/818164464f9c9fd15776ca8a00423b074344c3e929d00a2c1a84aa5a50c928bd)


  0%|          | 0/3 [00:00<?, ?it/s]

# Standardize Dataset

### NLU Dataset

In [3]:
####
# Single-Sentence Classification [text, label]
# Pair-Sentence Classification [text_1, text_2, label]
####
# Only the 'test' split of each corpus is exported by this cell.
save_path = './'

# Process NusaTranslation Senti: drop the 'id' column.
nt_senti_dset_clean = datasets.DatasetDict({
    lang: splits['test'].remove_columns(['id'])
    for lang, splits in nt_senti_dset.items()
})

# Process NusaX Senti: drop the 'id' and 'lang' columns.
nusax_senti_dset_clean = datasets.DatasetDict({
    lang: splits['test'].remove_columns(['id', 'lang'])
    for lang, splits in nusax_senti_dset.items()
})

# Process MasakhaNews: drop the original 'text', 'headline_text' and 'url'
# columns, then expose the headline under the standard 'text' column.
masakhanews_dset_clean = datasets.DatasetDict({
    lang: (
        splits['test']
        .remove_columns(['text', 'headline_text', 'url'])
        .rename_columns({'headline': 'text'})
    )
    for lang, splits in masakhanews_dset.items()
})

# Process AmericasNLI: rename the sentence pair to the standard pair columns.
americasnli_dset_clean = datasets.DatasetDict({
    lang: splits['test'].rename_columns({'premise': 'text_1', 'hypothesis': 'text_2'})
    for lang, splits in americasnli_dset.items()
})

# XNLI (es) standardization is currently disabled:
# xnli_es_dset_clean = xnli_es_dset['test'].rename_columns({'premise': 'text_1', 'hypothesis': 'text_2'})

# Save every standardized corpus under `save_path`. os.path.join avoids the
# doubled separator that f'{save_path}/...' produces when save_path ends in '/'.
for dir_name, dset_clean in [
    ('nt_senti_dset_clean', nt_senti_dset_clean),
    ('nusax_senti_dset_clean', nusax_senti_dset_clean),
    ('masakhanews_dset_clean', masakhanews_dset_clean),
    ('americasnli_dset_clean', americasnli_dset_clean),
]:
    dset_clean.save_to_disk(os.path.join(save_path, dir_name))

Saving the dataset (0/1 shards):   0%|          | 0/1200 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1200 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1200 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1200 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1200 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1200 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/400 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/400 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/400 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/400 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/400 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/400 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/376 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/637 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/390 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/223 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/305 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/369 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/476 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/297 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/411 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/750 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/750 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/750 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/750 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/750 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/738 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/748 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/750 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/750 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/750 [00:00<?, ? examples/s]

### MT Dataset

In [4]:
####
# MT [text_1, text_2]
####
save_path = './'


def _merge_all_splits(dset_by_lang, drop_columns):
    """Concatenate every split of each language into one dataset.

    Args:
        dset_by_lang: mapping from language key to a mapping of split name to dataset.
        drop_columns: column names removed from every split before concatenation.

    Returns:
        A `datasets.DatasetDict` keyed by language, one merged dataset per language.
    """
    merged = {}
    for lang, splits in dset_by_lang.items():
        merged[lang] = datasets.concatenate_datasets([
            splits[split].remove_columns(drop_columns) for split in splits.keys()
        ])
    return datasets.DatasetDict(merged)


# Process NusaTranslation MT
nt_mt_dset_clean = _merge_all_splits(nt_mt_dset, ['id', 'text_1_name', 'text_2_name'])

# Process NusaX MT ind
nusax_mt_ind_dset_clean = _merge_all_splits(nusax_mt_ind_dset, ['id', 'text_1_lang', 'text_2_lang'])

# Process NusaX MT eng
nusax_mt_eng_dset_clean = _merge_all_splits(nusax_mt_eng_dset, ['id', 'text_1_lang', 'text_2_lang'])

# Process MAFAND: flatten the nested 'translation' record so that text_1 holds
# the side stored under the language key and text_2 the side stored under 'en'.
mafand_by_lang = {}
for lang, splits in mafand_dset.items():
    text_1, text_2 = [], []
    for split in splits.keys():
        # Iterate each split once instead of indexing dataset[i] twice per row.
        for example in splits[split]:
            pair = example['translation']
            text_1.append(pair[lang])
            text_2.append(pair['en'])
    mafand_by_lang[lang] = datasets.Dataset.from_dict({'text_1': text_1, 'text_2': text_2})
mafand_dset_clean = datasets.DatasetDict(mafand_by_lang)

# Save all datasets. os.path.join avoids the doubled separator that
# f'{save_path}/...' produces when save_path ends in '/'.
for dir_name, dset_clean in [
    ('nt_mt_dset_clean', nt_mt_dset_clean),
    ('nusax_mt_ind_dset_clean', nusax_mt_ind_dset_clean),
    ('nusax_mt_eng_dset_clean', nusax_mt_eng_dset_clean),
    ('mafand_dset_clean', mafand_dset_clean),
]:
    dset_clean.save_to_disk(os.path.join(save_path, dir_name))

Saving the dataset (0/1 shards):   0%|          | 0/9449 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/9449 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/9449 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/9449 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/9449 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/9449 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1936 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/8665 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/9998 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/7075 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/7838 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1561 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/34408 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1488 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/9746 [00:00<?, ? examples/s]

### Merge Dataset

In [5]:
####
# Combined [text, text_1, text_2, label]
####
def _combine_senti_with_mt(senti_by_lang, mt_clean_by_lang):
    """Attach the cleaned MT columns to the sentiment rows of each language.

    For every language, all sentiment splits are concatenated row-wise (after
    dropping 'id' and 'lang') and then joined column-wise (axis=1) with the
    already-merged MT dataset of the same language.

    NOTE(review): the column-wise join pairs rows purely by position, so it
    assumes the sentiment splits and the MT splits list the same sentences in
    the same order -- confirm before relying on the combined rows.

    Args:
        senti_by_lang: mapping from language key to a mapping of split name to dataset.
        mt_clean_by_lang: mapping from language key to the merged MT dataset.

    Returns:
        A `datasets.DatasetDict` keyed by language.
    """
    combined = {}
    for lang, senti_splits in senti_by_lang.items():
        senti_all = datasets.concatenate_datasets([
            senti_splits[split].remove_columns(['id', 'lang'])
            for split in senti_splits.keys()
        ])
        combined[lang] = datasets.concatenate_datasets(
            [senti_all, mt_clean_by_lang[lang]], axis=1
        )
    return datasets.DatasetDict(combined)


# Process NusaX Combined ind
nusax_combined_ind_dset = _combine_senti_with_mt(nusax_senti_dset, nusax_mt_ind_dset_clean)

# Process NusaX Combined eng
nusax_combined_eng_dset = _combine_senti_with_mt(nusax_senti_dset, nusax_mt_eng_dset_clean)

# `save_path` is the output directory defined in the standardization cells.
nusax_combined_ind_dset.save_to_disk(os.path.join(save_path, 'nusax_combined_ind_dset'))
nusax_combined_eng_dset.save_to_disk(os.path.join(save_path, 'nusax_combined_eng_dset'))

Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

In [None]:
####
# Process AmericasNLI - XNLI Combined
####
from sentence_transformers import SentenceTransformer
sbert = SentenceTransformer('sentence-transformers/stsb-xlm-r-multilingual')

# Use the first GPU when one is available; otherwise fall back to CPU so the
# cell still runs on machines without CUDA.
encode_device = 'cuda:0' if torch.cuda.is_available() else 'cpu'


def _pair_sentences(dset):
    """Return one "<premise>. <hypothesis>" string per row of `dset`."""
    return [
        f"{premise}. {hypothesis}"
        for premise, hypothesis in zip(dset['premise'], dset['hypothesis'])
    ]


# Encode Americas NLI
# NOTE(review): this embeds the 'validation' split, whereas the standardized
# AmericasNLI export uses 'test' -- confirm this is intended.
anli_emb_map = {}
for key in americasnli_dset.keys():
    sents = _pair_sentences(americasnli_dset[key]['validation'])
    anli_emb_map[key] = sbert.encode(sents, batch_size=256, device=encode_device, show_progress_bar=True, convert_to_tensor=True)

# Encode XNLI: every split is embedded, then stacked into a single tensor.
xnli_embs = []
for split in xnli_es_dset.keys():
    sents = _pair_sentences(xnli_es_dset[split])
    xnli_embs.append(sbert.encode(sents, batch_size=256, device=encode_device, show_progress_bar=True, convert_to_tensor=True))
xnli_embs = torch.cat(xnli_embs)