In [1]:
import os, sys
os.environ['CUDA_VISIBLE_DEVICES'] = '2'
import random
import pandas as pd
import numpy as np
import torch
import datasets
from datasets import load_dataset
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

In [None]:
# Download XNLI Data
!wget https://dl.fbaipublicfiles.com/XNLI/XNLI-1.0.zip
!unzip XNLI-1.0.zip

# Load Dataset

In [2]:
# Tweet Sentiment Multilingual
tweet_senti_multi_dset = {
	"eng": load_dataset("cardiffnlp/tweet_sentiment_multilingual", name="english"),
	"arb": load_dataset("cardiffnlp/tweet_sentiment_multilingual", name="arabic"),
	"fra": load_dataset("cardiffnlp/tweet_sentiment_multilingual", name="french"),
	"deu": load_dataset("cardiffnlp/tweet_sentiment_multilingual", name="german"),
	"hin": load_dataset("cardiffnlp/tweet_sentiment_multilingual", name="hindi"),
	"ita": load_dataset("cardiffnlp/tweet_sentiment_multilingual", name="italian"),
	"por": load_dataset("cardiffnlp/tweet_sentiment_multilingual", name="portuguese"),
	"spa": load_dataset("cardiffnlp/tweet_sentiment_multilingual", name="spanish"),
}

# NusaTranslation Senti
nt_senti_dset = {
	"btk": load_dataset("indonlp/nusatranslation_senti", name="nusatranslation_senti_btk_nusantara_text"),
	"sun": load_dataset("indonlp/nusatranslation_senti", name="nusatranslation_senti_sun_nusantara_text"),
	"jav": load_dataset("indonlp/nusatranslation_senti", name="nusatranslation_senti_jav_nusantara_text"),
	"mad": load_dataset("indonlp/nusatranslation_senti", name="nusatranslation_senti_mad_nusantara_text"),
	"mak": load_dataset("indonlp/nusatranslation_senti", name="nusatranslation_senti_mak_nusantara_text"),
	"min": load_dataset("indonlp/nusatranslation_senti", name="nusatranslation_senti_min_nusantara_text"),
}

# NusaTranslation MT
nt_mt_dset = {
	"btk": load_dataset("indonlp/nusatranslation_mt", name="nusatranslation_mt_btk_ind_nusantara_t2t"),
	"sun": load_dataset("indonlp/nusatranslation_mt", name="nusatranslation_mt_sun_ind_nusantara_t2t"),
	"jav": load_dataset("indonlp/nusatranslation_mt", name="nusatranslation_mt_jav_ind_nusantara_t2t"),
	"mad": load_dataset("indonlp/nusatranslation_mt", name="nusatranslation_mt_mad_ind_nusantara_t2t"),
	"mak": load_dataset("indonlp/nusatranslation_mt", name="nusatranslation_mt_mak_ind_nusantara_t2t"),
	"min": load_dataset("indonlp/nusatranslation_mt", name="nusatranslation_mt_min_ind_nusantara_t2t"),
}

# NusaX Senti
nusax_senti_dset = {
	"btk": load_dataset("indonlp/NusaX-senti", name="bbc"),
	"sun": load_dataset("indonlp/NusaX-senti", name="sun"),
	"jav": load_dataset("indonlp/NusaX-senti", name="jav"),
	"mad": load_dataset("indonlp/NusaX-senti", name="mad"),
	"mak": load_dataset("indonlp/NusaX-senti", name="bug"),
	"min": load_dataset("indonlp/NusaX-senti", name="min"),
	"ind": load_dataset("indonlp/NusaX-senti", name="ind"), # For X-ICL
	"eng": load_dataset("indonlp/NusaX-senti", name="eng"), # For X-ICL
}

# NusaX MT ind
nusax_mt_ind_dset = {
	"btk": load_dataset("indonlp/NusaX-MT", name="bbc-ind"), # For IIA
	"sun": load_dataset("indonlp/NusaX-MT", name="sun-ind"), # For IIA
	"jav": load_dataset("indonlp/NusaX-MT", name="jav-ind"), # For IIA
	"mad": load_dataset("indonlp/NusaX-MT", name="mad-ind"), # For IIA
	"mak": load_dataset("indonlp/NusaX-MT", name="bug-ind"), # For IIA
	"min": load_dataset("indonlp/NusaX-MT", name="min-ind"), # For IIA
}

# NusaX MT eng (Extended experiment)
nusax_mt_eng_dset = {
	"btk": load_dataset("indonlp/NusaX-MT", name="bbc-eng"), # For IIA
	"sun": load_dataset("indonlp/NusaX-MT", name="sun-eng"), # For IIA
	"jav": load_dataset("indonlp/NusaX-MT", name="jav-eng"), # For IIA
	"mad": load_dataset("indonlp/NusaX-MT", name="mad-eng"), # For IIA
	"mak": load_dataset("indonlp/NusaX-MT", name="bug-eng"), # For IIA
	"min": load_dataset("indonlp/NusaX-MT", name="min-eng"), # For IIA
}

# MasakhaNews
masakhanews_dset = {
	"amh": load_dataset("masakhane/masakhanews", name="amh"),
	"hau": load_dataset("masakhane/masakhanews", name="hau"),
	"ibo": load_dataset("masakhane/masakhanews", name="ibo"),
	"lug": load_dataset("masakhane/masakhanews", name="lug"),
	"pcm": load_dataset("masakhane/masakhanews", name="pcm"),
	"sna": load_dataset("masakhane/masakhanews", name="sna"),
	"swa": load_dataset("masakhane/masakhanews", name="swa"),
	"xho": load_dataset("masakhane/masakhanews", name="xho"),
	"yor": load_dataset("masakhane/masakhanews", name="yor"),
	"eng": load_dataset("masakhane/masakhanews", name="eng"), # For X-ICL
}

# MAFAND
mafand_dset = {
	"amh": load_dataset("masakhane/mafand", name="en-amh"), # For IIA
	"hau": load_dataset("masakhane/mafand", name="en-hau"), # For IIA
	"ibo": load_dataset("masakhane/mafand", name="en-ibo"), # For IIA
	"lug": load_dataset("masakhane/mafand", name="en-lug"), # For IIA
	"pcm": load_dataset("masakhane/mafand", name="en-pcm"), # For IIA
	"sna": load_dataset("masakhane/mafand", name="en-sna"), # For IIA
	"swa": load_dataset("masakhane/mafand", name="en-swa"), # For IIA
	"xho": load_dataset("masakhane/mafand", name="en-xho"), # For IIA
	"yor": load_dataset("masakhane/mafand", name="en-yor"), # For IIA
}

# AmericasNLI
americasnli_dset = {
	"aym": load_dataset("americas_nli", name="aym"),
	"bzd": load_dataset("americas_nli", name="bzd"),
	"cni": load_dataset("americas_nli", name="cni"),
	"gn": load_dataset("americas_nli", name="gn"),
	"hch": load_dataset("americas_nli", name="hch"),
	"nah": load_dataset("americas_nli", name="nah"),
	"oto": load_dataset("americas_nli", name="oto"),
	"quy": load_dataset("americas_nli", name="quy"),
	"shp": load_dataset("americas_nli", name="shp"),
	"tar": load_dataset("americas_nli", name="tar"),
}

# XNLI
xnli_spa_dset = load_dataset("xnli", name="es") # For X-ICL
xnli_eng_dset = load_dataset("xnli", name="en") # For X-ICL

Found cached dataset tweet_sentiment_multilingual (/home/samuel/.cache/huggingface/datasets/cardiffnlp___tweet_sentiment_multilingual/english/0.1.0/936afd3cde120393429606f681b3b48d526873c45114068973f71e296ce80605)


  0%|          | 0/3 [00:00<?, ?it/s]

Found cached dataset tweet_sentiment_multilingual (/home/samuel/.cache/huggingface/datasets/cardiffnlp___tweet_sentiment_multilingual/arabic/0.1.0/936afd3cde120393429606f681b3b48d526873c45114068973f71e296ce80605)


  0%|          | 0/3 [00:00<?, ?it/s]

Found cached dataset tweet_sentiment_multilingual (/home/samuel/.cache/huggingface/datasets/cardiffnlp___tweet_sentiment_multilingual/french/0.1.0/936afd3cde120393429606f681b3b48d526873c45114068973f71e296ce80605)


  0%|          | 0/3 [00:00<?, ?it/s]

Found cached dataset tweet_sentiment_multilingual (/home/samuel/.cache/huggingface/datasets/cardiffnlp___tweet_sentiment_multilingual/german/0.1.0/936afd3cde120393429606f681b3b48d526873c45114068973f71e296ce80605)


  0%|          | 0/3 [00:00<?, ?it/s]

Found cached dataset tweet_sentiment_multilingual (/home/samuel/.cache/huggingface/datasets/cardiffnlp___tweet_sentiment_multilingual/hindi/0.1.0/936afd3cde120393429606f681b3b48d526873c45114068973f71e296ce80605)


  0%|          | 0/3 [00:00<?, ?it/s]

Found cached dataset tweet_sentiment_multilingual (/home/samuel/.cache/huggingface/datasets/cardiffnlp___tweet_sentiment_multilingual/italian/0.1.0/936afd3cde120393429606f681b3b48d526873c45114068973f71e296ce80605)


  0%|          | 0/3 [00:00<?, ?it/s]

Found cached dataset tweet_sentiment_multilingual (/home/samuel/.cache/huggingface/datasets/cardiffnlp___tweet_sentiment_multilingual/portuguese/0.1.0/936afd3cde120393429606f681b3b48d526873c45114068973f71e296ce80605)


  0%|          | 0/3 [00:00<?, ?it/s]

Found cached dataset tweet_sentiment_multilingual (/home/samuel/.cache/huggingface/datasets/cardiffnlp___tweet_sentiment_multilingual/spanish/0.1.0/936afd3cde120393429606f681b3b48d526873c45114068973f71e296ce80605)


  0%|          | 0/3 [00:00<?, ?it/s]

Found cached dataset nusatranslation_senti (/home/samuel/.cache/huggingface/datasets/indonlp___nusatranslation_senti/nusatranslation_senti_btk_nusantara_text/1.0.0/84380b9fe1509e8d8d8c7c3c6d475f39181456c63a2fad48e88ceb95a642c952)


  0%|          | 0/3 [00:00<?, ?it/s]

Found cached dataset nusatranslation_senti (/home/samuel/.cache/huggingface/datasets/indonlp___nusatranslation_senti/nusatranslation_senti_sun_nusantara_text/1.0.0/84380b9fe1509e8d8d8c7c3c6d475f39181456c63a2fad48e88ceb95a642c952)


  0%|          | 0/3 [00:00<?, ?it/s]

Found cached dataset nusatranslation_senti (/home/samuel/.cache/huggingface/datasets/indonlp___nusatranslation_senti/nusatranslation_senti_jav_nusantara_text/1.0.0/84380b9fe1509e8d8d8c7c3c6d475f39181456c63a2fad48e88ceb95a642c952)


  0%|          | 0/3 [00:00<?, ?it/s]

Found cached dataset nusatranslation_senti (/home/samuel/.cache/huggingface/datasets/indonlp___nusatranslation_senti/nusatranslation_senti_mad_nusantara_text/1.0.0/84380b9fe1509e8d8d8c7c3c6d475f39181456c63a2fad48e88ceb95a642c952)


  0%|          | 0/3 [00:00<?, ?it/s]

Found cached dataset nusatranslation_senti (/home/samuel/.cache/huggingface/datasets/indonlp___nusatranslation_senti/nusatranslation_senti_mak_nusantara_text/1.0.0/84380b9fe1509e8d8d8c7c3c6d475f39181456c63a2fad48e88ceb95a642c952)


  0%|          | 0/3 [00:00<?, ?it/s]

Found cached dataset nusatranslation_senti (/home/samuel/.cache/huggingface/datasets/indonlp___nusatranslation_senti/nusatranslation_senti_min_nusantara_text/1.0.0/84380b9fe1509e8d8d8c7c3c6d475f39181456c63a2fad48e88ceb95a642c952)


  0%|          | 0/3 [00:00<?, ?it/s]

Found cached dataset nusatranslation_mt (/home/samuel/.cache/huggingface/datasets/indonlp___nusatranslation_mt/nusatranslation_mt_btk_ind_nusantara_t2t/1.0.0/376e332d835c55ed9bac14d4daafdc3f3676d9be8165bf3fa6176fafcfc8a7c7)


  0%|          | 0/3 [00:00<?, ?it/s]

Found cached dataset nusatranslation_mt (/home/samuel/.cache/huggingface/datasets/indonlp___nusatranslation_mt/nusatranslation_mt_sun_ind_nusantara_t2t/1.0.0/376e332d835c55ed9bac14d4daafdc3f3676d9be8165bf3fa6176fafcfc8a7c7)


  0%|          | 0/3 [00:00<?, ?it/s]

Found cached dataset nusatranslation_mt (/home/samuel/.cache/huggingface/datasets/indonlp___nusatranslation_mt/nusatranslation_mt_jav_ind_nusantara_t2t/1.0.0/376e332d835c55ed9bac14d4daafdc3f3676d9be8165bf3fa6176fafcfc8a7c7)


  0%|          | 0/3 [00:00<?, ?it/s]

Found cached dataset nusatranslation_mt (/home/samuel/.cache/huggingface/datasets/indonlp___nusatranslation_mt/nusatranslation_mt_mad_ind_nusantara_t2t/1.0.0/376e332d835c55ed9bac14d4daafdc3f3676d9be8165bf3fa6176fafcfc8a7c7)


  0%|          | 0/3 [00:00<?, ?it/s]

Found cached dataset nusatranslation_mt (/home/samuel/.cache/huggingface/datasets/indonlp___nusatranslation_mt/nusatranslation_mt_mak_ind_nusantara_t2t/1.0.0/376e332d835c55ed9bac14d4daafdc3f3676d9be8165bf3fa6176fafcfc8a7c7)


  0%|          | 0/3 [00:00<?, ?it/s]

Found cached dataset nusatranslation_mt (/home/samuel/.cache/huggingface/datasets/indonlp___nusatranslation_mt/nusatranslation_mt_min_ind_nusantara_t2t/1.0.0/376e332d835c55ed9bac14d4daafdc3f3676d9be8165bf3fa6176fafcfc8a7c7)


  0%|          | 0/3 [00:00<?, ?it/s]

Found cached dataset nusa_x-senti (/home/samuel/.cache/huggingface/datasets/indonlp___nusa_x-senti/bbc/1.0.0/3477a395c5c7a09a74d897ceb96ebd2c3afbd1d7fad0c11d8b8026b8b08e3af5)


  0%|          | 0/3 [00:00<?, ?it/s]

Found cached dataset nusa_x-senti (/home/samuel/.cache/huggingface/datasets/indonlp___nusa_x-senti/sun/1.0.0/3477a395c5c7a09a74d897ceb96ebd2c3afbd1d7fad0c11d8b8026b8b08e3af5)


  0%|          | 0/3 [00:00<?, ?it/s]

Found cached dataset nusa_x-senti (/home/samuel/.cache/huggingface/datasets/indonlp___nusa_x-senti/jav/1.0.0/3477a395c5c7a09a74d897ceb96ebd2c3afbd1d7fad0c11d8b8026b8b08e3af5)


  0%|          | 0/3 [00:00<?, ?it/s]

Found cached dataset nusa_x-senti (/home/samuel/.cache/huggingface/datasets/indonlp___nusa_x-senti/mad/1.0.0/3477a395c5c7a09a74d897ceb96ebd2c3afbd1d7fad0c11d8b8026b8b08e3af5)


  0%|          | 0/3 [00:00<?, ?it/s]

Found cached dataset nusa_x-senti (/home/samuel/.cache/huggingface/datasets/indonlp___nusa_x-senti/bug/1.0.0/3477a395c5c7a09a74d897ceb96ebd2c3afbd1d7fad0c11d8b8026b8b08e3af5)


  0%|          | 0/3 [00:00<?, ?it/s]

Found cached dataset nusa_x-senti (/home/samuel/.cache/huggingface/datasets/indonlp___nusa_x-senti/min/1.0.0/3477a395c5c7a09a74d897ceb96ebd2c3afbd1d7fad0c11d8b8026b8b08e3af5)


  0%|          | 0/3 [00:00<?, ?it/s]

Found cached dataset nusa_x-senti (/home/samuel/.cache/huggingface/datasets/indonlp___nusa_x-senti/ind/1.0.0/3477a395c5c7a09a74d897ceb96ebd2c3afbd1d7fad0c11d8b8026b8b08e3af5)


  0%|          | 0/3 [00:00<?, ?it/s]

Found cached dataset nusa_x-senti (/home/samuel/.cache/huggingface/datasets/indonlp___nusa_x-senti/eng/1.0.0/3477a395c5c7a09a74d897ceb96ebd2c3afbd1d7fad0c11d8b8026b8b08e3af5)


  0%|          | 0/3 [00:00<?, ?it/s]

Found cached dataset nusa_x-mt (/home/samuel/.cache/huggingface/datasets/indonlp___nusa_x-mt/bbc-ind/1.0.0/875114470acfddba36384fe519329688cdf068ecbffed136c7d2a4c63f8e60d0)


  0%|          | 0/3 [00:00<?, ?it/s]

Found cached dataset nusa_x-mt (/home/samuel/.cache/huggingface/datasets/indonlp___nusa_x-mt/sun-ind/1.0.0/875114470acfddba36384fe519329688cdf068ecbffed136c7d2a4c63f8e60d0)


  0%|          | 0/3 [00:00<?, ?it/s]

Found cached dataset nusa_x-mt (/home/samuel/.cache/huggingface/datasets/indonlp___nusa_x-mt/jav-ind/1.0.0/875114470acfddba36384fe519329688cdf068ecbffed136c7d2a4c63f8e60d0)


  0%|          | 0/3 [00:00<?, ?it/s]

Found cached dataset nusa_x-mt (/home/samuel/.cache/huggingface/datasets/indonlp___nusa_x-mt/mad-ind/1.0.0/875114470acfddba36384fe519329688cdf068ecbffed136c7d2a4c63f8e60d0)


  0%|          | 0/3 [00:00<?, ?it/s]

Found cached dataset nusa_x-mt (/home/samuel/.cache/huggingface/datasets/indonlp___nusa_x-mt/bug-ind/1.0.0/875114470acfddba36384fe519329688cdf068ecbffed136c7d2a4c63f8e60d0)


  0%|          | 0/3 [00:00<?, ?it/s]

Found cached dataset nusa_x-mt (/home/samuel/.cache/huggingface/datasets/indonlp___nusa_x-mt/min-ind/1.0.0/875114470acfddba36384fe519329688cdf068ecbffed136c7d2a4c63f8e60d0)


  0%|          | 0/3 [00:00<?, ?it/s]

Found cached dataset nusa_x-mt (/home/samuel/.cache/huggingface/datasets/indonlp___nusa_x-mt/bbc-eng/1.0.0/875114470acfddba36384fe519329688cdf068ecbffed136c7d2a4c63f8e60d0)


  0%|          | 0/3 [00:00<?, ?it/s]

Found cached dataset nusa_x-mt (/home/samuel/.cache/huggingface/datasets/indonlp___nusa_x-mt/sun-eng/1.0.0/875114470acfddba36384fe519329688cdf068ecbffed136c7d2a4c63f8e60d0)


  0%|          | 0/3 [00:00<?, ?it/s]

Found cached dataset nusa_x-mt (/home/samuel/.cache/huggingface/datasets/indonlp___nusa_x-mt/jav-eng/1.0.0/875114470acfddba36384fe519329688cdf068ecbffed136c7d2a4c63f8e60d0)


  0%|          | 0/3 [00:00<?, ?it/s]

Found cached dataset nusa_x-mt (/home/samuel/.cache/huggingface/datasets/indonlp___nusa_x-mt/mad-eng/1.0.0/875114470acfddba36384fe519329688cdf068ecbffed136c7d2a4c63f8e60d0)


  0%|          | 0/3 [00:00<?, ?it/s]

Found cached dataset nusa_x-mt (/home/samuel/.cache/huggingface/datasets/indonlp___nusa_x-mt/bug-eng/1.0.0/875114470acfddba36384fe519329688cdf068ecbffed136c7d2a4c63f8e60d0)


  0%|          | 0/3 [00:00<?, ?it/s]

Found cached dataset nusa_x-mt (/home/samuel/.cache/huggingface/datasets/indonlp___nusa_x-mt/min-eng/1.0.0/875114470acfddba36384fe519329688cdf068ecbffed136c7d2a4c63f8e60d0)


  0%|          | 0/3 [00:00<?, ?it/s]

Found cached dataset masakhanews (/home/samuel/.cache/huggingface/datasets/masakhane___masakhanews/amh/1.0.0/c60334474cec1d13c0c3018e138cd8b60bc6443fa5e25258d65d990b2498403f)


  0%|          | 0/3 [00:00<?, ?it/s]

Found cached dataset masakhanews (/home/samuel/.cache/huggingface/datasets/masakhane___masakhanews/hau/1.0.0/c60334474cec1d13c0c3018e138cd8b60bc6443fa5e25258d65d990b2498403f)


  0%|          | 0/3 [00:00<?, ?it/s]

Found cached dataset masakhanews (/home/samuel/.cache/huggingface/datasets/masakhane___masakhanews/ibo/1.0.0/c60334474cec1d13c0c3018e138cd8b60bc6443fa5e25258d65d990b2498403f)


  0%|          | 0/3 [00:00<?, ?it/s]

Found cached dataset masakhanews (/home/samuel/.cache/huggingface/datasets/masakhane___masakhanews/lug/1.0.0/c60334474cec1d13c0c3018e138cd8b60bc6443fa5e25258d65d990b2498403f)


  0%|          | 0/3 [00:00<?, ?it/s]

Found cached dataset masakhanews (/home/samuel/.cache/huggingface/datasets/masakhane___masakhanews/pcm/1.0.0/c60334474cec1d13c0c3018e138cd8b60bc6443fa5e25258d65d990b2498403f)


  0%|          | 0/3 [00:00<?, ?it/s]

Found cached dataset masakhanews (/home/samuel/.cache/huggingface/datasets/masakhane___masakhanews/sna/1.0.0/c60334474cec1d13c0c3018e138cd8b60bc6443fa5e25258d65d990b2498403f)


  0%|          | 0/3 [00:00<?, ?it/s]

Found cached dataset masakhanews (/home/samuel/.cache/huggingface/datasets/masakhane___masakhanews/swa/1.0.0/c60334474cec1d13c0c3018e138cd8b60bc6443fa5e25258d65d990b2498403f)


  0%|          | 0/3 [00:00<?, ?it/s]

Found cached dataset masakhanews (/home/samuel/.cache/huggingface/datasets/masakhane___masakhanews/xho/1.0.0/c60334474cec1d13c0c3018e138cd8b60bc6443fa5e25258d65d990b2498403f)


  0%|          | 0/3 [00:00<?, ?it/s]

Found cached dataset masakhanews (/home/samuel/.cache/huggingface/datasets/masakhane___masakhanews/yor/1.0.0/c60334474cec1d13c0c3018e138cd8b60bc6443fa5e25258d65d990b2498403f)


  0%|          | 0/3 [00:00<?, ?it/s]

Found cached dataset masakhanews (/home/samuel/.cache/huggingface/datasets/masakhane___masakhanews/eng/1.0.0/c60334474cec1d13c0c3018e138cd8b60bc6443fa5e25258d65d990b2498403f)


  0%|          | 0/3 [00:00<?, ?it/s]

Found cached dataset mafand (/home/samuel/.cache/huggingface/datasets/masakhane___mafand/en-amh/1.0.0/9d9e4635f1deeddd2d72e959a6bcd39d9450d019c262e4c5d9a3d6f24387e937)


  0%|          | 0/2 [00:00<?, ?it/s]

Found cached dataset mafand (/home/samuel/.cache/huggingface/datasets/masakhane___mafand/en-hau/1.0.0/9d9e4635f1deeddd2d72e959a6bcd39d9450d019c262e4c5d9a3d6f24387e937)


  0%|          | 0/3 [00:00<?, ?it/s]

Found cached dataset mafand (/home/samuel/.cache/huggingface/datasets/masakhane___mafand/en-ibo/1.0.0/9d9e4635f1deeddd2d72e959a6bcd39d9450d019c262e4c5d9a3d6f24387e937)


  0%|          | 0/3 [00:00<?, ?it/s]

Found cached dataset mafand (/home/samuel/.cache/huggingface/datasets/masakhane___mafand/en-lug/1.0.0/9d9e4635f1deeddd2d72e959a6bcd39d9450d019c262e4c5d9a3d6f24387e937)


  0%|          | 0/3 [00:00<?, ?it/s]

Found cached dataset mafand (/home/samuel/.cache/huggingface/datasets/masakhane___mafand/en-pcm/1.0.0/9d9e4635f1deeddd2d72e959a6bcd39d9450d019c262e4c5d9a3d6f24387e937)


  0%|          | 0/3 [00:00<?, ?it/s]

Found cached dataset mafand (/home/samuel/.cache/huggingface/datasets/masakhane___mafand/en-sna/1.0.0/9d9e4635f1deeddd2d72e959a6bcd39d9450d019c262e4c5d9a3d6f24387e937)


  0%|          | 0/2 [00:00<?, ?it/s]

Found cached dataset mafand (/home/samuel/.cache/huggingface/datasets/masakhane___mafand/en-swa/1.0.0/9d9e4635f1deeddd2d72e959a6bcd39d9450d019c262e4c5d9a3d6f24387e937)


  0%|          | 0/3 [00:00<?, ?it/s]

Found cached dataset mafand (/home/samuel/.cache/huggingface/datasets/masakhane___mafand/en-xho/1.0.0/9d9e4635f1deeddd2d72e959a6bcd39d9450d019c262e4c5d9a3d6f24387e937)


  0%|          | 0/2 [00:00<?, ?it/s]

Found cached dataset mafand (/home/samuel/.cache/huggingface/datasets/masakhane___mafand/en-yor/1.0.0/9d9e4635f1deeddd2d72e959a6bcd39d9450d019c262e4c5d9a3d6f24387e937)


  0%|          | 0/3 [00:00<?, ?it/s]

Found cached dataset americas_nli (/home/samuel/.cache/huggingface/datasets/americas_nli/aym/1.0.0/2f4cd00b49c982fa8827cb332b2a2dbb692aa8a7cd34432fc476efeb7fee375a)


  0%|          | 0/2 [00:00<?, ?it/s]

Found cached dataset americas_nli (/home/samuel/.cache/huggingface/datasets/americas_nli/bzd/1.0.0/2f4cd00b49c982fa8827cb332b2a2dbb692aa8a7cd34432fc476efeb7fee375a)


  0%|          | 0/2 [00:00<?, ?it/s]

Found cached dataset americas_nli (/home/samuel/.cache/huggingface/datasets/americas_nli/cni/1.0.0/2f4cd00b49c982fa8827cb332b2a2dbb692aa8a7cd34432fc476efeb7fee375a)


  0%|          | 0/2 [00:00<?, ?it/s]

Found cached dataset americas_nli (/home/samuel/.cache/huggingface/datasets/americas_nli/gn/1.0.0/2f4cd00b49c982fa8827cb332b2a2dbb692aa8a7cd34432fc476efeb7fee375a)


  0%|          | 0/2 [00:00<?, ?it/s]

Found cached dataset americas_nli (/home/samuel/.cache/huggingface/datasets/americas_nli/hch/1.0.0/2f4cd00b49c982fa8827cb332b2a2dbb692aa8a7cd34432fc476efeb7fee375a)


  0%|          | 0/2 [00:00<?, ?it/s]

Found cached dataset americas_nli (/home/samuel/.cache/huggingface/datasets/americas_nli/nah/1.0.0/2f4cd00b49c982fa8827cb332b2a2dbb692aa8a7cd34432fc476efeb7fee375a)


  0%|          | 0/2 [00:00<?, ?it/s]

Found cached dataset americas_nli (/home/samuel/.cache/huggingface/datasets/americas_nli/oto/1.0.0/2f4cd00b49c982fa8827cb332b2a2dbb692aa8a7cd34432fc476efeb7fee375a)


  0%|          | 0/2 [00:00<?, ?it/s]

Found cached dataset americas_nli (/home/samuel/.cache/huggingface/datasets/americas_nli/quy/1.0.0/2f4cd00b49c982fa8827cb332b2a2dbb692aa8a7cd34432fc476efeb7fee375a)


  0%|          | 0/2 [00:00<?, ?it/s]

Found cached dataset americas_nli (/home/samuel/.cache/huggingface/datasets/americas_nli/shp/1.0.0/2f4cd00b49c982fa8827cb332b2a2dbb692aa8a7cd34432fc476efeb7fee375a)


  0%|          | 0/2 [00:00<?, ?it/s]

Found cached dataset americas_nli (/home/samuel/.cache/huggingface/datasets/americas_nli/tar/1.0.0/2f4cd00b49c982fa8827cb332b2a2dbb692aa8a7cd34432fc476efeb7fee375a)


  0%|          | 0/2 [00:00<?, ?it/s]

Found cached dataset xnli (/home/samuel/.cache/huggingface/datasets/xnli/es/1.1.0/818164464f9c9fd15776ca8a00423b074344c3e929d00a2c1a84aa5a50c928bd)


  0%|          | 0/3 [00:00<?, ?it/s]

Found cached dataset xnli (/home/samuel/.cache/huggingface/datasets/xnli/en/1.1.0/818164464f9c9fd15776ca8a00423b074344c3e929d00a2c1a84aa5a50c928bd)


  0%|          | 0/3 [00:00<?, ?it/s]

# Standardize Dataset

### NLU Dataset

In [3]:
save_path = '.'

def label2str(row, dset):
    row['str_label'] = dset.features['label'].int2str(row['label'])
    return row

# Americas NLI
anli_label_map = {
    'entailment': 'vinculación',
    'neutral': 'neutral',
    'contradiction': 'contradicción'
}

nt_label_map = {
    'positive': 'positif',
    'neutral': 'netral',
    'negative': 'negatif',
}

def label_eng2spa(row):
    row['label'] = anli_label_map[row['label']]
    return row

def label_eng2ind(row):
    row['label'] = nt_label_map[row['label']]
    return row

In [4]:
###
# Prepare Evaluation Data
#
# Single-Sentence Classification [text, label]
# NLI Classification [premise, hypothesis, label]
###

# Process Tweet Sentiment Multilingual
tweet_senti_multi_dset_clean = {}
for key in tweet_senti_multi_dset.keys():
    dset = tweet_senti_multi_dset[key]['test']
    tweet_senti_multi_dset_clean[key] = dset
tweet_senti_multi_dset_clean = datasets.DatasetDict(tweet_senti_multi_dset_clean)
tweet_senti_multi_dset_clean = tweet_senti_multi_dset_clean.map(
    label2str, remove_columns=['label'], fn_kwargs={"dset": nusax_senti_dset['jav']['train']}
).rename_columns({'str_label': 'label'})

# Process NusaTranslation Senti
nt_senti_dset_clean = {}
for key in nt_senti_dset.keys():
    dset = nt_senti_dset[key]['test'].remove_columns(['id'])
    nt_senti_dset_clean[key] = dset
nt_senti_dset_clean = datasets.DatasetDict(nt_senti_dset_clean)
nt_senti_dset_clean = nt_senti_dset_clean.map(
    label2str, remove_columns=['label'], fn_kwargs={"dset": nusax_senti_dset['jav']['train']}
).rename_columns({'str_label': 'label'})

# Process MasakhaNews Senti
masakhanews_dset_clean = {}
for key in masakhanews_dset.keys():
    dset = masakhanews_dset[key]['test'].remove_columns(['text', 'headline_text', 'url'])
    dset = dset.rename_columns({'headline': 'text'})
    dset = dset.map(
        label2str, remove_columns=['label'], fn_kwargs={"dset": dset}
    ).rename_columns({'str_label': 'label'})
    masakhanews_dset_clean[key] = dset
masakhanews_dset_clean = datasets.DatasetDict(masakhanews_dset_clean)

# Process AmericasNLI Senti Spa
americasnli_spa_dset_clean = {}
for key in americasnli_dset.keys():
    dset = americasnli_dset[key]['test']
    americasnli_spa_dset_clean[key] = dset
americasnli_spa_dset_clean = datasets.DatasetDict(americasnli_spa_dset_clean)
americasnli_spa_dset_clean = americasnli_spa_dset_clean.map(
    label2str, remove_columns=['label'], fn_kwargs={"dset": americasnli_dset['aym']['test']}
).rename_columns({'str_label': 'label'})

americasnli_spa_dset_clean = americasnli_spa_dset_clean.map(label_eng2spa)

# Process AmericasNLI Senti Eng
americasnli_eng_dset_clean = {}
for key in americasnli_dset.keys():
    dset = americasnli_dset[key]['test']
    americasnli_eng_dset_clean[key] = dset
americasnli_eng_dset_clean = datasets.DatasetDict(americasnli_eng_dset_clean)
americasnli_eng_dset_clean = americasnli_eng_dset_clean.map(
    label2str, remove_columns=['label'], fn_kwargs={"dset": americasnli_dset['aym']['test']}
).rename_columns({'str_label': 'label'})


# Save to Cache
tweet_senti_multi_dset_clean.save_to_disk(f'{save_path}/tweet_senti_multi_test_dset')
nt_senti_dset_clean.save_to_disk(f'{save_path}/nt_senti_test_dset')
masakhanews_dset_clean.save_to_disk(f'{save_path}/masakhanews_test_dset')
americasnli_spa_dset_clean.save_to_disk(f'{save_path}/americasnli_test_spa_dset')
americasnli_eng_dset_clean.save_to_disk(f'{save_path}/americasnli_test_eng_dset')

Loading cached processed dataset at /home/samuel/.cache/huggingface/datasets/cardiffnlp___tweet_sentiment_multilingual/english/0.1.0/936afd3cde120393429606f681b3b48d526873c45114068973f71e296ce80605/cache-235e87bb67b341d6.arrow
Loading cached processed dataset at /home/samuel/.cache/huggingface/datasets/cardiffnlp___tweet_sentiment_multilingual/arabic/0.1.0/936afd3cde120393429606f681b3b48d526873c45114068973f71e296ce80605/cache-293da8dd76976129.arrow
Loading cached processed dataset at /home/samuel/.cache/huggingface/datasets/cardiffnlp___tweet_sentiment_multilingual/french/0.1.0/936afd3cde120393429606f681b3b48d526873c45114068973f71e296ce80605/cache-5d058ffd14da21a6.arrow
Loading cached processed dataset at /home/samuel/.cache/huggingface/datasets/cardiffnlp___tweet_sentiment_multilingual/german/0.1.0/936afd3cde120393429606f681b3b48d526873c45114068973f71e296ce80605/cache-05ca7e6712ccbd42.arrow
Loading cached processed dataset at /home/samuel/.cache/huggingface/datasets/cardiffnlp___tweet

Saving the dataset (0/1 shards):   0%|          | 0/870 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/870 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/870 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/870 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/870 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/870 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/870 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/870 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1200 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1200 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1200 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1200 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1200 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1200 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/376 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/637 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/390 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/223 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/305 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/369 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/476 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/297 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/411 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/948 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/750 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/750 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/750 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/750 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/750 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/738 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/748 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/750 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/750 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/750 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/750 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/750 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/750 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/750 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/750 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/738 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/748 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/750 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/750 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/750 [00:00<?, ? examples/s]

In [5]:
###
# Prepare ICL & X-ICL Data
#
# Single-Sentence Classification [text, label]
# NLI Classification [premise, hypothesis, label]
###

# Process Tweet Sentiment Multilingual
icl_tweet_senti_multi_dset_clean = {}
for key in tweet_senti_multi_dset.keys():
    dset = tweet_senti_multi_dset[key]['train']
    icl_tweet_senti_multi_dset_clean[key] = dset
icl_tweet_senti_multi_dset_clean = datasets.DatasetDict(icl_tweet_senti_multi_dset_clean)
icl_tweet_senti_multi_dset_clean = icl_tweet_senti_multi_dset_clean.map(
    label2str, remove_columns=['label'], fn_kwargs={"dset": nusax_senti_dset['jav']['train']}
).rename_columns({'str_label': 'label'})

# Process NusaTranslation Senti ICL Data => NusaX Senti All Splits
icl_nusax_senti_dset_clean = {}
for key in nusax_senti_dset.keys():
    dset = []
    for split in nusax_senti_dset[key].keys():
        dset.append(nusax_senti_dset[key][split].remove_columns(['id', 'lang']))
    icl_nusax_senti_dset_clean[key] = datasets.concatenate_datasets(dset)
icl_nusax_senti_dset_clean = datasets.DatasetDict(icl_nusax_senti_dset_clean)
icl_nusax_senti_dset_clean = icl_nusax_senti_dset_clean.map(
    label2str, remove_columns=['label'], fn_kwargs={"dset": nusax_senti_dset['jav']['train']}
).rename_columns({'str_label': 'label'})

# Process MasakhaNews ICL Data => MasakhaNews Train & Validation
icl_masakhanews_dset_clean = {}
for key in masakhanews_dset.keys():
    dset = []
    for split in masakhanews_dset[key].keys():
        dset.append(masakhanews_dset[key][split].remove_columns(['text', 'headline_text', 'url']))
    dset = datasets.concatenate_datasets(dset).rename_columns({'headline': 'text'})
    dset = dset.map(
        label2str, remove_columns=['label'], fn_kwargs={"dset": dset}
    ).rename_columns({'str_label': 'label'})
    icl_masakhanews_dset_clean[key] = dset
icl_masakhanews_dset_clean = datasets.DatasetDict(icl_masakhanews_dset_clean)

# Process AmericasNLI ICL Data => ICL: AmericasNLI Validation, X-ICL: XNLI Validation
# Spanish
icl_americasnli_spa_dset_clean = {}
for key in americasnli_dset.keys():
    dset = americasnli_dset[key]['validation']
    icl_americasnli_spa_dset_clean[key] = dset
icl_americasnli_spa_dset_clean['spa'] = xnli_spa_dset['validation'] # Add Spanish data from XNLI
icl_americasnli_spa_dset_clean = datasets.DatasetDict(icl_americasnli_spa_dset_clean)

icl_americasnli_spa_dset_clean = icl_americasnli_spa_dset_clean.map(
    label2str, remove_columns=['label'], fn_kwargs={"dset": americasnli_dset['aym']['test']}
).rename_columns({'str_label': 'label'})
icl_americasnli_spa_dset_clean = icl_americasnli_spa_dset_clean.map(label_eng2spa)

# English
icl_americasnli_eng_dset_clean = {}
for key in americasnli_dset.keys():
    dset = americasnli_dset[key]['validation']
    icl_americasnli_eng_dset_clean[key] = dset
icl_americasnli_eng_dset_clean['eng'] = xnli_eng_dset['validation'] # Add Spanish data from XNLI
icl_americasnli_eng_dset_clean = datasets.DatasetDict(icl_americasnli_eng_dset_clean)

icl_americasnli_eng_dset_clean = icl_americasnli_eng_dset_clean.map(
    label2str, remove_columns=['label'], fn_kwargs={"dset": americasnli_dset['aym']['test']}
).rename_columns({'str_label': 'label'})

# Save to Cache
icl_tweet_senti_multi_dset_clean.save_to_disk(f'{save_path}/icl_tweet_senti_multi_dset')
icl_nusax_senti_dset_clean.save_to_disk(f'{save_path}/icl_nusax_senti_dset')
icl_masakhanews_dset_clean.save_to_disk(f'{save_path}/icl_masakhanews_dset')
icl_americasnli_spa_dset_clean.save_to_disk(f'{save_path}/icl_americasnli_spa_dset')
icl_americasnli_eng_dset_clean.save_to_disk(f'{save_path}/icl_americasnli_eng_dset')

Loading cached processed dataset at /home/samuel/.cache/huggingface/datasets/cardiffnlp___tweet_sentiment_multilingual/english/0.1.0/936afd3cde120393429606f681b3b48d526873c45114068973f71e296ce80605/cache-549f623ffb346bde.arrow
Loading cached processed dataset at /home/samuel/.cache/huggingface/datasets/cardiffnlp___tweet_sentiment_multilingual/arabic/0.1.0/936afd3cde120393429606f681b3b48d526873c45114068973f71e296ce80605/cache-f7867223d0da9838.arrow
Loading cached processed dataset at /home/samuel/.cache/huggingface/datasets/cardiffnlp___tweet_sentiment_multilingual/french/0.1.0/936afd3cde120393429606f681b3b48d526873c45114068973f71e296ce80605/cache-05ef2e3a34870f15.arrow
Loading cached processed dataset at /home/samuel/.cache/huggingface/datasets/cardiffnlp___tweet_sentiment_multilingual/german/0.1.0/936afd3cde120393429606f681b3b48d526873c45114068973f71e296ce80605/cache-6f471d7456abbd2d.arrow
Loading cached processed dataset at /home/samuel/.cache/huggingface/datasets/cardiffnlp___tweet

Saving the dataset (0/1 shards):   0%|          | 0/1839 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1839 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1839 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1839 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1839 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1839 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1839 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1839 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1875 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3173 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1940 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1104 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1517 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1842 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/2371 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1476 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/2050 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/4729 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/743 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/743 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/658 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/743 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/743 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/376 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/222 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/743 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/743 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/743 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/2490 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/743 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/743 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/658 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/743 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/743 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/376 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/222 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/743 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/743 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/743 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/2490 [00:00<?, ? examples/s]

### ITC & IIA Dataset

In [6]:
save_path = '.'

def label2str(row, dset):
    row['str_label'] = dset.features['label'].int2str(row['label'])
    return row

# Tweet Sentiment Multi
bcp47_lang_map = {
    'arb': 'arb_Arab',
    'deu': 'deu_Latn',
    'fra': 'fra_Latn',
    'spa': 'spa_Latn',
    'ita': 'ita_Latn',
    'por': 'por_Latn',
    'hin': 'hin_Deva',
    'eng': 'eng_Latn'
}

model = AutoModelForSeq2SeqLM.from_pretrained("facebook/nllb-200-distilled-1.3B").to('cuda')
def translate_nllb(row):
    text = row['text']
    inputs = tokenizer(text, padding='longest', return_tensors="pt").to('cuda')
    translated_tokens = model.generate(
        **inputs, forced_bos_token_id=tokenizer.lang_code_to_id["eng_Latn"], max_length=100
    )
    row['mt_text'] = tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)    
    return row

In [7]:
####
# Twitter Sentiment Multilingual
####

tweet_senti_multi_combined_dset = {}
for key in tweet_senti_multi_dset.keys():
    tokenizer = AutoTokenizer.from_pretrained("facebook/nllb-200-distilled-1.3B", src_lang=bcp47_lang_map[key])
    dset = tweet_senti_multi_dset[key]['train'].map(
        translate_nllb, batched=True, batch_size=128
    ).rename_columns({'text': 'text_1', 'mt_text': 'text_2'})
    tweet_senti_multi_combined_dset[key] = dset
tweet_senti_multi_combined_dset = datasets.DatasetDict(tweet_senti_multi_combined_dset)
tweet_senti_multi_combined_dset = tweet_senti_multi_combined_dset.map(
    label2str, remove_columns=['label'], fn_kwargs={"dset": tweet_senti_multi_dset['eng']['train']}
).rename_columns({'str_label': 'label'})

tweet_senti_multi_combined_dset.save_to_disk(f'{save_path}/tweet_senti_multi_combined_dset')

Loading cached processed dataset at /home/samuel/.cache/huggingface/datasets/cardiffnlp___tweet_sentiment_multilingual/english/0.1.0/936afd3cde120393429606f681b3b48d526873c45114068973f71e296ce80605/cache-fa4628b72886e70a.arrow
Loading cached processed dataset at /home/samuel/.cache/huggingface/datasets/cardiffnlp___tweet_sentiment_multilingual/arabic/0.1.0/936afd3cde120393429606f681b3b48d526873c45114068973f71e296ce80605/cache-5f6801279976fabd.arrow
Loading cached processed dataset at /home/samuel/.cache/huggingface/datasets/cardiffnlp___tweet_sentiment_multilingual/french/0.1.0/936afd3cde120393429606f681b3b48d526873c45114068973f71e296ce80605/cache-ff762a3aa140820d.arrow
Loading cached processed dataset at /home/samuel/.cache/huggingface/datasets/cardiffnlp___tweet_sentiment_multilingual/german/0.1.0/936afd3cde120393429606f681b3b48d526873c45114068973f71e296ce80605/cache-bf0b165be17df672.arrow
Loading cached processed dataset at /home/samuel/.cache/huggingface/datasets/cardiffnlp___tweet

Saving the dataset (0/1 shards):   0%|          | 0/1839 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1839 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1839 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1839 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1839 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1839 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1839 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1839 [00:00<?, ? examples/s]

In [8]:
####
# NusaX Combined [text_1, text_2, label]
####

nusax_mt_ind_dset_clean = datasets.load_from_disk(f'{save_path}/nusax_mt_ind_dset')
nusax_mt_eng_dset_clean = datasets.load_from_disk(f'{save_path}/nusax_mt_eng_dset')

# Process NusaX Combined ind
nusax_combined_ind_dset = {}
for key in nusax_senti_dset.keys():
    if key in ['ind', 'eng']:
        continue
        
    dset = []
    for split in nusax_senti_dset[key].keys():
        dset.append(nusax_senti_dset[key][split].remove_columns(['id', 'lang', 'text']))
    nusax_combined_ind_dset[key] = datasets.concatenate_datasets([
        datasets.concatenate_datasets(dset),
        nusax_mt_ind_dset_clean[key]
    ], axis=1)
nusax_combined_ind_dset = datasets.DatasetDict(nusax_combined_ind_dset)
nusax_combined_ind_dset = nusax_combined_ind_dset.map(
    label2str, remove_columns=['label'], fn_kwargs={"dset": nusax_senti_dset['jav']['train']}
).rename_columns({'str_label': 'label'})

# Process NusaX Combined ind
nusax_combined_eng_dset = {}
for key in nusax_senti_dset.keys():
    if key in ['ind', 'eng']:
        continue
        
    dset = []
    for split in nusax_senti_dset[key].keys():
        dset.append(nusax_senti_dset[key][split].remove_columns(['id', 'lang', 'text']))
    nusax_combined_eng_dset[key] = datasets.concatenate_datasets([
        datasets.concatenate_datasets(dset),
        nusax_mt_eng_dset_clean[key]
    ], axis=1)
nusax_combined_eng_dset = datasets.DatasetDict(nusax_combined_eng_dset)
nusax_combined_eng_dset = nusax_combined_eng_dset.map(
    label2str, remove_columns=['label'], fn_kwargs={"dset": nusax_senti_dset['jav']['train']}
).rename_columns({'str_label': 'label'})

nusax_combined_ind_dset.save_to_disk(f'{save_path}/nusax_combined_ind_dset')
nusax_combined_eng_dset.save_to_disk(f'{save_path}/nusax_combined_eng_dset')

Loading cached processed dataset at /home/samuel/.cache/huggingface/datasets/indonlp___nusa_x-senti/bbc/1.0.0/3477a395c5c7a09a74d897ceb96ebd2c3afbd1d7fad0c11d8b8026b8b08e3af5/cache-5e0783d702a0838e.arrow
Loading cached processed dataset at /home/samuel/.cache/huggingface/datasets/indonlp___nusa_x-senti/sun/1.0.0/3477a395c5c7a09a74d897ceb96ebd2c3afbd1d7fad0c11d8b8026b8b08e3af5/cache-d8b3212c30baf690.arrow
Loading cached processed dataset at /home/samuel/.cache/huggingface/datasets/indonlp___nusa_x-senti/jav/1.0.0/3477a395c5c7a09a74d897ceb96ebd2c3afbd1d7fad0c11d8b8026b8b08e3af5/cache-9d305c4cc4ee0ee1.arrow
Loading cached processed dataset at /home/samuel/.cache/huggingface/datasets/indonlp___nusa_x-senti/mad/1.0.0/3477a395c5c7a09a74d897ceb96ebd2c3afbd1d7fad0c11d8b8026b8b08e3af5/cache-ded18bf5d11d4084.arrow
Loading cached processed dataset at /home/samuel/.cache/huggingface/datasets/indonlp___nusa_x-senti/bug/1.0.0/3477a395c5c7a09a74d897ceb96ebd2c3afbd1d7fad0c11d8b8026b8b08e3af5/cache-74b

Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

In [11]:
save_path = '.'
nusax_combined_ind_dset = datasets.load_from_disk(f'{save_path}/nusax_combined_ind_dset')
nusax_combined_eng_dset = datasets.load_from_disk(f'{save_path}/nusax_combined_eng_dset')

In [14]:
nusax_combined_ind_dset['btk'][:3]

{'text_1': ['atimi ciccilan 10% sahat 12 bulan tu panuhoran tiket pesaway air asia dohot kartu kredit bni!',
  'Kue-kue na diparade mambahen au taringot tu angka nadung salpu. saluhut angka ragam ni kue tingki najolo, songon rupa nang dohot daina, kue na tabo argana pe ura.',
  'Inong hea karejo di grab Indonesia.'],
 'text_2': ['Nikmati cicilan 0% hingga 12 bulan untuk pemesanan tiket pesawat air asia dengan kartu kredit bni!',
  'Kue-kue yang disajikan bikin saya bernostalgia. Semuanya tipikal kue zaman dulu, baik dari penampilan maupun rasa. Kuenya enak dan harganya juga murah.',
  'Ibu pernah bekerja di grab indonesia'],
 'label': ['neutral', 'positive', 'neutral']}

In [9]:
####
# AmericasNLI - XNLI Combined
####

# Spanish
save_path = '.'

# Cannot use the XNLI from HuggingFace, somehow the results are not aligned,
# so we use the original XNLI file (https://dl.fbaipublicfiles.com/XNLI/XNLI-1.0.zip) instead
xnli_df = pd.read_csv('XNLI-1.0/xnli.dev.tsv', sep='\t').reset_index()
americasnli_combined_spa_dset = {}
for key in americasnli_dset.keys():
    anli_df = pd.read_csv(f'https://github.com/abteen/americasnli/raw/main/data/anli_final/dev/{key}.tsv', sep='\t')
    anli_dset = datasets.Dataset.from_pandas(
        anli_df[['premise', 'hypothesis', 'label']].rename({
            'premise': 'premise_1', 'hypothesis': 'hypothesis_1'
        }, axis='columns')
    )    
    
    xnli_dset = datasets.Dataset.from_pandas(
        xnli_df.loc[anli_df.id-1, ['sentence1', 'sentence2']].rename({
            'sentence1': 'premise_2', 'sentence2': 'hypothesis_2'
        }, axis='columns')
    ).remove_columns('__index_level_0__')
    americasnli_combined_spa_dset[key] = datasets.concatenate_datasets([anli_dset, xnli_dset], axis=1)
americasnli_combined_spa_dset = datasets.DatasetDict(americasnli_combined_spa_dset)
americasnli_combined_spa_dset = americasnli_combined_spa_dset.map(label_eng2spa)
americasnli_combined_spa_dset.save_to_disk(f'{save_path}/americasnli_combined_dev_spa_dset')

# English
xnli_df = pd.read_csv('XNLI-1.0/xnli.dev.tsv', sep='\t').reset_index()
americasnli_combined_eng_dset = {}
for key in americasnli_dset.keys():
    anli_df = pd.read_csv(f'https://github.com/abteen/americasnli/raw/main/data/anli_final/dev/{key}.tsv', sep='\t')
    anli_dset = datasets.Dataset.from_pandas(
        anli_df[['premise', 'hypothesis', 'label']].rename({
            'premise': 'premise_1', 'hypothesis': 'hypothesis_1'
        }, axis='columns')
    )    
    
    xnli_dset = datasets.Dataset.from_pandas(
        xnli_df.loc[anli_df.id-2491, ['sentence1', 'sentence2']].rename({
            'sentence1': 'premise_2', 'sentence2': 'hypothesis_2'
        }, axis='columns')
    ).remove_columns('__index_level_0__')
    americasnli_combined_eng_dset[key] = datasets.concatenate_datasets([anli_dset, xnli_dset], axis=1)
americasnli_combined_eng_dset = datasets.DatasetDict(americasnli_combined_spa_dset)
americasnli_combined_eng_dset.save_to_disk(f'{save_path}/americasnli_combined_dev_eng_dset')

Map:   0%|          | 0/743 [00:00<?, ? examples/s]

Map:   0%|          | 0/743 [00:00<?, ? examples/s]

Map:   0%|          | 0/658 [00:00<?, ? examples/s]

Map:   0%|          | 0/743 [00:00<?, ? examples/s]

Map:   0%|          | 0/743 [00:00<?, ? examples/s]

Map:   0%|          | 0/585 [00:00<?, ? examples/s]

Map:   0%|          | 0/222 [00:00<?, ? examples/s]

Map:   0%|          | 0/743 [00:00<?, ? examples/s]

Map:   0%|          | 0/743 [00:00<?, ? examples/s]

Map:   0%|          | 0/743 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/743 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/743 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/658 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/743 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/743 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/585 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/222 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/743 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/743 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/743 [00:00<?, ? examples/s]

Map:   0%|          | 0/743 [00:00<?, ? examples/s]

Map:   0%|          | 0/743 [00:00<?, ? examples/s]

Map:   0%|          | 0/658 [00:00<?, ? examples/s]

Map:   0%|          | 0/743 [00:00<?, ? examples/s]

Map:   0%|          | 0/743 [00:00<?, ? examples/s]

Map:   0%|          | 0/585 [00:00<?, ? examples/s]

Map:   0%|          | 0/222 [00:00<?, ? examples/s]

Map:   0%|          | 0/743 [00:00<?, ? examples/s]

Map:   0%|          | 0/743 [00:00<?, ? examples/s]

Map:   0%|          | 0/743 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/743 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/743 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/658 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/743 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/743 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/585 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/222 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/743 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/743 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/743 [00:00<?, ? examples/s]

In [10]:
####
# MAFAND Random Label
####
random.seed(12345)
save_path = '.'

# Process MAFAND
mafand_dset_clean = {}
for key in mafand_dset.keys():
    label_names = masakhanews_dset[key]['train'].features['label'].names
    tmp_dset = {'text_1': [], 'text_2': [], 'label': []}
    for split in mafand_dset[key].keys():        
        for i in range(len(mafand_dset[key][split])):
            tmp_dset['text_1'].append(mafand_dset[key][split][i]['translation'][key])
            tmp_dset['text_2'].append(mafand_dset[key][split][i]['translation']['en'])
            tmp_dset['label'].append(random.choice(label_names))
    mafand_dset_clean[key] = datasets.Dataset.from_dict(tmp_dset)
mafand_dset_clean = datasets.DatasetDict(mafand_dset_clean)
mafand_dset_clean.save_to_disk(f'{save_path}/mafand_rand_label_dset')

Saving the dataset (0/1 shards):   0%|          | 0/1936 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/8665 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/9998 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/7075 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/7838 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1561 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/34408 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1488 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/9746 [00:00<?, ? examples/s]

In [23]:
####
# AmericasNLI - XNLI Combined
####

# Spanish
save_path = '.'

# Cannot use the XNLI from HuggingFace, somehow the results are not aligned,
# so we use the original XNLI file (https://dl.fbaipublicfiles.com/XNLI/XNLI-1.0.zip) instead
xnli_df = pd.read_csv('XNLI-1.0/xnli.dev.tsv', sep='\t').reset_index()
americasnli_combined_spa_dset = {}
for key in americasnli_dset.keys():
    anli_df = pd.read_csv(f'https://github.com/abteen/americasnli/raw/main/data/anli_final/dev/{key}.tsv', sep='\t')
    anli_dset = datasets.Dataset.from_pandas(
        anli_df[['premise', 'hypothesis', 'label']].rename({
            'premise': 'premise_1', 'hypothesis': 'hypothesis_1'
        }, axis='columns')
    )    
    
    xnli_dset = datasets.Dataset.from_pandas(
        xnli_df.loc[anli_df.id-1, ['sentence1', 'sentence2']].rename({
            'sentence1': 'premise_2', 'sentence2': 'hypothesis_2'
        }, axis='columns')
    ).remove_columns('__index_level_0__')
    americasnli_combined_spa_dset[key] = datasets.concatenate_datasets([anli_dset, xnli_dset], axis=1)
americasnli_combined_spa_dset = datasets.DatasetDict(americasnli_combined_spa_dset)
americasnli_combined_spa_dset = americasnli_combined_spa_dset.map(label_eng2spa)
americasnli_combined_spa_dset.save_to_disk(f'{save_path}/americasnli_combined_dev_spa_dset')

# English
xnli_df = pd.read_csv('XNLI-1.0/xnli.dev.tsv', sep='\t').reset_index()
americasnli_combined_eng_dset = {}
for key in americasnli_dset.keys():
    anli_df = pd.read_csv(f'https://github.com/abteen/americasnli/raw/main/data/anli_final/dev/{key}.tsv', sep='\t')
    anli_dset = datasets.Dataset.from_pandas(
        anli_df[['premise', 'hypothesis', 'label']].rename({
            'premise': 'premise_1', 'hypothesis': 'hypothesis_1'
        }, axis='columns')
    )    
    
    xnli_dset = datasets.Dataset.from_pandas(
        xnli_df.loc[anli_df.id-2491, ['sentence1', 'sentence2']].rename({
            'sentence1': 'premise_2', 'sentence2': 'hypothesis_2'
        }, axis='columns')
    ).remove_columns('__index_level_0__')
    americasnli_combined_eng_dset[key] = datasets.concatenate_datasets([anli_dset, xnli_dset], axis=1)
americasnli_combined_eng_dset = datasets.DatasetDict(americasnli_combined_eng_dset)
americasnli_combined_eng_dset.save_to_disk(f'{save_path}/americasnli_combined_dev_eng_dset')

Map:   0%|          | 0/743 [00:00<?, ? examples/s]

Map:   0%|          | 0/743 [00:00<?, ? examples/s]

Map:   0%|          | 0/658 [00:00<?, ? examples/s]

Map:   0%|          | 0/743 [00:00<?, ? examples/s]

Map:   0%|          | 0/743 [00:00<?, ? examples/s]

Map:   0%|          | 0/585 [00:00<?, ? examples/s]

Map:   0%|          | 0/222 [00:00<?, ? examples/s]

Map:   0%|          | 0/743 [00:00<?, ? examples/s]

Map:   0%|          | 0/743 [00:00<?, ? examples/s]

Map:   0%|          | 0/743 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/743 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/743 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/658 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/743 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/743 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/585 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/222 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/743 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/743 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/743 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/743 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/743 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/658 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/743 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/743 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/585 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/222 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/743 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/743 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/743 [00:00<?, ? examples/s]