In [1]:
import os
import math
import pandas as pd
from datasets import load_dataset, Dataset
from transformers import XLMRobertaTokenizer
from google.colab import drive

In [2]:
# Mount Google Drive
drive.mount('/content/drive')

# Define path in Google Drive
# Minimal distillation data are saved for a quick access.
DATA_DIR = "/content/drive/MyDrive/KD-EE-XLMR/minimal_data/distillation_data"
TXT_DIR = "/content/drive/MyDrive/KD-EE-XLMR/minimal_data/distillation_data_txt"
os.makedirs(DATA_DIR, exist_ok=True)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Define languages
languages = {'af': 1.3, 'am': 0.8, 'ar': 28.0, 'as': 0.1, 'az': 6.5,
       'be': 4.3, 'bg': 57.5, 'bn': 8.4, 'bn_rom': 0.5, 'br': 0.1,
       'bs': 0.1, 'ca': 10.1, 'cs': 16.3, 'cy': 0.8, 'da': 45.6,
       'de': 66.6, 'el': 46.9, 'en': 300.8, 'eo': 0.9, 'es': 53.3,
       'et': 6.1, 'eu': 2.0, 'fa': 111.6, 'fi': 54.3, 'fr': 56.8,
       'fy': 0.2, 'ga': 0.5, 'gd': 0.1, 'gl': 2.9, 'gu': 1.9, 'ha': 0.3,
       'he': 31.6, 'hi': 20.2, 'hi_rom': 0.5, 'hr': 20.5, 'hu': 58.4,
       'hy': 5.5, 'id': 148.3, 'is': 3.2, 'it': 30.2, 'ja': 69.3,
       'jv': 0.2, 'ka': 9.1, 'kk': 6.4, 'km': 1.5, 'kn': 3.3, 'ko': 54.2,
       'ku': 0.4, 'ky': 1.2, 'la': 2.5, 'lo': 0.6, 'lt': 13.7, 'lv': 8.8,
       'mg': 0.2, 'mk': 4.8, 'ml': 7.6, 'mn': 3.0, 'mr': 2.8, 'ms': 8.5,
       'my': 0.4, 'my_zaw': 1.6, 'ne': 3.8, 'nl': 29.3, 'no': 49.0,
       'om': 0.1, 'or': 0.6, 'pa': 0.8, 'pl': 44.6, 'ps': 0.7,
       'pt': 49.1, 'ro': 61.4, 'ru': 278.0, 'sa': 0.3, 'sd': 0.4,
       'si': 3.6, 'sk': 23.2, 'sl': 10.3, 'so': 0.4, 'sq': 5.4,
       'sr': 9.1, 'su': 0.1, 'sv': 12.1, 'sw': 1.6, 'ta': 12.2,
       'ta_rom': 0.3, 'te': 4.7, 'te_rom': 0.3, 'th': 71.7, 'tl': 3.1,
       'tr': 20.9, 'ug': 0.4, 'uk': 84.6, 'ur': 5.7, 'ur_rom': 0.5,
       'uz': 0.7, 'vi': 137.3, 'xh': 0.1, 'yi': 0.3, 'zh-Hans': 46.9,
       'zh-Hant': 16.6
}

In [4]:
# Initialize tokenizer
tokenizer = XLMRobertaTokenizer.from_pretrained("xlm-roberta-large")

def preprocess_and_save(lang):
    print(f"Processing {lang}...")
    txt_path = os.path.join(TXT_DIR, f"{lang}.txt")

    # Read lines into Hugging Face Dataset
    with open(txt_path, "r", encoding="utf-8") as f:
        lines = [line.strip() for line in f if line.strip()]
    dataset = Dataset.from_dict({"text": lines})

    def tokenize_function(examples):
        return tokenizer(
            examples["text"],
            padding="max_length",
            truncation=True,
            max_length=256
        )

    tokenized = dataset.map(tokenize_function, batched=True, batch_size=1000)

    # Save as compressed Parquet
    save_path = os.path.join(DATA_DIR, f"{lang}.parquet")
    tokenized.to_parquet(save_path)
    print(f"Saved {lang} dataset at {save_path}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [5]:
for lang in languages:
    save_path = os.path.join(DATA_DIR, f"{lang}.parquet")
    if not os.path.exists(save_path):
        preprocess_and_save(lang)
    else:
        print(f"{lang} dataset already exists in Google Drive.")

Processing af...


Map:   0%|          | 0/13 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Saved af dataset at /content/drive/MyDrive/KD-EE-XLMR/minimal_data/distillation_data/af.parquet
Processing am...


Map:   0%|          | 0/8 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Saved am dataset at /content/drive/MyDrive/KD-EE-XLMR/minimal_data/distillation_data/am.parquet
Processing ar...


Map:   0%|          | 0/280 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Saved ar dataset at /content/drive/MyDrive/KD-EE-XLMR/minimal_data/distillation_data/ar.parquet
Processing as...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Saved as dataset at /content/drive/MyDrive/KD-EE-XLMR/minimal_data/distillation_data/as.parquet
Processing az...


Map:   0%|          | 0/65 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Saved az dataset at /content/drive/MyDrive/KD-EE-XLMR/minimal_data/distillation_data/az.parquet
Processing be...


Map:   0%|          | 0/43 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Saved be dataset at /content/drive/MyDrive/KD-EE-XLMR/minimal_data/distillation_data/be.parquet
Processing bg...


Map:   0%|          | 0/575 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Saved bg dataset at /content/drive/MyDrive/KD-EE-XLMR/minimal_data/distillation_data/bg.parquet
Processing bn...


Map:   0%|          | 0/84 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Saved bn dataset at /content/drive/MyDrive/KD-EE-XLMR/minimal_data/distillation_data/bn.parquet
Processing bn_rom...


Map:   0%|          | 0/5 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Saved bn_rom dataset at /content/drive/MyDrive/KD-EE-XLMR/minimal_data/distillation_data/bn_rom.parquet
Processing br...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Saved br dataset at /content/drive/MyDrive/KD-EE-XLMR/minimal_data/distillation_data/br.parquet
Processing bs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Saved bs dataset at /content/drive/MyDrive/KD-EE-XLMR/minimal_data/distillation_data/bs.parquet
Processing ca...


Map:   0%|          | 0/101 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Saved ca dataset at /content/drive/MyDrive/KD-EE-XLMR/minimal_data/distillation_data/ca.parquet
Processing cs...


Map:   0%|          | 0/163 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Saved cs dataset at /content/drive/MyDrive/KD-EE-XLMR/minimal_data/distillation_data/cs.parquet
Processing cy...


Map:   0%|          | 0/8 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Saved cy dataset at /content/drive/MyDrive/KD-EE-XLMR/minimal_data/distillation_data/cy.parquet
Processing da...


Map:   0%|          | 0/456 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Saved da dataset at /content/drive/MyDrive/KD-EE-XLMR/minimal_data/distillation_data/da.parquet
Processing de...


Map:   0%|          | 0/666 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Saved de dataset at /content/drive/MyDrive/KD-EE-XLMR/minimal_data/distillation_data/de.parquet
Processing el...


Map:   0%|          | 0/469 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Saved el dataset at /content/drive/MyDrive/KD-EE-XLMR/minimal_data/distillation_data/el.parquet
Processing en...


Map:   0%|          | 0/3008 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/4 [00:00<?, ?ba/s]

Saved en dataset at /content/drive/MyDrive/KD-EE-XLMR/minimal_data/distillation_data/en.parquet
Processing eo...


Map:   0%|          | 0/9 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Saved eo dataset at /content/drive/MyDrive/KD-EE-XLMR/minimal_data/distillation_data/eo.parquet
Processing es...


Map:   0%|          | 0/533 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Saved es dataset at /content/drive/MyDrive/KD-EE-XLMR/minimal_data/distillation_data/es.parquet
Processing et...


Map:   0%|          | 0/61 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Saved et dataset at /content/drive/MyDrive/KD-EE-XLMR/minimal_data/distillation_data/et.parquet
Processing eu...


Map:   0%|          | 0/20 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Saved eu dataset at /content/drive/MyDrive/KD-EE-XLMR/minimal_data/distillation_data/eu.parquet
Processing fa...


Map:   0%|          | 0/1116 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Saved fa dataset at /content/drive/MyDrive/KD-EE-XLMR/minimal_data/distillation_data/fa.parquet
Processing fi...


Map:   0%|          | 0/543 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Saved fi dataset at /content/drive/MyDrive/KD-EE-XLMR/minimal_data/distillation_data/fi.parquet
Processing fr...


Map:   0%|          | 0/568 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Saved fr dataset at /content/drive/MyDrive/KD-EE-XLMR/minimal_data/distillation_data/fr.parquet
Processing fy...


Map:   0%|          | 0/2 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Saved fy dataset at /content/drive/MyDrive/KD-EE-XLMR/minimal_data/distillation_data/fy.parquet
Processing ga...


Map:   0%|          | 0/5 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Saved ga dataset at /content/drive/MyDrive/KD-EE-XLMR/minimal_data/distillation_data/ga.parquet
Processing gd...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Saved gd dataset at /content/drive/MyDrive/KD-EE-XLMR/minimal_data/distillation_data/gd.parquet
Processing gl...


Map:   0%|          | 0/29 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Saved gl dataset at /content/drive/MyDrive/KD-EE-XLMR/minimal_data/distillation_data/gl.parquet
Processing gu...


Map:   0%|          | 0/19 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Saved gu dataset at /content/drive/MyDrive/KD-EE-XLMR/minimal_data/distillation_data/gu.parquet
Processing ha...


Map:   0%|          | 0/3 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Saved ha dataset at /content/drive/MyDrive/KD-EE-XLMR/minimal_data/distillation_data/ha.parquet
Processing he...


Map:   0%|          | 0/316 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Saved he dataset at /content/drive/MyDrive/KD-EE-XLMR/minimal_data/distillation_data/he.parquet
Processing hi...


Map:   0%|          | 0/202 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Saved hi dataset at /content/drive/MyDrive/KD-EE-XLMR/minimal_data/distillation_data/hi.parquet
Processing hi_rom...


Map:   0%|          | 0/5 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Saved hi_rom dataset at /content/drive/MyDrive/KD-EE-XLMR/minimal_data/distillation_data/hi_rom.parquet
Processing hr...


Map:   0%|          | 0/205 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Saved hr dataset at /content/drive/MyDrive/KD-EE-XLMR/minimal_data/distillation_data/hr.parquet
Processing hu...


Map:   0%|          | 0/584 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Saved hu dataset at /content/drive/MyDrive/KD-EE-XLMR/minimal_data/distillation_data/hu.parquet
Processing hy...


Map:   0%|          | 0/55 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Saved hy dataset at /content/drive/MyDrive/KD-EE-XLMR/minimal_data/distillation_data/hy.parquet
Processing id...


Map:   0%|          | 0/1483 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Saved id dataset at /content/drive/MyDrive/KD-EE-XLMR/minimal_data/distillation_data/id.parquet
Processing is...


Map:   0%|          | 0/32 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Saved is dataset at /content/drive/MyDrive/KD-EE-XLMR/minimal_data/distillation_data/is.parquet
Processing it...


Map:   0%|          | 0/302 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Saved it dataset at /content/drive/MyDrive/KD-EE-XLMR/minimal_data/distillation_data/it.parquet
Processing ja...


Map:   0%|          | 0/693 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Saved ja dataset at /content/drive/MyDrive/KD-EE-XLMR/minimal_data/distillation_data/ja.parquet
Processing jv...


Map:   0%|          | 0/2 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Saved jv dataset at /content/drive/MyDrive/KD-EE-XLMR/minimal_data/distillation_data/jv.parquet
Processing ka...


Map:   0%|          | 0/91 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Saved ka dataset at /content/drive/MyDrive/KD-EE-XLMR/minimal_data/distillation_data/ka.parquet
Processing kk...


Map:   0%|          | 0/64 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Saved kk dataset at /content/drive/MyDrive/KD-EE-XLMR/minimal_data/distillation_data/kk.parquet
Processing km...


Map:   0%|          | 0/15 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Saved km dataset at /content/drive/MyDrive/KD-EE-XLMR/minimal_data/distillation_data/km.parquet
Processing kn...


Map:   0%|          | 0/33 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Saved kn dataset at /content/drive/MyDrive/KD-EE-XLMR/minimal_data/distillation_data/kn.parquet
Processing ko...


Map:   0%|          | 0/542 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Saved ko dataset at /content/drive/MyDrive/KD-EE-XLMR/minimal_data/distillation_data/ko.parquet
Processing ku...


Map:   0%|          | 0/4 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Saved ku dataset at /content/drive/MyDrive/KD-EE-XLMR/minimal_data/distillation_data/ku.parquet
Processing ky...


Map:   0%|          | 0/12 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Saved ky dataset at /content/drive/MyDrive/KD-EE-XLMR/minimal_data/distillation_data/ky.parquet
Processing la...


Map:   0%|          | 0/25 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Saved la dataset at /content/drive/MyDrive/KD-EE-XLMR/minimal_data/distillation_data/la.parquet
Processing lo...


Map:   0%|          | 0/6 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Saved lo dataset at /content/drive/MyDrive/KD-EE-XLMR/minimal_data/distillation_data/lo.parquet
Processing lt...


Map:   0%|          | 0/137 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Saved lt dataset at /content/drive/MyDrive/KD-EE-XLMR/minimal_data/distillation_data/lt.parquet
Processing lv...


Map:   0%|          | 0/88 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Saved lv dataset at /content/drive/MyDrive/KD-EE-XLMR/minimal_data/distillation_data/lv.parquet
Processing mg...


Map:   0%|          | 0/2 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Saved mg dataset at /content/drive/MyDrive/KD-EE-XLMR/minimal_data/distillation_data/mg.parquet
Processing mk...


Map:   0%|          | 0/48 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Saved mk dataset at /content/drive/MyDrive/KD-EE-XLMR/minimal_data/distillation_data/mk.parquet
Processing ml...


Map:   0%|          | 0/76 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Saved ml dataset at /content/drive/MyDrive/KD-EE-XLMR/minimal_data/distillation_data/ml.parquet
Processing mn...


Map:   0%|          | 0/30 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Saved mn dataset at /content/drive/MyDrive/KD-EE-XLMR/minimal_data/distillation_data/mn.parquet
Processing mr...


Map:   0%|          | 0/28 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Saved mr dataset at /content/drive/MyDrive/KD-EE-XLMR/minimal_data/distillation_data/mr.parquet
Processing ms...


Map:   0%|          | 0/85 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Saved ms dataset at /content/drive/MyDrive/KD-EE-XLMR/minimal_data/distillation_data/ms.parquet
Processing my...


Map:   0%|          | 0/4 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Saved my dataset at /content/drive/MyDrive/KD-EE-XLMR/minimal_data/distillation_data/my.parquet
Processing my_zaw...


Map:   0%|          | 0/16 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Saved my_zaw dataset at /content/drive/MyDrive/KD-EE-XLMR/minimal_data/distillation_data/my_zaw.parquet
Processing ne...


Map:   0%|          | 0/38 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Saved ne dataset at /content/drive/MyDrive/KD-EE-XLMR/minimal_data/distillation_data/ne.parquet
Processing nl...


Map:   0%|          | 0/293 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Saved nl dataset at /content/drive/MyDrive/KD-EE-XLMR/minimal_data/distillation_data/nl.parquet
Processing no...


Map:   0%|          | 0/490 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Saved no dataset at /content/drive/MyDrive/KD-EE-XLMR/minimal_data/distillation_data/no.parquet
Processing om...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Saved om dataset at /content/drive/MyDrive/KD-EE-XLMR/minimal_data/distillation_data/om.parquet
Processing or...


Map:   0%|          | 0/6 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Saved or dataset at /content/drive/MyDrive/KD-EE-XLMR/minimal_data/distillation_data/or.parquet
Processing pa...


Map:   0%|          | 0/8 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Saved pa dataset at /content/drive/MyDrive/KD-EE-XLMR/minimal_data/distillation_data/pa.parquet
Processing pl...


Map:   0%|          | 0/446 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Saved pl dataset at /content/drive/MyDrive/KD-EE-XLMR/minimal_data/distillation_data/pl.parquet
Processing ps...


Map:   0%|          | 0/7 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Saved ps dataset at /content/drive/MyDrive/KD-EE-XLMR/minimal_data/distillation_data/ps.parquet
Processing pt...


Map:   0%|          | 0/491 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Saved pt dataset at /content/drive/MyDrive/KD-EE-XLMR/minimal_data/distillation_data/pt.parquet
Processing ro...


Map:   0%|          | 0/614 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Saved ro dataset at /content/drive/MyDrive/KD-EE-XLMR/minimal_data/distillation_data/ro.parquet
Processing ru...


Map:   0%|          | 0/2780 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

Saved ru dataset at /content/drive/MyDrive/KD-EE-XLMR/minimal_data/distillation_data/ru.parquet
Processing sa...


Map:   0%|          | 0/3 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Saved sa dataset at /content/drive/MyDrive/KD-EE-XLMR/minimal_data/distillation_data/sa.parquet
Processing sd...


Map:   0%|          | 0/4 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Saved sd dataset at /content/drive/MyDrive/KD-EE-XLMR/minimal_data/distillation_data/sd.parquet
Processing si...


Map:   0%|          | 0/36 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Saved si dataset at /content/drive/MyDrive/KD-EE-XLMR/minimal_data/distillation_data/si.parquet
Processing sk...


Map:   0%|          | 0/232 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Saved sk dataset at /content/drive/MyDrive/KD-EE-XLMR/minimal_data/distillation_data/sk.parquet
Processing sl...


Map:   0%|          | 0/103 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Saved sl dataset at /content/drive/MyDrive/KD-EE-XLMR/minimal_data/distillation_data/sl.parquet
Processing so...


Map:   0%|          | 0/4 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Saved so dataset at /content/drive/MyDrive/KD-EE-XLMR/minimal_data/distillation_data/so.parquet
Processing sq...


Map:   0%|          | 0/54 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Saved sq dataset at /content/drive/MyDrive/KD-EE-XLMR/minimal_data/distillation_data/sq.parquet
Processing sr...


Map:   0%|          | 0/91 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Saved sr dataset at /content/drive/MyDrive/KD-EE-XLMR/minimal_data/distillation_data/sr.parquet
Processing su...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Saved su dataset at /content/drive/MyDrive/KD-EE-XLMR/minimal_data/distillation_data/su.parquet
Processing sv...


Map:   0%|          | 0/121 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Saved sv dataset at /content/drive/MyDrive/KD-EE-XLMR/minimal_data/distillation_data/sv.parquet
Processing sw...


Map:   0%|          | 0/16 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Saved sw dataset at /content/drive/MyDrive/KD-EE-XLMR/minimal_data/distillation_data/sw.parquet
Processing ta...


Map:   0%|          | 0/122 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Saved ta dataset at /content/drive/MyDrive/KD-EE-XLMR/minimal_data/distillation_data/ta.parquet
Processing ta_rom...


Map:   0%|          | 0/3 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Saved ta_rom dataset at /content/drive/MyDrive/KD-EE-XLMR/minimal_data/distillation_data/ta_rom.parquet
Processing te...


Map:   0%|          | 0/47 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Saved te dataset at /content/drive/MyDrive/KD-EE-XLMR/minimal_data/distillation_data/te.parquet
Processing te_rom...


Map:   0%|          | 0/3 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Saved te_rom dataset at /content/drive/MyDrive/KD-EE-XLMR/minimal_data/distillation_data/te_rom.parquet
Processing th...


Map:   0%|          | 0/717 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Saved th dataset at /content/drive/MyDrive/KD-EE-XLMR/minimal_data/distillation_data/th.parquet
Processing tl...


Map:   0%|          | 0/31 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Saved tl dataset at /content/drive/MyDrive/KD-EE-XLMR/minimal_data/distillation_data/tl.parquet
Processing tr...


Map:   0%|          | 0/209 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Saved tr dataset at /content/drive/MyDrive/KD-EE-XLMR/minimal_data/distillation_data/tr.parquet
Processing ug...


Map:   0%|          | 0/4 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Saved ug dataset at /content/drive/MyDrive/KD-EE-XLMR/minimal_data/distillation_data/ug.parquet
Processing uk...


Map:   0%|          | 0/846 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Saved uk dataset at /content/drive/MyDrive/KD-EE-XLMR/minimal_data/distillation_data/uk.parquet
Processing ur...


Map:   0%|          | 0/57 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Saved ur dataset at /content/drive/MyDrive/KD-EE-XLMR/minimal_data/distillation_data/ur.parquet
Processing ur_rom...


Map:   0%|          | 0/5 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Saved ur_rom dataset at /content/drive/MyDrive/KD-EE-XLMR/minimal_data/distillation_data/ur_rom.parquet
Processing uz...


Map:   0%|          | 0/7 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Saved uz dataset at /content/drive/MyDrive/KD-EE-XLMR/minimal_data/distillation_data/uz.parquet
Processing vi...


Map:   0%|          | 0/1373 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Saved vi dataset at /content/drive/MyDrive/KD-EE-XLMR/minimal_data/distillation_data/vi.parquet
Processing xh...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Saved xh dataset at /content/drive/MyDrive/KD-EE-XLMR/minimal_data/distillation_data/xh.parquet
Processing yi...


Map:   0%|          | 0/3 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Saved yi dataset at /content/drive/MyDrive/KD-EE-XLMR/minimal_data/distillation_data/yi.parquet
Processing zh-Hans...


Map:   0%|          | 0/469 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Saved zh-Hans dataset at /content/drive/MyDrive/KD-EE-XLMR/minimal_data/distillation_data/zh-Hans.parquet
Processing zh-Hant...


Map:   0%|          | 0/166 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Saved zh-Hant dataset at /content/drive/MyDrive/KD-EE-XLMR/minimal_data/distillation_data/zh-Hant.parquet


In [6]:
def split_parquet_into_n_chunks(parquet_path, out_dir, lang_code, num_chunks=5):
    df = pd.read_parquet(parquet_path)
    total_rows = len(df)
    rows_per_chunk = math.ceil(total_rows / num_chunks)

    os.makedirs(out_dir, exist_ok=True)

    for i in range(num_chunks):
        start_idx = i * rows_per_chunk
        end_idx = min(start_idx + rows_per_chunk, total_rows)
        chunk = df.iloc[start_idx:end_idx]

        out_path = os.path.join(out_dir, f"{lang_code}_{i}.parquet")
        chunk.to_parquet(out_path, index=False)
        print(f"Saved {lang_code} chunk {i} with {len(chunk)} rows to {out_path}")

In [8]:
OUTPUT_DIR = "/content/drive/MyDrive/KD-EE-XLMR/minimal_data/distillation_data_split"
os.makedirs(OUTPUT_DIR, exist_ok=True)

for lang in languages:
    split_parquet_into_n_chunks(
        f"{DATA_DIR}/{lang}.parquet",
        OUTPUT_DIR,
        lang,
        num_chunks=5
    )

Saved af chunk 0 with 3 rows to /content/drive/MyDrive/KD-EE-XLMR/minimal_data/distillation_data_split/af_0.parquet
Saved af chunk 1 with 3 rows to /content/drive/MyDrive/KD-EE-XLMR/minimal_data/distillation_data_split/af_1.parquet
Saved af chunk 2 with 3 rows to /content/drive/MyDrive/KD-EE-XLMR/minimal_data/distillation_data_split/af_2.parquet
Saved af chunk 3 with 3 rows to /content/drive/MyDrive/KD-EE-XLMR/minimal_data/distillation_data_split/af_3.parquet
Saved af chunk 4 with 1 rows to /content/drive/MyDrive/KD-EE-XLMR/minimal_data/distillation_data_split/af_4.parquet
Saved am chunk 0 with 2 rows to /content/drive/MyDrive/KD-EE-XLMR/minimal_data/distillation_data_split/am_0.parquet
Saved am chunk 1 with 2 rows to /content/drive/MyDrive/KD-EE-XLMR/minimal_data/distillation_data_split/am_1.parquet
Saved am chunk 2 with 2 rows to /content/drive/MyDrive/KD-EE-XLMR/minimal_data/distillation_data_split/am_2.parquet
Saved am chunk 3 with 2 rows to /content/drive/MyDrive/KD-EE-XLMR/minima