# Download Dataset

## Hyperparameters

In [2]:
import os
# Point Hugging Face at the shared scratch cache; the transformers and
# datasets caches are sub-directories of HF_HOME.
os.environ["HF_HOME"] = "/network/scratch/x/xut/hf_cache"
os.environ.update(
    TRANSFORMERS_CACHE=os.path.join(os.environ["HF_HOME"], "transformers"),
    HF_DATASETS_CACHE=os.path.join(os.environ["HF_HOME"], "datasets"),
)

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from datasets import load_dataset, load_from_disk, Dataset, DatasetDict
import json
from tqdm import tqdm
from datatrove.pipeline.readers import ParquetReader
from transformers import AutoTokenizer
from pathlib import Path
from collections import defaultdict

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# FineWeb-2 subsets to download, in processing order. The last three entries
# (Spanish, French, Portuguese) are the high-resource languages.
fine_web2_labels = [
    "aeb_Arab", "afr_Latn", "amh_Ethi", "arz_Arab",
    "bam_Latn", "bem_Latn", "cjk_Latn", "dyu_Latn",
    "gaz_Latn", "ibo_Latn", "kab_Latn", "kam_Latn",
    "kbp_Latn", "kin_Latn", "kmb_Latn", "knc_Arab",
    "knc_Latn", "lin_Latn", "lug_Latn", "luo_Latn",
    "nus_Latn", "plt_Latn", "run_Latn", "sag_Latn",
    "sna_Latn", "sot_Latn", "ssw_Latn", "swc_Latn",
    "taq_Tfng", "tir_Ethi", "tsn_Latn", "twi_Latn",
    "tzm_Tfng", "umb_Latn", "xho_Latn", "yor_Latn",
    "zul_Latn",
    "spa_Latn", "fra_Latn", "por_Latn",
]

# Long language code -> config name passed to load_dataset for Afri-MGSM.
afri_mgsm_langs = {
    "amh_Ethi": "amh", "ewe_Latn": "ewe", "gaz_Latn": "orm",
    "hau_Latn": "hau", "kin_Latn": "kin", "lin_Latn": "lin",
    "lug_Latn": "lug", "sna_Latn": "sna", "swc_Latn": "swa",
    "twi_Latn": "twi", "wol_Latn": "wol", "xho_Latn": "xho",
    "yor_Latn": "yor", "zul_Latn": "zul",
}


# Long language code -> config name passed to load_dataset for MGSM.
mgsm_langs = dict(eng_Latn="en", spa_Latn="es", fra_Latn="fr")


# Long language code -> config name passed to load_dataset for WURA.
# NOTE(review): several values ("ki", "sm", "or") differ from the codes used
# for the same languages in madlad_langs; they are dataset config names.
wura_langs = {
    "afr_Latn": "af", "amh_Ethi": "am", "arz_Arab": "ar", "hau_Latn": "ha",
    "ibo_Latn": "ig", "kin_Latn": "ki", "plt_Latn": "mg", "gaz_Latn": "or",
    "som_Latn": "sm", "sna_Latn": "sn", "sot_Latn": "st", "swc_Latn": "sw",
    "tir_Ethi": "ti", "xho_Latn": "xh", "yor_Latn": "yo", "zul_Latn": "zu",
}


# Long language code -> language code passed to load_dataset for MADLAD-400.
madlad_langs = {
    "afr_Latn": "af",  "aka_Latn": "ak",  "amh_Ethi": "am",
    "bam_Latn": "bm",  "dik_Latn": "din", "dyu_Latn": "dyu",
    "ewe_Latn": "ee",  "fon_Latn": "fon", "fuv_Latn": "ff",
    "gaz_Latn": "om",  "hau_Latn": "ha",  "ibo_Latn": "ig",
    "kbp_Latn": "kbp", "kin_Latn": "rw",  "kmb_Latn": "kmb",
    "kon_Latn": "kg",  "lin_Latn": "ln",  "lug_Latn": "lg",
    "run_Latn": "rn",  "sag_Latn": "sg",  "sna_Latn": "sn",
    "som_Latn": "so",  "sot_Latn": "st",  "ssw_Latn": "ss",
    "swc_Latn": "sw",  "tir_Ethi": "ti",  "tsn_Latn": "tn",
    "tso_Latn": "ts",  "tzm_Tfng": "ber", "wol_Latn": "wo",
    "xho_Latn": "xh",  "yor_Latn": "yo",  "zul_Latn": "zu",
}

# Every African language code considered in this notebook, in the form
# <3 lowercase letters>_<script>.
# Fixed two malformed entries: 'kik_Tatn' -> 'kik_Latn' (misspelled script)
# and 'Mos_Latn' -> 'mos_Latn' (language part must be lowercase like the rest).
all_african_language_list = [
    'aeb_Arab', 'afr_Latn', 'aka_Latn', 'amh_Ethi',
    'ary_Arab', 'arz_Arab', 'bam_Latn', 'bem_Latn',
    'cjk_Latn', 'dik_Latn', 'dyu_Latn', 'ewe_Latn',
    'fon_Latn', 'fuv_Latn', 'gaz_Latn', 'hau_Latn',
    'ibo_Latn', 'kab_Latn', 'kam_Latn', 'kbp_Latn',
    'kea_Latn', 'kik_Latn', 'kin_Latn', 'kmb_Latn',
    'knc_Arab', 'knc_Latn', 'kon_Latn', 'lin_Latn',
    'lua_Latn', 'lug_Latn', 'luo_Latn', 'mos_Latn',
    'nqo_Nkoo', 'nso_Latn', 'nus_Latn', 'nya_Latn',
    'plt_Latn', 'run_Latn', 'sag_Latn', 'sna_Latn',
    'som_Latn', 'sot_Latn', 'ssw_Latn', 'swc_Latn',
    'taq_Latn', 'taq_Tfng', 'tir_Ethi', 'tsn_Latn',
    'tso_Latn', 'tum_Latn', 'twi_Latn', 'tzm_Tfng',
    'umb_Latn', 'wol_Latn', 'xho_Latn', 'yor_Latn',
    'zul_Latn',
]

# High-resource languages, kept apart from the African language list.
high_resource_lang = set(("eng_Latn", "spa_Latn", "fra_Latn", "por_Latn"))

# Fineweb 2 Dataset

In [8]:
# Hugging Face access token, read from a file on the scratch volume
token = Path("/network/scratch/x/xut/hf_cache/token").read_text().strip()

# Destination folder for the per-language FineWeb-2 dumps
output_dir = "../../scratch/data/data_pretrain/fineweb2"
os.makedirs(output_dir, exist_ok=True)

# Per-language token budget, and how many documents to batch per write
MAX_TOKENS = 1_000_000_000
BUFFER_SIZE = 1000

# Gemma tokenizer, used only to measure document length in tokens
tokenizer = AutoTokenizer.from_pretrained("google/gemma-2b", token=token)

def count_tokens(text):
    """Return the number of tokens `text` encodes to, without special tokens."""
    token_ids = tokenizer.encode(text, add_special_tokens=False)
    return len(token_ids)

# Download up to MAX_TOKENS tokens of FineWeb-2 text per language into
# "<lang_code>_fw2.jsonl", one {"text": ...} record per line.
for lang_code in fine_web2_labels:
    output_path = os.path.join(output_dir, f"{lang_code}_fw2.jsonl")

    # Skip if already exists
    if os.path.exists(output_path):
        print(f"{lang_code}: {output_path} already exists, skipping.")
        continue

    # Build the reader only for languages that still need downloading.
    reader = ParquetReader(f"hf://datasets/HuggingFaceFW/fineweb-2/data/{lang_code}/train")

    token_count = 0
    doc_count = 0
    buffer = []

    # Write to a ".partial" sibling and rename only once the loop has finished.
    # The skip check above treats any existing output file as complete, so an
    # interrupted run must never leave a truncated file under the final name.
    partial_path = output_path + ".partial"
    with open(partial_path, "w", encoding="utf-8") as out_file:
        for doc in tqdm(reader(), desc=f"{lang_code} docs"):
            text = doc.text.strip()
            if not text:
                continue
            token_count += count_tokens(text)
            doc_count += 1
            buffer.append(json.dumps({"text": text}))

            # Flush to disk in batches of BUFFER_SIZE documents
            if len(buffer) >= BUFFER_SIZE:
                out_file.write("\n".join(buffer) + "\n")
                buffer = []

            # Stop once the per-language token budget has been reached
            if token_count >= MAX_TOKENS:
                break

        # Write remaining documents in buffer
        if buffer:
            out_file.write("\n".join(buffer) + "\n")

    # Publish the finished file under its final name
    os.replace(partial_path, output_path)

    print(f"{lang_code}: {doc_count:,} docs, {token_count:,} tokens ➜ {output_path}")


aeb_Arab: ../../scratch/data/data_pretrain/fineweb2/aeb_Arab_fw2.jsonl already exists, skipping.
afr_Latn: ../../scratch/data/data_pretrain/fineweb2/afr_Latn_fw2.jsonl already exists, skipping.
amh_Ethi: ../../scratch/data/data_pretrain/fineweb2/amh_Ethi_fw2.jsonl already exists, skipping.
arz_Arab: ../../scratch/data/data_pretrain/fineweb2/arz_Arab_fw2.jsonl already exists, skipping.
bam_Latn: ../../scratch/data/data_pretrain/fineweb2/bam_Latn_fw2.jsonl already exists, skipping.
bem_Latn: ../../scratch/data/data_pretrain/fineweb2/bem_Latn_fw2.jsonl already exists, skipping.
cjk_Latn: ../../scratch/data/data_pretrain/fineweb2/cjk_Latn_fw2.jsonl already exists, skipping.
dyu_Latn: ../../scratch/data/data_pretrain/fineweb2/dyu_Latn_fw2.jsonl already exists, skipping.
gaz_Latn: ../../scratch/data/data_pretrain/fineweb2/gaz_Latn_fw2.jsonl already exists, skipping.
ibo_Latn: ../../scratch/data/data_pretrain/fineweb2/ibo_Latn_fw2.jsonl already exists, skipping.
kab_Latn: ../../scratch/data/d

# Afri-MGSM

In [6]:
# Destination folder for the Afri-MGSM dumps
output_dir = "../../scratch/data/data_pretrain/afrimgsm"
os.makedirs(output_dir, exist_ok=True)

# Write one "<long_code>_mgsm.jsonl" file per language from the train split
for long_code, mgsm_code in afri_mgsm_langs.items():
    output_path = os.path.join(output_dir, f"{long_code}_mgsm.jsonl")

    # Nothing to do when the file is already on disk
    if os.path.exists(output_path):
        print(f"{long_code}: {output_path} already exists, skipping.")
        continue

    try:
        dataset = load_dataset("masakhane/afrimgsm", name=mgsm_code, split="train")
    except Exception as e:
        print(f"Failed to load {mgsm_code}: {e}")
        continue

    with open(output_path, "w", encoding="utf-8") as f:
        for example in dataset:
            if "question" in example and "answer" in example:
                # Question/answer pairs are merged into a single text record
                text = f"Question: {example['question']}\nAnswer: {example['answer']}"
            elif "text" in example:
                text = example["text"]
            else:
                # Unknown schema: keep the whole record as its string form
                text = str(example)
            f.write(json.dumps({"text": text}) + "\n")

    print(f"Saved to {output_path}")

Generating train split: 100%|██████████| 8/8 [00:00<00:00, 172.73 examples/s]
Generating test split: 100%|██████████| 250/250 [00:00<00:00, 8472.11 examples/s]


Saved to ../../scratch/data/data_pretrain/afrimgsm/amh_Ethi_mgsm.jsonl


Generating train split: 100%|██████████| 8/8 [00:00<00:00, 1160.85 examples/s]
Generating test split: 100%|██████████| 250/250 [00:00<00:00, 24394.57 examples/s]


Saved to ../../scratch/data/data_pretrain/afrimgsm/ewe_Latn_mgsm.jsonl


Generating train split: 100%|██████████| 8/8 [00:00<00:00, 1168.33 examples/s]
Generating test split: 100%|██████████| 250/250 [00:00<00:00, 23278.93 examples/s]


Saved to ../../scratch/data/data_pretrain/afrimgsm/gaz_Latn_mgsm.jsonl


Generating train split: 100%|██████████| 8/8 [00:00<00:00, 1168.17 examples/s]
Generating test split: 100%|██████████| 250/250 [00:00<00:00, 25916.36 examples/s]


Saved to ../../scratch/data/data_pretrain/afrimgsm/hau_Latn_mgsm.jsonl


Generating train split: 100%|██████████| 8/8 [00:00<00:00, 1214.99 examples/s]
Generating test split: 100%|██████████| 250/250 [00:00<00:00, 32634.40 examples/s]


Saved to ../../scratch/data/data_pretrain/afrimgsm/kin_Latn_mgsm.jsonl


Generating train split: 100%|██████████| 8/8 [00:00<00:00, 994.09 examples/s]
Generating test split: 100%|██████████| 250/250 [00:00<00:00, 25486.13 examples/s]


Saved to ../../scratch/data/data_pretrain/afrimgsm/lin_Latn_mgsm.jsonl


Generating train split: 100%|██████████| 8/8 [00:00<00:00, 1458.44 examples/s]
Generating test split: 100%|██████████| 250/250 [00:00<00:00, 41044.98 examples/s]


Saved to ../../scratch/data/data_pretrain/afrimgsm/lug_Latn_mgsm.jsonl


Generating train split: 100%|██████████| 8/8 [00:00<00:00, 1135.51 examples/s]
Generating test split: 100%|██████████| 250/250 [00:00<00:00, 25387.41 examples/s]


Saved to ../../scratch/data/data_pretrain/afrimgsm/sna_Latn_mgsm.jsonl


Generating train split: 100%|██████████| 8/8 [00:00<00:00, 1191.86 examples/s]
Generating test split: 100%|██████████| 250/250 [00:00<00:00, 22292.58 examples/s]


Saved to ../../scratch/data/data_pretrain/afrimgsm/swc_Latn_mgsm.jsonl


Generating train split: 100%|██████████| 8/8 [00:00<00:00, 1124.70 examples/s]
Generating test split: 100%|██████████| 250/250 [00:00<00:00, 22741.24 examples/s]


Saved to ../../scratch/data/data_pretrain/afrimgsm/twi_Latn_mgsm.jsonl


Generating train split: 100%|██████████| 8/8 [00:00<00:00, 971.72 examples/s]
Generating test split: 100%|██████████| 250/250 [00:00<00:00, 25451.49 examples/s]


Saved to ../../scratch/data/data_pretrain/afrimgsm/wol_Latn_mgsm.jsonl


Generating train split: 100%|██████████| 8/8 [00:00<00:00, 973.02 examples/s]
Generating test split: 100%|██████████| 250/250 [00:00<00:00, 29029.54 examples/s]


Saved to ../../scratch/data/data_pretrain/afrimgsm/xho_Latn_mgsm.jsonl


Generating train split: 100%|██████████| 8/8 [00:00<00:00, 1163.31 examples/s]
Generating test split: 100%|██████████| 250/250 [00:00<00:00, 9778.85 examples/s]


Saved to ../../scratch/data/data_pretrain/afrimgsm/yor_Latn_mgsm.jsonl


Generating train split: 100%|██████████| 8/8 [00:00<00:00, 1092.48 examples/s]
Generating test split: 100%|██████████| 250/250 [00:00<00:00, 26026.36 examples/s]

Saved to ../../scratch/data/data_pretrain/afrimgsm/zul_Latn_mgsm.jsonl





# MGSM

In [7]:
# Destination folder for the MGSM dumps
output_dir = "../../scratch/data/data_pretrain/mgsm"
os.makedirs(output_dir, exist_ok=True)

# Write one "<long_code>_mgsm.jsonl" file per language from the train split
for long_code, mgsm_code in mgsm_langs.items():
    output_path = os.path.join(output_dir, f"{long_code}_mgsm.jsonl")

    # Nothing to do when the file is already on disk
    if os.path.exists(output_path):
        print(f"{long_code}: {output_path} already exists, skipping.")
        continue

    try:
        dataset = load_dataset("juletxara/mgsm", name=mgsm_code, split="train")
    except Exception as e:
        print(f"Failed to load {mgsm_code}: {e}")
        continue

    with open(output_path, "w", encoding="utf-8") as f:
        for example in dataset:
            is_qa_pair = "question" in example and "answer" in example
            if is_qa_pair:
                # Question/answer pairs are merged into a single text record
                payload = f"Question: {example['question']}\nAnswer: {example['answer']}"
            elif "text" in example:
                payload = example["text"]
            else:
                # Unknown schema: keep the whole record as its string form
                payload = str(example)
            json.dump({"text": payload}, f)
            f.write("\n")

    print(f"Saved to {output_path}")

Saved to ../../scratch/data/data_pretrain/mgsm/eng_Latn_mgsm.jsonl
Saved to ../../scratch/data/data_pretrain/mgsm/spa_Latn_mgsm.jsonl
Saved to ../../scratch/data/data_pretrain/mgsm/fra_Latn_mgsm.jsonl


# Wura

In [8]:
# Output directory
output_dir = "../../scratch/data/data_pretrain/wura"
os.makedirs(output_dir, exist_ok=True)

# Write one "<long_code>_wura.jsonl" file per language from the train split
for long_code, wura_code in wura_langs.items():
    output_path = os.path.join(output_dir, f"{long_code}_wura.jsonl")

    # Skip if file already exists
    if os.path.exists(output_path):
        print(f"{long_code}: {output_path} already exists, skipping.")
        continue

    print(f"⬇ Downloading WURA for {long_code} ({wura_code})...")

    try:
        ds = load_dataset("llama-lang-adapt/wura", name=wura_code, split="train")
    except Exception as e:
        print(f"Failed to load {wura_code}: {e}")
        continue

    # Write to a ".partial" sibling and rename only once every example has
    # been written. The skip check above treats any existing output file as
    # complete, so an interrupted run must not leave a truncated file under
    # the final name.
    partial_path = output_path + ".partial"
    with open(partial_path, "w", encoding="utf-8") as f:
        for example in ds:
            # Use the "text" field when present; otherwise fall back to the
            # string form of the whole record.
            text = example["text"] if "text" in example else str(example)
            json.dump({"text": text}, f)
            f.write("\n")
    os.replace(partial_path, output_path)

    print(f"Saved to {output_path}")

⬇ Downloading WURA for afr_Latn (af)...


Generating train split: 100%|██████████| 2390884/2390884 [01:02<00:00, 38140.27 examples/s]
Generating eval split: 100%|██████████| 265117/265117 [00:06<00:00, 38760.88 examples/s]


Saved to ../../scratch/data/data_pretrain/wura/afr_Latn_wura.jsonl
⬇ Downloading WURA for amh_Ethi (am)...


Generating train split: 100%|██████████| 291026/291026 [00:11<00:00, 24287.75 examples/s]
Generating eval split: 100%|██████████| 32307/32307 [00:01<00:00, 27808.02 examples/s]


Saved to ../../scratch/data/data_pretrain/wura/amh_Ethi_wura.jsonl
⬇ Downloading WURA for arz_Arab (ar)...


Generating train split: 100%|██████████| 1116034/1116034 [00:02<00:00, 464460.98 examples/s]
Generating eval split: 100%|██████████| 124808/124808 [00:00<00:00, 561581.51 examples/s]


Saved to ../../scratch/data/data_pretrain/wura/arz_Arab_wura.jsonl
⬇ Downloading WURA for hau_Latn (ha)...


Generating train split: 100%|██████████| 565471/565471 [00:08<00:00, 67807.93 examples/s] 
Generating eval split: 100%|██████████| 63067/63067 [00:00<00:00, 68103.54 examples/s]


Saved to ../../scratch/data/data_pretrain/wura/hau_Latn_wura.jsonl
⬇ Downloading WURA for ibo_Latn (ig)...


Generating train split: 100%|██████████| 121421/121421 [00:06<00:00, 20210.45 examples/s]
Generating eval split: 100%|██████████| 13899/13899 [00:00<00:00, 45357.96 examples/s]


Saved to ../../scratch/data/data_pretrain/wura/ibo_Latn_wura.jsonl
⬇ Downloading WURA for kin_Latn (ki)...


Generating train split: 100%|██████████| 61485/61485 [00:00<00:00, 83720.99 examples/s] 
Generating eval split: 100%|██████████| 6902/6902 [00:00<00:00, 97443.13 examples/s]


Saved to ../../scratch/data/data_pretrain/wura/kin_Latn_wura.jsonl
⬇ Downloading WURA for plt_Latn (mg)...


Generating train split: 100%|██████████| 355390/355390 [00:06<00:00, 57410.95 examples/s] 
Generating eval split: 100%|██████████| 39314/39314 [00:00<00:00, 58818.38 examples/s]


Saved to ../../scratch/data/data_pretrain/wura/plt_Latn_wura.jsonl
⬇ Downloading WURA for gaz_Latn (or)...


Generating train split: 100%|██████████| 37280/37280 [00:00<00:00, 69197.13 examples/s]
Generating eval split: 100%|██████████| 4005/4005 [00:00<00:00, 71467.66 examples/s]


Saved to ../../scratch/data/data_pretrain/wura/gaz_Latn_wura.jsonl
⬇ Downloading WURA for som_Latn (sm)...


Generating train split: 100%|██████████| 1235959/1235959 [00:16<00:00, 73959.48 examples/s]
Generating eval split: 100%|██████████| 137938/137938 [00:02<00:00, 68609.10 examples/s]


Saved to ../../scratch/data/data_pretrain/wura/som_Latn_wura.jsonl
⬇ Downloading WURA for sna_Latn (sn)...


Generating train split: 100%|██████████| 141559/141559 [00:02<00:00, 54297.09 examples/s]
Generating eval split: 100%|██████████| 16126/16126 [00:00<00:00, 38758.04 examples/s]


Saved to ../../scratch/data/data_pretrain/wura/sna_Latn_wura.jsonl
⬇ Downloading WURA for sot_Latn (st)...
Saved to ../../scratch/data/data_pretrain/wura/sot_Latn_wura.jsonl
⬇ Downloading WURA for swc_Latn (sw)...


Generating train split: 100%|██████████| 1801101/1801101 [00:33<00:00, 54058.79 examples/s] 
Generating eval split: 100%|██████████| 200345/200345 [00:03<00:00, 55826.03 examples/s]


Saved to ../../scratch/data/data_pretrain/wura/swc_Latn_wura.jsonl
⬇ Downloading WURA for tir_Ethi (ti)...


Generating train split: 100%|██████████| 9807/9807 [00:00<00:00, 28713.63 examples/s]
Generating eval split: 100%|██████████| 1084/1084 [00:00<00:00, 43425.27 examples/s]


Saved to ../../scratch/data/data_pretrain/wura/tir_Ethi_wura.jsonl
⬇ Downloading WURA for xho_Latn (xh)...


Generating train split: 100%|██████████| 69713/69713 [00:01<00:00, 61956.25 examples/s]
Generating eval split: 100%|██████████| 7846/7846 [00:00<00:00, 73477.00 examples/s]


Saved to ../../scratch/data/data_pretrain/wura/xho_Latn_wura.jsonl
⬇ Downloading WURA for yor_Latn (yo)...


Generating train split: 100%|██████████| 141321/141321 [00:02<00:00, 68143.40 examples/s] 
Generating eval split: 100%|██████████| 15612/15612 [00:00<00:00, 71896.12 examples/s]


Saved to ../../scratch/data/data_pretrain/wura/yor_Latn_wura.jsonl
⬇ Downloading WURA for zul_Latn (zu)...


Generating train split: 100%|██████████| 166370/166370 [00:03<00:00, 47804.97 examples/s]
Generating eval split: 100%|██████████| 18289/18289 [00:00<00:00, 56318.91 examples/s]


Saved to ../../scratch/data/data_pretrain/wura/zul_Latn_wura.jsonl


# Madlad

In [9]:
# Folder
output_dir = "../../scratch/data/data_pretrain/madlad400"
os.makedirs(output_dir, exist_ok=True)

# write every 1000 docs
BUFFER_SIZE = 1000

# Stream the "clean" split of MADLAD-400 for each language into
# "<fineweb_code>_madlad.jsonl", one {"text": ...} record per line.
for fineweb_code, madlad_code in madlad_langs.items():

    output_path = os.path.join(output_dir, f"{fineweb_code}_madlad.jsonl")

    # Skip if already exists
    if os.path.exists(output_path):
        print(f"{fineweb_code}: {output_path} already exists, skipping.")
        continue

    try:
        dataset = load_dataset("allenai/madlad-400", languages=[madlad_code], split="clean", streaming=True)
    except Exception as e:
        print(f"Failed to stream {madlad_code}: {e}")
        continue

    buffer = []
    i = 0  # stays 0 when the stream yields nothing

    # The dataset is streamed, so a failure can surface in the middle of the
    # loop below. Write to a ".partial" sibling and rename only on success:
    # the skip check above treats any existing output file as complete, so a
    # truncated file must never appear under the final name.
    partial_path = output_path + ".partial"
    with open(partial_path, "w", encoding="utf-8") as f:
        for i, example in enumerate(dataset, 1):
            # Extract the text field from the MADLAD example
            if 'text' in example:
                # Create the standardized format with "text" field
                formatted_example = {"text": example['text']}
                buffer.append(json.dumps(formatted_example))
            else:
                print(f"Warning: No 'text' field found in example {i} for {madlad_code}")
                continue

            # write buffer to file every BUFFER_SIZE items
            if len(buffer) >= BUFFER_SIZE:
                f.write("\n".join(buffer) + "\n")
                buffer = []

        # write remaining items
        if buffer:
            f.write("\n".join(buffer) + "\n")

    # Publish the finished file under its final name
    os.replace(partial_path, output_path)

    print(f"Done: {output_path} (streamed {i} docs)")

Done: ../../scratch/data/data_pretrain/madlad400/afr_Latn_madlad.jsonl (streamed 868671 docs)
Done: ../../scratch/data/data_pretrain/madlad400/aka_Latn_madlad.jsonl (streamed 4768 docs)
Done: ../../scratch/data/data_pretrain/madlad400/amh_Ethi_madlad.jsonl (streamed 106301 docs)
Done: ../../scratch/data/data_pretrain/madlad400/bam_Latn_madlad.jsonl (streamed 702 docs)
Done: ../../scratch/data/data_pretrain/madlad400/dik_Latn_madlad.jsonl (streamed 611 docs)
Done: ../../scratch/data/data_pretrain/madlad400/dyu_Latn_madlad.jsonl (streamed 483 docs)
Done: ../../scratch/data/data_pretrain/madlad400/ewe_Latn_madlad.jsonl (streamed 4536 docs)
Done: ../../scratch/data/data_pretrain/madlad400/fon_Latn_madlad.jsonl (streamed 1065 docs)
Done: ../../scratch/data/data_pretrain/madlad400/fuv_Latn_madlad.jsonl (streamed 26 docs)
Done: ../../scratch/data/data_pretrain/madlad400/gaz_Latn_madlad.jsonl (streamed 18895 docs)
Done: ../../scratch/data/data_pretrain/madlad400/hau_Latn_madlad.jsonl (streamed

# Other Data

## Setswana (Marothodi)

In [11]:
# Output folder & path
output_dir = "../../scratch/data/data_pretrain/extradata"
os.makedirs(output_dir, exist_ok=True)
output_path = os.path.join(output_dir, "tsn_Latn.jsonl")

# Load the extra dataset
ds = load_dataset("OxxoCodes/Marothodi", split="train")

# Show the first example so its structure can be checked by eye
if len(ds) > 0:
    print("Marothodi example structure:", ds[0])


def _marothodi_text(example):
    """Return the text of one example: "text", else "sentence", else the record's string form."""
    if "text" in example:
        return example["text"]
    if "sentence" in example:
        return example["sentence"]
    return str(example)


if not os.path.exists(output_path):
    # File does not exist; write all from scratch
    with open(output_path, "w", encoding="utf-8") as out_f:
        for example in ds:
            out_f.write(json.dumps({"text": _marothodi_text(example)}) + "\n")

    print(f"Wrote all {len(ds)} examples to {output_path}")
else:
    # Resume: the number of lines already on disk is the number of examples done
    with open(output_path, "r", encoding="utf-8") as f:
        existing_lines = sum(1 for _ in f)

    if existing_lines >= len(ds):
        print(f"{output_path} already contains all {existing_lines} examples, skipping.")
    else:
        print(f"{output_path} has {existing_lines}/{len(ds)} examples — appending missing...")
        with open(output_path, "a", encoding="utf-8") as out_f:
            for i, example in enumerate(ds):
                # Examples before this index were written by an earlier run
                if i < existing_lines:
                    continue
                out_f.write(json.dumps({"text": _marothodi_text(example)}) + "\n")

        print(f"Appended {len(ds) - existing_lines} new examples to {output_path}")

Marothodi example structure: {'text': 'Ka goo Ruthe aya le Naomi Betlehema nageng ya Israele.', 'source': 'https://downloads.wortschatz-leipzig.de/corpora/tsn_community_2017.tar.gz', 'source-category': 'tsn_community_2017'}
../../scratch/data/data_pretrain/extradata/tsn_Latn.jsonl already contains all 152464 examples, skipping.


# Combine everything into LLaMA Factory format

In [2]:
import json
import os
from pathlib import Path
import glob
import gc  # Garbage collector

# Merge every per-source JSONL dump into one "<lang_code>_data.jsonl" file per
# language under `final_output_folder`, then write the dataset_info.json index
# that LLaMA Factory reads.

# Set up paths
base_dir = "/home/mila/x/xut/scratch/data/data_pretrain"
final_output_folder = os.path.join(base_dir, "llama_factory_data")
os.makedirs(final_output_folder, exist_ok=True)

# Dataset folders & suffixes
datasets = [
    ("fineweb2",   "_fw2.jsonl"),
    ("afrimgsm",   "_mgsm.jsonl"),
    ("mgsm",       "_mgsm.jsonl"),
    ("madlad400",  "_madlad.jsonl"),
    ("wura",       "_wura.jsonl"),
    ("extradata",  ".jsonl"),
    ("fineweb",    ".jsonl")
]

# Track statistics without keeping all data in memory
dataset_stats = {}
language_counts = {}

# Output path of the consolidated file for each language
language_files = {}

# Languages whose consolidated file has already been opened in this run.
# The first open truncates the file and later opens append, so re-running the
# cell rebuilds the outputs instead of duplicating every example.
started_languages = set()


def _extract_text(item):
    """Return the text payload of one parsed JSONL record, or None if there is none.

    A truthy "text" value is used as is. Otherwise the first string value
    longer than 10 characters is used. Records that are not JSON objects
    yield None.
    """
    if not isinstance(item, dict):
        return None
    if item.get('text'):
        return item['text']
    for value in item.values():
        if isinstance(value, str) and len(value) > 10:
            return value
    return None


print("Consolidating datasets...")

# Process each dataset folder
for folder_name, suffix in datasets:
    folder_path = os.path.join(base_dir, folder_name)

    if not os.path.exists(folder_path):
        print(f"Warning: Folder {folder_path} doesn't exist, skipping...")
        continue

    print(f"\nProcessing {folder_name} datasets...")

    # Find all matching JSONL files; sorted so the processing order is stable
    jsonl_pattern = os.path.join(folder_path, f"*{suffix}")
    jsonl_files = sorted(glob.glob(jsonl_pattern))

    if not jsonl_files:
        print(f"  No files matching pattern: {jsonl_pattern}")
        continue

    for file_path in jsonl_files:
        try:
            # The language code is the filename minus its trailing suffix
            # (slicing, so an earlier occurrence of the suffix is left alone)
            filename = os.path.basename(file_path)
            lang_code = filename[:-len(suffix)]

            print(f"  Reading {filename}...")

            language_counts.setdefault(lang_code, 0)
            language_files.setdefault(
                lang_code,
                os.path.join(final_output_folder, f"{lang_code}_data.jsonl"),
            )

            # Process line by line to avoid memory issues
            examples_processed = 0
            out_mode = 'a' if lang_code in started_languages else 'w'

            with open(file_path, 'r', encoding='utf-8') as in_file, \
                 open(language_files[lang_code], out_mode, encoding='utf-8') as out_file:
                started_languages.add(lang_code)

                for line_num, line in enumerate(in_file, 1):
                    try:
                        # Skip empty lines
                        if not line.strip():
                            continue

                        text = _extract_text(json.loads(line.strip()))
                        if text is not None:
                            # Write directly to output file, don't store in memory
                            out_file.write(json.dumps({"text": text}, ensure_ascii=False) + '\n')
                            examples_processed += 1

                        # Periodically report progress and clear memory
                        if line_num % 100000 == 0:
                            print(f"    Processed {line_num:,} lines...")
                            gc.collect()  # Force garbage collection

                    except json.JSONDecodeError:
                        print(f"    Warning: Skipping invalid JSON at line {line_num} in {filename}")
                        continue
                    except Exception as e:
                        print(f"    Error processing line {line_num} in {filename}: {e}")
                        continue

            # Update the count for this language
            language_counts[lang_code] += examples_processed
            print(f"    Added {examples_processed:,} examples from {filename}")

            # Force garbage collection after each file
            gc.collect()

        except Exception as e:
            print(f"    Error processing {file_path}: {e}")
            continue

# Create dataset info JSON for LLaMA Factory
print("\nCreating dataset_info.json...")
dataset_info = {}

for lang_code in language_counts:
    # Add entry to dataset info (LLaMA Factory format)
    dataset_info[f"{lang_code}_dataset"] = {
        "file_name": f"{lang_code}_data.jsonl",
        "columns": {
            "prompt": "text"  # Maps the "text" field to the "prompt" column for LLaMA Factory
        }
    }

# Save dataset info
dataset_info_path = os.path.join(final_output_folder, "dataset_info.json")
with open(dataset_info_path, 'w', encoding='utf-8') as f:
    json.dump(dataset_info, f, ensure_ascii=False, indent=2)

print("\n=== Summary ===")
print(f"Total languages: {len(language_counts)}")
total_examples = sum(language_counts.values())
print(f"Total examples: {total_examples:,}")
for lang, count in sorted(language_counts.items(), key=lambda x: x[1], reverse=True):
    print(f"  {lang}: {count:,} examples")
print(f"\nAll datasets saved to {final_output_folder}")
print(f"Dataset info saved to {dataset_info_path}")

Consolidating datasets...

Processing fineweb2 datasets...
  Reading aeb_Arab_fw2.jsonl...
    Processed 100,000 lines...
    Processed 200,000 lines...
    Added 262,884 examples from aeb_Arab_fw2.jsonl
  Reading afr_Latn_fw2.jsonl...
    Processed 100,000 lines...
    Processed 200,000 lines...
    Processed 300,000 lines...
    Processed 400,000 lines...
    Processed 500,000 lines...
    Processed 600,000 lines...
    Processed 700,000 lines...
    Processed 800,000 lines...
    Added 877,109 examples from afr_Latn_fw2.jsonl
  Reading amh_Ethi_fw2.jsonl...
    Processed 100,000 lines...
    Processed 200,000 lines...
    Added 280,355 examples from amh_Ethi_fw2.jsonl
  Reading arz_Arab_fw2.jsonl...
    Processed 100,000 lines...
    Processed 200,000 lines...
    Processed 300,000 lines...
    Processed 400,000 lines...
    Processed 500,000 lines...
    Processed 600,000 lines...
    Processed 700,000 lines...
    Processed 800,000 lines...
    Processed 900,000 lines...
    Proce

In [4]:
# Every language code considered in this notebook, in the form
# <3 lowercase letters>_<script>; the last four are the high-resource languages.
# Fixed two malformed entries: 'kik_Tatn' -> 'kik_Latn' (misspelled script)
# and 'Mos_Latn' -> 'mos_Latn' (language part must be lowercase like the rest).
all_language_list = [
    'aeb_Arab', 'afr_Latn', 'aka_Latn', 'amh_Ethi',
    'ary_Arab', 'arz_Arab', 'bam_Latn', 'bem_Latn',
    'cjk_Latn', 'dik_Latn', 'dyu_Latn', 'ewe_Latn',
    'fon_Latn', 'fuv_Latn', 'gaz_Latn', 'hau_Latn',
    'ibo_Latn', 'kab_Latn', 'kam_Latn', 'kbp_Latn',
    'kea_Latn', 'kik_Latn', 'kin_Latn', 'kmb_Latn',
    'knc_Arab', 'knc_Latn', 'kon_Latn', 'lin_Latn',
    'lua_Latn', 'lug_Latn', 'luo_Latn', 'mos_Latn',
    'nqo_Nkoo', 'nso_Latn', 'nus_Latn', 'nya_Latn',
    'plt_Latn', 'run_Latn', 'sag_Latn', 'sna_Latn',
    'som_Latn', 'sot_Latn', 'ssw_Latn', 'swc_Latn',
    'taq_Latn', 'taq_Tfng', 'tir_Ethi', 'tsn_Latn',
    'tso_Latn', 'tum_Latn', 'twi_Latn', 'tzm_Tfng',
    'umb_Latn', 'wol_Latn', 'xho_Latn', 'yor_Latn',
    'zul_Latn',
    'eng_Latn', 'spa_Latn', 'fra_Latn', 'por_Latn',
]


In [5]:
import os
import gzip
import csv
import shutil
import io

# ─── Configuration ────────────────────────────────────────────────────────────
# Directory that is scanned for *.jsonl files.
base_dir = "/home/mila/x/xut/scratch/data/data_pretrain/llama_factory_data"
# Report file; a relative path, so it is created in the current working directory.
output_csv = "jsonl_file_size_comparison.csv"
# 2 GiB in bytes. NOTE(review): not referenced in this cell; presumably a size cap used later.
MAX_SIZE_BYTES = 2 * 1024 ** 3

# ─── Helpers ───────────────────────────────────────────────────────────────────
def get_readable_size(bytesize):
    """Format a byte count as a string with two decimals and a binary unit.

    The value is divided by 1024 until it drops below 1024 or the unit list
    (B, KB, MB, GB, TB) is exhausted; anything larger is reported in PB.
    """
    unit_names = ('B', 'KB', 'MB', 'GB', 'TB')
    scaled = bytesize
    position = 0
    while position < len(unit_names):
        if scaled < 1024:
            return f"{scaled:.2f} {unit_names[position]}"
        scaled = scaled / 1024
        position += 1
    # Fell off the end of the unit table: whatever remains is in petabytes.
    return f"{scaled:.2f} PB"

class _ByteCounter(io.RawIOBase):
    """A write-only stream that discards its input and counts the bytes written.

    `write` adds the length of each buffer to `count` and reports the whole
    buffer as consumed; `tell` returns the running total.
    """

    def __init__(self):
        super().__init__()
        self.count = 0

    def writable(self):
        """Report that the stream accepts writes.

        The inherited default is False, which contradicts having a working
        `write` and makes wrappers such as `io.BufferedWriter` reject the stream.
        """
        return True

    def write(self, b):
        """Count `len(b)` bytes and return that number; nothing is stored."""
        n = len(b)
        self.count += n
        return n

    def tell(self):
        """Return the total number of bytes written so far."""
        return self.count

def get_gzip_size(filepath):
    """Return the number of bytes `filepath` occupies after gzip compression.

    The file is read in binary mode and pushed through a `gzip.GzipFile` whose
    output goes to a `_ByteCounter`, so only the running total is retained.
    """
    chunk_bytes = 1024 * 1024
    sink = _ByteCounter()
    with open(filepath, 'rb') as raw_file:
        with gzip.GzipFile(fileobj=sink, mode='wb') as compressor:
            # Chunked copy: the input is read and compressed 1 MiB at a time.
            shutil.copyfileobj(raw_file, compressor, length=chunk_bytes)
    # Read the total only after the GzipFile context has exited and finished writing.
    return sink.tell()

def count_jsonl_lines(filepath):
    """Count the examples in a JSONL file, i.e. its non-blank lines.

    The file is read as UTF-8 text, line by line. Lines that are empty or
    whitespace-only are skipped, so stray blank lines (for instance trailing
    newlines at the end of the file) do not inflate the example count.

    Returns:
        int: number of lines containing at least one non-whitespace character.
    """
    with open(filepath, 'r', encoding='utf-8') as f:
        return sum(1 for line in f if line.strip())

# ─── Main Loop ─────────────────────────────────────────────────────────────────
# One report row per *.jsonl file found directly in base_dir, in sorted filename order.
rows = []
for filename in sorted(os.listdir(base_dir)):
    if not filename.endswith(".jsonl"):
        continue

    filepath = os.path.join(base_dir, filename)

    # Only the file-reading calls are guarded: a file that cannot be read is
    # reported and skipped so the remaining files are still measured.
    try:
        size_bytes = os.path.getsize(filepath)
        num_lines = count_jsonl_lines(filepath)
        size_gzip_bytes = get_gzip_size(filepath)
    except Exception as e:
        print(f"Failed for {filepath}: {e}")
        continue

    # An empty file has no meaningful ratio; without this guard the division
    # raised ZeroDivisionError and the file silently vanished from the report.
    if size_bytes:
        compression_ratio = f"{(size_gzip_bytes / size_bytes * 100):.1f}%"
    else:
        compression_ratio = "n/a"

    size_human = get_readable_size(size_bytes)
    gzip_human = get_readable_size(size_gzip_bytes)

    rows.append([
        filename,
        size_human,
        size_bytes,
        gzip_human,
        size_gzip_bytes,
        compression_ratio,
        num_lines
    ])

    print(
        f"✓ {filename} → "
        f"{size_human} ({size_bytes} B) → "
        f"gzip {gzip_human} → "
        f"{num_lines} examples"
    )

# ─── Write to CSV ───────────────────────────────────────────────────────────────
# Column titles, in the same order as the values stored in each entry of `rows`.
csv_header = [
    "File",
    "Size (Human)",
    "Size (Bytes)",
    "Gzipped Size (Human)",
    "Gzipped Size (Bytes)",
    "Compression Ratio",
    "Number of Examples",
]

# newline='' leaves line-ending handling to the csv module.
with open(output_csv, mode="w", newline='', encoding='utf-8') as csv_file:
    report = csv.writer(csv_file)
    report.writerows([csv_header, *rows])

print(f"\n📊 Comparison saved as: {output_csv}")


✓ aeb_Arab_data.jsonl → 594.40 MB (623273088 B) → gzip 147.30 MB → 262884 examples
✓ afr_Latn_data.jsonl → 11.64 GB (12496774779 B) → gzip 4.45 GB → 4136664 examples
✓ aka_Latn_data.jsonl → 25.76 MB (27014100 B) → gzip 9.01 MB → 4768 examples
✓ amh_Ethi_data.jsonl → 3.75 GB (4027695752 B) → gzip 1.04 GB → 677690 examples
✓ arz_Arab_data.jsonl → 3.88 GB (4165053531 B) → gzip 1.09 GB → 2526168 examples
✓ bam_Latn_data.jsonl → 29.93 MB (31383088 B) → gzip 7.06 MB → 14746 examples
✓ bem_Latn_data.jsonl → 3.97 MB (4157790 B) → gzip 1.37 MB → 1143 examples
✓ cjk_Latn_data.jsonl → 81.79 KB (83756 B) → gzip 24.13 KB → 44 examples
✓ dik_Latn_data.jsonl → 3.17 MB (3328890 B) → gzip 1.07 MB → 611 examples
✓ dyu_Latn_data.jsonl → 6.53 MB (6845028 B) → gzip 2.00 MB → 2692 examples
✓ eng_Latn_data.jsonl → 4.23 GB (4547144294 B) → gzip 1.67 GB → 1465688 examples
✓ ewe_Latn_data.jsonl → 34.98 MB (36682212 B) → gzip 11.73 MB → 4544 examples
✓ fon_Latn_data.jsonl → 9.79 MB (10262740 B) → gzip 3.07 MB → 