# Datasets

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from datasets import load_dataset, load_from_disk, Dataset, DatasetDict
import json
from tqdm import tqdm
from datatrove.pipeline.readers import ParquetReader
from transformers import AutoTokenizer
from pathlib import Path
from collections import defaultdict
from datatrove.pipeline.readers import ParquetReader

  from .autonotebook import tqdm as notebook_tqdm
None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


In [2]:
print("Virtual Env Name: ", os.environ.get("VIRTUAL_ENV"))

Virtual Env Name:  /Users/tianyixu/Documents/github_projects/data_pretrain/venv


In [3]:
# Defining the names of all languages

african_language_list = [
    'aeb_Arab',
    'afr_Latn',
    'aka_Latn',
    'amh_Ethi',
    'ary_Arab',
    'arz_Arab',
    'bam_Latn',
    'bem_Latn',
    'cjk_Latn',
    'dik_Latn',
    'dyu_Latn',
    'ewe_Latn',
    'fon_Latn',
    'fuv_Latn',
    'gaz_Latn',
    'hau_Latn',
    'ibo_Latn',
    'kab_Latn',
    'kam_Latn',
    'kbp_Latn',
    'kea_Latn',
    'kik_Latn',
    'kin_Latn',
    'kmb_Latn',
    'knc_Arab',
    'knc_Latn',
    'kon_Latn',
    'lin_Latn',
    'lua_Latn',
    'lug_Latn',
    'luo_Latn',
    'mos_Latn',
    'nqo_Nkoo',
    'nso_Latn',
    'nus_Latn',
    'nya_Latn',
    'plt_Latn',
    'run_Latn',
    'sag_Latn',
    'sna_Latn',
    'som_Latn',
    'sot_Latn',
    'ssw_Latn',
    'swh_Latn',
    'taq_Latn',
    'taq_Tfng',
    'tir_Ethi',
    'tsn_Latn',
    'tso_Latn',
    'tum_Latn',
    'twi_Latn',
    'tzm_Tfng',
    'umb_Latn',
    'wol_Latn',
    'xho_Latn',
    'yor_Latn',
    'zul_Latn',
]

high_lang_list = {
    'eng_Latn',
    'fra_Latn',
    'por_Latn',
    'arb_Arab'
    }

In [4]:
# Dealing with Wura Dataset mapping

# Wura dataset follows the flores 200 mapping
flores_200_mapping = {
'afr_Latn': 'afr',
'amh_Ethi': 'amh',
'arb_Arab': 'ara',
'asm_Beng': 'asm',
'ast_Latn': 'ast',
'azj_Latn': 'azj',
'arz_Arab': 'arz',
'bel_Cyrl': 'bel',
'ben_Beng': 'ben',
'bos_Latn': 'bos',
'bul_Cyrl': 'bul',
'cat_Latn': 'cat',
'ceb_Latn': 'ceb',
'ces_Latn': 'ces',
'ckb_Arab': 'ckb',
'cym_Latn': 'cym',
'dan_Latn': 'dan',
'deu_Latn': 'deu',
'ell_Grek': 'ell',
'eng_Latn': 'eng',
'est_Latn': 'est',
'fin_Latn': 'fin',
'fra_Latn': 'fra',
'fuv_Latn': 'ful',
'gaz_Latn': 'gaz',
'gle_Latn': 'gle',
'glg_Latn': 'glg',
'guj_Gujr': 'guj',
'hau_Latn': 'hau',
'heb_Hebr': 'heb',
'hin_Deva': 'hin',
'hrv_Latn': 'hrv',
'hun_Latn': 'hun',
'hye_Armn': 'hye',
'ibo_Latn': 'ibo',
'ind_Latn': 'ind',
'isl_Latn': 'isl',
'ita_Latn': 'ita',
'jav_Latn': 'jav',
'jpn_Jpan': 'jpn',
'kam_Latn': 'kam',
'kan_Knda': 'kan',
'kat_Geor': 'kat',
'kaz_Cyrl': 'kaz',
'khm_Khmr': 'khm',
'kir_Cyrl': 'kir',
'kin_Latn': 'kin',
'kor_Hang': 'kor',
'lao_Laoo': 'lao',
'lij_Latn': 'Latvian',
'lim_Latn': 'kea',
'lin_Latn': 'lin',
'lit_Latn': 'lit',
'ltz_Latn': 'ltz',
'lug_Latn': 'lug',
'luo_Latn': 'luo',
'lvs_Latn': 'lav',
'mal_Mlym': 'mal',
'mar_Deva': 'mar',
'mkd_Cyrl': 'mkd',
'mlt_Latn': 'mlt',
'khk_Cyrl': 'mon',
'mri_Latn': 'mri',
'mya_Mymr': 'mya',
'nld_Latn': 'nld',
'nob_Latn': 'nob',
'npi_Deva': 'npi',
'nso_Latn': 'nso',
'nya_Latn': 'nya',
'oci_Latn': 'oci',
'gaz_Latn': 'orm',
'ory_Orya': 'ory',
'pan_Guru': 'pan',
'pes_Arab': 'fas',
'pol_Latn': 'pol',
'por_Latn': 'por',
'pbt_Arab': 'pus',
'plt_Latn': 'plt',
'ron_Latn': 'ron',
'rus_Cyrl': 'rus',
'slk_Latn': 'slk',
'sna_Latn': 'sna',
'snd_Arab': 'snd',
'som_Latn': 'som',
'spa_Latn': 'spa',
'srp_Cyrl': 'srp',
'swc_Latn': 'swc',
'swe_Latn': 'swe',
'swh_Latn': 'swa',
'tam_Taml': 'tam',
'tel_Telu': 'tel',
'tgk_Cyrl': 'tgk',
'tir_Ethi': 'tir',
'tgl_Latn': 'tgl',
'tha_Thai': 'tha',
'tur_Latn': 'tur',
'ukr_Cyrl': 'ukr',
'umb_Latn': 'umb',
'urd_Arab': 'urd',
'uzn_Latn': 'uzb',
'vie_Latn': 'vie',
'wol_Latn': 'wol',
'xho_Latn': 'xho',
'yor_Latn': 'yor',
'zho_Hans': 'zho_simpl',
'zho_Hant': 'zho_trad',
'zsm_Latn': 'msa',
'zul_Latn': 'zul'}


# List of all wura languages,
wura_langs = [
    "afr", "amh", "arz", "eng", "fra", "hau", "ibo", "kin",
    "mlg", "nya", "orm", "por", "sna", "som", "sot",
    "swa", "tir", "xho", "yor", "zul"
]


In [5]:
# Defining the madlad 400 mapping

afri_madlad_langs = {
    "afr_Latn": "af",
    "aka_Latn": "ak",
    "amh_Ethi": "am",
    "bam_Latn": "bm",
    "dik_Latn": "din",
    "dyu_Latn": "dyu",
    "ewe_Latn": "ee",
    "fon_Latn": "fon",
    "fuv_Latn": "ff",
    "gaz_Latn": "om",   
    "hau_Latn": "ha",  
    "ibo_Latn": "ig",   
    "kbp_Latn": "kbp",
    "kin_Latn": "rw",
    "kmb_Latn": "kmb",
    "kon_Latn": "kg",
    "lin_Latn": "ln",
    "lug_Latn": "lg",
    "run_Latn": "rn",
    "sag_Latn": "sg",
    "sna_Latn": "sn",
    "som_Latn": "so",
    "sot_Latn": "st",
    "ssw_Latn": "ss",
    "swh_Latn": "sw",
    "tir_Ethi": "ti",
    "tsn_Latn": "tn",
    "tso_Latn": "ts",
    "tzm_Tfng": "ber",
    "wol_Latn": "wo",
    "xho_Latn": "xh",
    "yor_Latn": "yo",
    "zul_Latn": "zu"
}

hr_madlad_langs = {
    "eng_Latn": "en",
    "fra_Latn": "fr",
    "por_Latn": "pt",
    "arb_Arab": "ar"
}

In [6]:
# MGSM like mapping


afri_mgsm_langs = {
    "amh_Ethi": "amh",
    "ewe_Latn": "ewe",
    "gaz_Latn": "orm",  
    "hau_Latn": "hau",
    "ibo_Latn": "ibo",
    "kin_Latn": "kin",
    "lin_Latn": "lin",
    "lug_Latn": "lug",
    "sna_Latn": "sna",
    "swh_Latn": "swa",
    "sot_Latn": "sot",
    "twi_Latn": "twi",
    "wol_Latn": "wol",
    "xho_Latn": "xho",
    "yor_Latn": "yor",
    "zul_Latn": "zul"
}

mgsm_langs = {
    "eng_Latn": "en",
    "fra_Latn": "fr"
}

# Data folder Creation

In [None]:
# Create folder for each language in all languages
for lang in african_language_list:
    os.makedirs(f"data/{lang}", exist_ok=True)

for lang in high_lang_list:
    os.makedirs(f"data/{lang}", exist_ok=True)

# Fineweb 2

In [7]:
# Inspect dataset

dataset = load_dataset(
    "parquet",
    data_files={"train": "hf://datasets/HuggingFaceFW/fineweb-2/data/por_Latn/train/000_00000.parquet"},
    split="train",
    streaming=True
)

print("Available columns:", dataset.features)


Available columns: {'text': Value(dtype='string', id=None), 'id': Value(dtype='string', id=None), 'dump': Value(dtype='string', id=None), 'url': Value(dtype='string', id=None), 'date': Value(dtype='string', id=None), 'file_path': Value(dtype='string', id=None), 'language': Value(dtype='string', id=None), 'language_score': Value(dtype='float64', id=None), 'language_script': Value(dtype='string', id=None), 'minhash_cluster_size': Value(dtype='int64', id=None), 'top_langs': Value(dtype='string', id=None)}


## Crafting African Language Set that is part of FW2

In [33]:
from datatrove.pipeline.readers import ParquetReader

afri_fw2_existing_langs = []

for lang_code in african_language_list:
    path = f"hf://datasets/HuggingFaceFW/fineweb-2/data/{lang_code}/train"
    try:
        reader = ParquetReader(path, limit=1)  # Try to load just 1 file
        _ = next(reader())                     # Trigger read
        afri_fw2_existing_langs.append(lang_code)
        print(f"Exists: {lang_code}")
    except Exception as e:
        print(f" Missing: {lang_code}")

print("\nAvailable languages:", afri_fw2_existing_langs)

[32m2025-05-12 12:05:06.092[0m | [1mINFO    [0m | [36mdatatrove.pipeline.readers.base[0m:[36mread_files_shard[0m:[36m201[0m - [1mReading input file 000_00000.parquet, 1/1[0m
[32m2025-05-12 12:05:06.856[0m | [1mINFO    [0m | [36mdatatrove.pipeline.readers.base[0m:[36mread_files_shard[0m:[36m201[0m - [1mReading input file 000_00000.parquet, 1/1[0m


Exists: aeb_Arab
Exists: afr_Latn


[32m2025-05-12 12:05:08.033[0m | [1mINFO    [0m | [36mdatatrove.pipeline.readers.base[0m:[36mread_files_shard[0m:[36m201[0m - [1mReading input file 000_00000.parquet, 1/1[0m


 Missing: aka_Latn


[32m2025-05-12 12:05:08.532[0m | [1mINFO    [0m | [36mdatatrove.pipeline.readers.base[0m:[36mread_files_shard[0m:[36m201[0m - [1mReading input file 000_00000.parquet, 1/2[0m


Exists: amh_Ethi


[32m2025-05-12 12:05:09.764[0m | [1mINFO    [0m | [36mdatatrove.pipeline.readers.base[0m:[36mread_files_shard[0m:[36m201[0m - [1mReading input file 000_00000.parquet, 1/1[0m


Exists: ary_Arab


[32m2025-05-12 12:05:10.978[0m | [1mINFO    [0m | [36mdatatrove.pipeline.readers.base[0m:[36mread_files_shard[0m:[36m201[0m - [1mReading input file 000_00000.parquet, 1/1[0m


Exists: arz_Arab


[32m2025-05-12 12:05:11.301[0m | [1mINFO    [0m | [36mdatatrove.pipeline.readers.base[0m:[36mread_files_shard[0m:[36m201[0m - [1mReading input file 000_00000.parquet, 1/1[0m


Exists: bam_Latn


[32m2025-05-12 12:05:11.513[0m | [1mINFO    [0m | [36mdatatrove.pipeline.readers.base[0m:[36mread_files_shard[0m:[36m201[0m - [1mReading input file 000_00000.parquet, 1/1[0m
[32m2025-05-12 12:05:11.586[0m | [1mINFO    [0m | [36mdatatrove.pipeline.readers.base[0m:[36mread_files_shard[0m:[36m201[0m - [1mReading input file 000_00000.parquet, 1/1[0m


Exists: bem_Latn
Exists: cjk_Latn


[32m2025-05-12 12:05:11.796[0m | [1mINFO    [0m | [36mdatatrove.pipeline.readers.base[0m:[36mread_files_shard[0m:[36m201[0m - [1mReading input file 000_00000.parquet, 1/1[0m
[32m2025-05-12 12:05:11.995[0m | [1mINFO    [0m | [36mdatatrove.pipeline.readers.base[0m:[36mread_files_shard[0m:[36m201[0m - [1mReading input file 000_00000.parquet, 1/1[0m


Exists: dik_Latn
Exists: dyu_Latn


[32m2025-05-12 12:05:12.366[0m | [1mINFO    [0m | [36mdatatrove.pipeline.readers.base[0m:[36mread_files_shard[0m:[36m201[0m - [1mReading input file 000_00000.parquet, 1/1[0m


Exists: ewe_Latn


[32m2025-05-12 12:05:12.693[0m | [1mINFO    [0m | [36mdatatrove.pipeline.readers.base[0m:[36mread_files_shard[0m:[36m201[0m - [1mReading input file 000_00000.parquet, 1/1[0m


Exists: fon_Latn


[32m2025-05-12 12:05:13.361[0m | [1mINFO    [0m | [36mdatatrove.pipeline.readers.base[0m:[36mread_files_shard[0m:[36m201[0m - [1mReading input file 000_00000.parquet, 1/1[0m


Exists: fuv_Latn


[32m2025-05-12 12:05:13.922[0m | [1mINFO    [0m | [36mdatatrove.pipeline.readers.base[0m:[36mread_files_shard[0m:[36m201[0m - [1mReading input file 000_00000.parquet, 1/1[0m


Exists: gaz_Latn
 Missing: hau_Latn


[32m2025-05-12 12:05:14.773[0m | [1mINFO    [0m | [36mdatatrove.pipeline.readers.base[0m:[36mread_files_shard[0m:[36m201[0m - [1mReading input file 000_00000.parquet, 1/1[0m


Exists: ibo_Latn


[32m2025-05-12 12:05:15.075[0m | [1mINFO    [0m | [36mdatatrove.pipeline.readers.base[0m:[36mread_files_shard[0m:[36m201[0m - [1mReading input file 000_00000.parquet, 1/1[0m
[32m2025-05-12 12:05:15.258[0m | [1mINFO    [0m | [36mdatatrove.pipeline.readers.base[0m:[36mread_files_shard[0m:[36m201[0m - [1mReading input file 000_00000.parquet, 1/1[0m


Exists: kab_Latn
Exists: kam_Latn


[32m2025-05-12 12:05:15.513[0m | [1mINFO    [0m | [36mdatatrove.pipeline.readers.base[0m:[36mread_files_shard[0m:[36m201[0m - [1mReading input file 000_00000.parquet, 1/1[0m


Exists: kbp_Latn


[32m2025-05-12 12:05:15.896[0m | [1mINFO    [0m | [36mdatatrove.pipeline.readers.base[0m:[36mread_files_shard[0m:[36m201[0m - [1mReading input file 000_00000.parquet, 1/1[0m


Exists: kea_Latn


[32m2025-05-12 12:05:16.149[0m | [1mINFO    [0m | [36mdatatrove.pipeline.readers.base[0m:[36mread_files_shard[0m:[36m201[0m - [1mReading input file 000_00000.parquet, 1/1[0m


Exists: kik_Latn


[32m2025-05-12 12:05:16.631[0m | [1mINFO    [0m | [36mdatatrove.pipeline.readers.base[0m:[36mread_files_shard[0m:[36m201[0m - [1mReading input file 000_00000.parquet, 1/1[0m


Exists: kin_Latn


[32m2025-05-12 12:05:16.843[0m | [1mINFO    [0m | [36mdatatrove.pipeline.readers.base[0m:[36mread_files_shard[0m:[36m201[0m - [1mReading input file 000_00000.parquet, 1/1[0m


Exists: kmb_Latn


[32m2025-05-12 12:05:17.215[0m | [1mINFO    [0m | [36mdatatrove.pipeline.readers.base[0m:[36mread_files_shard[0m:[36m201[0m - [1mReading input file 000_00000.parquet, 1/1[0m


Exists: knc_Arab
Exists: knc_Latn


[32m2025-05-12 12:05:17.537[0m | [1mINFO    [0m | [36mdatatrove.pipeline.readers.base[0m:[36mread_files_shard[0m:[36m201[0m - [1mReading input file 000_00000.parquet, 1/1[0m


 Missing: kon_Latn


[32m2025-05-12 12:05:17.964[0m | [1mINFO    [0m | [36mdatatrove.pipeline.readers.base[0m:[36mread_files_shard[0m:[36m201[0m - [1mReading input file 000_00000.parquet, 1/1[0m
[32m2025-05-12 12:05:18.138[0m | [1mINFO    [0m | [36mdatatrove.pipeline.readers.base[0m:[36mread_files_shard[0m:[36m201[0m - [1mReading input file 000_00000.parquet, 1/1[0m


Exists: lin_Latn
Exists: lua_Latn


[32m2025-05-12 12:05:18.506[0m | [1mINFO    [0m | [36mdatatrove.pipeline.readers.base[0m:[36mread_files_shard[0m:[36m201[0m - [1mReading input file 000_00000.parquet, 1/1[0m


Exists: lug_Latn


[32m2025-05-12 12:05:18.747[0m | [1mINFO    [0m | [36mdatatrove.pipeline.readers.base[0m:[36mread_files_shard[0m:[36m201[0m - [1mReading input file 000_00000.parquet, 1/1[0m


Exists: luo_Latn


[32m2025-05-12 12:05:19.025[0m | [1mINFO    [0m | [36mdatatrove.pipeline.readers.base[0m:[36mread_files_shard[0m:[36m201[0m - [1mReading input file 000_00000.parquet, 1/1[0m
[32m2025-05-12 12:05:19.189[0m | [1mINFO    [0m | [36mdatatrove.pipeline.readers.base[0m:[36mread_files_shard[0m:[36m201[0m - [1mReading input file 000_00000.parquet, 1/1[0m


Exists: mos_Latn
Exists: nqo_Nkoo


[32m2025-05-12 12:05:19.581[0m | [1mINFO    [0m | [36mdatatrove.pipeline.readers.base[0m:[36mread_files_shard[0m:[36m201[0m - [1mReading input file 000_00000.parquet, 1/1[0m
[32m2025-05-12 12:05:19.692[0m | [1mINFO    [0m | [36mdatatrove.pipeline.readers.base[0m:[36mread_files_shard[0m:[36m201[0m - [1mReading input file 000_00000.parquet, 1/1[0m


Exists: nso_Latn
Exists: nus_Latn


[32m2025-05-12 12:05:20.183[0m | [1mINFO    [0m | [36mdatatrove.pipeline.readers.base[0m:[36mread_files_shard[0m:[36m201[0m - [1mReading input file 000_00000.parquet, 1/1[0m


Exists: nya_Latn


[32m2025-05-12 12:05:20.719[0m | [1mINFO    [0m | [36mdatatrove.pipeline.readers.base[0m:[36mread_files_shard[0m:[36m201[0m - [1mReading input file 000_00000.parquet, 1/1[0m


Exists: plt_Latn


[32m2025-05-12 12:05:21.100[0m | [1mINFO    [0m | [36mdatatrove.pipeline.readers.base[0m:[36mread_files_shard[0m:[36m201[0m - [1mReading input file 000_00000.parquet, 1/1[0m


Exists: run_Latn


[32m2025-05-12 12:05:21.482[0m | [1mINFO    [0m | [36mdatatrove.pipeline.readers.base[0m:[36mread_files_shard[0m:[36m201[0m - [1mReading input file 000_00000.parquet, 1/1[0m


Exists: sag_Latn


[32m2025-05-12 12:05:21.896[0m | [1mINFO    [0m | [36mdatatrove.pipeline.readers.base[0m:[36mread_files_shard[0m:[36m201[0m - [1mReading input file 000_00000.parquet, 1/1[0m


Exists: sna_Latn


[32m2025-05-12 12:05:22.455[0m | [1mINFO    [0m | [36mdatatrove.pipeline.readers.base[0m:[36mread_files_shard[0m:[36m201[0m - [1mReading input file 000_00000.parquet, 1/1[0m


Exists: som_Latn


[32m2025-05-12 12:05:22.898[0m | [1mINFO    [0m | [36mdatatrove.pipeline.readers.base[0m:[36mread_files_shard[0m:[36m201[0m - [1mReading input file 000_00000.parquet, 1/1[0m


Exists: sot_Latn


[32m2025-05-12 12:05:23.163[0m | [1mINFO    [0m | [36mdatatrove.pipeline.readers.base[0m:[36mread_files_shard[0m:[36m201[0m - [1mReading input file 000_00000.parquet, 1/1[0m


Exists: ssw_Latn


[32m2025-05-12 12:05:23.797[0m | [1mINFO    [0m | [36mdatatrove.pipeline.readers.base[0m:[36mread_files_shard[0m:[36m201[0m - [1mReading input file 000_00000.parquet, 1/1[0m


Exists: swh_Latn


[32m2025-05-12 12:05:24.029[0m | [1mINFO    [0m | [36mdatatrove.pipeline.readers.base[0m:[36mread_files_shard[0m:[36m201[0m - [1mReading input file 000_00000.parquet, 1/1[0m
[32m2025-05-12 12:05:24.172[0m | [1mINFO    [0m | [36mdatatrove.pipeline.readers.base[0m:[36mread_files_shard[0m:[36m201[0m - [1mReading input file 000_00000.parquet, 1/1[0m


Exists: taq_Latn
Exists: taq_Tfng


[32m2025-05-12 12:05:24.645[0m | [1mINFO    [0m | [36mdatatrove.pipeline.readers.base[0m:[36mread_files_shard[0m:[36m201[0m - [1mReading input file 000_00000.parquet, 1/1[0m


Exists: tir_Ethi


[32m2025-05-12 12:05:25.058[0m | [1mINFO    [0m | [36mdatatrove.pipeline.readers.base[0m:[36mread_files_shard[0m:[36m201[0m - [1mReading input file 000_00000.parquet, 1/1[0m


Exists: tsn_Latn


[32m2025-05-12 12:05:25.434[0m | [1mINFO    [0m | [36mdatatrove.pipeline.readers.base[0m:[36mread_files_shard[0m:[36m201[0m - [1mReading input file 000_00000.parquet, 1/1[0m
[32m2025-05-12 12:05:25.586[0m | [1mINFO    [0m | [36mdatatrove.pipeline.readers.base[0m:[36mread_files_shard[0m:[36m201[0m - [1mReading input file 000_00000.parquet, 1/1[0m


Exists: tso_Latn
Exists: tum_Latn


[32m2025-05-12 12:05:25.954[0m | [1mINFO    [0m | [36mdatatrove.pipeline.readers.base[0m:[36mread_files_shard[0m:[36m201[0m - [1mReading input file 000_00000.parquet, 1/1[0m


Exists: twi_Latn


[32m2025-05-12 12:05:26.197[0m | [1mINFO    [0m | [36mdatatrove.pipeline.readers.base[0m:[36mread_files_shard[0m:[36m201[0m - [1mReading input file 000_00000.parquet, 1/1[0m
[32m2025-05-12 12:05:26.335[0m | [1mINFO    [0m | [36mdatatrove.pipeline.readers.base[0m:[36mread_files_shard[0m:[36m201[0m - [1mReading input file 000_00000.parquet, 1/1[0m


Exists: tzm_Tfng
Exists: umb_Latn


[32m2025-05-12 12:05:26.734[0m | [1mINFO    [0m | [36mdatatrove.pipeline.readers.base[0m:[36mread_files_shard[0m:[36m201[0m - [1mReading input file 000_00000.parquet, 1/1[0m


Exists: wol_Latn


[32m2025-05-12 12:05:27.143[0m | [1mINFO    [0m | [36mdatatrove.pipeline.readers.base[0m:[36mread_files_shard[0m:[36m201[0m - [1mReading input file 000_00000.parquet, 1/1[0m


Exists: xho_Latn


[32m2025-05-12 12:05:27.581[0m | [1mINFO    [0m | [36mdatatrove.pipeline.readers.base[0m:[36mread_files_shard[0m:[36m201[0m - [1mReading input file 000_00000.parquet, 1/1[0m


Exists: yor_Latn
Exists: zul_Latn

Available languages: ['aeb_Arab', 'afr_Latn', 'amh_Ethi', 'ary_Arab', 'arz_Arab', 'bam_Latn', 'bem_Latn', 'cjk_Latn', 'dik_Latn', 'dyu_Latn', 'ewe_Latn', 'fon_Latn', 'fuv_Latn', 'gaz_Latn', 'ibo_Latn', 'kab_Latn', 'kam_Latn', 'kbp_Latn', 'kea_Latn', 'kik_Latn', 'kin_Latn', 'kmb_Latn', 'knc_Arab', 'knc_Latn', 'lin_Latn', 'lua_Latn', 'lug_Latn', 'luo_Latn', 'mos_Latn', 'nqo_Nkoo', 'nso_Latn', 'nus_Latn', 'nya_Latn', 'plt_Latn', 'run_Latn', 'sag_Latn', 'sna_Latn', 'som_Latn', 'sot_Latn', 'ssw_Latn', 'swh_Latn', 'taq_Latn', 'taq_Tfng', 'tir_Ethi', 'tsn_Latn', 'tso_Latn', 'tum_Latn', 'twi_Latn', 'tzm_Tfng', 'umb_Latn', 'wol_Latn', 'xho_Latn', 'yor_Latn', 'zul_Latn']


## Crafting High Resource Language Set that is part of FW2

In [8]:
from datatrove.pipeline.readers import ParquetReader

hr_fw2_existing_langs = []

for lang_code in high_lang_list:
    path = f"hf://datasets/HuggingFaceFW/fineweb-2/data/{lang_code}/train"
    try:
        reader = ParquetReader(path, limit=1)  # Try to load just 1 file
        _ = next(reader())                     # Trigger read
        hr_fw2_existing_langs.append(lang_code)
        print(f"Exists: {lang_code}")
    except Exception as e:
        print(f" Missing: {lang_code}")

print("\nAvailable languages:", hr_fw2_existing_langs)

[32m2025-05-15 11:50:25.858[0m | [1mINFO    [0m | [36mdatatrove.pipeline.readers.base[0m:[36mread_files_shard[0m:[36m201[0m - [1mReading input file 000_00000.parquet, 1/116[0m
[32m2025-05-15 11:50:28.494[0m | [1mINFO    [0m | [36mdatatrove.pipeline.readers.base[0m:[36mread_files_shard[0m:[36m201[0m - [1mReading input file 000_00000.parquet, 1/60[0m


Exists: fra_Latn
Exists: por_Latn


[32m2025-05-15 11:50:29.910[0m | [1mINFO    [0m | [36mdatatrove.pipeline.readers.base[0m:[36mread_files_shard[0m:[36m201[0m - [1mReading input file 000_00000.parquet, 1/25[0m


 Missing: eng_Latn
Exists: arb_Arab

Available languages: ['fra_Latn', 'por_Latn', 'arb_Arab']


## Downloading African Language Datasets for FW2

In [None]:
import os
import fsspec
from tqdm import tqdm

def download_fw2_parquet(lang_code: str, save_dir: str):
    """
    Download the FineWeb-2 train split for one language **unchanged**
    (same .parquet files, no decoding) and store them under `save_dir/lang_code/`.
    """
    remote = f"hf://datasets/HuggingFaceFW/fineweb-2/data/{lang_code}/train"
    fs, _ = fsspec.core.url_to_fs(remote)          # fsspec handles the HF filesystem
    os.makedirs(save_dir, exist_ok=True)

    # Download all parquet files there, because its african languages so I download all of them
    for idx, path in enumerate(tqdm(fs.glob(f"{remote}/*.parquet"),
                                    desc=f"Downloading {lang_code}"), start=1):
        fname      = f"{lang_code}_{idx:04d}_fw2.parquet"     # File name structure: afr_Latn_0001.parquet, ...
        local_path = os.path.join(save_dir, fname)
        fs.get(path, local_path)  # Download the file as is


# Loop through all languages and download the parquet files
for lang_code in afri_fw2_existing_langs:
    save_dir = f"data/{lang_code}"
    download_fw2_parquet(lang_code, save_dir)


## Downloading first 1BT for High Resource Languages in FW2

In [10]:
def download_limited_token_dataset(
    lang_code: str,
    hf_token: str,
    *,
    # existing parameters
    target_token_count: int = 100,
    batch_size: int = 3,
    tokenizer_name: str = "google/gemma-3-1b-it",
    # new parameters to support both loaders
    loader: str = "parquet",                        # or the HF repo id
    loader_kwargs: dict = None,                     # e.g. {"name":"sample-10BT","split":"train"}
    suffix: str = "fw2",                            # file suffix for shards
    data_root: str = "hf://datasets/HuggingFaceFW/fineweb-2/data"
):
    save_dir = os.path.join("debug", lang_code)
    os.makedirs(save_dir, exist_ok=True)

    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name,
                                              use_auth_token=hf_token)

    # pick your loader
    if loader == "parquet":
        ds = load_dataset(
            "parquet",
            data_files={"train": f"{data_root}/{lang_code}/train/000_00000.parquet"},
            split="train",
            streaming=True
        )
    else:
        # pass your token here so the HF loader can authenticate and resolve
        ds = load_dataset(
            loader,
            **(loader_kwargs or {}),
            streaming=True
        )

    buffer, total_tokens, shard_id = [], 0, 1
    pbar = tqdm(total=target_token_count, unit="tokens", desc=lang_code)

    for ex in ds:
        text = ex.get("text", "").strip()
        if not text:
            continue
        tokens = tokenizer(text, truncation=False, padding=False)["input_ids"]
        total_tokens += len(tokens)
        pbar.update(len(tokens))
        buffer.append({"text": text})

        if len(buffer) >= batch_size or total_tokens >= target_token_count:
            df = pd.DataFrame(buffer)
            name = f"{lang_code}_{shard_id:04d}_{suffix}.parquet"
            df.to_parquet(os.path.join(save_dir, name), index=False)
            shard_id += 1
            buffer = []

        if total_tokens >= target_token_count:
            break

    if buffer:
        df = pd.DataFrame(buffer)
        name = f"{lang_code}_{shard_id:04d}_{suffix}.parquet"
        df.to_parquet(os.path.join(save_dir, name), index=False)

    pbar.close()
    print(f"{lang_code}: saved {shard_id} shards (~{total_tokens:,} tokens) → {save_dir}\n")


HF_TOKEN = os.environ.get("HF_TOKEN")

# 1) fineweb-2
for lang in hr_fw2_existing_langs:
    download_limited_token_dataset(
        lang_code=lang,
        hf_token=HF_TOKEN,
        loader="parquet",
        suffix="fw2",
        data_root="hf://datasets/HuggingFaceFW/fineweb-2/data"
    )

# 2) fineweb-edu (English)
download_limited_token_dataset(
    lang_code="eng_Latn",
    hf_token=HF_TOKEN,
    loader="HuggingFaceFW/fineweb-edu",
    loader_kwargs={"name":"sample-10BT","split":"train"},
    suffix="fwedu"
)


fra_Latn: 648tokens [00:02, 293.52tokens/s]          


fra_Latn: saved 2 shards (~648 tokens) → debug/fra_Latn



por_Latn: 505tokens [00:02, 231.27tokens/s]          


por_Latn: saved 2 shards (~505 tokens) → debug/por_Latn



arb_Arab: 3069tokens [00:01, 1653.76tokens/s]        


arb_Arab: saved 2 shards (~3,069 tokens) → debug/arb_Arab



eng_Latn: 794tokens [00:01, 420.58tokens/s]          

eng_Latn: saved 2 shards (~794 tokens) → debug/eng_Latn






In [None]:
#!/usr/bin/env python3
from datasets import load_dataset
from transformers import AutoTokenizer
from tqdm import tqdm
import os
import pandas as pd

def download_limited_token_dataset(
    lang_code: str,
    hf_token: str,
    target_token_count: int = 1_000_000_000,
    batch_size: int = 100_000,
    tokenizer_name: str = "google/gemma-3-1b-it",
    data_root: str = "hf://datasets/HuggingFaceFW/fineweb-2/data"
):
    """
    Stream the first `target_token_count` tokens of the fineweb-2 train split
    for `lang_code`, tokenize, and save into parquet shards of size `batch_size`.
    """
    save_dir = os.path.join("data", lang_code)
    os.makedirs(save_dir, exist_ok=True)
    
    # initialize tokenizer
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name,
                                              use_auth_token=hf_token)
    
    # streaming load
    ds = load_dataset(
        "parquet",
        data_files={"train": f"{data_root}/{lang_code}/train/000_00000.parquet"},
        split="train",
        streaming=True
    )
    
    buffer = []
    total_tokens = 0
    shard_id = 1
    pbar = tqdm(total=target_token_count, unit="tokens",
                desc=f"{lang_code}")
    
    # Iterate over dataset and tokenize 
    for example in ds:
        text = example.get("text", "").strip()
        if not text:
            continue
        
        # Tokenize the text only
        input_ids = tokenizer(text, truncation=False, padding=False)["input_ids"]
        token_count = len(input_ids)
        
        buffer.append({"text": text})
        total_tokens += token_count
        pbar.update(token_count)
        
        # time to flush a shard?
        if len(buffer) >= batch_size or total_tokens >= target_token_count:
            df = pd.DataFrame(buffer)
            shard_name = f"{lang_code}_{shard_id:04d}_fw2.parquet"
            df.to_parquet(os.path.join(save_dir, shard_name), index=False)
            shard_id += 1
            buffer = []
        
        if total_tokens >= target_token_count:
            break
    
    # final partial shard
    if buffer:
        df = pd.DataFrame(buffer)
        shard_name = f"{lang_code}_{shard_id:04d}_fw2.parquet"
        df.to_parquet(os.path.join(save_dir, shard_name), index=False)
    
    pbar.close()
    print(f"{lang_code}: saved {shard_id} shards, ~{total_tokens:,} tokens → {save_dir}\n")


    
for lang in hr_fw2_existing_langs:
    download_limited_token_dataset(lang_code=lang, hf_token=HF_TOKEN)


# Fineweb Edu for English

In [None]:
# Reuse function for fineweb-2 (English “sample-10BT”):

download_limited_token_dataset(
    lang_code="eng_Latn",
    hf_token=HF_TOKEN,
    loader="HuggingFaceFW/fineweb-edu",
    loader_kwargs={"name":"sample-10BT","split":"train"},
    suffix="fwedu"
)


In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer
from tqdm import tqdm
import os
import pandas as pd

# Parameters
TARGET_TOKEN_COUNT = 1_000_000_000  # 1 billion tokens
BATCH_SIZE = 100_000
save_dir = "data/eng_Latn"
file_prefix = "eng_Latn"

# Setup
tokenizer = AutoTokenizer.from_pretrained(
    "google/gemma-3-1b-it",
    use_auth_token=hf_token
)
os.makedirs(save_dir, exist_ok=True)

print("LOADED TOKENIZER")

dataset = load_dataset("HuggingFaceFW/fineweb-edu", name="sample-10BT", split="train", streaming=True)

print("Loaded DATASET")

# Streaming loop
buffer = []
total_tokens = 0
shard_id = 1
pbar = tqdm(total=TARGET_TOKEN_COUNT, unit="tokens")

for example in dataset:
    text = example.get("text", "").strip()
    if not text:
        continue

    input_ids = tokenizer(text, truncation=False, padding=False)["input_ids"]
    token_count = len(input_ids)

    buffer.append({"text": text})
    total_tokens += token_count
    pbar.update(token_count)

    if len(buffer) >= BATCH_SIZE or total_tokens >= TARGET_TOKEN_COUNT:
        # Save this shard to a parquet file
        df = pd.DataFrame(buffer)
        shard_name = f"{file_prefix}_{shard_id:04d}_fw2.parquet"
        df.to_parquet(os.path.join(save_dir, shard_name), index=False)
        shard_id += 1
        buffer = []

    if total_tokens >= TARGET_TOKEN_COUNT:
        break

# Save any remaining buffer
if buffer:
    df = pd.DataFrame(buffer)
    shard_name = f"{file_prefix}_{shard_id:04d}_fw2.parquet"
    df.to_parquet(os.path.join(save_dir, shard_name), index=False)

pbar.close()
print(f"\n✅ Done. Saved {shard_id} shards with ~{total_tokens:,} tokens to: {save_dir}")


LOADED TOKENIZER
Loaded DATASET


1000001887tokens [33:56, 491013.70tokens/s]                             


✅ Done. Saved 11 shards with ~1,000,001,887 tokens to: data/eng_Latn





# WURA

In [26]:
# Create mapping of African language lable to Wura langauges
afri_wura_lang = {
    lang: flores_200_mapping[lang]
    for lang in african_language_list
    if lang in flores_200_mapping and flores_200_mapping[lang] in wura_langs
}

print(afri_wura_lang)
# print length of afri_wura_lang
print(f"Number of languages in afri_wura_lang: {len(afri_wura_lang)}")

{'afr_Latn': 'afr', 'amh_Ethi': 'amh', 'arz_Arab': 'arz', 'gaz_Latn': 'orm', 'hau_Latn': 'hau', 'ibo_Latn': 'ibo', 'kin_Latn': 'kin', 'nya_Latn': 'nya', 'sna_Latn': 'sna', 'som_Latn': 'som', 'swh_Latn': 'swa', 'tir_Ethi': 'tir', 'xho_Latn': 'xho', 'yor_Latn': 'yor', 'zul_Latn': 'zul'}
Number of languages in afri_wura_lang: 15


In [27]:
# High resource languages avaialbe in Wura

hr_wura_lang = {
    lang: flores_200_mapping[lang]
    for lang in high_lang_list
    if lang in flores_200_mapping and flores_200_mapping[lang] in wura_langs
}

print(hr_wura_lang)

{'por_Latn': 'por', 'fra_Latn': 'fra', 'eng_Latn': 'eng'}


## Downloading Wura for African Languages, Passage Level

In [32]:
# Download African Languages for Wura

import os
from tqdm import tqdm

def download_wura_data(lang_code: str, save_dir: str):
    """
    Download the WURA dataset (PASSAGE LEVEL) and save it as a text file
    
    Args:
        lang_code: Language code for the dataset (e.g., 'hau', 'ibo', 'yor')
        save_dir: Directory to save the downloaded dataset
    """
    # Create the save directory if it doesn't exist
    os.makedirs(save_dir, exist_ok=True)
    
    # Output file path
    output_file = os.path.join(save_dir, f"{lang_code}_wura.txt")
    
    # Load the dataset
    print(f"Loading WURA dataset for language: {lang_code}")
    data = load_dataset("castorini/wura", lang_code, level="passage", verification_mode="no_checks")
    
    # Write the text data to file
    print(f"Saving data to {output_file}")
    with open(output_file, 'w', encoding='utf-8') as f:
        for item in tqdm(data['train']):
            f.write(item['text'] + '\n')
    
    print(f"Dataset saved to {output_file}")
    return output_file



for lang_code, wura_code in afri_wura_lang.items():
    save_dir = f"data/{lang_code}"
    download_wura_data(wura_code, save_dir)


Loading WURA dataset for language: afr


Generating train split: 2390884 examples [00:21, 109539.78 examples/s]                      
Generating validation split: 265117 examples [00:02, 109467.13 examples/s]                     


Saving data to data/afr_Latn/afr_wura.txt


100%|██████████| 2390884/2390884 [00:41<00:00, 57943.71it/s]


Dataset saved to data/afr_Latn/afr_wura.txt
Loading WURA dataset for language: amh


Generating train split: 291026 examples [00:05, 54060.84 examples/s]                     
Generating validation split: 32307 examples [00:00, 54342.70 examples/s]                    


Saving data to data/amh_Ethi/amh_wura.txt


100%|██████████| 291026/291026 [00:08<00:00, 35334.99it/s]


Dataset saved to data/amh_Ethi/amh_wura.txt
Loading WURA dataset for language: arz
Saving data to data/arz_Arab/arz_wura.txt


100%|██████████| 1116034/1116034 [00:11<00:00, 93822.81it/s]


Dataset saved to data/arz_Arab/arz_wura.txt
Loading WURA dataset for language: orm


Generating train split: 37280 examples [00:00, 128892.96 examples/s]                    
Generating validation split: 4005 examples [00:00, 135245.66 examples/s]    


Saving data to data/gaz_Latn/orm_wura.txt


100%|██████████| 37280/37280 [00:00<00:00, 80271.49it/s]


Dataset saved to data/gaz_Latn/orm_wura.txt
Loading WURA dataset for language: hau


Generating train split: 565471 examples [00:04, 123259.05 examples/s]                     
Generating validation split: 63067 examples [00:00, 120974.69 examples/s]                    


Saving data to data/hau_Latn/hau_wura.txt


100%|██████████| 565471/565471 [00:08<00:00, 64773.72it/s]


Dataset saved to data/hau_Latn/hau_wura.txt
Loading WURA dataset for language: ibo


Generating train split: 121421 examples [00:01, 96586.56 examples/s]                   
Generating validation split: 13899 examples [00:00, 77222.26 examples/s]    


Saving data to data/ibo_Latn/ibo_wura.txt


100%|██████████| 121421/121421 [00:01<00:00, 65795.93it/s]


Dataset saved to data/ibo_Latn/ibo_wura.txt
Loading WURA dataset for language: kin


Generating train split: 120301 examples [00:00, 124077.06 examples/s]                   
Generating validation split: 6902 examples [00:00, 131061.91 examples/s]    


Saving data to data/kin_Latn/kin_wura.txt


100%|██████████| 120301/120301 [00:01<00:00, 78872.23it/s]


Dataset saved to data/kin_Latn/kin_wura.txt
Loading WURA dataset for language: nya


Generating train split: 150016 examples [00:01, 113182.90 examples/s]                   
Generating validation split: 16880 examples [00:00, 115140.06 examples/s]   


Saving data to data/nya_Latn/nya_wura.txt


100%|██████████| 150016/150016 [00:01<00:00, 75460.11it/s]


Dataset saved to data/nya_Latn/nya_wura.txt
Loading WURA dataset for language: sna


Generating train split: 141559 examples [00:01, 119713.74 examples/s]                   
Generating validation split: 16126 examples [00:00, 112140.36 examples/s]   


Saving data to data/sna_Latn/sna_wura.txt


100%|██████████| 141559/141559 [00:01<00:00, 78166.39it/s]


Dataset saved to data/sna_Latn/sna_wura.txt
Loading WURA dataset for language: som


Generating train split: 1235959 examples [00:09, 134330.05 examples/s]                    
Generating validation split: 137938 examples [00:00, 139105.92 examples/s]                    


Saving data to data/som_Latn/som_wura.txt


100%|██████████| 1235959/1235959 [00:18<00:00, 67298.14it/s]


Dataset saved to data/som_Latn/som_wura.txt
Loading WURA dataset for language: swa


Generating train split: 1801101 examples [00:14, 122225.82 examples/s]                      
Generating validation split: 200345 examples [00:01, 121385.67 examples/s]                     


Saving data to data/swh_Latn/swa_wura.txt


100%|██████████| 1801101/1801101 [00:27<00:00, 64774.84it/s]


Dataset saved to data/swh_Latn/swa_wura.txt
Loading WURA dataset for language: tir


Generating train split: 9807 examples [00:00, 52096.99 examples/s]                   
Generating validation split: 1084 examples [00:00, 56507.90 examples/s]    


Saving data to data/tir_Ethi/tir_wura.txt


100%|██████████| 9807/9807 [00:00<00:00, 31060.78it/s]


Dataset saved to data/tir_Ethi/tir_wura.txt
Loading WURA dataset for language: xho


Generating train split: 69713 examples [00:00, 114813.28 examples/s]                    
Generating validation split: 7846 examples [00:00, 101720.80 examples/s]    


Saving data to data/xho_Latn/xho_wura.txt


100%|██████████| 69713/69713 [00:00<00:00, 76505.96it/s]


Dataset saved to data/xho_Latn/xho_wura.txt
Loading WURA dataset for language: yor


Generating train split: 141321 examples [00:01, 98364.38 examples/s]                   
Generating validation split: 15612 examples [00:00, 104066.20 examples/s]   


Saving data to data/yor_Latn/yor_wura.txt


100%|██████████| 141321/141321 [00:02<00:00, 66708.58it/s]


Dataset saved to data/yor_Latn/yor_wura.txt
Loading WURA dataset for language: zul


Generating train split: 166370 examples [00:01, 121777.41 examples/s]                   
Generating validation split: 18289 examples [00:00, 115465.86 examples/s]   


Saving data to data/zul_Latn/zul_wura.txt


100%|██████████| 166370/166370 [00:02<00:00, 80169.54it/s]

Dataset saved to data/zul_Latn/zul_wura.txt





## Downloading WURA for High Resource langauges, Passage Level

In [33]:
# Download African Languages for Wura

import os
from tqdm import tqdm

def download_wura_data(lang_code: str, save_dir: str):
    """
    Download the WURA dataset (PASSAGE LEVEL) and save it as a text file
    
    Args:
        lang_code: Language code for the dataset (e.g., 'hau', 'ibo', 'yor')
        save_dir: Directory to save the downloaded dataset
    """
    # Create the save directory if it doesn't exist
    os.makedirs(save_dir, exist_ok=True)
    
    # Output file path
    output_file = os.path.join(save_dir, f"{lang_code}_wura.txt")
    
    # Load the dataset
    print(f"Loading WURA dataset for language: {lang_code}")
    data = load_dataset("castorini/wura", lang_code, level="passage", verification_mode="no_checks")
    
    # Write the text data to file
    print(f"Saving data to {output_file}")
    with open(output_file, 'w', encoding='utf-8') as f:
        for item in tqdm(data['train']):
            f.write(item['text'] + '\n')
    
    print(f"Dataset saved to {output_file}")
    return output_file



for lang_code, wura_code in hr_wura_lang.items():
    save_dir = f"data/{lang_code}"
    download_wura_data(wura_code, save_dir)


Loading WURA dataset for language: fra


Generating train split: 2220759 examples [00:20, 106846.40 examples/s]                      
Generating validation split: 246611 examples [00:02, 105122.77 examples/s]                     


Saving data to data/fra_Latn/fra_wura.txt


100%|██████████| 2220759/2220759 [00:35<00:00, 63007.96it/s]


Dataset saved to data/fra_Latn/fra_wura.txt
Loading WURA dataset for language: por


Generating train split: 1548167 examples [00:12, 123995.43 examples/s]                      
Generating validation split: 173578 examples [00:01, 111056.08 examples/s]                    


Saving data to data/por_Latn/por_wura.txt


100%|██████████| 1548167/1548167 [00:23<00:00, 67271.18it/s]


Dataset saved to data/por_Latn/por_wura.txt
Loading WURA dataset for language: eng


Generating train split: 2336199 examples [00:18, 127262.61 examples/s]                      
Generating validation split: 260463 examples [00:01, 130732.86 examples/s]                     


Saving data to data/eng_Latn/eng_wura.txt


100%|██████████| 2336199/2336199 [00:35<00:00, 65527.13it/s]

Dataset saved to data/eng_Latn/eng_wura.txt





In [36]:
data = load_dataset("castorini/wura", 'eng', level="passage", verification_mode="no_checks")
print(data['train'].column_names)
print(data['train'][0])

['id', 'text']
{'id': '0', 'text': "AFCON 2019: Ghanaians attack referee Victor Gomes over Black Stars loss to Tunisia - The Black Stars have been eliminated from the 2019 AFCON in the round of 16 after being defeated on penalties - The match which ended 1-1 after extra time saw a first half goal from team captain, Dede Ayew, being disallowed by the referee Victor Gomes - Ghanaians who were incensed by the decision took to social media to insult the South African match official Ghana's Black Stars have been eliminated from the 2019 African Cup of Nations following their defeat to the Carthage Eagles of Tunisia. After playing out a 1-1 draw at the end of extra time, the fate of the Black Stars was to be decided by penalties. While Tunisians scored all of their five kicks, second-half substitute Caleb Ekuban missed for Ghana, leaving the penalties scoreline 5-4 in favour of Tunisia. READ ALSO: Gernot Rohr says Onyekuru, Osimhen not experienced enough to start for Super Eagles Ghanaians, 

# Download for document level

In [33]:
data = load_dataset("castorini/wura", "yor", level="document", verification_mode="no_checks")
                    
print(data['train'].column_names)
print(data['train'][1])

['id', 'headline', 'content', 'category', 'url']
{'id': '1', 'headline': ' \r\n                  Itura de! Iṣẹ afara-nla Kutọ pari, ṣugbọn awọn ara Akute n bẹ Dapọ Abiọdun\n', 'content': 'Pẹlu iroyin to n tẹ wa lọwọ, o ṣee ṣe ki iṣẹ afara-nla Kutọ pari laipẹ jọjọ. Gomina Dapọ Abiọdun lo fọrọ naa sita pe laipẹ lawọn agbaṣẹṣe maa pari ise naa, koda o ni wọn ti n da ọda sori biriiji naa lọwọ yii.Bakan naa lo tun mẹnuba a pe gbogbo awọn ojuna to wa lagbegbe afara-nla Kutọ ni awọn agbaṣẹṣe naa n ṣe lọwọ.Iṣẹ biriiji Kutọ yii wa lara awọn iṣẹ ti ijọba Ibikunle Amosun bẹrẹ ṣugbọn ti wọn pa ti, ko too di pe ijọba Abiọdun tun iṣẹ naa bẹrẹ lọtun.Gomina si ti ṣeleri pe gbogbo awọn iṣẹ yooku bẹẹ lawọn maa ṣe agbeyẹwo rẹ.Lara awọn iṣẹ ojuna ti awọn araalu n pariwo pe ki Dapọ Abiọdun gbe yẹwo ni ọna to lọ lati Ijoko Ọta wa si Berger, l’Ekoo.', 'category': None, 'url': 'https://www.asejere.net/itura-de-i%e1%b9%a3e-afara-nla-kuto-pari-%e1%b9%a3ugbon-awon-ara-akute-n-be-dapo-abiodun/'}


In [31]:
data = load_dataset("castorini/wura", "yor", level="passage", verification_mode="no_checks")
                    
print(data['train'].column_names)

['id', 'text']


In [36]:
#!/usr/bin/env python3
import os
import json
from datasets import load_dataset
from tqdm import tqdm

def download_wura_data(lang_code: str, save_dir: str):
    """
    Download the WURA dataset (Document LEVEL) and save it as a jsonl file,
    concatenating `headline` + `content` into `text`, and putting everything
    else under `hyperparam`.
    """
    os.makedirs(save_dir, exist_ok=True)
    output_file = os.path.join(save_dir, f"{lang_code}_wura_documentLevel.jsonl")

    print(f"Loading WURA dataset for language: {lang_code}")
    data = load_dataset(
        "castorini/wura",
        lang_code,
        level="document",
        verification_mode="no_checks"
    )

    print(f"Writing transformed JSONL to {output_file}")
    with open(output_file, 'w', encoding='utf-8') as f:
        for item in tqdm(data['train'], desc="items"):
            # 1) safe-get + strip headline/content
            headline = (item.get('headline') or "").strip()
            content  = (item.get('content')  or "").strip()

            # 2) concatenate and trim any extra whitespace
            text = f"{headline} {content}".strip()

            # 3) stash the rest under hyperparam
            hyperparam = {
                k: v
                for k, v in item.items()
                if k not in ('headline', 'content')
            }

            # 4) write one JSON line
            out = {
                "text": text,
                "hyperparam": hyperparam
            }
            f.write(json.dumps(out, ensure_ascii=False) + "\n")

    print(f"Done → {output_file}")
    return output_file

if __name__ == "__main__":
    for lang_folder, wura_code in afri_wura_lang.items():
        save_dir = os.path.join("data", lang_folder)
        download_wura_data(wura_code, save_dir)


Loading WURA dataset for language: afr
Writing transformed JSONL to data/afr_Latn/afr_wura_documentLevel.jsonl


items: 100%|██████████| 1042812/1042812 [00:58<00:00, 17705.75it/s]


Done → data/afr_Latn/afr_wura_documentLevel.jsonl
Loading WURA dataset for language: amh
Writing transformed JSONL to data/amh_Ethi/amh_wura_documentLevel.jsonl


items: 100%|██████████| 135863/135863 [00:10<00:00, 13083.61it/s]


Done → data/amh_Ethi/amh_wura_documentLevel.jsonl
Loading WURA dataset for language: arz


Generating train split: 100%|██████████| 1455662/1455662 [00:20<00:00, 70315.97 examples/s]
Generating validation split: 100%|██████████| 161740/161740 [00:02<00:00, 68592.50 examples/s]


Writing transformed JSONL to data/arz_Arab/arz_wura_documentLevel.jsonl


items: 100%|██████████| 1455662/1455662 [00:43<00:00, 33800.12it/s]


Done → data/arz_Arab/arz_wura_documentLevel.jsonl
Loading WURA dataset for language: orm


Generating train split: 100%|██████████| 20169/20169 [00:00<00:00, 43086.39 examples/s]
Generating validation split: 100%|██████████| 2241/2241 [00:00<00:00, 47427.34 examples/s]


Writing transformed JSONL to data/gaz_Latn/orm_wura_documentLevel.jsonl


items: 100%|██████████| 20169/20169 [00:00<00:00, 26367.98it/s]


Done → data/gaz_Latn/orm_wura_documentLevel.jsonl
Loading WURA dataset for language: hau


Generating train split: 100%|██████████| 359881/359881 [00:08<00:00, 42264.26 examples/s]
Generating validation split: 100%|██████████| 39986/39986 [00:00<00:00, 43639.88 examples/s]


Writing transformed JSONL to data/hau_Latn/hau_wura_documentLevel.jsonl


items: 100%|██████████| 359881/359881 [00:16<00:00, 21919.51it/s]


Done → data/hau_Latn/hau_wura_documentLevel.jsonl
Loading WURA dataset for language: ibo


Generating train split: 100%|██████████| 51386/51386 [00:01<00:00, 31944.25 examples/s]
Generating validation split: 100%|██████████| 5709/5709 [00:00<00:00, 31011.18 examples/s]


Writing transformed JSONL to data/ibo_Latn/ibo_wura_documentLevel.jsonl


items: 100%|██████████| 51386/51386 [00:02<00:00, 20679.44it/s]


Done → data/ibo_Latn/ibo_wura_documentLevel.jsonl
Loading WURA dataset for language: kin


Generating train split: 100%|██████████| 97064/97064 [00:01<00:00, 50121.71 examples/s]
Generating validation split: 100%|██████████| 5831/5831 [00:00<00:00, 54234.85 examples/s]


Writing transformed JSONL to data/kin_Latn/kin_wura_documentLevel.jsonl


items: 100%|██████████| 97064/97064 [00:03<00:00, 28018.61it/s]


Done → data/kin_Latn/kin_wura_documentLevel.jsonl
Loading WURA dataset for language: nya


Generating train split: 100%|██████████| 39647/39647 [00:01<00:00, 23624.82 examples/s]
Generating validation split: 100%|██████████| 4405/4405 [00:00<00:00, 13636.84 examples/s]


Writing transformed JSONL to data/nya_Latn/nya_wura_documentLevel.jsonl


items: 100%|██████████| 39647/39647 [00:03<00:00, 12018.02it/s]


Done → data/nya_Latn/nya_wura_documentLevel.jsonl
Loading WURA dataset for language: sna


Generating train split: 100%|██████████| 60986/60986 [00:01<00:00, 39217.88 examples/s]
Generating validation split: 100%|██████████| 6776/6776 [00:00<00:00, 37269.06 examples/s]


Writing transformed JSONL to data/sna_Latn/sna_wura_documentLevel.jsonl


items: 100%|██████████| 60986/60986 [00:02<00:00, 23257.76it/s]


Done → data/sna_Latn/sna_wura_documentLevel.jsonl
Loading WURA dataset for language: som


Generating train split: 100%|██████████| 976484/976484 [00:19<00:00, 49044.24 examples/s]
Generating validation split: 100%|██████████| 108498/108498 [00:02<00:00, 48988.78 examples/s]


Writing transformed JSONL to data/som_Latn/som_wura_documentLevel.jsonl


items: 100%|██████████| 976484/976484 [00:43<00:00, 22612.05it/s]


Done → data/som_Latn/som_wura_documentLevel.jsonl
Loading WURA dataset for language: swa


Generating train split: 100%|██████████| 1036254/1036254 [00:27<00:00, 37882.01 examples/s]
Generating validation split: 100%|██████████| 115139/115139 [00:02<00:00, 40600.00 examples/s]


Writing transformed JSONL to data/swh_Latn/swa_wura_documentLevel.jsonl


items: 100%|██████████| 1036254/1036254 [00:53<00:00, 19518.87it/s]


Done → data/swh_Latn/swa_wura_documentLevel.jsonl
Loading WURA dataset for language: tir


Generating train split: 100%|██████████| 8240/8240 [00:00<00:00, 31747.01 examples/s]
Generating validation split: 100%|██████████| 915/915 [00:00<00:00, 30916.49 examples/s]


Writing transformed JSONL to data/tir_Ethi/tir_wura_documentLevel.jsonl


items: 100%|██████████| 8240/8240 [00:00<00:00, 23094.37it/s]


Done → data/tir_Ethi/tir_wura_documentLevel.jsonl
Loading WURA dataset for language: xho


Generating train split: 100%|██████████| 23892/23892 [00:00<00:00, 32444.41 examples/s]
Generating validation split: 100%|██████████| 2654/2654 [00:00<00:00, 33111.68 examples/s]


Writing transformed JSONL to data/xho_Latn/xho_wura_documentLevel.jsonl


items: 100%|██████████| 23892/23892 [00:01<00:00, 21538.92it/s]


Done → data/xho_Latn/xho_wura_documentLevel.jsonl
Loading WURA dataset for language: yor
Writing transformed JSONL to data/yor_Latn/yor_wura_documentLevel.jsonl


items: 100%|██████████| 73473/73473 [00:03<00:00, 19566.71it/s]


Done → data/yor_Latn/yor_wura_documentLevel.jsonl
Loading WURA dataset for language: zul


Generating train split: 100%|██████████| 65447/65447 [00:01<00:00, 34357.80 examples/s]
Generating validation split: 100%|██████████| 7271/7271 [00:00<00:00, 36510.15 examples/s]


Writing transformed JSONL to data/zul_Latn/zul_wura_documentLevel.jsonl


items: 100%|██████████| 65447/65447 [00:03<00:00, 19023.42it/s]

Done → data/zul_Latn/zul_wura_documentLevel.jsonl





In [37]:
#!/usr/bin/env python3
import os
import json
from datasets import load_dataset
from tqdm import tqdm

def download_wura_data(lang_code: str, save_dir: str):
    """
    Download the WURA dataset (Document LEVEL) and save it as a jsonl file,
    concatenating `headline` + `content` into `text`, and putting everything
    else under `hyperparam`.
    """
    os.makedirs(save_dir, exist_ok=True)
    output_file = os.path.join(save_dir, f"{lang_code}_wura_documentLevel.jsonl")

    print(f"Loading WURA dataset for language: {lang_code}")
    data = load_dataset(
        "castorini/wura",
        lang_code,
        level="document",
        verification_mode="no_checks"
    )

    print(f"Writing transformed JSONL to {output_file}")
    with open(output_file, 'w', encoding='utf-8') as f:
        for item in tqdm(data['train'], desc="items"):
            # 1) safe-get + strip headline/content
            headline = (item.get('headline') or "").strip()
            content  = (item.get('content')  or "").strip()

            # 2) concatenate and trim any extra whitespace
            text = f"{headline} {content}".strip()

            # 3) stash the rest under hyperparam
            hyperparam = {
                k: v
                for k, v in item.items()
                if k not in ('headline', 'content')
            }

            # 4) write one JSON line
            out = {
                "text": text,
                "hyperparam": hyperparam
            }
            f.write(json.dumps(out, ensure_ascii=False) + "\n")

    print(f"Done → {output_file}")
    return output_file

if __name__ == "__main__":
    for lang_folder, wura_code in hr_wura_lang.items():
        save_dir = os.path.join("data", lang_folder)
        download_wura_data(wura_code, save_dir)
# Download Wura dataset for high resource languages

Loading WURA dataset for language: por


Generating train split: 100%|██████████| 1089199/1089199 [00:22<00:00, 49116.56 examples/s]
Generating validation split: 100%|██████████| 121022/121022 [00:02<00:00, 49751.55 examples/s]


Writing transformed JSONL to data/por_Latn/por_wura_documentLevel.jsonl


items: 100%|██████████| 1089199/1089199 [00:44<00:00, 24446.31it/s]


Done → data/por_Latn/por_wura_documentLevel.jsonl
Loading WURA dataset for language: fra


Generating train split: 100%|██████████| 1443177/1443177 [00:32<00:00, 44996.29 examples/s]
Generating validation split: 100%|██████████| 160352/160352 [00:04<00:00, 33930.80 examples/s]


Writing transformed JSONL to data/fra_Latn/fra_wura_documentLevel.jsonl


items: 100%|██████████| 1443177/1443177 [01:02<00:00, 23170.40it/s]


Done → data/fra_Latn/fra_wura_documentLevel.jsonl
Loading WURA dataset for language: eng


Generating train split: 100%|██████████| 1378555/1378555 [00:29<00:00, 47322.47 examples/s]
Generating validation split: 100%|██████████| 153172/153172 [00:03<00:00, 47078.89 examples/s]


Writing transformed JSONL to data/eng_Latn/eng_wura_documentLevel.jsonl


items: 100%|██████████| 1378555/1378555 [01:01<00:00, 22485.13it/s]

Done → data/eng_Latn/eng_wura_documentLevel.jsonl





In [41]:
# load your single JSONL as the “train” split
data = load_dataset(
    "json",
    data_files={ "train": "data/por_Latn/por_wura_documentLevel.jsonl" }
)

# inspect the columns
print(data["train"].column_names)
print(data["train"][0])

['text', 'hyperparam']
{'text': 'Polícia acusada de facilitar entrada de munícipes num posto de recenseamento eleitoral Um agente da PRM foi acusado, hoje, na cidade da Beira, de estar a interferir negativamente no processo de recenseamento eleitoral, ao facilitar a entrada de pessoas para serem inscritas em detrimento de dezenas de munícipes que estavam na fila desde a madrugada. A situação criou um tumulto que paralisou, por alguns instantes, o processo na capital de Sofala. Os munícipes da Beira manifestaram-se, na manhã desta segunda-feira, contra o agente da Polícia que, alegadamente, facilitou a entrada de sete pessoas para serem recenseadas, em detrimento dos que estavam na fila desde a madrugada e de outros que há vários dias não conseguem recensear-se na Escola Primária de Macombe, no bairro da Munhava. A comunidade insurgiu-se e um dos jovens do bairro foi detido. O jovem em causa foi levado para uma viatura da Polícia e, enquanto a nossa equipa de reportagem tentava colher o

# Madlad 400

## Madlad 400 for African Languages

In [20]:
# Defining the madlad 400 mapping

afri_madlad_langs = {
    "afr_Latn": "af",
    "aka_Latn": "ak",
    "amh_Ethi": "am",
    "bam_Latn": "bm",
    "dik_Latn": "din",
    "dyu_Latn": "dyu",
    "ewe_Latn": "ee",
    "fon_Latn": "fon",
    "fuv_Latn": "ff",
    "gaz_Latn": "om",   
    "hau_Latn": "ha",  
    "ibo_Latn": "ig",   
    "kbp_Latn": "kbp",
    "kin_Latn": "rw",
    "kmb_Latn": "kmb",
    "kon_Latn": "kg",
    "lin_Latn": "ln",
    "lug_Latn": "lg",
    "run_Latn": "rn",
    "sag_Latn": "sg",
    "sna_Latn": "sn",
    "som_Latn": "so",
    "sot_Latn": "st",
    "ssw_Latn": "ss",
    "swh_Latn": "sw",
    "tir_Ethi": "ti",
    "tsn_Latn": "tn",
    "tso_Latn": "ts",
    "tzm_Tfng": "ber",
    "wol_Latn": "wo",
    "xho_Latn": "xh",
    "yor_Latn": "yo",
    "zul_Latn": "zu"
}

hr_madlad_langs = {
    "eng_Latn": "en",
    "fra_Latn": "fr",
    "por_Latn": "pt",
    "arb_Arab": "ar"
}

In [38]:
import os
import requests
import gzip
import shutil

def download_and_merge_madlad_clean_files(lang_code: str, output_filename: str, save_dir: str):
    os.makedirs(save_dir, exist_ok=True)

    base_url = f"https://huggingface.co/datasets/allenai/madlad-400/resolve/main/data/{lang_code}/"
    i = 0
    temp_dir = os.path.join(save_dir, "temp_chunks")
    os.makedirs(temp_dir, exist_ok=True)

    # Step 1: Download all chunks
    print(f"Downloading all chunks for {lang_code} ...")
    while True:
        filename = f"{lang_code}_clean_{i:04d}.jsonl.gz"
        url = base_url + filename
        local_path = os.path.join(temp_dir, filename)

        response = requests.get(url, stream=True)
        if response.status_code == 200:
            with open(local_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)
            print(f"Downloaded: {filename}")
            i += 1
        else:
            print(f"No more files after: {filename}")
            break

    # Step 2: Decompress and merge into single file
    final_path = os.path.join(save_dir, output_filename)
    with open(final_path, 'wb') as outfile:
        for j in range(i):
            part_file = os.path.join(temp_dir, f"{lang_code}_clean_{j:04d}.jsonl.gz")
            with gzip.open(part_file, 'rb') as f_in:
                shutil.copyfileobj(f_in, outfile)

    print(f"\nFinal merged file saved to: {final_path}")

    # Step 3: Clean up
    shutil.rmtree(temp_dir)
    print("🧹 Cleaned up temp files.")


for lang_code, madlad_code in afri_madlad_langs.items():
    download_and_merge_madlad_clean_files(madlad_code, f"{lang_code}_ml400.jsonl", f"data/{lang_code}")


Downloading all chunks for af ...
Downloaded: af_clean_0000.jsonl.gz
No more files after: af_clean_0001.jsonl.gz

Final merged file saved to: data/afr_Latn/afr_Latn_ml400.jsonl
🧹 Cleaned up temp files.
Downloading all chunks for ak ...
Downloaded: ak_clean_0000.jsonl.gz
No more files after: ak_clean_0001.jsonl.gz

Final merged file saved to: data/aka_Latn/aka_Latn_ml400.jsonl
🧹 Cleaned up temp files.
Downloading all chunks for am ...
Downloaded: am_clean_0000.jsonl.gz
No more files after: am_clean_0001.jsonl.gz

Final merged file saved to: data/amh_Ethi/amh_Ethi_ml400.jsonl
🧹 Cleaned up temp files.
Downloading all chunks for bm ...
Downloaded: bm_clean_0000.jsonl.gz
No more files after: bm_clean_0001.jsonl.gz

Final merged file saved to: data/bam_Latn/bam_Latn_ml400.jsonl
🧹 Cleaned up temp files.
Downloading all chunks for din ...
Downloaded: din_clean_0000.jsonl.gz
No more files after: din_clean_0001.jsonl.gz

Final merged file saved to: data/dik_Latn/dik_Latn_ml400.jsonl
🧹 Cleaned up

In [41]:
import json

# Path to one of your merged JSONL files
file_path = "data/afr_Latn/afr_Latn_ml400.jsonl"

# Read the first non-empty line
with open(file_path, 'r', encoding='utf-8') as f:
    for line in f:
        if line.strip():
            sample = json.loads(line)
            break

# Print the keys (columns)
print("Columns:", list(sample.keys()))

Columns: ['text']


In [21]:
import os
import requests
import gzip
import shutil

def download_and_merge_madlad_clean_files(lang_code: str, output_filename: str, save_dir: str):
    os.makedirs(save_dir, exist_ok=True)

    base_url = f"https://huggingface.co/datasets/allenai/madlad-400/resolve/main/data/{lang_code}/"
    i = 0
    temp_dir = os.path.join(save_dir, "temp_chunks")
    os.makedirs(temp_dir, exist_ok=True)

    # Step 1: Download all chunks
    print(f"Downloading all chunks for {lang_code} ...")
    while True:
        filename = f"{lang_code}_clean_{i:04d}.jsonl.gz"
        url = base_url + filename
        local_path = os.path.join(temp_dir, filename)

        response = requests.get(url, stream=True)
        if response.status_code == 200:
            with open(local_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)
            print(f"Downloaded: {filename}")
            i += 1
        else:
            print(f"No more files after: {filename}")
            break

    # Step 2: Decompress and merge into single file
    final_path = os.path.join(save_dir, output_filename)
    with open(final_path, 'wb') as outfile:
        for j in range(i):
            part_file = os.path.join(temp_dir, f"{lang_code}_clean_{j:04d}.jsonl.gz")
            with gzip.open(part_file, 'rb') as f_in:
                shutil.copyfileobj(f_in, outfile)

    print(f"\nFinal merged file saved to: {final_path}")

    # Step 3: Clean up
    shutil.rmtree(temp_dir)
    print("🧹 Cleaned up temp files.")


for lang_code, madlad_code in hr_madlad_langs.items():
    download_and_merge_madlad_clean_files(madlad_code, f"{lang_code}_ml400.jsonl", f"data/{lang_code}")


Downloading all chunks for en ...
Downloaded: en_clean_0000.jsonl.gz


KeyboardInterrupt: 

In [None]:
import json

# Path to one of your merged JSONL files
file_path = "data/afr_Latn/afr_Latn_ml400.jsonl"

# Read the first non-empty line
with open(file_path, 'r', encoding='utf-8') as f:
    for line in f:
        if line.strip():
            sample = json.loads(line)
            break

# Print the keys (columns)
print("Columns:", list(sample.keys()))

# Extra Data

## Tswana

In [3]:
# Output folder & path
output_dir = "./data/tsn_Latn"
os.makedirs(output_dir, exist_ok=True)
output_path = os.path.join(output_dir, "tsn_Latn_extra.jsonl")

# Load the extra dataset
ds = load_dataset("OxxoCodes/Marothodi", split="train")

# Print structure of first example to verify
if len(ds) > 0:
    print("Marothodi example structure:", ds[0])

# Check if the file exists and how many lines it has
if os.path.exists(output_path):
    with open(output_path, "r", encoding="utf-8") as f:
        existing_lines = sum(1 for _ in f)

    if existing_lines >= len(ds):
        print(f"{output_path} already contains all {existing_lines} examples, skipping.")
    else:
        print(f"{output_path} has {existing_lines}/{len(ds)} examples — appending missing...")
        with open(output_path, "a", encoding="utf-8") as out_f:
            for i, example in enumerate(ds):
                if i < existing_lines:
                    continue
                
                # Create standardized format with "text" field
                if "text" in example:
                    json.dump({"text": example["text"]}, out_f)
                elif "sentence" in example:  # Assuming the dataset might have a "sentence" field
                    json.dump({"text": example["sentence"]}, out_f)
                else:
                    # If no clear text field, identify the appropriate field based on dataset structure
                    # For example, concatenate multiple fields or use a specific field:
                    # You may need to adjust this based on the actual structure
                    content = str(example)
                    json.dump({"text": content}, out_f)
                
                out_f.write("\n")
        
        print(f"Appended {len(ds) - existing_lines} new examples to {output_path}")
else:
    # File does not exist; write all from scratch
    with open(output_path, "w", encoding="utf-8") as out_f:
        for example in ds:
            # Create standardized format with "text" field
            if "text" in example:
                json.dump({"text": example["text"]}, out_f)
            elif "sentence" in example:  # Assuming the dataset might have a "sentence" field
                json.dump({"text": example["sentence"]}, out_f)
            else:
                # If no clear text field, identify the appropriate field based on dataset structure
                content = str(example)
                json.dump({"text": content}, out_f)
            
            out_f.write("\n")
    
    print(f"Wrote all {len(ds)} examples to {output_path}")

NameError: name 'os' is not defined

In [42]:
ds = load_dataset("OxxoCodes/Marothodi", split="train")
print(ds.column_names)
print(ds[0])

['text', 'source', 'source-category']
{'text': 'Ka goo Ruthe aya le Naomi Betlehema nageng ya Israele.', 'source': 'https://downloads.wortschatz-leipzig.de/corpora/tsn_community_2017.tar.gz', 'source-category': 'tsn_community_2017'}


# Convert to LLaMMa Factory Style

In [None]:
import os
import json
import pandas as pd
from pathlib import Path
import pyarrow.parquet as pq
import pyarrow as pa
import logging

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

class DatasetTransformer:
    def __init__(self, root_dir="data", output_dir="preprocessed_data"):
        self.root_dir = Path(root_dir)
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(exist_ok=True, parents=True)
        
    def scan_directory(self):
        """Scan the data directory for language subdirectories and files."""
        lang_dirs = [d for d in self.root_dir.iterdir() if d.is_dir()]
        logger.info(f"Found {len(lang_dirs)} language directories")
        return lang_dirs
    
    def process_all(self):
        """Process all language directories and their files."""
        lang_dirs = self.scan_directory()
        
        for lang_dir in lang_dirs:
            lang_code = lang_dir.name
            logger.info(f"Processing language: {lang_code}")
            
            # Create output directory for this language
            lang_output_dir = self.output_dir / lang_code
            lang_output_dir.mkdir(exist_ok=True)
            
            # Process each file in the language directory
            for file_path in lang_dir.glob("*"):
                if file_path.is_file():
                    self.process_file(file_path, lang_output_dir, lang_code)
    
    def process_file(self, file_path, output_dir, lang_code):
        """Process a single file based on its type and name pattern."""
        filename = file_path.name
        logger.info(f"Processing file: {filename}")
        output_path = output_dir / f"{file_path.stem}.parquet"

        # ←— Skip if we already wrote this file
        if output_path.exists():
            logger.info(f"Skipping {filename}: output already at {output_path}")
            return
        
        # Determine file type
        if filename.endswith(".jsonl") and "wura" in filename: # This will be the document level of wura
            self.process_wura_jsonl(file_path, lang_code, output_path)
            return
        if filename.endswith(".parquet") and "fw2" in filename:
            self.process_fw2(file_path, lang_code, output_path)
            logger.info(f"Chunk-wrote FW2 to {output_path}")
            return
        elif filename.endswith(".txt") and "wura" in filename: # This will be the passage level of wura
            # pass output_path so it can write itself
            self.process_wura(file_path, lang_code, output_path)
            logger.info(f"Chunk-wrote Wura → {output_path}")
            return
        elif filename.endswith(".jsonl") and "ml400" in filename:
            self.process_madlad400(file_path, lang_code, output_path)
            logger.info(f"Chunk-wrote MADLAD400 → {output_path}")
            return
        elif filename.endswith(".jsonl") and "extra" in filename:
            df = self.process_extra_data(file_path, lang_code)
        else:
            logger.warning(f"Unknown file format for {filename}, skipping")
            return
        
        # Save the transformed dataframe
        output_filename = f"{file_path.stem}.parquet"
        output_path = output_dir / output_filename
        df.to_parquet(output_path, index=False)
        logger.info(f"Saved transformed data to {output_path}")

    def process_wura_jsonl(self, file_path: Path, lang_code: str, output_path: Path):
        """Stream Wura JSONL in chunks and write to Parquet."""
        writer = None
        batch, n = [], 0
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                for i, line in enumerate(f):
                    entry = json.loads(line)
                    batch.append({
                        'text': entry.get('text', ''),
                        'hyperparam': {
                            'id':       f"wura_{lang_code}_{i}",
                            'language': lang_code,
                            'dataset':  'wura'
                        }
                    })
                    n += 1
                    if n >= self.chunk_size:
                        tbl = pa.Table.from_pandas(
                            pd.DataFrame(batch)[['text','hyperparam']],
                            preserve_index=False
                        )
                        if writer is None:
                            writer = pq.ParquetWriter(str(output_path), schema=tbl.schema)
                        writer.write_table(tbl)
                        batch, n = [], 0
        except Exception as e:
            logger.error(f"Error streaming Wura JSONL {file_path}: {e}")

        # flush remainder
        if batch:
            tbl = pa.Table.from_pandas(
                pd.DataFrame(batch)[['text','hyperparam']],
                preserve_index=False
            )
            if writer is None:
                writer = pq.ParquetWriter(str(output_path), schema=tbl.schema)
            writer.write_table(tbl)

        # if nothing written, stub out an empty file
        if writer:
            writer.close()
        else:
            empty = pa.Table.from_pandas(
                pd.DataFrame(columns=['text','hyperparam']),
                preserve_index=False
            )
            pq.write_table(empty, str(output_path))
        logger.info(f"Wrote Wura JSONL → {output_path}")
    
    def process_fw2(self, file_path, lang_code, output_path):
        """
        Process one large FW2 parquet by row-group, build hyperparam, and stream directly
        to output_path (a pathlib.Path to the final .parquet).
        """
        # List every field you *want*, but we'll only read the ones that actually exist
        wanted = [
            'text','id','dump','url','date','file_path',
            'language','language_score','language_script',
            'minhash_cluster_size','top_langs'
        ]

        # Open the source Parquet
        pqf = pq.ParquetFile(str(file_path))
        writer = None

        for rg in range(pqf.num_row_groups):
            # 1) Figure out which of our "wanted" fields are actually in this file
            existing = set(pqf.schema.names)
            to_read  = [c for c in wanted if c in existing]

            # always require "text" so we can output something
            if 'text' not in to_read:
                logger.warning(f"No 'text' column in row-group {rg}, skipping")
                continue

            # 2) Read just those columns
            table = pqf.read_row_group(rg, columns=to_read)
            df    = table.to_pandas()

            # 3) For any field we *wanted* but wasn’t present, add a default
            for c in wanted:
                if c not in df.columns:
                    df[c] = None

            # 4) Build hyperparam *safely* using row.get() with defaults
            def make_hp(row):
                return {
                    'id':                    row.get('id', '')                   or '',
                    'dump':                  row.get('dump', '')                 or '',
                    'url':                   row.get('url', '')                  or '',
                    'date':                  str(row.get('date', ''))            or '',
                    'file_path':             row.get('file_path', '')            or '',
                    'language':              lang_code,
                    'language_score':        row.get('language_score', 0.0)      or 0.0,
                    'language_script':       row.get('language_script', '')      or '',
                    'minhash_cluster_size':  row.get('minhash_cluster_size', 0) or 0,
                    'top_langs':             row.get('top_langs', [])            or [],
                    'dataset':               'fw2'
                }

            df['hyperparam'] = df.apply(make_hp, axis=1)

            # 5) Stream it straight out to Parquet without concatenating in RAM
            out_table = pa.Table.from_pandas(
                df[['text','hyperparam']],
                preserve_index=False
            )

            if writer is None:
                writer = pq.ParquetWriter(str(output_path), schema=out_table.schema)
            writer.write_table(out_table)

            # free up memory
            del df, table, out_table

        # 6) Clean up
        if writer:
            writer.close()
        else:
            # no data at all? write an empty stub
            empty = pa.Table.from_pandas(
                pd.DataFrame(columns=['text','hyperparam']),
                preserve_index=False
            )
            pq.write_table(empty, str(output_path))

    
    def process_wura(self, file_path, lang_code, output_path, chunk_size=50_000):
        """Process Wura `.txt` by streaming in chunks and writing to Parquet."""
        writer = None
        rows = []
        count = 0

        # 1) Stream the file line by line
        with open(file_path, 'r', encoding='utf-8') as f:
            for i, line in enumerate(f):
                line = line.rstrip('\n')
                # parse id/text
                if '\t' in line:
                    doc_id, text = line.split('\t', 1)
                else:
                    doc_id = f"wura_{lang_code}_{i}"
                    text   = line

                rows.append({
                    'text': text,
                    'hyperparam': {
                        'id':       doc_id,
                        'language': lang_code,
                        'dataset':  'wura'
                    }
                })
                count += 1

                # 2) Once we hit chunk_size, flush to Parquet
                if count >= chunk_size:
                    df = pd.DataFrame(rows)
                    table = pa.Table.from_pandas(df[['text','hyperparam']],
                                                preserve_index=False)
                    if writer is None:
                        writer = pq.ParquetWriter(str(output_path),
                                                schema=table.schema)
                    writer.write_table(table)

                    # reset
                    rows = []
                    count = 0

        # 3) Flush any remaining rows
        if rows:
            df = pd.DataFrame(rows)
            table = pa.Table.from_pandas(df[['text','hyperparam']],
                                        preserve_index=False)
            if writer is None:
                writer = pq.ParquetWriter(str(output_path),
                                        schema=table.schema)
            writer.write_table(table)

        # 4) If we never saw any data, write an empty stub
        if writer:
            writer.close()
        else:
            empty = pa.Table.from_pandas(
                pd.DataFrame(columns=['text','hyperparam']),
                preserve_index=False
            )
            pq.write_table(empty, str(output_path))

        logger.info(f"Wrote Wura data for {lang_code}: {output_path}")
        
    def process_madlad400(self, file_path, lang_code, output_path, chunk_size=50_000):
        """Process MADLAD400 JSONL by streaming in chunks and writing to Parquet."""
        writer = None
        batch = []
        n = 0

        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                for i, line in enumerate(f):
                    try:
                        entry = json.loads(line)
                    except json.JSONDecodeError:
                        logger.warning(f"Invalid JSON on line {i} in {file_path}")
                        continue

                    text = entry.get('text', '')
                    batch.append({
                        'text': text,
                        'hyperparam': {
                            'id':       f"madlad400_{lang_code}_{i}",
                            'language': lang_code,
                            'dataset':  'madlad400'
                        }
                    })
                    n += 1

                    # flush chunk
                    if n >= chunk_size:
                        tbl = pa.Table.from_pandas(
                            pd.DataFrame(batch)[['text','hyperparam']],
                            preserve_index=False
                        )
                        if writer is None:
                            writer = pq.ParquetWriter(str(output_path), schema=tbl.schema)
                        writer.write_table(tbl)
                        batch, n = [], 0

        except Exception as e:
            logger.error(f"Error streaming MADLAD400 {file_path}: {e}")

        # flush remainder
        if batch:
            tbl = pa.Table.from_pandas(
                pd.DataFrame(batch)[['text','hyperparam']],
                preserve_index=False
            )
            if writer is None:
                writer = pq.ParquetWriter(str(output_path), schema=tbl.schema)
            writer.write_table(tbl)

        # if nothing written, stub out an empty file
        if writer:
            writer.close()
        else:
            empty = pa.Table.from_pandas(
                pd.DataFrame(columns=['text','hyperparam']),
                preserve_index=False
            )
            pq.write_table(empty, str(output_path))

        logger.info(f"Wrote MADLAD400 for {lang_code}: {output_path}")
    
    def process_extra_data(self, file_path, lang_code):
        """Process extra data JSONL files."""
        try:
            data = []
            with open(file_path, 'r', encoding='utf-8') as f:
                for i, line in enumerate(f):
                    try:
                        # Parse the JSON line
                        entry = json.loads(line)
                        text = entry.get('text', '')
                        source = entry.get('source', '')
                        source_category = entry.get('source-category', '')
                        
                        data.append({
                            'text': text,
                            'hyperparam': {
                                'id': f"extra_{lang_code}_{i}",
                                'language': lang_code,
                                'source': source,
                                'source_category': source_category,
                                'dataset': 'extra'
                            }
                        })
                    except json.JSONDecodeError:
                        logger.warning(f"Invalid JSON on line {i} in file {file_path}")
                    except Exception as e:
                        logger.warning(f"Error processing line {i} in extra data file: {e}")
            
            return pd.DataFrame(data)
        except Exception as e:
            logger.error(f"Error processing extra data file {file_path}: {e}")
            return pd.DataFrame(columns=['text', 'hyperparam'])


def main():
    """Main function to run the dataset transformer."""
    # Configure these paths as needed
    input_dir = "data"
    output_dir = "preprocessed_data"
    
    transformer = DatasetTransformer(root_dir=input_dir, output_dir=output_dir)
    transformer.process_all()
    logger.info("Dataset transformation complete!")


if __name__ == "__main__":
    main()

2025-05-14 14:44:28,501 - INFO - Found 61 language directories
2025-05-14 14:44:28,501 - INFO - Processing language: fon_Latn
2025-05-14 14:44:28,502 - INFO - Processing file: fon_Latn_ml400.jsonl
2025-05-14 14:44:28,502 - INFO - Skipping fon_Latn_ml400.jsonl: output already at preprocessed_data/fon_Latn/fon_Latn_ml400.parquet
2025-05-14 14:44:28,503 - INFO - Processing file: fon_Latn_0001_fw2.parquet
2025-05-14 14:44:28,503 - INFO - Skipping fon_Latn_0001_fw2.parquet: output already at preprocessed_data/fon_Latn/fon_Latn_0001_fw2.parquet
2025-05-14 14:44:28,504 - INFO - Processing language: hau_Latn
2025-05-14 14:44:28,505 - INFO - Processing file: hau_wura.txt
2025-05-14 14:44:28,506 - INFO - Processing file: hau_Latn_ml400.jsonl
2025-05-14 14:44:28,506 - INFO - Processing file: hau_wura_documentLevel.jsonl
2025-05-14 14:44:40,593 - ERROR - Error streaming Wura JSONL data/hau_Latn/hau_wura_documentLevel.jsonl: [Errno 60] Operation timed out
2025-05-14 14:44:40,604 - INFO - Wrote Wura

KeyboardInterrupt: 

In [10]:
dataset = load_dataset(
    "parquet",
    data_files={"preprocessed_data/arz_Arab/arz_wura_documentLevel.parquet"},
    split="train",
    streaming=True
)

print("Available columns:", dataset.features)
print("First example:", next(iter(dataset)))


Available columns: {'text': Value(dtype='string', id=None), 'hyperparam': {'dataset': Value(dtype='string', id=None), 'id': Value(dtype='string', id=None), 'language': Value(dtype='string', id=None)}, 'dataset_origin': Value(dtype='string', id=None)}
First example: {'text': 'طائر المغرد البيروفى طائر المغرد البيروفى\n\nطائر المغرد البيروفى ( الاسم العلمى: Hypocnemis peruviana ) هوا نوع من الطيور بيتبع هيپوكنيميس.', 'hyperparam': {'dataset': 'wura', 'id': 'wura_arz_Arab_0', 'language': 'arz_Arab'}, 'dataset_origin': 'wuraDocumentLevel'}


# Create My Huggingface Dataset

In [22]:
# Download dataset

from datasets import load_dataset, get_dataset_config_names
configs = get_dataset_config_names("Tiany1/multilingualPretrainDataset")

print("Available languages:", configs)


ds_aeb = load_dataset("Tiany1/multilingualPretrainDataset", "aeb_Arab")

print(ds_aeb)

print(ds_aeb["fineweb2"][0])        # first record from the fineweb2 split

Available languages: ['aeb_Arab', 'afr_Latn', 'aka_Latn', 'amh_Ethi', 'arb_Arab', 'ary_Arab', 'arz_Arab', 'bam_Latn', 'bem_Latn', 'cjk_Latn', 'dik_Latn', 'dyu_Latn', 'eng_Latn', 'ewe_Latn', 'fon_Latn', 'fra_Latn', 'fuv_Latn', 'gaz_Latn', 'hau_Latn', 'ibo_Latn', 'kab_Latn', 'kam_Latn', 'kbp_Latn', 'kea_Latn', 'kik_Latn', 'kin_Latn', 'kmb_Latn', 'knc_Arab', 'knc_Latn', 'kon_Latn', 'lin_Latn', 'lua_Latn', 'lug_Latn', 'luo_Latn', 'mos_Latn', 'nqo_Nkoo', 'nso_Latn', 'nus_Latn', 'nya_Latn', 'plt_Latn', 'por_Latn', 'run_Latn', 'sag_Latn', 'sna_Latn', 'som_Latn', 'sot_Latn', 'ssw_Latn', 'swh_Latn', 'taq_Latn', 'taq_Tfng', 'tir_Ethi', 'tsn_Latn', 'tso_Latn', 'tum_Latn', 'twi_Latn', 'tzm_Tfng', 'umb_Latn', 'wol_Latn', 'xho_Latn', 'yor_Latn', 'zul_Latn']


Generating fineweb2 split: 262884 examples [00:00, 309567.86 examples/s]

DatasetDict({
    fineweb2: Dataset({
        features: ['text', 'hyperparam', 'dataset_origin'],
        num_rows: 262884
    })
})
{'text': 'هم كثير مشاء الله\nمن السابقون الشيخ عبد الباسط رحمة الله و الشيخ البناء عليه رحمة الله و السيخ الحصري ادخلة الله فسيح جناته\nاما من الاحياء اطال الله اعمارهم الشيخ مشاري العفاسي و الشيخ شيخ ابو بكر الشاطري و انا اعشق صوت الشيخ عبد الرشيد صوفي\nعليهم جميعاً رجمة الله وبركاته وجمعنا الله واياهم في الجنان مع رسول الله و الصحابة و الشهداء\nامين امين امين', 'hyperparam': {'dataset': 'fw2', 'date': '2013-06-20T06:03:56Z', 'dump': 'CC-MAIN-2013-20', 'file_path': 's3://commoncrawl/crawl-data/CC-MAIN-2013-20/segments/1368710366143/warc/CC-MAIN-20130516131926-00095-ip-10-60-113-184.ec2.internal.warc.gz', 'id': '<urn:uuid:ab7fab11-f6e8-4919-a6e8-8b62a2a32938>', 'language': 'aeb_Arab', 'language_score': 0.4684912860393524, 'language_script': 'Arab', 'minhash_cluster_size': 2, 'top_langs': '{"aeb_Arab_score": 0.4684912860393524, "ars_Arab_score": 0.17445436




# Command for Dataset Loading

In [23]:
# Loading a single language

from datasets import load_dataset, get_dataset_config_names
configs = get_dataset_config_names("Tiany1/multilingualPretrainDataset")

print("Available languages:", configs)

ds_aeb = load_dataset("Tiany1/multilingualPretrainDataset", "amh_Ethi")

print(ds_aeb)

print(ds_aeb["wura_passageLevel"][0])        # first record from the wurapassage level split

print(ds_aeb["fineweb2"][0])        # first record from the fineweb2 split

Available languages: ['aeb_Arab', 'afr_Latn', 'aka_Latn', 'amh_Ethi', 'arb_Arab', 'ary_Arab', 'arz_Arab', 'bam_Latn', 'bem_Latn', 'cjk_Latn', 'dik_Latn', 'dyu_Latn', 'eng_Latn', 'ewe_Latn', 'fon_Latn', 'fra_Latn', 'fuv_Latn', 'gaz_Latn', 'hau_Latn', 'ibo_Latn', 'kab_Latn', 'kam_Latn', 'kbp_Latn', 'kea_Latn', 'kik_Latn', 'kin_Latn', 'kmb_Latn', 'knc_Arab', 'knc_Latn', 'kon_Latn', 'lin_Latn', 'lua_Latn', 'lug_Latn', 'luo_Latn', 'mos_Latn', 'nqo_Nkoo', 'nso_Latn', 'nus_Latn', 'nya_Latn', 'plt_Latn', 'por_Latn', 'run_Latn', 'sag_Latn', 'sna_Latn', 'som_Latn', 'sot_Latn', 'ssw_Latn', 'swh_Latn', 'taq_Latn', 'taq_Tfng', 'tir_Ethi', 'tsn_Latn', 'tso_Latn', 'tum_Latn', 'twi_Latn', 'tzm_Tfng', 'umb_Latn', 'wol_Latn', 'xho_Latn', 'yor_Latn', 'zul_Latn']


Generating fineweb2 split: 280355 examples [00:02, 114262.66 examples/s]
Generating madlad400 split: 106301 examples [00:01, 56247.37 examples/s]
Generating wura_passageLevel split: 291026 examples [00:00, 299292.19 examples/s]
Generating wura_documentLevel split: 135863 examples [00:01, 85321.96 examples/s] 


DatasetDict({
    fineweb2: Dataset({
        features: ['text', 'hyperparam', 'dataset_origin'],
        num_rows: 280355
    })
    madlad400: Dataset({
        features: ['text', 'hyperparam', 'dataset_origin'],
        num_rows: 106301
    })
    wura_passageLevel: Dataset({
        features: ['text', 'hyperparam', 'dataset_origin'],
        num_rows: 291026
    })
    wura_documentLevel: Dataset({
        features: ['text', 'hyperparam', 'dataset_origin'],
        num_rows: 135863
    })
})
{'text': 'ህጻን ልጃቸውን ለ7 ዓመታት የቆለፉባት ጀርመናውያን ቤተሰቦች ምርመራ ተከፈተባቸው በጀርመን የስምንት ዓመት ህፃን ልጅ ላይ ለአመታት ከቤት እንዳትወጣ የቆለፉት እናት እና አያቶች ምርመራ ተከፈተባቸው። የጀርመን አቃቤ ህግ ህጻኗ ለሰባት ዓመታት ያህል በቤት ውስጥ ተዘግቶባት እንደነበር አስታውቋል። በመጨረሻም በያዝነው ዓመት መስከረም መገባደጃ ላይ ነፃ እንደወጣችና ለማደጎም ተሰጥታለች ተብሏል። በህጻናት ደህንነት ላይ የሚሰሩ ባለስልጣናት ቀላል የሚባሉ እንደ ደረጃም መውጣትም ሆነ የዕለት ተዕለት ተግባራትን ለማከናወን እንደምትቸገር ተናግረዋል። የጀርመን ሚዲያዎች ውጭ ወጥታ እንደማታውቅና ጫካም ሆነ ሜዳ አይታም ሆነ ምንነቱንም እንደማታውቅ ዘግበዋል። እናቷ ኑሯችንን በጣሊያን አድርገናል በሚል ለባለስልጣናቱ እንደዋሸች አስታውቀዋል ። ቤተሰቦቿ ነዋሪነታቸው በምዕራብ ጀር

In [24]:
# Load a single langauge with specified splits

from datasets import load_dataset

# only load the fineweb2 split for English-Latin
fw2_eng = load_dataset(
    "Tiany1/multilingualPretrainDataset",
    name="eng_Latn",
    split="fineweb2"
)
print(fw2_eng)

Generating wura_passageLevel split: 2336199 examples [00:03, 682650.62 examples/s]
Generating wura_documentLevel split: 1378555 examples [00:04, 328715.99 examples/s]


ValueError: Unknown split "fineweb2". Should be one of ['wura_passageLevel', 'wura_documentLevel'].

In [None]:
# Load multiple languages (all splits) in a loop

from datasets import load_dataset

langs = ["eng_Latn", "fra_Latn", "por_Latn"]
all_ds = {}

for lang in langs:
    all_ds[lang] = load_dataset("Tiany1/multilingualPretrainDataset", name=lang)

# now all_ds["fra_Latn"]["wura_passageLevel"] etc. are available


## Creating the Dataset_Type hyperparam 

This is to keep track and load exactly the dataset type I want in the future

In [None]:
#!/usr/bin/env python3
import logging
from pathlib import Path
import pyarrow as pa
import pyarrow.parquet as pq

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s"
)
logger = logging.getLogger(__name__)

def infer_origin(stem: str) -> str | None:
    stem = stem.lower()
    if "fw2" in stem:
        return "fw2"
    if "documentlevel" in stem:
        return "wuraDocumentLevel"
    if "fwedu" in stem:
        return "fwedu"
    if "ml400" in stem:
        return "madlad400"
    if "wura" in stem:
        return "wura"
    if "extra" in stem:
        return "extra"
    return None

def process_file_in_chunks(pq_path: Path, origin: str, batch_size: int = 50_000):
    """
    Read pq_path in batches, add a constant `dataset_origin` column,
    and overwrite the file in a memory-safe way.
    """
    # Open source file
    parquet_file = pq.ParquetFile(pq_path)

    # Extend schema with new string column
    new_schema = parquet_file.schema_arrow.append(
        pa.field("dataset_origin", pa.string())
    )

    tmp_path = pq_path.with_suffix(".tmp.parquet")
    writer = pq.ParquetWriter(tmp_path, new_schema, compression="snappy")

    # Stream through row‐groups / batches
    for batch in parquet_file.iter_batches(batch_size=batch_size):
        table = pa.Table.from_batches([batch], schema=parquet_file.schema_arrow)
        # constant column with `origin`
        origin_col = pa.array([origin] * table.num_rows)
        table = table.append_column("dataset_origin", origin_col)
        writer.write_table(table)

    writer.close()
    tmp_path.replace(pq_path)


def add_origin_column(root_dir: str = "preprocessed_data"):
    root = Path(root_dir)
    if not root.is_dir():
        logger.error(f"Root directory {root_dir!r} does not exist or is not a folder.")
        return

    for lang_dir in root.iterdir():
        if not lang_dir.is_dir():
            continue

        logger.info(f"Scanning language folder: {lang_dir.name}")
        for pq_file in lang_dir.glob("*.parquet"):
            # Detect existing column
            try:
                schema = pq.ParquetFile(pq_file).schema_arrow
                if "dataset_origin" in schema.names:
                    logger.info(f"{pq_file.name} already has dataset_origin; skipping")
                    continue
            except Exception as e:
                logger.warning(f"Could not read schema for {pq_file.name}: {e}")
                continue

            origin = infer_origin(pq_file.stem)
            if origin is None:
                logger.warning(f"Could not infer origin for {pq_file.name}; skipping")
                continue

            logger.info(f"Processing {pq_file.name} in chunks (origin={origin})")
            try:
                process_file_in_chunks(pq_file, origin)
                logger.info(f"✅ Updated {pq_file.name}")
            except Exception as e:
                logger.error(f"Failed to process {pq_file.name}: {e}")

if __name__ == '__main__':
    add_origin_column("preprocessed_data/eng_Latn")


2025-05-14 12:00:04,552 - INFO - Scanning language folder: eng_Latn
2025-05-14 12:00:04,554 - INFO - eng_Latn_0008_fwEdu.parquet already has dataset_origin; skipping
2025-05-14 12:00:04,555 - INFO - eng_Latn_0001_fwEdu.parquet already has dataset_origin; skipping
2025-05-14 12:00:04,556 - INFO - eng_Latn_0005_fwEdu.parquet already has dataset_origin; skipping
2025-05-14 12:00:10,452 - INFO - eng_Latn_0002_fwEdu.parquet already has dataset_origin; skipping
2025-05-14 12:00:10,457 - INFO - eng_Latn_0006_fwEdu.parquet already has dataset_origin; skipping
2025-05-14 12:00:10,459 - INFO - eng_wura.parquet already has dataset_origin; skipping
2025-05-14 12:00:10,460 - INFO - eng_Latn_0010_fwEdu.parquet already has dataset_origin; skipping
2025-05-14 12:00:10,462 - INFO - eng_Latn_0007_fwEdu.parquet already has dataset_origin; skipping
2025-05-14 12:00:10,463 - INFO - eng_Latn_0003_fwEdu.parquet already has dataset_origin; skipping
2025-05-14 12:00:10,464 - INFO - eng_Latn_0004_fwEdu.parquet 

In [31]:
dataset = load_dataset(
    "parquet",
    data_files={"preprocessed_data/fon_Latn/fon_Latn_0001_fw2.parquet"},
    split="train",
    streaming=True
)

print("Available columns:", dataset.features)

print(next(iter(dataset)))


Available columns: {'text': Value(dtype='string', id=None), 'hyperparam': {'dataset': Value(dtype='string', id=None), 'date': Value(dtype='string', id=None), 'dump': Value(dtype='string', id=None), 'file_path': Value(dtype='string', id=None), 'id': Value(dtype='string', id=None), 'language': Value(dtype='string', id=None), 'language_score': Value(dtype='float64', id=None), 'language_script': Value(dtype='string', id=None), 'minhash_cluster_size': Value(dtype='int64', id=None), 'top_langs': Value(dtype='string', id=None), 'url': Value(dtype='string', id=None)}, 'dataset_origin': Value(dtype='string', id=None)}
{'text': 'Wěmàxòkplé sùpípù tɔn\nSùpípù ɔ tìtómɛ nù jíjlá tɔn ɖê minirézo kéjé bó lɛ vlɔnkán nú bíbló gblògblòjí uZine tɔn ɔ. Mì sɔ kɛ mí ɖò àcɛ wàlɔ GPl tɔn mɛ. Mī zé bò zán bó dó bló gblògblòjí lɛ̂ ná : é síwú nyí mì tɔn ɖé sú kàbí gbɛtá ɖé tɔn, kàbí tòxóɖɔgbɛ ɖé tɔn, àló àjɔwíwá wú.\nGblògblòjí é lɔ̂, é wɛ nyí Wěmàxòkplé sùpípù tɔn ɖò tájí. Mí ná lɛ mɔ àlɔwlí gégé ɖê ná ɖɔn mî 

# Double check for bad files

In [33]:
from pathlib import Path
import pyarrow.parquet as pq

def find_bad_parquet_files(preproc_root: str = "preprocessed_data") -> list[str]:
    """
    Scan all .parquet files under preproc_root and return a list of paths
    that are either empty or that PyArrow rejects as invalid Parquet.
    """
    bad_files = []
    root = Path(preproc_root)
    for pq_path in root.rglob("*.parquet"):
        # 1) skip empty files
        if pq_path.stat().st_size == 0:
            bad_files.append(str(pq_path) + "  (empty file)")
            continue

        # 2) try to read metadata
        try:
            _ = pq.ParquetFile(pq_path).metadata
        except Exception as e:
            bad_files.append(f"{pq_path}  ({type(e).__name__}: {e})")

    return bad_files

if __name__ == "__main__":
    bad = find_bad_parquet_files("preprocessed_data")
    if not bad:
        print("✅ All .parquet files look valid.")
    else:
        print("❌ Found invalid parquet files:")
        for entry in bad:
            print("   -", entry)


❌ Found invalid parquet files:
   - preprocessed_data/por_Latn/por_Latn_0004_fw2.parquet  (TimeoutError: [Errno 60] Error reading bytes from file. Detail: [errno 60] Operation timed out)
   - preprocessed_data/por_Latn/por_Latn_0011_fw2.parquet  (TimeoutError: [Errno 60] Error reading bytes from file. Detail: [errno 60] Operation timed out)
   - preprocessed_data/por_Latn/por_Latn_0002_fw2.parquet  (TimeoutError: [Errno 60] Error reading bytes from file. Detail: [errno 60] Operation timed out)
   - preprocessed_data/por_Latn/por_Latn_0009_fw2.parquet  (TimeoutError: [Errno 60] Error reading bytes from file. Detail: [errno 60] Operation timed out)
   - preprocessed_data/fra_Latn/fra_Latn_0009_fw2.parquet  (TimeoutError: [Errno 60] Error reading bytes from file. Detail: [errno 60] Operation timed out)
   - preprocessed_data/fra_Latn/fra_Latn_0007_fw2.parquet  (TimeoutError: [Errno 60] Error reading bytes from file. Detail: [errno 60] Operation timed out)
   - preprocessed_data/afr_Latn/a

# Create the Huggingface Dataset