In [None]:
!pip -q install "transformers==4.48.0" camel-tools

In [None]:
!pip install -U huggingface_hub datasets

In [None]:
# Download the CAMeL Tools "disambig-bert-unfactored-msa" data package.
# NOTE(review): presumably this provides the "msa" model that
# BERTUnfactoredDisambiguator.pretrained(model_name="msa") loads later — confirm.
!camel_data -i disambig-bert-unfactored-msa

In [None]:
!pip install rich

In [None]:
# Imported in this cell because it runs before the notebook's main import cell;
# without it a fresh kernel ("Restart & Run All") fails here with NameError.
from datasets import load_dataset

# Fetch the sentence-level BAREC shared-task dataset from the Hugging Face Hub.
BAREC_dataSet = load_dataset("CAMeL-Lab/BAREC-Shared-Task-2025-sent")

In [6]:
# Display the loaded DatasetDict to check its splits, columns and row counts.
BAREC_dataSet

DatasetDict({
    train: Dataset({
        features: ['ID', 'Sentence', 'Word_Count', 'Readability_Level', 'Readability_Level_19', 'Readability_Level_7', 'Readability_Level_5', 'Readability_Level_3', 'Annotator', 'Document', 'Source', 'Book', 'Author', 'Domain', 'Text_Class'],
        num_rows: 54845
    })
    validation: Dataset({
        features: ['ID', 'Sentence', 'Word_Count', 'Readability_Level', 'Readability_Level_19', 'Readability_Level_7', 'Readability_Level_5', 'Readability_Level_3', 'Annotator', 'Document', 'Source', 'Book', 'Author', 'Domain', 'Text_Class'],
        num_rows: 7310
    })
    test: Dataset({
        features: ['ID', 'Sentence', 'Word_Count', 'Readability_Level', 'Readability_Level_19', 'Readability_Level_7', 'Readability_Level_5', 'Readability_Level_3', 'Annotator', 'Document', 'Source', 'Book', 'Author', 'Domain', 'Text_Class'],
        num_rows: 7286
    })
})

In [None]:
from datasets import load_dataset
import os, re
import numpy as np
import pandas as pd
import torch
from datasets import load_dataset, Dataset, DatasetDict, load_from_disk

from transformers import (AutoTokenizer, AutoConfig, AutoModelForSequenceClassification,
                          TrainingArguments, Trainer, DataCollatorWithPadding,
                          logging as hf_logging)
from sklearn.metrics import (mean_absolute_error, mean_squared_error, r2_score,
                             cohen_kappa_score, accuracy_score)
hf_logging.set_verbosity_error()

from camel_tools.utils.charmap import CharMapper
from camel_tools.utils.transliterate import Transliterator
from camel_tools.tokenizers.word import simple_word_tokenize#no
from camel_tools.disambig.bert import BERTUnfactoredDisambiguator
from camel_tools.morphology.database import MorphologyDB
from camel_tools.morphology.analyzer import Analyzer
from camel_tools.utils.dediac import dediac_ar #no
from functools import lru_cache


In [None]:
# Optional path to a morphology database; None keeps the analyzer that comes
# with the pretrained disambiguator.
DB_PATH = None


@lru_cache(maxsize=1)
def _load_disambiguator(db_path):
    """Return the CAMeL Tools BERT unfactored disambiguator for the 'msa' model.

    The result is memoised with ``lru_cache(maxsize=1)``, so repeated calls with
    the same ``db_path`` reuse a single instance.

    Args:
        db_path: Path of a morphology database to analyse with, or a falsy value
            (e.g. ``None``) to leave the pretrained disambiguator's analyzer as is.

    Returns:
        The ``BERTUnfactoredDisambiguator`` instance. When ``db_path`` is truthy,
        its ``_analyzer`` attribute is replaced by an ``Analyzer`` built from
        that database (cache of 100,000 entries, "ADD_PROP" backoff).
    """
    custom_analyzer = None
    if db_path:
        morphology_db = MorphologyDB(db_path, "a")
        custom_analyzer = Analyzer(morphology_db, cache_size=100_000, backoff="ADD_PROP")

    disambiguator = BERTUnfactoredDisambiguator.pretrained(
        model_name="msa", pretrained_cache=False, top=1
    )
    if custom_analyzer is not None:
        # Swap in the analyzer backed by the user-supplied database.
        disambiguator._analyzer = custom_analyzer
    return disambiguator

@lru_cache(maxsize=1)
def _arclean_transliterator():
    """Build the character-map Transliterator once and reuse it on later calls."""
    # NOTE(review): the path is relative to the working directory; CAMeL Tools
    # also offers CharMapper.builtin_mapper("arclean") — confirm which map is intended.
    return Transliterator(CharMapper.mapper_from_json("arclean_map.json"))


def clean(text):
    """Normalize Arabic text with the arclean character map.

    After the character mapping, every alif maqsura (ى) that sits between two
    word characters is replaced by ya (ي); word-initial and word-final
    occurrences are left untouched.

    The mapper is loaded from "arclean_map.json" on the first call only, instead
    of re-reading the JSON file for every sentence.

    Args:
        text: The string to normalize.

    Returns:
        The normalized string.
    """
    mapped = _arclean_transliterator().transliterate(text)
    # \B on both sides means "not at a word boundary", i.e. strictly inside a word.
    return re.sub(r'(?<=\B)ى(?=\B)', 'ي', mapped)

def diacritic_coverage(txt):
    """Return the fraction of characters in ``txt`` that are Arabic diacritics.

    Diacritics are the code points U+064B through U+0652. An empty string
    yields 0.0.
    """
    total = len(txt)
    if total == 0:
        return 0.0
    marks = re.findall(r"[\u064B-\u0652]", txt)
    return len(marks) / total

def word_len_stats(word_variant):
    """Return ``(mean, std)`` of the lengths of the whitespace-separated tokens.

    A string without any token gives ``(0.0, 0.0)``; otherwise both values are
    computed on a float32 array of the token lengths.
    """
    tokens = word_variant.split()
    if len(tokens) == 0:
        return 0.0, 0.0
    token_lengths = np.array(list(map(len, tokens)), dtype=np.float32)
    return np.mean(token_lengths), np.std(token_lengths)

In [9]:
def produce_variants(batch):
    """Add 'Word', 'D3Tok' and 'D3Lex' string columns to a batch of sentences.

    Each sentence in ``batch["Sentence"]`` is normalized with ``clean``, split
    with ``simple_word_tokenize(..., split_digits=True)`` and disambiguated.
    For every token, the first entry of its ``analyses`` list supplies the
    ``lex`` and ``d3tok`` values, both de-diacritized with ``dediac_ar``.

    * Word  - the tokens joined by single spaces.
    * D3Tok - the ``d3tok`` value with "_+" / "+_" turned into " +" / "+ ",
      so clitics become separate space-delimited segments.
    * D3Lex - like D3Tok, except that every segment that is not a clitic (it has
      no "+", or is exactly "+") is replaced by the token's ``lex`` value.

    Args:
        batch: Batched mapping with a "Sentence" list, as passed by
            ``Dataset.map(..., batched=True)``.

    Returns:
        The same ``batch`` object with the three new lists added.
    """
    disambiguator = _load_disambiguator(DB_PATH)

    tokenized = [
        simple_word_tokenize(clean(sentence), split_digits=True)
        for sentence in batch["Sentence"]
    ]
    word_sentences = [" ".join(tokens) for tokens in tokenized]

    d3tok_sentences, d3lex_sentences = [], []
    for sentence_analysis in disambiguator.disambiguate_sentences(tokenized):
        d3tok_words, d3lex_words = [], []
        for item in sentence_analysis:
            # analyses[0] is the first-listed analysis; index 1 holds its feature dict.
            analysis = item.analyses[0][1]
            lex_word = dediac_ar(analysis["lex"])
            d3tok_word = (
                dediac_ar(analysis["d3tok"]).replace("_+", " +").replace("+_", "+ ")
            )
            d3tok_words.append(d3tok_word)
            # Keep clitic segments (those carrying a "+" marker) as they are and
            # substitute the lex value for every other segment.
            d3lex_words.append(" ".join(
                segment if ("+" in segment and segment != "+") else lex_word
                for segment in d3tok_word.split(" ")
            ))
        d3tok_sentences.append(" ".join(d3tok_words))
        d3lex_sentences.append(" ".join(d3lex_words))

    batch["Word"] = word_sentences
    batch["D3Tok"] = d3tok_sentences
    batch["D3Lex"] = d3lex_sentences
    return batch

In [None]:
# Apply produce_variants to every split, 256 sentences per batch, in one process.
BAREC_dataSet_prepro = BAREC_dataSet.map(
    produce_variants,
    batched=True,
    batch_size=256,
    num_proc=1,
)

In [None]:
# Display the preprocessed DatasetDict; each split should now also list the
# 'Word', 'D3Tok' and 'D3Lex' columns.
BAREC_dataSet_prepro

DatasetDict({
    train: Dataset({
        features: ['ID', 'Sentence', 'Word_Count', 'Readability_Level', 'Readability_Level_19', 'Readability_Level_7', 'Readability_Level_5', 'Readability_Level_3', 'Annotator', 'Document', 'Source', 'Book', 'Author', 'Domain', 'Text_Class', 'Word', 'D3Tok', 'D3Lex'],
        num_rows: 54845
    })
    validation: Dataset({
        features: ['ID', 'Sentence', 'Word_Count', 'Readability_Level', 'Readability_Level_19', 'Readability_Level_7', 'Readability_Level_5', 'Readability_Level_3', 'Annotator', 'Document', 'Source', 'Book', 'Author', 'Domain', 'Text_Class', 'Word', 'D3Tok', 'D3Lex'],
        num_rows: 7310
    })
    test: Dataset({
        features: ['ID', 'Sentence', 'Word_Count', 'Readability_Level', 'Readability_Level_19', 'Readability_Level_7', 'Readability_Level_5', 'Readability_Level_3', 'Annotator', 'Document', 'Source', 'Book', 'Author', 'Domain', 'Text_Class', 'Word', 'D3Tok', 'D3Lex'],
        num_rows: 7286
    })
})

In [14]:
# Spot-check one preprocessed training row (index 700) to eyeball the
# Word / D3Tok / D3Lex variants next to the original sentence.
BAREC_dataSet_prepro["train"][700]

{'ID': 10100400030,
 'Sentence': 'كان يشعر بألم شديد،',
 'Word_Count': 5,
 'Readability_Level': '10-ya',
 'Readability_Level_19': 10,
 'Readability_Level_7': 4,
 'Readability_Level_5': 2,
 'Readability_Level_3': 1,
 'Annotator': 'A5',
 'Document': 'BAREC_Majed_0413_1987_012.txt',
 'Source': 'Majed',
 'Book': 'Edition: 413',
 'Author': '#',
 'Domain': 'Arts & Humanities',
 'Text_Class': 'Foundational',
 'Word': 'كان يشعر بألم شديد ,',
 'D3Tok': 'كان يشعر ب+ ألم شديد ,',
 'D3Lex': 'كان شعر ب+ ألم شديد ,'}

In [15]:
def build_split(ds, split_name: str):
    """Turn one split into fused text inputs and float readability labels.

    Every example becomes a single " [SEP] "-delimited string made of:
      1. a tag header - metadata tags [WC], [ANN], [DOC], [BOOK], [AUTH], [DOM],
         [TC] followed by the surface indicators [DC] (diacritic coverage of the
         sentence), [WLA] and [WLS] (mean / std of token length of the Word
         variant), each printed with three decimals;
      2. the Sentence, Word, D3Tok and D3Lex strings, in that order.

    Args:
        ds: Mapping from split name to an iterable of example dicts.
        split_name: Key of the split to convert.

    Returns:
        ``{"text": [...], "labels": [...]}`` where each label is
        ``float(example["Readability_Level_19"])``.
    """
    texts, targets = [], []
    for example in ds[split_name]:
        diac_ratio = diacritic_coverage(example["Sentence"])
        mean_len, std_len = word_len_stats(example["Word"])

        # Space-separated tag header; the tag order is part of the model input.
        header = " ".join([
            f"[WC]{example['Word_Count']}",
            f"[ANN]{example['Annotator']}",
            f"[DOC]{example['Document']}",
            f"[BOOK]{example['Book']}",
            f"[AUTH]{example['Author']}",
            f"[DOM]{example['Domain']}",
            f"[TC]{example['Text_Class']}",
            f"[DC]{diac_ratio:.3f}",
            f"[WLA]{mean_len:.3f}",
            f"[WLS]{std_len:.3f}",
        ])

        variants = [example[column] for column in ("Sentence", "Word", "D3Tok", "D3Lex")]
        texts.append(" [SEP] ".join([header, *variants]))
        targets.append(float(example["Readability_Level_19"]))
    return {"text": texts, "labels": targets}


In [16]:
# Convert each split into a two-column ("text", "labels") Dataset via build_split.
BAREC_dataSet_prepro_ = DatasetDict(
    {
        split_name: Dataset.from_dict(build_split(BAREC_dataSet_prepro, split_name))
        for split_name in ("train", "validation", "test")
    }
)

In [20]:
# Display the fused DatasetDict; every split should expose only 'text' and 'labels'.
BAREC_dataSet_prepro_

DatasetDict({
    train: Dataset({
        features: ['text', 'labels'],
        num_rows: 54845
    })
    validation: Dataset({
        features: ['text', 'labels'],
        num_rows: 7310
    })
    test: Dataset({
        features: ['text', 'labels'],
        num_rows: 7286
    })
})

In [19]:
# Spot-check the fused text and label for one training row (index 700).
BAREC_dataSet_prepro_["train"][700]

{'text': '[WC]5 [ANN]A5 [DOC]BAREC_Majed_0413_1987_012.txt [BOOK]Edition: 413 [AUTH]# [DOM]Arts & Humanities [TC]Foundational [DC]0.000 [WLA]3.200 [WLS]1.166 [SEP] كان يشعر بألم شديد، [SEP] كان يشعر بألم شديد , [SEP] كان يشعر ب+ ألم شديد , [SEP] كان شعر ب+ ألم شديد ,',
 'labels': 10.0}