In [56]:
import os
os.environ["HF_HUB_DISABLE_SYMLINKS_WARNING"] = "1"
import torch
device = 0 if torch.cuda.is_available() else -1  # 0 = first GPU, -1 = CPU for speed
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
from arabert.preprocess import ArabertPreprocessor #preprocess text
from dotenv import load_dotenv
import re
from nltk.tokenize import sent_tokenize
from collections import Counter


load_dotenv()  # Loads HF_TOKEN automatically

True

In [57]:
from transformers import pipeline
pos = pipeline('token-classification', model='CAMeL-Lab/bert-base-arabic-camelbert-da-pos-msa')
text = 'إمارة أبوظبي هي إحدى إمارات دولة الإمارات العربية المتحدة السبع'
pos(text)



ValueError: Your currently installed version of Keras is Keras 3, but this is not yet supported in Transformers. Please install the backwards-compatible tf-keras package with `pip install tf-keras`.

In [51]:
def pre_process(extracted_text: str):
    """
    Preprocess text using AraBERT preprocessor.
    Returns cleaned text only for Formal arabic text only.
    """
    cleaned_text = arabert_prep.preprocess(extracted_text)
    return cleaned_text


In [9]:
def make_quiz(processed_data, quiz_level="easy"):
    """
    Generate quiz questions (True/False + MCQ) based on difficulty level.
    """
    entities = processed_data["entities"]
    questions = []

    # Example: build simple True/False questions from entities
    for e in entities[:5]:  # limit for demo
        q = {
            "question": f"الكلمة '{e['word']}' هي {e['entity']}؟",
            "type": "TF",
            "correct_answer": "صح"
        }
        questions.append(q)

    # TODO: Add MCQ generation and difficulty adjustment
    if quiz_level == "medium":
        pass
    elif quiz_level == "hard":
        pass

    return questions


In [52]:

# Initialize AraBERT preprocessor & tokenizer
arabert_model_name = "aubmindlab/bert-base-arabertv2"
arabert_prep = ArabertPreprocessor(model_name=arabert_model_name)
tokenizer = AutoTokenizer.from_pretrained(arabert_model_name)

def make_flashCards(cleaned_text: str, max_flashcards: int = 10):
    """
    Generate flashcards using AraBERT tokenizer.
    Each token (heuristically considered a noun) becomes a term.
    Definition = sentence containing the token.
    """
    sentences = sent_tokenize(cleaned_text)
    flashcards = []
    added_terms = set()

    for sentence in sentences:
        # Preprocess & tokenize sentence
        cleaned_sentence = arabert_prep.preprocess(sentence)
        tokens = tokenizer.tokenize(cleaned_sentence)

        # Simple heuristic: tokens longer than 2 chars and not already added
        for token in tokens:
            if len(token) > 2 and token not in added_terms:
                flashcards.append({
                    "term": token,
                    "definition": sentence.strip()
                })
                added_terms.add(token)

            if len(flashcards) >= max_flashcards:
                break
        if len(flashcards) >= max_flashcards:
            break

    return flashcards




100%|██████████████████████████████████████████████████████████████████████████████| 241M/241M [01:08<00:00, 3.51MiB/s]




In [11]:
if __name__ == "__main__":
    text = "ولد محمد علي في القاهرة وعمل في شركة مايكروسوفت."
    processed = generator(text)

    print("\n--- Flashcards ---")
    for f in make_flashCards(processed):
        print(f)

    print("\n--- Quiz ---")
    for q in make_quiz(processed, quiz_level="easy"):
        print(q)


--- Flashcards ---
{'term': 'محمد', 'definition': 'محمد هو B-PER'}
{'term': 'علي', 'definition': 'علي هو I-PER'}
{'term': 'القاهرة', 'definition': 'القاهرة هو B-LOC'}
{'term': 'مايكروسوفت', 'definition': 'مايكروسوفت هو B-ORG'}

--- Quiz ---
{'question': "الكلمة 'محمد' هي B-PER؟", 'type': 'TF', 'correct_answer': 'صح'}
{'question': "الكلمة 'علي' هي I-PER؟", 'type': 'TF', 'correct_answer': 'صح'}
{'question': "الكلمة 'القاهرة' هي B-LOC؟", 'type': 'TF', 'correct_answer': 'صح'}
{'question': "الكلمة 'مايكروسوفت' هي B-ORG؟", 'type': 'TF', 'correct_answer': 'صح'}


In [53]:
# Example text
text = """
الخوارزمية هي مجموعة من الخطوات المحددة لحل مشكلة أو تنفيذ مهمة.
ولد محمد علي في القاهرة وعمل في شركة مايكروسوفت.
"""

# Step 1: Preprocess text
cleaned_text = pre_process(text)

# Step 2: Generate flashcards
flashcards = make_flashCards(cleaned_text)

# Step 3: Print results
print("--- Flashcards ---")
for f in flashcards:
    print(f"Term: {f['term']}")
    print(f"Definition: {f['definition']}")
    print("--------------------")


--- Flashcards ---
Term: خوارزمي
Definition: ال+ خوارزمي +ة هي مجموع +ة من ال+ خطو +ات ال+ محدد +ة ل+ حل مشكل +ة أو تنفيذ مهم +ة .
--------------------
Term: مجموع
Definition: ال+ خوارزمي +ة هي مجموع +ة من ال+ خطو +ات ال+ محدد +ة ل+ حل مشكل +ة أو تنفيذ مهم +ة .
--------------------
Term: خطو
Definition: ال+ خوارزمي +ة هي مجموع +ة من ال+ خطو +ات ال+ محدد +ة ل+ حل مشكل +ة أو تنفيذ مهم +ة .
--------------------
Term: محدد
Definition: ال+ خوارزمي +ة هي مجموع +ة من ال+ خطو +ات ال+ محدد +ة ل+ حل مشكل +ة أو تنفيذ مهم +ة .
--------------------
Term: مشكل
Definition: ال+ خوارزمي +ة هي مجموع +ة من ال+ خطو +ات ال+ محدد +ة ل+ حل مشكل +ة أو تنفيذ مهم +ة .
--------------------
Term: تنفيذ
Definition: ال+ خوارزمي +ة هي مجموع +ة من ال+ خطو +ات ال+ محدد +ة ل+ حل مشكل +ة أو تنفيذ مهم +ة .
--------------------
Term: مهم
Definition: ال+ خوارزمي +ة هي مجموع +ة من ال+ خطو +ات ال+ محدد +ة ل+ حل مشكل +ة أو تنفيذ مهم +ة .
--------------------
Term: ولد
Definition: ولد محمد علي في ال+ قاهر +ة و+ عمل في شرك +ة م

In [54]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

# NER model
ner_model_name = "CAMeL-Lab/bert-base-arabic-camelbert-mix-ner"
ner_tokenizer = AutoTokenizer.from_pretrained(ner_model_name)
ner_model = AutoModelForTokenClassification.from_pretrained(ner_model_name)
ner_pipeline = pipeline("ner", model=ner_model, tokenizer=ner_tokenizer, aggregation_strategy="simple")

# POS model
pos_model_name = "CAMeL-Lab/bert-base-arabic-camelbert-da-pos-msa"
pos_tokenizer = AutoTokenizer.from_pretrained(pos_model_name)
pos_model = AutoModelForTokenClassification.from_pretrained(pos_model_name)
pos_pipeline = pipeline("token-classification", model=pos_model, tokenizer=pos_tokenizer, aggregation_strategy="none")

# Example text
text = "ولد محمد علي في القاهرة وعمل في شركة مايكروسوفت."

# NER
ner_results = ner_pipeline(text)
print("NER results:", ner_results)

# POS
pos_results = pos_pipeline(text)
print("POS results:", pos_results)


Some weights of the model checkpoint at CAMeL-Lab/bert-base-arabic-camelbert-mix-ner were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cpu
Some weights of the model checkpoint at CAMeL-Lab/bert-base-arabic-camelbert-da-pos-msa were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoin

NER results: [{'entity_group': 'PERS', 'score': 0.981635, 'word': 'محمد علي', 'start': 4, 'end': 12}, {'entity_group': 'LOC', 'score': 0.9977779, 'word': 'القاهرة', 'start': 16, 'end': 23}, {'entity_group': 'ORG', 'score': 0.9847733, 'word': 'مايكروسوفت', 'start': 37, 'end': 47}]
POS results: [{'entity': 'verb', 'score': 0.99997437, 'index': 1, 'word': 'ولد', 'start': 0, 'end': 3}, {'entity': 'noun_prop', 'score': 0.99997926, 'index': 2, 'word': 'محمد', 'start': 4, 'end': 8}, {'entity': 'noun_prop', 'score': 0.99996734, 'index': 3, 'word': 'علي', 'start': 9, 'end': 12}, {'entity': 'prep', 'score': 0.999979, 'index': 4, 'word': 'في', 'start': 13, 'end': 15}, {'entity': 'noun_prop', 'score': 0.99998593, 'index': 5, 'word': 'القاهرة', 'start': 16, 'end': 23}, {'entity': 'verb', 'score': 0.9999715, 'index': 6, 'word': 'وعمل', 'start': 24, 'end': 28}, {'entity': 'prep', 'score': 0.99998415, 'index': 7, 'word': 'في', 'start': 29, 'end': 31}, {'entity': 'noun', 'score': 0.9999869, 'index': 8,