In [1]:
import os
os.environ["HF_HUB_DISABLE_SYMLINKS_WARNING"] = "1"
import torch
device = 0 if torch.cuda.is_available() else -1  # 0 = first GPU, -1 = CPU for speed
from arabert.preprocess import ArabertPreprocessor #preprocess text
from dotenv import load_dotenv
import re
from nltk.tokenize import sent_tokenize
import random
from transformers import pipeline


load_dotenv()  # Loads HF_TOKEN automatically

  from .autonotebook import tqdm as notebook_tqdm


True

In [34]:
#directly thro hugging face
from transformers import pipeline

text = "إمارة أبوظبي هي إحدى إمارات دولة الإمارات العربية المتحدة السبع"

# POS tagging
pos = pipeline(
    "token-classification",
    model="CAMeL-Lab/bert-base-arabic-camelbert-da-pos-msa",
    framework="pt",
    trust_remote_code=True  # VERY IMPORTANT
)

# NER
ner = pipeline(
    "ner",
    model="CAMeL-Lab/bert-base-arabic-camelbert-msa-ner",
    framework="pt",
    trust_remote_code=True  # VERY IMPORTANT
)


Some weights of the model checkpoint at CAMeL-Lab/bert-base-arabic-camelbert-da-pos-msa were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cpu
Some weights of the model checkpoint at CAMeL-Lab/bert-base-arabic-camelbert-msa-ner were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoin

In [35]:
def generate_question_text(entity, pos, word, sentence=None):
    """Generate natural Arabic question from entity + POS MQ."""
    
    # Person + adjective
    if entity == "PER" and pos == "adj":
        return f"من لديه {word}؟"
    
    # Person + proper noun
    if entity == "PER" and pos == "noun_prop":
        return f"من هو {word}؟"
    
    # Location + any noun
    if entity == "LOC":
        return f"أين يقع {word}؟"
    
    # Organization + adjective
    if entity == "ORG" and pos == "adj":
        return f"ما هي المنظمة التي وُصفت بأنها {word}؟"
    
    # Cloze/fill-in-the-blank
    if sentence:
        return f"أكمل الفراغ: {sentence.replace(word, '____')}"
    
    return f"صف الكلمة: {word}"


In [36]:
import random

def generate_TF_question(sentence, entity_word, entity_type, ner_tags):
    """
    Generate a True/False question by either keeping the entity (True)
    or replacing it with a realistic distractor of the same entity type (False).
    
    sentence: the original sentence
    entity_word: the correct entity in the sentence
    entity_type: NER type, e.g., "LOC", "PER", "ORG"
    ner_tags: output from ner() for the text

    the negiation doesnt seem realistic
    """
    
    # True statement
    true_statement = f"{sentence} (صح أم خطأ؟)"
    
    # Find other entities of the same type as distractors
    other_entities = [tag['word'] for tag in ner_tags 
                      if tag['entity'] == entity_type and tag['word'] != entity_word]
    
    # Pick one as distractor if exists, otherwise fallback
    distractor = random.choice(other_entities) if other_entities else entity_word + "X"
    
    # False statement: replace entity with distractor
    false_statement = sentence.replace(entity_word, distractor) + " (صح أم خطأ؟)"
    
    # Randomly choose True or False
    if random.choice([True, False]):
        return {"type": "TF", "statement": true_statement, "answer": True}
    else:
        return {"type": "TF", "statement": false_statement, "answer": False}


In [37]:
difficulty_settings = {
    "easy": {
        "num_questions": 5,
        "tf_ratio": 0.7,       # 70% TF, 30% MCQ
        "mcq_distractor_type": "simple"  # simple placeholder distractors
    },
    "medium": {
        "num_questions": 10,
        "tf_ratio": 0.5,       # balanced
        "mcq_distractor_type": "medium"  # some context-based distractors
    },
    "hard": {
        "num_questions": 15,
        "tf_ratio": 0.3,       # mostly MCQ
        "mcq_distractor_type": "challenging"  # semantically close distractors
    }
}


In [38]:
import random
from collections import defaultdict

def make_mq_options(entity_label, correct_entity, text, num_distractors=3):
    """
    Generate MCQ options for a given entity using the entire text.
    
    entity_label: NER type of the correct entity (e.g., "PER", "LOC")
    correct_entity: the correct answer
    text: full text (string)
    num_distractors: number of distractors to generate
    """
    # Run NER on the whole text once
    ner_tags = ner(text)  # your CAMeL BERT NER pipeline
    
    # Build a dictionary of entities by type
    entities_by_type = defaultdict(list)
    for tag in ner_tags:
        word = tag['word']
        label = tag['entity']
        if word != correct_entity:  # skip correct entity
            entities_by_type[label].append(word)
    
    # Get all possible distractors of the same type
    candidates = list(set(entities_by_type.get(entity_label, [])))
    
    # If not enough distractors, fill with placeholders
    while len(candidates) < num_distractors:
        candidates.append("خيار" + str(len(candidates)+1))
    
    # Randomly pick distractors
    distractors = random.sample(candidates, num_distractors)
    
    # Combine correct answer + distractors
    options = [correct_entity] + distractors
    random.shuffle(options)
    
    return options


In [39]:
def generate_MCQ_question(entity_word, sentence, entity_type="LOC", full_text=None):
    """
    Generate MCQ using entity as correct answer and distractors from the entire text.
    
    entity_word: correct answer
    sentence: current sentence (used to generate question text)
    entity_type: NER type of the entity
    full_text: full text to extract distractors from (required)
    """
    if full_text is None:
        raise ValueError("full_text must be provided to generate distractors.")
    
    # Generate the question text
    question_text = generate_question_text(entity_type, "noun", entity_word, sentence)
    
    # Generate distractors using the full text
    distractors = make_mq_options(entity_type, entity_word, full_text, num_distractors=3)
    
    # Combine correct answer + distractors and shuffle
    options = [entity_word] + distractors
    random.shuffle(options)
    
    return {"type": "MCQ", "question": question_text, "options": options, "answer": entity_word}


In [40]:
def make_quiz(text, level="medium"):
    settings = difficulty_settings.get(level, difficulty_settings["medium"])
    num_questions = settings["num_questions"]
    tf_ratio = settings["tf_ratio"]

    questions = []
    sentences = sent_tokenize(text)
    count = 0

    for sentence in sentences:
        if count >= num_questions:
            break
        
        ner_tags = ner(sentence)
        pos_tags = pos(sentence)
        
        if not ner_tags:
            continue
        
        main_entity = ner_tags[0]
        word = main_entity['word']
        entity_type = main_entity['entity']
        pos_tag = next((p['entity'] for p in pos_tags if p['word'] == word), "noun")

        # Decide question type based on tf_ratio
        if random.random() < tf_ratio:
            questions.append(generate_TF_question(sentence, word, entity_type, ner_tags))
        else:
            questions.append(generate_MCQ_question(word, sentence, entity_type,full_text=text))
        
        count += 1

    return questions


In [45]:
text = """
تأسست جامعة الملك سعود في الرياض عام 1957. 
كما تضم مدينة أبوظبي العديد من المعالم السياحية الشهيرة.
شركة أرامكو السعودية هي أكبر شركة نفط في العالم.
"""
# Generate a quiz at medium difficulty
quiz = make_quiz(text, level="medium")
for i, q in enumerate(quiz, 1):
    if q["type"] == "TF":
        print(f"{i}. TF: {q['statement']} => {q['answer']}")
    elif q["type"] == "MCQ":
        print(f"{i}. MCQ: {q['question']}")
        print(f"Options: {q['options']}, Answer: {q['answer']}")
    print()



1. MCQ: أكمل الفراغ: 
تأسست ____ الملك سعود في الرياض عام 1957.
Options: ['جامعة', 'أر', 'جامعة', 'خيار3', 'خيار2'], Answer: جامعة

2. TF: كما تضم مدينة أبوظبي العديد من المعالم السياحية الشهيرة. (صح أم خطأ؟) => True

3. TF: شركة أرXامكو السعودية هي أكبر شركة نفط في العالم. (صح أم خطأ؟) => False




Note:quality of question, quiz  must set question count how will it process long text and randomness of question no all same


In [43]:
from nltk.tokenize import sent_tokenize

def make_flashCards(text, limit=10, window=5):
    """
    Generate flashcards: one flashcard per sentence (or main entity).
    window = words around entity for definition
    """
    flashcards = []
    
    sentences = sent_tokenize(text)  # split text into sentences
    count = 0
    
    for sentence in sentences:
        if count >= limit:
            break
        
        ner_tags = ner(sentence)
        if not ner_tags:
            continue
        
        # Pick main entity (first or highest confidence)
        main_entity = ner_tags[0]
        entity_word = main_entity['word']
        entity_type = main_entity['entity']
        
        # Extract window context
        words = sentence.split()
        try:
            idx = words.index(entity_word)
        except ValueError:
            idx = 0
        start = max(0, idx - window)
        end = min(len(words), idx + window + 1)
        definition = " ".join([w for w in words[start:end] if w != entity_word])
        
        flashcards.append({
            "term": entity_word,
            "definition": definition.strip() or f"نوع: {entity_type}"
        })
        count += 1
        
    return flashcards


In [44]:
text = "إمارة أبوظبي هي إحدى إمارات دولة الإمارات العربية المتحدة السبع"
flashcards = make_flashCards(text, limit=5)

for card in flashcards:
    print(card)


{'term': 'أبوظبي', 'definition': 'إمارة هي إحدى إمارات دولة الإمارات'}
