In [None]:
# Standard library
import csv
import json
import os
import re
import time

# Third-party
import datasets
import faiss
import google.generativeai as genai
import gradio as gr
import numpy as np
import pandas as pd
import tiktoken
import torch
from bert_score import BERTScorer
from datasets import Dataset
from openai import AzureOpenAI
from peft import LoraConfig, get_peft_model, TaskType
from peft import get_peft_model, LoraConfig, TaskType
from rank_bm25 import BM25Okapi
from rouge_score import rouge_scorer
# NOTE: this import rebinds `Dataset` to the torch class, as in the original
# cell order; refer to the Hugging Face class as `datasets.Dataset`.
from torch.utils.data import Dataset
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from transformers import AutoTokenizer, T5ForConditionalGeneration
from transformers import DataCollatorForSeq2Seq
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments
from transformers import AutoModelForSeq2SeqLM

### Using OpenAI API
### RAG with FAISS based L2

In [None]:
# Load articles and embeddings
with open("/content/drive/MyDrive/articles.json", "r", encoding="utf-8") as f:
    articles = json.load(f)

# FAISS works on C-contiguous float32 matrices; cast once here so the index
# never receives float64 data, whatever dtype the .npy file was saved with.
embeddings = np.ascontiguousarray(
    np.load("/content/drive/MyDrive/law_embeddings.npy"), dtype=np.float32
)
dimension = embeddings.shape[1]

# FAISS index setup: exact (brute-force) L2 search over every article vector.
index = faiss.IndexFlatL2(dimension)
index.add(embeddings)

# Azure OpenAI setup
# The key is read from the environment instead of being written into the
# notebook; the empty-string default keeps the previous behaviour when the
# variable is not set.
client = AzureOpenAI(
    azure_endpoint="https://eslsca-openai.openai.azure.com/",
    api_key=os.environ.get("AZURE_OPENAI_API_KEY", ""),
    api_version="2024-05-01-preview",
)

# Tokenizer used only to check the query length before calling the API.
tokenizer = tiktoken.encoding_for_model("text-embedding-3-small")
MAX_TOKENS = 8191

def embed_query(text, model="text-embedding-3-small"):
    """Embed a query string with the Azure OpenAI embeddings endpoint.

    Args:
        text: Query text to embed.
        model: Embedding model/deployment name passed to the API.

    Returns:
        A 1-D float32 ``np.ndarray`` holding the embedding.

    Raises:
        ValueError: If the query encodes to more than ``MAX_TOKENS`` tokens.
    """
    if len(tokenizer.encode(text)) > MAX_TOKENS:
        raise ValueError("Query too long!")
    response = client.embeddings.create(input=[text], model=model)
    # np.array() over Python floats yields float64, but FAISS searches float32
    # vectors, so the dtype is fixed explicitly.
    return np.asarray(response.data[0].embedding, dtype=np.float32)

def search(query, top_k=5):
    """Return the ``Law_Text`` of the articles closest to ``query``.

    Args:
        query: Natural-language question to embed and look up.
        top_k: Maximum number of articles to return.

    Returns:
        A list of at most ``top_k`` article texts, nearest first.
    """
    # FAISS expects a (1, dimension) float32 matrix.
    query_emb = np.ascontiguousarray(embed_query(query), dtype=np.float32).reshape(1, -1)
    _, indices = index.search(query_emb, top_k)
    # FAISS pads the result with -1 when it has fewer than top_k hits; without
    # this filter articles[-1] (the last article) would be returned silently.
    return [articles[i]["Law_Text"] for i in indices[0] if i >= 0]

def call_gpt_azure(SYS_PROMPT, USER_PROMPT):
    """Send one system/user exchange to the chat model and return the reply text.

    Args:
        SYS_PROMPT: Text sent as the system message.
        USER_PROMPT: Text sent as the user message.

    Returns:
        The ``content`` of the first choice in the completion.
    """
    request = dict(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": SYS_PROMPT},
            {"role": "user", "content": USER_PROMPT},
        ],
        temperature=0.0,
        top_p=0.95,
        frequency_penalty=0,
        presence_penalty=0,
        stop=None,
    )
    reply = client.chat.completions.create(**request)
    first_choice = reply.choices[0]
    return first_choice.message.content

# System prompt sent as the "system" message by generate_answer (through
# call_gpt_azure). It fixes the assistant's role, the Arabic reply for
# non-legal input, and the mandatory opening sentence of every legal answer.
SYS_PROMPT = """
You are a highly skilled legal assistant specialized in Egyptian law.

Your tasks:
- If the user says anything that is not a legal question (greeting, small talk, etc.), reply in Arabic:
مرحبًا! كيف أستطيع مساعدتك اليوم في استفسار قانوني؟

- If the user asks a legal question:
    - Automatically determine the correct relevant law name, article number, and law type (criminal, civil, administrative, etc.).
    - Clearly identify the article number, law name, and law type.
    - Provide a detailed and legally sound answer based strictly on the article's content and Egyptian legal principles.
    - Inside the explanation, if it improves clarity, you may use bullet points (•) in Arabic to organize information.
    - Bullet points must only appear inside the detailed answer, never outside or before starting the response.

Format your response in Arabic, starting exactly like this:
طبقًا للمادة [رقم المادة] من قانون [اسم القانون]، [نوع القانون]، [الإجابة التفصيلية].

Rules:
- Begin always with the sentence: طبقًا للمادة [رقم المادة] من قانون [اسم القانون]، [نوع القانون]، then continue.
- Use plain Arabic text only — no English, no asterisks (*), no markdown, no code block formatting.
- Bullet points inside the answer must use (•) and be clear and well-organized.
- The legal explanation must be sufficiently detailed and professional.
- Ensure the law, article, and law type are accurate.
"""



# Gradio Interface
def legal_advice(query):
    """Gradio callback: answer ``query`` by delegating to ``generate_answer``."""
    response = generate_answer(query)
    return response
# Stylesheet passed to gr.Interface(css=...): sets right-to-left direction and
# the Cairo font on the page, the text inputs and the #answer-output element.
css="""
@import url('https://fonts.googleapis.com/css2?family=Cairo&display=swap');

html, body {
  direction: rtl;
  text-align: right;
}

.gradio-container {
  font-family: 'Cairo', sans-serif;
}

.gradio-container textarea,
.gradio-container input[type="text"] {
  direction: rtl;
  text-align: right;
  font-family: 'Cairo', sans-serif;
}

#answer-output {
  direction: rtl;
  text-align: right;
  font-family: 'Cairo', sans-serif;
  padding-right: 20px;
  line-height: 1.8;
}

#answer-output ul {
  list-style-type: none;
  padding-left: 0;
}
#answer-output li {
  margin-bottom: 15px;
}
"""

# Retrieve context, query the chat model and format the answer for Arabic
def generate_answer(query):
    """Answer a legal question with retrieval-augmented generation.

    Args:
        query: The user's question.

    Returns:
        The model's answer with asterisks removed and list/sentence numbers
        written in Arabic-Indic digits, prefixed with a single space.
    """
    # Step 1: Search relevant articles
    contexts = search(query, top_k=3)

    # Step 2: Merge contexts together
    merged_context = "\n\n".join(contexts)

    # Step 3: Construct User Prompt
    USER_PROMPT = f"المحتوى:\n{merged_context}\n\nالسؤال:\n{query}"

    # Step 4: Call GPT to generate the legal answer. The API may return no
    # content (None); treat that as an empty answer instead of crashing below.
    answer = call_gpt_azure(SYS_PROMPT, USER_PROMPT) or ""

    # Step 5: Strip markdown asterisks.
    answer = answer.replace("*", "")

    # Step 6: Convert numbers that are followed by a full stop to Arabic-Indic
    # digits. The whole digit run is converted at once, so "15." becomes "١٥."
    # rather than the mixed "1٥." that per-digit str.replace produced, and
    # decimals such as "2.5" are left alone.
    arabic_indic = str.maketrans({str(d): chr(0x0660 + d) for d in range(10)})
    answer = re.sub(
        r"[0-9]+(?=\.(?![0-9]))",
        lambda match: match.group().translate(arabic_indic),
        answer,
    )

    # Leading space kept from the original output format.
    return f" {answer}"

# Gradio Interface
# One input textbox and one output textbox: the question is passed to
# legal_advice and the returned string is shown in the "answer-output" box,
# which the `css` stylesheet above styles right-to-left.
iface = gr.Interface(
    fn=legal_advice,
    inputs=gr.Textbox(label="أدخل سؤالك القانوني", elem_id="query-input", lines=4, placeholder="اكتب سؤالك هنا..."),
    outputs=gr.Textbox(label="الإجابة القانونية", elem_id="answer-output", lines=6, placeholder="الجواب سيظهر هنا..."),
    title="نظام استشارات قانونية - القانون المصري",
    description="أدخل سؤالك القانوني المتعلق بالقانون المصري وسوف تتلقى إجابة بناءً على المواد القانونية ذات الصلة.",
    css=css
)

# Launch the Gradio interface
iface.launch()

# ما هي أنواع الأسلحة التي تُعتبر غير تقليدية وما تأثيرها المحتمل على الأمن العام؟


It looks like you are running Gradio on a hosted a Jupyter notebook. For the Gradio app to work, sharing must be enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://cb7025e4942d40364b.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




### Using OpenAI API
### RAG with FAISS based IP (Cosine similarity)_KnowledgeBase

In [None]:
# Load the article corpus and its precomputed embedding matrix.
with open("/content/drive/MyDrive/articles.json", "r", encoding="utf-8") as f:
    articles = json.load(f)

# FAISS works on C-contiguous float32 matrices, whatever dtype was saved.
embeddings = np.ascontiguousarray(
    np.load("/content/drive/MyDrive/law_embeddings.npy"), dtype=np.float32
)
dimension = embeddings.shape[1]

# Normalize embeddings for cosine similarity. The norm is clamped to a tiny
# positive value so an all-zero row stays zero instead of becoming NaN.
embeddings = embeddings / np.maximum(
    np.linalg.norm(embeddings, axis=1, keepdims=True), np.finfo(np.float32).tiny
)

# Cosine similarity FAISS index (inner product with normalized vectors)
index = faiss.IndexFlatIP(dimension)
index.add(embeddings)

# Azure OpenAI setup
# The key is read from the environment instead of being written into the
# notebook; the empty-string default keeps the previous behaviour when the
# variable is not set.
client = AzureOpenAI(
    azure_endpoint="https://eslsca-openai.openai.azure.com/",
    api_key=os.environ.get("AZURE_OPENAI_API_KEY", ""),
    api_version="2024-05-01-preview",
)

# Tokenizer for input length check
tokenizer = tiktoken.encoding_for_model("text-embedding-3-small")
MAX_TOKENS = 8191

def embed_query(text, model="text-embedding-3-small"):
    """Embed a query and scale it to unit length for cosine search.

    Args:
        text: Query text to embed.
        model: Embedding model/deployment name passed to the API.

    Returns:
        A 1-D float32 ``np.ndarray``; unit-norm unless the API returned an
        all-zero vector, which is passed through unchanged.

    Raises:
        ValueError: If the query encodes to more than ``MAX_TOKENS`` tokens.
    """
    if len(tokenizer.encode(text)) > MAX_TOKENS:
        raise ValueError("Query too long!")
    response = client.embeddings.create(input=[text], model=model)
    # FAISS searches float32 vectors; np.array() over Python floats is float64.
    emb = np.asarray(response.data[0].embedding, dtype=np.float32)
    norm = np.linalg.norm(emb)
    # Normalize for cosine similarity; skip a zero vector to avoid NaN.
    if norm > 0:
        emb = emb / norm
    return emb

def search(query, top_k=5):
    """Return the ``Law_Text`` of the articles most similar to ``query``.

    Args:
        query: Natural-language question to embed and look up.
        top_k: Maximum number of articles to return.

    Returns:
        A list of at most ``top_k`` article texts, best match first.
    """
    # FAISS expects a (1, dimension) float32 matrix.
    query_emb = np.ascontiguousarray(embed_query(query), dtype=np.float32).reshape(1, -1)
    _, indices = index.search(query_emb, top_k)
    # FAISS pads the result with -1 when it has fewer than top_k hits; without
    # this filter articles[-1] (the last article) would be returned silently.
    return [articles[i]["Law_Text"] for i in indices[0] if i >= 0]

def call_gpt_azure(SYS_PROMPT, USER_PROMPT):
    """Send one system/user exchange to the chat model and return the reply text.

    Args:
        SYS_PROMPT: Text sent as the system message.
        USER_PROMPT: Text sent as the user message.

    Returns:
        The ``content`` of the first choice in the completion.
    """
    request = dict(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": SYS_PROMPT},
            {"role": "user", "content": USER_PROMPT},
        ],
        temperature=0.0,
        top_p=0.95,
        frequency_penalty=0,
        presence_penalty=0,
    )
    reply = client.chat.completions.create(**request)
    first_choice = reply.choices[0]
    return first_choice.message.content

# System prompt sent as the "system" message by generate_answer (through
# call_gpt_azure). It fixes the assistant's role, the Arabic reply for
# non-legal input, and the mandatory opening sentence of every legal answer.
SYS_PROMPT = """
You are a highly skilled legal assistant specialized in Egyptian law.

Your tasks:
- If the user says anything that is not a legal question (greeting, small talk, etc.), reply in Arabic:
مرحبًا! كيف أستطيع مساعدتك اليوم في استفسار قانوني؟

- If the user asks a legal question:
    - Automatically determine the correct relevant law name, article number, and law type (criminal, civil, administrative, etc.).
    - Clearly identify the article number, law name, and law type.
    - Provide a detailed and legally sound answer based strictly on the article's content and Egyptian legal principles.
    - Inside the explanation, if it improves clarity, you may use bullet points (•) in Arabic to organize information.
    - Bullet points must only appear inside the detailed answer, never outside or before starting the response.

Format your response in Arabic, starting exactly like this:
طبقًا للمادة [رقم المادة] من قانون [اسم القانون]، [نوع القانون]، [الإجابة التفصيلية].

Rules:
- Begin always with the sentence: طبقًا للمادة [رقم المادة] من قانون [اسم القانون]، [نوع القانون]، then continue.
- Use plain Arabic text only — no English, no asterisks (*), no markdown, no code block formatting.
- Bullet points inside the answer must use (•) and be clear and well-organized.
- The legal explanation must be sufficiently detailed and professional.
- Ensure the law, article, and law type are accurate.
"""

def generate_answer(query):
    """Answer a legal question with retrieval-augmented generation.

    Args:
        query: The user's question.

    Returns:
        The model's answer with asterisks removed and list/sentence numbers
        written in Arabic-Indic digits, prefixed with a single space.
    """
    # Step 1: Search relevant articles
    contexts = search(query, top_k=3)

    # Step 2: Merge contexts
    merged_context = "\n\n".join(contexts)

    # Step 3: Build prompt
    USER_PROMPT = f"المحتوى:\n{merged_context}\n\nالسؤال:\n{query}"

    # Step 4: Get GPT answer. The API may return no content (None); treat
    # that as an empty answer instead of crashing below.
    answer = call_gpt_azure(SYS_PROMPT, USER_PROMPT) or ""

    # Step 5: Strip markdown asterisks.
    answer = answer.replace("*", "")

    # Step 6: Convert numbers that are followed by a full stop to Arabic-Indic
    # digits. The whole digit run is converted at once, so "15." becomes "١٥."
    # rather than the mixed "1٥." that per-digit str.replace produced, and
    # decimals such as "2.5" are left alone.
    arabic_indic = str.maketrans({str(d): chr(0x0660 + d) for d in range(10)})
    answer = re.sub(
        r"[0-9]+(?=\.(?![0-9]))",
        lambda match: match.group().translate(arabic_indic),
        answer,
    )
    return f" {answer}"

# Gradio Interface
# Stylesheet passed to gr.Interface(css=...): sets right-to-left direction and
# the Cairo font on the page, the text inputs and the #answer-output element.
css = """
@import url('https://fonts.googleapis.com/css2?family=Cairo&display=swap');
html, body { direction: rtl; text-align: right; }
.gradio-container { font-family: 'Cairo', sans-serif; }
.gradio-container textarea, .gradio-container input[type="text"] {
  direction: rtl; text-align: right; font-family: 'Cairo', sans-serif;
}
#answer-output {
  direction: rtl; text-align: right; font-family: 'Cairo', sans-serif;
  padding-right: 20px; line-height: 1.8;
}
#answer-output ul { list-style-type: none; padding-left: 0; }
#answer-output li { margin-bottom: 15px; }
"""

# One input textbox and one output textbox: the question is passed straight to
# generate_answer and the returned string is shown in the "answer-output" box,
# which the `css` stylesheet above styles right-to-left.
iface = gr.Interface(
    fn=generate_answer,
    inputs=gr.Textbox(label="أدخل سؤالك القانوني", elem_id="query-input", lines=4, placeholder="اكتب سؤالك هنا..."),
    outputs=gr.Textbox(label="الإجابة القانونية", elem_id="answer-output", lines=6, placeholder="الجواب سيظهر هنا..."),
    title="نظام استشارات قانونية - القانون المصري",
    description="أدخل سؤالك القانوني المتعلق بالقانون المصري وسوف تتلقى إجابة بناءً على المواد القانونية ذات الصلة.",
    css=css
)

# Start the web app.
iface.launch()


It looks like you are running Gradio on a hosted a Jupyter notebook. For the Gradio app to work, sharing must be enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://cb7025e4942d40364b.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




### Using Gemini API

#### With Gradio

In [None]:
# === Load data and setup FAISS index ===
with open("/content/drive/MyDrive/filtered_articles.json", "r", encoding="utf-8") as f:
    articles = json.load(f)

# FAISS works on C-contiguous float32 matrices, whatever dtype was saved.
embeddings = np.ascontiguousarray(
    np.load("/content/drive/MyDrive/gemini_law_embeddings.npy"), dtype=np.float32
)
dimension = embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(embeddings)

# === Configure Gemini ===
# The API key must never be written into the notebook: it is read from the
# GEMINI_API_KEY environment variable (a missing variable raises KeyError).
# Any key that was previously committed here should be treated as leaked and
# rotated.
genai.configure(api_key=os.environ["GEMINI_API_KEY"])
gemini_model = genai.GenerativeModel("models/gemini-2.5-flash-preview-04-17")
# Embedding model used for queries by embed_query.
MODEL_NAME = "models/text-embedding-004"

# === Prompt Setup ===
# Instruction text that legal_advice passes to call_gemini as the first content
# part. It fixes the assistant's role, the Arabic reply for non-legal input,
# and the mandatory opening sentence of every legal answer.
SYS_PROMPT = """
You are a highly skilled legal assistant specialized in Egyptian law.

Your tasks:
- If the user says anything that is not a legal question (greeting, small talk, etc.), reply in Arabic:
مرحبًا! كيف أستطيع مساعدتك اليوم في استفسار قانوني؟

- If the user asks a legal question:
    - Automatically determine the correct relevant law name, article number, and law type (criminal, civil, administrative, etc.).
    - Clearly identify the article number, law name, and law type.
    - Provide a detailed and legally sound answer based strictly on the article's content and Egyptian legal principles.
    - Inside the explanation, if it improves clarity, you may use bullet points (•) in Arabic to organize information.
    - Bullet points must only appear inside the detailed answer, never outside or before starting the response.

Format your response in Arabic, starting exactly like this:
طبقًا للمادة [رقم المادة] من قانون [اسم القانون]، [نوع القانون]، [الإجابة التفصيلية].

Rules:
- Begin always with the sentence: طبقًا للمادة [رقم المادة] من قانون [اسم القانون]، [نوع القانون]، then continue.
- Use plain Arabic text only — no English, no asterisks (*), no markdown, no code block formatting.
- Bullet points inside the answer must use (•) and be clear and well-organized.
- The legal explanation must be sufficiently detailed and professional.
- Ensure the law, article, and law type are accurate.
"""

# === Embedding and Search ===
def embed_query(text: str) -> np.ndarray:
    """Embed ``text`` as a retrieval query with the Gemini embedding model.

    Returns:
        A float32 vector. If the API call or the conversion fails, the error
        is printed and an all-zero vector of length ``dimension`` is returned.
    """
    try:
        result = genai.embed_content(
            task_type="retrieval_query",
            content=text,
            model=MODEL_NAME,
        )
        vector = np.array(result["embedding"], dtype=np.float32)
    except Exception as e:
        print(f"Embedding error: {e}")
        vector = np.zeros((dimension,), dtype=np.float32)
    return vector

def search(query: str, top_k=5):
    """Return the ``Law_Text`` of the articles closest to ``query``.

    Args:
        query: Natural-language question to embed and look up.
        top_k: Maximum number of articles to return.

    Returns:
        A list of at most ``top_k`` article texts, nearest first. The list is
        empty when the query could not be embedded or the lookup failed.
    """
    try:
        query_emb = embed_query(query).reshape(1, -1)
        # embed_query reports a failed API call by returning an all-zero
        # vector; searching with it would return arbitrary articles, so treat
        # it as "nothing found".
        if not query_emb.any():
            return []
        _, indices = index.search(query_emb, top_k)
        # FAISS pads the result with -1 when it has fewer than top_k hits;
        # without this filter articles[-1] would be returned silently.
        return [articles[i]["Law_Text"] for i in indices[0] if i >= 0]
    except Exception as e:
        print(f"Search error: {e}")
        return []

# === Generation ===
def call_gemini(sys_prompt, user_prompt):
    """Generate text from the two prompts with the configured Gemini model.

    Returns:
        The response text, or an empty string if the call or reading the text
        raised (the error is printed).
    """
    try:
        reply = gemini_model.generate_content([sys_prompt, user_prompt])
        text = reply.text
    except Exception as e:
        print(f"Gemini error: {e}")
        text = ""
    return text

# === Final Legal Advice Function ===
def legal_advice(query):
    """Answer a legal question with retrieval-augmented generation (Gemini).

    Args:
        query: The user's question.

    Returns:
        The generated answer with numbers that precede a full stop written in
        Arabic-Indic digits, or a fixed Arabic message when no article was
        retrieved.
    """
    contexts = search(query, top_k=5)
    if not contexts:
        return "لم يتم العثور على مواد قانونية مرتبطة."

    merged_context = "\n\n".join(contexts)
    USER_PROMPT = f"المحتوى:\n{merged_context}\n\nالسؤال:\n{query}"
    answer = call_gemini(SYS_PROMPT, USER_PROMPT)

    # Format Arabic numerals. The whole digit run is converted at once, so
    # "15." becomes "١٥." (per-digit str.replace produced the mixed "1٥." and
    # never converted 0), and decimals such as "2.5" are left alone.
    arabic_indic = str.maketrans({str(d): chr(0x0660 + d) for d in range(10)})
    answer = re.sub(
        r"[0-9]+(?=\.(?![0-9]))",
        lambda match: match.group().translate(arabic_indic),
        answer,
    )

    return answer

# === RTL UI with Gradio ===
# Stylesheet passed to gr.Interface(css=...): sets right-to-left direction and
# the Cairo font on the page, the text inputs and the #answer-output element.
css = """
@import url('https://fonts.googleapis.com/css2?family=Cairo&display=swap');

html, body {
  direction: rtl;
  text-align: right;
}

.gradio-container {
  font-family: 'Cairo', sans-serif;
}

.gradio-container textarea,
.gradio-container input[type="text"] {
  direction: rtl;
  text-align: right;
  font-family: 'Cairo', sans-serif;
}

#answer-output {
  direction: rtl;
  text-align: right;
  font-family: 'Cairo', sans-serif;
  padding-right: 20px;
  line-height: 1.8;
}

#answer-output ul {
  list-style-type: none;
  padding-left: 0;
}
#answer-output li {
  margin-bottom: 15px;
}
"""

# === Gradio Interface ===
# One input textbox and one output textbox: the question is passed to
# legal_advice and the returned string is shown in the "answer-output" box,
# which the `css` stylesheet above styles right-to-left.
iface = gr.Interface(
    fn=legal_advice,
    inputs=gr.Textbox(label="أدخل سؤالك القانوني", elem_id="query-input", lines=4, placeholder="اكتب سؤالك هنا..."),
    outputs=gr.Textbox(label="الإجابة القانونية", elem_id="answer-output", lines=8, placeholder="الجواب سيظهر هنا..."),
    title="نظام استشارات قانونية - القانون المصري",
    description="أدخل سؤالك القانوني المتعلق بالقانون المصري وسوف تتلقى إجابة بناءً على المواد القانونية ذات الصلة.",
    css=css
)

# Start the web app.
iface.launch()
# print(legal_advice("ما هي المهام الأساسية التي يقوم بها المكتب الفني للمبادئ القانونية في محكمة النقض؟"))

It looks like you are running Gradio on a hosted a Jupyter notebook. For the Gradio app to work, sharing must be enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://1d3aeb57787e614c39.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


طبقًا للمادة ١٥ من قانون السلطة القضائية، قانون مدني، فإن المكتب الفني لمحكمة النقض يتم إلحاقه بالمحكمة ويتكون من عدد كاف من رؤساء المحكمة وقضاتها. وتتحدد مهام هذا المكتب الأساسية في الآتي:
• إعداد التقارير في الطعون المعروضة على المحكمة، ويشمل ذلك فحص الأوراق وتلخيص الوقائع ووجوه الطعن والرد عليها وعرض السوابق القضائية ذات الصلة.
• استخلاص المبادئ القانونية التي تقررها أحكام المحكمة وتبويبها، أي تحليل الأحكام النهائية للمحكمة لاستخلاص القواعد والمبادئ القانونية المستقرة وتصنيفها موضوعياً.
• إعداد هذه المبادئ للنشر في مجموعات الأحكام التي تصدرها المحكمة، لضمان نشر السوابق القضائية وتيسير الرجوع إليها من قبل القضاة والمحامين والباحثين القانونيين.


#### Without Gradio

In [None]:
# === Load data and setup FAISS index ===
with open("/content/drive/MyDrive/filtered_articles.json", "r", encoding="utf-8") as f:
    articles = json.load(f)

# FAISS works on C-contiguous float32 matrices, whatever dtype was saved.
embeddings = np.ascontiguousarray(
    np.load("/content/drive/MyDrive/gemini_law_embeddings.npy"), dtype=np.float32
)
dimension = embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(embeddings)

# === Configure Gemini ===
# The API key must never be written into the notebook: it is read from the
# GEMINI_API_KEY environment variable (a missing variable raises KeyError).
# Any key that was previously committed here should be treated as leaked and
# rotated.
genai.configure(api_key=os.environ["GEMINI_API_KEY"])
gemini_model = genai.GenerativeModel("models/gemini-2.5-flash-preview-04-17")
# Embedding model used for queries by embed_query.
MODEL_NAME = "models/text-embedding-004"

# === Prompt Setup ===
# Instruction text that legal_advice passes to call_gemini as the first content
# part. It fixes the assistant's role, the Arabic reply for non-legal input,
# and the mandatory opening sentence of every legal answer.
SYS_PROMPT = """
You are a highly skilled legal assistant specialized in Egyptian law.

Your tasks:
- If the user says anything that is not a legal question (greeting, small talk, etc.), reply in Arabic:
مرحبًا! كيف أستطيع مساعدتك اليوم في استفسار قانوني؟

- If the user asks a legal question:
    - Automatically determine the correct relevant law name, article number, and law type (criminal, civil, administrative, etc.).
    - Clearly identify the article number, law name, and law type.
    - Provide a detailed and legally sound answer based strictly on the article's content and Egyptian legal principles.
    - Inside the explanation, if it improves clarity, you may use bullet points (•) in Arabic to organize information.
    - Bullet points must only appear inside the detailed answer, never outside or before starting the response.

Format your response in Arabic, starting exactly like this:
طبقًا للمادة [رقم المادة] من قانون [اسم القانون]، [نوع القانون]، [الإجابة التفصيلية].

Rules:
- Begin always with the sentence: طبقًا للمادة [رقم المادة] من قانون [اسم القانون]، [نوع القانون]، then continue.
- Use plain Arabic text only — no English, no asterisks (*), no markdown, no code block formatting.
- Bullet points inside the answer must use (•) and be clear and well-organized.
- The legal explanation must be sufficiently detailed and professional.
- Ensure the law, article, and law type are accurate.
"""

# === Embedding and Search ===
def embed_query(text: str) -> np.ndarray:
    """Embed ``text`` as a retrieval query with the Gemini embedding model.

    Returns:
        A float32 vector. If the API call or the conversion fails, the error
        is printed and an all-zero vector of length ``dimension`` is returned.
    """
    try:
        result = genai.embed_content(
            task_type="retrieval_query",
            content=text,
            model=MODEL_NAME,
        )
        vector = np.array(result["embedding"], dtype=np.float32)
    except Exception as e:
        print(f"Embedding error: {e}")
        vector = np.zeros((dimension,), dtype=np.float32)
    return vector

def search(query: str, top_k=5):
    """Return the ``Law_Text`` of the articles closest to ``query``.

    Args:
        query: Natural-language question to embed and look up.
        top_k: Maximum number of articles to return.

    Returns:
        A list of at most ``top_k`` article texts, nearest first. The list is
        empty when the query could not be embedded or the lookup failed.
    """
    try:
        query_emb = embed_query(query).reshape(1, -1)
        # embed_query reports a failed API call by returning an all-zero
        # vector; searching with it would return arbitrary articles, so treat
        # it as "nothing found".
        if not query_emb.any():
            return []
        _, indices = index.search(query_emb, top_k)
        # FAISS pads the result with -1 when it has fewer than top_k hits;
        # without this filter articles[-1] would be returned silently.
        return [articles[i]["Law_Text"] for i in indices[0] if i >= 0]
    except Exception as e:
        print(f"Search error: {e}")
        return []

# === Generation ===
def call_gemini(sys_prompt, user_prompt):
    """Generate text from the two prompts with the configured Gemini model.

    Returns:
        The response text, or an empty string if the call or reading the text
        raised (the error is printed).
    """
    try:
        reply = gemini_model.generate_content([sys_prompt, user_prompt])
        text = reply.text
    except Exception as e:
        print(f"Gemini error: {e}")
        text = ""
    return text

# === Final Legal Advice Function ===
def legal_advice(query):
    """Answer a legal question with retrieval-augmented generation (Gemini).

    Args:
        query: The user's question.

    Returns:
        The generated answer with numbers that precede a full stop written in
        Arabic-Indic digits, or a fixed Arabic message when no article was
        retrieved.
    """
    contexts = search(query, top_k=5)
    if not contexts:
        return "لم يتم العثور على مواد قانونية مرتبطة."

    merged_context = "\n\n".join(contexts)
    USER_PROMPT = f"المحتوى:\n{merged_context}\n\nالسؤال:\n{query}"
    answer = call_gemini(SYS_PROMPT, USER_PROMPT)

    # Format Arabic numerals. The whole digit run is converted at once, so
    # "15." becomes "١٥." (per-digit str.replace produced the mixed "1٥." and
    # never converted 0), and decimals such as "2.5" are left alone.
    arabic_indic = str.maketrans({str(d): chr(0x0660 + d) for d in range(10)})
    answer = re.sub(
        r"[0-9]+(?=\.(?![0-9]))",
        lambda match: match.group().translate(arabic_indic),
        answer,
    )

    return answer

# Smoke test: run one Arabic legal question through retrieval and generation
# and print the resulting answer.
question = "ما هي المهام الأساسية التي يقوم بها المكتب الفني للمبادئ القانونية في محكمة النقض؟"
print(legal_advice(question))

طبقًا للمادة ٣٠ من قانون نظام السلك الدبلوماسي والقنصلى، قانون إداري، الفقرة المضافة إلى هذه المادة تضع شرطًا إضافيًا للترقية إلى وظيفة مستشار في السلك الدبلوماسي والقنصلي.

وينص التعديل المضاف إلى المادة على أنه:
• في جميع الحالات التي تتم فيها الترقية إلى وظيفة مستشار، سواء كانت هذه الترقية بناءً على قاعدة الأقدمية (أي بسبب المدة الزمنية التي قضاها الموظف في الدرجة السابقة) أو بناءً على قاعدة الاختيار (أي بناءً على تقييم الكفاءة والملاءمة للوظيفة الأعلى)، فإنه لا يجوز إتمام هذه الترقية.
• يشترط لإتمام الترقية اجتياز دورة تدريبية معينة.
• هذه الدورة التدريبية يتم تنظيمها وعقدها خصيصًا لهذا الغرض بواسطة الوزارة المختصة (وهي وزارة الخارجية في هذا السياق).
• أحال القانون في تفاصيل هذه الدورة التدريبية إلى اللائحة التنفيذية للقانون.
• تحدد اللائحة التنفيذية لهذا القانون عدة جوانب تتعلق بالدورة التدريبية، وهي:
    • مدة الدورة الزمنية.
    • شروط محددة وأوضاع معينة يجب الالتزام بها لاجتياز الدورة بنجاح.
    • الآثار والعواقب الأخرى التي قد تترتب على عدم اجتياز هذه الدورة التدريبية بنجاح.



### Model Distillation

### 1. Rationale Generation Teacher LLM OpenAI 4o

In [None]:
def generate_rationale_with_law(question, answer, law_text):
    """Ask the teacher model why ``answer`` is correct under ``law_text``.

    Args:
        question: The question text.
        answer: The reference answer to justify.
        law_text: The law article the answer is based on.

    Returns:
        The stripped rationale text, or an empty string if the request or the
        response handling raised (the error is printed).
    """
    prompt = f"""You are a legal expert. Given the following law article, question, and answer, generate a rationale that explains why this answer is correct according to the law.

    Law:
    {law_text}

    Question:
    {question}

    Answer:
    {answer}

    Rationale:"""

    chat = [
        {"role": "system", "content": "You are a helpful legal assistant."},
        {"role": "user", "content": prompt},
    ]
    try:
        completion = client.chat.completions.create(
            model="gpt-4o", messages=chat, temperature=0.3
        )
        rationale = completion.choices[0].message.content.strip()
    except Exception as e:
        print(f"Error generating rationale: {e}")
        rationale = ""
    return rationale

Generating Rationales: 100%|██████████| 500/500 [1:05:34<00:00,  7.87s/it]


In [None]:
# Load the benchmark QA items (each item holds parallel "Questions"/"Answers" lists).
with open("/content/drive/MyDrive/EG_Legislations_BenchMark_1000.json", "r", encoding="utf-8") as f:
    qa_data = json.load(f)

df = pd.read_excel("/content/drive/MyDrive/BM_Egypt_Law_Samples_500.xlsx")
# A missing cell would turn the whole concatenation into NaN, which then
# reaches the prompt as the literal text "nan"; treat missing cells as empty.
df["combined"] = (
    df["Article Name"].fillna("").astype(str) + " " + df["Law Text"].fillna("").astype(str)
)
law_texts = df["combined"].tolist()

# QA item i is paired with law row i, so the spreadsheet must have at least as
# many rows as there are QA items. Fail before spending API calls otherwise.
if len(qa_data) > len(law_texts):
    raise ValueError(
        f"{len(qa_data)} QA items but only {len(law_texts)} law texts; cannot pair them by position."
    )

# tqdm wraps qa_data so progress is shown (tqdm is imported in the imports cell).
for i, item in enumerate(tqdm(qa_data, desc="Generating Rationales")):
    law_text = law_texts[i]  # Map each QA item to its corresponding law text

    # One rationale per question/answer pair of the item.
    item["Rationales"] = [
        generate_rationale_with_law(q, a, law_text) for q, a in zip(item["Questions"], item["Answers"])
    ]

# Save to file
with open("/content/drive/MyDrive/qa_rationale_data.json", "w", encoding="utf-8") as f:
    json.dump(qa_data, f, ensure_ascii=False, indent=2)

In [None]:
# Reload the saved QA + rationale records and display one entry as a spot check.
with open("/content/drive/MyDrive/qa_rationale_data.json", "r", encoding="utf-8") as f:
    qa_data = json.load(f)
qa_data[45]

{'Questions': ['ما هي الصلاحيات والمهام الرئيسية لمجلس إدارة المنطقة الاستثمارية فيما يتعلق بتنظيم النشاطات الاستثمارية داخل حدود المنطقة؟',
  'كيف يتم ضمان الشفافية والنزاهة في عمل أعضاء مجلس إدارة المنطقة الاستثمارية فيما يخص الإفصاح عن أموالهم ومراجعتها؟'],
 'Answers': ['يختص مجلس إدارة المنطقة بوضع خطة عمل المنطقة والضوابط والمعايير اللازمة لممارسة النشاط، واعتمادها من مجلس إدارة الهيئة. كما يوافق على إقامة المشروعات الاستثمارية داخل حدود المنطقة، ويقدم تقارير ربع سنوية إلى الهيئة، ويرسل محاضر اجتماعاته لاعتمادها من الهيئة. بالإضافة إلى ذلك، يمكن للمجلس أن يرخص لشركات القطاع الخاص بتنمية وإدارة المنطقة أو الترويج للاستثمار بها.',
  'يلتزم أعضاء مجلس الإدارة بالإفصاح عن جميع أموالهم، ويتم تقديم هذا الإفصاح ومراجعته سنوياً من جهة مستقلة للتحقق من عدم وجود مخالفة أو تضارب فعلي أو محتمل للمصالح. بعد ذلك، يُرفع تقرير بذلك إلى المجلس الأعلى عن طريق الوزير المختص، مما يضمن الشفافية والنزاهة في أداء الأعضاء.'],
 'Rationales': ['المادة 29 من القانون توضح بشكل دقيق الصلاحيات والمهام الموكلة 

### 2. Data Preparation

In [None]:
# Load context (legal texts)
# Only the "Law Text" column is used: it is cast to str, stripped of
# surrounding whitespace and stored as a plain list indexed by row position.
context_df = pd.read_excel("/content/drive/My Drive/Colab Notebooks/BM_Egypt_Law_Samples_500.xlsx")
context_df["Law Text"] = context_df["Law Text"].astype(str).str.strip()
context_map = context_df["Law Text"].tolist()

# Load your QA-Rationale data
# NOTE(review): this reads from ".../Colab Notebooks/" while the rationale cell
# writes to "/content/drive/MyDrive/qa_rationale_data.json" — confirm both
# paths refer to the same file.
with open("/content/drive/MyDrive/Colab Notebooks/qa_rationale_data.json", "r", encoding="utf-8") as f:
    qa_data = json.load(f)

# Setup tokenizer
# NOTE(review): this rebinds the global `tokenizer` that the Azure cells set to
# a tiktoken encoding; those cells must be re-run before they are used again.
# NOTE(review): the QA data is Arabic; presumably the mt5-small alternative in
# the trailing comment covers Arabic better than flan-t5 — verify before training.
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-small") #google/mt5-small

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

In [None]:
def build_dataset(qa_data, context_map):
    """Build the tokenized distillation dataset from QA items and law texts.

    Only the first question/answer/rationale of each item is used.

    Args:
        qa_data: List of dicts with parallel "Questions", "Answers" and
            "Rationales" lists.
        context_map: List of law texts; entry ``i`` belongs to ``qa_data[i]``.

    Returns:
        A ``datasets.Dataset`` with the raw strings, the tokenized prompt
        (``input_ids``/``attention_mask``), the tokenized rationale
        (``rationale_ids``/``rationale_mask``), the tokenized
        "rationale + answer" target (``labels``) and its right-shifted copy
        (``decoder_input_ids``).
    """
    examples = []
    for idx, item in enumerate(qa_data):
        q, a, r = item["Questions"][0], item["Answers"][0], item["Rationales"][0]
        # Item idx is paired with law idx, exactly as when the rationales were
        # generated (law_index = i). The former `idx // 2` attached the wrong
        # law text to almost every example.
        context_text = context_map[idx]

        # Prompt to generate rationale
        input_text = f"السؤال: {q}\nالنص القانوني: {context_text}\nالسبب:"

        # Final generation target: first rationale, then answer
        final_output = f"{r}\nالإجابة: {a}"

        # Tokenize each part (fixed-length padding so rows can be stacked).
        input_enc = tokenizer(input_text, padding="max_length", truncation=True, max_length=512)
        rationale_enc = tokenizer(r, padding="max_length", truncation=True, max_length=256)
        full_output_enc = tokenizer(final_output, padding="max_length", truncation=True, max_length=384)

        # Decoder inputs are the target shifted right by one position, with the
        # pad token prepended as the start token.
        decoder_input_ids = [tokenizer.pad_token_id] + full_output_enc["input_ids"][:-1]

        examples.append({
            "idx": idx,
            "question": q,
            "answer": a,
            "context": context_text,
            "rationale": r,
            "input_ids": input_enc["input_ids"],
            "attention_mask": input_enc["attention_mask"],
            "rationale_ids": rationale_enc["input_ids"],
            "rationale_mask": rationale_enc["attention_mask"],
            "labels": full_output_enc["input_ids"],
            "decoder_input_ids": decoder_input_ids
        })

    # `datasets` is the Hugging Face module; the bare name `Dataset` is bound
    # to the torch class by the imports cell.
    return datasets.Dataset.from_list(examples)
# Build the full dataset, then hold out 10% for evaluation with a fixed seed.
full_dataset = build_dataset(qa_data, context_map)
full_dataset

train_test = full_dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = train_test["train"]
eval_dataset = train_test["test"]
# Last expression of the cell: displays the training split summary.
train_dataset

Dataset({
    features: ['idx', 'question', 'answer', 'context', 'rationale', 'input_ids', 'attention_mask', 'rationale_ids', 'rationale_mask', 'labels', 'decoder_input_ids'],
    num_rows: 450
})

### 3. Training

In [None]:
# Load the pretrained seq2seq student model.
base_model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-small")

# LoRA adapter configuration: rank-32 adapters on the modules named "q" and
# "v", with dropout 0.1 and no bias parameters trained.
peft_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,
    r=32,
    lora_alpha=16,
    lora_dropout=0.1,
    target_modules=["q", "v"],
    bias="none"
)

# Wrap the base model with the adapters and report how many parameters train.
model = get_peft_model(base_model, peft_config)
model.print_trainable_parameters()


config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/308M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

trainable params: 1,376,256 || all params: 78,337,408 || trainable%: 1.7568


In [None]:
class MultiTaskTrainer(Seq2SeqTrainer):
    """Seq2SeqTrainer whose loss adds an auxiliary rationale-generation term.

    total_loss = answer_loss + 0.5 * rationale_loss, where both terms are
    token-level cross-entropy that ignores padding positions.
    """

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the combined answer + rationale loss for one batch.

        Args:
            model: The seq2seq model being trained.
            inputs: Batch dict with "input_ids", "attention_mask", "labels",
                "decoder_input_ids", "rationale_ids" and "rationale_mask".
            return_outputs: If True, also return the main-task model outputs.
            num_items_in_batch: Accepted for Trainer compatibility; unused.

        Returns:
            ``total_loss``, or ``(total_loss, outputs)`` if ``return_outputs``.
        """
        # Extract inputs
        labels = inputs.pop("labels")
        rationale_ids = inputs.pop("rationale_ids")
        # Removed from the batch only; padding is excluded from the loss via
        # ignore_index below, so the mask itself is not needed.
        inputs.pop("rationale_mask")
        decoder_input_ids = inputs.pop("decoder_input_ids")  # Pre-shifted decoder inputs

        # Main task: generate "rationale + answer" from the prompt.
        outputs = model(input_ids=inputs["input_ids"],
                        attention_mask=inputs["attention_mask"],
                        decoder_input_ids=decoder_input_ids,
                        labels=labels)

        logits = outputs.logits

        # Token-level cross-entropy that skips padding positions.
        loss_fct = torch.nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)
        loss = loss_fct(logits.view(-1, logits.size(-1)), labels.view(-1))

        # Auxiliary task: generate the rationale alone. Only `labels` is
        # passed, so the model derives right-shifted decoder inputs itself.
        # Feeding rationale_ids as decoder_input_ids (as before) showed the
        # decoder the very token it had to predict at each position, which
        # made this term a trivial copy task.
        rationale_outputs = model(input_ids=inputs["input_ids"],
                                  attention_mask=inputs["attention_mask"],
                                  labels=rationale_ids)

        rationale_logits = rationale_outputs.logits
        rationale_loss = loss_fct(rationale_logits.view(-1, rationale_logits.size(-1)), rationale_ids.view(-1))

        # Total multi-task loss
        total_loss = loss + 0.5 * rationale_loss

        return (total_loss, outputs) if return_outputs else total_loss


In [None]:
class CustomDataCollator(DataCollatorForSeq2Seq):
    """Seq2seq collator that also batches the ``rationale_ids``/``rationale_mask`` fields.

    The base collator handles the remaining keys; the rationale sequences are
    right-padded here to the longest one in the batch and returned as
    ``torch.long`` tensors.
    """

    _RATIONALE_KEYS = ("rationale_ids", "rationale_mask")

    @staticmethod
    def _pad_right(rows, fill):
        """Right-pad every row with ``fill`` up to the longest row's length."""
        width = max((len(row) for row in rows), default=0)
        return [row + [fill] * (width - len(row)) for row in rows]

    def __call__(self, features):
        """Collate ``features`` (a list of dicts) into one batch dict of tensors."""
        rationale_ids = [list(f["rationale_ids"]) for f in features]
        rationale_mask = [list(f["rationale_mask"]) for f in features]

        # Hand the base collator copies without the rationale keys instead of
        # popping from the caller's dicts, so the same features can be collated
        # again without a KeyError.
        base_features = [
            {key: value for key, value in f.items() if key not in self._RATIONALE_KEYS}
            for f in features
        ]
        batch = super().__call__(base_features)

        pad_id = self.tokenizer.pad_token_id
        if pad_id is None:
            pad_id = 0

        # torch.tensor() rejects ragged lists, so pad to a rectangle first.
        # Rows that already share one length pass through unchanged.
        batch["rationale_ids"] = torch.tensor(
            self._pad_right(rationale_ids, pad_id), dtype=torch.long
        )
        batch["rationale_mask"] = torch.tensor(
            self._pad_right(rationale_mask, 0), dtype=torch.long
        )
        return batch


In [None]:
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
# NOTE(review): both scorers are configured for English (stemmer, lang="en",
# bert-base-uncased), while the inference prompts in this notebook are Arabic.
# Confirm these settings suit the evaluation data.
bert_scorer = BERTScorer(lang="en", model_type="bert-base-uncased", rescale_with_baseline=True)

def compute_metrics(eval_preds):
    """Compute ROUGE-1/2/L F-measures and BERTScore P/R/F1, all scaled to 0-100.

    Args:
        eval_preds: ``(predictions, labels)`` arrays of token ids. ``-100``
            entries are treated as padding in both arrays.

    Returns:
        Dict with keys ``rouge1``, ``rouge2``, ``rougeL``,
        ``bertscore_precision``, ``bertscore_recall`` and ``bertscore_f1``.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        # Some model outputs arrive as a tuple; the token ids come first.
        preds = preds[0]

    # -100 is not a valid token id, so replace it before decoding.
    pad_id = tokenizer.pad_token_id
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # ROUGE: score each (reference, prediction) pair once and reuse the result
    # for all three variants. RougeScorer.score takes the target first.
    pair_scores = [
        scorer.score(reference, prediction)
        for prediction, reference in zip(decoded_preds, decoded_labels)
    ]
    rouge_result = {
        name: np.mean([pair[name].fmeasure for pair in pair_scores]) * 100
        for name in ("rouge1", "rouge2", "rougeL")
    }

    # BERTScore
    P, R, F1 = bert_scorer.score(decoded_preds, decoded_labels)
    bert_result = {
        "bertscore_precision": P.mean().item() * 100,
        "bertscore_recall": R.mean().item() * 100,
        "bertscore_f1": F1.mean().item() * 100,
    }

    return {**rouge_result, **bert_result}

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [None]:
# Hyper-parameters and bookkeeping for the multi-task LoRA fine-tuning run.
training_args = Seq2SeqTrainingArguments(
    # Where checkpoints and logs go, and how often they are written.
    output_dir="./lora-flan-t5",
    logging_dir="./logs",
    logging_steps=10,
    save_strategy="epoch",
    # Optimisation schedule.
    num_train_epochs=3,
    per_device_train_batch_size=2,
    learning_rate=5e-4,
    fp16=False,
    # Use generate() to produce predictions.
    predict_with_generate=True,
    # Batch keys the trainer should treat as labels.
    label_names=["labels", "rationale_ids", "rationale_mask"],
)

# NOTE(review): no evaluation strategy is set above, so eval_dataset and
# compute_metrics are presumably only used by an explicit trainer.evaluate().
trainer = MultiTaskTrainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=CustomDataCollator(tokenizer),
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
)


trainer.train()

  trainer = MultiTaskTrainer(


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33msalmamkem[0m ([33msalmamkem-eslsca-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss
10,7.2679
20,6.526
30,5.1939
40,6.1752
50,5.595
60,5.1965
70,4.5939
80,5.2487
90,5.5917
100,5.229


TrainOutput(global_step=675, training_loss=3.924834257055212, metrics={'train_runtime': 211.348, 'train_samples_per_second': 6.388, 'train_steps_per_second': 3.194, 'total_flos': 256659790233600.0, 'train_loss': 3.924834257055212, 'epoch': 3.0})

In [None]:
# Save the trained model under the mounted Drive folder; the BM25 inference cell
# further down reloads it from this same path.
model.save_pretrained("/content/drive/MyDrive/Colab Notebooks/lora-flan-t5")

### 4. Inference

In [None]:
def generate_answer(model, tokenizer, question, context, max_length=256):
    """Generate a rationale-plus-answer string for ``question`` given ``context``.

    NOTE: a later cell in this notebook defines another ``generate_answer``
    with a different signature, which shadows this one once it has run.

    Args:
        model: Model exposing ``device`` and ``generate``.
        tokenizer: Tokenizer used to encode the prompt and decode the output.
        question: Question text inserted into the prompt.
        context: Legal text inserted into the prompt.
        max_length: ``max_length`` passed to ``model.generate``.

    Returns:
        The decoded first generated sequence, with special tokens removed.
    """
    prompt = f"السؤال: {question}\nالنص القانوني: {context}\nالسبب والإجابة:"
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True).to(model.device)
    output_ids = model.generate(**inputs, max_length=max_length)
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)


# The example inputs were left blank (`question =`), which is a syntax error and
# stopped the whole cell from running. Fill these in to try the model.
question = ""  # TODO: question text
context = ""  # TODO: legal text the answer should be based on
if question and context:
    print(generate_answer(model, tokenizer, question, context))
else:
    print("Set `question` and `context` above to run the example.")

In [None]:
def retrieve_context(question, bm25, context_map, tokenized_context):
    """Return the entry of ``context_map`` with the highest BM25 score for ``question``.

    The question is tokenized with the notebook-level ``tokenizer``.
    ``tokenized_context`` is accepted but not used.
    """
    ranking = bm25.get_scores(tokenizer.tokenize(question))
    return context_map[ranking.argmax()]

def generate_answer(question, bm25, model, tokenizer, context_map, tokenized_context,
                    max_context_tokens=300, max_input_tokens=512, max_output_tokens=256):
    """Retrieve the best-matching legal text for ``question`` and generate an answer.

    NOTE: this redefines ``generate_answer`` from the earlier inference cell
    with a different signature. That cell's prompt ends in "السبب والإجابة:"
    while this one ends in "الإجابة:" — confirm which one matches training.

    Args:
        question: Question text.
        bm25: BM25 index, passed on to ``retrieve_context``.
        model: Generation model exposing ``device`` and ``generate``.
        tokenizer: Tokenizer used for truncation, encoding and decoding.
        context_map: Candidate context passages.
        tokenized_context: Passed on to ``retrieve_context``.
        max_context_tokens: Retrieved context is cut to this many tokens.
        max_input_tokens: Truncation limit for the encoded prompt.
        max_output_tokens: ``max_length`` passed to ``model.generate``.

    Returns:
        The decoded first generated sequence, with special tokens removed.
    """
    context_text = retrieve_context(question, bm25, context_map, tokenized_context)

    # Shorten the context first so the question and prompt scaffolding still
    # fit inside the input limit.
    context_tokens = tokenizer.tokenize(context_text)
    if len(context_tokens) > max_context_tokens:
        context_text = tokenizer.convert_tokens_to_string(context_tokens[:max_context_tokens])

    input_text = f"السؤال: {question}\nالنص القانوني: {context_text}\nالإجابة:"

    # A single sequence needs no padding, so the original pad-to-512 is dropped.
    # Inputs are moved to the model's device, as the earlier helper does, so
    # this also works when the model sits on a GPU.
    input_enc = tokenizer(
        input_text,
        return_tensors="pt",
        truncation=True,
        max_length=max_input_tokens,
    ).to(model.device)

    outputs = model.generate(
        input_enc["input_ids"],
        attention_mask=input_enc["attention_mask"],
        max_length=max_output_tokens,
    )

    return tokenizer.decode(outputs[0], skip_special_tokens=True)


# --- Context passages -------------------------------------------------------
# Read the law samples and normalise the "Law Text" column to stripped strings.
context_df = pd.read_excel("/content/drive/MyDrive/Colab Notebooks/BM_Egypt_Law_Samples_500.xlsx")
context_df["Law Text"] = context_df["Law Text"].astype(str).str.strip()
context_map = context_df["Law Text"].tolist()

# --- Tokenizer and model ----------------------------------------------------
# The tokenizer comes from the hub; the model is loaded from the Drive folder
# written by the training section above.
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-small")
model = T5ForConditionalGeneration.from_pretrained("/content/drive/MyDrive/Colab Notebooks/lora-flan-t5")

# --- BM25 index over the tokenized passages ---------------------------------
tokenized_context = list(map(tokenizer.tokenize, context_map))
bm25 = BM25Okapi(tokenized_context)

# --- Example query ----------------------------------------------------------
question = "ما الخطوات التي يتعين على صاحب العمل اتخاذها لإعداد لائحة تنظيم العمل وتحديد الجزاءات التأديبية؟"
generated_answer = generate_answer(
    question, bm25, model, tokenizer, context_map, tokenized_context
)
print(generated_answer)


Token indices sequence length is longer than the specified maximum sequence length for this model (841 > 512). Running this sequence through the model will result in indexing errors


The answer correctly reflects the specific requirements for the law, based on the specific requirements for the purpose of establishing a legal framework for establishing a legal framework for establishing a legal framework for establishing a legal framework for establishing a legal framework for establishing a legal framework for establishing a legal framework for establishing a legal framework for establishing a legal framework for establishing a legal framework for establishing a legal framework for establishing a legal framework for establishing a legal framework for establishing a legal framework for establishing a legal framework for establishing a legal framework for establishing a legal framework for establishing a legal framework for establishing a legal framework for establishing a legal framework for establishing a legal framework for establishing a legal framework for establishing a legal framework for establishing a legal framework for establishing a legal framework for es

### Bloom_Finetuned 

In [None]:
# Input files for the BLOOM fine-tuning section, both under the mounted Drive.
law_data_path = "/content/drive/MyDrive/law_data.json"  # read with pd.read_json in the next cell
embeddings_path = "/content/drive/MyDrive/embeddings.npy"  # read with np.load in the next cell

In [None]:
# Load the law records; the 'Law_Text' column is used to build the dataset below.
law_data = pd.read_json(law_data_path)
# NOTE: this rebinds `embeddings`, which the FAISS section at the top of the
# notebook loaded from law_embeddings.npy. Re-running those earlier cells after
# this one will see a different array unless they are re-executed in order.
embeddings = np.load(embeddings_path)

In [None]:
class LawTextDataset(Dataset):
    """Map-style dataset pairing each law text with its precomputed embedding.

    Each item is a dict with ``input_ids`` and ``attention_mask`` (the text
    tokenized, truncated and padded to ``max_length``) plus ``embedding``
    (row ``idx`` of ``embeddings`` as a float tensor).
    """

    _TEXT_FIELDS = ('input_ids', 'attention_mask')

    def __init__(self, embeddings, law_texts, tokenizer, max_length=512):
        """Store the embeddings, texts, tokenizer and padding length."""
        self.embeddings, self.law_texts = embeddings, law_texts
        self.tokenizer, self.max_length = tokenizer, max_length

    def __len__(self):
        """Number of law texts."""
        return len(self.law_texts)

    def __getitem__(self, idx):
        """Return the tokenized text and embedding for position ``idx``."""
        vector = torch.tensor(self.embeddings[idx], dtype=torch.float)

        tokenized = self.tokenizer(
            self.law_texts[idx],
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors="pt",
        )

        # The tokenizer returns a batch of one; drop that leading dimension.
        sample = {field: tokenized[field].squeeze(0) for field in self._TEXT_FIELDS}
        sample['embedding'] = vector
        return sample


In [None]:
# One checkpoint name for both the tokenizer and the model.
model_name = "bigscience/bloom-560m"

tokenizer = AutoTokenizer.from_pretrained(model_name)
dataset = LawTextDataset(
    embeddings,
    law_data['Law_Text'].tolist(),
    tokenizer,
)

model = AutoModelForCausalLM.from_pretrained(model_name)

In [None]:
# LoRA adapter settings for causal language modelling.
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=8,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
)

# NOTE: this rebinds `model` to its wrapped version, so re-running this cell
# without reloading the base model wraps an already wrapped model.
model = get_peft_model(model, lora_config)

In [None]:
# Training configuration for the BLOOM LoRA run.
training_args = TrainingArguments(
    # Output locations.
    output_dir="/content/bloom_lora_output",
    logging_dir="/content/logs",
    # Evaluate and checkpoint once per epoch.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    # Schedule and batch sizes.
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    # Optimiser settings.
    learning_rate=5e-5,
    weight_decay=0.01,
    # Mixed-precision training.
    fp16=True,
)

In [None]:
# Causal-LM collator (mlm=False), built from the tokenizer defined above.
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

# Hold out a tenth of the examples for evaluation. The original passed the same
# `dataset` as both train and eval set, so eval_loss was measured on data the
# model had just been trained on. A fixed seed keeps the split reproducible.
eval_size = max(1, len(dataset) // 10)
train_subset, eval_subset = torch.utils.data.random_split(
    dataset,
    [len(dataset) - eval_size, eval_size],
    generator=torch.Generator().manual_seed(42),
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_subset,
    eval_dataset=eval_subset,
    data_collator=data_collator,
    tokenizer=tokenizer
)

In [None]:
# Run fine-tuning with the trainer configured in the previous cell.
trainer.train()
# Write the model and the tokenizer to the same folder. This path is under
# /content rather than the Drive folder used for the flan-t5 model earlier.
model.save_pretrained("/content/bloom_lora_finetuned")
tokenizer.save_pretrained("/content/bloom_lora_finetuned")

print("Training complete and model saved!")

***** Running training *****
  Num examples = 900
  Num Epochs = 3
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 675
  Total trainable parameters = 7,840,768

[Epoch 1/3]
  Step 100: train_loss = 1.4821
  Step 200: train_loss = 1.1724
  Step 300: train_loss = 0.8942
  Saving model checkpoint to /content/bloom_lora_finetuned/checkpoint-300

[Evaluation @ Epoch 1]
{'eval_loss': 0.6432, 'eval_runtime': 36.2, 'eval_samples_per_second': 24.9, 'epoch': 1.0}

[Epoch 2/3]
  Step 400: train_loss = 0.7624
  Step 500: train_loss = 0.5942
  Step 600: train_loss = 0.4713
  Saving model checkpoint to /content/bloom_lora_finetuned/checkpoint-600

[Evaluation @ Epoch 2]
{'eval_loss': 0.3981, 'eval_runtime': 35.6, 'eval_samples_per_second': 25.3, 'epoch': 2.0}

[Epoch 3/3]
  Step 675: train_loss = 0.3885

[Evaluation @ Epoch 3]
{'eval_loss': 0.3435, 'eval_runtime': 34.1, 'ev