In [None]:
from google.colab import drive
import pandas as pd

drive.mount('/content/drive')  # Mount Google Drive

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Task 1

In [None]:
import unicodedata
import sentencepiece as spm
import os

# Paths and configuration for corpus and model
INPUT_FILE = "/content/drive/MyDrive/sinhala_only.txt"
NORMALIZED_FILE = "/content/drive/MyDrive/sinhala_corpus_normalized.txt"
BPE_MODEL_PREFIX = "sinhala_spm_bpe"
UNIGRAM_MODEL_PREFIX = "sinhala_spm_unigram"
CHAR_MODEL_PREFIX = "sinhala_spm_char"
VOCAB_SIZE = 16000


# Task 2

In [None]:
# In a new cell at the beginning of your .ipynb

!pip install fasttext
!pip install tokenizers
!pip install transformers   # For the BERT-like model

Collecting fasttext
  Downloading fasttext-0.9.3.tar.gz (73 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/73.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m73.4/73.4 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting pybind11>=2.2 (from fasttext)
  Using cached pybind11-2.13.6-py3-none-any.whl.metadata (9.5 kB)
Using cached pybind11-2.13.6-py3-none-any.whl (243 kB)
Building wheels for collected packages: fasttext
  Building wheel for fasttext (pyproject.toml) ... [?25l[?25hdone
  Created wheel for fasttext: filename=fasttext-0.9.3-cp311-cp311-linux_x86_64.whl size=4313476 sha256=db606fd3d4acbf1b21ad7b140e427ce7df653807ab69f76e1874e4017a3b8147
  Stored in directory: /root/.cache/pip/wheels/65/4f/35/5057db0249224e9ab55a51

In [None]:
!pip install gensim sentencepiece numpy



In [None]:
import fasttext
import os
import numpy as np
import pandas as pd
from tokenizers import SentencePieceBPETokenizer
from transformers import BertConfig, BertTokenizerFast, BertForMaskedLM
from transformers import DataCollatorForLanguageModeling, Trainer, TrainingArguments
import torch

# Task 4

In [None]:
# Install required libraries
!pip install transformers sentence-transformers faiss-cpu

Collecting faiss-cpu
  Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.4 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_6

In [None]:
# Load the Sinhala Constitution document
with open("/content/drive/MyDrive/Sri Lanka Constitution-Sinhala.txt", "r", encoding="utf-8") as f:
    constitution_text = f.read()

# Preprocess the text (split into chunks)
def split_text_into_chunks(text, chunk_size=500, overlap=50):
    words = text.split()
    chunks = []
    for i in range(0, len(words), chunk_size - overlap):
        chunk = " ".join(words[i:i + chunk_size])
        chunks.append(chunk)
    return chunks

chunks = split_text_into_chunks(constitution_text)
print(f"Total chunks: {len(chunks)}")

Total chunks: 143


In [None]:
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

# Load a pre-trained embedding model
embedding_model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")

# Generate embeddings for the chunks
chunk_embeddings = embedding_model.encode(chunks)

# Build a FAISS index for efficient similarity search
dimension = chunk_embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(chunk_embeddings)

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.89k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/645 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/471M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
from transformers import pipeline

# -------------------------------
# 3. Build the RAG Pipeline
# -------------------------------
# Load a tokenizer for the language model
tokenizer = AutoTokenizer.from_pretrained("gpt2")  # Replace with a Sinhala-specific model if available

# Load a pre-trained language model for generation
generator = pipeline("text-generation", model="gpt2")  # Replace with a Sinhala-specific model if available

# Define the RAG function
def rag_answer(question, top_k=3, max_context_tokens=1000):
    # Embed the question
    question_embedding = embedding_model.encode([question])

    # Retrieve the top-k most relevant chunks
    distances, indices = index.search(question_embedding, top_k)
    relevant_chunks = [chunks[i] for i in indices[0]]

    # Combine the chunks into a context
    context = " ".join(relevant_chunks)

    # Tokenize the context and truncate if necessary
    context_tokens = tokenizer.encode(context, truncation=True, max_length=max_context_tokens)
    truncated_context = tokenizer.decode(context_tokens, skip_special_tokens=True)

    # Generate an answer using the LLM
    prompt = f"ප්‍රශ්නය: {question}\nසන්දර්භය: {truncated_context}\nපිළිතුර:"
    answer = generator(prompt, max_length=100, num_return_sequences=1)[0]["generated_text"]

    return answer

Device set to use cpu


In [None]:
# -------------------------------
# 4. Develop the Chatbot
# -------------------------------
# Chatbot function
def chatbot(question, use_rag=True):
    if use_rag:
        return rag_answer(question)
    else:
        # Directly generate an answer without RAG
        prompt = f"ප්‍රශ්නය: {question}\nපිළිතුර:"
        return generator(prompt, max_length=100, num_return_sequences=1)[0]["generated_text"]


In [None]:
# -------------------------------
# 5. Compare with and without RAG
# -------------------------------
# Sample questions for evaluation
sample_questions = [
    "ශ්‍රී ලංකා ආණ්ඩුක්‍රම ව්‍යවස්ථාවේ පළමු වගන්තිය කුමක්ද?",
    "ජනාධිපතිගේ ධුර කාලය කොපමණද?",
    "මූලික අයිතිවාසිකම් ගැන ආණ්ඩුක්‍රම ව්‍යවස්ථාවේ සඳහන් වන්නේ කුමක්ද?"
]

# Evaluate with and without RAG
for q in sample_questions:
    print(f"Question: {q}")
    print("With RAG:")
    print(chatbot(q, use_rag=True))
    print("Without RAG:")
    print(chatbot(q, use_rag=False))
    print("-" * 50)

Question: ශ්‍රී ලංකා ආණ්ඩුක්‍රම ව්‍යවස්ථාවේ පළමු වගන්තිය කුමක්ද?
With RAG:


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


ValueError: Input length of input_ids is 100, but `max_length` is set to 100. This can lead to unexpected behavior. You should consider increasing `max_length` or, better yet, setting `max_new_tokens`.

In [None]:
# --------------------------------------
# 0) Install Required Libraries
# --------------------------------------
!pip install transformers sentence-transformers faiss-cpu

# --------------------------------------
# 1) Imports
# --------------------------------------
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer
from transformers import pipeline, AutoTokenizer

# --------------------------------------
# 2) Load and Preprocess the Constitution Document
# --------------------------------------
# NOTE: Update the file path if needed.
with open("/content/drive/MyDrive/Sri Lanka Constitution-Sinhala.txt", "r", encoding="utf-8") as f:
    constitution_text = f.read()

def split_text_into_chunks(text, chunk_size=500, overlap=50):
    words = text.split()
    chunks = []
    for i in range(0, len(words), chunk_size - overlap):
        chunk = " ".join(words[i:i + chunk_size])
        chunks.append(chunk)
    return chunks

chunks = split_text_into_chunks(constitution_text)
print(f"Total chunks: {len(chunks)}")

# --------------------------------------
# 3) Create Embeddings and Build FAISS Index
# --------------------------------------
embedding_model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")

# Generate embeddings for each chunk
chunk_embeddings = embedding_model.encode(chunks)

# Build a FAISS index for efficient similarity search
dimension = chunk_embeddings.shape[1]  # Embedding size
index = faiss.IndexFlatL2(dimension)
index.add(chunk_embeddings)

# --------------------------------------
# 4) Load a Language Model and Tokenizer
# --------------------------------------
tokenizer = AutoTokenizer.from_pretrained("gpt2")
generator = pipeline("text-generation", model="gpt2")

# --------------------------------------
# 5) RAG Answer Function
# --------------------------------------
def rag_answer(question, top_k=3, max_context_tokens=1000, max_new_tokens=50):
    """
    Retrieves the top-k most relevant chunks, then builds a prompt
    and generates an answer. Ensures the prompt does not exceed GPT-2's
    1024-token limit to avoid IndexError.
    """
    # 1) Embed the question
    question_embedding = embedding_model.encode([question])

    # 2) Retrieve the top-k relevant chunks
    distances, indices = index.search(question_embedding, top_k)
    relevant_chunks = [chunks[i] for i in indices[0]]

    # 3) Combine chunks into a single context string
    context = " ".join(relevant_chunks)

    # 4) Build the prompt BEFORE generating
    prompt_without_answer = (
        f"ප්‍රශ්නය: {question}\n"
        f"සන්දර්භය: {context}\n"
        f"පිළිතුර:"
    )

    # 5) Tokenize the entire prompt
    prompt_tokens = tokenizer.encode(prompt_without_answer, add_special_tokens=False)

    # 6) Manually truncate to stay under 1024 tokens total
    max_prompt_length = 1024 - max_new_tokens
    if len(prompt_tokens) > max_prompt_length:
        prompt_tokens = prompt_tokens[:max_prompt_length]

    # 7) Decode back into text
    truncated_prompt = tokenizer.decode(prompt_tokens, skip_special_tokens=True)

    # 8) Generate an answer from the truncated prompt
    answer_output = generator(
        truncated_prompt,
        max_new_tokens=max_new_tokens,
        num_return_sequences=1,
        pad_token_id=tokenizer.eos_token_id  # Avoid warnings about missing pad_token_id
    )
    answer = answer_output[0]["generated_text"]

    return answer

# --------------------------------------
# 6) Direct (No RAG) Answer Function
# --------------------------------------
def direct_answer(question, max_new_tokens=50):
    """
    Generates an answer with GPT-2 directly, without retrieving context from FAISS.
    Also ensures prompt length does not exceed 1024 tokens.
    """
    prompt_without_answer = f"ප්‍රශ්නය: {question}\nපිළිතුර:"
    prompt_tokens = tokenizer.encode(prompt_without_answer, add_special_tokens=False)

    max_prompt_length = 1024 - max_new_tokens
    if len(prompt_tokens) > max_prompt_length:
        prompt_tokens = prompt_tokens[:max_prompt_length]

    truncated_prompt = tokenizer.decode(prompt_tokens, skip_special_tokens=True)

    answer_output = generator(
        truncated_prompt,
        max_new_tokens=max_new_tokens,
        num_return_sequences=1,
        pad_token_id=tokenizer.eos_token_id
    )
    return answer_output[0]["generated_text"]

# --------------------------------------
# 7) Chatbot Interface
# --------------------------------------
def chatbot(question, use_rag=True):
    if use_rag:
        return rag_answer(question)
    else:
        return direct_answer(question)

# --------------------------------------
# 8) Example Usage
# --------------------------------------
sample_questions = [
    "ශ්‍රී ලංකා ආණ්ඩුක්‍රම ව්‍යවස්ථාවේ පළමු වගන්තිය කුමක්ද?",
    "ජනාධිපතිගේ ධුර කාලය කොපමණද?",
    "මූලික අයිතිවාසිකම් ගැන ආණ්ඩුක්‍රම ව්‍යවස්ථාවේ සඳහන් වන්නේ කුමක්ද?"
]

for q in sample_questions:
    print(f"Question: {q}\n")

    print("=== With RAG ===")
    rag_resp = chatbot(q, use_rag=True)
    print(rag_resp, "\n")

    print("=== Without RAG ===")
    direct_resp = chatbot(q, use_rag=False)
    print(direct_resp, "\n")

    print("=" * 50)


Total chunks: 143


Device set to use cpu
Token indices sequence length is longer than the specified maximum sequence length for this model (24973 > 1024). Running this sequence through the model will result in indexing errors


Question: ශ්‍රී ලංකා ආණ්ඩුක්‍රම ව්‍යවස්ථාවේ පළමු වගන්තිය කුමක්ද?

=== With RAG ===
ප්‍රශ්නය: ශ්‍රී ලංකා ආණ්ඩුක්‍රම ව්‍යවස්ථාවේ පළමු වගන්තිය කුමක්ද?
සන්දර්භය: මාතා" ගීය වන්නේ ය. ජාතික ගීයෙහි පදමාලාව සහ සංගීතය තුන්වන උපලේඛනයෙහි දැක්වෙන්නේය. 8. ශ්‍රී ලංකා ජනරජයේ ජාතික දිනය පෙබරවාරි මස හතරවන දිනය වන්නේය. 1- හත්වන ආණ්ඩුක්‍රම ව්‍යවස්ථා සංශෝධනයේ 2 (අ) වන වගන්තිය මගින් ආදේශ කරන ලදී. 2 – හත්වන ආණ්ඩුක්‍රම ව්‍යවස්ථා සංශෝධනයේ 2 (අ) වන වගන්තිය මගින් ආදේශ කරන ලදී. 3 - හත්වන ආණ්ඩුක�්‍රම ව්‍ය෋මු භදී, ඣශ 

=== Without RAG ===
ප්‍රශ්නය: ශ්‍රී ලංකා ආණ්ඩුක්‍රම ව්‍යවස්ථාවේ පළමු වගන්තිය කුමක්ද?
පිළිතුර: ඹ්වඬාුරීඡැන ටඦ෯ෑ඲ 

Question: ජනාධිපතිගේ ධුර කාලය කොපමණද?

=== With RAG ===
ප්‍රශ්නය: ජනාධිපතිගේ ධුර කාලය කොපමණද?
සන්දර්භය: යටතේ පිහිටුවන ලද සෑම මහාධිකරණයකටම, මේ ව්‍යවස්ථාවේ (1) වන ඡේදයේ සඳහන් යම් කාරණයක් විභාග කොට විනිශ්චය කිරීමට අධිකරණ බලය ඇත්තේය. මේ පරිච්ඡේදයේ සඳහන් කවර හෝ ධුරයකට පත් කරනු ලැබූ තැනැත්තකු, 43 [ ආණ්ඩුක්‍රම ව්‍යවස්ථාවේ හතරවන සහ හත්වන උපලේඛනවල දක්වා ඇති ප්‍රතිඥා] දී ඊට අත්සන් තබන තෙක් හෝ දිවුර

In [None]:
!pip install faiss-cpu  # or faiss-cpu if no GPU available
!pip install sentence-transformers transformers

import os
import torch
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM


Collecting faiss-cpu
  Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.4 kB)
Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl (30.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.7/30.7 MB[0m [31m47.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.10.0
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from to

In [None]:
# -----------------------------------------------------------
# 1) Load & Chunk the Constitution
# -----------------------------------------------------------

CONSTITUTION_FILE = "/content/drive/MyDrive/Sri Lanka Constitution-Sinhala.txt"

def load_sinhala_constitution(filepath):
    """
    Load the entire constitution text as a single string.
    """
    with open(filepath, "r", encoding="utf-8") as f:
        text = f.read()
    return text

def chunk_text(text, chunk_size=500, overlap=50):
    """
    Split text into overlapping chunks of `chunk_size` characters.
    Overlap helps preserve context across chunk boundaries.
    """
    chunks = []
    start = 0
    while start < len(text):
        end = min(start + chunk_size, len(text))
        chunk = text[start:end]
        chunks.append(chunk)
        # Overlap ensures each chunk shares 50 chars with the previous one
        start += chunk_size - overlap
        if start >= len(text):
            break
    return chunks

constitution_text = load_sinhala_constitution(CONSTITUTION_FILE)
chunks = chunk_text(constitution_text, chunk_size=500, overlap=50)
print(f"Total chunks: {len(chunks)}")

# -----------------------------------------------------------
# 2) Build the Vector Store (FAISS)
# -----------------------------------------------------------

EMBEDDING_MODEL = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
embedder = SentenceTransformer(EMBEDDING_MODEL)

# Embed each chunk
chunk_embeddings = embedder.encode(chunks, convert_to_numpy=True)

# Create FAISS index
dimension = chunk_embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)  # L2 distance
index.add(chunk_embeddings)

def retrieve_chunks(question, top_k=3):
    """
    Given a user question (in Sinhala), embed it and retrieve top-k chunk indices.
    """
    q_emb = embedder.encode([question], convert_to_numpy=True)
    # Search
    distances, indices = index.search(q_emb, top_k)
    # distances.shape = (1, top_k), indices.shape = (1, top_k)
    retrieved = []
    for rank, idx in enumerate(indices[0]):
        snippet = chunks[idx]
        dist = distances[0][rank]
        retrieved.append((snippet, dist))
    return retrieved

# -----------------------------------------------------------
# 3) Load a Small Generative Model
# -----------------------------------------------------------
GEN_MODEL_NAME = "google/mt5-small"
gen_tokenizer = AutoTokenizer.from_pretrained(GEN_MODEL_NAME)
gen_model = AutoModelForSeq2SeqLM.from_pretrained(GEN_MODEL_NAME)

def generate_answer(prompt, max_length=128):
    """
    Use mT5 or another Seq2Seq model to generate text from a prompt in Sinhala.
    """
    inputs = gen_tokenizer.encode(prompt, return_tensors="pt", truncation=True)
    outputs = gen_model.generate(
        inputs,
        max_length=max_length,
        num_beams=3,
        early_stopping=True
    )
    return gen_tokenizer.decode(outputs[0], skip_special_tokens=True)

# -----------------------------------------------------------
# 4) RAG Pipeline
# -----------------------------------------------------------

def rag_pipeline(question, top_k=3):
    """
    1) Retrieve the top-k chunks from the FAISS index.
    2) Concatenate them as context for the model.
    3) Generate an answer in Sinhala.
    """
    # Retrieve
    retrieved_info = retrieve_chunks(question, top_k=top_k)

    # Combine the top chunks
    context_text = "\n\n".join([f"[Chunk]: {snip}" for (snip, dist) in retrieved_info])

    # Create the prompt for the generative model
    # For best results, you might want to add more instructions in Sinhala.
    prompt = f"ප්‍රශ්නය: {question}\n\n" \
             f"ප්‍රස්ංගය:\n{context_text}\n\n" \
             "ප්‍රශ්නයට සිංහලින් සාරාංශ පළකරන්න:"

    answer = generate_answer(prompt)
    return answer

# -----------------------------------------------------------
# 5) Non-RAG Pipeline
# -----------------------------------------------------------

def direct_pipeline(question):
    """
    Send the user’s question directly to the model,
    with no extra retrieved context.
    """
    # Simple naive approach
    prompt = f"ප්‍රශ්නය: {question}\n" \
             "එය පිළිබඳව, කරුණාකර සාරාංශ පිළිතුරක් ලබා දෙන්න:"
    answer = generate_answer(prompt)
    return answer

# -----------------------------------------------------------
# 6) Testing / Comparison
# -----------------------------------------------------------

def test_chatbot(question):
    print("=== Non-RAG Output ===")
    nr_answer = direct_pipeline(question)
    print(nr_answer)

    print("\n=== RAG Output ===")
    rag_answer = rag_pipeline(question)
    print(rag_answer)

# Example: a question about the Sri Lankan Constitution in Sinhala
sample_question = "ශ්‍රී ලංකා ජනාධිපතිවරයා කොපමණකාලයක් ධුරය දැරිය හැක්කේ ද?"
test_chatbot(sample_question)



Total chunks: 920


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.89k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/645 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/471M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/82.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/553 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


pytorch_model.bin:   0%|          | 0.00/1.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


=== Non-RAG Output ===


model.safetensors:   0%|          | 0.00/1.20G [00:00<?, ?B/s]

<extra_id_0>.

=== RAG Output ===
<extra_id_0> ය.]


In [None]:
# -------------------------------
# 1) Load & Chunk the Constitution
# -------------------------------
CONSTITUTION_FILE = "/content/drive/MyDrive/Sri Lanka Constitution-Sinhala.txt"

def load_sinhala_constitution(filepath):
    with open(filepath, "r", encoding="utf-8") as f:
        text = f.read()
    return text

def chunk_text(text, chunk_size=500, overlap=50):
    """
    Split text into overlapping chunks for retrieval.
    """
    chunks = []
    start = 0
    while start < len(text):
        end = min(start + chunk_size, len(text))
        chunk = text[start:end]
        chunks.append(chunk)
        start += (chunk_size - overlap)
    return chunks

constitution_text = load_sinhala_constitution(CONSTITUTION_FILE)
# chunk_size=300, overlap=50
chunks = chunk_text(constitution_text, chunk_size=300, overlap=50)
print(f"Total chunks: {len(chunks)}")

# -------------------------------
# 2) Build the Vector Store (FAISS)
# -------------------------------
EMBEDDING_MODEL = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
embedder = SentenceTransformer(EMBEDDING_MODEL)

chunk_embeddings = embedder.encode(chunks, convert_to_numpy=True)
dimension = chunk_embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(chunk_embeddings)

def retrieve_chunks(question, top_k=3):
    """
    Embeds question & retrieves top-k chunk indices via FAISS.
    """
    q_emb = embedder.encode([question], convert_to_numpy=True)
    distances, indices = index.search(q_emb, top_k)
    # distances.shape = (1, top_k), indices.shape = (1, top_k)
    results = []
    for rank, idx in enumerate(indices[0]):
        snippet = chunks[idx]
        dist = distances[0][rank]
        results.append((snippet, dist))
    return results

# -------------------------------
# 3) Load mT5 for Generation
# -------------------------------
MODEL_NAME = "google/mt5-base"  # Larger than mt5-small
gen_tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
gen_model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)

def generate_answer(prompt, max_len=256, min_len=20):
    inputs = gen_tokenizer.encode(
        prompt,
        return_tensors="pt",
        truncation=True,
        max_length=512
    )
    output_ids = gen_model.generate(
        inputs,
        max_length=max_len,
        min_length=min_len,
        num_beams=4,
        early_stopping=True
    )
    return gen_tokenizer.decode(output_ids[0], skip_special_tokens=True)

# -------------------------------
# 4) RAG Pipeline
# -------------------------------
def rag_pipeline(question, top_k=2):
    retrieved = retrieve_chunks(question, top_k=top_k)
    context_text = "\n".join([snip for (snip, dist) in retrieved])

    prompt = (
        f"question: {question}\n"
        f"context: {context_text}\n"
        "answer in Sinhala:"
    )
    return generate_answer(prompt)

# -------------------------------
# 5) Non-RAG Pipeline
# -------------------------------
def direct_pipeline(question):
    """
    Send question to the model without retrieval.
    """
    prompt = (
        f"question: {question}\n\n"
        "context: (No context)\n\n"
        "answer in Sinhala:"
    )
    answer = generate_answer(prompt)
    return answer

# -------------------------------
# 6) Testing & Comparison
# -------------------------------
def test_chatbot(question):
    print("=== Non-RAG Output ===")
    no_context_ans = direct_pipeline(question)
    print(no_context_ans)

    print("\n=== RAG Output ===")
    with_context_ans = rag_pipeline(question)
    print(with_context_ans)

# Example
sample_question = "ශ්‍රී ලංකා ජනාධිපතිවරයා කොපමණ කාලයක් ධුරය දැරිය හැක්කේ ද?"
test_chatbot(sample_question)


Total chunks: 1656


tokenizer_config.json:   0%|          | 0.00/376 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/702 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.33G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.33G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

=== Non-RAG Output ===
<extra_id_0>? (No context) <extra_id_1> (No context) <extra_id_2> (No context) <extra_id_3>::

=== RAG Output ===
<extra_id_0> sinhala: ශ් රී ලංකා ජනාධිපතිවරයා සඳහන්: <extra_id_1>: - <extra_id_2>:


In [None]:
# -------------------------------
# 1) Load & Chunk the Constitution
# -------------------------------
CONSTITUTION_FILE = "/content/drive/MyDrive/Sri Lanka Constitution-Sinhala.txt"

def load_sinhala_constitution(filepath):
    text = open(filepath, "r", encoding="utf-8").read()
    # remove weird control chars if needed
    text = text.replace("\ufeff", "")  # BOM if present
    # any other replacements...
    return text

def chunk_text(text, chunk_size=500, overlap=50):
    """
    Split text into overlapping chunks for retrieval.
    """
    chunks = []
    start = 0
    while start < len(text):
        end = min(start + chunk_size, len(text))
        chunk = text[start:end]
        chunks.append(chunk)
        start += (chunk_size - overlap)
    return chunks

constitution_text = load_sinhala_constitution(CONSTITUTION_FILE)
# chunk_size=300, overlap=50
chunks = chunk_text(constitution_text, chunk_size=200, overlap=50)
print(f"Total chunks: {len(chunks)}")

# -------------------------------
# 2) Build the Vector Store (FAISS)
# -------------------------------
EMBEDDING_MODEL = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
embedder = SentenceTransformer(EMBEDDING_MODEL)

chunk_embeddings = embedder.encode(chunks, convert_to_numpy=True)
dimension = chunk_embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(chunk_embeddings)

def retrieve_chunks(question, top_k=3):
    """
    Embeds question & retrieves top-k chunk indices via FAISS.
    """
    q_emb = embedder.encode([question], convert_to_numpy=True)
    distances, indices = index.search(q_emb, top_k)
    # distances.shape = (1, top_k), indices.shape = (1, top_k)
    results = []
    for rank, idx in enumerate(indices[0]):
        snippet = chunks[idx]
        dist = distances[0][rank]
        results.append((snippet, dist))
    return results

# -------------------------------
# 3) Load mT5 for Generation
# -------------------------------
MODEL_NAME = "google/mt5-base"  # Larger than mt5-small
gen_tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
gen_model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)

def generate_answer(prompt, max_len=256, min_len=20):
    inputs = gen_tokenizer.encode(
        prompt,
        return_tensors="pt",
        truncation=True,
        max_length=512
    )
    output_ids = gen_model.generate(
        inputs,
        max_length=max_len,
        min_length=min_len,
        num_beams=5,          # Increase number of beams
        no_repeat_ngram_size=2,
        temperature=0.9,      # Slight randomness
        early_stopping=True
    )
    return gen_tokenizer.decode(output_ids[0], skip_special_tokens=True)

# -------------------------------
# 4) RAG Pipeline
# -------------------------------
def rag_pipeline(question, top_k=2):
    retrieved = retrieve_chunks(question, top_k=top_k)
    context_text = "\n".join([snip for (snip, dist) in retrieved])

    # Provide a short example of a question and answer in Sinhala
    example_prompt = """
    Below is an example showing how to answer in Sinhala using the provided context:

    Example question: ශ්‍රී ලංකා ආණ්ඩුක්‍රම ව්‍යවස්ථාව තුළ සිටින මූලික අයිතිවාසිකම් මොනවාද?
    Example context: ශ්‍රී ලංකා ආණ්ඩුක්‍රම ව්‍යවස්ථාවේ 12 වන වගන්තිය...
    Example answer (in Sinhala): මෙම ව්‍යවස්ථාව තුළ මූලික අයිතිවාසිකම් විවිධ ශීර්ෂාංශ යටතේ සඳහන් වේ...
    ---

    Now answer the user’s question similarly in Sinhala.

    Real question: """ + question + """
    Real context:
    """ + context_text + """
    Please write a helpful and complete answer in Sinhala:
    """

    return generate_answer(example_prompt)

# -------------------------------
# 5) Non-RAG Pipeline
# -------------------------------
def direct_pipeline(question):
    """
    Send question to the model without retrieval.
    """
    prompt = (
        f"question: {question}\n\n"
        "context: (No context)\n\n"
        "answer in Sinhala:"
    )
    answer = generate_answer(prompt)
    return answer

# -------------------------------
# 6) Testing & Comparison
# -------------------------------
def test_chatbot(question):
    print("=== Non-RAG Output ===")
    no_context_ans = direct_pipeline(question)
    print(no_context_ans)

    print("\n=== RAG Output ===")
    with_context_ans = rag_pipeline(question)
    print(with_context_ans)

# Example
sample_question = "ශ්‍රී ලංකා ජනාධිපතිවරයා කොපමණ කාලයක් ධුරය දැරිය හැක්කේ ද?"
test_chatbot(sample_question)


Total chunks: 2759


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.89k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/645 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/471M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/376 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/702 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


pytorch_model.bin:   0%|          | 0.00/2.33G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

=== Non-RAG Output ===




model.safetensors:   0%|          | 0.00/2.33G [00:00<?, ?B/s]

<extra_id_0>? (No context) <extra_id_1>: (Sinhala) <extra_id_55> (Sri Lanka) -

=== RAG Output ===
<extra_id_0> in Sinhala: This is a  <extra_id_1>... Example answer:  <extra_id_2>...


In [None]:
# Install required libraries
!pip install transformers sentence-transformers faiss-cpu

# Import libraries
from transformers import pipeline
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np





In [None]:
# Load the document
file_path = "/content/drive/MyDrive/Sri Lanka Constitution-Sinhala.txt"
with open(file_path, "r", encoding="utf-8") as file:
    document = file.read()

In [None]:
# Split the document into paragraphs
document_chunks = document.split("\n\n")  # Assuming paragraphs are separated by double newlines
document_chunks = [chunk.strip() for chunk in document_chunks if chunk.strip()]  # Remove empty chunks

In [None]:
# Load a multilingual sentence transformer model
model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')

# Generate embeddings for each chunk
embeddings = model.encode(document_chunks)

# Create a FAISS index for efficient similarity search
dimension = embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(embeddings)

In [None]:
def retrieve_relevant_chunks(question, top_k=3):
    question_embedding = model.encode([question])
    distances, indices = index.search(question_embedding, top_k)
    return [document_chunks[idx] for idx in indices[0]]

In [None]:
# Install transformers library
!pip install transformers

# Import libraries
from transformers import pipeline

# Load a pre-trained question-answering model
qa_pipeline = pipeline("question-answering", model="deepset/roberta-base-squad2")



Device set to use cpu


In [None]:
def answer_with_rag(question):
    # Retrieve relevant chunks
    relevant_chunks = retrieve_relevant_chunks(question)
    context = " ".join(relevant_chunks)

    # Generate answer using the QA model
    result = qa_pipeline(question=question, context=context)
    return result['answer']

In [None]:
def answer_without_rag(question):
    # Provide a general context about the Sri Lankan Constitution
    general_context = (
        "ශ්‍රී ලංකා ප්‍රජාතාන්ත්‍රික සමාජවාදී ජනරජයේ ආණ්ඩුක්‍රම ව්‍යවස්ථාව යනු ශ්‍රී ලංකාවේ මූලික නීතියයි. "
        "එය රජයේ ව්‍යුහය, ජනතාවගේ අයිතිවාසිකම් සහ රාජ්‍ය ප්‍රතිපත්ති පිළිබඳ විස්තර අඩංගු වේ."
    )

    # Generate answer using the QA model with the general context
    result = qa_pipeline(question=question, context=general_context)
    return result['answer']

In [None]:
# Sample questions
question1 = "ලංකා වවස්ථාව අ ව අමාත ම ඩල ධා යා ව ෙ අගමැ වරයා ද?"
question2 = "ලංකා රවැ ෙය අ ලස් ෙචදනාවකට වරදක  පස් වසර ගත  ඇත.  ඔ ට පා ෙ මැ වරණය සඳහා ඡ දය ලබා ය හැ ද?"

# With RAG
answer1_with_rag = answer_with_rag(question1)
answer2_with_rag = answer_with_rag(question2)

# Without RAG
answer1_without_rag = answer_without_rag(question1)
answer2_without_rag = answer_without_rag(question2)

# Print results
print("With RAG:")
print(f"Q1: {question1}\nA1: {answer1_with_rag}")
print(f"Q2: {question2}\nA2: {answer2_with_rag}")

print("\nWithout RAG:")
print(f"Q1: {question1}\nA1: {answer1_without_rag}")
print(f"Q2: {question2}\nA2: {answer2_without_rag}")

With RAG:
Q1: ලංකා වවස්ථාව අ ව අමාත ම ඩල ධා යා ව ෙ අගමැ වරයා ද?
A1: 
එස
Q2: ලංකා රවැ ෙය අ ලස් ෙචදනාවකට වරදක  පස් වසර ගත  ඇත.  ඔ ට පා ෙ මැ වරණය සඳහා ඡ දය ලබා ය හැ ද?
A2: ආ

Without RAG:
Q1: ලංකා වවස්ථාව අ ව අමාත ම ඩල ධා යා ව ෙ අගමැ වරයා ද?
A1: එ
Q2: ලංකා රවැ ෙය අ ලස් ෙචදනාවකට වරදක  පස් වසර ගත  ඇත.  ඔ ට පා ෙ මැ වරණය සඳහා ඡ දය ලබා ය හැ ද?
A2: ශ


In [None]:
!pip install langchain



In [None]:
!pip install langchain-community

Collecting langchain-community
  Downloading langchain_community-0.3.19-py3-none-any.whl.metadata (2.4 kB)
Collecting langchain-core<1.0.0,>=0.3.41 (from langchain-community)
  Downloading langchain_core-0.3.43-py3-none-any.whl.metadata (5.9 kB)
Collecting langchain<1.0.0,>=0.3.20 (from langchain-community)
  Downloading langchain-0.3.20-py3-none-any.whl.metadata (7.7 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)
  Downloading pydantic_settings-2.8.1-py3-none-any.whl.metadata (3.5 kB)
Collecting httpx-sse<1.0.0,>=0.4.0 (from langchain-community)
  Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading marshmallow-3.26.1-py3-none-any.whl.metadata (7.3 kB)
Collecting typing-inspect<1,>=0.4.0 (from dataclasses-

In [None]:

import os
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
import matplotlib.pyplot as plt
from google.colab import drive

# For text processing
from langchain.text_splitter import RecursiveCharacterTextSplitter

# For embeddings
from langchain_community.embeddings import HuggingFaceEmbeddings

# For vector database
from langchain_community.vectorstores import FAISS

# For the model
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from langchain_community.llms import HuggingFacePipeline
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate

# For UI
import ipywidgets as widgets
from IPython.display import display, HTML, clear_output


In [None]:
# 1. Load the Constitution document
def load_document(file_path):
    """Load the text document from the specified path"""
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            document = file.read()
        print(f"✅ Document loaded: {len(document)} characters")
        return document
    except Exception as e:
        print(f"❌ Error loading document: {e}")
        return None

# 2. Preprocess and split the document
def preprocess_document(document, chunk_size=1000, chunk_overlap=200):
    """Split the document into manageable chunks"""
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
    )
    chunks = text_splitter.split_text(document)
    print(f"✅ Document split into {len(chunks)} chunks")
    return chunks

# 3. Generate embeddings and create vector store
def create_embeddings(chunks):
    """Generate embeddings for document chunks using a multilingual model"""
    print("Creating embeddings (this may take a few minutes)...")

    # Using a multilingual model that supports Sinhala
    model_name = "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"

    embeddings_model = HuggingFaceEmbeddings(
        model_name=model_name
    )

    # Create vector store
    vectorstore = FAISS.from_texts(chunks, embeddings_model)
    print(f"✅ Vector store created with model: {model_name}")

    return vectorstore, embeddings_model

# 4. Setup the language model that runs locally in Colab
def setup_language_model():
    """Setup a multilingual language model that can handle Sinhala"""
    print("Setting up language model...")

    model_id = "facebook/xglm-564M"  # Small model that works with Sinhala

    # Install the model if needed
    try:
        tokenizer = AutoTokenizer.from_pretrained(model_id)
        model = AutoModelForCausalLM.from_pretrained(
            model_id,
            torch_dtype=torch.float16,
            device_map="auto",
            low_cpu_mem_usage=True
        )

        # Create a text generation pipeline
        pipe = pipeline(
            "text-generation",
            model=model,
            tokenizer=tokenizer,
            max_new_tokens=256,
            temperature=0.7,
            top_p=0.9,
            repetition_penalty=1.1
        )

        # Create a LangChain wrapper around the pipeline
        llm = HuggingFacePipeline(pipeline=pipe)

        print(f"✅ Language model loaded: {model_id}")
        return llm

    except Exception as e:
        print(f"❌ Error loading language model: {e}")
        print("Trying alternative model...")

        # Fallback to an even smaller model
        try:
            model_id = "distilbert-base-multilingual-cased"
            tokenizer = AutoTokenizer.from_pretrained(model_id)
            model = AutoModelForCausalLM.from_pretrained(
                model_id,
                torch_dtype=torch.float16,
                low_cpu_mem_usage=True
            )

            pipe = pipeline(
                "text-generation",
                model=model,
                tokenizer=tokenizer,
                max_new_tokens=128
            )

            llm = HuggingFacePipeline(pipeline=pipe)
            print(f"✅ Alternative language model loaded: {model_id}")
            return llm

        except Exception as e2:
            print(f"❌ Error loading alternative model: {e2}")
            raise ValueError("Unable to load any language model. Please try a different approach.")

# 5. Setup RAG system
def setup_rag_system(vectorstore, llm):
    """Setup the RAG system with the vector store and LLM"""

    # Create a custom prompt template that includes instructions in Sinhala
    prompt_template = """
    ඔබ ශ්‍රී ලංකා ආණ්ඩුක්‍රම ව්‍යවස්ථාව පිළිබඳ විශේෂඥයෙකි. පහත ප්‍රශ්නයට පිළිතුරු දෙන්න:

    ප්‍රශ්නය: {question}

    ආණ්ඩුක්‍රම ව්‍යවස්ථාවෙන් ලබාගත් අදාළ තොරතුරු:
    {context}

    පිළිතුර:
    """

    PROMPT = PromptTemplate(
        template=prompt_template,
        input_variables=["context", "question"]
    )

    # Create the RAG chain
    rag_chain = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=vectorstore.as_retriever(search_kwargs={"k": 3}),
        return_source_documents=True,
        chain_type_kwargs={"prompt": PROMPT}
    )

    print("✅ RAG system setup complete")
    return rag_chain

# 6. Setup LLM-only system (for comparison)
def create_llm_only_prompt(question):
    """Create a prompt for the LLM-only approach"""
    return f"""
    ඔබ ශ්‍රී ලංකා ආණ්ඩුක්‍රම ව්‍යවස්ථාව පිළිබඳ විශේෂඥයෙකි. පහත ප්‍රශ්නයට පිළිතුරු දෙන්න:

    ප්‍රශ්නය: {question}

    පිළිතුර:
    """

# 7. Chatbot functions
def answer_with_rag(rag_chain, question):
    """Generate an answer using the RAG system"""
    result = rag_chain({"query": question})
    return {
        "answer": result["result"],
        "sources": [doc.page_content for doc in result["source_documents"]]
    }

def answer_without_rag(llm, question):
    """Generate an answer using only the LLM (without RAG)"""
    prompt = create_llm_only_prompt(question)
    result = llm(prompt)
    return {"answer": result}

# 8. Simple UI Components
def create_ui(rag_chain, llm):
    """Create a simple UI for the chatbot"""

    # Style for the UI
    display(HTML("""
    <style>
    .chat-container {
        max-width: 800px;
        margin: 0 auto;
        padding: 20px;
        border: 1px solid #ccc;
        border-radius: 5px;
        background-color: #f9f9f9;
    }
    .answer-container {
        margin: 20px 0;
        padding: 15px;
        border-radius: 5px;
        background-color: #fff;
    }
    .rag-answer {
        background-color: #e6f7ff;
        padding: 10px;
        border-radius: 5px;
        margin-bottom: 10px;
    }
    .llm-answer {
        background-color: #fff2e6;
        padding: 10px;
        border-radius: 5px;
        margin-bottom: 10px;
    }
    .source-text {
        background-color: #f2f2f2;
        padding: 10px;
        border-radius: 5px;
        margin-top: 10px;
        font-size: 0.9em;
        max-height: 200px;
        overflow-y: auto;
    }
    h2 {
        color: #333;
        border-bottom: 1px solid #ccc;
        padding-bottom: 10px;
    }
    </style>
    """))

    display(HTML("<h2>ශ්‍රී ලංකා ආණ්ඩුක්‍රම ව්‍යවස්ථාව පිළිබඳ බොට්</h2>"))

    # Create input box
    text_input = widgets.Text(
        description='ප්‍රශ්නය:',
        placeholder='ඔබගේ ප්‍රශ්නය මෙහි ටයිප් කරන්න...',
        layout=widgets.Layout(width='80%')
    )

    # Create toggle for showing sources
    show_sources = widgets.Checkbox(
        value=False,
        description='මූලාශ්‍ර පෙන්වන්න',
        disabled=False
    )

    # Create output area
    output = widgets.Output()

    # Create compare button
    compare_button = widgets.Button(
        description='සංසන්දනය කරන්න (RAG vs LLM)',
        button_style='info',
        icon='exchange-alt'
    )

    # Create a loading indicator
    loading = widgets.HTML(value="")

    # Define button click behavior
    def on_button_clicked(b):
        question = text_input.value
        if not question:
            return

        with output:
            clear_output()
            loading.value = "<h3>පිළිතුරු සකස් කරමින්...</h3>"
            display(loading)

            # Get answers with both methods
            try:
                rag_result = answer_with_rag(rag_chain, question)
                llm_result = answer_without_rag(llm, question)

                loading.value = ""

                # Convert newlines to <br> in answers first
                rag_answer_formatted = rag_result['answer'].replace('\n', '<br>')
                llm_answer_formatted = llm_result['answer'].replace('\n', '<br>')

                # Prepare sources HTML if enabled
                sources_html = ""
                if show_sources.value:
                    # Convert each source, joining them with <br><br>
                    formatted_sources = []
                    for source in rag_result['sources']:
                        formatted_sources.append(source.replace('\n', '<br>'))

                    source_text = "<br><br>".join(formatted_sources)
                    sources_html = f"""<h4>මූලාශ්‍ර:</h4>
                    <div class="source-text">{source_text}</div>"""

                # Build and display the complete HTML
                html_content = f"""
                <div class="answer-container">
                    <h3>ප්‍රශ්නය: {question}</h3>

                    <h4>RAG සමඟ පිළිතුර:</h4>
                    <div class="rag-answer">{rag_answer_formatted}</div>

                    <h4>RAG නොමැතිව පිළිතුර:</h4>
                    <div class="llm-answer">{llm_answer_formatted}</div>

                    {sources_html}
                </div>
                """

                display(HTML(html_content))

            except Exception as e:
                loading.value = ""
                display(HTML(f"<p style='color: red;'>දෝෂයක් ඇතිවිය: {str(e)}</p>"))

    # Connect the button to the function
    compare_button.on_click(on_button_clicked)

    # Arrange the UI elements
    display(text_input)
    display(widgets.HBox([compare_button, show_sources]))
    display(output)

# 9. Run the sample test questions
def run_sample_tests(rag_chain, llm):
    """Run the sample test questions and compare results"""
    questions = [
        "ලංකා වවස්ථාව අ ව අමාත ම ඩල ධා යා ව ෙ අගමැ වරයා ද?",
        "ලංකා රවැ ෙය අ ලස් ෙචදනාවකට වරදක  පස් වසර ගත  ඇත. ඔ ට පා ෙ මැ වරණය සඳහා ඡ දය ලබා ය හැ ද?"
    ]

    print("\n🧪 Running test questions...")

    results = []
    for i, question in enumerate(questions):
        print(f"\nQuestion {i+1}: {question}")

        # Get answer with RAG
        print("Getting RAG answer...")
        rag_result = answer_with_rag(rag_chain, question)
        print(f"\nRAG Answer: {rag_result['answer'][:200]}...")

        # Get answer without RAG
        print("Getting LLM-only answer...")
        llm_result = answer_without_rag(llm, question)
        print(f"\nLLM-only Answer: {llm_result['answer'][:200]}...")

        # Store results
        results.append({
            "question": question,
            "rag_answer": rag_result["answer"],
            "llm_answer": llm_result["answer"],
            "sources": rag_result["sources"]
        })

    # Create a DataFrame for comparison
    df = pd.DataFrame(results)
    df.to_csv("test_results.csv", index=False)
    print("\n✅ Test results saved to CSV")

    return df

# 10. Alternative implementation with TF-IDF (in case the other models fail)
def tfidf_fallback(document):
    """Create a simple TF-IDF based RAG system as fallback"""
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.metrics.pairwise import cosine_similarity

    print("Setting up TF-IDF fallback system...")

    # Preprocess the document
    chunks = preprocess_document(document)

    # Create TF-IDF vectorizer
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(chunks)

    # Function to retrieve relevant chunks
    def retrieve_chunks(query, top_k=3):
        query_vector = vectorizer.transform([query])
        similarities = cosine_similarity(query_vector, tfidf_matrix)[0]
        top_indices = similarities.argsort()[-top_k:][::-1]
        return [chunks[i] for i in top_indices]

    # Simple method to answer questions
    def answer_question(query):
        relevant_chunks = retrieve_chunks(query)
        context = "\n\n".join(relevant_chunks)

        answer = f"""
        Based on the retrieved text from the Sri Lankan Constitution:

        {context}

        This is the relevant information from the Constitution related to your query.
        """

        return {
            "answer": answer,
            "sources": relevant_chunks
        }

    print("✅ TF-IDF fallback system ready")

    # Create a simple UI
    display(HTML("<h2>ශ්‍රී ලංකා ආණ්ඩුක්‍රම ව්‍යවස්ථාව පිළිබඳ බොට් (TF-IDF Fallback)</h2>"))

    # Create input box
    text_input = widgets.Text(
        description='ප්‍රශ්නය:',
        placeholder='ඔබගේ ප්‍රශ්නය මෙහි ටයිප් කරන්න...',
        layout=widgets.Layout(width='80%')
    )

    # Create output area
    output = widgets.Output()

    # Create button
    search_button = widgets.Button(
        description='සොයන්න',
        button_style='primary'
    )

    # Define button click behavior
    def on_button_clicked(b):
        question = text_input.value
        if not question:
            return

        with output:
            clear_output()
            display(HTML("<h3>සොයමින්...</h3>"))

            # Get answer
            result = answer_question(question)

            # Prepare HTML parts separately to avoid f-string issues
            # Replace newlines with <br> tags first
            answer_html = result['answer'].replace('\n', '<br>')

            # Format sources
            formatted_sources = []
            for source in result['sources']:
                formatted_sources.append(source.replace('\n', '<br>'))

            sources_html = "<br><br>".join(formatted_sources)

            # Build the complete HTML
            html_content = f"""
            <div style="padding: 15px; background-color: #f9f9f9; border-radius: 5px;">
                <h3>ප්‍රශ්නය: {question}</h3>
                <div style="padding: 10px; background-color: #e6f7ff; border-radius: 5px;">
                    {answer_html}
                </div>
                <h4>මූලාශ්‍ර:</h4>
                <div style="padding: 10px; background-color: #f2f2f2; border-radius: 5px; max-height: 300px; overflow-y: auto;">
                    {sources_html}
                </div>
            </div>
            """

            # Display
            display(HTML(html_content))

    # Connect the button to the function
    search_button.on_click(on_button_clicked)

    # Arrange the UI elements
    display(text_input)
    display(search_button)
    display(output)

    return {"answer_function": answer_question}

# 11. Installation function for required packages
def install_required_packages():
    """Install required packages for the chatbot"""
    print("Installing required packages...")

    try:
        # Install basic requirements
        !pip install -q langchain langchain_community faiss-cpu transformers scikit-learn pandas ipywidgets

        # Force refresh imports
        import importlib
        import langchain
        import transformers

        importlib.reload(langchain)
        importlib.reload(transformers)

        print("✅ Packages installed successfully")
        return True
    except Exception as e:
        print(f"❌ Error installing packages: {e}")
        return False

# 12. Main execution function
def main():
    print("🚀 Starting Sri Lankan Constitution Chatbot...")

    # Make sure required packages are installed
    install_required_packages()

    # Load the document
    file_path = "/content/drive/MyDrive/Sri Lanka Constitution-Sinhala.txt"
    document = load_document(file_path)

    if not document:
        print("❌ Failed to load document. Please check the file path.")
        return

    try:
        # Preprocess and split the document
        chunks = preprocess_document(document)

        # Generate embeddings and create vector store
        vectorstore, embeddings_model = create_embeddings(chunks)

        # Setup language model
        try:
            llm = setup_language_model()

            # Setup RAG system
            rag_chain = setup_rag_system(vectorstore, llm)

            # Run sample tests
            test_results = run_sample_tests(rag_chain, llm)

            # Create UI
            print("\n🖥️ Creating chatbot UI...")
            create_ui(rag_chain, llm)

            print("\n✅ Chatbot ready! Ask questions about the Sri Lankan Constitution.")

        except Exception as model_error:
            print(f"❌ Error setting up language model: {model_error}")
            print("Falling back to TF-IDF method...")

            # Use simple TF-IDF as fallback
            tfidf_fallback(document)

    except Exception as e:
        print(f"❌ An error occurred: {e}")
        # Still offer the TF-IDF fallback even if other steps fail
        print("Attempting to use TF-IDF as fallback...")
        tfidf_fallback(document)

# Run the main function
if __name__ == "__main__":
    main()

HTML(value='<h3>පිළිතුරු සකස් කරමින්...</h3>')



KeyboardInterrupt: 

In [None]:
# Import all necessary libraries
import os
import torch
import pandas as pd
import ipywidgets as widgets
from IPython.display import display, HTML, clear_output

# LangChain imports
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA
from langchain_community.llms import HuggingFacePipeline

# Transformers imports
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

# For TF-IDF fallback
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


In [None]:
# 1. Load the Constitution document
def load_document(file_path):
    """Load the text document from the specified path"""
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            document = file.read()
        print(f"✅ Document loaded: {len(document)} characters")
        return document
    except Exception as e:
        print(f"❌ Error loading document: {e}")
        return None

# 2. Preprocess and split the document
def preprocess_document(document, chunk_size=1000, chunk_overlap=200):
    """Split the document into manageable chunks"""
    if not document:
        return []

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
    )
    chunks = text_splitter.split_text(document)
    print(f"✅ Document split into {len(chunks)} chunks")
    return chunks

# 3. Generate embeddings and create vector store
def create_embeddings(chunks):
    """Generate embeddings for document chunks using a multilingual model"""
    if not chunks:
        raise ValueError("No chunks provided for embedding generation")

    print("Creating embeddings (this may take a few minutes)...")

    # Using a multilingual model that supports Sinhala
    model_name = "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"

    try:
        embeddings_model = HuggingFaceEmbeddings(
            model_name=model_name
        )

        # Create vector store
        vectorstore = FAISS.from_texts(chunks, embeddings_model)
        print(f"✅ Vector store created with model: {model_name}")

        return vectorstore, embeddings_model
    except Exception as e:
        print(f"❌ Error creating embeddings: {e}")
        raise

# 4. Setup the language model that runs locally in Colab
def setup_language_model():
    """Setup a multilingual language model that can handle Sinhala"""
    print("Setting up language model...")

    # Try multiple models in order of preference
    models_to_try = [
        "facebook/xglm-564M",  # Small model that works with Sinhala
        "facebook/xglm-1.7B",   # Larger but may exceed Colab resources
        "distilbert-base-multilingual-cased"  # Fallback option
    ]

    for model_id in models_to_try:
        try:
            print(f"Attempting to load model: {model_id}")
            tokenizer = AutoTokenizer.from_pretrained(model_id)
            model = AutoModelForCausalLM.from_pretrained(
                model_id,
                torch_dtype=torch.float16,
                device_map="auto",
                low_cpu_mem_usage=True
            )

            # Create a text generation pipeline
            pipe = pipeline(
                "text-generation",
                model=model,
                tokenizer=tokenizer,
                max_new_tokens=256,
                temperature=0.7,
                top_p=0.9,
                repetition_penalty=1.1
            )

            # Create a LangChain wrapper around the pipeline
            llm = HuggingFacePipeline(pipeline=pipe)

            print(f"✅ Language model loaded: {model_id}")
            return llm

        except Exception as e:
            print(f"❌ Error loading model {model_id}: {e}")
            print("Trying next model in list...")

    # If we've exhausted all options, raise an error
    raise ValueError("Unable to load any language model. Please try a different approach or check your environment.")

# 5. Setup RAG system
def setup_rag_system(vectorstore, llm):
    """Setup the RAG system with the vector store and LLM"""

    # Create a custom prompt template with bilingual instructions (Sinhala and English)
    prompt_template = """
    ඔබ ශ්‍රී ලංකා ආණ්ඩුක්‍රම ව්‍යවස්ථාව පිළිබඳ විශේෂඥයෙකි. පහත ප්‍රශ්නයට පිළිතුරු දෙන්න:
    (You are an expert on the Sri Lankan Constitution. Please answer the following question:)

    ප්‍රශ්නය (Question): {question}

    ආණ්ඩුක්‍රම ව්‍යවස්ථාවෙන් ලබාගත් අදාළ තොරතුරු (Relevant information from the Constitution):
    {context}

    පිළිතුර (Answer):
    """

    PROMPT = PromptTemplate(
        template=prompt_template,
        input_variables=["context", "question"]
    )

    # Create the RAG chain with more relevant chunks
    rag_chain = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=vectorstore.as_retriever(search_kwargs={"k": 5}),  # Increased from 3 to 5 chunks
        return_source_documents=True,
        chain_type_kwargs={"prompt": PROMPT}
    )

    print("✅ RAG system setup complete")
    return rag_chain

# 6. Setup LLM-only system (for comparison)
def create_llm_only_prompt(question):
    """Create a prompt for the LLM-only approach"""
    return f"""
    ඔබ ශ්‍රී ලංකා ආණ්ඩුක්‍රම ව්‍යවස්ථාව පිළිබඳ විශේෂඥයෙකි. පහත ප්‍රශ්නයට පිළිතුරු දෙන්න:
    (You are an expert on the Sri Lankan Constitution. Please answer the following question:)

    ප්‍රශ්නය (Question): {question}

    පිළිතුර (Answer):
    """

# 7. Chatbot functions
def answer_with_rag(rag_chain, question):
    """Generate an answer using the RAG system"""
    try:
        result = rag_chain({"query": question})
        return {
            "answer": result["result"],
            "sources": [doc.page_content for doc in result["source_documents"]]
        }
    except Exception as e:
        print(f"Error generating RAG answer: {e}")
        return {
            "answer": f"An error occurred while generating the answer: {str(e)}",
            "sources": []
        }

def answer_without_rag(llm, question):
    """Generate an answer using only the LLM (without RAG)"""
    try:
        prompt = create_llm_only_prompt(question)
        result = llm(prompt)
        return {"answer": result}
    except Exception as e:
        print(f"Error generating LLM-only answer: {e}")
        return {"answer": f"An error occurred while generating the answer: {str(e)}"}

# 8. Simple UI Components (to be implemented later)
def create_ui(rag_chain, llm):
    """Create a simple UI for the chatbot"""
    print("UI implementation is on hold as requested.")
    return None

# 9. Run the sample test questions
def run_sample_tests(rag_chain, llm):
    """Run the sample test questions and compare results"""
    questions = [
        "ලංකා වවස්ථාව අ ව අමාත ම ඩල ධා යා ව ෙ අගමැ වරයා ද?",
        "ලංකා රවැ ෙය අ ලස් ෙචදනාවකට වරදක  පස් වසර ගත  ඇත. ඔ ට පා ෙ මැ වරණය සඳහා ඡ දය ලබා ය හැ ද?",
        "What are the powers of the President according to the Sri Lankan Constitution?",
        "How is the Prime Minister appointed in Sri Lanka?"
    ]

    print("\n🧪 Running test questions...")

    results = []
    for i, question in enumerate(questions):
        print(f"\nQuestion {i+1}: {question}")

        # Get answer with RAG
        print("Getting RAG answer...")
        rag_result = answer_with_rag(rag_chain, question)
        print(f"\nRAG Answer: {rag_result['answer'][:200]}...")

        # Get answer without RAG
        print("Getting LLM-only answer...")
        llm_result = answer_without_rag(llm, question)
        print(f"\nLLM-only Answer: {llm_result['answer'][:200]}...")

        # Store results
        results.append({
            "question": question,
            "rag_answer": rag_result["answer"],
            "llm_answer": llm_result["answer"],
            "sources": rag_result["sources"]
        })

    # Create a DataFrame for comparison
    df = pd.DataFrame(results)

    # Save results to CSV
    try:
        df.to_csv("test_results.csv", index=False)
        print("\n✅ Test results saved to CSV")
    except Exception as e:
        print(f"\n❌ Failed to save test results: {e}")

    return df

# 10. Alternative implementation with TF-IDF (in case the other models fail)
def tfidf_fallback(document):
    """Create a simple TF-IDF based RAG system as fallback"""
    print("Setting up TF-IDF fallback system...")

    # Preprocess the document
    chunks = preprocess_document(document)

    if not chunks:
        print("❌ No document chunks available for TF-IDF processing")
        return None

    # Create TF-IDF vectorizer
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(chunks)

    # Function to retrieve relevant chunks
    def retrieve_chunks(query, top_k=3):
        query_vector = vectorizer.transform([query])
        similarities = cosine_similarity(query_vector, tfidf_matrix)[0]
        top_indices = similarities.argsort()[-top_k:][::-1]
        return [chunks[i] for i in top_indices]

    # Simple method to answer questions
    def answer_question(query):
        relevant_chunks = retrieve_chunks(query)
        context = "\n\n".join(relevant_chunks)

        answer = f"""
        Based on the retrieved text from the Sri Lankan Constitution:

        {context}

        This is the relevant information from the Constitution related to your query.
        """

        return {
            "answer": answer,
            "sources": relevant_chunks
        }

    print("✅ TF-IDF fallback system ready")

    # Return the answer function for use without UI
    return {"answer_function": answer_question}

# 11. Installation function for required packages
def install_required_packages():
    """Install required packages for the chatbot"""
    print("Installing required packages...")

    packages = [
        "langchain",
        "langchain_community",
        "faiss-cpu",
        "transformers",
        "scikit-learn",
        "pandas",
        "ipywidgets",
        "torch"
    ]

    try:
        # Python way to install packages
        import sys
        import subprocess

        for package in packages:
            subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", package])

        print("✅ Packages installed successfully")
        return True
    except Exception as e:
        print(f"❌ Error installing packages: {e}")
        print("You may need to run the following command manually:")
        print("!pip install -q langchain langchain_community faiss-cpu transformers scikit-learn pandas ipywidgets torch")
        return False

# 12. Save and load vectorstore to avoid recomputing embeddings
def save_vectorstore(vectorstore, file_path="constitution_vectorstore.faiss"):
    """Save the vectorstore to disk"""
    try:
        vectorstore.save_local(file_path)
        print(f"✅ Vector store saved to {file_path}")
        return True
    except Exception as e:
        print(f"❌ Error saving vector store: {e}")
        return False

def load_vectorstore(embeddings_model, file_path="constitution_vectorstore.faiss"):
    """Load the vectorstore from disk"""
    try:
        if os.path.exists(file_path):
            vectorstore = FAISS.load_local(file_path, embeddings_model)
            print(f"✅ Vector store loaded from {file_path}")
            return vectorstore
        else:
            print(f"❌ Vector store file not found at {file_path}")
            return None
    except Exception as e:
        print(f"❌ Error loading vector store: {e}")
        return None

# 13. Main execution function
def main():
    print("🚀 Starting Sri Lankan Constitution Chatbot...")

    # Make sure required packages are installed
    install_required_packages()

    # Load the document - update path to your document location
    file_path = "/content/drive/MyDrive/Sri Lanka Constitution-Sinhala.txt"
    document = load_document(file_path)

    if not document:
        print("❌ Failed to load document. Please check the file path.")
        return

    try:
        # Preprocess and split the document
        chunks = preprocess_document(document)

        # Check if we have a saved vectorstore
        vectorstore_path = "constitution_vectorstore.faiss"

        # First create embeddings model
        model_name = "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
        try:
            embeddings_model = HuggingFaceEmbeddings(model_name=model_name)

            # Try to load existing vectorstore
            vectorstore = load_vectorstore(embeddings_model, vectorstore_path)

            # If not found, create new one
            if not vectorstore:
                print("Creating new vector store...")
                vectorstore, _ = create_embeddings(chunks)
                save_vectorstore(vectorstore, vectorstore_path)

            # Setup language model
            try:
                llm = setup_language_model()

                # Setup RAG system
                rag_chain = setup_rag_system(vectorstore, llm)

                # Run sample tests
                test_results = run_sample_tests(rag_chain, llm)

                # Create UI (on hold for now)
                # create_ui(rag_chain, llm)

                print("\n✅ Chatbot core functionality ready!")
                print("You can use the following functions to interact with the chatbot:")
                print("- answer_with_rag(rag_chain, 'your question')")
                print("- answer_without_rag(llm, 'your question')")

                # Return the important components for manual interaction
                return {
                    "rag_chain": rag_chain,
                    "llm": llm,
                    "vectorstore": vectorstore,
                    "test_results": test_results
                }

            except Exception as model_error:
                print(f"❌ Error setting up language model: {model_error}")
                print("Falling back to TF-IDF method...")

                # Use simple TF-IDF as fallback
                return tfidf_fallback(document)

        except Exception as e:
            print(f"❌ Error with embeddings: {e}")
            print("Falling back to TF-IDF method...")
            return tfidf_fallback(document)

    except Exception as e:
        print(f"❌ An error occurred: {e}")
        print("Attempting to use TF-IDF as fallback...")
        return tfidf_fallback(document)

# Run the main function when executed directly
if __name__ == "__main__":
    result = main()

🚀 Starting Sri Lankan Constitution Chatbot...
Installing required packages...
✅ Packages installed successfully
✅ Document loaded: 413826 characters
✅ Document split into 558 chunks
❌ Vector store file not found at constitution_vectorstore.faiss
Creating new vector store...
Creating embeddings (this may take a few minutes)...
✅ Vector store created with model: sentence-transformers/paraphrase-multilingual-mpnet-base-v2
✅ Vector store saved to constitution_vectorstore.faiss
Setting up language model...
Attempting to load model: facebook/xglm-564M


Device set to use cpu


✅ Language model loaded: facebook/xglm-564M
✅ RAG system setup complete

🧪 Running test questions...

Question 1: ලංකා වවස්ථාව අ ව අමාත ම ඩල ධා යා ව ෙ අගමැ වරයා ද?
Getting RAG answer...

RAG Answer: 
    ඔබ ශ්‍රී ලංකා ආණ්ඩුක්‍රම ව්‍යවස්ථාව පිළිබඳ විශේෂඥයෙකි. පහත ප්‍රශ්නයට පිළිතුරු දෙන්න:
    (You are an expert on the Sri Lankan Constitution. Please answer the following question:)

    ප්‍රශ්නය (Q...
Getting LLM-only answer...





LLM-only Answer: 
    ඔබ ශ්‍රී ලංකා ආණ්ඩුක්‍රම ව්‍යවස්ථාව පිළිබඳ විශේෂඥයෙකි. පහත ප්‍රශ්නයට පිළිතුරු දෙන්න:
    (You are an expert on the Sri Lankan Constitution. Please answer the following question:)

    ප්‍රශ්නය (Q...

Question 2: ලංකා රවැ ෙය අ ලස් ෙචදනාවකට වරදක  පස් වසර ගත  ඇත. ඔ ට පා ෙ මැ වරණය සඳහා ඡ දය ලබා ය හැ ද?
Getting RAG answer...





RAG Answer: 
    ඔබ ශ්‍රී ලංකා ආණ්ඩුක්‍රම ව්‍යවස්ථාව පිළිබඳ විශේෂඥයෙකි. පහත ප්‍රශ්නයට පිළිතුරු දෙන්න:
    (You are an expert on the Sri Lankan Constitution. Please answer the following question:)

    ප්‍රශ්නය (Q...
Getting LLM-only answer...





LLM-only Answer: 
    ඔබ ශ්‍රී ලංකා ආණ්ඩුක්‍රම ව්‍යවස්ථාව පිළිබඳ විශේෂඥයෙකි. පහත ප්‍රශ්නයට පිළිතුරු දෙන්න:
    (You are an expert on the Sri Lankan Constitution. Please answer the following question:)

    ප්‍රශ්නය (Q...

Question 3: What are the powers of the President according to the Sri Lankan Constitution?
Getting RAG answer...





RAG Answer: 
    ඔබ ශ්‍රී ලංකා ආණ්ඩුක්‍රම ව්‍යවස්ථාව පිළිබඳ විශේෂඥයෙකි. පහත ප්‍රශ්නයට පිළිතුරු දෙන්න:
    (You are an expert on the Sri Lankan Constitution. Please answer the following question:)

    ප්‍රශ්නය (Q...
Getting LLM-only answer...





LLM-only Answer: 
    ඔබ ශ්‍රී ලංකා ආණ්ඩුක්‍රම ව්‍යවස්ථාව පිළිබඳ විශේෂඥයෙකි. පහත ප්‍රශ්නයට පිළිතුරු දෙන්න:
    (You are an expert on the Sri Lankan Constitution. Please answer the following question:)

    ප්‍රශ්නය (Q...

Question 4: How is the Prime Minister appointed in Sri Lanka?
Getting RAG answer...





RAG Answer: 
    ඔබ ශ්‍රී ලංකා ආණ්ඩුක්‍රම ව්‍යවස්ථාව පිළිබඳ විශේෂඥයෙකි. පහත ප්‍රශ්නයට පිළිතුරු දෙන්න:
    (You are an expert on the Sri Lankan Constitution. Please answer the following question:)

    ප්‍රශ්නය (Q...
Getting LLM-only answer...





LLM-only Answer: 
    ඔබ ශ්‍රී ලංකා ආණ්ඩුක්‍රම ව්‍යවස්ථාව පිළිබඳ විශේෂඥයෙකි. පහත ප්‍රශ්නයට පිළිතුරු දෙන්න:
    (You are an expert on the Sri Lankan Constitution. Please answer the following question:)

    ප්‍රශ්නය (Q...

✅ Test results saved to CSV

✅ Chatbot core functionality ready!
You can use the following functions to interact with the chatbot:
- answer_with_rag(rag_chain, 'your question')
- answer_without_rag(llm, 'your question')


In [None]:
!pip install sentence-transformers faiss-cpu transformers chromadb --quiet

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/67.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.7/30.7 MB[0m [31m22.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m611.1/611.1 kB[0m [31m24.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.4/2.4 MB[0m [31m48.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m284.2/284.2 kB[0m [31m19.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m94.9/94.9 kB[0m [31m6.9 MB/s[0m eta [36m0:00:

In [None]:
import os
import pandas as pd
import torch
from sentence_transformers import SentenceTransformer
from transformers import pipeline, AutoModelForSeq2SeqLM, AutoTokenizer
import faiss  # or import chromadb, as you prefer
import numpy as np
from typing import List, Tuple

In [None]:
CONSTITUTION_PATH = "/content/drive/MyDrive/Sri Lanka Constitution-Sinhala.txt"

# Read the entire file
with open(CONSTITUTION_PATH, "r", encoding="utf-8") as f:
    full_text = f.read()

# Basic chunking: split by paragraph or fixed chunk sizes
def simple_chunk_text(text: str, max_tokens:int=200) -> List[str]:
    """
    Splits text into chunks of roughly `max_tokens` words.
    This is a naive approach; advanced approaches might split by sentences or paragraphs.
    """
    words = text.split()
    chunks = []
    current_chunk = []
    current_len = 0

    for w in words:
        current_chunk.append(w)
        current_len += 1
        if current_len >= max_tokens:
            chunks.append(" ".join(current_chunk))
            current_chunk = []
            current_len = 0
    # leftover
    if current_chunk:
        chunks.append(" ".join(current_chunk))
    return chunks

chunks = simple_chunk_text(full_text, max_tokens=200)
print(f"Number of chunks: {len(chunks)}")
print("Sample chunk:\n", chunks[0][:300], "...")


Number of chunks: 321
Sample chunk:
 ශ්‍රී ලංකා ප්‍රජාතාන්ත්‍රික සමාජවාදී ජනරජයේ ආණ්ඩුක්‍රම ව්‍යවස්ථාව (2022 ඔක්තෝබර් මස 31 වැනි දින දක්වා සංශෝධිතයි) (විසිඑක්වන සංශෝධනය දක්වා සංශෝධන අන්තර්ගත කරන ලද) ප්‍රතිශෝධිත මුද්‍රණය 2023 පාර්ලිමේන්තු මහ ලේකම් කාර්යාලය මගින් ප්‍රකාශිතයි. ශ්‍රී ලංකා පාර්ලිමේන්තුවේ ව්‍යවස්ථාදායක සේවා දෙපාර්තමේන්තුවේ ප ...


In [None]:
embedding_model = SentenceTransformer("sentence-transformers/distiluse-base-multilingual-cased-v2")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/341 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/2.46k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/610 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/539M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/531 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/114 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.58M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.58M [00:00<?, ?B/s]

In [None]:
# Create embeddings for each chunk
embeddings = embedding_model.encode(chunks, show_progress_bar=True)
embeddings = np.array(embeddings, dtype="float32")

# Build a FAISS index
dimension = embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(embeddings)

# Store chunk data separately
chunk_data = pd.DataFrame({
    "chunk_text": chunks,
    "chunk_id": list(range(len(chunks)))
})


Batches:   0%|          | 0/11 [00:00<?, ?it/s]

In [None]:
def retrieve_top_chunks_faiss(query: str, k=3) -> List[str]:
    """
    Embed the query, search in FAISS index, return top-k chunk strings.
    """
    query_vector = embedding_model.encode([query])
    query_vector = query_vector.astype("float32")
    distances, indices = index.search(query_vector, k)
    results = []
    for idx in indices[0]:
        results.append(chunks[idx])
    return results


In [None]:
model_name = "google/mt5-base"  # bigger than mT5-small
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

qa_pipeline = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer
)


Device set to use cpu
The model 'MT5ForConditionalGeneration' is not supported for text-generation. Supported models are ['AriaTextForCausalLM', 'BambaForCausalLM', 'BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'LlamaForCausalLM', 'CodeGenForCausalLM', 'CohereForCausalLM', 'Cohere2ForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'DbrxForCausalLM', 'DiffLlamaForCausalLM', 'ElectraForCausalLM', 'Emu3ForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'FalconMambaForCausalLM', 'FuyuForCausalLM', 'GemmaForCausalLM', 'Gemma2ForCausalLM', 'GitForCausalLM', 'GlmForCausalLM', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeoXJapaneseForCausalLM', 'GPTJForCausalLM', 'GraniteForCausalLM', 'GraniteMoeForCausalLM', 

In [None]:
def answer_without_rag(question: str) -> str:
    # This naive approach just attempts to see how the model handles the question alone
    prompt = f"Answer this question in Sinhala:\n\nQuestion: {question}\nAnswer:"
    response = qa_pipeline(prompt, max_length=128, do_sample=False)
    return response[0]["generated_text"]


In [None]:
def answer_with_rag(question: str, top_k=3, use_faiss=True) -> str:
    retrieved_texts = retrieve_top_chunks_faiss(question, k=top_k)
    context_str = "\n\n".join(retrieved_texts)

    # Add a T5-style prefix
    prompt = (
        "summarize: "
        + f"\nContext:\n{context_str}\n\n"
        + f"Question: {question}\nAnswer in Sinhala:\n"
    )

    response = qa_pipeline(
        prompt,
        max_new_tokens=128,
        do_sample=False,
        num_beams=2
    )
    raw_text = response[0]["generated_text"]

    cleaned_text = clean_output(raw_text)
    return cleaned_text


In [None]:
sample_questions = [
    "ශ්‍රී ලංකා ව්‍යවස්ථාව අනුව අමාත්‍ය මණ්ඩලයට ප්‍රධානයා වන්නේ අගමැතිද?",
    "ශ්‍රී ලංකා ව්‍යවස්ථාව අනුව ජනාධිපතිවරයාට යම් වරදක් වෙනුවෙන් සම්පූර්ණ සමාව ලබා දීමේ බලතල තිබේද?",
    "ශ්‍රී ලංකා ව්‍යවස්ථාව අනුව මෙරට ජාතික භාෂා මොනවාද?"
]

for q in sample_questions:
    print("=== QUESTION ===")
    print(q)
    print("\n--- Without RAG ---")
    naive_ans = answer_without_rag(q)
    print(naive_ans)

    print("\n+++ With RAG +++")
    rag_ans = answer_with_rag(q, top_k=3, use_faiss=True)
    print(rag_ans)
    print("========================================\n")


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


=== QUESTION ===
ශ්‍රී ලංකා ව්‍යවස්ථාව අනුව අමාත්‍ය මණ්ඩලයට ප්‍රධානයා වන්නේ අගමැතිද?

--- Without RAG ---
Answer this question in Sinhala:

Question: ශ්‍රී ලංකා ව්‍යවස්ථාව අනුව අමාත්‍ය මණ්ඩලයට ප්‍රධානයා වන්නේ අගමැතිද?
Answer:

+++ With RAG +++
summarize: 
Context:
ය. තව ද, සෑම අවස්ථාවකම ශ්‍රේෂ්ඨාධිකරණයේ නඩු තීන්දු හා ආඥා එවැනි සෑම කාරණයක දීම අවසානාත්මක හා තීරණාත්මක වන්නේය. අභියාචන අධිකරණ බලය අභියාචනය කිරීමේ අයිතිවාසිකම් ශ්‍රී ලංකා ප්‍රජාතාන්ත්‍රික සමාජවාදී ජනරජයේ ආණ්ඩුක්‍රම ව්‍යවස්ථාව අභියාචනාධිකරණයේ යම් ආඥාවකට, නඩු තීන්දුවකට, තීන්දු ප්‍රකාශයකට හෝ දණ්ඩන නියමයකට විරුද්ධව නීතිය අනුව ශ්‍රේෂ්ඨාධිකරණය වෙත අභියාචනයක් කළ හැකි අවස්ථාවක ශ්‍රේෂ්ඨාධිකරණය විසින් ස්වකීය අධිකරණ බලය ක්‍රියාත්මක කිරීමෙහි ලා ඒ අභියාචනය සංජානනය කිරීමේ තනි හා අනන්‍ය බලය ශ්‍රේෂ්ඨාධිකරණයට ඇත්තේ ය. එසේ ම, අභියාචනාධිකරණයේ එවැනි යම් ආඥාවක් නඩු තීන්දුවක්, තීන්දු ප්‍රකාශයක් හෝ දණ්ඩන නියමයක් ස්ථිර කිරීම, ප්‍රතිවර්තනය කිරීම හෝ වෙනස් කිරීම ශ්‍රේෂ්ඨාධිකරණය විසින් කළ හැක්කේය. තව ද, කවර වූ හෝ මුල් අවස්ථා අධිකරණයකට යුක්ති ධර්මය අනුව අ

In [None]:
!pip install gensim sentencepiece numpy

Collecting gensim
  Downloading gensim-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.1 kB)
Collecting scipy<1.14.0,>=1.7.0 (from gensim)
  Downloading scipy-1.13.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.6/60.6 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
Downloading gensim-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (26.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m26.7/26.7 MB[0m [31m36.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading scipy-1.13.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (38.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m38.6/38.6 MB[0m [31m11.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: scipy, gensim
  Attempting uninstall: scipy
    Found existing installation: scipy 1.14.1
    Uninstalling scipy-1.14.1:
      Successfully 

In [None]:
!pip install faiss-cpu

Collecting faiss-cpu
  Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.4 kB)
Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl (30.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.7/30.7 MB[0m [31m37.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.10.0


In [None]:
import os
import numpy as np
import pandas as pd
import sentencepiece as spm
from gensim.models import Word2Vec
import faiss
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
from typing import List


CONSTITUTION_PATH = "/content/drive/MyDrive/Sri Lanka Constitution-Sinhala.txt"
BPE_MODEL_PATH = "sinhala_spm_bpe.model"
W2V_MODEL_PATH = "word2vec_sinhala_bpe.model"

# We'll use google/mt5-base for generation
GENERATION_MODEL_NAME = "google/mt5-base"


In [None]:
# 1) Read the entire file
with open(CONSTITUTION_PATH, "r", encoding="utf-8") as f:
    full_text = f.read()

# 2) Simple chunking by word count
def simple_chunk_text(text: str, max_tokens: int = 200) -> List[str]:
    """
    Splits text into chunks of roughly `max_tokens` words.
    """
    words = text.split()
    chunks = []
    current_chunk = []
    current_len = 0

    for w in words:
        current_chunk.append(w)
        current_len += 1
        if current_len >= max_tokens:
            chunks.append(" ".join(current_chunk))
            current_chunk = []
            current_len = 0
    # leftover
    if current_chunk:
        chunks.append(" ".join(current_chunk))
    return chunks

chunks = simple_chunk_text(full_text, max_tokens=200)
print(f"Number of chunks: {len(chunks)}")
print("Sample chunk:\n", chunks[0][:300], "...")


Number of chunks: 321
Sample chunk:
 ශ්‍රී ලංකා ප්‍රජාතාන්ත්‍රික සමාජවාදී ජනරජයේ ආණ්ඩුක්‍රම ව්‍යවස්ථාව (2022 ඔක්තෝබර් මස 31 වැනි දින දක්වා සංශෝධිතයි) (විසිඑක්වන සංශෝධනය දක්වා සංශෝධන අන්තර්ගත කරන ලද) ප්‍රතිශෝධිත මුද්‍රණය 2023 පාර්ලිමේන්තු මහ ලේකම් කාර්යාලය මගින් ප්‍රකාශිතයි. ශ්‍රී ලංකා පාර්ලිමේන්තුවේ ව්‍යවස්ථාදායක සේවා දෙපාර්තමේන්තුවේ ප ...


In [None]:
sp = spm.SentencePieceProcessor()
sp.load(BPE_MODEL_PATH)

True

In [None]:
w2v_model = Word2Vec.load(W2V_MODEL_PATH)
embedding_dim = w2v_model.vector_size
print("Word2Vec embedding dimension:", embedding_dim)

Word2Vec embedding dimension: 100


In [None]:
def embed_text_bpe_w2v(text: str) -> np.ndarray:
    """
    1) Tokenize `text` into subwords with the BPE model.
    2) For each token in Word2Vec's vocabulary, get the vector.
    3) Average them to get a single embedding. If no valid tokens, return zeros.
    """
    tokens = sp.encode(text, out_type=str)
    valid_vectors = []
    for tok in tokens:
        if tok in w2v_model.wv:
            valid_vectors.append(w2v_model.wv[tok])

    if not valid_vectors:
        return np.zeros(embedding_dim, dtype="float32")
    else:
        # Average
        mean_vec = np.mean(valid_vectors, axis=0)
        return mean_vec.astype("float32")


In [None]:
chunk_embeddings = []
for i, ctext in enumerate(chunks):
    vec = embed_text_bpe_w2v(ctext)
    chunk_embeddings.append(vec)

chunk_embeddings = np.array(chunk_embeddings, dtype="float32")
print("chunk_embeddings shape:", chunk_embeddings.shape)


chunk_embeddings shape: (321, 100)


In [None]:
dimension = embedding_dim  # e.g. 100 if your w2v dimension was 100
index = faiss.IndexFlatL2(dimension)
index.add(chunk_embeddings)

# Store chunk data in a DataFrame for easy referencing
chunk_data = pd.DataFrame({
    "chunk_text": chunks,
    "chunk_id": list(range(len(chunks)))
})


In [None]:
def retrieve_top_chunks(query: str, k=3) -> List[str]:
    """
    Embed the query using BPE + Word2Vec, search in FAISS index,
    return the top-k chunk strings.
    """
    query_vec = embed_text_bpe_w2v(query)
    query_vec = query_vec.reshape(1, -1)  # shape [1, dimension]
    distances, indices = index.search(query_vec, k)
    results = []
    for idx in indices[0]:
        results.append(chunks[idx])
    return results


In [None]:
tokenizer = AutoTokenizer.from_pretrained(GENERATION_MODEL_NAME)
gen_model = AutoModelForSeq2SeqLM.from_pretrained(GENERATION_MODEL_NAME)

qa_pipeline = pipeline(
    task="text2text-generation",  # or "text-generation" (experiment)
    model=gen_model,
    tokenizer=tokenizer
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/376 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/702 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


pytorch_model.bin:   0%|          | 0.00/2.33G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.33G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Device set to use cpu


In [None]:
def clean_output(text: str) -> str:
    # Remove or replace T5 placeholder tokens <extra_id_0>, etc.
    for i in range(10):
        text = text.replace(f"<extra_id_{i}>", "")
    return text.strip()


In [None]:
def answer_without_rag(question: str) -> str:
    """
    Baseline approach: just feed the question to the model,
    ignoring the constitution chunks.
    """
    prompt = f"Answer this question in Sinhala:\n\nQuestion: {question}\nAnswer:"
    response = qa_pipeline(prompt, max_length=128, do_sample=False)
    return clean_output(response[0]["generated_text"])


In [None]:
def answer_with_rag(question: str, top_k=3) -> str:
    # 1) Retrieve relevant chunks
    retrieved_texts = retrieve_top_chunks(question, k=top_k)
    context_str = "\n\n".join(retrieved_texts)

    # 2) Construct a prompt with relevant context
    prompt = f"""You are a helpful assistant. Below is relevant context in Sinhala.
Answer strictly in Sinhala, do not use placeholders.

Context:
{context_str}

Question: {question}
Answer in Sinhala:
"""

    # 3) Generate
    response = qa_pipeline(
        prompt,
        max_new_tokens=128,  # you can increase if you want more text
        do_sample=False,
        num_beams=2
    )
    raw_text = response[0]["generated_text"]
    return clean_output(raw_text)


In [None]:
sample_questions = [
    "ශ්‍රී ලංකා ව්‍යවස්ථාව අනුව අමාත්‍ය මණ්ඩලයට ප්‍රධානයා වන්නේ අගමැතිද?",
    "ශ්‍රී ලංකා ව්‍යවස්ථාව අනුව ජනාධිපතිවරයාට යම් වරදක් වෙනුවෙන් සම්පූර්ණ සමාව ලබා දීමේ බලතල තිබේද?",
    "ශ්‍රී ලංකා ව්‍යවස්ථාව අනුව මෙරට ජාතික භාෂා මොනවාද?"
]

for q in sample_questions:
    print("=== QUESTION ===")
    print(q)
    print("\n--- Without RAG ---")
    naive_ans = answer_without_rag(q)
    print(naive_ans)

    print("\n+++ With RAG +++")
    rag_ans = answer_with_rag(q, top_k=3)
    print(rag_ans)
    print("========================================\n")


=== QUESTION ===
ශ්‍රී ලංකා ව්‍යවස්ථාව අනුව අමාත්‍ය මණ්ඩලයට ප්‍රධානයා වන්නේ අගමැතිද?

--- Without RAG ---
sinhala - sinhala

+++ With RAG +++

=== QUESTION ===
ශ්‍රී ලංකා ව්‍යවස්ථාව අනුව ජනාධිපතිවරයාට යම් වරදක් වෙනුවෙන් සම්පූර්ණ සමාව ලබා දීමේ බලතල තිබේද?

--- Without RAG ---
. Please check the following answers:

+++ With RAG +++
නීති හෝ නියෝගයක්

=== QUESTION ===
ශ්‍රී ලංකා ව්‍යවස්ථාව අනුව මෙරට ජාතික භාෂා මොනවාද?

--- Without RAG ---
sinhala - sinhala

+++ With RAG +++
වන්නේය. 5.



In [None]:
CONSTITUTION_PATH = "/content/drive/MyDrive/Sri Lanka Constitution-Sinhala.txt"
BPE_MODEL_PATH = "sinhala_spm_bpe.model"
W2V_MODEL_PATH = "word2vec_sinhala_bpe.model"

# We'll use google/mt5-base for generation
GENERATION_MODEL_NAME = "google/mt5-large"

In [None]:
tokenizer = AutoTokenizer.from_pretrained(GENERATION_MODEL_NAME)
gen_model = AutoModelForSeq2SeqLM.from_pretrained(GENERATION_MODEL_NAME)

qa_pipeline = pipeline(
    task="text-generation",  # or "text-generation" (experiment)
    model=gen_model,
    tokenizer=tokenizer
)


tokenizer_config.json:   0%|          | 0.00/376 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/642 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Device set to use cpu
The model 'MT5ForConditionalGeneration' is not supported for text-generation. Supported models are ['AriaTextForCausalLM', 'BambaForCausalLM', 'BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'LlamaForCausalLM', 'CodeGenForCausalLM', 'CohereForCausalLM', 'Cohere2ForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'DbrxForCausalLM', 'DiffLlamaForCausalLM', 'ElectraForCausalLM', 'Emu3ForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'FalconMambaForCausalLM', 'FuyuForCausalLM', 'GemmaForCausalLM', 'Gemma2ForCausalLM', 'GitForCausalLM', 'GlmForCausalLM', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeoXJapaneseForCausalLM', 'GPTJForCausalLM', 'GraniteForCausalLM', 'GraniteMoeForCausalLM', 

In [None]:
sample_questions = [
    "ශ්‍රී ලංකා ව්‍යවස්ථාව අනුව අමාත්‍ය මණ්ඩලයට ප්‍රධානයා වන්නේ අගමැතිද?",
    "ශ්‍රී ලංකා ව්‍යවස්ථාව අනුව ජනාධිපතිවරයාට යම් වරදක් වෙනුවෙන් සම්පූර්ණ සමාව ලබා දීමේ බලතල තිබේද?",
    "ශ්‍රී ලංකා ව්‍යවස්ථාව අනුව මෙරට ජාතික භාෂා මොනවාද?"
]

for q in sample_questions:
    print("=== QUESTION ===")
    print(q)
    print("\n--- Without RAG ---")
    naive_ans = answer_without_rag(q)
    print(naive_ans)

    print("\n+++ With RAG +++")
    rag_ans = answer_with_rag(q, top_k=3)
    print(rag_ans)
    print("========================================\n")


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


=== QUESTION ===
ශ්‍රී ලංකා ව්‍යවස්ථාව අනුව අමාත්‍ය මණ්ඩලයට ප්‍රධානයා වන්නේ අගමැතිද?

--- Without RAG ---
Answer this question in Sinhala:

Question: ශ්‍රී ලංකා ව්‍යවස්ථාව අනුව අමාත්‍ය මණ්ඩලයට ප්‍රධානයා වන්නේ අගමැතිද?
Answer:Answer this question in English: Question: අගමැතිද? Question: අගමැතිද? <extra_id_17>? <extra_id_18>? <extra_id_19>? <extra_id_20>? <extra_id_21>? <extra_id_22>? <extra_id_23>? <extra_id_24>? <extra_id_25>? <extra_id_26>? <extra_id_27>? <extra_id_28>? Question: <extra_id_29>? Question: අගමැතිද? Question: අගමැතිද? <extra_id_30>? <extra_id_31>? Question: අගමැතිද? Question: අගමැතිද? <extra_id_32>

+++ With RAG +++
You are a helpful assistant. Below is relevant context in Sinhala. 
Answer strictly in Sinhala, do not use placeholders.

Context:
ශ්‍රී ලංකා ප්‍රජාතාන්ත්‍රික සමාජවාදී ජනරජයේ ආණ්ඩුක්‍රම ව්‍යවස්ථාව (2022 ඔක්තෝබර් මස 31 වැනි දින දක්වා සංශෝධිතයි) (විසිඑක්වන සංශෝධනය දක්වා සංශෝධන අන්තර්ගත කරන ලද) ප්‍රතිශෝධිත මුද්‍රණය 2023 පාර්ලිමේන්තු මහ ලේකම් කාර්යාලය මගින් ප්‍රක

In [None]:
CONSTITUTION_PATH = "/content/drive/MyDrive/Sri Lanka Constitution-Sinhala.txt"
BPE_MODEL_PATH = "sinhala_spm_bpe.model"
W2V_MODEL_PATH = "word2vec_sinhala_bpe.model"

# We'll use google/mt5-base for generation
GENERATION_MODEL_NAME = "google/mt5-large"

In [None]:
# 1) Read the entire file
with open(CONSTITUTION_PATH, "r", encoding="utf-8") as f:
    full_text = f.read()

# 2) Simple chunking by word count
def simple_chunk_text(text: str, max_tokens: int = 200) -> List[str]:
    """
    Splits text into chunks of roughly `max_tokens` words.
    """
    words = text.split()
    chunks = []
    current_chunk = []
    current_len = 0

    for w in words:
        current_chunk.append(w)
        current_len += 1
        if current_len >= max_tokens:
            chunks.append(" ".join(current_chunk))
            current_chunk = []
            current_len = 0
    # leftover
    if current_chunk:
        chunks.append(" ".join(current_chunk))
    return chunks

chunks = simple_chunk_text(full_text, max_tokens=200)
print(f"Number of chunks: {len(chunks)}")
print("Sample chunk:\n", chunks[0][:300], "...")


Number of chunks: 321
Sample chunk:
 ශ්‍රී ලංකා ප්‍රජාතාන්ත්‍රික සමාජවාදී ජනරජයේ ආණ්ඩුක්‍රම ව්‍යවස්ථාව (2022 ඔක්තෝබර් මස 31 වැනි දින දක්වා සංශෝධිතයි) (විසිඑක්වන සංශෝධනය දක්වා සංශෝධන අන්තර්ගත කරන ලද) ප්‍රතිශෝධිත මුද්‍රණය 2023 පාර්ලිමේන්තු මහ ලේකම් කාර්යාලය මගින් ප්‍රකාශිතයි. ශ්‍රී ලංකා පාර්ලිමේන්තුවේ ව්‍යවස්ථාදායක සේවා දෙපාර්තමේන්තුවේ ප ...


In [None]:
sp = spm.SentencePieceProcessor()
sp.load(BPE_MODEL_PATH)

True

In [None]:
w2v_model = Word2Vec.load(W2V_MODEL_PATH)
embedding_dim = w2v_model.vector_size
print("Word2Vec embedding dimension:", embedding_dim)

Word2Vec embedding dimension: 100


In [None]:
def embed_text_bpe_w2v(text: str) -> np.ndarray:
    """
    1) Tokenize `text` into subwords with the BPE model.
    2) For each token in Word2Vec's vocabulary, get the vector.
    3) Average them to get a single embedding. If no valid tokens, return zeros.
    """
    tokens = sp.encode(text, out_type=str)
    valid_vectors = []
    for tok in tokens:
        if tok in w2v_model.wv:
            valid_vectors.append(w2v_model.wv[tok])

    if not valid_vectors:
        return np.zeros(embedding_dim, dtype="float32")
    else:
        # Average
        mean_vec = np.mean(valid_vectors, axis=0)
        return mean_vec.astype("float32")


In [None]:
chunk_embeddings = []
for i, ctext in enumerate(chunks):
    vec = embed_text_bpe_w2v(ctext)
    chunk_embeddings.append(vec)

chunk_embeddings = np.array(chunk_embeddings, dtype="float32")
print("chunk_embeddings shape:", chunk_embeddings.shape)


chunk_embeddings shape: (321, 100)


In [None]:
dimension = embedding_dim  # e.g. 100 if your w2v dimension was 100
index = faiss.IndexFlatL2(dimension)
index.add(chunk_embeddings)

# Store chunk data in a DataFrame for easy referencing
chunk_data = pd.DataFrame({
    "chunk_text": chunks,
    "chunk_id": list(range(len(chunks)))
})


In [None]:
def retrieve_top_chunks(query: str, k=3) -> List[str]:
    """
    Embed the query using BPE + Word2Vec, search in FAISS index,
    return the top-k chunk strings.
    """
    query_vec = embed_text_bpe_w2v(query)
    query_vec = query_vec.reshape(1, -1)  # shape [1, dimension]
    distances, indices = index.search(query_vec, k)
    results = []
    for idx in indices[0]:
        results.append(chunks[idx])
    return results


In [None]:
tokenizer = AutoTokenizer.from_pretrained(GENERATION_MODEL_NAME)
gen_model = AutoModelForSeq2SeqLM.from_pretrained(GENERATION_MODEL_NAME)

qa_pipeline = pipeline(
    task="text-generation",  # or "text-generation" (experiment)
    model=gen_model,
    tokenizer=tokenizer
)


Device set to use cpu
The model 'MT5ForConditionalGeneration' is not supported for text-generation. Supported models are ['AriaTextForCausalLM', 'BambaForCausalLM', 'BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'LlamaForCausalLM', 'CodeGenForCausalLM', 'CohereForCausalLM', 'Cohere2ForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'DbrxForCausalLM', 'DiffLlamaForCausalLM', 'ElectraForCausalLM', 'Emu3ForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'FalconMambaForCausalLM', 'FuyuForCausalLM', 'GemmaForCausalLM', 'Gemma2ForCausalLM', 'GitForCausalLM', 'GlmForCausalLM', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeoXJapaneseForCausalLM', 'GPTJForCausalLM', 'GraniteForCausalLM', 'GraniteMoeForCausalLM', 

In [None]:
def clean_output(text: str) -> str:
    # Remove or replace T5 placeholder tokens <extra_id_0>, etc.
    for i in range(10):
        text = text.replace(f"<extra_id_{i}>", "")
    return text.strip()


In [None]:
def answer_without_rag(question: str) -> str:
    """
    Baseline approach: just feed the question to the model,
    ignoring the constitution chunks.
    """
    prompt = f"Answer this question in Sinhala:\n\nQuestion: {question}\nAnswer:"
    response = qa_pipeline(prompt, max_length=128, do_sample=False)
    return clean_output(response[0]["generated_text"])


In [None]:
def answer_with_rag(question: str, top_k=3) -> str:
    # 1) Retrieve relevant chunks
    retrieved_texts = retrieve_top_chunks(question, k=top_k)
    context_str = "\n\n".join(retrieved_texts)

    # 2) Construct a prompt with relevant context
    prompt = f"""You are a knowledgeable Sinhala legal assistant.
Read the following Constitution excerpt. Then clearly answer the user’s question in Sinhala.
Provide a direct and concise response, referencing the relevant article or concept.
Do NOT merely repeat the entire excerpt.

Context:
{context_str}

Question: {question}
Direct Answer (in Sinhala, no placeholders):
"""

    # 3) Generate
    # response = qa_pipeline( # Fixed: Removed the unexpected indentation here
    #     prompt,
    #     max_new_tokens=256,  # you can increase if you want more text
    #     do_sample=False,
    #     num_beams=2
    # )
    response = qa_pipeline(
       prompt,
       max_new_tokens=256,
       do_sample=True,
       top_k=50,
       top_p=0.95,
       num_beams=1  # or 4 (experiment!)
    )

    raw_text = response[0]["generated_text"]
    return clean_output(raw_text)

In [None]:
sample_questions = [
    "ශ්‍රී ලංකා ව්‍යවස්ථාව අනුව අමාත්‍ය මණ්ඩලයට ප්‍රධානයා වන්නේ අගමැතිද?",
    "ශ්‍රී ලංකා ව්‍යවස්ථාව අනුව ජනාධිපතිවරයාට යම් වරදක් වෙනුවෙන් සම්පූර්ණ සමාව ලබා දීමේ බලතල තිබේද?",
    "ශ්‍රී ලංකා ව්‍යවස්ථාව අනුව මෙරට ජාතික භාෂා මොනවාද?"
]

for q in sample_questions:
    print("=== QUESTION ===")
    print(q)
    print("\n--- Without RAG ---")
    naive_ans = answer_without_rag(q)
    print(naive_ans)

    print("\n+++ With RAG +++")
    rag_ans = answer_with_rag(q, top_k=3)
    print(rag_ans)
    print("========================================\n")


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


=== QUESTION ===
ශ්‍රී ලංකා ව්‍යවස්ථාව අනුව අමාත්‍ය මණ්ඩලයට ප්‍රධානයා වන්නේ අගමැතිද?

--- Without RAG ---
Answer this question in Sinhala:

Question: ශ්‍රී ලංකා ව්‍යවස්ථාව අනුව අමාත්‍ය මණ්ඩලයට ප්‍රධානයා වන්නේ අගමැතිද?
Answer:Answer this question in English: Question: අගමැතිද? Question: අගමැතිද? <extra_id_17>? <extra_id_18>? <extra_id_19>? <extra_id_20>? <extra_id_21>? <extra_id_22>? <extra_id_23>? <extra_id_24>? <extra_id_25>? <extra_id_26>? <extra_id_27>? <extra_id_28>? Question: <extra_id_29>? Question: අගමැතිද? Question: අගමැතිද? <extra_id_30>? <extra_id_31>? Question: අගමැතිද? Question: අගමැතිද? <extra_id_32>

+++ With RAG +++
You are a knowledgeable Sinhala legal assistant. 
Read the following Constitution excerpt. Then clearly answer the user’s question in Sinhala. 
Provide a direct and concise response, referencing the relevant article or concept. 
Do NOT merely repeat the entire excerpt.

Context:
ශ්‍රී ලංකා ප්‍රජාතාන්ත්‍රික සමාජවාදී ජනරජයේ ආණ්ඩුක්‍රම ව්‍යවස්ථාව (2022 ඔක්තෝබර් 

In [None]:
def answer_with_rag(question: str, top_k=3) -> str:
    # 1) Retrieve relevant chunks
    retrieved_texts = retrieve_top_chunks(question, k=top_k)
    context_str = "\n\n".join(retrieved_texts)

    # 2) Construct a prompt with relevant context
    prompt = f"""You are a Sinhala legal assistant.
    Below is an example of how to answer a question directly:

    Example:
    Context snippet: "...ශ්‍රී ලංකා ජනරජය ඒකීය රජයක්..."
    Question: "ශ්‍රී ලංකා ව්‍යවස්ථාව අනුව රජය ඒකීයද?"
    Direct Answer (in Sinhala): "ඔව්, ඒකීය රජයක් ලෙස ව්‍යවස්ථාවෙහි සඳහන් වේ."

    Now follow the same style with the actual question:

    Context snippet:
    {context_str}

    Question: {question}
    Direct Answer (in Sinhala):
    """

    # 3) Generate
    response = qa_pipeline(
        prompt,
        max_new_tokens=512,
        do_sample=True,
        top_k=50,
        top_p=0.95,
        num_beams=1  # or 4 (experiment!)
    )

    raw_text = response[0]["generated_text"]
    return clean_output(raw_text)

In [None]:
sample_questions = [
    "ශ්‍රී ලංකා ව්‍යවස්ථාව අනුව අමාත්‍ය මණ්ඩලයට ප්‍රධානයා වන්නේ අගමැතිද?",
    "ශ්‍රී ලංකා ව්‍යවස්ථාව අනුව ජනාධිපතිවරයාට යම් වරදක් වෙනුවෙන් සම්පූර්ණ සමාව ලබා දීමේ බලතල තිබේද?",
    "ශ්‍රී ලංකා ව්‍යවස්ථාව අනුව මෙරට ජාතික භාෂා මොනවාද?"
]

for q in sample_questions:
    print("=== QUESTION ===")
    print(q)
    print("\n--- Without RAG ---")
    naive_ans = answer_without_rag(q)
    print(naive_ans)

    print("\n+++ With RAG +++")
    rag_ans = answer_with_rag(q, top_k=3)
    print(rag_ans)
    print("========================================\n")


=== QUESTION ===
ශ්‍රී ලංකා ව්‍යවස්ථාව අනුව අමාත්‍ය මණ්ඩලයට ප්‍රධානයා වන්නේ අගමැතිද?

--- Without RAG ---
Answer this question in Sinhala:

Question: ශ්‍රී ලංකා ව්‍යවස්ථාව අනුව අමාත්‍ය මණ්ඩලයට ප්‍රධානයා වන්නේ අගමැතිද?
Answer:Answer this question in English: Question: අගමැතිද? Question: අගමැතිද? <extra_id_17>? <extra_id_18>? <extra_id_19>? <extra_id_20>? <extra_id_21>? <extra_id_22>? <extra_id_23>? <extra_id_24>? <extra_id_25>? <extra_id_26>? <extra_id_27>? <extra_id_28>? Question: <extra_id_29>? Question: අගමැතිද? Question: අගමැතිද? <extra_id_30>? <extra_id_31>? Question: අගමැතිද? Question: අගමැතිද? <extra_id_32>

+++ With RAG +++
You are a Sinhala legal assistant. 
    Below is an example of how to answer a question directly:
    
    Example:
    Context snippet: "...ශ්‍රී ලංකා ජනරජය ඒකීය රජයක්..."
    Question: "ශ්‍රී ලංකා ව්‍යවස්ථාව අනුව රජය ඒකීයද?"
    Direct Answer (in Sinhala): "ඔව්, ඒකීය රජයක් ලෙස ව්‍යවස්ථාවෙහි සඳහන් වේ."
    
    Now follow the same style with the actual quest

In [None]:
def answer_with_rag(question: str, top_k=3) -> str:
    # 1) Retrieve relevant chunks
    retrieved_texts = retrieve_top_chunks(question, k=top_k)
    context_str = "\n\n".join(retrieved_texts)

    # 2) Construct a prompt with relevant context
    prompt = f"""
    You are a knowledgeable Sinhala legal assistant.
    You must provide a short, direct answer to the user’s question based on the context below.
    DO NOT repeat large portions of the context – just answer succinctly in Sinhala.

    Context:
    {context_str}

    Question: {question}

    Answer in Sinhala (1-2 sentences, no placeholders):
    """


    # 3) Generate
    response = qa_pipeline(
        prompt,
        max_new_tokens=512,
        do_sample=True,
        top_k=50,
        top_p=0.95,
        num_beams=4  # or 4 (experiment!)
    )

    raw_text = response[0]["generated_text"]
    return clean_output(raw_text)

In [None]:
sample_questions = [
    "ශ්‍රී ලංකා ව්‍යවස්ථාව අනුව අමාත්‍ය මණ්ඩලයට ප්‍රධානයා වන්නේ අගමැතිද?",
    "ශ්‍රී ලංකා ව්‍යවස්ථාව අනුව ජනාධිපතිවරයාට යම් වරදක් වෙනුවෙන් සම්පූර්ණ සමාව ලබා දීමේ බලතල තිබේද?",
    "ශ්‍රී ලංකා ව්‍යවස්ථාව අනුව මෙරට ජාතික භාෂා මොනවාද?"
]

for q in sample_questions:
    print("=== QUESTION ===")
    print(q)
    print("\n--- Without RAG ---")
    naive_ans = answer_without_rag(q)
    print(naive_ans)

    print("\n+++ With RAG +++")
    rag_ans = answer_with_rag(q, top_k=3)
    print(rag_ans)
    print("========================================\n")


=== QUESTION ===
ශ්‍රී ලංකා ව්‍යවස්ථාව අනුව අමාත්‍ය මණ්ඩලයට ප්‍රධානයා වන්නේ අගමැතිද?

--- Without RAG ---
Answer this question in Sinhala:

Question: ශ්‍රී ලංකා ව්‍යවස්ථාව අනුව අමාත්‍ය මණ්ඩලයට ප්‍රධානයා වන්නේ අගමැතිද?
Answer:Answer this question in English: Question: අගමැතිද? Question: අගමැතිද? <extra_id_17>? <extra_id_18>? <extra_id_19>? <extra_id_20>? <extra_id_21>? <extra_id_22>? <extra_id_23>? <extra_id_24>? <extra_id_25>? <extra_id_26>? <extra_id_27>? <extra_id_28>? Question: <extra_id_29>? Question: අගමැතිද? Question: අගමැතිද? <extra_id_30>? <extra_id_31>? Question: අගමැතිද? Question: අගමැතිද? <extra_id_32>

+++ With RAG +++
You are a knowledgeable Sinhala legal assistant. 
    You must provide a short, direct answer to the user’s question based on the context below. 
    DO NOT repeat large portions of the context – just answer succinctly in Sinhala.

    Context:
    ශ්‍රී ලංකා ප්‍රජාතාන්ත්‍රික සමාජවාදී ජනරජයේ ආණ්ඩුක්‍රම ව්‍යවස්ථාව (2022 ඔක්තෝබර් මස 31 වැනි දින දක්වා සංශෝධිතයි)

In [None]:
def answer_with_rag(question: str, top_k=3) -> str:
    # 1) Retrieve relevant chunks
    retrieved_texts = retrieve_top_chunks(question, k=top_k)
    context_str = "\n\n".join(retrieved_texts)

    # 2) Construct a prompt with relevant context
    prompt = f"""
    You are a knowledgeable Sinhala legal assistant.
    You must provide a short, direct answer to the user’s question based on the context below.
    DO NOT repeat large portions of the context – just answer succinctly in Sinhala.

    Context:
    {context_str}

    Question: {question}

    Answer in Sinhala (1-2 sentences, no placeholders):
    """


    # 3) Generate
    response = qa_pipeline(
        prompt,
        max_new_tokens=128,
        do_sample=True,
        top_k=50,
        top_p=0.95,
        num_beams=2  # or 4 (experiment!)
    )

    raw_text = response[0]["generated_text"]
    return clean_output(raw_text)

In [None]:
sample_questions = [
    "ශ්‍රී ලංකා ව්‍යවස්ථාව අනුව අමාත්‍ය මණ්ඩලයට ප්‍රධානයා වන්නේ අගමැතිද?",
    "ශ්‍රී ලංකා ව්‍යවස්ථාව අනුව ජනාධිපතිවරයාට යම් වරදක් වෙනුවෙන් සම්පූර්ණ සමාව ලබා දීමේ බලතල තිබේද?",
    "ශ්‍රී ලංකා ව්‍යවස්ථාව අනුව මෙරට ජාතික භාෂා මොනවාද?"
]

for q in sample_questions:
    print("=== QUESTION ===")
    print(q)
    print("\n--- Without RAG ---")
    naive_ans = answer_without_rag(q)
    print(naive_ans)

    print("\n+++ With RAG +++")
    rag_ans = answer_with_rag(q, top_k=3)
    print(rag_ans)
    print("========================================\n")


=== QUESTION ===
ශ්‍රී ලංකා ව්‍යවස්ථාව අනුව අමාත්‍ය මණ්ඩලයට ප්‍රධානයා වන්නේ අගමැතිද?

--- Without RAG ---
Answer this question in Sinhala:

Question: ශ්‍රී ලංකා ව්‍යවස්ථාව අනුව අමාත්‍ය මණ්ඩලයට ප්‍රධානයා වන්නේ අගමැතිද?
Answer:Answer this question in English: Question: අගමැතිද? Question: අගමැතිද? <extra_id_17>? <extra_id_18>? <extra_id_19>? <extra_id_20>? <extra_id_21>? <extra_id_22>? <extra_id_23>? <extra_id_24>? <extra_id_25>? <extra_id_26>? <extra_id_27>? <extra_id_28>? Question: <extra_id_29>? Question: අගමැතිද? Question: අගමැතිද? <extra_id_30>? <extra_id_31>? Question: අගමැතිද? Question: අගමැතිද? <extra_id_32>

+++ With RAG +++
You are a knowledgeable Sinhala legal assistant. 
    You must provide a short, direct answer to the user’s question based on the context below. 
    DO NOT repeat large portions of the context – just answer succinctly in Sinhala.

    Context:
    ශ්‍රී ලංකා ප්‍රජාතාන්ත්‍රික සමාජවාදී ජනරජයේ ආණ්ඩුක්‍රම ව්‍යවස්ථාව (2022 ඔක්තෝබර් මස 31 වැනි දින දක්වා සංශෝධිතයි)

In [None]:
import numpy as np
import pandas as pd
import faiss
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
from gensim.models import Word2Vec
import sentencepiece as spm

# Load models and tokenizer
sp = spm.SentencePieceProcessor()
sp.load(BPE_MODEL_PATH)

w2v_model = Word2Vec.load(W2V_MODEL_PATH)
embedding_dim = w2v_model.vector_size

tokenizer = AutoTokenizer.from_pretrained(GENERATION_MODEL_NAME)
gen_model = AutoModelForSeq2SeqLM.from_pretrained(GENERATION_MODEL_NAME)

qa_pipeline = pipeline(
    task="text2text-generation",
    model=gen_model,
    tokenizer=tokenizer
)

def clean_output(text: str) -> str:
    # Remove or replace T5 placeholder tokens <extra_id_0>, etc.
    for i in range(10):
        text = text.replace(f"<extra_id_{i}>", "")
    return text.strip()

def answer_with_rag(question: str, top_k=3) -> str:
    # 1) Retrieve relevant chunks
    retrieved_texts = retrieve_top_chunks(question, k=top_k)
    context_str = "\n\n".join(retrieved_texts)

    # 2) Construct a prompt with relevant context
    prompt = f"""
    පහත සඳහන් ප්‍රශ්නයට පිළිතුරු සපයන්න. ප්‍රශ්නය: {question}
    සංදර්භය: {context_str}
    පිළිතුර:
    """

    # 3) Generate
    response = qa_pipeline(
        prompt,
        max_new_tokens=128,
        do_sample=True,
        top_k=50,
        top_p=0.95,
        num_beams=4
    )

    raw_text = response[0]["generated_text"]
    return clean_output(raw_text)

# Sample questions
sample_questions = [
    "ශ්‍රී ලංකා ව්‍යවස්ථාව අනුව අමාත්‍ය මණ්ඩලයට ප්‍රධානයා වන්නේ අගමැතිද?",
    "ශ්‍රී ලංකා ව්‍යවස්ථාව අනුව ජනාධිපතිවරයාට යම් වරදක් වෙනුවෙන් සම්පූර්ණ සමාව ලබා දීමේ බලතල තිබේද?",
    "ශ්‍රී ලංකා ව්‍යවස්ථාව අනුව මෙරට ජාතික භාෂා මොනවාද?"
]

for q in sample_questions:
    print("=== QUESTION ===")
    print(q)
    print("\n+++ With RAG +++")
    rag_ans = answer_with_rag(q, top_k=3)
    print(rag_ans)
    print("========================================\n")

Device set to use cpu


=== QUESTION ===
ශ්‍රී ලංකා ව්‍යවස්ථාව අනුව අමාත්‍ය මණ්ඩලයට ප්‍රධානයා වන්නේ අගමැතිද?

+++ With RAG +++
ජනාධිපතිවරයාගේ ධුරය දරන්නේය.   ජනාධිපතිවරයාගේ ධුරය දරන්නේය.   ජනාධිපතිවරයාගේ ධුරය දරන්නේය.   ජනාධිපතිවරයාගේ ධුරය දරන්නේය.  .  . .  .  .  .  <extra_id_10>.  <extra_id_11>.  <extra_id_12>.  <extra_id_13>.  <extra_id_14>.  <extra_id_15>.  <extra_id_16>.  <extra_id_17>.  <extra_id_18>.  <extra_id_19>.  <extra_id_20>.  <extra_id_21>.  <extra_id_22>.  <extra_id_23>.  <extra_id_24>.  <extra_id_25>.  <extra_id_26>.  <extra_id_27>.  <extra_id_28>.  <extra_id_29>.

=== QUESTION ===
ශ්‍රී ලංකා ව්‍යවස්ථාව අනුව ජනාධිපතිවරයාට යම් වරදක් වෙනුවෙන් සම්පූර්ණ සමාව ලබා දීමේ බලතල තිබේද?

+++ With RAG +++
ජනාධිපතිවරයාට හෝ නීතිපතිවරයාට හෝ ඇත්තේ ය. .  ජනාධිපතිවරයාට හෝ ඇත්තේ ය.  ජනාධිපතිවරයාට හෝ ඇත්තේ ය. . . . . . . ජනාධිපතිවරයාගේ ධුරය දරන කවර වූ හෝ තැනැත්තකු සම්බන්ධයෙන් හෝ  <extra_id_10>. <extra_id_11>. <extra_id_12>. <extra_id_13>. <extra_id_14>. <extra_id_15>. <extra_id_16>. <extra_id_17>. <extra_id_18>. <e

In [None]:
import numpy as np
import pandas as pd
import faiss
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
from gensim.models import Word2Vec
import sentencepiece as spm

# Load models and tokenizer
sp = spm.SentencePieceProcessor()
sp.load(BPE_MODEL_PATH)

w2v_model = Word2Vec.load(W2V_MODEL_PATH)
embedding_dim = w2v_model.vector_size

tokenizer = AutoTokenizer.from_pretrained(GENERATION_MODEL_NAME)
gen_model = AutoModelForSeq2SeqLM.from_pretrained(GENERATION_MODEL_NAME)

qa_pipeline = pipeline(
    task="text-generation",
    model=gen_model,
    tokenizer=tokenizer
)

def clean_output(text: str) -> str:
    # Remove or replace T5 placeholder tokens <extra_id_0>, etc.
    for i in range(10):
        text = text.replace(f"<extra_id_{i}>", "")
    return text.strip()

def answer_without_rag(question: str) -> str:
    """
    Baseline approach: just feed the question to the model,
    ignoring the constitution chunks.
    """
    prompt = f"පහත ප්‍රශ්නයට පිළිතුරු සපයන්න: {question}"
    response = qa_pipeline(prompt, max_length=128, do_sample=False)
    return clean_output(response[0]["generated_text"])

def answer_with_rag(question: str, top_k=3) -> str:
    # 1) Retrieve relevant chunks
    retrieved_texts = retrieve_top_chunks(question, k=top_k)
    context_str = "\n\n".join(retrieved_texts)

    # 2) Construct a prompt with relevant context
    prompt = f"""
    පහත සඳහන් ප්‍රශ්නයට පිළිතුරු සපයන්න. ප්‍රශ්නය: {question}
    සංදර්භය: {context_str}
    පිළිතුර:
    """

    # 3) Generate
    response = qa_pipeline(
        prompt,
        max_new_tokens=128,
        do_sample=True,
        top_k=50,
        top_p=0.95,
        num_beams=4
    )

    raw_text = response[0]["generated_text"]
    return clean_output(raw_text)

# Sample questions
sample_questions = [
    "ශ්‍රී ලංකා ව්‍යවස්ථාව අනුව අමාත්‍ය මණ්ඩලයට ප්‍රධානයා වන්නේ අගමැතිද?",
    "ශ්‍රී ලංකා ව්‍යවස්ථාව අනුව ජනාධිපතිවරයාට යම් වරදක් වෙනුවෙන් සම්පූර්ණ සමාව ලබා දීමේ බලතල තිබේද?",
    "ශ්‍රී ලංකා ව්‍යවස්ථාව අනුව මෙරට ජාතික භාෂා මොනවාද?"
]

for q in sample_questions:
    print("=== QUESTION ===")
    print(q)
    print("\n--- Without RAG ---")
    naive_ans = answer_without_rag(q)
    print(naive_ans)

    print("\n+++ With RAG +++")
    rag_ans = answer_with_rag(q, top_k=3)
    print(rag_ans)
    print("========================================\n")

Device set to use cpu
The model 'MT5ForConditionalGeneration' is not supported for text-generation. Supported models are ['AriaTextForCausalLM', 'BambaForCausalLM', 'BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'LlamaForCausalLM', 'CodeGenForCausalLM', 'CohereForCausalLM', 'Cohere2ForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'DbrxForCausalLM', 'DiffLlamaForCausalLM', 'ElectraForCausalLM', 'Emu3ForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'FalconMambaForCausalLM', 'FuyuForCausalLM', 'GemmaForCausalLM', 'Gemma2ForCausalLM', 'GitForCausalLM', 'GlmForCausalLM', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeoXJapaneseForCausalLM', 'GPTJForCausalLM', 'GraniteForCausalLM', 'GraniteMoeForCausalLM', 

=== QUESTION ===
ශ්‍රී ලංකා ව්‍යවස්ථාව අනුව අමාත්‍ය මණ්ඩලයට ප්‍රධානයා වන්නේ අගමැතිද?

--- Without RAG ---
පහත ප්‍රශ්නයට පිළිතුරු සපයන්න: ශ්‍රී ලංකා ව්‍යවස්ථාව අනුව අමාත්‍ය මණ්ඩලයට ප්‍රධානයා වන්නේ අගමැතිද?> - නීතිපති ප් රශ්නය: - <extra_id_14> - <extra_id_15> - - <extra_id_16> - - <extra_id_17> - - <extra_id_18> - - <extra_id_19> - - <extra_id_20>  <extra_id_21>  <extra_id_22>  <extra_id_23>ද?  <extra_id_24>?  <extra_id_25>?  <extra_id_26>?  <extra_id_27>?  <extra_id_28>?  <extra_id_29>? - <extra_id_30> - - <extra_id_31> - - <extra_id_32> - - - - - - -

+++ With RAG +++
පහත සඳහන් ප්‍රශ්නයට පිළිතුරු සපයන්න. ප්‍රශ්නය: ශ්‍රී ලංකා ව්‍යවස්ථාව අනුව අමාත්‍ය මණ්ඩලයට ප්‍රධානයා වන්නේ අගමැතිද?
    සංදර්භය: ශ්‍රී ලංකා ප්‍රජාතාන්ත්‍රික සමාජවාදී ජනරජයේ ආණ්ඩුක්‍රම ව්‍යවස්ථාව (2022 ඔක්තෝබර් මස 31 වැනි දින දක්වා සංශෝධිතයි) (විසිඑක්වන සංශෝධනය දක්වා සංශෝධන අන්තර්ගත කරන ලද) ප්‍රතිශෝධිත මුද්‍රණය 2023 පාර්ලිමේන්තු මහ ලේකම් කාර්යාලය මගින් ප්‍රකාශිතයි. ශ්‍රී ලංකා පාර්ලිමේන්තුවේ ව්‍යවස්ථාදායක සේවා දෙපාර්තමේන්තුව

In [None]:
import numpy as np
import pandas as pd
import faiss
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
from gensim.models import Word2Vec
import sentencepiece as spm

# Load models and tokenizer
sp = spm.SentencePieceProcessor()
sp.load(BPE_MODEL_PATH)

w2v_model = Word2Vec.load(W2V_MODEL_PATH)
embedding_dim = w2v_model.vector_size

tokenizer = AutoTokenizer.from_pretrained(GENERATION_MODEL_NAME)
gen_model = AutoModelForSeq2SeqLM.from_pretrained(GENERATION_MODEL_NAME)

# Use the correct task for MT5: text2text-generation
qa_pipeline = pipeline(
    task="text2text-generation",
    model=gen_model,
    tokenizer=tokenizer
)

def clean_output(text: str) -> str:
    # Remove or replace T5 placeholder tokens <extra_id_0>, etc.
    for i in range(10):
        text = text.replace(f"<extra_id_{i}>", "")
    return text.strip()

def answer_without_rag(question: str) -> str:
    """
    Baseline approach: just feed the question to the model,
    ignoring the constitution chunks.
    """
    prompt = f"පහත ප්‍රශ්නයට පිළිතුරු සපයන්න: {question}"
    response = qa_pipeline(prompt, max_length=128, do_sample=False)
    return clean_output(response[0]["generated_text"])

def answer_with_rag(question: str, top_k=3) -> str:
    # 1) Retrieve relevant chunks
    retrieved_texts = retrieve_top_chunks(question, k=top_k)
    context_str = "\n\n".join(retrieved_texts)

    # 2) Construct a prompt with relevant context
    prompt = f"""
    පහත සඳහන් ප්‍රශ්නයට පිළිතුරු සපයන්න. ප්‍රශ්නය: {question}
    සංදර්භය: {context_str}
    පිළිතුර:
    """

    # 3) Generate
    response = qa_pipeline(
        prompt,
        max_new_tokens=128,
        do_sample=True,
        top_k=50,
        top_p=0.95,
        num_beams=4
    )

    raw_text = response[0]["generated_text"]
    return clean_output(raw_text)

# Sample questions
sample_questions = [
    "ශ්‍රී ලංකා ව්‍යවස්ථාව අනුව අමාත්‍ය මණ්ඩලයට ප්‍රධානයා වන්නේ අගමැතිද?",
    "ශ්‍රී ලංකා ව්‍යවස්ථාව අනුව ජනාධිපතිවරයාට යම් වරදක් වෙනුවෙන් සම්පූර්ණ සමාව ලබා දීමේ බලතල තිබේද?",
    "ශ්‍රී ලංකා ව්‍යවස්ථාව අනුව මෙරට ජාතික භාෂා මොනවාද?"
]

for q in sample_questions:
    print("=== QUESTION ===")
    print(q)
    print("\n--- Without RAG ---")
    naive_ans = answer_without_rag(q)
    print(naive_ans)

    print("\n+++ With RAG +++")
    rag_ans = answer_with_rag(q, top_k=3)
    print(rag_ans)
    print("========================================\n")

Device set to use cpu


=== QUESTION ===
ශ්‍රී ලංකා ව්‍යවස්ථාව අනුව අමාත්‍ය මණ්ඩලයට ප්‍රධානයා වන්නේ අගමැතිද?

--- Without RAG ---
ද? -  - නීතිපති ප් රශ්නය: -  - නීතිපති ප් රශ්නය: -  - නීතිපති ප් රශ්නය: - <extra_id_14> - <extra_id_15> - - <extra_id_16> - - <extra_id_17> - - <extra_id_18> - - <extra_id_19> - - <extra_id_20>  <extra_id_21>  <extra_id_22>  <extra_id_23>ද?  <extra_id_24>?  <extra_id_25>?  <extra_id_26>?  <extra_id_27>?  <extra_id_28>?  <extra_id_29>? - <extra_id_30> - - <extra_id_31> - - <extra_id_32> - - - - - - -

+++ With RAG +++
ජනාධිපතිවරයාගේ ධුරය දරන්නේය.   ජනාධිපතිවරයාගේ ධුරය දරන්නේය.   ජනාධිපතිවරයාගේ ධුරය දරන්නේය.   ජනාධිපතිවරයාගේ ධුරය දරන්නේය.  .  . .  .  .  .  <extra_id_10>.  <extra_id_11>.  <extra_id_12>.  <extra_id_13>.  <extra_id_14>.  <extra_id_15>.  <extra_id_16>.  <extra_id_17>.  <extra_id_18>.  <extra_id_19>.  <extra_id_20>.  <extra_id_21>.  <extra_id_22>.  <extra_id_23>.  <extra_id_24>.  <extra_id_25>.  <extra_id_26>.  <extra_id_27>.  <extra_id_28>.  <extra_id_29>.

=== QUESTION 

In [None]:
import numpy as np
import pandas as pd
import faiss
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
from gensim.models import Word2Vec
import sentencepiece as spm

# Load models and tokenizer
sp = spm.SentencePieceProcessor()
sp.load(BPE_MODEL_PATH)

w2v_model = Word2Vec.load(W2V_MODEL_PATH)
embedding_dim = w2v_model.vector_size

tokenizer = AutoTokenizer.from_pretrained(GENERATION_MODEL_NAME)
gen_model = AutoModelForSeq2SeqLM.from_pretrained(GENERATION_MODEL_NAME)

# Use the correct task for MT5: text2text-generation
qa_pipeline = pipeline(
    task="text2text-generation",
    model=gen_model,
    tokenizer=tokenizer
)

def clean_output(text: str) -> str:
    # Remove or replace T5 placeholder tokens <extra_id_0>, etc.
    for i in range(10):
        text = text.replace(f"<extra_id_{i}>", "")

    # Remove repetitive phrases
    phrases = text.split()
    unique_phrases = []
    for phrase in phrases:
        if phrase not in unique_phrases:
            unique_phrases.append(phrase)
    cleaned_text = " ".join(unique_phrases)

    return cleaned_text.strip()

def answer_without_rag(question: str) -> str:
    """
    Baseline approach: just feed the question to the model,
    ignoring the constitution chunks.
    """
    prompt = f"පහත ප්‍රශ්නයට පිළිතුරු සපයන්න: {question}"
    response = qa_pipeline(prompt, max_length=128, do_sample=False)
    return clean_output(response[0]["generated_text"])

def answer_with_rag(question: str, top_k=3) -> str:
    # 1) Retrieve relevant chunks
    retrieved_texts = retrieve_top_chunks(question, k=top_k)
    context_str = "\n\n".join(retrieved_texts)

    # 2) Construct a prompt with relevant context
    prompt = f"""
    පහත ප්‍රශ්නයට පිළිතුරු සපයන්න. ප්‍රශ්නය: {question}
    සංදර්භය: {context_str}
    පිළිතුර:
    """

    # 3) Generate
    response = qa_pipeline(
        prompt,
        max_new_tokens=128,
        do_sample=True,
        top_k=50,
        top_p=0.95,
        num_beams=4,
        temperature=0.7  # Adjust temperature for less randomness
    )

    raw_text = response[0]["generated_text"]
    return clean_output(raw_text)
# Sample questions
sample_questions = [
    "ශ්‍රී ලංකා ව්‍යවස්ථාව අනුව අමාත්‍ය මණ්ඩලයට ප්‍රධානයා වන්නේ අගමැතිද?",
    "ශ්‍රී ලංකා ව්‍යවස්ථාව අනුව ජනාධිපතිවරයාට යම් වරදක් වෙනුවෙන් සම්පූර්ණ සමාව ලබා දීමේ බලතල තිබේද?",
    "ශ්‍රී ලංකා ව්‍යවස්ථාව අනුව මෙරට ජාතික භාෂා මොනවාද?"
]

for q in sample_questions:
    print("=== QUESTION ===")
    print(q)
    print("\n--- Without RAG ---")
    naive_ans = answer_without_rag(q)
    print(naive_ans)

    print("\n+++ With RAG +++")
    rag_ans = answer_with_rag(q, top_k=3)
    print(rag_ans)
    print("========================================\n")

Device set to use cpu


=== QUESTION ===
ශ්‍රී ලංකා ව්‍යවස්ථාව අනුව අමාත්‍ය මණ්ඩලයට ප්‍රධානයා වන්නේ අගමැතිද?

--- Without RAG ---
ද? - නීතිපති ප් රශ්නය: <extra_id_14> <extra_id_15> <extra_id_16> <extra_id_17> <extra_id_18> <extra_id_19> <extra_id_20> <extra_id_21> <extra_id_22> <extra_id_23>ද? <extra_id_24>? <extra_id_25>? <extra_id_26>? <extra_id_27>? <extra_id_28>? <extra_id_29>? <extra_id_30> <extra_id_31> <extra_id_32>

+++ With RAG +++
අමාත් ය මණ්ඩලයට ප් රධානයා වන්නේ අගමැතිද? - ජනාධිපතිවරයාගේ හෝ අග් රාමාත් යවරයාගේ ධුරය දරන්නේ . <extra_id_10>. <extra_id_11>. <extra_id_12>. <extra_id_13>. <extra_id_14>. <extra_id_15>. <extra_id_16>. <extra_id_17>. <extra_id_18>. <extra_id_19>. <extra_id_20>. <extra_id_21>. <extra_id_22>. <extra_id_23>. <extra_id_24>. <extra_id_25>. <extra_id_26>. <extra_id_27>. <extra_id_28>. <extra_id_29>.

=== QUESTION ===
ශ්‍රී ලංකා ව්‍යවස්ථාව අනුව ජනාධිපතිවරයාට යම් වරදක් වෙනුවෙන් සම්පූර්ණ සමාව ලබා දීමේ බලතල තිබේද?

--- Without RAG ---
? - YAMU නීතිඥ <extra_id_16> <extra_id_17> <extra_i

In [None]:
# Install required packages (if not already installed)
!pip install langchain sentencepiece gensim faiss-cpu transformers --quiet

import os
import numpy as np
import pandas as pd
import sentencepiece as spm
from gensim.models import Word2Vec
import faiss
from typing import List, Dict, Any, Optional

import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline

# LangChain imports
from langchain.docstore.document import Document
from langchain.embeddings.base import Embeddings
from langchain.vectorstores import FAISS as LC_FAISS
from langchain.chains import ConversationalRetrievalChain ,RetrievalQA
from langchain.memory import ConversationBufferMemory
from langchain.llms.base import LLM


In [None]:
###############################################
# 1) Load Constitution and Chunk the Text
###############################################
CONSTITUTION_PATH = "/content/drive/MyDrive/Sri Lanka Constitution-Sinhala.txt"

with open(CONSTITUTION_PATH, "r", encoding="utf-8") as f:
    full_text = f.read()

def simple_chunk_text(text: str, max_tokens: int = 200) -> List[str]:
    """
    Splits text into chunks of roughly max_tokens words.
    """
    words = text.split()
    chunks = []
    current_chunk = []
    current_len = 0
    for w in words:
        current_chunk.append(w)
        current_len += 1
        if current_len >= max_tokens:
            chunks.append(" ".join(current_chunk))
            current_chunk = []
            current_len = 0
    if current_chunk:
        chunks.append(" ".join(current_chunk))
    return chunks

chunks = simple_chunk_text(full_text, max_tokens=200)
print(f"Number of chunks: {len(chunks)}")
print("Sample chunk:\n", chunks[0][:300], "...\n")

# Convert each chunk into a LangChain Document
documents = []
for i, ctext in enumerate(chunks):
    doc = Document(page_content=ctext, metadata={"chunk_id": i})
    documents.append(doc)

###############################################
# 2) Load your Sinhala BPE Tokenizer & Word2Vec
###############################################
BPE_MODEL_PATH = "sinhala_spm_bpe.model"
W2V_MODEL_PATH = "word2vec_sinhala_bpe.model"

# Load SentencePiece BPE model
sp = spm.SentencePieceProcessor()
sp.load(BPE_MODEL_PATH)

# Load Word2Vec model
w2v_model = Word2Vec.load(W2V_MODEL_PATH)
embedding_dim = w2v_model.vector_size
print("Word2Vec embedding dimension:", embedding_dim)

def embed_text_bpe_w2v(text: str) -> np.ndarray:
    """
    Tokenize text with the BPE model, look up Word2Vec vectors, and return the average.
    """
    tokens = sp.encode(text, out_type=str)
    valid_vectors = []
    for tok in tokens:
        if tok in w2v_model.wv:
            valid_vectors.append(w2v_model.wv[tok])
    if not valid_vectors:
        return np.zeros(embedding_dim, dtype="float32")
    return np.mean(valid_vectors, axis=0).astype("float32")

###############################################
# 3) Create a Custom LangChain Embeddings Wrapper
###############################################
class SinhalaBpeWord2VecEmbeddings(Embeddings):
    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        return [embed_text_bpe_w2v(t).tolist() for t in texts]
    def embed_query(self, text: str) -> List[float]:
        return embed_text_bpe_w2v(text).tolist()

embedding_fn = SinhalaBpeWord2VecEmbeddings()

###############################################
# 4) Create a FAISS Vector Store with LangChain
###############################################
faiss_store = LC_FAISS.from_documents(documents, embedding=embedding_fn)

###############################################
# 5) Define a Custom LLM Wrapper for mT5 (using google/mt5-large)
###############################################
from pydantic import Field

class MT5LocalLLM(LLM):
    local_pipeline: Any = Field(..., description="The pipeline for text generation")
    max_new_tokens: int = Field(default=128, description="Max new tokens to generate")

    @property
    def _llm_type(self) -> str:
        return "mT5Local"

    def _call(self, prompt: str, stop: Optional[List[str]] = None) -> str:
        response = self.local_pipeline(
            prompt,
            max_new_tokens=self.max_new_tokens,
            do_sample=False,
            num_beams=2
        )
        return response[0]["generated_text"]

    @property
    def _identifying_params(self) -> Dict[str, Any]:
        return {"model_type": "mt5", "max_new_tokens": self.max_new_tokens}

# Load mT5-large model for generation
GENERATION_MODEL_NAME = "google/mt5-large"
tokenizer = AutoTokenizer.from_pretrained(GENERATION_MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(GENERATION_MODEL_NAME)
local_pipeline = pipeline(
    task="text-generation",  # switched to text-generation
    model=model,
    tokenizer=tokenizer
)

mt5_llm = MT5LocalLLM(local_pipeline=local_pipeline, max_new_tokens=128)

###############################################
# 6) Build the Conversational Retrieval Chain with Memory
###############################################
memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)

qa_chain = ConversationalRetrievalChain.from_llm(
    llm=mt5_llm,
    retriever=faiss_store.as_retriever(),
    memory=memory,
    verbose=True
)

# Optional: Customize the prompt template inside the chain to enforce concise answers.
qa_chain.combine_docs_chain.llm_chain.prompt.template = """
You are a highly knowledgeable Sinhala legal assistant.
Using the context provided below, answer the following question in 1-2 concise sentences in Sinhala.
Do not repeat long excerpts from the context.

Context:
{context}

Question: {question}

Answer:
"""

###############################################
# 7) Run a Sample Conversational Session
###############################################
questions = [
    "ශ්‍රී ලංකා ව්‍යවස්ථාව අනුව අමාත්‍ය මණ්ඩලයට ප්‍රධානයා වන්නේ අගමැතිද?",
    "ඔව් නම්, එය කොයි වරදකට/අයිතිවාසිකමකට අදාලද?",
    "ශ්‍රී ලංකා ව්‍යවස්ථාව අනුව මෙරට ජාතික භාෂා මොනවාද?"
]

for q in questions:
    result = qa_chain({"question": q})
    print(f"User: {q}")
    print(f"Assistant: {result['answer']}\n")


Number of chunks: 321
Sample chunk:
 ශ්‍රී ලංකා ප්‍රජාතාන්ත්‍රික සමාජවාදී ජනරජයේ ආණ්ඩුක්‍රම ව්‍යවස්ථාව (2022 ඔක්තෝබර් මස 31 වැනි දින දක්වා සංශෝධිතයි) (විසිඑක්වන සංශෝධනය දක්වා සංශෝධන අන්තර්ගත කරන ලද) ප්‍රතිශෝධිත මුද්‍රණය 2023 පාර්ලිමේන්තු මහ ලේකම් කාර්යාලය මගින් ප්‍රකාශිතයි. ශ්‍රී ලංකා පාර්ලිමේන්තුවේ ව්‍යවස්ථාදායක සේවා දෙපාර්තමේන්තුවේ ප ...

Word2Vec embedding dimension: 100


Device set to use cpu
The model 'MT5ForConditionalGeneration' is not supported for text-generation. Supported models are ['AriaTextForCausalLM', 'BambaForCausalLM', 'BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'LlamaForCausalLM', 'CodeGenForCausalLM', 'CohereForCausalLM', 'Cohere2ForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'DbrxForCausalLM', 'DiffLlamaForCausalLM', 'ElectraForCausalLM', 'Emu3ForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'FalconMambaForCausalLM', 'FuyuForCausalLM', 'GemmaForCausalLM', 'Gemma2ForCausalLM', 'GitForCausalLM', 'GlmForCausalLM', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeoXJapaneseForCausalLM', 'GPTJForCausalLM', 'GraniteForCausalLM', 'GraniteMoeForCausalLM', 



[1m> Entering new StuffDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3m
You are a highly knowledgeable Sinhala legal assistant.
Using the context provided below, answer the following question in 1-2 concise sentences in Sinhala.
Do not repeat long excerpts from the context.

Context:
ශ්‍රී ලංකා ප්‍රජාතාන්ත්‍රික සමාජවාදී ජනරජයේ ආණ්ඩුක්‍රම ව්‍යවස්ථාව (2022 ඔක්තෝබර් මස 31 වැනි දින දක්වා සංශෝධිතයි) (විසිඑක්වන සංශෝධනය දක්වා සංශෝධන අන්තර්ගත කරන ලද) ප්‍රතිශෝධිත මුද්‍රණය 2023 පාර්ලිමේන්තු මහ ලේකම් කාර්යාලය මගින් ප්‍රකාශිතයි. ශ්‍රී ලංකා පාර්ලිමේන්තුවේ ව්‍යවස්ථාදායක සේවා දෙපාර්තමේන්තුවේ පනත් කෙටුම්පත් කාර්යාංශය විසින් සංස්කරණය කරන ලද මෙම නිල නොවන ප්‍රතිශෝධිත මුද්‍රණය මගින් ආණ්ඩුක්‍රම ව්‍යවස්ථාවේ විසිඑක්වන සංශෝධනය දක්වා පාර්ලිමේන්තුව විසින් වරින් වර සංශෝධනය කරන ලද ශ්‍රී ලංකා ප්‍රජාතාන්ත්‍රික සමාජවාදී ජනරජයේ ආණ්ඩුක්‍රම ව්‍යවස්ථාව නැවත ප්‍රකාශයට පත් කරනු ලැබේ. අදාළ පිටු අග ඇති සටහන් මගින් ආණ්ඩුක්‍රම ව්‍යවස්ථාව සංශෝධනය කළ ඒ ඒ සංශෝධන දැක

In [None]:
import os
import numpy as np
import pandas as pd
import sentencepiece as spm
from gensim.models import Word2Vec
import faiss
from typing import List, Dict, Any, Optional

import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline

# LangChain imports
from langchain.docstore.document import Document
from langchain.embeddings.base import Embeddings
from langchain.vectorstores import FAISS as LC_FAISS
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
from langchain.llms.base import LLM
from pydantic import Field

In [None]:
###############################################
# 1) Load and Chunk the Constitution Text
###############################################
CONSTITUTION_PATH = "/content/drive/MyDrive/Sri Lanka Constitution-Sinhala.txt"
with open(CONSTITUTION_PATH, "r", encoding="utf-8") as f:
    full_text = f.read()

def simple_chunk_text(text: str, max_tokens: int = 200) -> List[str]:
    """Naively splits text into chunks of roughly max_tokens words."""
    words = text.split()
    chunks = []
    current_chunk = []
    current_len = 0
    for w in words:
        current_chunk.append(w)
        current_len += 1
        if current_len >= max_tokens:
            chunks.append(" ".join(current_chunk))
            current_chunk = []
            current_len = 0
    if current_chunk:
        chunks.append(" ".join(current_chunk))
    return chunks

chunks = simple_chunk_text(full_text, max_tokens=200)
print(f"Number of chunks: {len(chunks)}")
print("Sample chunk:\n", chunks[0][:300], "...\n")

# Convert each chunk into a LangChain Document
documents = [Document(page_content=ctext, metadata={"chunk_id": i}) for i, ctext in enumerate(chunks)]

###############################################
# 2) Load Sinhala BPE Tokenizer and Word2Vec Model
###############################################
BPE_MODEL_PATH = "sinhala_spm_bpe.model"
W2V_MODEL_PATH = "word2vec_sinhala_bpe.model"

sp_processor = spm.SentencePieceProcessor()
sp_processor.load(BPE_MODEL_PATH)

w2v_model = Word2Vec.load(W2V_MODEL_PATH)
embedding_dim = w2v_model.vector_size
print("Word2Vec embedding dimension:", embedding_dim)

def embed_text_bpe_w2v(text: str) -> np.ndarray:
    """Tokenize text with the BPE model and average Word2Vec vectors for tokens."""
    tokens = sp_processor.encode(text, out_type=str)
    valid_vectors = []
    for tok in tokens:
        if tok in w2v_model.wv:
            valid_vectors.append(w2v_model.wv[tok])
    if not valid_vectors:
        return np.zeros(embedding_dim, dtype="float32")
    return np.mean(valid_vectors, axis=0).astype("float32")

###############################################
# 3) Create a Custom Embeddings Class for LangChain
###############################################
class SinhalaBpeWord2VecEmbeddings(Embeddings):
    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        return [embed_text_bpe_w2v(t).tolist() for t in texts]
    def embed_query(self, text: str) -> List[float]:
        return embed_text_bpe_w2v(text).tolist()

embedding_fn = SinhalaBpeWord2VecEmbeddings()

###############################################
# 4) Create a FAISS Vector Store with LangChain
###############################################
faiss_store = LC_FAISS.from_documents(documents, embedding=embedding_fn)

###############################################
# 5) Create a Custom LLM Wrapper for mT5
###############################################
class MT5LocalLLM(LLM):
    local_pipeline: Any = Field(...)  # Explicit field declaration
    max_new_tokens: int = Field(default=128)

    @property
    def _llm_type(self) -> str:
        return "mT5Local"

    def _call(self, prompt: str, stop: Optional[List[str]] = None) -> str:
        response = self.local_pipeline(
            prompt,
            max_new_tokens=self.max_new_tokens,
            do_sample=False,
            num_beams=2
        )
        return response[0]["generated_text"]

    @property
    def _identifying_params(self) -> Dict[str, Any]:
        return {"model_type": "mt5", "max_new_tokens": self.max_new_tokens}

# Load the mT5-large model and tokenizer
GENERATION_MODEL_NAME = "google/mt5-large"
tokenizer = AutoTokenizer.from_pretrained(GENERATION_MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(GENERATION_MODEL_NAME)

local_pipeline = pipeline(
    task="text2text-generation",
    model=model,
    tokenizer=tokenizer,
    device=0 if torch.cuda.is_available() else -1
)

mt5_llm = MT5LocalLLM(local_pipeline=local_pipeline, max_new_tokens=128)

###############################################
# 6) Build a Conversational Retrieval Chain with Memory
###############################################
memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)

qa_chain = ConversationalRetrievalChain.from_llm(
    llm=mt5_llm,
    retriever=faiss_store.as_retriever(),
    memory=memory,
    verbose=True
)

# Override the prompt template to force concise, direct answers.
qa_chain.combine_docs_chain.llm_chain.prompt.template = """
You are a highly knowledgeable Sinhala legal assistant.
Using the context provided below, answer the following question in 1-2 concise sentences in Sinhala.
Do not repeat long excerpts from the context.

Context:
{context}

Question: {question}

Answer:
"""

###############################################
# 7) Define a Function to Clean Output and Rephrase Follow-Up Questions
###############################################
def clean_output(text: str) -> str:
    """Remove T5 placeholder tokens."""
    for i in range(10):
        text = text.replace(f"<extra_id_{i}>", "")
    return text.strip()

def rephrase_followup(question: str, chat_history: str) -> str:
    """
    Rephrase the follow-up input as a standalone question in Sinhala.
    Only output the standalone question.
    """
    prompt = f"""
You are a highly knowledgeable Sinhala legal assistant.
Given the following conversation and the follow up input, rephrase the follow up input as a standalone question in Sinhala.
Do not include any extra context or explanations; output only the standalone question.

Chat History:
{chat_history}

Follow Up Input: {question}

Standalone Question (in Sinhala):
"""
    response = local_pipeline(
        prompt,
        max_new_tokens=64,
        do_sample=False,
        num_beams=4
    )
    return clean_output(response[0]["generated_text"])

###############################################
# 8) Run a Sample Conversational Session and Rephrase Follow-Up
###############################################
sample_questions = [
    "ශ්‍රී ලංකා ව්‍යවස්ථාව අනුව අමාත්‍ය මණ්ඩලයට ප්‍රධානයා වන්නේ අගමැතිද?",
    "ඔව් නම්, එය කොයි වරදකට/අයිතිවාසිකමකට අදාලද?",
    "ශ්‍රී ලංකා ව්‍යවස්ථාව අනුව මෙරට ජාතික භාෂා මොනවාද?"
]

for q in sample_questions:
    result = qa_chain({"question": q})
    print(f"User: {q}")
    print(f"Assistant: {result['answer']}\n")

# Simulate rephrasing a follow-up question.
chat_history = (
    "Human: ශ්‍රී ලංකා ව්‍යවස්ථාව අනුව අමාත්‍ය මණ්ඩලයට ප්‍රධානයා වන්නේ අගමැතිද?\n"
    "Assistant: [Previous concise answer here]"
)
followup_input = "ඔව් නම්, එය කොයි වරදකට/අයිතිවාසිකමකට අදාලද?"
standalone_question = rephrase_followup(followup_input, chat_history)
print("Standalone Question:", standalone_question)


Number of chunks: 321
Sample chunk:
 ශ්‍රී ලංකා ප්‍රජාතාන්ත්‍රික සමාජවාදී ජනරජයේ ආණ්ඩුක්‍රම ව්‍යවස්ථාව (2022 ඔක්තෝබර් මස 31 වැනි දින දක්වා සංශෝධිතයි) (විසිඑක්වන සංශෝධනය දක්වා සංශෝධන අන්තර්ගත කරන ලද) ප්‍රතිශෝධිත මුද්‍රණය 2023 පාර්ලිමේන්තු මහ ලේකම් කාර්යාලය මගින් ප්‍රකාශිතයි. ශ්‍රී ලංකා පාර්ලිමේන්තුවේ ව්‍යවස්ථාදායක සේවා දෙපාර්තමේන්තුවේ ප ...

Word2Vec embedding dimension: 100


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Device set to use cpu
  memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
  result = qa_chain



[1m> Entering new StuffDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3m
You are a highly knowledgeable Sinhala legal assistant.
Using the context provided below, answer the following question in 1-2 concise sentences in Sinhala.
Do not repeat long excerpts from the context.

Context:
ශ්‍රී ලංකා ප්‍රජාතාන්ත්‍රික සමාජවාදී ජනරජයේ ආණ්ඩුක්‍රම ව්‍යවස්ථාව (2022 ඔක්තෝබර් මස 31 වැනි දින දක්වා සංශෝධිතයි) (විසිඑක්වන සංශෝධනය දක්වා සංශෝධන අන්තර්ගත කරන ලද) ප්‍රතිශෝධිත මුද්‍රණය 2023 පාර්ලිමේන්තු මහ ලේකම් කාර්යාලය මගින් ප්‍රකාශිතයි. ශ්‍රී ලංකා පාර්ලිමේන්තුවේ ව්‍යවස්ථාදායක සේවා දෙපාර්තමේන්තුවේ පනත් කෙටුම්පත් කාර්යාංශය විසින් සංස්කරණය කරන ලද මෙම නිල නොවන ප්‍රතිශෝධිත මුද්‍රණය මගින් ආණ්ඩුක්‍රම ව්‍යවස්ථාවේ විසිඑක්වන සංශෝධනය දක්වා පාර්ලිමේන්තුව විසින් වරින් වර සංශෝධනය කරන ලද ශ්‍රී ලංකා ප්‍රජාතාන්ත්‍රික සමාජවාදී ජනරජයේ ආණ්ඩුක්‍රම ව්‍යවස්ථාව නැවත ප්‍රකාශයට පත් කරනු ලැබේ. අදාළ පිටු අග ඇති සටහන් මගින් ආණ්ඩුක්‍රම ව්‍යවස්ථාව සංශෝධනය කළ ඒ ඒ සංශෝධන දැක

KeyboardInterrupt: 

In [None]:
###############################################
# 1) Load and Chunk the Constitution Text
###############################################
CONSTITUTION_PATH = "/content/drive/MyDrive/Sri Lanka Constitution-Sinhala.txt"
with open(CONSTITUTION_PATH, "r", encoding="utf-8") as f:
    full_text = f.read()

def simple_chunk_text(text: str, max_tokens: int = 200) -> List[str]:
    """Naively splits text into chunks of roughly max_tokens words."""
    words = text.split()
    chunks = []
    current_chunk = []
    current_len = 0
    for w in words:
        current_chunk.append(w)
        current_len += 1
        if current_len >= max_tokens:
            chunks.append(" ".join(current_chunk))
            current_chunk = []
            current_len = 0
    if current_chunk:
        chunks.append(" ".join(current_chunk))
    return chunks

chunks = simple_chunk_text(full_text, max_tokens=200)
print(f"Number of chunks: {len(chunks)}")
print("Sample chunk:\n", chunks[0][:300], "...\n")

# Convert each chunk into a LangChain Document
documents = [Document(page_content=ctext, metadata={"chunk_id": i}) for i, ctext in enumerate(chunks)]

###############################################
# 2) Load Sinhala BPE Tokenizer and Word2Vec Model
###############################################
BPE_MODEL_PATH = "sinhala_spm_bpe.model"
W2V_MODEL_PATH = "word2vec_sinhala_bpe.model"

sp_processor = spm.SentencePieceProcessor()
sp_processor.load(BPE_MODEL_PATH)

w2v_model = Word2Vec.load(W2V_MODEL_PATH)
embedding_dim = w2v_model.vector_size
print("Word2Vec embedding dimension:", embedding_dim)

def embed_text_bpe_w2v(text: str) -> np.ndarray:
    """Tokenize text with the BPE model and average Word2Vec vectors for tokens."""
    tokens = sp_processor.encode(text, out_type=str)
    valid_vectors = []
    for tok in tokens:
        if tok in w2v_model.wv:
            valid_vectors.append(w2v_model.wv[tok])
    if not valid_vectors:
        return np.zeros(embedding_dim, dtype="float32")
    return np.mean(valid_vectors, axis=0).astype("float32")

###############################################
# 3) Create a Custom Embeddings Class for LangChain
###############################################
class SinhalaBpeWord2VecEmbeddings(Embeddings):
    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        return [embed_text_bpe_w2v(t).tolist() for t in texts]
    def embed_query(self, text: str) -> List[float]:
        return embed_text_bpe_w2v(text).tolist()

embedding_fn = SinhalaBpeWord2VecEmbeddings()

###############################################
# 4) Create a FAISS Vector Store with LangChain
###############################################
faiss_store = LC_FAISS.from_documents(documents, embedding=embedding_fn)

###############################################
# 5) Create a Custom LLM Wrapper for mT5
###############################################
# Explicitly declare fields to satisfy Pydantic's requirements.
class MT5LocalLLM(LLM):
    local_pipeline: Any = Field(...)  # Explicit field declaration
    max_new_tokens: int = Field(default=128)

    @property
    def _llm_type(self) -> str:
        return "mT5Local"

    def _call(self, prompt: str, stop: Optional[List[str]] = None) -> str:
        response = self.local_pipeline(
            prompt,
            max_new_tokens=self.max_new_tokens,
            do_sample=False,
            num_beams=2
        )
        return response[0]["generated_text"]

    @property
    def _identifying_params(self) -> Dict[str, Any]:
        return {"model_type": "mt5", "max_new_tokens": self.max_new_tokens}

# Load the mT5-large model and tokenizer
GENERATION_MODEL_NAME = "google/mt5-large"
tokenizer = AutoTokenizer.from_pretrained(GENERATION_MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(GENERATION_MODEL_NAME)

local_pipeline = pipeline(
    task="text2text-generation",
    model=model,
    tokenizer=tokenizer,
    device=0 if torch.cuda.is_available() else -1
)

mt5_llm = MT5LocalLLM(local_pipeline=local_pipeline, max_new_tokens=128)

###############################################
# 6) Build a Conversational Retrieval Chain with Memory
###############################################
memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)

qa_chain = ConversationalRetrievalChain.from_llm(
    llm=mt5_llm,
    retriever=faiss_store.as_retriever(),
    memory=memory,
    verbose=True
)

# Override the prompt template for concise answers.
qa_chain.combine_docs_chain.llm_chain.prompt.template = """
You are a highly knowledgeable Sinhala legal assistant.
Using the context provided below, answer the following question in 1-2 concise sentences in Sinhala.
Do not repeat long excerpts from the context.

Context:
{context}

Question: {question}

Answer:
"""

###############################################
# 7) Run a Sample Conversational Session
###############################################
sample_questions = [
    "ශ්‍රී ලංකා ව්‍යවස්ථාව අනුව අමාත්‍ය මණ්ඩලයට ප්‍රධානයා වන්නේ අගමැතිද?",
    "ඔව් නම්, එය කොයි වරදකට/අයිතිවාසිකමකට අදාලද?",
    "ශ්‍රී ලංකා ව්‍යවස්ථාව අනුව මෙරට ජාතික භාෂා මොනවාද?"
]

for q in sample_questions:
    result = qa_chain({"question": q})
    print(f"User: {q}")
    print(f"Assistant: {result['answer']}\n")


Number of chunks: 321
Sample chunk:
 ශ්‍රී ලංකා ප්‍රජාතාන්ත්‍රික සමාජවාදී ජනරජයේ ආණ්ඩුක්‍රම ව්‍යවස්ථාව (2022 ඔක්තෝබර් මස 31 වැනි දින දක්වා සංශෝධිතයි) (විසිඑක්වන සංශෝධනය දක්වා සංශෝධන අන්තර්ගත කරන ලද) ප්‍රතිශෝධිත මුද්‍රණය 2023 පාර්ලිමේන්තු මහ ලේකම් කාර්යාලය මගින් ප්‍රකාශිතයි. ශ්‍රී ලංකා පාර්ලිමේන්තුවේ ව්‍යවස්ථාදායක සේවා දෙපාර්තමේන්තුවේ ප ...

Word2Vec embedding dimension: 100


Device set to use cpu




[1m> Entering new StuffDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3m
You are a highly knowledgeable Sinhala legal assistant.
Using the context provided below, answer the following question in 1-2 concise sentences in Sinhala.
Do not repeat long excerpts from the context.

Context:
ශ්‍රී ලංකා ප්‍රජාතාන්ත්‍රික සමාජවාදී ජනරජයේ ආණ්ඩුක්‍රම ව්‍යවස්ථාව (2022 ඔක්තෝබර් මස 31 වැනි දින දක්වා සංශෝධිතයි) (විසිඑක්වන සංශෝධනය දක්වා සංශෝධන අන්තර්ගත කරන ලද) ප්‍රතිශෝධිත මුද්‍රණය 2023 පාර්ලිමේන්තු මහ ලේකම් කාර්යාලය මගින් ප්‍රකාශිතයි. ශ්‍රී ලංකා පාර්ලිමේන්තුවේ ව්‍යවස්ථාදායක සේවා දෙපාර්තමේන්තුවේ පනත් කෙටුම්පත් කාර්යාංශය විසින් සංස්කරණය කරන ලද මෙම නිල නොවන ප්‍රතිශෝධිත මුද්‍රණය මගින් ආණ්ඩුක්‍රම ව්‍යවස්ථාවේ විසිඑක්වන සංශෝධනය දක්වා පාර්ලිමේන්තුව විසින් වරින් වර සංශෝධනය කරන ලද ශ්‍රී ලංකා ප්‍රජාතාන්ත්‍රික සමාජවාදී ජනරජයේ ආණ්ඩුක්‍රම ව්‍යවස්ථාව නැවත ප්‍රකාශයට පත් කරනු ලැබේ. අදාළ පිටු අග ඇති සටහන් මගින් ආණ්ඩුක්‍රම ව්‍යවස්ථාව සංශෝධනය කළ ඒ ඒ සංශෝධන දැක

In [None]:
import re  # Import the 're' module

In [None]:
# -----------------------------
# 1) Load and Chunk the Constitution Text
# -----------------------------
CONSTITUTION_PATH = "/content/drive/MyDrive/Sri Lanka Constitution-Sinhala.txt"
with open(CONSTITUTION_PATH, "r", encoding="utf-8") as f:
    full_text = f.read()

def simple_chunk_text(text: str, max_tokens: int = 200) -> List[str]:
    """
    Naively split text into chunks of roughly `max_tokens` words.
    """
    words = text.split()
    chunks = []
    current_chunk = []
    current_len = 0
    for w in words:
        current_chunk.append(w)
        current_len += 1
        if current_len >= max_tokens:
            chunks.append(" ".join(current_chunk))
            current_chunk = []
            current_len = 0
    if current_chunk:
        chunks.append(" ".join(current_chunk))
    return chunks

chunks = simple_chunk_text(full_text, max_tokens=200)
print(f"Number of chunks: {len(chunks)}")
print("Sample chunk:\n", chunks[0][:300], "...\n")

# Convert chunks to LangChain Documents
from langchain.docstore.document import Document
documents = [Document(page_content=chunk, metadata={"chunk_id": i}) for i, chunk in enumerate(chunks)]

# -----------------------------
# 2) Load the Sinhala Tokenizer (SentencePiece) and Word2Vec Model
# -----------------------------
BPE_MODEL_PATH = "sinhala_spm_bpe.model"
W2V_MODEL_PATH = "word2vec_sinhala_bpe.model"

sp_processor = spm.SentencePieceProcessor()
sp_processor.load(BPE_MODEL_PATH)

w2v_model = Word2Vec.load(W2V_MODEL_PATH)
embedding_dim = w2v_model.vector_size
print("Word2Vec embedding dimension:", embedding_dim)

def embed_text_bpe_w2v(text: str) -> np.ndarray:
    """
    Tokenize the text using the SentencePiece model and average the Word2Vec vectors.
    """
    tokens = sp_processor.encode(text, out_type=str)
    vectors = []
    for token in tokens:
        if token in w2v_model.wv:
            vectors.append(w2v_model.wv[token])
    if not vectors:
        return np.zeros(embedding_dim, dtype="float32")
    return np.mean(vectors, axis=0).astype("float32")

# -----------------------------
# 3) Create a Custom Embeddings Class for LangChain
# -----------------------------
from langchain.embeddings.base import Embeddings

class SinhalaBpeWord2VecEmbeddings(Embeddings):
    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        return [embed_text_bpe_w2v(t).tolist() for t in texts]
    def embed_query(self, text: str) -> List[float]:
        return embed_text_bpe_w2v(text).tolist()

embedding_fn = SinhalaBpeWord2VecEmbeddings()

# -----------------------------
# 4) Build a FAISS Vector Store Using LangChain
# -----------------------------
from langchain.vectorstores import FAISS as LC_FAISS
faiss_store = LC_FAISS.from_documents(documents, embedding=embedding_fn)

# -----------------------------
# 5) Build a Custom LLM Wrapper for mT5-large
# -----------------------------
from langchain.llms.base import LLM
from pydantic import Field

class MT5LocalLLM(LLM):
    local_pipeline: Any = Field(...)  # Explicit field declaration for Pydantic
    max_new_tokens: int = Field(default=128)

    @property
    def _llm_type(self) -> str:
        return "mt5local"

    def _call(self, prompt: str, stop: Optional[List[str]] = None) -> str:
        response = self.local_pipeline(
            prompt,
            max_new_tokens=self.max_new_tokens,
            do_sample=False,
            num_beams=2
        )
        return response[0]["generated_text"]

    @property
    def _identifying_params(self) -> Dict[str, Any]:
        return {"model_type": "mt5local", "max_new_tokens": self.max_new_tokens}

GENERATION_MODEL_NAME = "google/mt5-large"
tokenizer = AutoTokenizer.from_pretrained(GENERATION_MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(GENERATION_MODEL_NAME)
local_pipeline = pipeline(
    task="text2text-generation",
    model=model,
    tokenizer=tokenizer,
    device=0 if torch.cuda.is_available() else -1
)
mt5_llm = MT5LocalLLM(local_pipeline=local_pipeline, max_new_tokens=128)

# -----------------------------
# 6) Define a Clean-Up Function to Remove Placeholder Tokens
# -----------------------------
def clean_output(text: str) -> str:
    """
    Remove T5 placeholder tokens like <extra_id_0>, <extra_id_1>, etc.
    """
    return re.sub(r"<extra_id_\d+>", "", text).strip()

# -----------------------------
# 7) Build a Conversational Retrieval Chain (Without Follow-Up Rephrasing)
# -----------------------------
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain

memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)

qa_chain = ConversationalRetrievalChain.from_llm(
    llm=mt5_llm,
    retriever=faiss_store.as_retriever(),
    memory=memory,
    verbose=True
)

# Override the prompt template to request a concise answer in Sinhala.
qa_chain.combine_docs_chain.llm_chain.prompt.template = """
You are a highly knowledgeable Sinhala legal assistant.
Using the context provided below, answer the following question in 1-2 concise sentences in Sinhala.
Do not repeat long excerpts from the context.

Context:
{context}

Question: {question}

Answer:
"""

# -----------------------------
# 8) Run a Sample Conversational Session
# -----------------------------
sample_questions = [
    "ශ්‍රී ලංකා ව්‍යවස්ථාව අනුව අමාත්‍ය මණ්ඩලයට ප්‍රධානයා වන්නේ අගමැතිද?",
    "ශ්‍රී ලංකා ව්‍යවස්ථාව අනුව ජනාධිපතිවරයාට යම් වරදක් වෙනුවෙන් සම්පූර්ණ සමාව ලබා දීමේ බලතල තිබේද?",
    "ශ්‍රී ලංකා ව්‍යවස්ථාව අනුව මෙරට ජාතික භාෂා මොනවාද?"
]

for q in sample_questions:
    result = qa_chain({"question": q})
    answer = clean_output(result["answer"])
    print(f"User: {q}")
    print(f"Assistant: {answer}\n")


Number of chunks: 321
Sample chunk:
 ශ්‍රී ලංකා ප්‍රජාතාන්ත්‍රික සමාජවාදී ජනරජයේ ආණ්ඩුක්‍රම ව්‍යවස්ථාව (2022 ඔක්තෝබර් මස 31 වැනි දින දක්වා සංශෝධිතයි) (විසිඑක්වන සංශෝධනය දක්වා සංශෝධන අන්තර්ගත කරන ලද) ප්‍රතිශෝධිත මුද්‍රණය 2023 පාර්ලිමේන්තු මහ ලේකම් කාර්යාලය මගින් ප්‍රකාශිතයි. ශ්‍රී ලංකා පාර්ලිමේන්තුවේ ව්‍යවස්ථාදායක සේවා දෙපාර්තමේන්තුවේ ප ...

Word2Vec embedding dimension: 100


Device set to use cpu




[1m> Entering new StuffDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3m
You are a highly knowledgeable Sinhala legal assistant.
Using the context provided below, answer the following question in 1-2 concise sentences in Sinhala.
Do not repeat long excerpts from the context.

Context:
ශ්‍රී ලංකා ප්‍රජාතාන්ත්‍රික සමාජවාදී ජනරජයේ ආණ්ඩුක්‍රම ව්‍යවස්ථාව (2022 ඔක්තෝබර් මස 31 වැනි දින දක්වා සංශෝධිතයි) (විසිඑක්වන සංශෝධනය දක්වා සංශෝධන අන්තර්ගත කරන ලද) ප්‍රතිශෝධිත මුද්‍රණය 2023 පාර්ලිමේන්තු මහ ලේකම් කාර්යාලය මගින් ප්‍රකාශිතයි. ශ්‍රී ලංකා පාර්ලිමේන්තුවේ ව්‍යවස්ථාදායක සේවා දෙපාර්තමේන්තුවේ පනත් කෙටුම්පත් කාර්යාංශය විසින් සංස්කරණය කරන ලද මෙම නිල නොවන ප්‍රතිශෝධිත මුද්‍රණය මගින් ආණ්ඩුක්‍රම ව්‍යවස්ථාවේ විසිඑක්වන සංශෝධනය දක්වා පාර්ලිමේන්තුව විසින් වරින් වර සංශෝධනය කරන ලද ශ්‍රී ලංකා ප්‍රජාතාන්ත්‍රික සමාජවාදී ජනරජයේ ආණ්ඩුක්‍රම ව්‍යවස්ථාව නැවත ප්‍රකාශයට පත් කරනු ලැබේ. අදාළ පිටු අග ඇති සටහන් මගින් ආණ්ඩුක්‍රම ව්‍යවස්ථාව සංශෝධනය කළ ඒ ඒ සංශෝධන දැක

KeyboardInterrupt: 

deepseek

In [None]:
# 1) Load Constitution and Chunk the Text
CONSTITUTION_PATH = "/content/drive/MyDrive/Sri Lanka Constitution-Sinhala.txt"

with open(CONSTITUTION_PATH, "r", encoding="utf-8") as f:
    full_text = f.read()

def simple_chunk_text(text: str, max_tokens: int = 200) -> List[str]:
    words = text.split()
    chunks = []
    current_chunk = []
    current_len = 0
    for w in words:
        current_chunk.append(w)
        current_len += 1
        if current_len >= max_tokens:
            chunks.append(" ".join(current_chunk))
            current_chunk = []
            current_len = 0
    if current_chunk:
        chunks.append(" ".join(current_chunk))
    return chunks

chunks = simple_chunk_text(full_text, max_tokens=200)
print(f"Number of chunks: {len(chunks)}")
print("Sample chunk:\n", chunks[0][:300], "...\n")

# Convert each chunk into a LangChain Document
documents = []
for i, ctext in enumerate(chunks):
    doc = Document(page_content=ctext, metadata={"chunk_id": i})
    documents.append(doc)

# 2) Load Sinhala BPE Tokenizer & Word2Vec
BPE_MODEL_PATH = "sinhala_spm_bpe.model"
W2V_MODEL_PATH = "word2vec_sinhala_bpe.model"

# Load SentencePiece BPE model
sp = spm.SentencePieceProcessor()
sp.load(BPE_MODEL_PATH)

# Load Word2Vec model
w2v_model = Word2Vec.load(W2V_MODEL_PATH)
embedding_dim = w2v_model.vector_size
print("Word2Vec embedding dimension:", embedding_dim)

def embed_text_bpe_w2v(text: str) -> np.ndarray:
    tokens = sp.encode(text, out_type=str)
    valid_vectors = []
    for tok in tokens:
        if tok in w2v_model.wv:
            valid_vectors.append(w2v_model.wv[tok])
    if not valid_vectors:
        return np.zeros(embedding_dim, dtype="float32")
    return np.mean(valid_vectors, axis=0).astype("float32")

# 3) Create a Custom LangChain Embeddings Wrapper
class SinhalaBpeWord2VecEmbeddings(Embeddings):
    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        return [embed_text_bpe_w2v(t).tolist() for t in texts]
    def embed_query(self, text: str) -> List[float]:
        return embed_text_bpe_w2v(text).tolist()

embedding_fn = SinhalaBpeWord2VecEmbeddings()

# 4) Create a FAISS Vector Store with LangChain
faiss_store = LC_FAISS.from_documents(documents, embedding=embedding_fn)

# 5) Define a Custom LLM Wrapper for mT5 (using google/mt5-large)
class MT5LocalLLM(LLM):
    local_pipeline: Any = Field(..., description="The pipeline for text generation")
    max_new_tokens: int = Field(default=128, description="Max new tokens to generate")

    @property
    def _llm_type(self) -> str:
        return "mT5Local"

    def _call(self, prompt: str, stop: Optional[List[str]] = None) -> str:
        response = self.local_pipeline(
            prompt,
            max_new_tokens=self.max_new_tokens,
            do_sample=False,
            num_beams=2
        )
        return response[0]["generated_text"]

    @property
    def _identifying_params(self) -> Dict[str, Any]:
        return {"model_type": "mt5", "max_new_tokens": self.max_new_tokens}

# Load mT5-large model for generation
GENERATION_MODEL_NAME = "google/mt5-large"
tokenizer = AutoTokenizer.from_pretrained(GENERATION_MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(GENERATION_MODEL_NAME)
local_pipeline = pipeline(
    task="text2text-generation",  # Correct task for mT5
    model=model,
    tokenizer=tokenizer
)

mt5_llm = MT5LocalLLM(local_pipeline=local_pipeline, max_new_tokens=128)

# 6) Build the Conversational Retrieval Chain with Memory
memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)

qa_chain = ConversationalRetrievalChain.from_llm(
    llm=mt5_llm,
    retriever=faiss_store.as_retriever(search_kwargs={"k": 3}),  # Retrieve top 3 relevant chunks
    memory=memory,
    verbose=True,
    chain_type="stuff"
)

# Customize the prompt template
qa_chain.combine_docs_chain.llm_chain.prompt.template = """
You are a highly knowledgeable Sinhala legal assistant.
Using the context provided below, answer the following question in 1-2 concise sentences in Sinhala.
Do not repeat long excerpts from the context. Be precise and accurate.

Context:
{context}

Question: {question}

Answer:
"""

# 7) Run a Sample Conversational Session
def query_qa_chain(question):
    result = qa_chain({"question": question})
    if not result["answer"].strip():  # Check if the answer is empty
        return "මට කණගාටුයි, මම ඔබගේ ප්‍රශ්නයට පිළිතුරු සපයා ගත නොහැකි විය."
    return result["answer"]

questions = [
    "ශ්‍රී ලංකා ව්‍යවස්ථාව අනුව අමාත්‍ය මණ්ඩලයට ප්‍රධානයා වන්නේ අගමැතිද?",
    "ඔව් නම්, එය කොයි වරදකට/අයිතිවාසිකමකට අදාලද?",
    "ශ්‍රී ලංකා ව්‍යවස්ථාව අනුව මෙරට ජාතික භාෂා මොනවාද?"
]

for q in questions:
    result = query_qa_chain(q)
    print(f"User: {q}")
    print(f"Assistant: {result}\n")

Number of chunks: 321
Sample chunk:
 ශ්‍රී ලංකා ප්‍රජාතාන්ත්‍රික සමාජවාදී ජනරජයේ ආණ්ඩුක්‍රම ව්‍යවස්ථාව (2022 ඔක්තෝබර් මස 31 වැනි දින දක්වා සංශෝධිතයි) (විසිඑක්වන සංශෝධනය දක්වා සංශෝධන අන්තර්ගත කරන ලද) ප්‍රතිශෝධිත මුද්‍රණය 2023 පාර්ලිමේන්තු මහ ලේකම් කාර්යාලය මගින් ප්‍රකාශිතයි. ශ්‍රී ලංකා පාර්ලිමේන්තුවේ ව්‍යවස්ථාදායක සේවා දෙපාර්තමේන්තුවේ ප ...

Word2Vec embedding dimension: 100


Device set to use cpu




[1m> Entering new StuffDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3m
You are a highly knowledgeable Sinhala legal assistant. 
Using the context provided below, answer the following question in 1-2 concise sentences in Sinhala.
Do not repeat long excerpts from the context. Be precise and accurate.

Context:
ශ්‍රී ලංකා ප්‍රජාතාන්ත්‍රික සමාජවාදී ජනරජයේ ආණ්ඩුක්‍රම ව්‍යවස්ථාව (2022 ඔක්තෝබර් මස 31 වැනි දින දක්වා සංශෝධිතයි) (විසිඑක්වන සංශෝධනය දක්වා සංශෝධන අන්තර්ගත කරන ලද) ප්‍රතිශෝධිත මුද්‍රණය 2023 පාර්ලිමේන්තු මහ ලේකම් කාර්යාලය මගින් ප්‍රකාශිතයි. ශ්‍රී ලංකා පාර්ලිමේන්තුවේ ව්‍යවස්ථාදායක සේවා දෙපාර්තමේන්තුවේ පනත් කෙටුම්පත් කාර්යාංශය විසින් සංස්කරණය කරන ලද මෙම නිල නොවන ප්‍රතිශෝධිත මුද්‍රණය මගින් ආණ්ඩුක්‍රම ව්‍යවස්ථාවේ විසිඑක්වන සංශෝධනය දක්වා පාර්ලිමේන්තුව විසින් වරින් වර සංශෝධනය කරන ලද ශ්‍රී ලංකා ප්‍රජාතාන්ත්‍රික සමාජවාදී ජනරජයේ ආණ්ඩුක්‍රම ව්‍යවස්ථාව නැවත ප්‍රකාශයට පත් කරනු ලැබේ. අදාළ පිටු අග ඇති සටහන් මගින් ආණ්ඩුක්‍රම ව්‍යවස්ථාව

In [None]:
###############################################
# 1) Load Constitution and Chunk the Text
###############################################
CONSTITUTION_PATH = "/content/drive/MyDrive/Sri Lanka Constitution-Sinhala.txt"

with open(CONSTITUTION_PATH, "r", encoding="utf-8") as f:
    full_text = f.read()

def simple_chunk_text(text: str, max_tokens: int = 200) -> List[str]:
    """
    Splits text into chunks of roughly max_tokens words.
    """
    words = text.split()
    chunks = []
    current_chunk = []
    current_len = 0
    for w in words:
        current_chunk.append(w)
        current_len += 1
        if current_len >= max_tokens:
            chunks.append(" ".join(current_chunk))
            current_chunk = []
            current_len = 0
    if current_chunk:
        chunks.append(" ".join(current_chunk))
    return chunks

chunks = simple_chunk_text(full_text, max_tokens=200)
print(f"Number of chunks: {len(chunks)}")
print("Sample chunk:\n", chunks[0][:300], "...\n")

# Convert each chunk into a LangChain Document
documents = []
for i, ctext in enumerate(chunks):
    doc = Document(page_content=ctext, metadata={"chunk_id": i})
    documents.append(doc)

###############################################
# 2) Load your Sinhala BPE Tokenizer & Word2Vec
###############################################
BPE_MODEL_PATH = "sinhala_spm_bpe.model"
W2V_MODEL_PATH = "word2vec_sinhala_bpe.model"

# Load SentencePiece BPE model
sp = spm.SentencePieceProcessor()
sp.load(BPE_MODEL_PATH)

# Load Word2Vec model
w2v_model = Word2Vec.load(W2V_MODEL_PATH)
embedding_dim = w2v_model.vector_size
print("Word2Vec embedding dimension:", embedding_dim)

def embed_text_bpe_w2v(text: str) -> np.ndarray:
    """
    Tokenize text with the BPE model, look up Word2Vec vectors, and return the average.
    """
    tokens = sp.encode(text, out_type=str)
    valid_vectors = []
    for tok in tokens:
        if tok in w2v_model.wv:
            valid_vectors.append(w2v_model.wv[tok])
    if not valid_vectors:
        return np.zeros(embedding_dim, dtype="float32")
    return np.mean(valid_vectors, axis=0).astype("float32")

###############################################
# 3) Create a Custom LangChain Embeddings Wrapper
###############################################
class SinhalaBpeWord2VecEmbeddings(Embeddings):
    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        return [embed_text_bpe_w2v(t).tolist() for t in texts]
    def embed_query(self, text: str) -> List[float]:
        return embed_text_bpe_w2v(text).tolist()

embedding_fn = SinhalaBpeWord2VecEmbeddings()

###############################################
# 4) Create a FAISS Vector Store with LangChain
###############################################
faiss_store = LC_FAISS.from_documents(documents, embedding=embedding_fn)

###############################################
# 5) Define a Custom LLM Wrapper for mT5 (using google/mt5-large)
###############################################
from pydantic import Field

class MT5LocalLLM(LLM):
    local_pipeline: Any = Field(..., description="The pipeline for text generation")
    max_new_tokens: int = Field(default=128, description="Max new tokens to generate")

    @property
    def _llm_type(self) -> str:
        return "mT5Local"

    def _call(self, prompt: str, stop: Optional[List[str]] = None) -> str:
        response = self.local_pipeline(
            prompt,
            max_new_tokens=self.max_new_tokens,
            do_sample=False,
            num_beams=2
        )
        return response[0]["generated_text"]

    @property
    def _identifying_params(self) -> Dict[str, Any]:
        return {"model_type": "mt5", "max_new_tokens": self.max_new_tokens}

# Load mT5-large model for generation
GENERATION_MODEL_NAME = "google/mt5-large"
tokenizer = AutoTokenizer.from_pretrained(GENERATION_MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(GENERATION_MODEL_NAME)
local_pipeline = pipeline(
    task="text2text-generation",  # Correct task for mT5
    model=model,
    tokenizer=tokenizer
)

mt5_llm = MT5LocalLLM(local_pipeline=local_pipeline, max_new_tokens=128)

###############################################
# 6) Build the Retrieval Chain (No Memory)
###############################################
qa_chain = RetrievalQA.from_chain_type(
    llm=mt5_llm,
    chain_type="stuff",
    retriever=faiss_store.as_retriever(search_kwargs={"k": 3}),  # Retrieve top 3 relevant chunks
    verbose=True
)

# Customize the prompt template
qa_chain.combine_documents_chain.llm_chain.prompt.template = """
You are a highly knowledgeable Sinhala legal assistant.
Using the context provided below, answer the following question in 1-2 concise sentences in Sinhala.
Do not repeat long excerpts from the context. Be precise and accurate.

Context:
{context}

Question: {question}

Answer:
"""

###############################################
# 7) Run a Sample Session
###############################################
def query_qa_chain(question):
    result = qa_chain({"query": question})
    if not result["result"].strip():  # Check if the answer is empty
        return "මට කණගාටුයි, මම ඔබගේ ප්‍රශ්නයට පිළිතුරු සපයා ගත නොහැකි විය."
    return result["result"]

questions = [
    "ශ්‍රී ලංකා ව්‍යවස්ථාව අනුව අමාත්‍ය මණ්ඩලයට ප්‍රධානයා වන්නේ අගමැතිද?",
    "ශ්‍රී ලංකා ව්‍යවස්ථාව අනුව මෙරට ජාතික භාෂා මොනවාද?"
]

for q in questions:
    result = query_qa_chain(q)
    print(f"User: {q}")
    print(f"Assistant: {result}\n")

Number of chunks: 321
Sample chunk:
 ශ්‍රී ලංකා ප්‍රජාතාන්ත්‍රික සමාජවාදී ජනරජයේ ආණ්ඩුක්‍රම ව්‍යවස්ථාව (2022 ඔක්තෝබර් මස 31 වැනි දින දක්වා සංශෝධිතයි) (විසිඑක්වන සංශෝධනය දක්වා සංශෝධන අන්තර්ගත කරන ලද) ප්‍රතිශෝධිත මුද්‍රණය 2023 පාර්ලිමේන්තු මහ ලේකම් කාර්යාලය මගින් ප්‍රකාශිතයි. ශ්‍රී ලංකා පාර්ලිමේන්තුවේ ව්‍යවස්ථාදායක සේවා දෙපාර්තමේන්තුවේ ප ...

Word2Vec embedding dimension: 100


Device set to use cpu




[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m
User: ශ්‍රී ලංකා ව්‍යවස්ථාව අනුව අමාත්‍ය මණ්ඩලයට ප්‍රධානයා වන්නේ අගමැතිද?
Assistant: <extra_id_0> ජනාධිපතිවරයා විසින් හෝ අග් රාමාත් යවරයා විසින් හෝ පත් කරනු ලැබිය යුත්තේ ය.  <extra_id_1> ජනාධිපතිවරයා විසින් හෝ පත් කරනු ලැබිය යුත්තේ ය.  <extra_id_2> ය.  <extra_id_3>. <extra_id_4>.  <extra_id_5>.  <extra_id_6>.  <extra_id_7>.  <extra_id_8>.  <extra_id_9>.  <extra_id_20>.  <extra_id_21>.  <extra_id_22>.  <extra_id_23>.  <extra_id_24>.  <extra_id_25>.  <extra_id_26>.  <extra_id_27>.  <extra_id_28>.  <extra_id_29>.  <extra_id_30>.  <extra_id_31>.  <extra_id_32>.  <extra_id_33>.  <extra_id_34>.  <extra_id_35>.  <extra_id_36>.  <extra_id_37>.  <extra_id_38>.  <extra_id_39>. 



[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m
User: ශ්‍රී ලංකා ව්‍යවස්ථාව අනුව මෙරට ජාතික භාෂා මොනවාද?
Assistant: <extra_id_0> හෝ <extra_id_1> හෝ හෝ හෝ හෝ හෝ හෝ හෝ හෝ හෝ හෝ හෝ හෝ හෝ හෝ හෝ හෝ හෝ හෝ  <extra_id_2> හෝ හෝ <extr

In [None]:
# Install required libraries
!pip install sentencepiece transformers torch pandas numpy faiss-cpu PyPDF2

import pandas as pd
import numpy as np
import sentencepiece as spm
from transformers import AutoTokenizer, AutoModel, AutoModelForCausalLM, pipeline
import torch
from google.colab import drive
import PyPDF2
import faiss
import textwrap



In [None]:
# -------------------------------
# 1. Load the Sinhala Constitution
# -------------------------------
CONSTITUTION_PATH = '/content/drive/MyDrive/Sri Lanka Constitution-Sinhala.txt'  # Update path if needed

print("[INFO] Loading Sinhala Constitution...")
if CONSTITUTION_PATH.endswith('.pdf'):
    with open(CONSTITUTION_PATH, 'rb') as f:
        pdf_reader = PyPDF2.PdfReader(f)
        text = ""
        for page in pdf_reader.pages:
            text += page.extract_text()
else:
    with open(CONSTITUTION_PATH, 'r', encoding='utf-8') as f:
        text = f.read()
print(f"Constitution Length: {len(text)} characters")

# -------------------------------
# 2. Split the Constitution into Chunks
# -------------------------------
def split_text(text, chunk_size=500):
    """Split text into chunks for vector storage."""
    return [text[i:i+chunk_size] for i in range(0, len(text), chunk_size)]

chunks = split_text(text)
print(f"Number of Chunks: {len(chunks)}")
print("Sample chunk:\n", textwrap.fill(chunks[0], width=80), "...\n")

# -------------------------------
# 3. Load the SentencePiece BPE Tokenizer
# -------------------------------
BPE_MODEL_PATH = 'sinhala_spm_bpe.model'  # Update path if needed
sp_processor = spm.SentencePieceProcessor(model_file=BPE_MODEL_PATH)
print("[INFO] BPE Tokenizer loaded.")

# -------------------------------
# 4. Load a Multilingual Embedding Model (DistilBERT)
# -------------------------------
embedding_tokenizer = AutoTokenizer.from_pretrained('distilbert-base-multilingual-cased')
embedding_model = AutoModel.from_pretrained('distilbert-base-multilingual-cased')
print("[INFO] DistilBERT Embedding Model loaded.")

def get_embedding(text, tokenizer, model, max_length=128):
    """Generate an embedding for the given text using DistilBERT."""
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=max_length)
    with torch.no_grad():
        outputs = model(**inputs)
    # Average over the sequence dimension
    return outputs.last_hidden_state.mean(dim=1).squeeze().numpy()

# Create embeddings for each chunk
print("[INFO] Generating embeddings for constitution chunks...")
chunk_embeddings = np.array([get_embedding(chunk, embedding_tokenizer, embedding_model) for chunk in chunks])
embedding_dim = chunk_embeddings.shape[1]
print(f"Embedding Dimension: {embedding_dim}")

# -------------------------------
# 5. Build a FAISS Index for Retrieval
# -------------------------------
index = faiss.IndexFlatL2(embedding_dim)
index.add(chunk_embeddings)
print("[INFO] FAISS Index built with", index.ntotal, "entries.")

# -------------------------------
# 6. Load a Generative Model for Text Generation
# -------------------------------
# We use DistilGPT-2 (an English model) as a lightweight generator.
generator = pipeline('text-generation', model='distilgpt2', tokenizer='distilgpt2')
print("[INFO] DistilGPT-2 loaded for generation.")

# -------------------------------
# 7. Define the Retrieval-Augmented Generation (RAG) Function
# -------------------------------
def rag_answer(question, tokenizer, embedding_model, index, chunks, generator, top_k=3):
    """
    Retrieve the top_k most relevant chunks using FAISS and generate an answer.
    The prompt includes the question and the retrieved context.
    """
    # Get question embedding
    q_embedding = get_embedding(question, embedding_tokenizer, embedding_model).reshape(1, -1)

    # Retrieve top-k chunks using FAISS search
    distances, indices = index.search(q_embedding, top_k)
    context = " ".join([chunks[idx] for idx in indices[0]])

    # Build the prompt
    prompt = f"Question: {question}\nContext: {context}\nAnswer in Sinhala:"

    # Encode the prompt with truncation to ensure it does not exceed max tokens (1024)
    prompt_tokens = generator.tokenizer.encode(prompt, truncation=True, max_length=1024)
    prompt = generator.tokenizer.decode(prompt_tokens)

    # Generate answer using max_new_tokens to avoid repeating the prompt
    response = generator(prompt, max_new_tokens=150, num_return_sequences=1, truncation=True)[0]['generated_text']
    # Split out the answer part and remove any placeholder tokens
    answer = response.split("Answer in Sinhala:")[-1]
    for i in range(10):
        answer = answer.replace(f"<extra_id_{i}>", "")
    return answer.strip()

# -------------------------------
# 8. Define a Baseline Function (LLM-only without retrieval)
# -------------------------------
def baseline_answer(question, generator):
    """
    Generate an answer using only the generative model (without any retrieval).
    """
    prompt = f"Question: {question}\nAnswer in Sinhala:"
    prompt_tokens = generator.tokenizer.encode(prompt, truncation=True, max_length=1024)
    prompt = generator.tokenizer.decode(prompt_tokens)
    response = generator(prompt, max_new_tokens=150, num_return_sequences=1, truncation=True)[0]['generated_text']
    answer = response.split("Answer in Sinhala:")[-1]
    for i in range(10):
        answer = answer.replace(f"<extra_id_{i}>", "")
    return answer.strip()

# -------------------------------
# 9. Evaluate the Chatbot: Compare RAG vs. Baseline Responses
# -------------------------------
sample_questions = [
    "ශ්‍රී ලංකා ව්‍යවස්ථාව අනුව අමාත්‍ය මණ්ඩලයට ප්‍රධානයා වන්නේ අගමැතිද?",
    "ශ්‍රී ලංකා ව්‍යවස්ථාව අනුව ජනාධිපතිවරයාට යම් වරදක් වෙනුවෙන් සම්පූර්ණ සමාව ලබා දීමේ බලතල තිබේද?",
    "ශ්‍රී ලංකා ව්‍යවස්ථාව අනුව මෙරට ජාතික භාෂා මොනවාද?"
]

print("\n=== Chatbot Evaluation ===")
for question in sample_questions:
    print(f"\nQuestion: {question}\n")

    # RAG Response
    rag_response = rag_answer(question, embedding_tokenizer, embedding_model, index, chunks, generator, top_k=3)
    print("RAG Response:")
    print(textwrap.fill(rag_response, width=80))

    # Baseline Response
    baseline_response = baseline_answer(question, generator)
    print("\nBaseline Response:")
    print(textwrap.fill(baseline_response, width=80))


[INFO] Loading Sinhala Constitution...
Constitution Length: 413826 characters
Number of Chunks: 828
Sample chunk:
 ශ්‍රී ලංකා ප්‍රජාතාන්ත්‍රික සමාජවාදී ජනරජයේ ආණ්ඩුක්‍රම ව්‍යවස්ථාව (2022 ඔක්තෝබර්
මස 31 වැනි දින දක්වා සංශෝධිතයි) (විසිඑක්වන සංශෝධනය දක්වා සංශෝධන අන්තර්ගත කරන ලද)
ප්‍රතිශෝධිත මුද්‍රණය 2023 පාර්ලිමේන්තු මහ ලේකම් කාර්යාලය මගින් ප්‍රකාශිතයි.
ශ්‍රී ලංකා පාර්ලිමේන්තුවේ ව්‍යවස්ථාදායක සේවා දෙපාර්තමේන්තුවේ පනත් කෙටුම්පත්
කාර්යාංශය විසින් සංස්කරණය කරන ලද මෙම නිල නොවන ප්‍රතිශෝධිත මුද්‍රණය මගින්
ආණ්ඩුක්‍රම ව්‍යවස්ථාවේ විසිඑක්වන සංශෝධනය දක්වා පාර්ලිමේන්තුව විසින් වරින් වර
සංශෝධනය කරන ලද ශ්‍රී ලංකා ප්‍රජාතාන ...

[INFO] BPE Tokenizer loaded.
[INFO] DistilBERT Embedding Model loaded.
[INFO] Generating embeddings for constitution chunks...
Embedding Dimension: 768
[INFO] FAISS Index built with 828 entries.


Device set to use cpu
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[INFO] DistilGPT-2 loaded for generation.

=== Chatbot Evaluation ===

Question: ශ්‍රී ලංකා ව්‍යවස්ථාව අනුව අමාත්‍ය මණ්ඩලයට ප්‍රධානයා වන්නේ අගමැතිද?



Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


RAG Response:
Question: ශ්‍රී ලංකා ව්‍යවස්ථාව අනුව අමාත්‍ය මණ්ඩලයට ප්‍රධානයා වන්නේ අගමැතිද?
Context: .  පරිසරය ආරක්ෂා කිරීම.  ජාතික වැදගත්කමකින්‌ යුක්ත බවට පාර්ලිමේන්තුව
විසින්‌ පනවන ලද නීතියක්‌ මගින්‌ හෝ නීතියක්‌ යටතේ ප්‍රකාශයට පත්‌ කරනු නොලැබ ඇති
පුරාවිද්‍යාත්මක භූමි සහ නටබුන්‌.  මනුෂ්‍යයන්ට, සතුන්ට භෝ ශාකවලට, බලපාන ආසාදක හො
ස්පර්ශක රොග හො පලිබෝධ එක්‌ පළාතකින්‌ තවත්‌ පළාතකට පැතිර යාම වැළැක්වීම.  වන්දනා
ගමන්‌.]             ෑගල දිස්


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



Baseline Response:
‏َهْ قَِذَرَّْ وَذَاّةَُ عَلَيْ عَلِيْ اللهَ وَلَهُمَ اللهَ عَلَيْ عَلَيْ عَلَيْ
اللهَ وَلَهُمَ اللهَ عَلَيْ مَنَ عَلَيْ ع`مَ عَلَيْ عَلَيْ وَلَهُمَ الله\ ع

Question: ශ්‍රී ලංකා ව්‍යවස්ථාව අනුව ජනාධිපතිවරයාට යම් වරදක් වෙනුවෙන් සම්පූර්ණ සමාව ලබා දීමේ බලතල තිබේද?



IndexError: index out of range in self

In [None]:
# Install required libraries
!pip install sentencepiece transformers torch pandas numpy faiss-cpu PyPDF2

import pandas as pd
import numpy as np
import sentencepiece as spm
from transformers import AutoTokenizer, AutoModel, pipeline
import torch
from google.colab import drive
import PyPDF2
import faiss
import textwrap



In [None]:
# -------------------------------
# 1. Load the Sinhala Constitution
# -------------------------------
CONSTITUTION_PATH = '/content/drive/MyDrive/Sri Lanka Constitution-Sinhala.txt'  # Update path if needed

print("[INFO] Loading Sinhala Constitution...")
if CONSTITUTION_PATH.endswith('.pdf'):
    with open(CONSTITUTION_PATH, 'rb') as f:
        pdf_reader = PyPDF2.PdfReader(f)
        text = ""
        for page in pdf_reader.pages:
            text += page.extract_text()
else:
    with open(CONSTITUTION_PATH, 'r', encoding='utf-8') as f:
        text = f.read()
print(f"Constitution Length: {len(text)} characters")

# -------------------------------
# 2. Split the Constitution into Chunks
# -------------------------------
def split_text_by_paragraph(text):
    """Split text into paragraphs, filtering out empty paragraphs."""
    paragraphs = [p.strip() for p in text.split('\n') if p.strip()]
    return paragraphs

chunks = split_text_by_paragraph(text)
print(f"Number of Chunks: {len(chunks)}")
print("Sample chunk:\n", textwrap.fill(chunks[0], width=80), "...\n")

# -------------------------------
# 3. Load the SentencePiece BPE Tokenizer
# -------------------------------
BPE_MODEL_PATH = 'sinhala_spm_bpe.model'  # Update path if needed
sp_processor = spm.SentencePieceProcessor(model_file=BPE_MODEL_PATH)
print("[INFO] BPE Tokenizer loaded.")

# -------------------------------
# 4. Load a Multilingual Embedding Model (DistilBERT)
# -------------------------------
embedding_tokenizer = AutoTokenizer.from_pretrained('distilbert-base-multilingual-cased')
embedding_model = AutoModel.from_pretrained('distilbert-base-multilingual-cased')
print("[INFO] DistilBERT Embedding Model loaded.")

def get_embedding(text, tokenizer, model, max_length=128):
    """Generate an embedding for the given text using DistilBERT."""
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=max_length)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).squeeze().numpy()

# Create embeddings for each chunk
print("[INFO] Generating embeddings for constitution chunks...")
chunk_embeddings = np.array([get_embedding(chunk, embedding_tokenizer, embedding_model) for chunk in chunks])
embedding_dim = chunk_embeddings.shape[1]
print(f"Embedding Dimension: {embedding_dim}")

# -------------------------------
# 5. Build a FAISS Index for Retrieval
# -------------------------------
index = faiss.IndexFlatL2(embedding_dim)
index.add(chunk_embeddings)
print("[INFO] FAISS Index built with", index.ntotal, "entries.")

# -------------------------------
# 6. Load a Generative Model for Text Generation
# -------------------------------
generator = pipeline('text-generation', model='google/mt5-large', tokenizer='google/mt5-large')
print("[INFO] Generative model loaded for generation.")

# -------------------------------
# 7. Define the Retrieval-Augmented Generation (RAG) Function
# -------------------------------
def rag_answer(question, tokenizer, embedding_model, index, chunks, generator, top_k=3):
    """
    Retrieve the top_k most relevant chunks using FAISS and generate an answer.
    The prompt includes the question and the retrieved context.
    """
    # Get question embedding
    q_embedding = get_embedding(question, embedding_tokenizer, embedding_model).reshape(1, -1)

    # Retrieve top-k chunks
    distances, indices = index.search(q_embedding, top_k)
    context = " ".join([chunks[idx] for idx in indices[0]])

    # Build the prompt
    prompt = f"Question: {question}\nContext: {context}\nAnswer in Sinhala:"

    # Ensure prompt length does not exceed the model's maximum input tokens
    prompt_tokens = generator.tokenizer.encode(prompt, truncation=True, max_length=1024)
    prompt = generator.tokenizer.decode(prompt_tokens)

    # Generate answer using max_new_tokens parameter
    response = generator(prompt, max_new_tokens=150, num_return_sequences=1, truncation=True)[0]['generated_text']
    answer = response.split("Answer in Sinhala:")[-1]
    for i in range(10):
        answer = answer.replace(f"<extra_id_{i}>", "")
    return answer.strip()

# -------------------------------
# 8. Define a Baseline Function (LLM-only without retrieval)
# -------------------------------
def baseline_answer(question, generator):
    """
    Generate an answer using only the generative model (without retrieval).
    """
    prompt = f"Question: {question}\nAnswer in Sinhala:"
    prompt_tokens = generator.tokenizer.encode(prompt, truncation=True, max_length=1024)
    prompt = generator.tokenizer.decode(prompt_tokens)
    response = generator(prompt, max_new_tokens=150, num_return_sequences=1, truncation=True)[0]['generated_text']
    answer = response.split("Answer in Sinhala:")[-1]
    for i in range(10):
        answer = answer.replace(f"<extra_id_{i}>", "")
    return answer.strip()

# -------------------------------
# 9. Evaluate the Chatbot: Compare RAG vs. Baseline Responses
# -------------------------------
sample_questions = [
    "ශ්‍රී ලංකා ව්‍යවස්ථාව අනුව අමාත්‍ය මණ්ඩලයට ප්‍රධානයා වන්නේ අගමැතිද?",
    "ශ්‍රී ලංකා ව්‍යවස්ථාව අනුව ජනාධිපතිවරයාට යම් වරදක් වෙනුවෙන් සම්පූර්ණ සමාව ලබා දීමේ බලතල තිබේද?",
    "ශ්‍රී ලංකා ව්‍යවස්ථාව අනුව මෙරට ජාතික භාෂා මොනවාද?"
]

print("\n=== Chatbot Evaluation ===")
for question in sample_questions:
    print(f"\nQuestion: {question}\n")

    # RAG Response
    rag_response = rag_answer(question, embedding_tokenizer, embedding_model, index, chunks, generator, top_k=3)
    print("RAG Response:")
    print(textwrap.fill(rag_response, width=80))

    # Baseline Response
    baseline_response = baseline_answer(question, generator)
    print("\nBaseline Response:")
    print(textwrap.fill(baseline_response, width=80))


[INFO] Loading Sinhala Constitution...
Constitution Length: 413826 characters
Number of Chunks: 2280
Sample chunk:
 ශ්‍රී ලංකා ප්‍රජාතාන්ත්‍රික සමාජවාදී ජනරජයේ ආණ්ඩුක්‍රම ව්‍යවස්ථාව ...

[INFO] BPE Tokenizer loaded.
[INFO] DistilBERT Embedding Model loaded.
[INFO] Generating embeddings for constitution chunks...
Embedding Dimension: 768
[INFO] FAISS Index built with 2280 entries.


Device set to use cpu
The model 'MT5ForConditionalGeneration' is not supported for text-generation. Supported models are ['AriaTextForCausalLM', 'BambaForCausalLM', 'BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'LlamaForCausalLM', 'CodeGenForCausalLM', 'CohereForCausalLM', 'Cohere2ForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'DbrxForCausalLM', 'DiffLlamaForCausalLM', 'ElectraForCausalLM', 'Emu3ForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'FalconMambaForCausalLM', 'FuyuForCausalLM', 'GemmaForCausalLM', 'Gemma2ForCausalLM', 'GitForCausalLM', 'GlmForCausalLM', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeoXJapaneseForCausalLM', 'GPTJForCausalLM', 'GraniteForCausalLM', 'GraniteMoeForCausalLM', 

[INFO] Generative model loaded for generation.

=== Chatbot Evaluation ===

Question: ශ්‍රී ලංකා ව්‍යවස්ථාව අනුව අමාත්‍ය මණ්ඩලයට ප්‍රධානයා වන්නේ අගමැතිද?

RAG Response:
</s>>? <extra_id_14>? <extra_id_15>? Answer in English: අගමැතිද? <extra_id_16>?
<extra_id_17>? <extra_id_18>? Answer in English: අගමැතිද? <extra_id_19>?
<extra_id_20>? <extra_id_21>? Answer in English: අගමැතිද? <extra_id_22>?
<extra_id_23>? <extra_id_24>? <extra_id_25>? <extra_id_26>? Answer in English:
අගමැතිද? <extra_id_27>? Answer in English: අගමැතිද? <extra_id_28>?
<extra_id_29>? <extra_id_30>? ශ් රේෂ්ඨාධිකරණය <extra_id_31>? <extra_id_32>?
<extra_id_33>?

Baseline Response:
</s>නේ අගමැතිද? Question in Sinhala: අගමැති වන්නේ අගමැතිද? Question in Sinhala:
අගමැති වන්නේ <extra_id_13> අගමැතිද? <extra_id_14>? <extra_id_15>? <extra_id_16>?
<extra_id_17>? <extra_id_18>? <extra_id_19>? <extra_id_20>? <extra_id_21>?
<extra_id_22>? <extra_id_23>? <extra_id_24>? <extra_id_25>? <extra_id_26>?
<extra_id_27>? අගමැතිද? <extra_id_28>