# Kenyan Constitution RAG System

In [None]:
!pip install sentence-transformers chromadb fastapi uvicorn pdfplumber python-multipart faiss-cpu spacy groq

Collecting chromadb
  Downloading chromadb-1.0.5-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.9 kB)
Collecting fastapi
  Downloading fastapi-0.115.12-py3-none-any.whl.metadata (27 kB)
Collecting uvicorn
  Downloading uvicorn-0.34.2-py3-none-any.whl.metadata (6.5 kB)
Collecting pdfplumber
  Downloading pdfplumber-0.11.6-py3-none-any.whl.metadata (42 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting python-multipart
  Downloading python_multipart-0.0.20-py3-none-any.whl.metadata (1.8 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.4 kB)
Collecting groq
  Downloading groq-0.22.0-py3-none-any.whl.metadata (15 kB)
Collecting build>=1.0.3 (from chromadb)
  Downloading build-1.2.2.post1-py3-none-any.whl.metadata (6.5 kB)
Collecting chroma-hnswlib==0.7.6 (from chromadb)
  Downloading chroma_hnswlib-0.7.6-cp311-cp311-manylin

In [None]:
!pip install google-cloud-translate



In [None]:
import pdfplumber
import spacy
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
from transformers import pipeline
import chromadb
from chromadb.utils import embedding_functions
from chromadb.config import Settings
import os
from dotenv import load_dotenv
import warnings
import re
import openai
from groq import Groq
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Suppress every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation notices from the libraries imported above.
warnings.filterwarnings("ignore")
# Load environment variables from a .env file if one can be found.
# The recorded output `False` suggests none was loaded in that run — presumably no .env file was present.
load_dotenv()

False

In [None]:
# Print the GPU/driver status of this runtime (informational only; no visible cell reads the result).
!nvidia-smi

Sat Apr 19 20:22:30 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   34C    P8              9W /   70W |       2MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [None]:
# Read the PDF
def load_pdf(path):
    """Return the extracted text of the PDF at ``path`` as a single string.

    Every page is read in order (no pages are skipped). Pages for which
    ``extract_text()`` yields nothing are left out; the text of each
    remaining page is followed by a newline.
    """
    with pdfplumber.open(path) as pdf:
        extracted_pages = (page.extract_text() for page in pdf.pages)
        # Join while the file is still open: the generator reads pages lazily.
        return "".join(page_text + "\n" for page_text in extracted_pages if page_text)

## Chunking by Article
- Chunking by Article is a very effective strategy for legal documents like a Constitution, since each article is usually self-contained and semantically meaningful.

### Good approaches
- ✔️ Splitting by 'Article' matches the legal structure of the Constitution.
- ✔️ Using SpaCy's sentence segmentation improves boundary recognition, especially where articles span multiple sentences.
- ✔️ Trimming whitespace with `strip()`, which is important for clean chunks.
- ✔️ Each chunk becomes a standalone unit for embedding, retrieval, and citation.
- ✔️ Using regular expressions for precision, e.g. `^Article\s+\d+[A-Za-z]*`. This helps catch:
  - Article 1
  - Article 10A
  - Article 43B
- ✔️ Including an Article Title and Number in Metadata helps in retrieval and citation
- Preserve titles like: `"Article 43 — Economic and social rights"`

In [None]:
# Split the full text into articles (preserve exact titles/content)
def chunk_by_article(text, pattern=r"Article\s+\d+[A-Za-z]*[^:\n]*"):
    """Split ``text`` into one chunk per article heading.

    A heading is any match of ``pattern``. The default matches "Article",
    a number, an optional letter suffix (e.g. "Article 10A") and the rest of
    that line up to a colon or newline. The default is case-insensitive and
    is NOT anchored to the start of a line, so in-sentence cross-references
    such as "subject to Article 24" also start a new chunk. To split only on
    headings that begin a line, pass an anchored pattern, e.g.
    ``r"(?m)^Article\\s+\\d+[A-Za-z]*[^:\\n]*"``.

    Args:
        text: Full document text. Empty or ``None`` yields an empty list.
        pattern: Regex source (compiled case-insensitively) or an already
            compiled pattern (used as-is) describing a heading.

    Returns:
        A list of ``{"title": ..., "content": ...}`` dicts in document order.
        ``title`` is the stripped heading match and ``content`` is the
        stripped text up to the next heading. Non-blank text that precedes
        the first heading is kept as a chunk titled "Preamble" instead of
        being discarded, so a text without any heading still yields one chunk.
    """
    if not text:
        return []

    heading_re = re.compile(pattern, re.IGNORECASE) if isinstance(pattern, str) else pattern
    headings = list(heading_re.finditer(text))

    chunks = []

    # Keep whatever comes before the first heading rather than dropping it.
    first_heading_at = headings[0].start() if headings else len(text)
    leading_text = text[:first_heading_at].strip()
    if leading_text:
        chunks.append({"title": "Preamble", "content": leading_text})

    # Each article's content runs from the end of its heading to the next heading.
    for heading, next_heading in zip(headings, headings[1:] + [None]):
        content_end = next_heading.start() if next_heading is not None else len(text)
        chunks.append({
            "title": heading.group(0).strip(),
            "content": text[heading.end():content_end].strip()
        })

    return chunks

# Optional: further chunk large articles
# Module-level splitter that chunk_articles_finely() applies to each article's content.
# Configured with chunk_size=1000 and chunk_overlap=100 — presumably measured in
# characters (the splitter's default length function); confirm against the LangChain docs.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=100
)

def chunk_articles_finely(articles, splitter=None):
    """Split each article's content into smaller pieces, keeping its title.

    Args:
        articles: Iterable of dicts with "title" and "content" keys.
        splitter: Object exposing ``split_text(str) -> list[str]``. Defaults
            to the module-level ``text_splitter`` so existing calls behave
            as before; pass one explicitly to avoid relying on global state.

    Returns:
        A flat list of ``{"title": ..., "content": ...}`` dicts, one per
        piece, in article order. An article for which the splitter returns
        no pieces contributes nothing to the result.
    """
    if splitter is None:
        splitter = text_splitter

    return [
        {"title": article["title"], "content": piece}
        for article in articles
        for piece in splitter.split_text(article["content"])
    ]

In [None]:
# Load and chunk
PDF_PATH = "/content/constitution.pdf"  # Colab upload location; change when running elsewhere

text = load_pdf(PDF_PATH)
articles = chunk_by_article(text)
docs = chunk_articles_finely(articles)

# Fail here with a clear message instead of later, when an empty corpus is embedded and indexed.
if not docs:
    raise ValueError(
        f"No text chunks were produced from {PDF_PATH!r}; check the PDF path and the article pattern."
    )



### Creating Embeddings
- Embeddings are numerical vector representations of text, allowing you to measure semantic similarity between a query and the chunks (articles).

In [None]:
# creating embeddings
# Load the 'all-MiniLM-L6-v2' SentenceTransformer and encode the "content" of every chunk in `docs`.
# The Chroma cell pairs embeddings with docs by position, so this presumably yields one
# embedding per chunk in the same order as `docs`.
model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = model.encode([doc['content'] for doc in docs], show_progress_bar=True)

Batches:   0%|          | 0/16 [00:00<?, ?it/s]

In [None]:
# Setting up Chroma
# The store is persisted on disk under ./chroma_store, so it survives kernel restarts.
client = chromadb.PersistentClient(path = "chroma_store")
collection = client.get_or_create_collection(name = "Kenya_constitution")

# Saving to ChromaDB
# `upsert` rather than `add`: the ids are positional ("0", "1", ...), and on a re-run
# against an existing store `add` does not refresh records whose ids already exist,
# which would leave stale text/embeddings behind after the chunking changes.
# NOTE: if a later run produces fewer chunks, records with higher ids from the
# earlier run remain in the collection; delete the collection to start clean.
collection.upsert(
    ids = [str(i) for i in range(len(docs))],
    documents = [doc["content"] for doc in docs],
    metadatas = [{"title": doc["title"]} for doc in docs],
    embeddings = embeddings
)

### Setting up Groq

In [None]:
# Provide the Groq API key for this session WITHOUT storing it in the notebook.
# The key is taken from the GROQ_API_KEY environment variable (e.g. set via a .env
# file); if it is missing, prompt for it so it never appears in a cell or its output.
# SECURITY: a key was previously committed in this cell — revoke it in the Groq console.
import getpass

if not os.environ.get("GROQ_API_KEY"):
    os.environ["GROQ_API_KEY"] = getpass.getpass("Enter your GROQ_API_KEY: ")

In [None]:
def generate_response(prompt, language = "en", n_results = 3, max_tokens = 300,
                      llm_model = "llama3-8b-8192"):
    """Answer a question about the Kenyan Constitution from retrieved context.

    The question is embedded with the module-level SentenceTransformer
    ``model`` (the one used to build the stored embeddings), the closest
    chunks are fetched from the module-level Chroma ``collection``, and the
    chunks — each preceded by the article title stored in its metadata — are
    sent to Groq's chat-completions API together with the question.

    Args:
        prompt: The user's question.
        language: "en" or "sw"; selects the system instruction. Any other
            value falls back to the English instruction.
        n_results: Number of chunks to retrieve from the collection.
        max_tokens: Upper bound on the length of the generated answer; raise
            it if answers are cut off.
        llm_model: Groq model identifier used for generation.

    Returns:
        The message content of the first completion choice.
    """
    # Embed the query with the same model that produced the indexed vectors,
    # instead of letting Chroma embed `query_texts` with the collection's own
    # (default) embedding function.
    query_embedding = model.encode([prompt]).tolist()
    results = collection.query(query_embeddings = query_embedding, n_results = n_results)

    # Put each chunk's article title in front of its text so the model can
    # cite the article the passage actually comes from.
    passages = []
    for document, metadata in zip(results["documents"][0], results["metadatas"][0]):
        title = (metadata or {}).get("title", "")
        passages.append(f"{title}\n{document}" if title else document)
    context = "\n\n".join(passages)

    # Setting prompt based on language
    system_instructions = {
        "en": "You are a legal assistant answering only based on the Kenyan Constitution",
        "sw" : "Wewe ni mtaalamu wa sheria unayejibu maswali kwa kutumia Katiba ya Kenya. Jibu kwa Kiswahili pekee."
    }

    # initializing groq (named groq_client so it does not shadow the Chroma `client`)
    groq_client = Groq(
        api_key=os.environ.get("GROQ_API_KEY"),  # This is the default and can be omitted
    )

    # Sending to Groq-compatible LLM
    completion = groq_client.chat.completions.create(
        model = llm_model,
        messages = [
            {"role": "system", "content": system_instructions.get(language, system_instructions["en"])},
            {"role": "user", "content": f"Context:{context} Question: {prompt}"}
        ],
        temperature = 0.3,
        max_tokens = max_tokens,
    )

    return completion.choices[0].message.content


In [None]:
# Example query answered with the English system instruction.
generate_response("What are the rights of an arrested person?", language="en")

'According to Article 49(1) of the Constitution of Kenya, 2010, an arrested person has the right to be informed promptly, in a language they understand, of:\n\n(a) The reason for the arrest\n(b) The right to remain silent\n(c) The consequences of not remaining silent\n\nAdditionally, Article 49(2) provides that an arrested person has the right to assist a complainant or an accused person to communicate with the court.\n\nFurthermore, Article 49(3) provides that an arrested person has the right to:\n\n(a) Remain silent\n(b) Communicate with an advocate, and other persons whose assistance is necessary\n(c) Not be compelled to make any confession or admission that could be used in evidence against the person\n(d) Be held separately from persons who are serving a sentence\n(e) Be brought before a court as soon as reasonably possible, but not later than 24 hours after being arrested or the end of the next court day\n(f) At the first court appearance, be charged or informed of the reason for

In [None]:
# Same question, answered with the Swahili system instruction.
generate_response("What are the rights of an arrested person?", language="sw")

'Kwa mujibu wa Mwakilishi wa Taifa ya Kenya, 2010, klabu 49, klaus 1, klabu (a), klabu (i), klabu (ii) na klabu (iii), haki za mtu aliyejaribiwa ni pamoja na:\n\n* Kuwaambia kwa njia ambayo mtu anaweza kuelewa, sababu ya kuwajibika;\n* Kuwa na haki ya kutulia;\n* Kuwa na haki ya kujua madhara ya kutulia.\n\nPia, klabu 49, klaus 1, klabu (c), klabu (d), klabu (e), klabu (f), klabu (g) na klabu (h), haki za mtu aliyejaribiwa pia ni pamoja na:\n\n* Kuwa na haki ya kuwasiliana na mwendesha mashtaka;\n* Kuwa na haki ya kutalia;\n* Kuwa na haki ya kuwasiliana na mwendesha mashtaka;\n* Kuwa na haki ya kutalia;\n* Kuwa na haki ya kufunguliwa kwa bondi au kodi, kwa masharti mazuri, kabla ya kufunguliwa kwa mashtaka au kesi'

In [None]:
# Second example question, answered with the English system instruction.
generate_response("What are the rights of a disabled person?", language="en")

'According to Article 53 of the Constitution of Kenya, every person with a disability has the right to equal opportunities and access to education, employment, healthcare, transportation, and other social services. Additionally, Article 54 of the Constitution provides that every person with a disability has the right to reasonable accommodation in all aspects of life, including education, employment, and access to information.\n\nFurthermore, Article 55 of the Constitution provides that the State shall take measures to ensure that persons with disabilities have equal opportunities to participate in the political, economic, social, and cultural life of the country. This includes measures to ensure that persons with disabilities have access to information, education, and employment, and that they are not discriminated against in any way.\n\nIn terms of the rights of a disabled person in a court of law, Article 165 of the Constitution provides that the court shall ensure that persons with

In [None]:
# NOTE(review): this repeats the earlier Swahili "arrested person" query verbatim; it was
# possibly meant to ask the disabled-person question in Swahili — confirm.
generate_response("What are the rights of an arrested person?", language="sw")

'Kwa Katiba ya Kenya, 2010, hakika hakuna mtu anayeweza kuteswa bila kufahamika sababu ya kuwepo kwa upelelezi. Hivyo, kama mtu anakuwa amewekwa kizuizi, ana haki ya:\n\n* Kupewa sababu za upelelezi kwa lugha ambayo anajifahamu;\n* Kuwa na haki ya kutembea kwa kujulikana;\n* Kuwa na haki ya kutembea kwa kujulikana kuhusiana na sababu ya kuwepo kwa upelelezi;\n* Kuwa na haki ya kutembea kwa kujulikana kuhusiana na madhara ya kutokutembea kwa kujulikana;\n* Kuwa na haki ya kutembea na mwendesha wa maswali, na watu wengine ambao wanaweza kuwasaidia katika mahakamani;\n* Kuwa na haki ya kutembea kwa kujulikana kuhusiana na sababu ya kuwepo kwa upelelezi;\n* Kuwa na haki ya kutembea kwa kujulikana kuhusiana na mad'

### Integrating Retriever and Generator