In [40]:
# Core
!pip install -U pip

# Pdfs text extraction
! pip install pdfplumber

# RAG stack
!pip install langchain
!pip install langchain-community
!pip install langchain-core

# Embeddings & Vector DB
!pip install sentence-transformers
!pip install faiss-cpu

# LLM
!pip install langchain-google-genai


Collecting pip
  Downloading pip-25.3-py3-none-any.whl.metadata (4.7 kB)
Downloading pip-25.3-py3-none-any.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m42.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 24.1.2
    Uninstalling pip-24.1.2:
      Successfully uninstalled pip-24.1.2
Successfully installed pip-25.3
Collecting pdfplumber
  Downloading pdfplumber-0.11.9-py3-none-any.whl.metadata (43 kB)
Collecting pdfminer.six==20251230 (from pdfplumber)
  Downloading pdfminer_six-20251230-py3-none-any.whl.metadata (4.3 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-5.3.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (67 kB)
Downloading pdfplumber-0.11.9-py3-none-any.whl (60 kB)
Downloading pdfminer_six-20251230-py3-none-any.whl (6.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.6/6.6 M

In [4]:
import json

ARABIC_FILE = "/content/data/quran-uthmani.txt"
ENGLISH_FILE = "/content/data/en.sahih.txt"
OUTPUT_FILE = "/content/data/quran_combined.json"

def load_quran(file_path):
    verses = {}
    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue

            parts = line.split("|")
            if len(parts) != 3:
                continue

            surah, ayah, text = parts
            key = f"{surah}:{ayah}"

            verses[key] = {
                "surah": int(surah),
                "ayah": int(ayah),
                "text": text.strip()
            }
    return verses

arabic = load_quran(ARABIC_FILE)
english = load_quran(ENGLISH_FILE)

# Combining Arabic and English translated verses into .json

combined = []

for key in arabic:
    if key in english:
        combined.append({
            "source": "Quran",
            "surah": arabic[key]["surah"],
            "ayah": arabic[key]["ayah"],
            "arabic": arabic[key]["text"],
            "english": english[key]["text"],
            "translation": "Sahih International"
        })

with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
    json.dump(combined, f, ensure_ascii=False, indent=2)

print(f"Combined Qur’an saved to {OUTPUT_FILE}")
print(f"Total verses: {len(combined)}")


Combined Qur’an saved to /content/data/quran_combined.json
Total verses: 6236


In [5]:
import pdfplumber
import json
import os
import re

PDF_DIR = "/content/data/Sahih al-bukhari"
OUTPUT_JSON = "/content/data/Sahih al-bukhari.json"

def extract_english_text(pdf_path):
    pages = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            text = page.extract_text()
            if not text:
                continue

            lines = []
            for line in text.split("\n"):
                # keep English-only lines
                if any("a" <= c.lower() <= "z" for c in line):
                    lines.append(line.strip())

            pages.append(" ".join(lines))

    return " ".join(pages)

# starts with Narrated
# captures until just before next Narrated
HADITH_PATTERN = re.compile(
    r"(Narrated\b.*?)(?=\bNarrated\b|$)",
    re.IGNORECASE | re.DOTALL
)

all_hadiths = []
global_id = 1

for pdf_file in sorted(os.listdir(PDF_DIR)):
    if not pdf_file.lower().endswith(".pdf"):
        continue

    pdf_path = os.path.join(PDF_DIR, pdf_file)

    # Extract volume number if present
    vol_match = re.search(r"\d+", pdf_file)
    volume = int(vol_match.group()) if vol_match else None

    full_text = extract_english_text(pdf_path)

    matches = HADITH_PATTERN.findall(full_text)

    for match in matches:
        hadith_text = " ".join(match.split())

        # Strong noise filter
        if len(hadith_text) < 50:
            continue

        all_hadiths.append({
            "source": "Hadith",
            "collection": "Sahih Bukhari",
            "volume": volume,
            "temp_id": global_id,
            "english": hadith_text
        })
        global_id += 1

with open(OUTPUT_JSON, "w", encoding="utf-8") as f:
    json.dump(all_hadiths, f, indent=2, ensure_ascii=False)

print(f"Extracted {len(all_hadiths)} hadiths")
print(f"Saved to {OUTPUT_JSON}")


Extracted 7576 hadiths
Saved to /content/data/Sahih al-bukhari.json


In [8]:
import json
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_core.documents import Document

EMB_MODEL = "sentence-transformers/all-MiniLM-L6-v2"

def load_json(path):
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)

docs = []

# Hadith
for h in load_json("/content/data/Sahih al-bukhari.json"):
    docs.append(Document(
        page_content=h["english"],
        metadata={
            "source": "Hadith",
            "collection": "Sahih Bukhari",
            "volume": h.get("volume"),
            "temp_id": h.get("temp_id")
        }
    ))

# Quran
for q in load_json("/content/data/quran_combined.json"):
    docs.append(Document(
        page_content=q["english"],
        metadata={
            "source": "Quran",
            "surah": q["surah"],
            "ayah": q["ayah"]
        }
    ))

embeddings = HuggingFaceEmbeddings(model_name=EMB_MODEL)
db = FAISS.from_documents(docs, embeddings)
db.save_local("/content/vectorstore")

print("Vector store created successfully")


  embeddings = HuggingFaceEmbeddings(model_name=EMB_MODEL)
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Vector store created successfully


In [3]:
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

# Setting up the knowledge base
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)

db = FAISS.load_local(  # loading vectorstore
    "/content/vectorstore",
    embeddings,
    allow_dangerous_deserialization=True
)


  embeddings = HuggingFaceEmbeddings(
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [4]:
# Create a retriever to fetch top 5 relevant documents from the vector store
retriever = db.as_retriever(search_kwargs={"k": 5})


In [5]:
# 5 most relevent chunks retrieved from vector store

query = "who is Dajjal?"

docs = retriever.invoke(query)

for d in docs:
    print("SOURCE:", d.metadata["source"])
    print(d.page_content[:200])
    print("-" * 40)



SOURCE: Hadith
narrated to us a long 92— THE BOOK OFAL-FITAN (cid:9) i (cid:9) -(cid:9) 158 narration about Ad-Dajjãl and among the things he
----------------------------------------
SOURCE: Hadith
Narrated 'Abdullãh bin 'Umar e : Allah's Messenger(cid:9) stood up amongst the people and then praised and glorified Allah as He deserved and then he mentioned Ad-Dajjãl , saying, "I warn you of him, 
----------------------------------------
SOURCE: Hadith
Narrated AbU Hurairah Li-(cid:9) — rrr Allah's Messenger (cid:9) said, "Shall I not tell L you about Ad-Dajjal, and a thing about him which no Prophet told his people (before)? Ad-Dajjal is one-eyed a
----------------------------------------
SOURCE: Hadith
Narrated 'Abdullah(cid:9) Ad- Dajjãl was mentioned in the presence of the Prophet jit. The Prophet j said, "Allah is not hidden from you; He is not one-eyed," and pointed with his hand towards his eye
----------------------------------------
SOURCE: Hadith
narrated to us, was: "Ad-Dajjal

In [44]:
# LLM core logic

from langchain_google_genai import ChatGoogleGenerativeAI
import os

os.environ["GOOGLE_API_KEY"] = "AIzaSyABEYcTNc5R7Y4Mk0f9Q5AWF8Hg6XRYQkA"

llm = ChatGoogleGenerativeAI(
    model="gemini-2.5-flash",
    temperature=0
)

# controlling LLM behaviour and outcomes

PROMPT = """
You are an Islamic knowledge assistant.
Answer ONLY using the provided context.
Do not add personal opinions.
If the answer is not found, say:
"Please consult a qualified Islamic scholar."

Context:
{context}

Question:
{question}
"""

# outcome logic

def rag_answer(question):
    docs = retriever.invoke(question)

    context = "\n\n".join(d.page_content for d in docs)

    response = llm.invoke(
        PROMPT.format(context=context, question=question)
    )

    # ---- Determine source ----
    source = "Unknown"

    for d in docs:
        if d.metadata.get("source") == "Quran":
            source = "Quran"
            break   # Quran takes priority
        elif d.metadata.get("source") == "Hadith":
            source = "Sahih al-Bukhari"

    return {
        "answer": response.content,
        "source": source
    }




In [48]:
# Final LLM generated outcome

print(rag_answer("what is importance of friday?"))

{'answer': "The importance of Friday is highlighted in several ways:\n\n*   Muslims are the last to come but will be the foremost on the Day of Resurrection because Allah gave them guidance for Friday, a day about which former nations differed.\n*   Religious ceremonies like Khutba (religious talk) and Jumu'ah prayer were made compulsory for former nations on this day, but they differed about it. Allah gave Muslims guidance for it.\n*   It is obligatory for every Muslim to take a bath once in seven days, specifically on Friday, washing their head and body.\n*   Whoever takes a bath on Friday, purifies himself, uses oil or perfume, proceeds for the Salat-ul-Jumu'ah (Jumu'ah prayer), does not separate two persons sitting together, offers Salat as much as Allah has written for him, and remains silent while the Imam is delivering the Khutba, his sins in-between the present and the last Friday would be forgiven.", 'source': 'Sahih al-Bukhari'}
