In [1]:
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from sentence_transformers import SentenceTransformer, CrossEncoder
from rank_bm25 import BM25Okapi
import faiss
import re

# Run on CUDA when torch can see a GPU; otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("Using device:", device)


  from .autonotebook import tqdm as notebook_tqdm


Using device: cuda


In [2]:
# Models shared by the cells below:
#   embedding_model - bi-encoder that embeds chunks and queries for dense retrieval
#   cross_encoder   - scores (query, chunk) pairs when reranking candidates
embedding_model = SentenceTransformer(
    "sentence-transformers/all-mpnet-base-v2",
    device=device,
)
cross_encoder = CrossEncoder(
    "cross-encoder/ms-marco-MiniLM-L-6-v2",
    device=device,
)

# Tokenizer of the seq2seq generator; the generator model itself is left disabled.
gen_tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")
# gen_model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-base").to(device)

In [3]:
from transformers import AutoTokenizer as HFTokenizer

# Chunk sizes are counted in tokens of this tokenizer; it is used for
# splitting text only, not as input to any model in this notebook.
chunk_tokenizer = HFTokenizer.from_pretrained(
    "bert-base-uncased",
)

def chunk_text_token_based(text, chunk_size=300, overlap=100, tokenizer=None):
    """Split ``text`` into overlapping chunks measured in tokenizer tokens.

    Args:
        text: The string to split.
        chunk_size: Maximum number of tokens per chunk (must be > 0).
        overlap: Number of tokens shared by two consecutive chunks
            (must satisfy ``0 <= overlap < chunk_size``).
        tokenizer: Object exposing ``tokenize(str)`` and
            ``convert_tokens_to_string(list)``. Defaults to the module-level
            ``chunk_tokenizer``.

    Returns:
        A list of chunk strings rebuilt from the token windows. Empty when
        ``text`` produces no tokens.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        # A non-positive stride would either never advance or skip tokens.
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    if tokenizer is None:
        tokenizer = chunk_tokenizer

    tokens = tokenizer.tokenize(text)
    stride = chunk_size - overlap

    chunks = []
    for start in range(0, len(tokens), stride):
        window = tokens[start:start + chunk_size]
        chunks.append(tokenizer.convert_tokens_to_string(window))
        # Once a window reaches the end of the text, any further window would
        # only repeat tokens that are already covered by this one.
        if start + chunk_size >= len(tokens):
            break
    return chunks


In [4]:
def build_index(documents):
    """Chunk the documents and build a sparse (BM25) and a dense (Faiss) index.

    Args:
        documents: Iterable of dicts with ``'title'``, ``'url'`` and ``'text'``
            keys. Each ``'text'`` is split with ``chunk_text_token_based``.

    Returns:
        Tuple ``(bm25, index, embeddings, all_chunks, metadata)`` where
        ``all_chunks[i]``, ``metadata[i]``, row ``i`` of ``embeddings`` and
        position ``i`` in both indexes all refer to the same chunk.

    Raises:
        ValueError: If the documents produce no chunks at all.
    """
    all_chunks = []
    metadata = []

    for doc in documents:
        for chunk in chunk_text_token_based(doc['text']):
            all_chunks.append(chunk)
            metadata.append({'title': doc['title'], 'url': doc['url'], 'text': chunk})

    if not all_chunks:
        # Fail early with a clear message instead of a division by zero
        # inside BM25 or a shape error on an empty embedding matrix.
        raise ValueError("build_index needs at least one document with non-empty text")

    # BM25
    tokenized_corpus = [chunk.split(" ") for chunk in all_chunks]
    bm25 = BM25Okapi(tokenized_corpus)

    # Dense Embeddings (already unit-normalised by the encoder, so no second
    # normalisation pass is needed before indexing)
    embeddings = embedding_model.encode(all_chunks, convert_to_numpy=True, normalize_embeddings=True)
    # Faiss expects a C-contiguous float32 matrix.
    embeddings = np.ascontiguousarray(embeddings, dtype=np.float32)

    # Faiss HNSW Index; on unit vectors L2 distance orders neighbours the same
    # way cosine similarity does.
    dim = embeddings.shape[1]
    index = faiss.IndexHNSWFlat(dim, 32)
    index.hnsw.efConstruction = 40
    index.add(embeddings)

    return bm25, index, embeddings, all_chunks, metadata


In [None]:
def retrieve_relevant_context(query, bm25, index, corpus, metadata, top_k=50, rerank_k=10):
    """Hybrid retrieval: BM25 + dense search, then cross-encoder reranking.

    Args:
        query: Free-text query.
        bm25: BM25 index built over ``corpus`` (same ordering).
        index: Faiss index built over the embeddings of ``corpus``.
        corpus: List of chunk strings.
        metadata: List of per-chunk metadata dicts, aligned with ``corpus``.
        top_k: Number of candidates taken from each retriever.
        rerank_k: Number of reranked results to return.

    Returns:
        ``(contexts, docs)``: the best chunk texts and their metadata, ordered
        by descending cross-encoder score. Both are empty when there is
        nothing to retrieve.
    """
    n_chunks = len(corpus)
    if n_chunks == 0 or top_k <= 0 or rerank_k <= 0:
        return [], []
    k = min(top_k, n_chunks)

    # Sparse retrieval. The corpus chunks come out of an uncased tokenizer,
    # so the query is lowercased to make terms like "Quran" match.
    tokenized_query = query.lower().split()
    bm25_scores = bm25.get_scores(tokenized_query)
    bm25_top_idx = np.argsort(bm25_scores)[::-1][:k]

    # Dense retrieval (the encoder already returns a unit-length vector).
    query_emb = embedding_model.encode(query, convert_to_numpy=True, normalize_embeddings=True)
    query_emb = np.ascontiguousarray(query_emb, dtype=np.float32).reshape(1, -1)
    _, dense_top_idx = index.search(query_emb, k)

    # Merge candidates. Faiss pads missing neighbours with -1, which would
    # otherwise be read as "last chunk" by Python indexing, so drop them.
    # Sorting keeps the candidate order deterministic.
    candidate_indices = sorted(
        {int(i) for i in bm25_top_idx}
        | {int(i) for i in dense_top_idx[0] if 0 <= int(i) < n_chunks}
    )
    if not candidate_indices:
        return [], []

    # Rerank with Cross-Encoder
    pairs = [[query, corpus[i]] for i in candidate_indices]
    scores = cross_encoder.predict(pairs)
    reranked = sorted(zip(scores, candidate_indices), key=lambda pair: pair[0], reverse=True)[:rerank_k]

    # Return top reranked contexts and metadata
    contexts = [corpus[i] for _, i in reranked]
    docs = [metadata[i] for _, i in reranked]
    return contexts, docs


In [16]:
def generate_response(review: str, bm25, index, corpus, metadata) -> str:
    """Retrieve context for ``review`` and return the assembled prompt text.

    Text generation is currently disabled (the generator calls are commented
    out below), so the returned string is the retrieval result formatted as
    ``"Retrieved Chunks:\\n<title>:\\n<chunk>..."``.

    Args:
        review: The user review / query text.
        bm25, index, corpus, metadata: Retrieval structures, passed through to
            ``retrieve_relevant_context``.

    Returns:
        The formatted retrieved-chunks string.
    """
    contexts, docs = retrieve_relevant_context(review, bm25, index, corpus, metadata)
    combined_context = "\n\n".join(f"{doc['title']}:\n{ctx}" for ctx, doc in zip(contexts, docs))
    # prompt = f"Context:\n{combined_context}\n\nReview: {review}\n\nResponse:"
    prompt = f"Retrieved Chunks:\n{combined_context}"
    # inputs = gen_tokenizer(prompt, return_tensors="pt", max_length=512, truncation=True).to(device)
    # outputs = gen_model.generate(**inputs, max_new_tokens=150)
    # return gen_tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Return the text (instead of printing it and returning None) so the
    # declared ``-> str`` contract holds and callers decide how to display it.
    return prompt


In [17]:
import json

# Both files contain non-ASCII text, so read them as UTF-8 explicitly instead
# of relying on the platform's default encoding.

# Load your Islamic etiquette knowledge base
with open('islamic_etiquette_knowledge_base.json', encoding='utf-8') as f:
    etiquette_data = json.load(f)

# Load Quran app documentation
with open('Quran_app_Documentation.json', encoding='utf-8') as f:
    quran_app_data = json.load(f)

# Collect whole documents from both sources. They are NOT chunked here:
# build_index splits each document's text itself, and chunking twice would
# index duplicated fragments of every chunk.
documents = [
    {
        'title': entry['title'],
        'url': entry['url'],
        'text': entry['text']
    }
    for entry in etiquette_data + quran_app_data
]

In [18]:
# Build the sparse and dense indexes once; every query cell below reuses them
(bm25, index, embeddings, corpus, metadata) = build_index(documents)

# Example query: a review about the audio feature
review_text = "Quran app audio is not working properly. I can't understand how to use the audio feature clearly."
response = generate_response(
    review=review_text,
    bm25=bm25,
    index=index,
    corpus=corpus,
    metadata=metadata,
)

# Show the value returned by generate_response (no text wrapping is applied)
print("Generated Response:\n", response)

Retrieved Chunks:
Audio – Listen to the Quran to memorise and recharge your soul:
audio – listen to the quran to memorise and recharge your soul recharge your soul and iman by listening to beautiful quran recitations. whether you ’ re memorizing, learning tajweed, or seeking comfort, our audio features support your journey. overview over 50 renowned reciters with user - friendly controls download audio for offline listening or stream to save storage space use the repeat option to assist with quran memorization ( hifz ) audio translations in english, bangla, and urdu to aid understanding by listening. more translations are planned, in sha allah. rich collection of reciters choose from over 50 renowned reciters including mishary al afasy, husary ( muallim ), ayman suwaid, abdur rahman as - sudais, abdul basit, minshawi and many others. for improving recitation and tajweed : try ayman suwaid or husary ( muallim ). prefer faster recitation? listen to yasser salama hadr. we have a few audio

In [20]:
# Example query: a review about the scheduling feature
review_text = "The quran app is good but i can't understand how to use the scheduling feature."
response = generate_response(
    review=review_text,
    bm25=bm25,
    index=index,
    corpus=corpus,
    metadata=metadata,
)

# Show the value returned by generate_response (no text wrapping is applied)
print("Generated Response:\n", response)

Retrieved Chunks:
Planner – Complete the Quran on your schedule:
planner – complete the quran on your schedule want to finish reading the quran in a set time? our quran planner helps you break down your reading into manageable daily sessions. whether it ’ s throughout the year or during the blessed month of ramadan, the planner keeps you organized and motivated to reach your goal. set your reading goals choose what works for you : read the quran in 29 days ( ideal for ramadan ) complete it in one year ( about 2 pages daily ) read the quran translation in 90 days to improve your understanding create your own custom schedule how to start using the quran planner? create your plan : go to the planner section and tap add planner to create a new plan. enter the details : select the start and end range, and set the number of days. don ’ t forget to set a notification time to stay on track. start reading : after creating your plan, tap read to begin. read the quran following your schedule. sav

In [19]:
# Example query: a review about understanding the verses
review_text = "The quran app is good but not helpful in understanding the Quranic verses."
response = generate_response(
    review=review_text,
    bm25=bm25,
    index=index,
    corpus=corpus,
    metadata=metadata,
)

# Show the value returned by generate_response (no text wrapping is applied)
print("Generated Response:\n", response)

Retrieved Chunks:
Deepening Quran Understanding: Tools for Students of Knowledge and Arabic Learners:
deepening quran understanding : tools for students of knowledge and arabic learners deepen your understanding of the quran through our comprehensive arabic analysis tools. whether you ’ re a beginner in arabic or an advanced student of knowledge, our quran app offers tools that make learning arabic easier and more meaningful. let ’ s look at how features like e3rab ( morphology ), gharib ( uncommon arabic word meanings ), word root, lemma, grammar details, verb forms, word occurrences can help you. e3rab ( morphology ) e3rab explains the grammatical structure of each word in the quran. it helps you : understand how words fit into an ayah see how grammar changes the meaning of ayahs improve your knowledge of quranic arabic grammar for deeper studies for example, when you read surah al - fatiha, you ’ ll see why “ الحمد ” and “ لله ” are written this way and how they connect. to study e3