In [7]:
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from sentence_transformers import SentenceTransformer, CrossEncoder
from rank_bm25 import BM25Okapi
import faiss
import re

# Pick the compute device once: CUDA when torch reports it, otherwise the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("Using device:", device)


Using device: cuda


In [8]:
# Hugging Face model identifiers used by the pipeline
EMBEDDING_MODEL_NAME = 'sentence-transformers/all-mpnet-base-v2'
RERANKER_MODEL_NAME = 'cross-encoder/ms-marco-MiniLM-L-6-v2'
GENERATOR_MODEL_NAME = "google/flan-t5-base"

# Encoder used for dense embeddings and cross-encoder used for reranking,
# both created on the selected device
embedding_model = SentenceTransformer(EMBEDDING_MODEL_NAME, device=device)
cross_encoder = CrossEncoder(RERANKER_MODEL_NAME, device=device)

# Only the generator's tokenizer is loaded; the seq2seq model load stays disabled
gen_tokenizer = AutoTokenizer.from_pretrained(GENERATOR_MODEL_NAME)
# gen_model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-base").to(device)

In [9]:
from transformers import AutoTokenizer as HFTokenizer  # same class as the AutoTokenizer imported in the first cell, under a second name

# Tokenizer for chunking: chunk_text_token_based counts chunk_size / overlap in
# tokens produced by this tokenizer, and uses it to turn token windows back into text
chunk_tokenizer = HFTokenizer.from_pretrained("bert-base-uncased")

def chunk_text_token_based(text, chunk_size=300, overlap=100, tokenizer=None):
    """Split ``text`` into overlapping windows measured in tokenizer tokens.

    Consecutive windows start ``chunk_size - overlap`` tokens apart, so each
    window shares its first ``overlap`` tokens with the end of the previous one.
    Windowing stops as soon as a window reaches the end of the token list, so no
    trailing chunk is produced that lies entirely inside the previous window.

    Args:
        text: The raw text to split.
        chunk_size: Maximum number of tokens per chunk; must be positive.
        overlap: Number of tokens shared by consecutive chunks; must satisfy
            ``0 <= overlap < chunk_size``.
        tokenizer: Object providing ``tokenize(text)`` and
            ``convert_tokens_to_string(tokens)``. Defaults to the module-level
            ``chunk_tokenizer``.

    Returns:
        A list of chunk strings (empty when ``text`` yields no tokens).

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        # overlap == chunk_size would give a zero stride; anything larger gives a
        # negative stride and silently produces no chunks at all.
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    if tokenizer is None:
        tokenizer = chunk_tokenizer

    tokens = tokenizer.tokenize(text)
    stride = chunk_size - overlap
    chunks = []
    for start in range(0, len(tokens), stride):
        window = tokens[start:start + chunk_size]
        chunks.append(tokenizer.convert_tokens_to_string(window))
        # This window already covers the last token; a further window would
        # only repeat the overlap region.
        if start + chunk_size >= len(tokens):
            break
    return chunks


In [10]:
def build_index(documents):
    """Chunk the documents and build the sparse (BM25) and dense (FAISS) indexes.

    Args:
        documents: Iterable of dicts with 'title', 'url' and 'text' keys.

    Returns:
        A tuple ``(bm25, index, embeddings, all_chunks, metadata)`` where
        ``all_chunks[i]``, ``metadata[i]`` and ``embeddings[i]`` all describe the
        same chunk, and ``index`` is a FAISS HNSW index holding ``embeddings``.

    Raises:
        ValueError: If the documents produce no chunks at all.
    """
    all_chunks = []
    metadata = []

    for doc in documents:
        for chunk in chunk_text_token_based(doc['text']):
            all_chunks.append(chunk)
            metadata.append({'title': doc['title'], 'url': doc['url'], 'text': chunk})

    if not all_chunks:
        # Fail with a clear message here instead of an obscure error inside the
        # BM25 / embedding / FAISS calls below.
        raise ValueError("build_index needs at least one document with non-empty text")

    # BM25 over whitespace-split chunks (the query is split the same way at search time)
    tokenized_corpus = [chunk.split(" ") for chunk in all_chunks]
    bm25 = BM25Okapi(tokenized_corpus)

    # Dense embeddings; normalize_embeddings=True already returns unit-length
    # vectors, so no second normalisation pass is needed.
    embeddings = embedding_model.encode(all_chunks, convert_to_numpy=True, normalize_embeddings=True)
    # FAISS expects a C-contiguous float32 matrix
    embeddings = np.ascontiguousarray(embeddings, dtype=np.float32)

    # FAISS HNSW index (graph parameter M = 32, efConstruction = 40)
    dim = embeddings.shape[1]
    index = faiss.IndexHNSWFlat(dim, 32)
    index.hnsw.efConstruction = 40
    index.add(embeddings)

    return bm25, index, embeddings, all_chunks, metadata


In [18]:
def retrieve_relevant_context(query, bm25, index, corpus, metadata, top_k=50, rerank_k=5):
    """Retrieve candidate chunks with BM25 and FAISS, then rerank them.

    Args:
        query: The query string.
        bm25: BM25 index built over ``corpus`` (whitespace-split chunks).
        index: FAISS index whose row ``i`` corresponds to ``corpus[i]``.
        corpus: List of chunk texts.
        metadata: List of per-chunk metadata, aligned with ``corpus``.
        top_k: Number of candidates to take from each retriever.
        rerank_k: Number of reranked results to return.

    Returns:
        ``(contexts, docs)``: the chunk texts and their metadata, ordered from
        highest to lowest cross-encoder score. Both are empty when ``corpus`` is
        empty or ``top_k`` is not positive.
    """
    if not corpus or top_k <= 0:
        return [], []

    # Never ask either retriever for more hits than there are chunks
    k = min(top_k, len(corpus))

    # Sparse retrieval
    tokenized_query = query.split(" ")
    bm25_scores = bm25.get_scores(tokenized_query)
    bm25_top_idx = np.argsort(bm25_scores)[::-1][:k]

    # Dense retrieval; the encoder already returns a unit-length vector
    query_emb = embedding_model.encode(query, convert_to_numpy=True, normalize_embeddings=True)
    query_vec = np.ascontiguousarray(query_emb, dtype=np.float32).reshape(1, -1)
    _, dense_top_idx = index.search(query_vec, k)

    # Merge candidates. FAISS pads missing results with -1, which would
    # otherwise be read as "last chunk" by Python's negative indexing.
    candidate_ids = {int(i) for i in bm25_top_idx}
    candidate_ids.update(int(i) for i in dense_top_idx[0] if 0 <= i < len(corpus))
    ordered_ids = sorted(candidate_ids)  # fixed order so score ties resolve reproducibly

    # Rerank with Cross-Encoder
    pairs = [[query, corpus[i]] for i in ordered_ids]
    scores = cross_encoder.predict(pairs)
    reranked = sorted(zip(scores, ordered_ids), key=lambda pair: pair[0], reverse=True)[:rerank_k]

    # Return top reranked contexts and metadata
    contexts = [corpus[i] for _, i in reranked]
    docs = [metadata[i] for _, i in reranked]
    return contexts, docs


In [12]:
def generate_response(review: str, bm25, index, corpus, metadata) -> str:
    """Build the retrieval-augmented prompt for ``review`` and return it.

    The top reranked chunks are fetched with ``retrieve_relevant_context`` and
    formatted as "<title>:\\n<chunk>" sections ahead of the review text. The
    prompt is printed for inspection.

    Text generation is currently disabled (the lines that would call
    ``gen_model`` are commented out below), so the function returns the prompt
    itself rather than falling through and returning ``None``.

    Args:
        review: The user review to respond to; also used as the retrieval query.
        bm25, index, corpus, metadata: Passed through unchanged to
            ``retrieve_relevant_context``.

    Returns:
        The prompt string.
    """
    contexts, docs = retrieve_relevant_context(review, bm25, index, corpus, metadata)
    combined_context = "\n\n".join([f"{doc['title']}:\n{ctx}" for ctx, doc in zip(contexts, docs)])
    prompt = f"Context:\n{combined_context}\n\nReview: {review}\n\nResponse:"
    print(prompt)
    # To enable generation, load gen_model and replace the return below with:
    # inputs = gen_tokenizer(prompt, return_tensors="pt", max_length=512, truncation=True).to(device)
    # outputs = gen_model.generate(**inputs, max_new_tokens=150)
    # return gen_tokenizer.decode(outputs[0], skip_special_tokens=True)
    return prompt


In [16]:
import json

# Load your Islamic etiquette knowledge base (JSON is read explicitly as UTF-8
# so the result does not depend on the platform's default encoding)
with open('islamic_etiquette_knowledge_base.json', encoding='utf-8') as f:
    etiquette_data = json.load(f)

# Prepare documents: one record per knowledge-base entry, with the full text.
# build_index() runs the token-based chunker itself, so chunking here as well
# would re-tokenise and re-split text that has already been chunked.
documents = [
    {'title': entry['title'], 'url': entry['url'], 'text': entry['text']}
    for entry in etiquette_data
]

In [20]:
# Build the BM25 and FAISS indexes over the prepared documents
bm25, index, embeddings, corpus, metadata = build_index(documents)

# Sample review, used both as the retrieval query and in the prompt
review_text = "The app is great. But it lacks in some manner issue"
response = generate_response(
    review=review_text,
    bm25=bm25,
    index=index,
    corpus=corpus,
    metadata=metadata,
)

# Show whatever generate_response returned
print("Generated Response:\n", response)

Context:
SAHIH MUSLIM, Book 26 : The Book of Salutions and Greetings (KITAB AS-SALAM):
the great curer, there is no cure but through thine healing power, which leaves nothing of the disease. ' a ' isha reported that when allah ' s messenger ( may peace be upon him ) came to visit any sick he supplicated for him and said : lord of the people, remove the malady, cure him for thou art a great curer. there is no cure but through thine healing power which leaves no trouble, and in the narration transmitted on the authority of abu bakr there is a slight variation of wording. this hadith has been reported on the authority of ' a ' isha through another chain of transmitters with a slight variation of wording. ' a ' isha reported : allah ' s messenger ( may peace be upon him ) used to recite ( this supplication ) as the words of incantation : " lord of the people, remove the trouble for in thine hand is the cure ; none is there to relieve him ( the burden of disease ) but only thou. this hadith