In [3]:
import json
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DPRQuestionEncoder, DPRContextEncoder, DPRQuestionEncoderTokenizer, DPRContextEncoderTokenizer
from sentence_transformers import SentenceTransformer, CrossEncoder
from rank_bm25 import BM25Okapi
import faiss

In [None]:
# Pick the compute device once: CUDA when PyTorch can see a GPU, CPU otherwise.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")

Using device: cuda


In [5]:
# Tokenizer used to encode user queries for the DPR question encoder.
dpr_question_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained(
    "facebook/dpr-question_encoder-single-nq-base"
)

In [6]:
# DPR question encoder; loaded from the hub and placed on the selected device.
dpr_question_encoder = DPRQuestionEncoder.from_pretrained(
    "facebook/dpr-question_encoder-single-nq-base"
)
dpr_question_encoder = dpr_question_encoder.to(device)

Some weights of the model checkpoint at facebook/dpr-question_encoder-single-nq-base were not used when initializing DPRQuestionEncoder: ['question_encoder.bert_model.pooler.dense.bias', 'question_encoder.bert_model.pooler.dense.weight']
- This IS expected if you are initializing DPRQuestionEncoder from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DPRQuestionEncoder from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [7]:
# Tokenizer used to encode knowledge-base passages for the DPR context encoder.
dpr_context_tokenizer = DPRContextEncoderTokenizer.from_pretrained(
    "facebook/dpr-ctx_encoder-single-nq-base"
)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'DPRQuestionEncoderTokenizer'. 
The class this function is called from is 'DPRContextEncoderTokenizer'.


In [8]:
# DPR context (passage) encoder. It is loaded on the default device here and
# moved to `device` later, in the cell that builds the FAISS index.
dpr_context_encoder = DPRContextEncoder.from_pretrained(
    "facebook/dpr-ctx_encoder-single-nq-base"
)

Some weights of the model checkpoint at facebook/dpr-ctx_encoder-single-nq-base were not used when initializing DPRContextEncoder: ['ctx_encoder.bert_model.pooler.dense.bias', 'ctx_encoder.bert_model.pooler.dense.weight']
- This IS expected if you are initializing DPRContextEncoder from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DPRContextEncoder from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [9]:
# Cross-encoder that scores (query, passage) pairs; used to rerank the
# candidates produced by BM25 and DPR.
cross_encoder = CrossEncoder(
    "cross-encoder/ms-marco-MiniLM-L-6-v2",
    device=device,
)

In [10]:
# Tokenizer for the seq2seq generator model (FLAN-T5 base).
gen_tokenizer = AutoTokenizer.from_pretrained(
    "google/flan-t5-base"
)

In [11]:
# Seq2seq generator weights (FLAN-T5 base), placed on the selected device.
gen_model = AutoModelForSeq2SeqLM.from_pretrained(
    "google/flan-t5-base"
)
gen_model = gen_model.to(device)

In [12]:
def chunk_text(text: str, chunk_size: int = 200, overlap: int = 50):
    """Split ``text`` into overlapping chunks of whitespace-separated words.

    Consecutive chunks start ``chunk_size - overlap`` words apart, so each
    chunk repeats the last ``overlap`` words of the previous one.

    Args:
        text: Text to split. Words are whatever ``str.split()`` yields.
        chunk_size: Maximum number of words per chunk; must be positive.
        overlap: Number of words shared by consecutive chunks; must satisfy
            ``0 <= overlap < chunk_size``.

    Returns:
        A list of chunk strings (words re-joined with single spaces). Empty
        or whitespace-only text yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` is not positive, or ``overlap`` is
            negative or not smaller than ``chunk_size``. Previously such
            values either failed with an opaque ``range()`` error, silently
            returned no chunks, or silently skipped words.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    words = text.split()
    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(words), step):
        chunks.append(' '.join(words[start:start + chunk_size]))
        # Once a window reaches the end of the text, any further window would
        # only repeat words already inside this one, so stop here.
        if start + chunk_size >= len(words):
            break
    return chunks

In [13]:
# Load the knowledge base. The encoding is given explicitly because open()
# otherwise decodes with the platform's locale encoding, which can corrupt or
# reject non-ASCII text in the JSON on systems whose default is not UTF-8.
with open('islamic_etiquette_knowledge_base.json', encoding='utf-8') as f:
    etiquette_data = json.load(f)

raw_entries = etiquette_data
# Create passages: every entry's 'text' is split into chunks, and each chunk is
# stored in `passages` alongside a `metadata` record at the same position that
# carries the entry's 'url' and 'title' plus the chunk text itself.
passages = []
metadata = []
for entry in raw_entries:
    for chunk in chunk_text(entry['text']):
        passages.append(chunk)
        metadata.append({'url': entry['url'], 'title': entry['title'], 'text': chunk})

In [14]:
# Sparse index: BM25 over whitespace-tokenized passages.
tokenized_corpus = [passage.split() for passage in passages]
bm25 = BM25Okapi(tokenized_corpus)

# The context encoder must live on the same device as its inputs.
dpr_context_encoder = dpr_context_encoder.to(device)

# Dense index: encode each passage with the DPR context encoder, one at a time,
# and collect the pooled embeddings on the CPU as NumPy arrays.
ctx_embeddings = []
with torch.no_grad():
    for passage in passages:
        encoded = dpr_context_tokenizer(
            passage, return_tensors='pt', truncation=True, max_length=256
        )
        encoded = encoded.to(device)
        pooled = dpr_context_encoder(**encoded).pooler_output
        ctx_embeddings.append(pooled.cpu().numpy())
ctx_embeddings = np.vstack(ctx_embeddings)

# Embeddings are L2-normalized in place before being added to an
# inner-product index.
faiss.normalize_L2(ctx_embeddings)
index = faiss.IndexFlatIP(ctx_embeddings.shape[1])
index.add(ctx_embeddings)

In [15]:
# Retrieval function
def retrieve_relevant_context(query: str, top_k: int = 5):
    """Retrieve the passages most relevant to ``query`` via hybrid search.

    Candidates are gathered from the BM25 index (whitespace tokens) and from
    the DPR/FAISS index, merged without duplicates, re-scored with the
    cross-encoder, and the ``top_k`` highest-scoring ones are returned.

    Args:
        query: Free-text query.
        top_k: Number of candidates taken from each retriever and the maximum
            number of results returned.

    Returns:
        A tuple ``(texts, records)`` of two parallel lists: the selected
        entries of ``passages`` and the matching entries of ``metadata``,
        best first. Both lists are empty when ``top_k`` is not positive or no
        candidate is found.
    """
    if top_k <= 0:
        return [], []

    # BM25 retrieval
    tokenized_query = query.split()
    bm25_scores = bm25.get_scores(tokenized_query)
    bm25_top = sorted(range(len(bm25_scores)), key=lambda i: bm25_scores[i], reverse=True)[:top_k]

    # DPR retrieval
    q_inputs = dpr_question_tokenizer(query, return_tensors='pt', truncation=True, max_length=64).to(device)
    with torch.no_grad():
        q_emb = dpr_question_encoder(**q_inputs).pooler_output.cpu().numpy()
    faiss.normalize_L2(q_emb)
    _, neighbor_ids = index.search(q_emb, top_k)
    # FAISS fills unused result slots with -1 when the index holds fewer than
    # top_k vectors; keeping them would make passages[-1] a bogus candidate.
    dpr_top = [int(idx) for idx in neighbor_ids[0] if idx >= 0]

    # Combine BM25 and DPR, dropping duplicates while keeping a stable order
    # (BM25 hits first, then DPR hits).
    candidates = list(dict.fromkeys(list(bm25_top) + dpr_top))
    if not candidates:
        return [], []

    # Rerank with cross-encoder
    cross_in = [(query, passages[idx]) for idx in candidates]
    cross_scores = cross_encoder.predict(cross_in)
    order = sorted(range(len(cross_scores)), key=lambda i: cross_scores[i], reverse=True)
    reranked = [candidates[i] for i in order][:top_k]

    return [passages[i] for i in reranked], [metadata[i] for i in reranked]

In [22]:
def rag_response(review: str) -> str:
    """Build the retrieval-augmented prompt for ``review``.

    The top 3 passages returned by ``retrieve_relevant_context`` are each
    rendered as ``[title](url): passage text`` and joined with blank lines;
    the review itself is appended after the context.

    No text generation happens here: the return value is the assembled
    prompt. The prompt is returned rather than printed so that the declared
    ``-> str`` contract holds and callers such as ``print(rag_response(...))``
    receive the text instead of ``None``.

    Args:
        review: The review text; also used as the retrieval query.

    Returns:
        The prompt string ``"Context:\\n<contexts>\\n\\nReview: <review>"``.
    """
    contexts, docs = retrieve_relevant_context(review, top_k=3)
    combined_context = "\n\n".join(
        f"[{doc['title']}]({doc['url']}): {ctx}" for ctx, doc in zip(contexts, docs)
    )
    prompt = f"Context:\n{combined_context}\n\nReview: {review}"
    return prompt

In [23]:
# Run the pipeline on a sample review and show whatever rag_response returns.
review_text = (
    "The app is great. But it lacks in some manner issue"
)
print(
    rag_response(review_text)
)

Prompt: Context:
[The 75 Good Manners (Commandments) in The Quran](https://blog.une.edu.au/new-england-muslims/2019/11/14/the-75-good-manners-commandments-in-the-quran/comment-page-1/): pure (53:32) Speak nicely, even to the ignorant (25:63) Don’t ask for repayment for favours (76:9) Make room for others at gatherings (58:11) If enemy wants peace, then accept it (8:61) Return a greeting in a better manner (4:86) Don’t remind others of your favours (2:264) Make peace between fighting groups (49:9) Lower your voice and talk moderately (31:19) Don’t let hatred cause you to be unjust (6:108) Don’t ask too many favours from people (2:273) Greet people when entering their home (24:27) Be just, even against yourself & relatives (4:135) Speak gently, even to leaders of disbelief (20:44) Don’t criticize small contributions of others (9:79) Don’t call the Prophet how you call others’ (24:63) Try to make peace between husband & wife (4:128) Don’t call the Prophet from outside his rooms (49:4) Opp

In [18]:
# # Response generation
# def generate_response(review: str) -> str:
#     contexts, docs = retrieve_relevant_context(review, top_k=3)
#     combined_context = "\n\n".join([f"[{doc['title']}]({doc['url']}): {ctx}" for ctx, doc in zip(contexts, docs)])
#     prompt = f"Context:\n{combined_context}\n\nReview: {review}\n\nResponse:"
#     print(f"Prompt: {prompt}") 

#     inputs = gen_tokenizer(prompt, return_tensors="pt", max_length=512, truncation=True).to(device)
#     outputs = gen_model.generate(**inputs, max_new_tokens=150)
#     return gen_tokenizer.decode(outputs[0], skip_special_tokens=True)

In [19]:
# review_text = "The app is great. But it lacks in some manner issue"
# print(generate_response(review_text))
