In [1]:
import json
import re
import os 
import random

In [2]:
import json

# Load file.json from the books_with_matn folder
with open("books_with_matn/abudawud_with_matn.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Check first few entries
print(data[:2])  # if it's a list of hadiths


[{'id': 1, 'chapter_number': 1, 'chapter_title': 'Purification (Kitab Al-Taharah)', 'cleaned_arabic': 'حَدَّثَنَا عَبْدُ اللَّهِ بْنُ مَسْلَمَةَ بْنِ قَعْنَبٍ الْقَعْنَبِيُّ، حَدَّثَنَا عَبْدُ الْعَزِيزِ، - يَعْنِي ابْنَ مُحَمَّدٍ - عَنْ مُحَمَّدٍ، - يَعْنِي ابْنَ عَمْرٍو - عَنْ أَبِي سَلَمَةَ، عَنِ الْمُغِيرَةِ بْنِ شُعْبَةَ، أَنَّ النَّبِيَّ صلى الله عليه وسلم كَانَ إِذَا ذَهَبَ الْمَذْهَبَ أَبْعَدَ .', 'extracted_arabic': 'حدثنا عبد الله بن مسلمة بن قعنب القعنبي، حدثنا عبد العزيز، - يعني ابن محمد - عن محمد، - يعني ابن عمرو - عن أبي سلمة، عن المغيرة بن شعبة، أن النبي صلى الله عليه وسلم كان إذا ذهب المذهب أبعد .', 'english': 'When the Prophet (ﷺ) went (outside) to relieve himself, he went to a far-off place.', 'source': 'Sunan Abi Dawud', 'reference': 'Sunan Abi Dawud - Chapter 1 - Hadith 1'}, {'id': 2, 'chapter_number': 1, 'chapter_title': 'Purification (Kitab Al-Taharah)', 'cleaned_arabic': 'حَدَّثَنَا مُسَدَّدُ بْنُ مُسَرْهَدٍ، حَدَّثَنَا عِيسَى بْنُ يُونُسَ، أَخْبَرَنَا إِسْمَاعِي

In [3]:
# Remove diacritics (tashkeel)
def remove_diacritics(text):
    # Arabic diacritics Unicode range
    diacritics_pattern = re.compile(r'[\u064B-\u065F\u0610-\u061A\u06D6-\u06DC\u06DF-\u06E8\u06EA-\u06ED]')
    return re.sub(diacritics_pattern, '', text)

# Attempt to remove the isnād — basic pattern

def detect_matn(text):
    """
    Detects common phrases that indicate the start of the hadith matn.
    Returns the text starting from the detected phrase.
    """
    text = remove_diacritics(text)
    patterns = [
        r'قال\s+رسول\s+الله\s+صلى\s+الله\s+عليه\s+وسلم',
        r'رسول\s+الله\s+صلى\s+الله\s+عليه\s+وسلم\s+قال',
        r'سمعت\s+رسول\s+الله\s+صلى\s+الله\s+عليه\s+وسلم',
        r'كان\s+رسول\s+الله\s+صلى\s+الله\s+عليه\s+وسلم',
        r'حدثنا\s+رسول\s+الله\s+صلى\s+الله\s+عليه\s+وسلم',
        r'حدثنا\s+محمد\s+صلى\s+الله\s+عليه\s+وسلم',
        r'سأل\s+رسول\s+الله\s+صلى\s+الله\s+عليه\s+وسلم',
        r'قال\s+النبي\s+صلى\s+الله\s+عليه\s+وسلم',
        r'عن\s+النبي\s+صلى\s+الله\s+عليه\s+وسلم',
        r'أن\s+النبي\s+صلى\s+الله\s+عليه\s+وسلم\s+قال',
        r'النبي\s+صلى\s+الله\s+عليه\s+وسلم\s+قال',
        r'عن\s+عائشة\s+رضي\s+الله\s+عنها\s+تقول',
        r'عن\s+عائشة\s+أم\s+المؤمنين',
        r'يا\s+رسول\s+الله',
        r'سأل\s+الرسول',
        r'سأل\s+النبي',
        r'سأل\s+النبي\s+صلى\s+الله\s+عليه\s+وسلم',
        r'سأل\s+الرسول\s+صلى\s+الله\s+عليه\s+وسلم',
        r'قال\s+النبي',
        r'النبي\s+قال',
        r'يقول\s+النبي',
        r'النبي\s+يقول',
        r'جاء\s+النبي',
    ]


    for pattern in patterns:
        match = re.search(pattern, text)
        if match:
            return text[match.start():]

    return text  # fallback to full if nothing found

# Updated Arabic cleaner
def clean_arabic(text):
    text = text.strip()
    text = re.sub(r'\s+', ' ', text)  # normalize spaces
    text = re.sub(r'[“”]', '"', text)  # normalize quotes
    text = re.sub(r'[ـ]+', '', text)  # remove tatweel
    text = re.sub(r'[^\u0600-\u06FF\s.,؛؟!"«»"()\-]', '', text)
    return text.strip()

# English cleaner (unchanged)
def clean_english(text):
    text = text.strip()
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'[“”]', '"', text)
    text = re.sub(r'\.{2,}', '.', text)
    return text.strip()


In [4]:
def clean_structure_hadith_book(input_json_path, output_json_path):
    with open(input_json_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    book_title = data.get("metadata", {}).get("arabic", {}).get("title") or \
                 data.get("metadata", {}).get("title") or \
                 input_json_path.split("/")[-1].replace(".json", "")

    chapters = data.get("chapters", [])
    hadiths = data.get("hadiths", [])
    all_cleaned_hadiths = []

    for chapter in chapters:
        chapter_id = chapter.get("id")
        chapter_arabic = chapter.get("arabic", chapter.get("title", ""))
        hadiths_in_chapter = [h for h in hadiths if h.get("chapterId") == chapter_id]

        for idx, hadith in enumerate(hadiths_in_chapter, start=1):
            arabic_text = hadith.get("arabic", "")
            english_text = hadith.get("english", "")
            if isinstance(english_text, dict):
                english_text = english_text.get("text", "")

            cleaned_ar = clean_arabic(arabic_text)
            matn = detect_matn(cleaned_ar)

            all_cleaned_hadiths.append({
                "id": idx,
                "chapter_number": chapter_id,
                "chapter_title": chapter_arabic,
                "cleaned_arabic": cleaned_ar,
                "extracted_arabic": matn,
                "english": clean_english(english_text),
                "source": book_title,
                "reference": f"{book_title} - Chapter {chapter_id} - Hadith {idx}"
            })

    with open(output_json_path, "w", encoding="utf-8") as f:
        json.dump(all_cleaned_hadiths, f, ensure_ascii=False, indent=4)

    print(f"✅ Cleaned hadiths (with chapters) saved to {output_json_path}")



In [6]:
books = [ "riyad_assalihin.json","bukhari.json", "muslim.json", "abudawud.json", "tirmidhi.json", "nasai.json", "malik.json", "ahmed.json"]

In [8]:
cleaned_books=["riyad_assalihin_with_matn.json","bukhari_with_matn.json","muslim_with_matn.json","abudawud_with_matn.json","tirmidhi_with_matn.json","nasai_with_matn.json","malik_with_matn.json","ahmed_with_matn.json"]

In [7]:
# Ensure output folder exists
os.makedirs("books_with_matn", exist_ok=True)

for book in books:
    book_path = os.path.join("hadith_books", book)

    if book.endswith(".json"):
        cleaned_filename = book[:-5] + "_with_matn.json"
    else:
        cleaned_filename = book + "_with_matn.json"

    # Save in a new folder named 'book_with_matn'
    cleaned_path = os.path.join("books_with_matn", cleaned_filename)

    clean_structure_hadith_book(book_path, cleaned_path)

✅ Cleaned hadiths (with chapters) saved to books_with_matn/riyad_assalihin_with_matn.json
✅ Cleaned hadiths (with chapters) saved to books_with_matn/bukhari_with_matn.json
✅ Cleaned hadiths (with chapters) saved to books_with_matn/muslim_with_matn.json
✅ Cleaned hadiths (with chapters) saved to books_with_matn/abudawud_with_matn.json
✅ Cleaned hadiths (with chapters) saved to books_with_matn/tirmidhi_with_matn.json
✅ Cleaned hadiths (with chapters) saved to books_with_matn/nasai_with_matn.json
✅ Cleaned hadiths (with chapters) saved to books_with_matn/malik_with_matn.json
✅ Cleaned hadiths (with chapters) saved to books_with_matn/ahmed_with_matn.json


In [23]:
# Combined corpus
corpus_all = []

# Loop through each file
for file_name in cleaned_books:
    file_path = os.path.join('books_with_matn', file_name)
    with open(file_path, 'r', encoding='utf-8') as f:
        book_data = json.load(f)

    for entry in book_data:
        if entry.get("extracted_arabic"):
            corpus_all.append({
                "id": str(entry.get("id")),
                "chapter_number": entry.get("chapter_number", None),
                "chapter_title": entry.get("chapter_title", "—"),
                "cleaned_arabic": entry.get("cleaned_arabic", ""),
                "english": entry.get("english", ""),
                "text": entry.get("extracted_arabic", ""),  # used for indexing
                "source": entry.get("source", "unknown source"),
                "reference": entry.get("reference", "")
            })
        

for h in corpus_all:
    h['id'] = str(h['id'])
corpus_all[:1]


[{'id': '1',
  'chapter_number': 1,
  'chapter_title': 'كتاب الأدب',
  'cleaned_arabic': 'وعن ابن عمر رضي الله عنهما أن رسول الله صلى الله عليه وسلم مر على رجل من الأنصار وهو يعظ أخاه في الحياء، فقال رسول الله صلى الله عليه وسلم "دعه فإن الحياء من الإيمان" ((متفق عليه)) .',
  'english': 'Messenger of Allah (ﷺ) passed by a man of the Ansar who was admonishing his brother regarding shyness. Messenger of Allah (ﷺ) said, "Leave him alone, for modesty is a part of Iman." .',
  'text': 'قال رسول الله صلى الله عليه وسلم "دعه فإن الحياء من الإيمان" ((متفق عليه)) .',
  'source': 'رياض الصالحين',
  'reference': 'رياض الصالحين - Chapter 1 - Hadith 1'}]

In [None]:
# !pip install colbert-ai==0.2.20 transformers==4.41.2 faiss-cpu

In [10]:
!pip show colbert-ai

Name: colbert-ai
Version: 0.2.21
Summary: Efficient and Effective Passage Search via Contextualized Late Interaction over BERT
Home-page: https://github.com/stanford-futuredata/ColBERT
Author: Omar Khattab
Author-email: okhattab@stanford.edu
License: 
Location: /system/conda/miniconda3/envs/cloudspace/lib/python3.10/site-packages
Requires: bitarray, datasets, flask, git-python, ninja, python-dotenv, scipy, tqdm, transformers, ujson
Required-by: RAGatouille


In [None]:
# # Remove old download if it exists
# !rm -rf colbertv2.0.tar.gz colbert_checkpoints

# # Download the actual ColBERTv2 pretrained model from Stanford
# !wget -O colbertv2.0.tar.gz https://downloads.cs.stanford.edu/nlp/data/colbert/colbertv2/colbertv2.0.tar.gz

# # Create checkpoints folder and extract
# !mkdir -p colbert_checkpoints
# !tar -xzf colbertv2.0.tar.gz -C colbert_checkpoints

# # Check if colbert.dnn exists now
# !ls colbert_checkpoints/colbertv2.0


In [20]:
from colbert.infra import ColBERTConfig
from colbert.modeling.checkpoint import Checkpoint

ckpt = Checkpoint("akhooli/Arabic-ColBERT-100K", colbert_config=ColBERTConfig())

  self.scaler = torch.cuda.amp.GradScaler()


In [None]:
hadith_1 = corpus_bukhari[1] 
hadith_2 = corpus[3]   
docs = [hadith_1["text"], hadith_2]  # extract only the text part
query_vectors = ckpt.queryFromText(docs, bsize=2)

  return torch.cuda.amp.autocast() if self.activated else NullContextManager()


In [15]:
query_vectors.shape

torch.Size([2, 32, 128])

In [11]:
import torch.nn.functional as F
import torch

def colbert_max_similarity(vec1, vec2):
    # Convert both to float32 for compatibility
    vec1 = vec1.to(dtype=torch.float32)
    vec2 = vec2.to(dtype=torch.float32)

    # Normalize vectors
    vec1 = F.normalize(vec1, dim=-1)
    vec2 = F.normalize(vec2, dim=-1)

    # Compute similarity matrix: (tokens_query x tokens_doc)
    sim_matrix = torch.matmul(vec1, vec2.T)

    # Get max similarity for each token in query
    max_similarities, _ = sim_matrix.max(dim=1)

    return max_similarities.mean().item()

In [12]:
similarity_score = colbert_max_similarity(query_vectors[0], query_vectors[1])
print(f"Similarity: {similarity_score:.4f}")

NameError: name 'query_vectors' is not defined

In [16]:
query = "ما هو الإيمان؟"
documents = [
    "الطهور شطر الإيمان",
    "من حسن إسلام المرء تركه ما لا يعنيه",
    "الإيمان أن تؤمن بالله وملائكته وكتبه"
]

query_vec = ckpt.queryFromText([query])[0]
doc_vecs = ckpt.docFromText(documents)

# Compute similarity scores
scores = [colbert_max_similarity(query_vec, dvec) for dvec in doc_vecs]

for i, (doc, score) in enumerate(zip(documents, scores)):
    print(f"[{i}] Score: {score:.4f} | {doc}")


#> QueryTokenizer.tensorize(batch_text[0], batch_background[0], bsize) ==
#> Input: ما هو الإيمان؟, 		 True, 		 None
#> Output IDs: torch.Size([32]), tensor([    2,     1,   394,   583, 14981,   105,     3,     4,     4,     4,
            4,     4,     4,     4,     4,     4,     4,     4,     4,     4,
            4,     4,     4,     4,     4,     4,     4,     4,     4,     4,
            4,     4], device='cuda:0')
#> Output Mask: torch.Size([32]), tensor([1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0], device='cuda:0')

[0] Score: 0.5155 | الطهور شطر الإيمان
[1] Score: 0.3844 | من حسن إسلام المرء تركه ما لا يعنيه
[2] Score: 0.7347 | الإيمان أن تؤمن بالله وملائكته وكتبه


In [18]:
query = "ماهي منزلة الحياء في الاسلام  ؟"
documents = [entry["text"] for entry in corpus_bukhari[:10]]

query_vec = ckpt.queryFromText([query])[0]
doc_vecs = ckpt.docFromText(documents)

# Compute similarity scores
scores = [colbert_max_similarity(query_vec, dvec) for dvec in doc_vecs]

for i, (doc, score) in enumerate(zip(documents, scores)):
    print(f"[{i}] Score: {score:.4f} | {doc}")


#> QueryTokenizer.tensorize(batch_text[0], batch_background[0], bsize) ==
#> Input: ماهي منزلة الحياء في الاسلام  ؟, 		 True, 		 None
#> Output IDs: torch.Size([32]), tensor([    2,     1, 25830, 43727, 41298,   305,  2083,   105,     3,     4,
            4,     4,     4,     4,     4,     4,     4,     4,     4,     4,
            4,     4,     4,     4,     4,     4,     4,     4,     4,     4,
            4,     4], device='cuda:0')
#> Output Mask: torch.Size([32]), tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0], device='cuda:0')

[0] Score: 0.2461 | سمعت رسول الله صلى الله عليه وسلم، يقول  " إنما الأعمال بالنيات، وإنما لكل امرئ ما نوى، فمن كانت هجرته إلى دنيا يصيبها أو إلى امرأة ينكحها، فهجرته إلى ما هاجر إليه "
[1] Score: 0.1731 | قال رسول الله صلى الله عليه وسلم أحيانا يأتيني مثل صلصلة الجرس  وهو أشده على  فيفصم عني وقد وعيت عنه ما قال، وأحيانا يتمثل لي الملك رجلا فيكلمني فأعي ما يقول ". قالت عائشة رضى الله عنها و

In [46]:
text = "إنما الأعمال بالنيات"
vec = ckpt.queryFromText([text])[0]
score = colbert_max_similarity(vec, vec)
print(f"Self-similarity: {score:.4f}")  

Self-similarity: 1.0000


In [47]:
a = "الدين النصيحة"
b = "النصيحة جزء أساسي من الدين الإسلامي"

vec1 = ckpt.queryFromText([a])[0]
vec2 = ckpt.queryFromText([b])[0]

print(f"Paraphrase similarity: {colbert_max_similarity(vec1, vec2):.4f}")

Paraphrase similarity: 0.8787


In [None]:
#!pip install -U einops flash_attn
!pip install -U ragatouille
!pip install -U pylate

In [None]:
!nvidia-smi

In [21]:
from ragatouille import RAGPretrainedModel

RAG = RAGPretrainedModel.from_pretrained("akhooli/Arabic-ColBERT-100K")

********************************************************************************
--------------------------------------------
RAGatouille version 0.0.10 will be migrating to a PyLate backend 
instead of the current Stanford ColBERT backend.
PyLate is a fully mature, feature-equivalent backend, that greatly facilitates compatibility.
However, please pin version <0.0.10 if you require the Stanford ColBERT backend.
********************************************************************************
  from ragatouille import RAGPretrainedModel
  self.scaler = torch.cuda.amp.GradScaler()


In [None]:
index_name = "books_full_100K"
batch_size = 300
total = len(corpus_all)

# random.shuffle(corpus_all)

# texts = [
#     f"{chapter_translation_dict.get(doc['metadata'].get('chapter', ''), '')} - {doc['text']}"


# Extract text and metadata from updated corpus
texts = [doc["text"] for doc in corpus_all]  
metadatas = [
    {
        "id": doc["id"],
        "chapter_title": doc["chapter_title"],
        "chapter_number": doc.get("chapter_number", ""),
        "source": doc["source"],
        "reference": doc["reference"],
        "english": doc["english"],
        "cleaned_arabic": doc["cleaned_arabic"]
    }
    for doc in corpus_all
]

# # Index in batches
# for i in range(0, total, batch_size):
#     text_batch = texts[i:i + batch_size]
#     metadata_batch = metadatas[i:i + batch_size]
#     RAG.index(collection=text_batch, document_metadatas=metadata_batch, index_name=index_name)
#     print(f"✅ Indexed {i + len(text_batch)} / {total} hadiths")

In [None]:
!nvidia-smi

In [9]:
# RAG.from_index(index_path =".ragatouille/colbert/indexes/bukhari_full")
RAG = RAG.from_index(index_path=".ragatouille/colbert/indexes/books_full")

In [20]:
results = RAG.search("يتنزل الله في الليل",k=10) 

for res in results:
    print(res['content'])


قال رسول الله صلى الله عليه وسلم  " إذا مضى شطر الليل أو ثلثاه ينزل الله تبارك وتعالى إلى السماء الدنيا فيقول هل من سائل يعطى هل من داع يستجاب له هل من مستغفر يغفر له حتى ينفجر الصبح " .
قال رسول الله صلى الله عليه وسلم  " إن الله يمهل حتى إذا ذهب ثلث الليل الأول نزل إلى السماء الدنيا فيقول هل من مستغفر هل من تائب هل من سائل هل من داع حتى ينفجر الفجر " .
الأسود والخيط الأبيض فلا يزال يأكل ويشرب حتى يتبين له رئيهما فأنزل الله بعد ذلك  من الفجر فعلموا أنما يعني بذلك الليل والنهار .
وحدثني عن مالك، عن داود بن الحصين، قال أخبرني مخبر، أن عبد الله بن عباس، كان يقول دلوك الشمس إذا فاء الفىء وغسق الليل اجتماع الليل وظلمته .
رسول الله صلى الله عليه وسلم قال نعم أسرينا ليلتنا كلها حتى قام قائم الظهيرة وخلا الطريق فلا يمر فيه أحد حتى رفعت لنا صخرة طويلة لها ظل لم تأت عليه الشمس بعد فنزلنا عندها فأتيت الصخرة فسويت بيدي مكانا ينام فيه النبي صلى الله عليه وسلم في ظلها ثم بسطت عليه فروة ثم قلت نم يا رسول الله وأنا أنفض لك ما حولك فنام وخرجت أنفض ما حوله فإذا أنا براعي غنم مقبل
قال النبي صلى الله علي

In [1]:
pip install rank-bm25 

Collecting rank-bm25
  Downloading rank_bm25-0.2.2-py3-none-any.whl.metadata (3.2 kB)
Downloading rank_bm25-0.2.2-py3-none-any.whl (8.6 kB)
Installing collected packages: rank-bm25
Successfully installed rank-bm25-0.2.2
Note: you may need to restart the kernel to use updated packages.


In [21]:
from rank_bm25 import BM25Okapi
from nltk.tokenize import word_tokenize
import nltk
import numpy as np

nltk.download('punkt')

class HadithSearchSystem:
    def __init__(self, hadiths, index_name="bukhari_full_100K"):
        self.hadiths = hadiths
        
        #  Load semantic model + index
        index_path = f".ragatouille/colbert/indexes/{index_name}"
        self.rag_model = RAG.from_index(index_path)

        #  Prepare BM25
        tokenized_corpus = [word_tokenize(hadith['text'].lower()) for hadith in hadiths]
        self.bm25 = BM25Okapi(tokenized_corpus)

    def semantic_search(self, query, k=5):
        results = self.rag_model.search(query, k=k)

        if results is None:
            return []

        # RAG returns: list of dicts with {'document_id', 'text', 'metadata', 'score'}
        hits = []
        for result in results:
            metadata = result['document_metadata']
            hadith_id = str(metadata.get('id', "0"))   # make sure this matches hadith['id']
            matching_hadith = next((h for h in self.hadiths if str(h['id']) == hadith_id), None)
            if matching_hadith:
                hits.append({
                    'hadith': matching_hadith,
                    'score': result['score']
                })
        return hits

    def lexical_search(self, query, k=5):
        tokenized_query = word_tokenize(query.lower())
        doc_scores = self.bm25.get_scores(tokenized_query)
        top_indices = np.argsort(doc_scores)[-k:][::-1]
        return [{
            'hadith': self.hadiths[idx],
            'score': doc_scores[idx]
        } for idx in top_indices]

    def hybrid_search(self, query, k1=50, k2=5):
        # Step 1: Lexical search
        lexical_results = self.lexical_search(query, k=k1)

        # Step 2: Extract hadiths and their texts
        hadiths = [res['hadith'] for res in lexical_results]
        texts = [h['text'] for h in hadiths]

        # Step 3: Semantic reranking
        reranked_pairs = self.rag_model.rerank(query, texts)

        # Debug print
        print("Reranked pairs:", reranked_pairs)

        # Step 4: Match results back to hadiths
        reranked = []
        for pair in reranked_pairs:
            score = pair['score']
            text = pair['content']  # ✅ use 'content' not 'text'
            for hadith in hadiths:
                if hadith['text'] == text:
                    reranked.append({
                        'hadith': hadith,
                        'score': score
                    })
                    break

        # Step 5: Return top k2
        reranked.sort(key=lambda x: x['score'], reverse=True)
        return reranked[:k2]





[nltk_data] Downloading package punkt to
[nltk_data]     /teamspace/studios/this_studio/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [22]:
system = HadithSearchSystem(corpus_bukhari)  # or full dataset if not testing

results = system.hybrid_search("علاقة العمل بالنية")

print (results)

100%|██████████| 2/2 [00:00<00:00, 12.13it/s]

Reranked pairs: [{'content': 'قال النبي صلى الله عليه وسلم  " العمل بالنية، وإنما لامرئ ما نوى، فمن كانت هجرته إلى الله ورسوله فهجرته إلى الله ورسوله صلى الله عليه وسلم ومن كانت هجرته إلى دنيا يصيبها أو امرأة ينكحها، فهجرته إلى ما هاجر إليه ".', 'score': 17.921875, 'rank': 0, 'result_index': 5}, {'content': 'عن النبي صلى الله عليه وسلم قال  " الأعمال بالنية، ولامرئ ما نوى، فمن كانت هجرته إلى الله ورسوله، فهجرته إلى الله ورسوله، ومن كانت هجرته لدنيا يصيبها، أو امرأة يتزوجها، فهجرته إلى ما هاجر إليه ".', 'score': 15.5390625, 'rank': 1, 'result_index': 39}, {'content': 'حدثنا عبيد الله بن موسى، عن هشام بن عروة، عن أبيه، عن أبي مراوح، عن أبي ذر  رضى الله عنه  قال سألت النبي صلى الله عليه وسلم أى العمل أفضل، قال " إيمان بالله، وجهاد في سبيله ". قلت فأى الرقاب أفضل قال " أغلاها ثمنا، وأنفسها عند أهلها ". قلت فإن لم أفعل. قال " تعين صانعا أو تصنع لأخرق ". قال فإن لم أفعل. قال " تدع الناس من الشر، فإنها صدقة تصدق بها على نفسك ".', 'score': 14.2734375, 'rank': 2, 'result_index': 16}, {'content'




In [1]:
import json

# load the results from the hybrid search
with open(".ragatouille/colbert/indexes/books_full/collection.json", "r", encoding="utf-8") as f:
    collection = json.load(f)

In [3]:
len(collection)

44296

In [None]:
collection[:2]  # check first two entries

In [2]:
from search_engine import HadithSearchSystem
from data_loader import corpus_all

********************************************************************************
--------------------------------------------
RAGatouille version 0.0.10 will be migrating to a PyLate backend 
instead of the current Stanford ColBERT backend.
PyLate is a fully mature, feature-equivalent backend, that greatly facilitates compatibility.
However, please pin version <0.0.10 if you require the Stanford ColBERT backend.
********************************************************************************
  from ragatouille import RAGPretrainedModel
  self.scaler = torch.cuda.amp.GradScaler()
[nltk_data] Downloading package punkt to
[nltk_data]     /teamspace/studios/this_studio/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
search_system = HadithSearchSystem(corpus_all)
query = "ما هو الإيمان؟"
results = search_system.semantic_search(query, k=10)

Loading searcher for index books_full for the first time... This may take a few seconds
[Jul 30, 09:24:16] #> Loading codec...
[Jul 30, 09:24:16] Loading decompress_residuals_cpp extension (set COLBERT_LOAD_TORCH_EXTENSION_VERBOSE=True for more info)...
[Jul 30, 09:24:17] Loading packbits_cpp extension (set COLBERT_LOAD_TORCH_EXTENSION_VERBOSE=True for more info)...
[Jul 30, 09:24:17] #> Loading IVF...
[Jul 30, 09:24:17] #> Loading doclens...


100%|██████████| 2/2 [00:00<00:00, 955.97it/s]

[Jul 30, 09:24:17] #> Loading codes and residuals...



100%|██████████| 2/2 [00:00<00:00, 26.38it/s]

Searcher loaded!






#> QueryTokenizer.tensorize(batch_text[0], batch_background[0], bsize) ==
#> Input: ما هو الإيمان؟, 		 True, 		 None
#> Output IDs: torch.Size([32]), tensor([    2,     1,   394,   583, 14981,   105,     3,     4,     4,     4,
            4,     4,     4,     4,     4,     4,     4,     4,     4,     4,
            4,     4,     4,     4,     4,     4,     4,     4,     4,     4,
            4,     4], device='cuda:0')
#> Output Mask: torch.Size([32]), tensor([1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0], device='cuda:0')



  return torch.cuda.amp.autocast() if self.activated else NullContextManager()


Semantic search results: [{'content': 'قال " هل تدرون ما الإيمان بالله وحده ". قالوا الله ورسوله أعلم. قال " شهادة أن لا إله إلا الله وأن محمدا رسول الله، وإقام الصلاة، وإيتاء الزكاة، وصوم رمضان، وتعطوا الخمس من المغنم ". ونهاهم عن الدباء والحنتم والمزفت. قال شعبة ربما قال النقير، وربما قال المقير. قال " احفظوه وأخبروه من وراءكم ".', 'score': 20.453125, 'rank': 1, 'document_id': 'e3ca6fde-761d-4f95-b2b5-86d4b44aebef', 'passage_id': 2476, 'document_metadata': {'id': '29', 'chapter_title': 'كتاب العلم', 'chapter_number': 3, 'source': 'صحيح البخاري', 'reference': 'صحيح البخاري - Chapter 3 - Hadith 29', 'english': 'I was an interpreter between the people and Ibn `Abbas. Once Ibn `Abbas said that a delegation of the tribe of `Abdul Qais came to the Prophet (ﷺ) who asked them, "Who are the people (i.e. you)? (Or) who are the delegates?" They replied, "We are from the tribe of Rabi`a." Then the Prophet (ﷺ) said to them, "Welcome, O people (or said, "O delegation (of `Abdul Qais).") Neither wi

In [4]:
results

[{'hadith': {'id': '29',
   'chapter_title': 'كتاب العلم',
   'chapter_number': 3,
   'source': 'صحيح البخاري',
   'reference': 'صحيح البخاري - Chapter 3 - Hadith 29',
   'english': 'I was an interpreter between the people and Ibn `Abbas. Once Ibn `Abbas said that a delegation of the tribe of `Abdul Qais came to the Prophet (ﷺ) who asked them, "Who are the people (i.e. you)? (Or) who are the delegates?" They replied, "We are from the tribe of Rabi`a." Then the Prophet (ﷺ) said to them, "Welcome, O people (or said, "O delegation (of `Abdul Qais).") Neither will you have disgrace nor will you regret." They said, "We have come to you from a distant place and there is the tribe of the infidels of Mudar intervening between you and us and we cannot come to you except in the sacred month. So please order us to do something good (religious deeds) and that we may also inform our people whom we have left behind (at home) and that we may enter Paradise (by acting on them.)" The Prophet ordered th