# AI 2024 Online Summer Internship
### Name: Rasikh Ali
### Email: rasikhali1234@gmail.com

<div class="alert alert-block alert-info">
    <h1> Libraries </h1>
</div>

In [1]:
# !pip install sentence-transformers
# !pip install faiss-cpu
# !pip install faiss-gpu

import pandas as pd
import glob

import re

from sentence_transformers import SentenceTransformer
import numpy as np

import faiss

C:\Users\ABC\anaconda3\lib\site-packages\numpy\.libs\libopenblas.FB5AE2TYXYH2IJRDKGDGQ3XBKLKTF43H.gfortran-win_amd64.dll
C:\Users\ABC\anaconda3\lib\site-packages\numpy\.libs\libopenblas64__v0.3.21-gcc_10_3_0.dll
  from tqdm.autonotebook import tqdm, trange


<div class="alert alert-block alert-info">
    <h1>Data Exploration and Preprocessing </h1>
</div>

<div class="alert alert-block alert-warning">
    <h4>Defining columns</h4>
</div>

In [2]:
colnames = ['Chapter_Number', 
            'Chapter_English', 
            'Chapter_Arabic', 
            'Section_Number', 
            'Section_English', 
            'Section_Arabic', 
            'Hadith_number', 
            'English_Hadith', 
            'English_Isnad', 
            'English_Matn', 
            'Arabic_Hadith', 
            'Arabic_Isnad', 
            'Arabic_Matn', 
            'Arabic_Comment', 
            'English_Grade', 
            'Arabic_Grade']

<div class="alert alert-block alert-warning">
    <h4>Loading Dataset</h4>
</div>

In [3]:
path = "LK-Hadith-Corpus-master"  
book_filenames = sorted(glob.glob(path + '//**//*.csv', recursive=True))  # reading all CSV files in all books

In [4]:
import pandas as pd


sample_file = pd.read_csv(book_filenames[0], names=colnames, skiprows=1)

# Convert the DataFrame to a list of lists
data = [sample_file.columns.tolist()] + sample_file.values.tolist()

# Format and display the data using string formatting
print("Sample Data:")
for row in data:
    print("{:<15} {:<15} {:<15}".format(*row))


Sample Data:
Chapter_Number  Chapter_English Chapter_Arabic 
1.0             Purification (Kitab Al-Taharah) كتاب الطهارة   
1.0             Purification (Kitab Al-Taharah) كتاب الطهارة   
1.0             Purification (Kitab Al-Taharah) كتاب الطهارة   
1.0             Purification (Kitab Al-Taharah) كتاب الطهارة   
1.0             Purification (Kitab Al-Taharah) كتاب الطهارة   
1.0             Purification (Kitab Al-Taharah) كتاب الطهارة   
1.0             Purification (Kitab Al-Taharah) كتاب الطهارة   
1.0             Purification (Kitab Al-Taharah) كتاب الطهارة   
1.0             Purification (Kitab Al-Taharah) كتاب الطهارة   
1.0             Purification (Kitab Al-Taharah) كتاب الطهارة   
1.0             Purification (Kitab Al-Taharah) كتاب الطهارة   
1.0             Purification (Kitab Al-Taharah) كتاب الطهارة   
1.0             Purification (Kitab Al-Taharah) كتاب الطهارة   
1.0             Purification (Kitab Al-Taharah) كتاب الطهارة   
1.0             Purification (Kitab Al-Taha

<div class="alert alert-block alert-warning">
    <h4>Displaying Dataset</h4>
</div>

In [5]:
sample_file = pd.read_csv(book_filenames[0], names=colnames, skiprows=1)
print("Sample Data:")
print(sample_file.head())

Sample Data:
   Chapter_Number                  Chapter_English Chapter_Arabic  \
0             1.0  Purification (Kitab Al-Taharah)   كتاب الطهارة   
1             1.0  Purification (Kitab Al-Taharah)   كتاب الطهارة   
2             1.0  Purification (Kitab Al-Taharah)   كتاب الطهارة   
3             1.0  Purification (Kitab Al-Taharah)   كتاب الطهارة   
4             1.0  Purification (Kitab Al-Taharah)   كتاب الطهارة   

   Section_Number                                    Section_English  \
0             1.0                  Seclusion While Relieving Oneself   
1             1.0                  Seclusion While Relieving Oneself   
2             2.0           Choosing An Appropriate Place To Urinate   
3             3.0  What A Person Should Say When He Enters The Ar...   
4             3.0  What A Person Should Say When He Enters The Ar...   

                                      Section_Arabic  Hadith_number  \
0          باب التَّخَلِّي عِنْدَ قَضَاءِ الْحَاجَةِ            1.0 

<div class="alert alert-block alert-warning">
    <h4>Check for Missing Values</h4>
</div>

In [6]:
print("\nMissing Values:")
print(sample_file.isnull().sum())


Missing Values:
Chapter_Number       0
Chapter_English      0
Chapter_Arabic       0
Section_Number       0
Section_English      0
Section_Arabic       0
Hadith_number        0
English_Hadith       0
English_Isnad        0
English_Matn         0
Arabic_Hadith        0
Arabic_Isnad         0
Arabic_Matn          1
Arabic_Comment     389
English_Grade       20
Arabic_Grade         0
dtype: int64


<div class="alert alert-block alert-warning">
    <h4>Clean the Dataset</h4>
</div>

In [7]:
def clean_text(text):
    # removing special characters, digits, etc.
    if isinstance(text, str):
        text = re.sub(r'[^A-Za-z\s]', '', text)
        text = text.lower()
    else:
        text = ''
    return text

In [8]:
all_hadiths = []

for book_filename in book_filenames:
    data = pd.read_csv(book_filename, names=colnames, skiprows=1)
    
    data['cleaned_english_hadith'] = data['English_Hadith'].apply(lambda x: str(x) if pd.notnull(x) else "")
    
    all_hadiths.extend(data['cleaned_english_hadith'].tolist())

In [9]:
hadith_df = pd.DataFrame({
    'Hadith': all_hadiths
})

<div class="alert alert-block alert-warning">
    <h4>Dropping empty rows from the Dataset</h4>
</div>

In [10]:
hadith_df = hadith_df[hadith_df['Hadith'] != ""]

<div class="alert alert-block alert-success">
    <h4>Saving Cleaned Dataset</h4>
</div>

In [11]:
hadith_df.to_csv('cleaned_hadith_data.csv', index=False)

<div class="alert alert-block alert-info">
    <h1>Vectorization and Embedding Generation</h1>
</div>

<div class="alert alert-block alert-success">
    <h4>Loading Cleaned Dataset</h4>
</div>

In [12]:
hadith_df = pd.read_csv('cleaned_hadith_data.csv')

<div class="alert alert-block alert-warning">
    <h4>Initialize Sentence Transformer (LLM Model)</h4>
    <p>SentenceTransformer makes it easy to generate embeddings for sentences, paragraphs, or entire documents using pre-trained transformer models like BERT, RoBERTa, and others. These embeddings are dense vector representations that capture the semantic meaning of the text, allowing for tasks like similarity search, clustering, classification, and more.</p>
</div>

In [13]:
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')



<div class="alert alert-block alert-warning">
    <h4>Generating Embeddings</h4>
</div>

In [14]:
embeddings = model.encode(hadith_df['Hadith'].tolist())

In [15]:
embeddings = np.array(embeddings)

<div class="alert alert-block alert-success">
    <h4>Saving Embeddings</h4>
</div>

In [16]:
np.save('hadith_embeddings.npy', embeddings)

<div class="alert alert-block alert-info">
    <h1>Retrieval System Implementation</h1>
</div>

<div class="alert alert-block alert-warning">
    <h4>Storing Embeddings in a Vector Database (using FAISS)</h4>
    <p>FAISS (Facebook AI Similarity Search) is an open-source library developed by Facebook AI Research that is designed for efficient similarity search and clustering of dense vectors. It's particularly useful for large-scale datasets where you need to find the most similar items to a given query.</p>
</div>

<div class="alert alert-block alert-success">
    <h4>Loading Embeddings</h4>
</div>

In [17]:
embeddings = np.load('hadith_embeddings.npy')

<div class="alert alert-block alert-warning">
    <h4>Initializing FAISS</h4>
</div>

In [18]:
d = embeddings.shape[1]       # dimension of embeddings
index = faiss.IndexFlatL2(d)  # L2 distance index (Euclidean, it is used to calculate the similarity between two vectors)

In [19]:
index.add(embeddings)

<div class="alert alert-block alert-success">
    <h4>Saving FAISS</h4>
</div>

In [20]:
faiss.write_index(index, 'hadith_faiss.index')

<div class="alert alert-block alert-warning">
    <h4>Querying the Retrieval System</h4>
</div>

In [21]:
def retrieve_similar_hadiths(query, model, index, hadith_df, k=5):
    # Preprocess and embed the query
    query_embedding = model.encode([clean_text(query)])

    # Search the FAISS index for similar Hadiths
    distances, indices = index.search(query_embedding, k)

    # Retrieve and print the top-k Hadiths
    for i in range(k):
        print(f"Hadith {i+1}:")
        print(hadith_df['Hadith'].iloc[indices[0][i]])
        print(f"Distance: {distances[0][i]}\n")

<div class="alert alert-block alert-success">
    <h4>Loading FAISS</h4>
</div>

In [22]:
index = faiss.read_index('hadith_faiss.index')

<div class="alert alert-block alert-info">
    <h1>Testing</h1>
</div>

In [23]:
query = "Al-Mahdi"
retrieve_similar_hadiths(query, model, index, hadith_df)

Hadith 1:
Narrated AbuSa'id al-Khudri: The Prophet (ﷺ) said: The Mahdi will be of my stock, and will have a broad forehead a prominent nose. He will fill the earth will equity and justice as it was filled with oppression and tyranny, and he will rule for seven years.
Distance: 41.363380432128906

Hadith 2:
It was narrated that ‘Aishah said: “I could not find him, meaning the Prophet (ﷺ), and he was in Al-Baqi’. He said: “As-salamu ‘alaykum dara qawmin mu’minin. Antum lana faratun wa inna bikum lahiqun. Allahumma la tahrimna ajrahum wa la taftinna ba’dahum. (Peace be upon you, O abode of believing people. You have gone ahead of us and verily we will join you soon. O Allah, do not deprive us of their reward and do not put us to trial after them).”
Distance: 41.64872360229492

Hadith 3:
Narrated AbuUmamah: Abu Sa’id (Al Khudri) reported The Prophet(ﷺ) was asked “Which believers are most perfect in respect of faith? He replied “A man who strives in the path of Allaah with his life and prop

In [24]:
query = "Fasting in Ramadan"
retrieve_similar_hadiths(query, model, index, hadith_df)

Hadith 1:
Narrated Abu Ayyub: The Prophet (ﷺ) as saying: If anyone fasts during Ramadan, then follows it with six days in Shawwal, it will be like a perpetual fast.
Distance: 28.837890625

Hadith 2:
It was narrated from Abu Hurairah that the Messenger of Allah (ﷺ) said: ‘Do not anticipate Ramadan by fasting one or two days before, except for a man who has a habitual pattern of fasting, in which case let him fast.”
Distance: 30.951108932495117

Hadith 3:
Abu Ayub narrated that : the Messenger of Allah said: "Whoever fasts Ramadan, then follows it with six from Shawwal, then that is (equal in reward) to fasting everyday."
Distance: 34.41267395019531

Hadith 4:
Narrated Abdullah ibn Abbas: The Prophet (ﷺ) said: Do not fast one day or two days just before Ramadan except in the case of a man who has been in the habit or observing a fast (on that day); and do not fast until you sight it (the moon). Then fast until you sight it. If a cloud appears on that day (i.e. 29th of Ramadan) then compl

In [25]:
query = "End of time"
retrieve_similar_hadiths(query, model, index, hadith_df)

Hadith 1:
Abu Huraira reported Allah's Messenger (ﷺ) having said: The time would draw close to the Last Hour and knowledge would decrease. The rest of the hadith is the same.
Distance: 45.721771240234375

Hadith 2:
Narrated AbuHurayrah: The Prophet (ﷺ) said: A time is certainly coming to mankind when only the receiver of usury will remain, and if he does not receive it, some of its vapour will reach him. Ibn Isa said: Some of its dust will reach him.
Distance: 48.00752258300781

Hadith 3:
Abu Hurairah narrated that : Allah's Messenger said: "Indeed for (the time of) Salat (there is a) beginning and an end. The beginning of the time for the Zuhr prayer is when the sun passes the zenith, and the end of its time is when the time for Asr enters. The beginning of the time for the Asr [prayer] is when its time enters, and the end of its time is when the sun yellows (turns pale). The beginning of the time of Maghrib is when the sun as set, and the end of its time is when the twilight has vani