In [4]:
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

import math

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

from scipy import spatial

from sentence_transformers import SentenceTransformer
from sentence_transformers import  util

import scipy.spatial

import pandas as pd
import numpy as np


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/abdulwahabmac/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [5]:


from sklearn.metrics import average_precision_score

# Load the pre-trained model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Example documents
documents = [
    "Machine learning is fascinating.",
    "Artificial Intelligence and machine learning are closely related.",
    "I love programming in Python.",
    "Natural Language Processing is a subset of AI.",
    "Python is a versatile programming language."
]
arabic_documents_sentences = [
    "التعليم في المملكة العربية السعودية يشهد تطورًا كبيرًا.",
    "توجد العديد من الجامعات المرموقة في السعودية.",
    "الطلاب في السعودية يحصلون على تعليم عالي الجودة.",
    "تشجع الحكومة السعودية على الابتكار في التعليم.",
    "التكنولوجيا تلعب دورًا مهمًا في التعليم السعودي.",
    "هناك العديد من المنح الدراسية المتاحة للطلاب في السعودية.",
    "التعليم الإلكتروني أصبح شائعًا في المملكة.",
    "تقدم المدارس في السعودية مناهج متنوعة وشاملة.",
    "التعليم الفني والمهنى يحظى بأهمية كبيرة في السعودية.",
    "اللغة الإنجليزية تُدرّس بشكل مكثف في المدارس السعودية."
]



In [6]:

# Encode the documents using SentenceTransformers
document_embeddings = model.encode(documents, convert_to_tensor=True)

# Create a TF-IDF Vectorizer and fit on the documents
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(documents)


In [7]:

def preprocess_query(query):
    # Tokenization, stop word removal, and stemming
    stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer()
    tokens = query.lower().split()
    filtered_tokens = [stemmer.stem(word) for word in tokens if word not in stop_words]
    return ' '.join(filtered_tokens)


In [8]:

def hybrid_search(query, top_k=3, weight_st=0.5, weight_tfidf=0.5):
    # Encode the query using SentenceTransformers
    query_preprocessed = preprocess_query(query)
    query_embedding = model.encode([query_preprocessed], convert_to_tensor=True)

    # Compute cosine similarities between the query and the documents using SentenceTransformers
    cosine_scores = util.pytorch_cos_sim(query_embedding, document_embeddings)[0]

    # Transform the query using the TF-IDF vectorizer
    query_tfidf = tfidf_vectorizer.transform([query_preprocessed])

    # Calculate cosine similarity between the query and the documents using TF-IDF
    tfidf_similarities = cosine_similarity(query_tfidf, tfidf_matrix)[0]
    #print(tfidf_similarities)

    # Combine the scores with weights
    combined_scores = (weight_st * cosine_scores.cpu().numpy() + weight_tfidf * tfidf_similarities) / (weight_st + weight_tfidf)
    #print(combined_scores)

    # Normalize scores (optional)
    combined_scores = (combined_scores - np.min(combined_scores)) / (np.max(combined_scores) - np.min(combined_scores))

    # Find the top_k highest scores
    top_results = np.argpartition(-combined_scores, range(top_k))[:top_k]
    #print(top_results)

    print(f"Query: {query}\nTop {top_k} most similar documents:")
    for idx in top_results:
        print(f"Document: {documents[idx]}, Combined Score: {combined_scores[idx]:.4f}")

    return top_results, combined_scores



In [9]:

# Example usage with evaluation:
while True:
    user_query = input("Enter your query (or type 'exit' to quit): ")
    if user_query.lower() == 'exit':
        break
    hybrid_search(user_query)

    predicted_ranks, _ = hybrid_search(user_query)


Enter your query (or type 'exit' to quit):  love


Query: love
Top 3 most similar documents:
Document: I love programming in Python., Combined Score: 1.0000
Document: Machine learning is fascinating., Combined Score: 0.1012
Document: Artificial Intelligence and machine learning are closely related., Combined Score: 0.0150
Query: love
Top 3 most similar documents:
Document: I love programming in Python., Combined Score: 1.0000
Document: Machine learning is fascinating., Combined Score: 0.1012
Document: Artificial Intelligence and machine learning are closely related., Combined Score: 0.0150


Enter your query (or type 'exit' to quit):  AI


Query: AI
Top 3 most similar documents:
Document: Natural Language Processing is a subset of AI., Combined Score: 1.0000
Document: Artificial Intelligence and machine learning are closely related., Combined Score: 0.3703
Document: Machine learning is fascinating., Combined Score: 0.2793
Query: AI
Top 3 most similar documents:
Document: Natural Language Processing is a subset of AI., Combined Score: 1.0000
Document: Artificial Intelligence and machine learning are closely related., Combined Score: 0.3703
Document: Machine learning is fascinating., Combined Score: 0.2793


Enter your query (or type 'exit' to quit):  EXIT
