In [194]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModel
import torch

In [195]:
# Load the pre-trained tokenizer and encoder used to build dense embeddings.
# `tokenizer` and `model` are module-level globals that get_dense_embedding() reads.
model_name = "bert-base-uncased"  # Checkpoint identifier; another pre-trained model can be substituted here
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)



In [196]:
def get_dense_embedding(text):
    """Embed ``text`` as a dense vector using the module-level tokenizer and model.

    The input is tokenized (truncated to at most 512 tokens), passed through
    the model with gradient tracking disabled, and the last hidden state is
    averaged over the token axis.

    Args:
        text: Input handed to the tokenizer (a single string in this notebook).

    Returns:
        numpy.ndarray: The mean-pooled last hidden state with all size-1
        dimensions squeezed out.
    """
    encoded = tokenizer(text, truncation=True, max_length=512, return_tensors="pt")
    # Inference only, so skip autograd bookkeeping.
    with torch.no_grad():
        last_hidden = model(**encoded).last_hidden_state
    # Average across the token axis (dim=1), then drop singleton dimensions.
    pooled = last_hidden.mean(dim=1)
    return pooled.squeeze().numpy()

In [197]:
def get_sparse_representation_tfidf(corpus, text, vocab_size=512):
    """Return the TF-IDF vector of ``text`` under a vocabulary fitted on ``corpus``.

    A fresh ``TfidfVectorizer`` limited to ``vocab_size`` features is fitted on
    ``corpus`` on every call, so two vectors are only comparable when they were
    produced with the same ``corpus`` and ``vocab_size``. Callers that need
    vectors for many texts pay the fitting cost once per call.

    Args:
        corpus: Iterable of document strings used to learn the vocabulary and
            IDF weights.
        text: The string to vectorize.
        vocab_size: Maximum vocabulary size and the fixed length of the
            returned vector.

    Returns:
        numpy.ndarray: 1-D array of length ``vocab_size``; positions beyond the
        learned vocabulary are zero-padded.
    """
    vectorizer = TfidfVectorizer(max_features=vocab_size)
    # Only the fitted vocabulary/IDF weights are needed here, not the corpus matrix.
    vectorizer.fit(corpus)
    text_vector = vectorizer.transform([text]).toarray().ravel()

    # max_features caps the vocabulary at vocab_size, so the vector can only be
    # shorter than the target length, never longer; pad the tail with zeros.
    missing = vocab_size - text_vector.shape[0]
    if missing > 0:
        text_vector = np.pad(text_vector, (0, missing), mode="constant")

    return text_vector

In [198]:
def compute_similarity_tfidf(question, corpus, dense_weight=0.6, sparse_weight=0.4, vocab_size=512):
    """Rank ``corpus`` documents against ``question`` with a hybrid dense/sparse score.

    Each document receives ``dense_weight * dense + sparse_weight * sparse``,
    where ``dense`` is the cosine similarity between the embeddings returned by
    ``get_dense_embedding`` and ``sparse`` is the cosine similarity between
    TF-IDF vectors built from a vocabulary fitted on ``corpus``.

    The TF-IDF vectorizer is fitted once for the whole call rather than once
    per document, and all sparse similarities are computed in a single
    ``cosine_similarity`` call.

    Args:
        question: Query string.
        corpus: Iterable of document strings to rank.
        dense_weight: Weight applied to the dense (embedding) similarity.
        sparse_weight: Weight applied to the sparse (TF-IDF) similarity.
        vocab_size: Maximum number of TF-IDF features.

    Returns:
        list[tuple[str, float]]: ``(document, combined_score)`` pairs sorted by
        score in descending order. An empty corpus yields an empty list.
    """
    corpus = list(corpus)
    if not corpus:
        # Nothing to rank; also avoids fitting a vectorizer on no documents.
        return []

    # Sparse side: fit the vocabulary once, then score every document at once.
    # Zero-padding to a fixed length is unnecessary because extra zero
    # components do not change cosine similarity.
    vectorizer = TfidfVectorizer(max_features=vocab_size)
    corpus_sparse = vectorizer.fit_transform(corpus)
    question_sparse = vectorizer.transform([question])
    sparse_scores = cosine_similarity(question_sparse, corpus_sparse)[0]

    # Dense side: embed the question once and reuse it for every document.
    question_dense = get_dense_embedding(question).reshape(1, -1)

    scores = []
    for doc, sparse_score in zip(corpus, sparse_scores):
        doc_dense = get_dense_embedding(doc).reshape(1, -1)
        dense_score = cosine_similarity(question_dense, doc_dense)[0][0]

        # Hybrid score: weighted sum of dense and sparse similarities.
        combined_score = dense_weight * dense_score + sparse_weight * sparse_score
        scores.append((doc, combined_score))

    # Highest combined score first; ties keep corpus order (stable sort).
    return sorted(scores, key=lambda pair: pair[1], reverse=True)

In [199]:
# Example usage: rank a small toy corpus of unrelated sentences against a short query.
corpus = [
    "The ancient tree stood tall, witnessing countless seasons change, its leaves whispering secrets of the past to the wind.",
    "In the kitchen, the chef prepared a gourmet meal with fresh ingredients, creating an explosion of flavors that delighted the senses.",
    "A young girl dreams of exploring distant planets one day, imagining vibrant landscapes and alien creatures far beyond the stars.",
    "The artist painted a beautiful landscape on canvas, capturing the vibrant colors of the sunset reflecting on the serene lake.",
    "A curious cat watched the birds from the window sill, its eyes wide with fascination as they chirped and flew around.",
    "The teacher explained complex math problems to eager students, using engaging examples and visual aids to make learning enjoyable and effective.",
    "In a quiet library, a scholar researched ancient texts, delving into forgotten knowledge that shaped the foundations of modern thought.",
    "The sun set behind the mountains, painting the sky orange and purple, as the world prepared for the peaceful night ahead.",
    "A musician strummed his guitar, creating a soothing melody that echoed through the room, captivating everyone with its heartfelt notes.",
    "The dog barked excitedly as the mailman approached, wagging its tail vigorously, ready to greet him with playful enthusiasm."
]
question = "Who dreams"  # Query to match against the corpus

# Returns (document, combined_score) pairs, highest score first.
sorted_documents = compute_similarity_tfidf(question, corpus)
print(sorted_documents)

[('A young girl dreams of exploring distant planets one day, imagining vibrant landscapes and alien creatures far beyond the stars.', 0.34985727816148393), ('In a quiet library, a scholar researched ancient texts, delving into forgotten knowledge that shaped the foundations of modern thought.', 0.23992466926574707), ('The ancient tree stood tall, witnessing countless seasons change, its leaves whispering secrets of the past to the wind.', 0.2341342806816101), ('The sun set behind the mountains, painting the sky orange and purple, as the world prepared for the peaceful night ahead.', 0.2119516611099243), ('The teacher explained complex math problems to eager students, using engaging examples and visual aids to make learning enjoyable and effective.', 0.19984111189842224), ('A curious cat watched the birds from the window sill, its eyes wide with fascination as they chirped and flew around.', 0.18599746227264405), ('The dog barked excitedly as the mailman approached, wagging its tail vig