In [1]:
from pymongo import MongoClient
from sentence_transformers import SentenceTransformer
import numpy as np
from rank_bm25 import BM25Okapi
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import re
from bson.objectid import ObjectId

class HybridSearch:
    """Hybrid (dense + sparse) retrieval over the ``fullplot`` field of a MongoDB collection.

    Dense scores come from a Pinecone index queried with a sentence-transformer
    embedding of the query; sparse scores come from BM25 over the tokenized
    ``fullplot`` text. The two are blended with a weight ``alpha``.
    """

    def __init__(self, mongo_uri, pinecone_instance, model_name='all-MiniLM-L6-v2'):
        """Open the MongoDB collection, keep the Pinecone index and load the encoder.

        Args:
            mongo_uri: Connection string passed to ``MongoClient``.
            pinecone_instance: Object exposing ``upsert`` and ``query`` (a Pinecone index).
            model_name: Name handed to ``SentenceTransformer``.
        """
        # Initialize MongoDB connection
        self.mongo_client = MongoClient(mongo_uri)
        self.db = self.mongo_client['mytestdb']
        self.collection = self.db['collection']

        # Initialize Pinecone
        self.pinecone_index = pinecone_instance

        # Initialize sentence transformer model
        self.model = SentenceTransformer(model_name)

        # Download required NLTK data; quiet=True keeps the "already up-to-date"
        # log lines out of the notebook every time a searcher is constructed.
        nltk.download('punkt', quiet=True)
        nltk.download('stopwords', quiet=True)
        self.stop_words = set(stopwords.words('english'))

    @staticmethod
    def _has_text(doc):
        """Return True when ``doc`` carries a non-blank string in ``fullplot``."""
        text = doc.get('fullplot')
        return isinstance(text, str) and bool(text.strip())

    @staticmethod
    def _normalize_scores(scores):
        """Min-max scale ``scores`` to [0, 1].

        When every score is identical there is no ranking signal, so zeros are
        returned instead of dividing by zero (which would yield NaN).
        """
        scores = np.asarray(scores, dtype=float)
        if scores.size == 0:
            return scores
        lowest = scores.min()
        span = scores.max() - lowest
        if span == 0:
            return np.zeros_like(scores)
        return (scores - lowest) / span

    def preprocess_text(self, text):
        """Lowercase ``text``, strip punctuation, tokenize and drop English stopwords.

        Returns:
            A list of tokens; empty for ``None``, non-string or blank input.
        """
        if not isinstance(text, str) or not text.strip():
            return []
        # Convert to lowercase and remove special characters
        cleaned = re.sub(r'[^\w\s]', '', text.lower())
        # Tokenize, then remove stopwords
        return [token for token in word_tokenize(cleaned) if token not in self.stop_words]

    def _upsert_batch(self, docs):
        """Embed the ``fullplot`` of every doc in ``docs`` and upsert them in one request."""
        embeddings = self.model.encode([doc['fullplot'] for doc in docs])
        self.pinecone_index.upsert(
            vectors=[
                {
                    'id': str(doc['_id']),
                    'values': embedding.tolist(),
                    # The MongoDB _id is duplicated into metadata for lookups at query time.
                    'metadata': {'mongo_id': str(doc['_id'])},
                }
                for doc, embedding in zip(docs, embeddings)
            ]
        )

    def create_vector_embeddings(self, batch_size=100):
        """Create and store vector embeddings for all documents in Pinecone.

        Documents without usable ``fullplot`` text are skipped.

        Args:
            batch_size: Number of documents embedded and upserted per request
                (values below 1 are treated as 1).
        """
        batch_size = max(1, int(batch_size))
        pending = []
        for doc in self.collection.find({}, {'_id': 1, 'fullplot': 1}):
            if not self._has_text(doc):
                continue
            pending.append(doc)
            if len(pending) >= batch_size:
                self._upsert_batch(pending)
                pending = []
        if pending:
            self._upsert_batch(pending)

    def hybrid_search(self, query, top_k=5, alpha=0.5):
        """Perform hybrid search using both sparse and dense retrieval.

        Args:
            query: Free-text query.
            top_k: Maximum number of results; non-positive values return ``[]``.
            alpha: Weight for combining scores (0-1); higher values give more
                weight to dense retrieval.

        Returns:
            A list of ``(document, combined_score)`` tuples sorted by descending
            score, where ``document`` holds ``_id`` and ``fullplot``. Documents
            without ``fullplot`` text are not ranked.

        Note:
            Only the ``top_k`` Pinecone matches carry a dense score; every other
            document is scored with a dense contribution of 0.
        """
        if top_k <= 0:
            return []

        # Keep only documents that have text, so BM25 scores and documents
        # stay index-aligned.
        documents = [
            doc
            for doc in self.collection.find({}, {'_id': 1, 'fullplot': 1})
            if self._has_text(doc)
        ]
        if not documents:
            return []

        # Dense retrieval using Pinecone
        query_embedding = self.model.encode(query).tolist()
        dense_results = self.pinecone_index.query(
            vector=query_embedding,
            top_k=top_k,
            include_metadata=True
        )
        dense_scores = {}
        for match in dense_results.matches:
            metadata = match.metadata or {}
            # Fall back to the vector id, which create_vector_embeddings sets
            # to the same string as metadata['mongo_id'].
            dense_scores[metadata.get('mongo_id', match.id)] = match.score

        # Sparse retrieval using BM25
        corpus_tokens = [self.preprocess_text(doc['fullplot']) for doc in documents]
        if any(corpus_tokens):
            raw_scores = BM25Okapi(corpus_tokens).get_scores(self.preprocess_text(query))
            sparse_scores = self._normalize_scores(raw_scores)
        else:
            # Every document tokenized to nothing: BM25 has no signal to offer.
            sparse_scores = np.zeros(len(documents))

        # Combine scores using a weighted average
        scored = [
            (doc, float(alpha * dense_scores.get(str(doc['_id']), 0) + (1 - alpha) * sparse))
            for doc, sparse in zip(documents, sparse_scores)
        ]
        scored.sort(key=lambda pair: pair[1], reverse=True)
        return scored[:top_k]

    def search(self, query, top_k=5, alpha=0.5, verbose=True):
        """Run ``hybrid_search`` and return the complete matching documents.

        Args:
            query: Free-text query.
            top_k: Maximum number of results.
            alpha: Dense/sparse blending weight, forwarded to ``hybrid_search``.
            verbose: When True (default), print a formatted summary of each result.

        Returns:
            A list of full MongoDB documents, each with an added ``score`` key,
            ordered by descending score.
        """
        hybrid_results = self.hybrid_search(query, top_k, alpha)
        if not hybrid_results:
            return []

        complete_docs = []
        for doc, score in hybrid_results:
            # doc['_id'] comes straight from MongoDB, so it is used as-is.
            complete_doc = self.collection.find_one({"_id": doc['_id']})
            if complete_doc:
                complete_doc['score'] = score
                complete_docs.append(complete_doc)

        if verbose:
            for result in complete_docs:
                print(f"_id: {result['_id']}")
                print(f"chapter: {result.get('chapter', 'N/A')}")
                print(f"verse: {result.get('verse', 'N/A')}")
                print(f"speaker: {result.get('speaker', 'N/A')}")
                print(f"sanskrit: {result.get('sanskrit', 'N/A')}")
                print(f"translation: {result.get('translation', 'N/A')}")
                print(f"fullplot: {result.get('fullplot', 'N/A')}")
                print("=" * 50)

        return complete_docs


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Initialize the hybrid search
import os

from pinecone import Pinecone

# Secrets are read from the environment so they are never saved in the notebook.
# Any credential that has ever been pasted into a notebook cell should be rotated.
mongo_uri = os.environ["MONGO_URI"]
pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"])
index = pc.Index(
    host=os.environ.get(
        "PINECONE_INDEX_HOST",
        "https://mydb-j1b0j8k.svc.aped-4627-b74a.pinecone.io",
    )
)
hybrid_searcher = HybridSearch(mongo_uri, index)

# First, create vector embeddings for all documents (run this once)
hybrid_searcher.create_vector_embeddings()

# Perform hybrid search
query = "How does the Gita start?"
results = hybrid_searcher.search(query, top_k=5, alpha=0.6)

# Print results
for result in results:
    print(f"Score: {result['score']}")
    # A stored document may lack 'fullplot'; show an empty preview rather than raising.
    print(f"Text: {(result.get('fullplot') or '')[:200]}...")
    print("---")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


_id: 6777fca6e8210468ca2b1e0d
chapter: 1
verse: 1
speaker: धृतराष्ट्र
sanskrit: धर्मक्षेत्रे कुरुक्षेत्रे समवेता युयुत्सवः| मामकाः पाण्डवाश्चैव किमकुर्वत सञ्जय  || 1.1 || 
translation: Dhritarashtra said, "What did my people and the sons of Pandu do when they had assembled together, eager for battle, on the holy plain of Kurukshetra, O Sanjaya?"
questions: How does the Gita start?
fullplot: How does the Gita start? Dhritarashtra said, "What did my people and the sons of Pandu do when they had assembled together, eager for battle, on the holy plain of Kurukshetra, O Sanjaya?"
_id: 6777fca6e8210468ca2b1faf
chapter: 11
verse: 5
speaker: भगवान
sanskrit: पश्य मे पार्थ रूपाणि शतशोऽथ सहस्रशः| नानाविधानि दिव्यानि नानावर्णाकृतीनि च || 11.5 || 
translation: The Blessed Lord said, "Behold, O Arjuna, forms of Mine, by the hundreds and thousands, of different sorts, divine, and of various colors and shapes."
questions: Where in the Gita did Krishna start revealing his divine form to Arjuna?
fullplo

In [5]:

# Rebuild the searcher, repeat the opening-verse query and dump every field of each hit.
hybrid_searcher = HybridSearch(mongo_uri, index)
query = "How does the Gita start?"
results = hybrid_searcher.search(query, top_k=5, alpha=0.6)

# Print results
for hit in results:
    print(f"Score: {hit['score']}")
    print("Document:")
    for field in hit:
        print(f"{field}: {hit[field]}")
    print("---")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


_id: 6777fca6e8210468ca2b1e0d
chapter: 1
verse: 1
speaker: धृतराष्ट्र
sanskrit: धर्मक्षेत्रे कुरुक्षेत्रे समवेता युयुत्सवः| मामकाः पाण्डवाश्चैव किमकुर्वत सञ्जय  || 1.1 || 
translation: Dhritarashtra said, "What did my people and the sons of Pandu do when they had assembled together, eager for battle, on the holy plain of Kurukshetra, O Sanjaya?"
questions: How does the Gita start?
fullplot: How does the Gita start? Dhritarashtra said, "What did my people and the sons of Pandu do when they had assembled together, eager for battle, on the holy plain of Kurukshetra, O Sanjaya?"
_id: 6777fca6e8210468ca2b1faf
chapter: 11
verse: 5
speaker: भगवान
sanskrit: पश्य मे पार्थ रूपाणि शतशोऽथ सहस्रशः| नानाविधानि दिव्यानि नानावर्णाकृतीनि च || 11.5 || 
translation: The Blessed Lord said, "Behold, O Arjuna, forms of Mine, by the hundreds and thousands, of different sorts, divine, and of various colors and shapes."
questions: Where in the Gita did Krishna start revealing his divine form to Arjuna?
fullplo

In [6]:

# Ask a second question with the existing searcher and dump every field of each hit.
query = "When does Yoga commence? When should I start doing Yoga?"
results = hybrid_searcher.search(query, top_k=5, alpha=0.6)

# Print results
for document in results:
    print(f"Score: {document['score']}")
    print("Document:")
    for field_name, field_value in document.items():
        print(f"{field_name}: {field_value}")
    print("---")


_id: 677a563013819fdd8e18aa54
chapter: 1
verse: 1
speaker: N/A
sanskrit: अथ योगानुशासनम्
translation: Now, the teachings of yoga are presented since the student is ready to received these teachings.
questions: N/A
fullplot: When does Yoga commence? When should I start doing Yoga? Now, the teachings of yoga are presented since the student is ready to received these teachings.
_id: 6777fca6e8210468ca2b1ef9
chapter: 6
verse: 4
speaker: भगवान
sanskrit: यदा हि नेन्द्रियार्थेषु न कर्मस्वनुषज्जते| सर्वसङ्कल्पसंन्यासी योगारूढस्तदोच्यते || 6.4 || 
translation: When a person is not attached to the sense-objects or to actions, having renounced all thoughts, then they are said to have attained Yoga.
questions: when is a person said to be established in Yoga?
fullplot: when is a person said to be established in Yoga? When a person is not attached to the sense-objects or to actions, having renounced all thoughts, then they are said to have attained Yoga.
_id: 677a563013819fdd8e18aac3
chapter: 3
vers

In [43]:
# Display the current value of `results` as the cell's output.
# NOTE(review): this cell ran as In [43], out of sequence with In [6], and the
# output shown is for the Gita query rather than the Yoga query — confirm with
# Restart Kernel -> Run All.
results

[{'_id': ObjectId('6777fca6e8210468ca2b1e0d'),
  'chapter': 1,
  'verse': 1,
  'speaker': 'धृतराष्ट्र',
  'sanskrit': 'धर्मक्षेत्रे कुरुक्षेत्रे समवेता युयुत्सवः| मामकाः पाण्डवाश्चैव किमकुर्वत सञ्जय  || 1.1 || ',
  'translation': 'Dhritarashtra said, "What did my people and the sons of Pandu do when they had assembled together, eager for battle, on the holy plain of Kurukshetra, O Sanjaya?"',
  'questions': 'How does the Gita start?',
  'fullplot': 'How does the Gita start? Dhritarashtra said, "What did my people and the sons of Pandu do when they had assembled together, eager for battle, on the holy plain of Kurukshetra, O Sanjaya?"',
  'score': 0.8038516282},
 {'_id': ObjectId('6777fca6e8210468ca2b1faf'),
  'chapter': 11,
  'verse': 5,
  'speaker': 'भगवान',
  'sanskrit': 'पश्य मे पार्थ रूपाणि शतशोऽथ सहस्रशः| नानाविधानि दिव्यानि नानावर्णाकृतीनि च || 11.5 || ',
  'translation': 'The Blessed Lord said, "Behold, O Arjuna, forms of Mine, by the hundreds and thousands, of different sorts, 