In [1]:
import json

import minsearch
import openai
import pandas as pd
from elasticsearch import helpers
from openai import OpenAI
from tqdm.auto import tqdm

### Create embeddings using pretrained models

In [2]:
from sentence_transformers import SentenceTransformer

In [3]:
# Sentence-embedding model used in this notebook for both the documents and the queries.
# NOTE(review): the index mapping declares `text_vector` with dims=768, so the
# model's output size has to match it — confirm if the model name is ever changed.
model = SentenceTransformer("all-mpnet-base-v2")



In [4]:
# Load the document records that will be embedded and indexed below.
# The encoding is pinned to UTF-8 so decoding does not depend on the platform's
# locale default.
with open("documents_with_ids.json", "r", encoding="utf-8") as f_in:
    documents_id = json.load(f_in)

### Setup ElasticSearch connection

In [5]:
from elasticsearch import Elasticsearch
# Client for the Elasticsearch node at localhost:9200; reused by the indexing
# and search cells below.
es_client = Elasticsearch('http://localhost:9200') 

# Last expression of the cell: shows the cluster info returned by the server.
es_client.info()

ObjectApiResponse({'name': '37b2288c9e17', 'cluster_name': 'docker-cluster', 'cluster_uuid': 'srJ-aaLQQHCyqLZPfWZkpw', 'version': {'number': '8.4.3', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '42f05b9372a9a4a470db3b52817899b99a76ee73', 'build_date': '2022-10-04T07:17:24.662462378Z', 'build_snapshot': False, 'lucene_version': '9.3.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

### Create mappings and index

In [6]:
# Settings and mappings for the index: one shard, no replicas.
# The same index stores two kinds of documents (Bible verses and YouTube
# transcripts); both share the `text` field and its embedding `text_vector`.
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": { 
        "properties": {
            # Fields for Bible verses
            "book": { "type": "text" },  # Bible-specific field
            "book_name": { "type": "keyword" },  # Bible-specific field
            "chapter": { "type": "text" },  # Bible-specific field
            "verse": { "type": "text" },  # Bible-specific field
            
            # Fields for YouTube transcripts
            "video_id": { "type": "keyword" },  # Video-specific field
            "title": { "type": "text" },  # Video-specific field
            "publish_date": { "type": "date" },  # Video-specific field
            "author": { "type": "text" },  # Video-specific field

            # Common field for both types of documents
            "text": { "type": "text" },  # Both Bible verses and video transcripts share this
            # Indexed 768-dimensional vector compared with cosine similarity (used by the kNN searches)
            "text_vector": {"type": "dense_vector", "dims": 768, "index": True, "similarity": "cosine"}
        }
    }
}

# Name of the index used by every indexing and search cell below.
index_name = "vector_db"

# Create the index.
# Any existing index with this name is deleted first (a missing index is
# ignored), so re-running this cell starts from an empty index.

es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'vector_db'})

### Add documents into index

In [7]:
# Embed every document's text and attach the vector under `text_vector`
# (the dense_vector field declared in the index mapping).
# All texts are handed to `model.encode` as one list so the model can batch them
# internally, instead of running one encode call per document.
texts = [doc['text'] for doc in documents_id]
text_vectors = model.encode(texts, show_progress_bar=True)

# NOTE: this mutates the loaded documents in place; each document receives the
# vector at its own position in the encoded output.
for doc, vector in zip(documents_id, text_vectors):
    doc['text_vector'] = vector

  0%|          | 0/36755 [00:00<?, ?it/s]

In [8]:
# Index all documents through the bulk helper, which sends them in chunked bulk
# requests instead of issuing one HTTP request per document.
# No `_id` is supplied, so Elasticsearch assigns the document ids, as it does
# for `es_client.index` called without an id.
helpers.bulk(
    es_client,
    ({"_index": index_name, "_source": doc} for doc in tqdm(documents_id)),
)

  0%|          | 0/36755 [00:00<?, ?it/s]

### Create end user query

In [9]:
# Example free-text query and its embedding, produced by the same model that
# embeds the documents.
# NOTE(review): neither variable is referenced by the later cells visible in
# this notebook.
search_term = "who is daniel"
vector_search_term = model.encode(search_term)

### Define the evaluation metric function

In [10]:
def hit_rate(relevance_total):
    """Return the fraction of queries that have at least one relevant result.

    Args:
        relevance_total: One list per query, holding a boolean per returned
            result (``True`` where the result is the expected document).

    Returns:
        The number of per-query lists containing ``True`` divided by the number
        of queries, or ``0.0`` when ``relevance_total`` is empty.
    """
    # Guard: an empty evaluation set would otherwise raise ZeroDivisionError.
    if len(relevance_total) == 0:
        return 0.0

    hits = sum(1 for line in relevance_total if True in line)
    return hits / len(relevance_total)

In [11]:
def mrr(relevance_total):
    """Return the mean reciprocal rank (MRR) over all queries.

    For each query, the score is ``1 / rank`` of the first relevant result
    (ranks start at 1), or 0 when no result is relevant.

    Args:
        relevance_total: One list per query, holding a boolean per returned
            result in ranked order (``True`` where the result is the expected
            document).

    Returns:
        The average of the per-query reciprocal ranks, or ``0.0`` when
        ``relevance_total`` is empty.
    """
    # Guard: an empty evaluation set would otherwise raise ZeroDivisionError.
    if len(relevance_total) == 0:
        return 0.0

    total_score = 0.0

    for line in relevance_total:
        for rank, is_relevant in enumerate(line, start=1):
            # Equality (not truthiness) mirrors the `True in line` test in hit_rate.
            if is_relevant == True:
                total_score += 1 / rank
                # Only the first relevant hit counts; without this, a query with
                # several matching results would add several reciprocal ranks.
                break

    return total_score / len(relevance_total)

In [12]:
def evaluate(question_dict, search_function):
    """Score a search function against ground-truth questions.

    Args:
        question_dict: Iterable of ground-truth records; each record must have
            a ``'document'`` key holding the id of the expected document.
        search_function: Callable that takes one record and returns the ranked
            results, each of which exposes an ``'id'`` key.

    Returns:
        Dict with the ``'hit_rate'`` and ``'mrr'`` computed over all records.
    """
    relevance_total = []

    # Iterate the argument, not the notebook-level `questions_dict`: the
    # function must score whatever set of questions the caller passes in.
    for q in tqdm(question_dict):
        doc_id = q['document']
        results = search_function(q)
        # One boolean per result, in ranked order.
        relevance = [d['id'] == doc_id for d in results]
        relevance_total.append(relevance)

    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total),
    }

### Load the questions dataset

In [13]:
import pandas as pd

# Ground-truth questions. Later cells read a `question` and a `document` field
# from each row.
questions = pd.read_csv('questions.csv')

In [14]:
# One dict per CSV row; this is the list passed to `evaluate` below.
questions_dict = questions.to_dict(orient='records')

In [15]:
# NOTE(review): `evaluate` is also defined in an earlier cell of this notebook;
# this later definition is the one in effect. Keep only one of them.
def evaluate(question_dict, search_function):
    """Score a search function against ground-truth questions.

    Args:
        question_dict: Iterable of ground-truth records; each record must have
            a ``'document'`` key holding the id of the expected document.
        search_function: Callable that takes one record and returns the ranked
            results, each of which exposes an ``'id'`` key.

    Returns:
        Dict with the ``'hit_rate'`` and ``'mrr'`` computed over all records.
    """
    relevance_total = []

    # Iterate the argument, not the notebook-level `questions_dict`: the
    # function must score whatever set of questions the caller passes in.
    for q in tqdm(question_dict):
        doc_id = q['document']
        results = search_function(q)
        # One boolean per result, in ranked order.
        relevance = [d['id'] == doc_id for d in results]
        relevance_total.append(relevance)

    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total),
    }

### Semantic search only

In [18]:
def semantic_search(es_client, index_name, embedding_vector):
    """Run a kNN search on the ``text_vector`` field and return the stored documents.

    Args:
        es_client: Client object exposing ``search(index=..., body=...)``.
        index_name: Name of the index to query.
        embedding_vector: Query embedding compared against ``text_vector``.

    Returns:
        List holding the ``_source`` of every hit, in the order the hits appear
        in the response.
    """
    # kNN clause: keep the 10 nearest neighbours out of 1000 candidates.
    knn_clause = {
        "field": "text_vector",
        "query_vector": embedding_vector,
        "k": 10,
        "num_candidates": 1000,
    }

    response = es_client.search(
        index=index_name,
        body={"size": 10, "knn": knn_clause},
    )

    # Keep only the stored document of each hit.
    result_docs = []
    for hit in response['hits']['hits']:
        result_docs.append(hit['_source'])
    return result_docs

In [19]:
def question_vector_knn(q):
    """Adapter for ``evaluate``: embed the record's question and run the kNN search.

    Args:
        q: Ground-truth record with a ``'question'`` key.

    Returns:
        Whatever ``semantic_search`` returns for the embedded question.
    """
    query_embedding = model.encode(q['question'])
    return semantic_search(es_client, index_name, query_embedding)

In [20]:
# Score the kNN-only retriever over the question set (reports hit rate and MRR).
evaluate(questions_dict, question_vector_knn)

  0%|          | 0/5500 [00:00<?, ?it/s]

{'hit_rate': 0.25327272727272726, 'mrr': 0.1530637085137085}

### Retrieval with LangChain

In [68]:
from langchain.embeddings import SentenceTransformerEmbeddings
from typing import Dict
from langchain_elasticsearch import ElasticsearchRetriever

In [69]:
# Elasticsearch endpoint handed to the LangChain retriever (same URL that
# `es_client` was created with).
es_url = 'http://localhost:9200'

In [74]:
# LangChain embeddings wrapper used to embed queries for the hybrid search.
# NOTE(review): presumably the same checkpoint as `model` ("all-mpnet-base-v2"),
# so query vectors are comparable with the indexed `text_vector` — confirm.
embeddings = SentenceTransformerEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")

  embeddings = SentenceTransformerEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")


In [100]:
# Sample query for a one-off hybrid search run.
query = "who is john chapter 1 vs 1"

In [93]:
# Retrievers already built, keyed by (Elasticsearch URL, index name).
# Re-running this cell resets the cache.
_HYBRID_RETRIEVERS = {}


def _hybrid_query(search_query: str) -> Dict:
    """Build the Elasticsearch request body for a hybrid (keyword + kNN) search."""
    # Generate the embedding vector for the query
    embedding_vector = embeddings.embed_query(search_query)

    return {
        # Limit the number of results.
        # NOTE: `semantic_search` requests 10 hits, so metrics from the two
        # search functions are computed over result lists of different length.
        "size": 5,
        "query": {
            "bool": {
                # Keyword-based search
                "must": {
                    "multi_match": {
                        "query": search_query,
                        "fields": ["text^4", "book_name", "chapter", "verse", "title", "author"],  # Bible and video fields
                        "type": "best_fields",
                        "boost": 0.5
                    }
                }
            }
        },
        # Semantic search using k-nearest neighbors (KNN)
        "knn": {
            "field": "text_vector",  # Field that stores the dense vectors for semantic search
            "query_vector": embedding_vector,  # The query vector (embedding)
            "k": 5,  # Number of nearest neighbors to retrieve
            "num_candidates": 10000,  # Number of candidate vectors to consider
            "boost": 0.5,  # Adjust the weight of vector similarity
        }
    }


def elastic_search_hybrid(query):
    """Run a hybrid (keyword + kNN) search through LangChain's Elasticsearch retriever.

    Args:
        query: Free-text search string.

    Returns:
        List with the ``_source`` of every document returned by the retriever,
        in the retriever's order.
    """
    # Build the retriever once per (url, index) and reuse it afterwards.
    # Creating one on every call would set up a fresh Elasticsearch connection
    # for each of the thousands of evaluation queries.
    cache_key = (es_url, index_name)
    if cache_key not in _HYBRID_RETRIEVERS:
        _HYBRID_RETRIEVERS[cache_key] = ElasticsearchRetriever.from_es_params(
            index_name=index_name,
            body_func=_hybrid_query,
            content_field='text',  # The field to retrieve text content from
            url=es_url,  # The Elasticsearch URL endpoint
        )

    hybrid_results = _HYBRID_RETRIEVERS[cache_key].invoke(query)

    # The raw hit is kept in each document's metadata; return only the stored source.
    return [hit.metadata['_source'] for hit in hybrid_results]

In [101]:
# Run the sample query through the hybrid search function.
# No notebook-level `hybrid_retriever` is defined, so the search function is
# called instead of invoking a retriever object directly.
hybrid_results = elastic_search_hybrid(query)

## Hybrid search evaluation

In [98]:
def question_text_hybrid(q):
    """Adapter for ``evaluate``: run the hybrid search on the record's question text.

    Args:
        q: Ground-truth record with a ``'question'`` key.

    Returns:
        Whatever ``elastic_search_hybrid`` returns for that question.
    """
    question_text = q['question']
    return elastic_search_hybrid(question_text)

In [99]:
# Score the hybrid (keyword + kNN) retriever over the same question set.
evaluate(questions_dict, question_text_hybrid)

  0%|          | 0/5500 [00:00<?, ?it/s]

{'hit_rate': 0.11018181818181819, 'mrr': 0.0825818181818181}