In [1]:
import json
import minsearch
import openai
import pandas as pd
from openai import OpenAI
from tqdm.auto import tqdm

### Create embeddings using pretrained models

In [2]:
from sentence_transformers import SentenceTransformer

In [3]:
model = SentenceTransformer("all-mpnet-base-v2")



In [4]:
with open("documents_with_ids.json", 'r') as f_in:
    documents_id = json.load(f_in)

### Setup ElasticSearch connection

In [5]:
from elasticsearch import Elasticsearch
es_client = Elasticsearch('http://localhost:9200') 

es_client.info()

ObjectApiResponse({'name': '653f7fd556fb', 'cluster_name': 'docker-cluster', 'cluster_uuid': 'TjKlD42_S8a0B9n_pfvwLw', 'version': {'number': '8.4.3', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '42f05b9372a9a4a470db3b52817899b99a76ee73', 'build_date': '2022-10-04T07:17:24.662462378Z', 'build_snapshot': False, 'lucene_version': '9.3.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

### Create mappings and index

In [6]:
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": { 
        "properties": {
            # Fields for Bible verses
            "book": { "type": "text" },  # Bible-specific field
            "book_name": { "type": "keyword" },  # Bible-specific field
            "chapter": { "type": "text" },  # Bible-specific field
            "verse": { "type": "text" },  # Bible-specific field
            
            # Fields for YouTube transcripts
            "video_id": { "type": "keyword" },  # Video-specific field
            "title": { "type": "text" },  # Video-specific field
            "publish_date": { "type": "date" },  # Video-specific field
            "author": { "type": "text" },  # Video-specific field

            # Common field for both types of documents
            "text": { "type": "text" },  # Both Bible verses and video transcripts share this
            "text_vector": {"type": "dense_vector", "dims": 768, "index": True, "similarity": "cosine"}
        }
    }
}

index_name = "vector_db"

# Create the index

es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'vector_db'})

### Add documents into index

In [7]:
for doc in tqdm(documents_id):
    text = doc['text']

    doc['text_vector'] = model.encode(text)

  0%|          | 0/36755 [00:00<?, ?it/s]

In [8]:
for doc in tqdm(documents_id):
    es_client.index(index=index_name, document=doc)

  0%|          | 0/36755 [00:00<?, ?it/s]

### Create end user query

In [9]:
search_term = "who is daniel"
vector_search_term = model.encode(search_term)

### Define the evaluation metric function

In [10]:
def hit_rate(relevance_total):
    cnt = 0

    for line in relevance_total:
        if True in line:
            cnt = cnt + 1

    return cnt / len(relevance_total)

In [11]:
def mrr(relevance_total):
    total_score = 0.0

    for line in relevance_total:
        for rank in range(len(line)):
            if line[rank] == True:
                total_score = total_score + 1 / (rank + 1)

    return total_score / len(relevance_total)

In [12]:
def evaluate(question_dict, search_function):
    relevance_total = []

    for q in tqdm(questions_dict):
        doc_id = q['document']
        results = search_function(q)
        relevance = [d['id'] == doc_id for d in results]
        relevance_total.append(relevance)

    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total),
    }

### Load the questions dataset

In [13]:
import pandas as pd

questions = pd.read_csv('questions.csv')

In [14]:
questions_dict = questions.to_dict(orient='records')

In [15]:
def evaluate(question_dict, search_function):
    relevance_total = []

    for q in tqdm(questions_dict):
        doc_id = q['document']
        results = search_function(q)
        relevance = [d['id'] == doc_id for d in results]
        relevance_total.append(relevance)

    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total),
    }

### Just keyword search

In [16]:
def elastic_search(query):
    search_query = {
        "size": 5,
        "query": {
            "bool": {
                "should": [  # Use 'should' to match either condition
                    {
                        "multi_match": {
                            "query": query,
                            "fields": ["text^4", "book_name", "chapter", "verse", "book"],
                            "type": "best_fields"
                        }
                    },
                    {
                        "multi_match": {
                            "query": query,
                            "fields": ["title", "text", "author"],  # Fields for video transcripts
                            "type": "best_fields"
                        }
                    }
                ]
            }
        }
    }

    # Execute the search query
    response = es_client.search(index=index_name, body=search_query)

    result_docs = []
    
    # Collect the results from the hits
    for hit in response['hits']['hits']:
        result_docs.append(hit['_source'])

    return result_docs

In [None]:
evaluate(questions_dict, elastic_search)

### Semantic search only

In [17]:
def semantic_search(es_client, index_name, embedding_vector):
    # Construct the search query
    search_query = {
        "size": 10,  # Limit the number of results
        "knn": {
            "field": "text_vector",  # Field containing the dense vector
            "query_vector": embedding_vector,  # The query vector (embedding)
            "k": 10,  # Number of nearest neighbors to retrieve
            "num_candidates": 1000  # Candidate pool size for efficiency
        }
    }

    # Execute the search query
    response = es_client.search(index=index_name, body=search_query)

    # Collect and return the results from the hits
    result_docs = [hit['_source'] for hit in response['hits']['hits']]
    return result_docs

In [18]:
def question_vector_knn(q):
    question = q['question']

    embedding_vector = model.encode(question)

    return semantic_search(es_client, index_name, embedding_vector)

In [None]:
evaluate(questions_dict, question_vector_knn)

  0%|          | 0/5500 [00:00<?, ?it/s]

### Semantic search plus keyword search

In [21]:
def hybrid_search(es_client, index_name, query_text, embedding_vector):
    # Hybrid search query with script_score
    search_query = {
        "size": 10,  # Limit the number of results
        "query": {
            "bool": {
                "must": [
                    {
                        "multi_match": {
                            "query": query_text,
                            "fields": [
                                "text^4",        # Boost relevance of 'text' field (common field)
                                "book_name",     # Bible-specific field
                                "chapter",       # Bible-specific field
                                "verse",         # Bible-specific field
                                "title",         # YouTube-specific field
                                "author"         # YouTube-specific field
                            ],
                            "type": "best_fields"
                        }
                    }
                ],
                "should": [
                    {
                        "script_score": {
                            "query": {
                                "match_all": {}  # Apply score script to all documents
                            },
                            "script": {
                                "source": """
                                    double cosine_similarity = dotProduct(params.query_vector, 'text_vector') /
                                                              (Math.sqrt(dotProduct(params.query_vector, params.query_vector)) * 
                                                               Math.sqrt(dotProduct('text_vector', 'text_vector')) + 1e-10);
                                    return _score + cosine_similarity * params.vector_weight;
                                """,
                                "params": {
                                    "query_vector": embedding_vector,  # The embedding vector
                                    "vector_weight": 0.5  # Adjust the weight of vector similarity (0-1)
                                }
                            }
                        }
                    }
                ]
            }
        }
    }

    # Execute the search query
    try:
        response = es_client.search(index=index_name, body=search_query)
        # Collect and return the results from the hits
        result_docs = [hit['_source'] for hit in response['hits']['hits']]
        return result_docs
    except Exception as e:
        print(f"Error executing search: {e}")
        return []

In [22]:
def hybrid_vector_knn(q):
    question = q['question']

    embedding_vector = model.encode(question)

    return hybrid_search(es_client, index_name, q, embedding_vector)

In [23]:
evaluate(questions_dict, hybrid_vector_knn)

  0%|          | 0/5500 [00:00<?, ?it/s]

BadRequestError: BadRequestError(400, 'x_content_parse_exception', '[multi_match] unknown token [START_OBJECT] after [query]')