In [108]:
import json

with open('documents_with_ids.json', 'rt') as f_in:
    documents = json.load(f_in)

In [109]:
from elasticsearch import Elasticsearch
es_client = Elasticsearch("http://localhost:9200")

In [110]:
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": { 
        "properties": {
            # Fields for Bible verses
            "book": { "type": "text" },  # Bible-specific field
            "book_name": { "type": "keyword" },  # Bible-specific field
            "chapter": { "type": "text" },  # Bible-specific field
            "verse": { "type": "text" },  # Bible-specific field
            
            # Fields for YouTube transcripts
            "video_id": { "type": "keyword" },  # Video-specific field
            "title": { "type": "text" },  # Video-specific field
            "publish_date": { "type": "date" },  # Video-specific field
            "author": { "type": "text" },  # Video-specific field

            # Common field for both types of documents
            "text": { "type": "text" }  # Both Bible verses and video transcripts share this
        }
    }
}

index_name = "text_retrival"

# Create the index
es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'text_retrival'})

In [112]:
for doc in tqdm(documents):
    # Ensure each document has the expected structure for indexing
    if 'book_name' in doc:  # It's a Bible verse
        # Prepare for indexing
        es_client.index(index=index_name, document=doc)
    elif 'video_id' in doc:  # It's a YouTube transcript
        # Prepare for indexing
        es_client.index(index=index_name, document=doc)

  0%|          | 0/36755 [00:00<?, ?it/s]

In [113]:
def elastic_search(query):
    search_query = {
        "size": 5,
        "query": {
            "bool": {
                "should": [  # Use 'should' to match either condition
                    {
                        "multi_match": {
                            "query": query,
                            "fields": ["text^4", "book_name", "chapter", "verse", "book"],
                            "type": "best_fields"
                        }
                    },
                    {
                        "multi_match": {
                            "query": query,
                            "fields": ["title", "text", "author"],  # Fields for video transcripts
                            "type": "best_fields"
                        }
                    }
                ]
            }
        }
    }

    # Execute the search query
    response = es_client.search(index=index_name, body=search_query)

    result_docs = []
    
    # Collect the results from the hits
    for hit in response['hits']['hits']:
        result_docs.append(hit['_source'])

    return result_docs

In [None]:
elastic_search("who is esther")

In [114]:
import pandas as pd
from tqdm.auto import tqdm

In [115]:
questions = pd.read_csv('questions.csv')

In [116]:
question_dict = questions.to_dict(orient='records')

In [117]:
relevance_total = []

for q in tqdm(question_dict):
    doc_id = q['document']
    results = elastic_search(query=q['question'])
    relevance = [d['id'] == doc_id for d in results]
    relevance_total.append(relevance)

  0%|          | 0/5500 [00:00<?, ?it/s]

In [120]:

def hit_rate(relevance_total):
    cnt = 0

    for line in relevance_total:
        if True in line:
            cnt = cnt + 1

    return cnt / len(relevance_total)

In [121]:
def mrr(relevance_total):
    total_score = 0.0

    for line in relevance_total:
        for rank in range(len(line)):
            if line[rank] == True:
                total_score = total_score + 1 / (rank + 1)

    return total_score / len(relevance_total)

In [122]:
hit_rate(relevance_total), mrr(relevance_total)

(0.09909090909090909, 0.11408484848484857)

### Try minisearch

In [128]:
import minsearch

index = minsearch.Index(
    text_fields = ["book", "chapter", "verse", "text"],
    keyword_fields = ["book_name"]
)
for document in documents:
    for key, value in document.items():
        document[key] = str(value)
        
        
index.fit(documents)

<minsearch.Index at 0x7fef06f7cfe0>

In [129]:
def minsearch_search(query):
    boost = {"text": 5.0, "book" : 3.0}

    results = index.search(
        query = query,
        boost_dict = boost,
        num_results = 10
    )
    
    return results

In [130]:
relevance_total = []

for q in tqdm(question_dict):
    doc_id = q['document']
    results = minsearch_search(query=q['question'])
    relevance = [d['id'] == doc_id for d in results]
    relevance_total.append(relevance)

  0%|          | 0/5500 [00:00<?, ?it/s]

In [131]:
def evaluate(question_dict, search_function):
    relevance_total = []

    for q in tqdm(question_dict):
        doc_id = q['document']
        results = search_function(q)
        relevance = [d['id'] == doc_id for d in results]
        relevance_total.append(relevance)

    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total),
    }

In [132]:
evaluate(question_dict, lambda q: elastic_search(q['question']))

  0%|          | 0/5500 [00:00<?, ?it/s]

{'hit_rate': 0.09909090909090909, 'mrr': 0.11408484848484857}

In [133]:
evaluate(question_dict, lambda q: minsearch_search(q['question']))

  0%|          | 0/5500 [00:00<?, ?it/s]

{'hit_rate': 0.1910909090909091, 'mrr': 0.12470209235209237}