In [32]:
import json
import minsearch
import openai
import pandas as pd
from openai import OpenAI
from tqdm.auto import tqdm

In [33]:
# Load the Bible JSON file; its "verses" entry is extracted in a later cell.
# JSON text is UTF-8, so decode it explicitly instead of relying on the
# platform/locale default encoding of open().
with open('/root/practice/logos/data/kjv.json', 'r', encoding='utf-8') as f:
    bible_json = json.load(f)

In [34]:
# Load the video records that are later concatenated with the Bible verses.
# JSON text is UTF-8, so decode it explicitly instead of relying on the
# platform/locale default encoding of open().
with open('/root/practice/logos/data/video_data.json', 'r', encoding='utf-8') as f:
    video_json = json.load(f)

In [35]:
# Keep only the verse records stored under the "verses" key of the loaded JSON.
bible = bible_json["verses"]

In [36]:
# Normalise every verse record so that all of its field values are strings.
# Records are updated in place; the keys themselves are left untouched.
for verse_record in bible:
    verse_record.update(
        {field: str(raw_value) for field, raw_value in verse_record.items()}
    )

In [37]:
# Concatenate the Bible verse records and the video records into one corpus.
documents = bible + video_json

### Create embeddings using pretrained models

In [38]:
from sentence_transformers import SentenceTransformer

In [39]:
# Load the pre-trained sentence-transformers model used to embed text below.
model = SentenceTransformer("all-mpnet-base-v2")



In [40]:
# Sanity check: embed a short string and inspect the resulting vector.
model.encode("HEY")

array([ 3.62563431e-02,  3.65621261e-02, -2.75592525e-02, -5.64602902e-03,
        9.52188298e-03,  5.61916828e-03,  1.50828315e-02,  2.79092207e-03,
       -8.40722024e-03,  4.92982147e-03,  3.10310517e-02, -4.11591940e-02,
       -1.81176197e-02,  6.07926995e-02,  2.99968254e-02, -1.08458489e-01,
        1.44992340e-02, -1.65472571e-02,  3.42858844e-02,  2.41216514e-02,
       -1.55948335e-02, -3.41906622e-02,  2.34584557e-03,  3.93656082e-02,
       -3.60202417e-03, -3.58743034e-03, -1.08050834e-03,  5.03031304e-03,
        6.96088048e-03,  2.01435145e-02, -4.55798721e-03, -1.08006755e-02,
        4.39347550e-02, -7.64079113e-03,  1.95554003e-06, -1.20934723e-02,
        1.27174305e-02,  3.25570218e-02,  3.39521617e-02,  5.23817316e-02,
        4.45942171e-02, -2.14427356e-02, -1.66741535e-02,  5.31283347e-03,
       -6.47456897e-03, -3.01942118e-02,  2.21300144e-02, -3.18748169e-02,
       -5.36219813e-02, -1.45198945e-02, -8.48753378e-03, -2.42418572e-02,
       -2.12789653e-03,  

In [41]:
# Peek at one record of the combined corpus to confirm its fields.
documents[1]

{'book_name': 'Genesis',
 'book': '1',
 'chapter': '1',
 'verse': '2',
 'text': 'And the earth was without form, and void; and darkness [was] upon the face of the deep. And the Spirit of God moved upon the face of the waters.'}

In [43]:
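# One-off step: create a dense vector for every document using the pre-trained model.
# Left commented out because the result is saved to embedding.json and reloaded below.
#operations = []
#for doc in tqdm(documents):
    # Transform the document text into an embedding using the model
    #doc["text_vector"] = model.encode(doc["text"]).tolist()
    #operations.append(doc)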

In [26]:
#with open("embedding.json", "w") as f_out:
    #json.dump(operations, f_out)

In [44]:
# Load the documents to index from embedding.json (presumably the records with
# their precomputed "text_vector", as written by the commented-out cells above).
with open("embedding.json", 'r') as f_in:
    operations = json.load(f_in)

### Set up the Elasticsearch connection

In [56]:
from elasticsearch import Elasticsearch
# Create the client for the Elasticsearch node at http://localhost:9200.
es_client = Elasticsearch('http://localhost:9200') 

# Request the cluster info; a successful response confirms the connection.
es_client.info()

ObjectApiResponse({'name': '852a9717a70c', 'cluster_name': 'docker-cluster', 'cluster_uuid': '559pgzE7Tv60N3ZoBVmHfA', 'version': {'number': '8.4.3', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '42f05b9372a9a4a470db3b52817899b99a76ee73', 'build_date': '2022-10-04T07:17:24.662462378Z', 'build_snapshot': False, 'lucene_version': '9.3.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

### Create mappings and index

In [48]:
index_name = "semantic_db"

# Field mappings carried only by Bible verse documents.
bible_properties = {
    "book": {"type": "text"},
    "book_name": {"type": "keyword"},
    "chapter": {"type": "text"},
    "verse": {"type": "text"},
}

# Field mappings carried only by YouTube transcript documents.
video_properties = {
    "video_id": {"type": "keyword"},
    "title": {"type": "text"},
    "publish_date": {"type": "date"},
    "author": {"type": "text"},
}

# Field mappings shared by both document types: the raw text and its embedding.
shared_properties = {
    "text": {"type": "text"},
    "text_vector": {
        "type": "dense_vector",
        "dims": 768,
        "index": True,
        "similarity": "cosine",
    },
}

# Single-shard, no-replica index whose mapping is the union of the three groups
# (merged in the order Bible, video, shared).
index_settings = {
    "settings": {"number_of_shards": 1, "number_of_replicas": 0},
    "mappings": {
        "properties": {
            **bible_properties,
            **video_properties,
            **shared_properties,
        }
    },
}

# Drop any existing index with this name, then create it from scratch.
es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'semantic_db'})

### Add documents into index

In [50]:
# Index all documents through the bulk helper instead of issuing one HTTP
# request per document: the helper batches the actions into chunks, which is
# far faster for a corpus of this size.
from elasticsearch import helpers

# Stay best-effort like the original per-document try/except: failures are
# collected rather than raised, then summarised so they cannot go unnoticed.
indexed_count, index_errors = helpers.bulk(
    es_client,
    ({"_index": index_name, "_source": doc} for doc in tqdm(operations)),
    raise_on_error=False,
    raise_on_exception=False,
)

print(f"Indexed {indexed_count} documents, {len(index_errors)} failed")
# Show only the first few failures to keep the cell output readable.
for failure in index_errors[:10]:
    print(failure)

  0%|          | 0/36755 [00:00<?, ?it/s]

### Create end user query

In [72]:
# Example end-user question and its embedding, reused by the searches below.
search_term = "who is daniel"
vector_search_term = model.encode(search_term)

### Keyword search only

In [73]:
def elastic_search(query):
    """Run a keyword-only search and return the `_source` of the top 5 hits.

    Two `multi_match` clauses are combined in a `bool.should`, so a document
    may match either one: the first targets the Bible fields (with `text`
    boosted by 4), the second targets the video transcript fields.

    Uses the module-level `es_client` and `index_name`.

    Args:
        query: The free-text search string.

    Returns:
        A list with the `_source` dict of each hit, in the order returned.
    """

    def best_fields_clause(fields):
        # Both clauses differ only in the list of fields they search.
        return {
            "multi_match": {
                "query": query,
                "fields": fields,
                "type": "best_fields",
            }
        }

    bible_fields = ["text^4", "book_name", "chapter", "verse", "book"]
    video_fields = ["title", "text", "author"]

    search_query = {
        "size": 5,
        "query": {
            "bool": {
                "should": [
                    best_fields_clause(bible_fields),
                    best_fields_clause(video_fields),
                ]
            }
        },
    }

    response = es_client.search(index=index_name, body=search_query)
    return [hit['_source'] for hit in response['hits']['hits']]

In [74]:
# Run the keyword-only search for the example question.
elastic_search(search_term)

[{'book_name': 'Daniel',
  'book': '27',
  'chapter': '6',
  'verse': '20',
  'text': 'And when he came to the den, he cried with a lamentable voice unto Daniel: [and] the king spake and said to Daniel, O Daniel, servant of the living God, is thy God, whom thou servest continually, able to deliver thee from the lions?',
  'text_vector': [0.014248613268136978,
   0.019800670444965363,
   -0.03871258348226547,
   -0.015385773964226246,
   -0.015388746745884418,
   0.02548498474061489,
   -0.02031332440674305,
   0.01975187472999096,
   -0.0034762851428240538,
   -0.04139573872089386,
   -0.005555352196097374,
   0.00043049140367656946,
   0.02800014242529869,
   0.00334215653128922,
   0.06177111715078354,
   -0.020387260243296623,
   0.003735283622518182,
   0.027448555454611778,
   0.02116377092897892,
   0.006825804244726896,
   -0.01685667596757412,
   0.014767714776098728,
   -0.0110285934060812,
   0.00949306134134531,
   0.011786223389208317,
   -0.0024676076136529446,
   -0.01054

### Semantic search only

In [75]:
def semantic_search(es_client, index_name, embedding_vector):
    """Run a kNN-only search on `text_vector` and return the hits' `_source`.

    Args:
        es_client: Client object exposing `search(index=..., body=...)`.
        index_name: Name of the index to query.
        embedding_vector: Query embedding compared against `text_vector`.

    Returns:
        A list with the `_source` dict of each hit (at most 10), in the order
        returned.
    """
    # Ask for the 10 nearest neighbours out of a pool of 1000 candidates.
    knn_clause = dict(
        field="text_vector",
        query_vector=embedding_vector,
        k=10,
        num_candidates=1000,
    )
    response = es_client.search(
        index=index_name,
        body={"size": 10, "knn": knn_clause},
    )

    result_docs = []
    for hit in response['hits']['hits']:
        result_docs.append(hit['_source'])
    return result_docs

In [76]:
# Run the kNN-only search with the embedded question against "semantic_db".
semantic_search(es_client, "semantic_db", vector_search_term)

[{'book_name': 'Ezekiel',
  'book': '26',
  'chapter': '28',
  'verse': '3',
  'text': 'Behold, thou [art] wiser than Daniel; there is no secret that they can hide from thee:',
  'text_vector': [-0.0028161518275737762,
   0.08924423903226852,
   -0.05437847971916199,
   0.03044683113694191,
   -0.03309536352753639,
   0.006054593715816736,
   0.016378888860344887,
   0.04821370169520378,
   0.005410484503954649,
   -0.026547841727733612,
   -0.026491373777389526,
   0.023054596036672592,
   -0.013191018253564835,
   -0.05193816497921944,
   0.06266587227582932,
   0.024481117725372314,
   0.04216930642724037,
   -0.015519686974585056,
   0.014638707973062992,
   0.011665662750601768,
   -0.015018047764897346,
   -0.013453107327222824,
   0.01441267877817154,
   -0.012344010174274445,
   0.06017716974020004,
   -0.011281323619186878,
   -0.005609339103102684,
   0.0167611725628376,
   -0.004907980561256409,
   -0.07369940727949142,
   -0.02966362237930298,
   -0.03477355092763901,
   0.

### Semantic search plus keyword search

In [79]:
def semantic_search(es_client, index_name, query_text, embedding_vector):
    # Hybrid search query with script_score
    search_query = {
        "size": 10,  # Limit the number of results
        "query": {
            "bool": {
                "must": [
                    {
                        "multi_match": {
                            "query": query_text,
                            "fields": [
                                "text^4",        # Boost relevance of 'text' field (common field)
                                "book_name",     # Bible-specific field
                                "chapter",       # Bible-specific field
                                "verse",         # Bible-specific field
                                "title",         # YouTube-specific field
                                "author"         # YouTube-specific field
                            ],
                            "type": "best_fields"
                        }
                    }
                ],
                # Use script_score to combine vector similarity with BM25 keyword matching
                "should": [
                    {
                        "script_score": {
                            "query": {
                                "match_all": {}  # Apply score script to all documents
                            },
                            "script": {
                                "source": """
                                // Cosine similarity for the dense vector
                                double cosine_similarity = dotProduct(params.query_vector, 'text_vector') / (cosineSimilarity(params.query_vector, 'text_vector') + 1.0);
                                
                                // Combine cosine similarity with BM25 score (default _score)
                                return _score + cosine_similarity * params.vector_weight;
                                """,
                                "params": {
                                    "query_vector": embedding_vector,  # The embedding vector
                                    "vector_weight": 0.5  # Adjust the weight of vector similarity (0-1)
                                }
                            }
                        }
                    }
                ]
            }
        }
    }

    # Execute the search query
    response = es_client.search(index=index_name, body=search_query)

    # Collect and return the results from the hits
    result_docs = [hit['_source'] for hit in response['hits']['hits']]
    return result_docs

In [80]:
# Run the hybrid search with both the raw question and its embedding.
semantic_search(es_client, "semantic_db", search_term, vector_search_term)

[{'book_name': 'Daniel',
  'book': '27',
  'chapter': '6',
  'verse': '20',
  'text': 'And when he came to the den, he cried with a lamentable voice unto Daniel: [and] the king spake and said to Daniel, O Daniel, servant of the living God, is thy God, whom thou servest continually, able to deliver thee from the lions?',
  'text_vector': [0.014248613268136978,
   0.019800670444965363,
   -0.03871258348226547,
   -0.015385773964226246,
   -0.015388746745884418,
   0.02548498474061489,
   -0.02031332440674305,
   0.01975187472999096,
   -0.0034762851428240538,
   -0.04139573872089386,
   -0.005555352196097374,
   0.00043049140367656946,
   0.02800014242529869,
   0.00334215653128922,
   0.06177111715078354,
   -0.020387260243296623,
   0.003735283622518182,
   0.027448555454611778,
   0.02116377092897892,
   0.006825804244726896,
   -0.01685667596757412,
   0.014767714776098728,
   -0.0110285934060812,
   0.00949306134134531,
   0.011786223389208317,
   -0.0024676076136529446,
   -0.01054