### Setting up Elasticsearch

In [6]:
# Create the client used by the rest of the notebook; it targets the node
# at http://localhost:9200.
from elasticsearch import Elasticsearch
es = Elasticsearch("http://localhost:9200")
# Last expression of the cell: shows the server's info response (name,
# cluster and version details), which requires a reachable node.
es.info()

{'name': 'aba709d5764b',
 'cluster_name': 'docker-cluster',
 'cluster_uuid': 'aq-DUWdOT0mjoUCmqP0GZQ',
 'version': {'number': '8.7.0',
  'build_flavor': 'default',
  'build_type': 'docker',
  'build_hash': '09520b59b6bc1057340b55750186466ea715e30e',
  'build_date': '2023-03-27T16:31:09.816451435Z',
  'build_snapshot': False,
  'lucene_version': '9.5.0',
  'minimum_wire_compatibility_version': '7.17.0',
  'minimum_index_compatibility_version': '7.0.0'},
 'tagline': 'You Know, for Search'}

### Creating an index

In [7]:
# Create the index that will hold each passage, its metadata and its embedding.

index_name = "passage_embeddings_idx"
index_mapping = {
    "mappings": {
        "dynamic": "true",
        "properties": {
            # The field name must be "passage": that is the key the indexed
            # documents carry and the field the retrieval query matches on.
            # A differently named property would leave this explicit mapping
            # unused and the real field to dynamic mapping.
            "passage": {
                "type": "text"
            },
            "metadata": {
                "type": "object"
            },
            "embedding": {
                "type": "dense_vector",
                "dims": 768
            }
        }
    }
}

# Drop any previous copy so re-running this cell always starts from an empty index.
if es.indices.exists(index=index_name):
    es.indices.delete(index=index_name)

# Last expression of the cell: the creation acknowledgement is displayed.
es.indices.create(index=index_name, body=index_mapping)

{'acknowledged': True,
 'shards_acknowledged': True,
 'index': 'passage_embeddings_idx'}

### Creating documents for the passage_embeddings_idx index

In [None]:
# Path to the fixed CSV file
import csv
import json

fixed_csv_file = "./docs/passage_metadata_emb.csv"

# newline="" hands raw line endings to the csv module, as its documentation
# requires; without it, newlines embedded in quoted passage text can be
# altered before the reader sees them.
with open(fixed_csv_file, "r", encoding="utf-8", newline="") as file:
    csv_reader = csv.reader(file)
    # Skip the header row; the default of None keeps an empty file from raising.
    next(csv_reader, None)

    for row in csv_reader:
        # csv.reader yields an empty list for a blank line; unpacking that
        # into three names would raise ValueError, so skip it.
        if not row:
            continue

        # Each data row is expected to hold exactly three columns, in this order.
        Passage, Metadata, Embeddings = row

        # The metadata and embedding columns are stored as JSON text.
        metadata_dict = json.loads(Metadata)
        embeddings = json.loads(Embeddings)

        # Document keys match what the retrieval cell reads back from "_source".
        document = {
            "passage": Passage,
            "metadata": metadata_dict,
            "embedding": embeddings
        }

        # Index the document into Elasticsearch (one request per row).
        es.index(index=index_name, document=document)

# Refresh the index to make the data available for searching
es.indices.refresh(index=index_name)

# Last expression of the cell: display the document count of the index.
es.cat.count(index=index_name, format="json")



### Encoder for embeddings

In [8]:
from sentence_transformers import SentenceTransformer
# Name of the sentence-transformers model used to embed questions.
# NOTE(review): the index mapping declares 768-dimensional vectors; confirm
# this model produces embeddings of that size.
model_name = "all-mpnet-base-v2"
# Module-level encoder; retrieve_passages takes it as the default for its
# `model` parameter, so this cell must run before that function is defined.
model = SentenceTransformer(model_name)

### Retrieval of passages when queried

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
# Retrieve relevant passages
def retrieve_passages(index_name,question, top_k = 3, model = model):
    """Fetch the passages that best match a question and score them.

    Runs a full-text ``match`` query on the ``passage`` field of the given
    index, then scores each returned hit by the cosine similarity between
    the encoded question and the embedding stored with the hit.

    Args:
        index_name: Name of the Elasticsearch index to search.
        question: Question text; used both as the match query and as the
            input to the encoder.
        top_k: Maximum number of hits requested from Elasticsearch.
        model: Encoder exposing ``encode(text)``; defaults to the
            module-level ``model`` as it exists when this function is defined.

    Returns:
        A tuple ``(passages, relevance_scores, passage_metadata)`` of three
        parallel lists, in the order Elasticsearch returned the hits. The
        lists are not re-sorted by cosine similarity. All three are empty
        when the query matches nothing.
    """
    query = {
        "query": {
            "match": {
                "passage": question
            }
        }
    }

    results = es.search(index = index_name, body = query, size = top_k)

    question_embedding = model.encode(question)

    passages = []
    relevance_scores = []
    passage_metadata = []
    # Iterate over `results`, the response bound above; reading any other
    # name here would raise NameError on every call.
    for hit in results["hits"]["hits"]:
        source = hit["_source"]
        passages.append(source["passage"])
        # cosine_similarity takes 2-D inputs and returns a 1x1 matrix for a
        # single pair of vectors; [0][0] extracts the scalar score.
        relevance_scores.append(
            cosine_similarity([question_embedding], [source["embedding"]])[0][0]
        )
        passage_metadata.append(source["metadata"])

    return passages, relevance_scores, passage_metadata

# Run one example question through the retriever and tabulate what comes back.
question = "What is a valid offer?"
passages, relevance_scores, meta_data = retrieve_passages(index_name, question)

# One row per retrieved passage; the question is repeated on every row so the
# saved table is self-describing.
results_df = pd.DataFrame(
    {
        "Question": [question for _ in passages],
        "Passage": passages,
        "Relevance Scores": relevance_scores,
        "Passage Metadata": meta_data,
    }
)

# Persist the table, then leave the frame as the cell's last expression so it is displayed.
results_df.to_csv("./docs/question_answering.csv", index=False)
results_df