In [2]:
import json
from elasticsearch import Elasticsearch
import pandas as pd
from tqdm.auto  import tqdm
from sentence_transformers import SentenceTransformer

In [3]:
# Sentence-embedding model shared by every cell that encodes text in this
# notebook (both the documents being indexed and the evaluation questions).
embedded_model = SentenceTransformer("all-mpnet-base-v2")

In [4]:
# Client for the Elasticsearch node expected to be listening on localhost:9200.
e_client = Elasticsearch('http://localhost:9200')

# Displaying the cluster info doubles as a connectivity check.
e_client.info()

ObjectApiResponse({'name': '699e94d444a1', 'cluster_name': 'docker-cluster', 'cluster_uuid': '4No4U7IcRFSxM5CuiGnr_g', 'version': {'number': '8.4.3', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '42f05b9372a9a4a470db3b52817899b99a76ee73', 'build_date': '2022-10-04T07:17:24.662462378Z', 'build_snapshot': False, 'lucene_version': '9.3.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

In [5]:
# Index that elastic_search_knn queries; the function reads this global at call
# time. NOTE: the index-creation cell further down rebinds this name, so which
# index gets searched depends on which cells have already been executed.
index_name =  "cloud-comparative-guide"

In [6]:
def elastic_search_knn(field, vector, k=5, num_candidates=10000, index=None):
    """Run a kNN search against Elasticsearch and return the matching documents.

    Args:
        field: Name of the vector field to search on.
        vector: Query embedding. Objects exposing ``tolist()`` (e.g. numpy
            arrays) are converted to a plain list before being sent.
        k: Number of nearest neighbours to return. Defaults to 5, the value
            that was previously hard-coded.
        num_candidates: Value sent as the kNN ``num_candidates`` option.
            Defaults to 10000, the value that was previously hard-coded.
        index: Index to query. When ``None`` (the default) the module-level
            ``index_name`` is looked up at call time, exactly as before.

    Returns:
        A list with the ``_source`` dict of each hit, in the order
        Elasticsearch returned the hits.
    """
    # A plain list keeps the request body JSON-serialisable without relying on
    # the client's handling of array objects.
    query_vector = vector.tolist() if hasattr(vector, "tolist") else vector

    knn = {
        "field": field,
        "query_vector": query_vector,
        "k": k,
        "num_candidates": num_candidates,
    }

    search_query = {
        "knn": knn,
        # Only the fields needed downstream; in particular 'Id' is what the
        # evaluation compares against the ground-truth document id.
        "_source": [
            "Service_Type",
            "Link_to_Documentation",
            "Google_Cloud_Product",
            "Google_Cloud_Product_Description",
            "AWS_Offering",
            'Azure_Offering',
            'Id',
        ],
    }

    # Fall back to the notebook-level global so existing two-argument callers
    # keep their behaviour.
    target_index = index_name if index is None else index

    es_results = e_client.search(
        index=target_index,
        body=search_query,
    )

    return [hit['_source'] for hit in es_results['hits']['hits']]

In [16]:
def vector_knn(q):
    """Embed the record's 'question' text and kNN-search the 'Text_Vector' field.

    Args:
        q: Mapping that contains a 'question' entry.

    Returns:
        Whatever ``elastic_search_knn`` returns for that embedding.
    """
    query_embedding = embedded_model.encode(q['question'])
    return elastic_search_knn('Text_Vector', query_embedding)

In [8]:
# Load the ground-truth CSV and turn it into a list with one dict per row,
# which is the shape the evaluation loop iterates over.
df_ground_truth = pd.read_csv('../data/ground-truth-data.csv')
ground_truth = df_ground_truth.to_dict(orient='records')

In [9]:
# Inspect the first ground-truth record to see which keys are available.
ground_truth[0]

{'question': 'What are the key differences in the zero trust security features between Chrome Enterprise Premium and similar offerings from AWS and Azure?',
 'Google_Cloud_Product': 'Chrome Enterprise Premium',
 'document_id': '40b23873859451847af0143acb81838c'}

In [17]:
# Try the retrieval path on a single question before running the full evaluation.
vector_knn(ground_truth[0])

[{'Google_Cloud_Product_Description': 'Run more secure and compliant workloads on Google Cloud.',
  'Link_to_Documentation': 'https://cloud.google.com/assured-workloads',
  'Google_Cloud_Product': 'Assured Workloads',
  'Azure_Offering': 'Azure Government',
  'Service_Type': 'Regulated services',
  'Id': '8c86f3a3e52325cc7b9e23dbecf3a9b9',
  'AWS_Offering': 'AWS GovCloud'},
 {'Google_Cloud_Product_Description': 'Use global network firewall policies and regional network firewall policies to achieve a zero-trust network posture using a fully-distributed, cloud-first firewall service.',
  'Link_to_Documentation': 'https://cloud.google.com/firewall/docs/about-firewalls',
  'Google_Cloud_Product': 'Cloud Next Generation Firewall Essentials',
  'Azure_Offering': 'Azure Firewall',
  'Service_Type': 'Firewall',
  'Id': 'a05690af1e904d4c8032a3c0369ca58d',
  'AWS_Offering': 'AWS Network Firewall'},
 {'Google_Cloud_Product_Description': 'Provide data sovereignty controls for Google Cloud workload

In [18]:
def hit_rate(relevance_total):
    """Fraction of queries whose result list contains at least one relevant hit.

    Args:
        relevance_total: Sequence with one entry per query; each entry is a
            sequence of booleans telling whether the result at that position
            is the expected document.

    Returns:
        A float in [0, 1]. An empty ``relevance_total`` yields 0.0 instead of
        raising ZeroDivisionError.
    """
    total = len(relevance_total)
    if total == 0:
        # No queries evaluated: report 0.0 rather than dividing by zero.
        return 0.0

    hits = sum(1 for line in relevance_total if True in line)
    return hits / total

In [19]:
def mrr(relevance_total):
    """Mean reciprocal rank of the first relevant result of each query.

    Args:
        relevance_total: Sequence with one entry per query; each entry is a
            sequence of booleans telling whether the result at that position
            is the expected document.

    Returns:
        A float in [0, 1]. Queries without any relevant result contribute 0.
        An empty ``relevance_total`` yields 0.0 instead of raising
        ZeroDivisionError.
    """
    total = len(relevance_total)
    if total == 0:
        # No queries evaluated: report 0.0 rather than dividing by zero.
        return 0.0

    total_score = 0.0
    for line in relevance_total:
        for rank, is_relevant in enumerate(line, start=1):
            # Equality test (not truthiness) mirrors the `True in line`
            # membership check used by hit_rate.
            if is_relevant == True:
                total_score += 1 / rank
                # Only the first relevant result counts; without this break a
                # query with several matches would add several reciprocal
                # ranks and could push the score above 1.
                break

    return total_score / total

In [22]:
def evaluate(ground_truth,search_function):
    """Score a search function against the ground-truth records.

    For every record the search function is called with the record itself and
    each returned document's 'Id' is compared with the record's 'document_id'.

    Args:
        ground_truth: Iterable of records, each providing 'document_id' plus
            whatever ``search_function`` needs.
        search_function: Callable taking one record and returning an iterable
            of documents that each carry an 'Id' entry.

    Returns:
        Dict with the keys 'hit_rate' and 'mrr'.
    """
    def relevance_flags(record):
        # Read the expected id before searching, then flag each result.
        expected_id = record['document_id']
        return [found['Id'] == expected_id for found in search_function(record)]

    per_query_relevance = [relevance_flags(record) for record in tqdm(ground_truth)]

    return {
        'hit_rate': hit_rate(per_query_relevance),
        'mrr': mrr(per_query_relevance),
    }

In [23]:
# Score retrieval on the 'Text_Vector' field over every ground-truth question.
evaluate(ground_truth,vector_knn)

100%|██████████| 1101/1101 [00:34<00:00, 31.46it/s]


{'hit_rate': 0.5431425976385105, 'mrr': 0.4016197396306383}

In [43]:
# NOTE: this rebinds the global `index_name` that elastic_search_knn reads at
# call time, so every search executed after this cell targets the new index.
index_name = "comparative-guide-vector"

# Take the vector size from the loaded model instead of hard-coding it, so the
# mapping cannot drift out of sync with the embeddings if the model changes.
embedding_dims = embedded_model.get_sentence_embedding_dimension()

index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "Service_Category": {"type": "text"},
            "Service_Type": {"type": "text"},
            "Link_to_Documentation": {"type": "text"},
            "Google_Cloud_Product": {"type": "text"},
            "Google_Cloud_Product_Description": {"type": "text"},
            "AWS_Offering": {"type": "text"},
            "Azure_Offering": {"type": "text"},
            "Id": {"type": "keyword"},
            # Indexed dense vector compared with cosine similarity; this is the
            # field the kNN evaluation searches.
            "General_Vector": {"type": "dense_vector","dims": embedding_dims,"index": True,"similarity":"cosine"},
        }
    }
}

# Drop any previous copy first so re-running the cell starts from an empty index.
e_client.indices.delete(index=index_name,ignore_unavailable=True)
e_client.indices.create(index=index_name,body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'comparative-guide-vector'})

In [44]:
# Load the source documents. The encoding is given explicitly so decoding does
# not depend on the platform's default locale.
with open("../data/documents_id.json","r",encoding="utf-8") as d_in:
    documents = json.load(d_in)

# Show the first record to see which fields are available.
documents[0]

{'Service_Category': 'Security & identity',
 'Service_Type': 'Zero trust & secure enterprise browser',
 'Link_to_Documentation': 'https://chromeenterprise.google/products/chrome-enterprise-premium/',
 'Google_Cloud_Product': 'Chrome Enterprise Premium',
 'Google_Cloud_Product_Description': 'Enable secure access to critical applications and services, with integrated threat and data protection.',
 'AWS_Offering': '',
 'Azure_Offering': '',
 'Id': '40b23873859451847af0143acb81838c'}

In [45]:
# Fields whose text is concatenated (space-separated, in this order) to build
# the string that gets embedded for each document.
embedding_fields = (
    "Service_Type",
    "Link_to_Documentation",
    "Google_Cloud_Product",
    "Google_Cloud_Product_Description",
    "AWS_Offering",
    'Azure_Offering',
)

for doc in tqdm(documents):
    qt = ' '.join(doc[field_name] for field_name in embedding_fields)
    # Stored on the document itself so it is indexed together with the text fields.
    doc['General_Vector'] = embedded_model.encode(qt)

100%|██████████| 221/221 [00:04<00:00, 48.06it/s]


In [46]:
# Confirm the record now carries the 'General_Vector' embedding added above.
documents[0]

{'Service_Category': 'Security & identity',
 'Service_Type': 'Zero trust & secure enterprise browser',
 'Link_to_Documentation': 'https://chromeenterprise.google/products/chrome-enterprise-premium/',
 'Google_Cloud_Product': 'Chrome Enterprise Premium',
 'Google_Cloud_Product_Description': 'Enable secure access to critical applications and services, with integrated threat and data protection.',
 'AWS_Offering': '',
 'Azure_Offering': '',
 'Id': '40b23873859451847af0143acb81838c',
 'General_Vector': array([ 1.07001308e-02, -3.17839696e-03,  1.25289215e-02, -1.29329525e-02,
        -2.30601858e-02,  1.33294938e-02,  8.82174149e-02,  2.68973932e-02,
         8.15159306e-02, -1.91053525e-02, -5.18506318e-02, -1.36517389e-02,
         1.54298358e-02,  5.90137839e-02,  1.97247732e-02,  1.69134941e-02,
         1.40672959e-02, -2.67079286e-02,  7.02679679e-02, -8.64985632e-04,
        -1.61981639e-02, -1.98052488e-02, -3.95962261e-02, -2.81337667e-02,
         5.79113327e-02, -5.14879525e-02,

In [47]:
for doc in tqdm(documents):
    e_client.index(index=index_name,document=doc)

# Refresh explicitly so the documents just indexed are visible to the searches
# that follow; otherwise an evaluation started right away (e.g. Run All) could
# query the index before the documents are searchable. The trailing semicolon
# keeps the response out of the cell output.
e_client.indices.refresh(index=index_name);

100%|██████████| 221/221 [00:00<00:00, 363.67it/s]


In [48]:
def question_vector_knn(q):
    """Embed the record's 'question' text and kNN-search the 'General_Vector' field.

    Args:
        q: Mapping that contains a 'question' entry.

    Returns:
        Whatever ``elastic_search_knn`` returns for that embedding.
    """
    return elastic_search_knn(
        'General_Vector',
        embedded_model.encode(q['question']),
    )

In [49]:
# Score retrieval on the 'General_Vector' field over every ground-truth question.
evaluate(ground_truth,question_vector_knn)

100%|██████████| 1101/1101 [00:32<00:00, 34.28it/s]


{'hit_rate': 0.9009990917347865, 'mrr': 0.7837874659400551}