In [36]:
import json
from elasticsearch import Elasticsearch
import pandas as pd
import minsearch
from tqdm.auto  import tqdm
import random

In [2]:
# Load the document corpus from disk.
# JSON text is UTF-8 by specification, so decode it explicitly instead of
# relying on the platform's locale encoding.
with open("../data/documents_id.json", "r", encoding="utf-8") as d_in:
    documents = json.load(d_in)

# Display the first record as a quick sanity check of the schema.
documents[0]

{'Service_Category': 'Security & identity',
 'Service_Type': 'Zero trust & secure enterprise browser',
 'Link_to_Documentation': 'https://chromeenterprise.google/products/chrome-enterprise-premium/',
 'Google_Cloud_Product': 'Chrome Enterprise Premium',
 'Google_Cloud_Product_Description': 'Enable secure access to critical applications and services, with integrated threat and data protection.',
 'AWS_Offering': '',
 'Azure_Offering': '',
 'Id': '40b23873859451847af0143acb81838c'}

In [3]:
# Descriptive fields are indexed as free text; 'Id' is registered as a keyword field.
text_fields = [
    "Service_Category",
    "Service_Type",
    "Link_to_Documentation",
    "Google_Cloud_Product",
    "Google_Cloud_Product_Description",
    "AWS_Offering",
    "Azure_Offering",
]

index = minsearch.Index(text_fields=text_fields, keyword_fields=['Id'])

# fit() is the cell's last expression, so the value it returns is displayed.
index.fit(documents)

<minsearch.Index at 0x17a6ab710>

In [4]:
# Create a client for the Elasticsearch node at http://localhost:9200.
e_client = Elasticsearch('http://localhost:9200')

# Request the node's info; as the cell's last expression the response is displayed.
e_client.info()

ObjectApiResponse({'name': '699e94d444a1', 'cluster_name': 'docker-cluster', 'cluster_uuid': '4No4U7IcRFSxM5CuiGnr_g', 'version': {'number': '8.4.3', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '42f05b9372a9a4a470db3b52817899b99a76ee73', 'build_date': '2022-10-04T07:17:24.662462378Z', 'build_snapshot': False, 'lucene_version': '9.3.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

In [5]:
# Name of the Elasticsearch index that search() queries.
index_name =  "cloud-comparative-guide"

In [6]:
def search(search_engine, query, boost=None):
    """Return the top five documents matching ``query`` from the chosen engine.

    Args:
        search_engine: ``'minisearch'`` to query the in-memory ``index``, or
            ``'elastic_search'`` to query the ``index_name`` index through
            ``e_client``.
        query: Free-text query string.
        boost: Optional mapping of field name to boost weight. Only the
            ``'minisearch'`` branch uses it; the Elasticsearch branch applies
            its own fixed field boosts. ``None`` (the default) means no boosts.

    Returns:
        For ``'minisearch'``, whatever ``index.search`` returns. For
        ``'elastic_search'``, a list holding the ``_source`` dict of each hit.
        For any other engine name, an explanatory string (not a list), so
        callers that iterate over the result should pass a valid engine name.
    """
    if search_engine == 'minisearch':
        return index.search(
            query=query,
            filter_dict={},
            # Build a fresh dict per call; a mutable default argument would be
            # shared between calls.
            boost_dict={} if boost is None else boost,
            num_results=5,
        )

    if search_engine == 'elastic_search':
        # Pass the request as explicit keyword parameters only. Combining a
        # `body` dict with the `source` keyword makes the client emit a
        # deprecation warning.
        response = e_client.search(
            index=index_name,
            size=5,
            query={
                "multi_match": {
                    "query": query,
                    # Product name weighs most, then its description.
                    "fields": [
                        "Google_Cloud_Product^7",
                        "Google_Cloud_Product_Description^3",
                        "Service_Type",
                    ],
                }
            },
            source=[
                "Service_Type",
                "Link_to_Documentation",
                "Google_Cloud_Product",
                "Google_Cloud_Product_Description",
                "AWS_Offering",
                'Azure_Offering',
                'Id',
            ],
        )

        # Keep only the stored document of each hit.
        return [hit['_source'] for hit in response["hits"]["hits"]]

    return "please pass the appropriate search engine"

In [10]:
# Load the ground-truth questions and convert each row into a dict record.
df_ground_truth = pd.read_csv('../data/ground-truth-data.csv')
ground_truth = df_ground_truth.to_dict(orient='records')

In [21]:
# Inspect the first ground-truth record.
ground_truth[0]

{'question': 'What are the key differences in the zero trust security features between Chrome Enterprise Premium and similar offerings from AWS and Azure?',
 'Google_Cloud_Product': 'Chrome Enterprise Premium',
 'document_id': '40b23873859451847af0143acb81838c'}

In [12]:
def hit_rate(relevance_total):
    """Return the fraction of queries that have at least one relevant result.

    Args:
        relevance_total: One list per query; an entry is ``True`` when the
            result at that rank is the expected document.

    Returns:
        The number of queries with a ``True`` entry divided by the number of
        queries, or ``0.0`` when ``relevance_total`` is empty.
    """
    if len(relevance_total) == 0:
        # Nothing was evaluated: report 0.0 instead of dividing by zero.
        return 0.0

    hits = sum(1 for line in relevance_total if True in line)
    return hits / len(relevance_total)

def mrr(relevance_total):
    """Return the mean reciprocal rank of the first relevant result per query.

    Args:
        relevance_total: One list per query; an entry is ``True`` when the
            result at that rank is the expected document.

    Returns:
        The average over queries of ``1 / rank`` of the first ``True`` entry
        (ranks start at 1). A query with no ``True`` entry contributes 0.
        Returns ``0.0`` when ``relevance_total`` is empty.
    """
    if len(relevance_total) == 0:
        # Nothing was evaluated: report 0.0 instead of dividing by zero.
        return 0.0

    total_score = 0.0

    for line in relevance_total:
        for rank, is_relevant in enumerate(line, start=1):
            if is_relevant:
                total_score += 1 / rank
                # Only the first relevant hit counts; without this a query with
                # several relevant results would push the score above 1.
                break

    return total_score / len(relevance_total)

In [47]:
def evaluate(ground_truth, search_function):
    """Score ``search_function`` against the ground-truth records.

    Each record is passed to ``search_function``. A returned document counts
    as relevant when its ``'Id'`` equals the record's ``'document_id'``.

    Returns:
        A dict with the keys ``'hit_rate'`` and ``'mrr'`` computed over the
        per-query relevance lists.
    """
    relevance_total = []

    for record in tqdm(ground_truth):
        expected_id = record['document_id']
        relevance_total.append(
            [doc['Id'] == expected_id for doc in search_function(record)]
        )

    scores = {}
    scores['hit_rate'] = hit_rate(relevance_total)
    scores['mrr'] = mrr(relevance_total)
    return scores

In [48]:
# Baseline: minsearch without boosts, scored on the full ground-truth set.
results = evaluate(ground_truth, lambda q: search('minisearch',q['question']))
results

100%|██████████| 1101/1101 [00:02<00:00, 516.68it/s]


{'hit_rate': 0.8937329700272479, 'mrr': 0.7556161065697862}

In [49]:
# Baseline: the Elasticsearch branch of search(), scored on the same ground-truth set.
results = evaluate(ground_truth, lambda q: search('elastic_search',q['question']))
results

  a_res = e_client.search(index=index_name, body=search_query,source=["Service_Type","Link_to_Documentation","Google_Cloud_Product","Google_Cloud_Product_Description","AWS_Offering",'Azure_Offering','Id'])
100%|██████████| 1101/1101 [00:15<00:00, 70.39it/s]


{'hit_rate': 0.8664850136239782, 'mrr': 0.7556917953375724}

In [50]:
# Split by position: the first 100 ground-truth rows form the validation set,
# the remaining rows the test set.
df_validation = df_ground_truth[:100]
df_test = df_ground_truth[100:]

In [51]:
def simple_optimize(param_ranges, objective_function, n_iterations=10, seed=None):
    """Random search that maximizes ``objective_function``.

    Args:
        param_ranges: Mapping of parameter name to a ``(min_val, max_val)``
            pair. When both bounds are ``int`` the value is drawn with
            ``randint`` (inclusive); otherwise with ``uniform``.
        objective_function: Callable that takes a dict of sampled parameters
            and returns a score; higher is better.
        n_iterations: Number of random parameter sets to try.
        seed: Optional seed for a private random generator, making the search
            reproducible. ``None`` (the default) draws from the module-level
            ``random`` generator.

    Returns:
        ``(best_params, best_score)``. With ``n_iterations=0`` this is
        ``(None, float('-inf'))``.
    """
    # Without a seed keep using the shared generator, so callers that seeded
    # it through random.seed() get the same draws as before.
    rng = random if seed is None else random.Random(seed)

    best_params = None
    best_score = float('-inf')  # Maximizing: any real score beats -inf.

    for _ in range(n_iterations):
        # Draw one random value per parameter.
        current_params = {}
        for param, (min_val, max_val) in param_ranges.items():
            if isinstance(min_val, int) and isinstance(max_val, int):
                current_params[param] = rng.randint(min_val, max_val)
            else:
                current_params[param] = rng.uniform(min_val, max_val)

        current_score = objective_function(current_params)

        # Keep the candidate only when it strictly improves on the best score.
        if current_score > best_score:
            best_score = current_score
            best_params = current_params

    return best_params, best_score

In [52]:
# Validation rows as dict records, the form evaluate() iterates over.
gt_val = df_validation.to_dict(orient='records')

In [53]:
# (min, max) sampling range for each field's boost weight. The bounds are
# floats, so simple_optimize draws them with random.uniform.
param_ranges = {
    'Service_Category': (0.0, 3.0),
    'Service_Type': (0.0, 3.0),
    'Link_to_Documentation': (0.0, 3.0),
    'Google_Cloud_Product': (0.0, 3.0),
    'Google_Cloud_Product_Description': (0.0, 3.0),
    'AWS_Offering': (0.0, 3.0),
    'Azure_Offering': (0.0, 3.0),
}

def objective(boost_params):
    """Return the MRR on ``gt_val`` when minsearch uses ``boost_params`` as boosts."""
    scores = evaluate(
        gt_val,
        lambda q: search('minisearch', q['question'], boost_params),
    )
    return scores['mrr']

In [54]:
# Try 20 random boost configurations; the (best_params, best_score) tuple is
# displayed as the cell's output.
simple_optimize(param_ranges, objective, n_iterations=20)

100%|██████████| 100/100 [00:00<00:00, 485.99it/s]
100%|██████████| 100/100 [00:00<00:00, 509.24it/s]
100%|██████████| 100/100 [00:00<00:00, 506.98it/s]
100%|██████████| 100/100 [00:00<00:00, 477.57it/s]
100%|██████████| 100/100 [00:00<00:00, 473.32it/s]
100%|██████████| 100/100 [00:00<00:00, 487.17it/s]
100%|██████████| 100/100 [00:00<00:00, 488.66it/s]
100%|██████████| 100/100 [00:00<00:00, 522.31it/s]
100%|██████████| 100/100 [00:00<00:00, 511.44it/s]
100%|██████████| 100/100 [00:00<00:00, 522.04it/s]
100%|██████████| 100/100 [00:00<00:00, 516.05it/s]
100%|██████████| 100/100 [00:00<00:00, 524.70it/s]
100%|██████████| 100/100 [00:00<00:00, 509.98it/s]
100%|██████████| 100/100 [00:00<00:00, 498.56it/s]
100%|██████████| 100/100 [00:00<00:00, 510.94it/s]
100%|██████████| 100/100 [00:00<00:00, 519.72it/s]
100%|██████████| 100/100 [00:00<00:00, 521.82it/s]
100%|██████████| 100/100 [00:00<00:00, 514.28it/s]
100%|██████████| 100/100 [00:00<00:00, 505.37it/s]
100%|██████████| 100/100 [00:00

({'Service_Category': 0.10638495651755087,
  'Service_Type': 1.269946147222612,
  'Link_to_Documentation': 1.5531045466189122,
  'Google_Cloud_Product': 1.3250028735372683,
  'Google_Cloud_Product_Description': 1.9395345731534959,
  'AWS_Offering': 2.067143729150134,
  'Azure_Offering': 0.799844469488588},
 0.9041666666666666)

In [56]:
# Boost weights hard-coded from the best parameters displayed by the
# simple_optimize run. That search samples randomly, so a re-run may report
# different values.
best_boost = {'Service_Category': 0.10638495651755087,
  'Service_Type': 1.269946147222612,
  'Link_to_Documentation': 1.5531045466189122,
  'Google_Cloud_Product': 1.3250028735372683,
  'Google_Cloud_Product_Description': 1.9395345731534959,
  'AWS_Offering': 2.067143729150134,
  'Azure_Offering': 0.799844469488588}

In [57]:
# Re-score minsearch with the tuned boosts on the full ground-truth set.
# NOTE(review): ground_truth includes the 100 rows of gt_val that the boosts
# were tuned on; scoring on df_test would avoid that overlap — confirm intent.
evaluate(ground_truth, lambda q: search('minisearch',q['question'],best_boost))

100%|██████████| 1101/1101 [00:02<00:00, 517.50it/s]


{'hit_rate': 0.9291553133514986, 'mrr': 0.8166969421737824}