In [2]:
import json
import pandas as pd
from tqdm.auto  import tqdm
import hashlib
from sentence_transformers import SentenceTransformer
from elasticsearch import Elasticsearch

###  STEP 1: Documents Preparation

In [3]:
data_df = pd.read_csv('../data/raw_data.csv').fillna('')


documents = data_df.to_dict(orient='records')


documents[0]

{'Service_Category': 'Security & identity',
 'Service_Type': 'Zero trust & secure enterprise browser',
 'Link_to_Documentation': 'https://chromeenterprise.google/products/chrome-enterprise-premium/',
 'Google_Cloud_Product': 'Chrome Enterprise Premium',
 'Google_Cloud_Product_Description': 'Enable secure access to critical applications and services, with integrated threat and data protection.',
 'AWS_Offering': '',
 'Azure_Offering': ''}

### Step 1.1: Generate documents unique id

In [4]:
def generate_document_id(doc):
    combined = f"{doc['Service_Category']}-{doc['Service_Type']}{doc['Google_Cloud_Product']}"
    hash_object = hashlib.md5(combined.encode())
    hash_hex = hash_object.hexdigest()
    document_id = hash_hex
    return document_id

In [5]:
for doc in documents:
    doc['Id'] = generate_document_id(doc)

documents[3]

{'Service_Category': 'Serverless',
 'Service_Type': 'Workflow orchestration',
 'Link_to_Documentation': 'https://cloud.google.com/workflows',
 'Google_Cloud_Product': 'Workflows',
 'Google_Cloud_Product_Description': 'Orchestrate and automate Google Cloud and HTTP-based API services with serverless workflows.',
 'AWS_Offering': 'AWS Step Functions',
 'Azure_Offering': 'Azure Logic Apps',
 'Id': 'f7895e069156a9bf43580e864e959a0f'}

In [6]:
with open("../data/documents_id.json","wt") as d_out:
    json.dump(documents,d_out)

###  STEP 2: Create Embedings using Pretrained Models

In [7]:
embedded_model = SentenceTransformer("all-mpnet-base-v2")

In [9]:
embedded_documents = []

for doc in tqdm(documents):
    doc['Text_Vector'] = embedded_model.encode(doc['Google_Cloud_Product_Description']).tolist()
    embedded_documents.append(doc)

embedded_documents[0]

100%|██████████| 221/221 [00:07<00:00, 28.27it/s]


{'Service_Category': 'Security & identity',
 'Service_Type': 'Zero trust & secure enterprise browser',
 'Link_to_Documentation': 'https://chromeenterprise.google/products/chrome-enterprise-premium/',
 'Google_Cloud_Product': 'Chrome Enterprise Premium',
 'Google_Cloud_Product_Description': 'Enable secure access to critical applications and services, with integrated threat and data protection.',
 'AWS_Offering': '',
 'Azure_Offering': '',
 'Id': '40b23873859451847af0143acb81838c',
 'Text_Vector': [-0.01063474453985691,
  -0.014226234517991543,
  -0.006132534705102444,
  0.020562559366226196,
  -0.030249089002609253,
  0.012065530754625797,
  -0.0043956986628472805,
  -0.03471396863460541,
  -0.01978864148259163,
  -0.011411530897021294,
  0.03373177722096443,
  0.007983692921698093,
  -0.008418280631303787,
  0.061007533222436905,
  0.01943526789546013,
  -0.02337801642715931,
  0.07064682990312576,
  -0.033563170582056046,
  -0.019648706540465355,
  0.02394556812942028,
  -0.0003796077

### STEP 3: Setup ElasticSearch Connection 

In [10]:
e_client = Elasticsearch('http://localhost:9200')

e_client.info()

ObjectApiResponse({'name': '699e94d444a1', 'cluster_name': 'docker-cluster', 'cluster_uuid': '4No4U7IcRFSxM5CuiGnr_g', 'version': {'number': '8.4.3', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '42f05b9372a9a4a470db3b52817899b99a76ee73', 'build_date': '2022-10-04T07:17:24.662462378Z', 'build_snapshot': False, 'lucene_version': '9.3.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

### STEP 4: Create Mapping and Index

In [11]:
index_name = "cloud-comparative-guide"

index_settings = {
    "settings": {
        "number_of_shards": 2,
        "number_of_replicas": 2
    },
    "mappings": {
        "properties": {
            "Service_Category": {"type": "text"},
            "Service_Type": {"type": "text"},
            "Link_to_Documentation": {"type": "text"},
            "Google_Cloud_Product": {"type": "text"},
            "Google_Cloud_Product_Description": {"type": "text"},
            "AWS_Offering": {"type": "text"},
            "Azure_Offering": {"type": "text"},
            "Id": {"type": "keyword"},
            "Text_Vector": {"type": "dense_vector","dims": 768,"index": True,"similarity":"cosine"},


        }
    }
}


e_client.indices.delete(index=index_name,ignore_unavailable=True)
e_client.indices.create(index=index_name,body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'cloud-comparative-guide'})

### STEP 5:  Add documents into Index

In [12]:
for doc in tqdm(embedded_documents):
    try: 
       e_client.index(index=index_name,document=doc)
    except Exception as e:
        print(e)

100%|██████████| 221/221 [00:02<00:00, 93.85it/s] 


### STEP 6: Create end user query

In [13]:
search_term = "best tools for setting up web security?"
vector_search_term = embedded_model.encode(search_term)

In [17]:
query = {
    "field" : "Text_Vector",
    "query_vector" : vector_search_term,
    "k" : 3,
    "num_candidates" : 10000,
}

In [18]:
res = e_client.search(index=index_name,knn=query,source=["Service_Type","Link_to_Documentation","Google_Cloud_Product","Google_Cloud_Product_Description","AWS_Offering",'Azure_Offering','Id'])

res["hits"]["hits"]

[{'_index': 'cloud-comparative-guide',
  '_id': 'i4tU85MBMwZrT-_CwbfW',
  '_score': 0.7700671,
  '_source': {'Google_Cloud_Product_Description': 'Help protect your website from fraudulent activity, spam, and abuse without creating friction.',
   'Link_to_Documentation': 'https://cloud.google.com/recaptcha-enterprise',
   'Google_Cloud_Product': 'reCAPTCHA Enterprise',
   'Azure_Offering': 'Microsoft Dynamics Fraud',
   'Service_Type': 'Abuse prevention',
   'Id': 'd0f6fcbcf72de491f2ae55c13261de1b',
   'AWS_Offering': 'AWS WAF CAPTCHA, AWS Fraud'}},
 {'_index': 'cloud-comparative-guide',
  '_id': 'tYtU85MBMwZrT-_Cubat',
  '_score': 0.7378681,
  '_source': {'Google_Cloud_Product_Description': 'Help protect your applications and websites against denial of service and web attacks.',
   'Link_to_Documentation': 'https://cloud.google.com/armor',
   'Google_Cloud_Product': 'Google Cloud Armor',
   'Azure_Offering': 'Azure Web Application Firewall (WAF)',
   'Service_Type': 'WAF and DDoS',
   