In [15]:
import json
import pandas as pd
from tqdm.auto  import tqdm
import hashlib
from openai import OpenAI
import pickle

## Step 1: Load Raw Data For Ground Truth Preparation

In [27]:
# Load the document records that the rest of the notebook works from.
# The encoding is pinned to UTF-8 so the read does not depend on the
# platform's locale default.
with open("../data/documents_id.json", "r", encoding="utf-8") as docs_file:
    documents = json.load(docs_file)

In [28]:
# Inspect a single record to see which fields each document carries.
documents[3]

{'Service_Category': 'Serverless',
 'Service_Type': 'Workflow orchestration',
 'Link_to_Documentation': 'https://cloud.google.com/workflows',
 'Google_Cloud_Product': 'Workflows',
 'Google_Cloud_Product_Description': 'Orchestrate and automate Google Cloud and HTTP-based API services with serverless workflows.',
 'AWS_Offering': 'AWS Step Functions',
 'Azure_Offering': 'Azure Logic Apps',
 'Id': 'f7895e069156a9bf43580e864e959a0f'}

### Step 3: Generate User Questions with ChatGPT

In [29]:
# Prompt used by generate_questions. The {placeholders} are filled with
# str.format(**doc), so each one must match a key of the document records.
# NOTE(review): the wording contains typos ("the the", "comaparative"); they are
# left untouched here because editing the text changes what the model receives.
prompt_template = """
You emulate a cloud engineer who's undergoing some multi-cloud operation involving the the three popular cloud vendor.
Formulate 5 questions this cloud engineer might ask based on the comaparative guide below. 
comaparative guide:

Service category: {Service_Category}
Service type: {Service_Type}
Link to Documentation: {Link_to_Documentation}
Google Cloud product: {Google_Cloud_Product}
Google Cloud product description: {Google_Cloud_Product_Description}
AWS offering: {AWS_Offering}
Azure offering: {Azure_Offering}

Provide the output in parsable JSON without using code blocks:

["question1", "question2", ..., "question5"]
""".strip()

In [30]:
# Do not embed the API key in the notebook. With no explicit api_key the client
# reads the OPENAI_API_KEY environment variable and raises immediately if it is
# missing, instead of sending requests with an empty key.
o_client = OpenAI()

In [31]:
def generate_questions(doc, model='gpt-4o'):
    """Ask the chat model for questions about one comparison record.

    Args:
        doc: Mapping whose keys supply the ``{placeholders}`` of
            ``prompt_template``; keys the template does not reference are
            ignored by ``str.format``.
        model: Name of the chat-completion model to query. Defaults to
            ``'gpt-4o'``, the value that used to be hard-coded.

    Returns:
        The message content of the first choice, exactly as returned by the
        API. The prompt requests a JSON array, but nothing here parses or
        validates it.

    Raises:
        KeyError: If ``doc`` lacks a key referenced by ``prompt_template``.
    """
    prompt = prompt_template.format(**doc)

    response = o_client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
    )

    # Only the first choice is used; the raw string is handed back unparsed.
    return response.choices[0].message.content

In [33]:
# Raw model output per document, keyed by the document's 'Id'.
results = {}

for document in tqdm(documents):
    document_id = document['Id']
    # An Id that already has an entry is not sent to the API a second time.
    if document_id not in results:
        results[document_id] = generate_questions(document)

100%|██████████| 221/221 [10:27<00:00,  2.84s/it]


In [34]:
# Display the raw responses: each value is still an unparsed JSON string.
results

{'40b23873859451847af0143acb81838c': '[\n    "What are the key differences in the zero trust security features between Chrome Enterprise Premium and similar offerings from AWS and Azure?",\n    "How does Chrome Enterprise Premium integrate threat and data protection compared to its AWS and Azure counterparts?",\n    "Is there a cost advantage or consideration for implementing Chrome Enterprise Premium over similar services from AWS and Azure?",\n    "What level of support and customization is available for secure enterprise browsing across Google Cloud, AWS, and Azure?",\n    "Are there any specific compliance certifications or regulations that Chrome Enterprise Premium meets that AWS and Azure do not?" \n]',
 '5f90a9d143ff5d84885d2bd1690d1dce': '[\n  "How do the pricing models compare for Google Cloud Batch, AWS Batch, and Azure Batch?",\n  "What are the limitations in terms of job queue management for each of the Batch services offered by Google Cloud, AWS, and Azure?",\n  "How does 

In [35]:
# Persist the raw (unparsed) responses to disk with pickle.
with open('../data/generated_questions.bin', 'wb') as pickle_file:
    pickle.dump(results, pickle_file)

In [36]:
# Decode every raw JSON string into a Python object, keeping the same keys.
parsed_results = {
    document_id: json.loads(raw_json)
    for document_id, raw_json in tqdm(results.items())
}

100%|██████████| 221/221 [00:00<00:00, 337437.64it/s]


In [37]:
# Display the parsed mapping: document Id -> list of question strings.
parsed_results

{'40b23873859451847af0143acb81838c': ['What are the key differences in the zero trust security features between Chrome Enterprise Premium and similar offerings from AWS and Azure?',
  'How does Chrome Enterprise Premium integrate threat and data protection compared to its AWS and Azure counterparts?',
  'Is there a cost advantage or consideration for implementing Chrome Enterprise Premium over similar services from AWS and Azure?',
  'What level of support and customization is available for secure enterprise browsing across Google Cloud, AWS, and Azure?',
  'Are there any specific compliance certifications or regulations that Chrome Enterprise Premium meets that AWS and Azure do not?'],
 '5f90a9d143ff5d84885d2bd1690d1dce': ['How do the pricing models compare for Google Cloud Batch, AWS Batch, and Azure Batch?',
  'What are the limitations in terms of job queue management for each of the Batch services offered by Google Cloud, AWS, and Azure?',
  'How does the integration with container

In [38]:
# Lookup table from a document's 'Id' to the full document record.
doc_index = {record['Id']: record for record in documents}

In [39]:
# Display the Id -> document lookup table.
doc_index

{'40b23873859451847af0143acb81838c': {'Service_Category': 'Security & identity',
  'Service_Type': 'Zero trust & secure enterprise browser',
  'Link_to_Documentation': 'https://chromeenterprise.google/products/chrome-enterprise-premium/',
  'Google_Cloud_Product': 'Chrome Enterprise Premium',
  'Google_Cloud_Product_Description': 'Enable secure access to critical applications and services, with integrated threat and data protection.',
  'AWS_Offering': '',
  'Azure_Offering': '',
  'Id': '40b23873859451847af0143acb81838c'},
 '5f90a9d143ff5d84885d2bd1690d1dce': {'Service_Category': 'Compute',
  'Service_Type': 'Workload orchestration',
  'Link_to_Documentation': 'https://cloud.google.com/batch',
  'Google_Cloud_Product': 'Batch',
  'Google_Cloud_Product_Description': 'A fully managed service that lets you schedule, queue, and run batch processing workloads for VMs or containers on Google Cloud at scale.',
  'AWS_Offering': 'AWS Batch',
  'Azure_Offering': 'Azure Batch',
  'Id': '5f90a9d

In [40]:
# Flatten to one (question, Google Cloud product, document id) tuple per question.
final_results = []

for document_id, question_list in parsed_results.items():
    # The product name is looked up once per document, then shared by its questions.
    product = doc_index[document_id]['Google_Cloud_Product']
    final_results.extend(
        (question, product, document_id) for question in question_list
    )

final_results[1]

('How does Chrome Enterprise Premium integrate threat and data protection compared to its AWS and Azure counterparts?',
 'Chrome Enterprise Premium',
 '40b23873859451847af0143acb81838c')

In [41]:
# Tabulate the (question, product, document id) tuples and write them to CSV
# without the row index.
ground_truth_columns = ['question', 'Google_Cloud_Product', 'document_id']
df = pd.DataFrame(final_results, columns=ground_truth_columns)
df.to_csv('../data/ground-truth-data.csv', index=False)

In [42]:
# Preview the first 20 rows of the ground-truth table.
df.head(20)

Unnamed: 0,question,Google_Cloud_Product,document_id
0,What are the key differences in the zero trust...,Chrome Enterprise Premium,40b23873859451847af0143acb81838c
1,How does Chrome Enterprise Premium integrate t...,Chrome Enterprise Premium,40b23873859451847af0143acb81838c
2,Is there a cost advantage or consideration for...,Chrome Enterprise Premium,40b23873859451847af0143acb81838c
3,What level of support and customization is ava...,Chrome Enterprise Premium,40b23873859451847af0143acb81838c
4,Are there any specific compliance certificatio...,Chrome Enterprise Premium,40b23873859451847af0143acb81838c
5,How do the pricing models compare for Google C...,Batch,5f90a9d143ff5d84885d2bd1690d1dce
6,What are the limitations in terms of job queue...,Batch,5f90a9d143ff5d84885d2bd1690d1dce
7,How does the integration with container orches...,Batch,5f90a9d143ff5d84885d2bd1690d1dce
8,What SLAs and uptime guarantees do Google Clou...,Batch,5f90a9d143ff5d84885d2bd1690d1dce
9,How can I leverage each provider's Batch servi...,Batch,5f90a9d143ff5d84885d2bd1690d1dce
