In [2]:
import pandas as pd
import os
import json
from tqdm.auto import tqdm

# Read the OpenAI API key from the environment (None when OPENAI_API_KEY is unset).
# NOTE(review): no later cell visible here references `openai_api_key`; the client is built with OpenAI() and no arguments.
openai_api_key = os.environ.get('OPENAI_API_KEY')
from openai import OpenAI

In [3]:
# API client shared by the LLM helper functions defined in later cells.
# No credentials are passed explicitly — presumably the library reads OPENAI_API_KEY from the environment; confirm against the openai docs.
client = OpenAI()

In [4]:
# Load the FAQ records; the file uses ';' as its field separator.
df = pd.read_csv('../data/data.csv', sep=';')
# Bare last expression: display the loaded frame.
df

Unnamed: 0,Question_ID,Question,Answer
0,1,What is the difference between SEO and SEM?,"SEO focuses on organic search results, while S..."
1,2,What is a meta description?,A meta description is a brief summary of a web...
2,3,What is a keyword density?,Keyword density is the number of times a keywo...
3,4,What is a backlink?,A backlink is a hyperlink that points to your ...
4,5,What is a website audit?,A website audit is a comprehensive analysis of...
...,...,...,...
131,132,What is a marketing operations certification e...,A marketing operations certification exam is a...
132,133,What is a marketing operations professional as...,A marketing operations professional associatio...
133,134,What is a marketing operations job description?,A marketing operations job description outline...
134,135,What is a marketing operations salary survey?,A marketing operations salary survey provides ...


In [5]:
# Convert the frame to a list with one dict per row, keyed by column name.
documents = df.to_dict(orient='records')
# Show the first record as a quick sanity check of the structure.
documents[0]

{'Question_ID': 1,
 'Question': 'What is the difference between SEO and SEM?',
 'Answer': 'SEO focuses on organic search results, while SEM includes both organic and paid search results.'}

## Generating gold standard evaluation data

In [9]:
# Prompt used to generate evaluation questions from a single FAQ record.
# The {Question_ID}, {Question} and {Answer} placeholders are filled with
# str.format from a record dict; the model is asked to reply with a bare
# JSON array so the reply can be parsed with json.loads.
prompt_template = """
You emulate a user of our marketing ai assistant application. 
Formulate 5 questions this user might ask based on a FAQ record. 
The record should contain the answer to the questions, and the questions should be complete and not too short.

The record:

Question_ID:{Question_ID}
Question: {Question}
Answer: {Answer}

Provide the output in parsable JSON without using code blocks:

["question1", "question2", ..., "question5"]
""".strip()

In [10]:
def llm(prompt):
    """Send `prompt` as a single user message to gpt-4o-mini and return the reply text.

    Uses the module-level `client`; the content of the first returned choice
    is handed back unchanged.
    """
    user_message = {"role": "user", "content": prompt}
    completion = client.chat.completions.create(
        model='gpt-4o-mini',
        messages=[user_message],
    )

    first_choice = completion.choices[0]
    return first_choice.message.content

In [11]:
# Fill the template with the first FAQ record to try the prompt on a single example.
prompt = prompt_template.format(**documents[0])

In [12]:
# Send the sample prompt to the model; `questions` holds the raw reply text (not yet parsed).
questions = llm(prompt)

In [13]:
# Check that the raw reply parses as JSON; the prompt asks for an array of question strings.
json.loads(questions)

['What are the main differences between SEO and SEM?',
 'Can you explain how SEO relates to organic search results and SEM to paid search results?',
 'What does SEO stand for and how is it different from SEM?',
 'In the context of digital marketing, how do SEO and SEM compare in terms of strategy?',
 'Is there a situation where a business should prioritize SEM over SEO?']

In [14]:
def generate_questions(doc):
    """Ask the LLM for user-style questions about one FAQ record.

    Args:
        doc: Mapping that supplies the fields referenced by `prompt_template`
            (Question_ID, Question, Answer).

    Returns:
        The model's raw reply text. The prompt requests a JSON array of
        questions, but the reply is neither parsed nor validated here.
    """
    prompt = prompt_template.format(**doc)

    # Delegate to llm() so the model name and message shape live in one place
    # instead of being duplicated in this function.
    return llm(prompt)

In [15]:
# Accumulator for generated questions, keyed by Question_ID (filled by the generation loop).
# Presumably kept in its own cell so that re-running the loop does not reset progress — confirm intent.
results = {}

In [16]:

# Generate questions for every FAQ record. Records whose Question_ID is already
# in `results` are skipped, so the cell can be re-run to resume an interrupted run.
failed_docs = {}  # Question_ID -> raw LLM reply that was not valid JSON

for doc in tqdm(documents):
    doc_id = doc['Question_ID']
    if doc_id in results:
        continue

    questions_raw = generate_questions(doc)
    try:
        questions = json.loads(questions_raw)
    except json.JSONDecodeError:
        # One malformed reply should not abort the whole run. Keep the raw text
        # for inspection; the record stays out of `results`, so re-running the
        # cell retries it.
        failed_docs[doc_id] = questions_raw
        continue
    results[doc_id] = questions

if failed_docs:
    print(f"{len(failed_docs)} record(s) returned unparsable JSON: {sorted(failed_docs)}")

# Display the most recently parsed list of questions as a spot check.
questions

  0%|          | 0/136 [00:00<?, ?it/s]

['Can you explain what a marketing operations industry report is and its significance?',
 'What type of insights can I expect to find in a marketing operations industry report?',
 'What are some of the trends highlighted in a marketing operations industry report?',
 'How can a marketing operations industry report help me understand the challenges in the field?',
 'Is a marketing operations industry report beneficial for making strategic marketing decisions?']

In [17]:
# Flatten {Question_ID: [question, ...]} into (Question_ID, question) pairs,
# one pair per generated question, preserving the order of `results`.
final_results = [
    (record_id, question_text)
    for record_id, record_questions in results.items()
    for question_text in record_questions
]

# Show the first pair as a sanity check.
final_results[0]

(1,
 'What are the main distinctions between SEO and SEM in terms of search strategies?')

In [18]:
# Tabulate the (Question_ID, Question) pairs: one row per generated question.
df_results = pd.DataFrame(final_results, columns=['Question_ID', 'Question'])
# Bare last expression: display the resulting frame.
df_results

Unnamed: 0,Question_ID,Question
0,1,What are the main distinctions between SEO and...
1,1,Can you explain how SEO contributes to organic...
2,1,What elements are included in SEM apart from o...
3,1,How do SEO and SEM work together to improve on...
4,1,Is it possible for a business to rely solely o...
...,...,...
675,136,Can you explain what a marketing operations in...
676,136,What type of insights can I expect to find in ...
677,136,What are some of the trends highlighted in a m...
678,136,How can a marketing operations industry report...


In [19]:
# Persist the generated question set; index=False keeps the DataFrame index out of the file.
df_results.to_csv('../data/ground-truth-retrieval.csv', index=False)

In [20]:
# Peek at the first lines of the CSV that was just written (shell command via IPython `!`).
!head ../data/ground-truth-retrieval.csv

Question_ID,Question
1,What are the main distinctions between SEO and SEM in terms of search strategies?
1,Can you explain how SEO contributes to organic search results?
1,What elements are included in SEM apart from organic search?
1,How do SEO and SEM work together to improve online visibility?
1,Is it possible for a business to rely solely on SEO without using SEM?
2,What exactly is a meta description in the context of web content?
2,Can you explain the role of a meta description in search engine results?
2,How does a meta description summarize the content of a webpage?
2,What is the importance of having an effective meta description for SEO?
