In [1]:
!wget https://raw.githubusercontent.com/alexeygrigorev/minsearch/main/minsearch.py

--2024-09-09 17:23:01--  https://raw.githubusercontent.com/alexeygrigorev/minsearch/main/minsearch.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.110.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3832 (3,7K) [text/plain]
Saving to: ‘minsearch.py.5’


2024-09-09 17:23:02 (37,7 MB/s) - ‘minsearch.py.5’ saved [3832/3832]



In [2]:
import pandas as pd
import minsearch
import os
import random
from tqdm.auto import tqdm
import json

openai_api_key = os.environ.get('OPENAI_API_KEY')
from openai import OpenAI

In [3]:
df = pd.read_csv('../data/data.csv', sep=';')
df

Unnamed: 0,Question_ID,Question,Answer
0,1,What is the difference between SEO and SEM?,"SEO focuses on organic search results, while S..."
1,2,What is a meta description?,A meta description is a brief summary of a web...
2,3,What is a keyword density?,Keyword density is the number of times a keywo...
3,4,What is a backlink?,A backlink is a hyperlink that points to your ...
4,5,What is a website audit?,A website audit is a comprehensive analysis of...
...,...,...,...
131,132,What is a marketing operations certification e...,A marketing operations certification exam is a...
132,133,What is a marketing operations professional as...,A marketing operations professional associatio...
133,134,What is a marketing operations job description?,A marketing operations job description outline...
134,135,What is a marketing operations salary survey?,A marketing operations salary survey provides ...


In [4]:
documents = df.to_dict(orient='records')
documents[0]

{'Question_ID': 1,
 'Question': 'What is the difference between SEO and SEM?',
 'Answer': 'SEO focuses on organic search results, while SEM includes both organic and paid search results.'}

## Ingestion

In [5]:
index = minsearch.Index(
    text_fields=['Question', 'Answer'],
    keyword_fields=[]
)

In [6]:
index.fit(documents)

<minsearch.Index at 0x7254c6c6dee0>

## RAG flow

In [7]:
from langfuse.callback import CallbackHandler

langfuse_handler = CallbackHandler(
    public_key=os.getenv("LANGFUSE_PUBLIC_KEY"),
    secret_key=os.getenv("LANGFUSE_SECRET_KEY"),
    host=os.getenv("LANGFUSE_HOST"),
)

In [8]:
from langfuse.openai import openai

openai.langfuse_auth_check()

client = OpenAI()

In [9]:
def search(query):
    boost = {}

    results = index.search(
        query=query,
        filter_dict={},
        boost_dict=boost,
        num_results=10
    )

    return results

In [10]:
prompt_template = """
You are a marketing expert providing answers based on the FAQ database. Answer the QUESTION using only the information from the CONTEXT.

QUESTION: {question}

CONTEXT: 
{context}
""".strip()

entry_template = """
Question_id:{Question_ID}
Question: {Question}
Answer: {Answer}
""".strip()

def build_prompt(query, search_results):

    context = ""
    
    for doc in search_results:
        context = context + entry_template.format(**doc) + "\n\n"
    
    prompt = prompt_template.format(question=query, context=context).strip()
    return prompt

In [11]:

def llm(prompt, model='gpt-4o-mini'):
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}]
    )
    
    return response.choices[0].message.content

In [12]:
def rag(query, model='gpt-4o-mini'):
    search_results = search(query)
    prompt = build_prompt(query, search_results)
    answer = llm(prompt, model=model)
    return answer

In [13]:
question = 'What do you know about email marketing??'
answer = rag(question)
print(answer)

Email marketing is the practice of sending targeted emails to a list of subscribers to promote products or services. It involves creating email marketing campaigns, which are a series of emails sent to a targeted audience for promotion. Additionally, email marketing automation allows for the automation of tasks such as sending personalized emails and triggering campaigns based on user behavior. Email marketing analytics plays a crucial role in measuring and analyzing the performance of these campaigns. To effectively manage these efforts, an email marketing service provider (ESP) can be used, and email list segmentation helps in dividing the email list into smaller, more targeted groups.


## Retrieval evaluation

In [14]:
df_question = pd.read_csv('../data/ground-truth-retrieval.csv')
df_question

Unnamed: 0,Question_ID,Question
0,1,What are the main distinctions between SEO and...
1,1,Can you explain how SEO contributes to organic...
2,1,What elements are included in SEM apart from o...
3,1,How do SEO and SEM work together to improve on...
4,1,Is it possible for a business to rely solely o...
...,...,...
675,136,Can you explain what a marketing operations in...
676,136,What type of insights can I expect to find in ...
677,136,What are some of the trends highlighted in a m...
678,136,How can a marketing operations industry report...


In [15]:
ground_truth = df_question.to_dict(orient='records')
ground_truth[0]

{'Question_ID': 1,
 'Question': 'What are the main distinctions between SEO and SEM in terms of search strategies?'}

In [16]:
def hit_rate(relevance_total):
    cnt = 0

    for line in relevance_total:
        if True in line:
            cnt = cnt + 1

    return cnt / len(relevance_total)

def mrr(relevance_total):
    total_score = 0.0

    for line in relevance_total:
        for rank in range(len(line)):
            if line[rank] == True:
                total_score = total_score + 1 / (rank + 1)

    return total_score / len(relevance_total)

In [17]:
def minsearch_search(query):
    boost = {}

    results = index.search(
        query=query,
        filter_dict={},
        boost_dict=boost,
        num_results=10
    )

    return results

In [18]:
def evaluate(ground_truth, search_function):
    relevance_total = []

    for q in tqdm(ground_truth):
        doc_id = q['Question_ID']
        results = search_function(q)
        relevance = [d['Question_ID'] == doc_id for d in results]
        relevance_total.append(relevance)

    return {
        'hit_rate': round(hit_rate(relevance_total), 4),
        'mrr': round(mrr(relevance_total), 4),
    }

In [19]:
evaluate(ground_truth, lambda q: minsearch_search(q['Question']))

  0%|          | 0/680 [00:00<?, ?it/s]

{'hit_rate': 0.9838, 'mrr': 0.9256}

#### Finding the best parameters

In [20]:
df_validation = df_question[:80]
df_test = df_question[80:]

In [21]:
def simple_optimize(param_ranges, objective_function, n_iterations=50):
    best_params = None
    best_score = float('-inf')
    
    for _ in range(n_iterations):
        # Generate random parameters
        current_params = {}
        for param, (min_val, max_val) in param_ranges.items():
            if isinstance(min_val, int) and isinstance(max_val, int):
                current_params[param] = random.randint(min_val, max_val)
            else:
                current_params[param] = random.uniform(min_val, max_val)
                
        # Evaluate the objective function
        current_score = objective_function(current_params)
        
        # Update best if current is better
        if current_score > best_score:
            best_score = current_score
            best_params = current_params
    
    return best_params, best_score

In [22]:
gt_val = df_validation.to_dict(orient='records')

In [23]:
def minsearch_search(query, boost=None):
    if boost is None:
        boost = {}

    results = index.search(
        query=query,
        filter_dict={},
        boost_dict=boost,
        num_results=10
    )

    return results

In [24]:
param_ranges = {
    'Question_ID': (0.0, 3.0),
    'Question': (0.0, 3.0),
    'Answer': (0.0, 3.0),
}

def objective(boost_params):
    def search_function(q):
        return minsearch_search(q['Question'], boost_params)
    
    results = evaluate(gt_val, search_function)
    return results['mrr']

In [25]:

simple_optimize(param_ranges, objective, n_iterations=50)

  0%|          | 0/80 [00:00<?, ?it/s]

  0%|          | 0/80 [00:00<?, ?it/s]

  0%|          | 0/80 [00:00<?, ?it/s]

  0%|          | 0/80 [00:00<?, ?it/s]

  0%|          | 0/80 [00:00<?, ?it/s]

  0%|          | 0/80 [00:00<?, ?it/s]

  0%|          | 0/80 [00:00<?, ?it/s]

  0%|          | 0/80 [00:00<?, ?it/s]

  0%|          | 0/80 [00:00<?, ?it/s]

  0%|          | 0/80 [00:00<?, ?it/s]

  0%|          | 0/80 [00:00<?, ?it/s]

  0%|          | 0/80 [00:00<?, ?it/s]

  0%|          | 0/80 [00:00<?, ?it/s]

  0%|          | 0/80 [00:00<?, ?it/s]

  0%|          | 0/80 [00:00<?, ?it/s]

  0%|          | 0/80 [00:00<?, ?it/s]

  0%|          | 0/80 [00:00<?, ?it/s]

  0%|          | 0/80 [00:00<?, ?it/s]

  0%|          | 0/80 [00:00<?, ?it/s]

  0%|          | 0/80 [00:00<?, ?it/s]

  0%|          | 0/80 [00:00<?, ?it/s]

  0%|          | 0/80 [00:00<?, ?it/s]

  0%|          | 0/80 [00:00<?, ?it/s]

  0%|          | 0/80 [00:00<?, ?it/s]

  0%|          | 0/80 [00:00<?, ?it/s]

  0%|          | 0/80 [00:00<?, ?it/s]

  0%|          | 0/80 [00:00<?, ?it/s]

  0%|          | 0/80 [00:00<?, ?it/s]

  0%|          | 0/80 [00:00<?, ?it/s]

  0%|          | 0/80 [00:00<?, ?it/s]

  0%|          | 0/80 [00:00<?, ?it/s]

  0%|          | 0/80 [00:00<?, ?it/s]

  0%|          | 0/80 [00:00<?, ?it/s]

  0%|          | 0/80 [00:00<?, ?it/s]

  0%|          | 0/80 [00:00<?, ?it/s]

  0%|          | 0/80 [00:00<?, ?it/s]

  0%|          | 0/80 [00:00<?, ?it/s]

  0%|          | 0/80 [00:00<?, ?it/s]

  0%|          | 0/80 [00:00<?, ?it/s]

  0%|          | 0/80 [00:00<?, ?it/s]

  0%|          | 0/80 [00:00<?, ?it/s]

  0%|          | 0/80 [00:00<?, ?it/s]

  0%|          | 0/80 [00:00<?, ?it/s]

  0%|          | 0/80 [00:00<?, ?it/s]

  0%|          | 0/80 [00:00<?, ?it/s]

  0%|          | 0/80 [00:00<?, ?it/s]

  0%|          | 0/80 [00:00<?, ?it/s]

  0%|          | 0/80 [00:00<?, ?it/s]

  0%|          | 0/80 [00:00<?, ?it/s]

  0%|          | 0/80 [00:00<?, ?it/s]

({'Question_ID': 0.7714700981665726,
  'Question': 1.4131203492177264,
  'Answer': 1.4417191893488859},
 0.8596)

In [26]:
def minsearch_improved(query):
    boost = {
        'Question_ID': 0.77,
        'Question': 1.41,
        'Answer': 1.44
    }

    results = index.search(
        query=query,
        filter_dict={},
        boost_dict=boost,
        num_results=10
    )

    return results

evaluate(ground_truth, lambda q: minsearch_improved(q['Question']))

  0%|          | 0/680 [00:00<?, ?it/s]

{'hit_rate': 0.9838, 'mrr': 0.9256}

## RAG evaluation

In [27]:
prompt_template_rag_evaluation = """
You are an expert evaluator for a Retrieval-Augmented Generation (RAG) system.
Your task is to analyze the relevance of the generated answer to the given question.
Based on the relevance of the generated answer, you will classify it
as "NON_RELEVANT", "PARTLY_RELEVANT", or "RELEVANT".

Here is the data for evaluation:

Question: {question}
Generated Answer: {answer_llm}

Please analyze the content and context of the generated answer in relation to the question
and provide your evaluation in parsable JSON without using code blocks:

{{
  "Relevance": "NON_RELEVANT" | "PARTLY_RELEVANT" | "RELEVANT",
  "Explanation": "[Provide a brief explanation for your evaluation]"
}}
""".strip()

In [28]:
len(ground_truth)

680

In [29]:
ground_truth[0]

{'Question_ID': 1,
 'Question': 'What are the main distinctions between SEO and SEM in terms of search strategies?'}

In [30]:
record = ground_truth[0]
question = record['Question']
answer_llm = rag(question)

In [31]:
answer_llm

'The main distinctions between SEO and SEM in terms of search strategies are that SEO focuses on organic search results, while SEM includes both organic and paid search results.'

In [32]:
prompt = prompt_template_rag_evaluation.format(question=question, answer_llm=answer_llm)
prompt

'You are an expert evaluator for a Retrieval-Augmented Generation (RAG) system.\nYour task is to analyze the relevance of the generated answer to the given question.\nBased on the relevance of the generated answer, you will classify it\nas "NON_RELEVANT", "PARTLY_RELEVANT", or "RELEVANT".\n\nHere is the data for evaluation:\n\nQuestion: What are the main distinctions between SEO and SEM in terms of search strategies?\nGenerated Answer: The main distinctions between SEO and SEM in terms of search strategies are that SEO focuses on organic search results, while SEM includes both organic and paid search results.\n\nPlease analyze the content and context of the generated answer in relation to the question\nand provide your evaluation in parsable JSON without using code blocks:\n\n{\n  "Relevance": "NON_RELEVANT" | "PARTLY_RELEVANT" | "RELEVANT",\n  "Explanation": "[Provide a brief explanation for your evaluation]"\n}'

In [33]:
print(_)

You are an expert evaluator for a Retrieval-Augmented Generation (RAG) system.
Your task is to analyze the relevance of the generated answer to the given question.
Based on the relevance of the generated answer, you will classify it
as "NON_RELEVANT", "PARTLY_RELEVANT", or "RELEVANT".

Here is the data for evaluation:

Question: What are the main distinctions between SEO and SEM in terms of search strategies?
Generated Answer: The main distinctions between SEO and SEM in terms of search strategies are that SEO focuses on organic search results, while SEM includes both organic and paid search results.

Please analyze the content and context of the generated answer in relation to the question
and provide your evaluation in parsable JSON without using code blocks:

{
  "Relevance": "NON_RELEVANT" | "PARTLY_RELEVANT" | "RELEVANT",
  "Explanation": "[Provide a brief explanation for your evaluation]"
}


#### I Evaluation with gpt-4o-mini

In [34]:
llm(prompt)

'{\n  "Relevance": "RELEVANT",\n  "Explanation": "The generated answer accurately describes the main distinctions between SEO and SEM by highlighting that SEO focuses on organic search results while SEM encompasses both organic and paid strategies. This directly addresses the question about search strategies."\n}'

In [35]:
df_sample = df_question.sample(n=100, random_state=1)

In [36]:
sample = df_sample.to_dict(orient='records')

In [37]:
evaluations = []

for record in tqdm(sample):
    question = record['Question']
    answer_llm = rag(question)

    prompt = prompt_template_rag_evaluation.format(question=question, answer_llm=answer_llm)
    evaluation = llm(prompt)
    evaluation = json.loads(evaluation)
    
    evaluations.append((record, answer_llm, evaluation))

  0%|          | 0/100 [00:00<?, ?it/s]

In [38]:
df_eval = pd.DataFrame(evaluations, columns=['record', 'answer_llm', 'evaluation'])

In [39]:
df_eval['id']=df_eval.record.apply(lambda x: x['Question_ID'])
df_eval['question']=df_eval.record.apply(lambda x: x['Question'])

df_eval['relevance']=df_eval.evaluation.apply(lambda x: x['Relevance'])
df_eval['explanation']=df_eval.evaluation.apply(lambda x: x['Explanation'])

In [40]:
del df_eval['record']
del df_eval['evaluation']

In [41]:
df_eval

Unnamed: 0,answer_llm,id,question,relevance,explanation
0,The primary functions of a social media manage...,52,What are the primary functions of a social med...,PARTLY_RELEVANT,The generated answer addresses one of the prim...
1,The key components examined in a technical ana...,5,What are the key components examined in a tech...,RELEVANT,The generated answer directly addresses the qu...
2,The context provided does not specify the type...,38,What types of marketing variables are typicall...,PARTLY_RELEVANT,The generated answer acknowledges the marketin...
3,A heatmap visually represents user activity on...,10,How does a heatmap visually represent the area...,PARTLY_RELEVANT,The generated answer provides some information...
4,You can analyze the performance of your email ...,59,In what ways can I analyze the performance of ...,PARTLY_RELEVANT,The generated answer addresses the question by...
...,...,...,...,...,...
95,While the provided context does not explicitly...,28,Why is it important to have a clearly defined ...,RELEVANT,The generated answer addresses the question by...
96,A marketing operations consultant provides exp...,115,What role does a marketing operations consulta...,PARTLY_RELEVANT,The generated answer identifies that a marketi...
97,A business can utilize a customer journey map ...,14,In what ways can a business utilize a customer...,RELEVANT,The generated answer effectively addresses the...
98,"In a marketing operations industry report, you...",136,What type of insights can I expect to find in ...,RELEVANT,The generated answer directly addresses the qu...


In [42]:

df_eval.relevance.value_counts()

relevance
RELEVANT           58
PARTLY_RELEVANT    25
NON_RELEVANT       17
Name: count, dtype: int64

In [43]:
df_eval.relevance.value_counts(normalize=True)

relevance
RELEVANT           0.58
PARTLY_RELEVANT    0.25
NON_RELEVANT       0.17
Name: proportion, dtype: float64

In [44]:
df_eval.to_csv('../data/rag-eval-gpt-4o-mini.csv', index=False)

In [45]:
df_eval[df_eval.relevance == 'NON_RELEVANT']

Unnamed: 0,answer_llm,id,question,relevance,explanation
5,The provided context does not specify the topi...,118,What topics are covered in the marketing opera...,NON_RELEVANT,The generated answer does not address the ques...
8,The context provided does not include specific...,124,What skills are important for a successful mar...,NON_RELEVANT,The generated answer explicitly states that it...
9,The context provided does not contain specific...,109,What are the advantages of using outbound mark...,NON_RELEVANT,The generated answer explicitly states that it...
10,The provided context does not contain specific...,93,Can you explain how social media marketing dif...,NON_RELEVANT,The generated answer does not address the ques...
20,The provided context does not specify typical ...,89,What are the typical reasons for performing a ...,NON_RELEVANT,The generated answer does not address the ques...
21,The provided context does not specify the skil...,114,What skills are typically required for someone...,NON_RELEVANT,The generated answer does not address the ques...
27,The provided context does not include specific...,12,Are there any examples of successful value pro...,NON_RELEVANT,The generated answer fails to provide any exam...
30,"I'm sorry, but the provided context does not c...",4,How can I check the backlinks pointing to my w...,NON_RELEVANT,The generated answer does not provide any usef...
35,The context provided does not specifically out...,12,What are the key components that make up a str...,NON_RELEVANT,The generated answer does not address the spec...
39,The provided context does not include specific...,79,Are there best practices for placement of a ca...,NON_RELEVANT,The generated answer does not address the ques...


#### II Evaluation with gpt-4o

In [46]:
evaluations_gpt4o = []

for record in tqdm(sample):
    question = record['Question']
    answer_llm = rag(question, model='gpt-4o')

    prompt = prompt_template_rag_evaluation.format(question=question, answer_llm=answer_llm)
    evaluation = llm(prompt)
    evaluation = json.loads(evaluation)
    
    evaluations_gpt4o.append((record, answer_llm, evaluation))

  0%|          | 0/100 [00:00<?, ?it/s]

In [47]:

df_eval_2 = pd.DataFrame(evaluations_gpt4o, columns=['record', 'answer_llm', 'evaluation'])

df_eval_2['id']=df_eval_2.record.apply(lambda x: x['Question_ID'])
df_eval_2['question']=df_eval_2.record.apply(lambda x: x['Question'])

df_eval_2['relevance']=df_eval_2.evaluation.apply(lambda x: x['Relevance'])
df_eval_2['explanation']=df_eval_2.evaluation.apply(lambda x: x['Explanation'])

In [48]:
del df_eval_2['record']
del df_eval_2['evaluation']

In [49]:
df_eval_2

Unnamed: 0,answer_llm,id,question,relevance,explanation
0,The primary functions of a social media manage...,52,What are the primary functions of a social med...,PARTLY_RELEVANT,The generated answer addresses one key functio...
1,A technical analysis during a website audit ex...,5,What are the key components examined in a tech...,RELEVANT,The generated answer directly addresses the ke...
2,The CONTEXT does not provide specific informat...,38,What types of marketing variables are typicall...,NON_RELEVANT,The generated answer fails to address the spec...
3,A heatmap visually represents the areas of a w...,10,How does a heatmap visually represent the area...,RELEVANT,The generated answer accurately describes how ...
4,You can analyze the performance of your email ...,59,In what ways can I analyze the performance of ...,PARTLY_RELEVANT,The generated answer mentions email marketing ...
...,...,...,...,...,...
95,The provided context does not directly answer ...,28,Why is it important to have a clearly defined ...,NON_RELEVANT,The generated answer does not address the ques...
96,A marketing operations consultant plays a role...,115,What role does a marketing operations consulta...,PARTLY_RELEVANT,The generated answer addresses the question by...
97,A business can utilize a customer journey map ...,14,In what ways can a business utilize a customer...,RELEVANT,The generated answer directly addresses the qu...
98,You can expect to find insights into the trend...,136,What type of insights can I expect to find in ...,RELEVANT,The generated answer directly addresses the qu...


In [50]:
df_eval_2.relevance.value_counts()

relevance
RELEVANT           49
PARTLY_RELEVANT    30
NON_RELEVANT       21
Name: count, dtype: int64

In [51]:
df_eval_2.relevance.value_counts(normalize=True)

relevance
RELEVANT           0.49
PARTLY_RELEVANT    0.30
NON_RELEVANT       0.21
Name: proportion, dtype: float64

In [52]:
df_eval_2.to_csv('../data/rag-eval-gpt-4o.csv', index=False)

In [53]:
df_eval_2[df_eval_2.relevance == 'NON_RELEVANT']

Unnamed: 0,answer_llm,id,question,relevance,explanation
2,The CONTEXT does not provide specific informat...,38,What types of marketing variables are typicall...,NON_RELEVANT,The generated answer fails to address the spec...
5,The CONTEXT does not provide specific details ...,118,What topics are covered in the marketing opera...,NON_RELEVANT,The generated answer fails to address the ques...
7,The provided CONTEXT does not include specific...,91,What information is typically included in a we...,NON_RELEVANT,The generated answer does not address the ques...
8,The CONTEXT does not provide specific details ...,124,What skills are important for a successful mar...,NON_RELEVANT,The generated answer fails to address the ques...
9,The provided context doesn't explicitly detail...,109,What are the advantages of using outbound mark...,NON_RELEVANT,The generated answer does not address the ques...
21,The CONTEXT does not provide specific informat...,114,What skills are typically required for someone...,NON_RELEVANT,The generated answer does not address the ques...
22,"Unfortunately, the provided context does not i...",130,How can I participate in a marketing operation...,NON_RELEVANT,The generated answer does not provide any info...
26,The CONTEXT does not provide specific types of...,108,What types of content are typically used in in...,NON_RELEVANT,The generated answer does not address the ques...
27,"Based on the context provided, there are no sp...",12,Are there any examples of successful value pro...,NON_RELEVANT,The generated answer does not provide any exam...
30,The context provided does not contain informat...,4,How can I check the backlinks pointing to my w...,NON_RELEVANT,The generated answer states that it does not c...
