Generating the ground-truth data

step1: Load data

In [1]:
import json
import hashlib
from collections import defaultdict
from openai import OpenAI
from tqdm.auto import tqdm
import pandas as pd
from elasticsearch import Elasticsearch

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
#loading data
with open('documents.json', 'r') as file:  # Replace 'data.json' with the path to your file
    data = json.load(file)

In [3]:
documents = []

for article_dict in data:
    for doc in article_dict['documents']:
        doc['Title'] = article_dict['Title']
        documents.append(doc)

documents[1]

{'body': 'A typical data team consists of the following roles: All these people work to create a data product. To explain the core responsibilities of each role, we will use a case scenario: Suppose we work at an online classifieds company. It’s a platform where users can go to sell things they don’t need (like OLX, where I work). If a user has an iPhone they want to sell — they go to this website, create a listing and sell their phone. On this platform, sellers sometimes have problems with identifying the correct category for the items they are selling. To help them, we want to build a service that suggests the best category. To sell their iPhone, the user creates a listing and the site needs to automatically understand that this iPhone has to go in the “mobile phones” category. Let’s start with the first role: product manager.',
 'Title': 'Roles in a Data Team'}

Step2: Generate document id

In [4]:
def generate_document_id(doc):
    combined = f"{doc['Title']}-{doc['body']}"
    hash_object = hashlib.md5(combined.encode())
    hash_hex = hash_object.hexdigest()
    document_id = hash_hex[:8]
    return document_id

In [5]:
for doc in documents:
    doc['id'] = generate_document_id(doc)

In [6]:
documents[1]

{'body': 'A typical data team consists of the following roles: All these people work to create a data product. To explain the core responsibilities of each role, we will use a case scenario: Suppose we work at an online classifieds company. It’s a platform where users can go to sell things they don’t need (like OLX, where I work). If a user has an iPhone they want to sell — they go to this website, create a listing and sell their phone. On this platform, sellers sometimes have problems with identifying the correct category for the items they are selling. To help them, we want to build a service that suggests the best category. To sell their iPhone, the user creates a listing and the site needs to automatically understand that this iPhone has to go in the “mobile phones” category. Let’s start with the first role: product manager.',
 'Title': 'Roles in a Data Team',
 'id': '80c6bcf6'}

Step3: Checking how unique the hashes are

In [7]:
hashes = defaultdict(list)

for doc in documents:
    doc_id = doc['id']
    hashes[doc_id].append(doc)

In [8]:
len(hashes), len(documents)

(42, 42)

Step4: Saving the document

In [9]:
with open('documents-with-ids.json', 'wt') as f_out:
    json.dump(documents, f_out, indent=2)

In [10]:
!head documents-with-ids.json

[
  {
    "body": "14 Dec 2020 byAlexey Grigorev",
    "Title": "Roles in a Data Team",
    "id": "d2d8f692"
  },
  {
    "body": "A typical data team consists of the following roles: All these people work to create a data product. To explain the core responsibilities of each role, we will use a case scenario: Suppose we work at an online classifieds company. It\u2019s a platform where users can go to sell things they don\u2019t need (like OLX, where I work). If a user has an iPhone they want to sell \u2014 they go to this website, create a listing and sell their phone. On this platform, sellers sometimes have problems with identifying the correct category for the items they are selling. To help them, we want to build a service that suggests the best category. To sell their iPhone, the user creates a listing and the site needs to automatically understand that this iPhone has to go in the \u201cmobile phones\u201d category. Let\u2019s start with the first role: product manager.",
    "T

Step5: Use an LLM to generate user questions for each record

In [11]:
#building the prompt
prompt_template = """
You emulate a user who wants to get more info from the articles.
Formulate 5 questions this user may ask based on a record. The record
should contain the body from which the answer to the questions are drawn from, and the questions should be complete and not too short.
If possible, use as fewer words as possible from the record. 

The record:

body: {body}

Provide the output in parsable JSON without using code blocks:

["question1", "question2", ..., "question5"]
""".strip()

In [12]:
client = OpenAI(
    base_url='http://localhost:11434/v1/',
    api_key='ollama',
)

In [13]:
def generate_questions(doc):
    prompt = prompt_template.format(**doc)

    response = client.chat.completions.create(
        model='gemma:2b',
        messages=[{"role": "user", "content": prompt}]
    )

    json_response = response.choices[0].message.content
    return json_response

In [14]:
results = {}

In [15]:
for doc in tqdm(documents): 
    doc_id = doc['id']
    if doc_id in results:
        continue

    questions = generate_questions(doc)
    results[doc_id] = questions

  0%|          | 0/42 [00:00<?, ?it/s]

100%|██████████| 42/42 [10:44<00:00, 15.35s/it]


In [16]:
results['ab83ca73']

'{\n  "question1": "What are the three main areas that data scientists should focus on?",\n  "question2": "What is the content of this article based on?",\n  "question3": "What is the purpose of the podcast episode \'The ABC\'s of Data Sciencewith Danny Ma at DataTalks.Club\'?",\n  "question4": "What are some tips to learn new skills?",\n  "question5": "What\'s the influence of the podcast episode on this article?":\n}'

Step6: Storing the data

In [17]:
parsed_results = {}

for doc_id, json_questions in results.items():
    try:
        # Try to parse the JSON string as-is
        parsed_results[doc_id] = json.loads(json_questions)
    except json.JSONDecodeError as e:
        print(f"Error parsing JSON for document ID {doc_id}: {e}")
        print(f"Attempting to fix the JSON string for document ID {doc_id}...")

        # Fix the JSON by adding missing commas between key-value pairs
        fixed_json = json_questions.replace('"\n "', '",\n "')

        try:
            # Try to parse the fixed JSON
            parsed_results[doc_id] = json.loads(fixed_json)
        except json.JSONDecodeError as e:
            print(f"Unable to fix the JSON for document ID {doc_id}: {e}")
            print(f"Problematic JSON string: {fixed_json}")

Error parsing JSON for document ID 80c6bcf6: Expecting ',' delimiter: line 6 column 96 (char 442)
Attempting to fix the JSON string for document ID 80c6bcf6...
Unable to fix the JSON for document ID 80c6bcf6: Expecting ',' delimiter: line 6 column 96 (char 442)
Problematic JSON string: {
  "question1": "What is the primary responsibility of a product manager in this scenario?",
  "question2": "What is the purpose of this category suggestion service?",
  "question3": "What type of platform does this scenario apply to?",
  "question4": "How does this category suggestion service help sellers to accurately classify their items?",
  "question5": "What are some of the challenges faced by the product manager in this scenario?"
Error parsing JSON for document ID e8f74f94: Expecting ',' delimiter: line 6 column 99 (char 389)
Attempting to fix the JSON string for document ID e8f74f94...
Unable to fix the JSON for document ID e8f74f94: Expecting ',' delimiter: line 6 column 99 (char 389)
Problema

In [19]:
len(parsed_results)

17

In [20]:
doc_index = {d['id']: d for d in documents}

In [23]:
parsed_results

{'d2d8f692': {'question1': 'What was the purpose of the trip?',
  'question2': 'What specific details were included in the article?',
  'question3': 'Are there any notable quotes or anecdotes from the article?',
  'question4': 'What was the result of the trip?',
  'question5': 'What were the conclusions drawn from the outcome?'},
 '99ce651b': {'question1': 'What is the difference between a data scientist and a data analyst?',
  'question2': 'How does a data scientist use data to create a machine learning model?',
  'question3': 'What does a data scientist focus more on when creating a machine learning service?',
  'question4': "What is the main focus of a data scientist's job?",
  'question5': "How does a data scientist's focus on engineering impact the final product?"},
 'f10a49a7': {'question1': 'What is the main focus of machine learning engineers?',
  'question2': 'What are the skills that machine learning engineers have that are similar to that of data engineers?',
  'question3': 

In [25]:
final_results = []

for doc_id, questions in parsed_results.items():
    title = doc_index[doc_id]['Title']
    body = doc_index[doc_id]['body']
    for question_number, question in questions.items():
        final_results.append((question, title, body, doc_id))

In [27]:
len(final_results)

85

In [26]:
final_results

[('What was the purpose of the trip?',
  'Roles in a Data Team',
  '14 Dec 2020 byAlexey Grigorev',
  'd2d8f692'),
 ('What specific details were included in the article?',
  'Roles in a Data Team',
  '14 Dec 2020 byAlexey Grigorev',
  'd2d8f692'),
 ('Are there any notable quotes or anecdotes from the article?',
  'Roles in a Data Team',
  '14 Dec 2020 byAlexey Grigorev',
  'd2d8f692'),
 ('What was the result of the trip?',
  'Roles in a Data Team',
  '14 Dec 2020 byAlexey Grigorev',
  'd2d8f692'),
 ('What were the conclusions drawn from the outcome?',
  'Roles in a Data Team',
  '14 Dec 2020 byAlexey Grigorev',
  'd2d8f692'),
 ('What is the difference between a data scientist and a data analyst?',
  'Roles in a Data Team',
  'The roles of a data scientist and data analyst are pretty similar. In some companies, it’s the same person who does both jobs. However, data scientists typically focus more on predicting rather than explaining. A data analyst fetches the data, looks at it, explain

In [28]:
df = pd.DataFrame(final_results, columns=['question', 'title', 'body', 'document'])

In [29]:
df.shape

(85, 4)

In [30]:
df.to_csv('ground-truth-data.csv', index=False)

In [31]:
!head ground-truth-data.csv

question,title,body,document
What was the purpose of the trip?,Roles in a Data Team,14 Dec 2020 byAlexey Grigorev,d2d8f692
What specific details were included in the article?,Roles in a Data Team,14 Dec 2020 byAlexey Grigorev,d2d8f692
Are there any notable quotes or anecdotes from the article?,Roles in a Data Team,14 Dec 2020 byAlexey Grigorev,d2d8f692
What was the result of the trip?,Roles in a Data Team,14 Dec 2020 byAlexey Grigorev,d2d8f692
What were the conclusions drawn from the outcome?,Roles in a Data Team,14 Dec 2020 byAlexey Grigorev,d2d8f692
What is the difference between a data scientist and a data analyst?,Roles in a Data Team,"The roles of a data scientist and data analyst are pretty similar. In some companies, it’s the same person who does both jobs. However, data scientists typically focus more on predicting rather than explaining. A data analyst fetches the data, looks at it, explains what’s going on to the team, and gives some recommendations on what to do about it. A 

Elastic Search evaluation with text

Step1: Loading the data

In [32]:
with open('documents-with-ids.json', 'rt') as f_in:
    documents = json.load(f_in)

Step2: Index the data

In [33]:
es_client = Elasticsearch('http://localhost:9200') 

In [34]:
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "heading": {"type": "text"},
            "body": {"type": "text"},
            "Title": {"type": "keyword"} ,
            "id": {"type": "keyword"}
        }
    }
}

index_name = "data-roles-eval"

es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'data-roles-eval'})

In [35]:
for doc in tqdm(documents):
    es_client.index(index=index_name, document=doc)

100%|██████████| 42/42 [00:00<00:00, 98.65it/s] 


Step3: Building search model

In [36]:
def elastic_search(query, title, index_name="data-roles-eval"):
    search_query = {
        "size": 3,  # Retrieve only the top result based on the highest score
        "query": {
            "bool": {
                "should": [
                    {
                        "multi_match": {
                            "query": query,
                            "fields": ["heading^4", "body"],  # Boost 'heading' field
                            "type": "best_fields",  # Best field matching strategy
                            "fuzziness": "AUTO"  # Allow slight variations in text
                        }
                    }
                ],
                "filter": [
                    {
                        "term": {
                            "Title": title  # Make sure 'Title' field is indexed as a keyword
                        }
                    }
                ]
            }
        }
    }

    response = es_client.search(index=index_name, body=search_query)

    result_docs = []
    for hit in response['hits']['hits']:
        result_docs.append(hit['_source'])

    return result_docs

In [37]:
elastic_search('whos is a data scientist', "Roles in a Data Team")

[{'heading': 'Data Scientist',
  'body': 'The roles of a data scientist and data analyst are pretty similar. In some companies, it’s the same person who does both jobs. However, data scientists typically focus more on predicting rather than explaining. A data analyst fetches the data, looks at it, explains what’s going on to the team, and gives some recommendations on what to do about it. A data scientist, on the other hand, focuses more on creating machine learning services. For example, one of the questions that a data scientist would want to answer is “How can we use this data to build a machine learning model for predicting something?” In other words, data scientists incorporate the data into the product. Their focus is more on engineering than analysis. Data scientists work more closely with engineers on integrating data solutions into the product. The skills of data scientists include: For our example, the data scientists are the people who develop the model used for predicting t

Step4: Evaluation

In [38]:
df_ground_truth = pd.read_csv('ground-truth-data.csv')

In [39]:
ground_truth = df_ground_truth.to_dict(orient='records')

In [40]:
relevance_total = []

for q in tqdm(ground_truth):
    doc_id = q['document']
    results = elastic_search(query=q['question'], title=q['title'])
    relevance = [d['id'] == doc_id for d in results]
    relevance_total.append(relevance)

100%|██████████| 85/85 [00:01<00:00, 61.73it/s]


In [41]:
def hit_rate(relevance_total):
    cnt = 0

    for line in relevance_total:
        if True in line:
            cnt = cnt + 1

    return cnt / len(relevance_total)

In [42]:
def mrr(relevance_total):
    total_score = 0.0

    for line in relevance_total:
        for rank in range(len(line)):
            if line[rank] == True:
                total_score = total_score + 1 / (rank + 1)

    return total_score / len(relevance_total)

In [43]:
hit_rate(relevance_total), mrr(relevance_total)

(0.7411764705882353, 0.5784313725490197)