In [1]:
import pandas as pd
import json
import time


In [2]:
from openai import OpenAI


# Shared OpenAI client used by the completion calls further down.
# No credentials are passed explicitly — presumably the API key is picked up
# from the environment (e.g. OPENAI_API_KEY); confirm before running.
client = OpenAI()
#from error import APIConnectionError

In [36]:
# Load the pre-chunked records that the questions are generated from.
df=pd.read_csv('../data/clean_data/date_chunked_5s.csv')

In [37]:
df.head()

Unnamed: 0,id,content,number of sentences,number of words
0,0,this is the first I see an egg in bath ingredi...,5,56
1,1,My husband is going through his first vomit/di...,5,49
2,2,"На вкус как жареные грибы, кисловатые.. Чувств...",5,41
3,3,"Это я с бухты барахты приготовил и пробую, а к...",5,204
4,4,"Juice of lemon, naturally sparkling water, hon...",5,150


In [38]:
# One plain dict per DataFrame row (column name -> value), so each record can
# later be unpacked straight into `prompt_template.format(**doc)`.
documents = df.to_dict(orient='records')
documents[0:2]

[{'id': 0,
  'content': 'this is the first I see an egg in bath ingredients and egg?. The plastic might even be better (especially if you have hard plastic) considering you have the cloth in between, because stainless steel could draw EMFs Your love Better anything non processed.. Sea water, an egg, sea salt, ACV, a little Urine.. .. .',
  'number of sentences': 5,
  'number of words': 56},
 {'id': 1,
  'content': 'My husband is going through his first vomit/diarrhea detox, hydration formula or egg is not what he wants.. He is craving orange juice or lemonade.. What is good to give him to help detox?. is pasteurised milk gonna be better than nothing in the bath?. Впервые попробовал тухлую печень.',
  'number of sentences': 5,
  'number of words': 49}]

In [39]:
# Prompt sent to the model for each record.
# - `{content}` is the only placeholder and is filled in with str.format.
# - The doubled braces around the JSON example are str.format escapes, so they
#   come out as literal `{` / `}` in the rendered prompt.
# - `.strip()` drops the leading and trailing newlines that the triple-quoted
#   layout would otherwise add.
prompt_template = """
You emulate a user of our primal health adviser application.
Formulate 5 questions this user might ask based on chunked content of conversation.
Make the questions specific to that chunk.
The record should contain the answer to the questions, and the questions should
be complete and not too short. Use as fewer words as possible from the record. 

The record:

Chunked_Content: {content}


Provide the output in parsable JSON without using code blocks:

{{"questions": ["question1", "question2", ..., "question5"]}}
""".strip()

In [40]:
prompt = prompt_template.format(**documents[0])

In [41]:
prompt

'You emulate a user of our primal health adviser application.\nFormulate 5 questions this user might ask based on chunked content of conversation.\nMake the questions specific to that chunk.\nThe record should contain the answer to the questions, and the questions should\nbe complete and not too short. Use as fewer words as possible from the record. \n\nThe record:\n\nChunked_Content: this is the first I see an egg in bath ingredients and egg?. The plastic might even be better (especially if you have hard plastic) considering you have the cloth in between, because stainless steel could draw EMFs Your love Better anything non processed.. Sea water, an egg, sea salt, ACV, a little Urine.. .. .\n\n\nProvide the output in parsable JSON without using code blocks:\n\n{"questions": ["question1", "question2", ..., "question5"]}'

In [42]:
# Print the rendered prompt by name. IPython's `_` only holds the prompt when
# the previous cell happened to be the last one executed, so relying on it
# breaks as soon as cells are run out of order.
print(prompt)

You emulate a user of our primal health adviser application.
Formulate 5 questions this user might ask based on chunked content of conversation.
Make the questions specific to that chunk.
The record should contain the answer to the questions, and the questions should
be complete and not too short. Use as fewer words as possible from the record. 

The record:

Chunked_Content: this is the first I see an egg in bath ingredients and egg?. The plastic might even be better (especially if you have hard plastic) considering you have the cloth in between, because stainless steel could draw EMFs Your love Better anything non processed.. Sea water, an egg, sea salt, ACV, a little Urine.. .. .


Provide the output in parsable JSON without using code blocks:

{"questions": ["question1", "question2", ..., "question5"]}


In [43]:
def llm(prompt):
    """Send ``prompt`` as a single user message and return the reply text.

    The request goes through the notebook-level ``client`` using the
    ``gpt-4o-mini`` model; the content of the first returned choice is
    handed back unchanged.
    """
    user_message = {"role": "user", "content": prompt}
    completion = client.chat.completions.create(
        model='gpt-4o-mini',
        messages=[user_message],
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

In [44]:
questions = llm(prompt)
questions

'{"questions":["Why is there an egg included in the bath ingredients?","What materials are recommended for the bath, specifically regarding plastic and stainless steel?","Could you explain the benefits of using sea water in the bath?","What is the role of sea salt and ACV in this bath recipe?","Is urine used in bath ingredients for a specific reason?"]}'

In [45]:
import json

In [46]:
json.loads(questions)

{'questions': ['Why is there an egg included in the bath ingredients?',
  'What materials are recommended for the bath, specifically regarding plastic and stainless steel?',
  'Could you explain the benefits of using sea water in the bath?',
  'What is the role of sea salt and ACV in this bath recipe?',
  'Is urine used in bath ingredients for a specific reason?']}

In [47]:
def generate_questions(doc):
    """Ask the model for questions about one record and return its raw reply.

    Args:
        doc: Mapping whose keys fill the placeholders of ``prompt_template``
            (``content`` in the template defined in this notebook). Keys the
            template does not reference are ignored by ``str.format``.

    Returns:
        The message content of the first choice, exactly as returned by the
        API. It is not parsed here; callers decode it with ``json.loads``.
    """
    prompt = prompt_template.format(**doc)

    response = client.chat.completions.create(
        model='gpt-4o',
        messages=[{"role": "user", "content": prompt}],
        # JSON mode: the reply is constrained to a single JSON object, so the
        # caller's json.loads does not trip over code fences or stray prose.
        # The API requires the word "JSON" to appear in the messages for this
        # mode; prompt_template already asks for "parsable JSON".
        response_format={"type": "json_object"},
    )

    return response.choices[0].message.content

In [48]:
from tqdm.auto import tqdm

In [49]:
# Maps a document id to its list of generated questions. The worker function
# further down checks this dict so ids that are already present are skipped
# when the generation cell is run again.
results = {}


In [50]:
len(documents)

5837

In [51]:
# Generating questions for 49k documents was taking too long

In [52]:
# Draw a random subset of `documents` for evaluation (generating questions for
# every record was too slow). The subset size is `number`.
import random
number = 200                           # set the number to select here.
sample_seed = 42                       # fixed so the same subset is drawn on every run
# A dedicated seeded generator keeps the sample reproducible across kernel
# restarts without touching the global `random` state. The size is capped at
# the population size because sample() raises ValueError when asked for more.
random_docs = random.Random(sample_seed).sample(documents, min(number, len(documents)))
len(random_docs)


200

## Parallelization


In [53]:
from concurrent.futures import ThreadPoolExecutor, as_completed
def process_document(doc):
    """Generate and parse the questions for a single document.

    Args:
        doc: Record dict with an ``id`` key; the whole dict is passed on to
            ``generate_questions``.

    Returns:
        ``(doc_id, questions)`` on success, or ``None`` when the id is already
        present in ``results`` or the model reply could not be used (the
        reason is printed).
    """
    doc_id = doc['id']
    if doc_id in results:
        return None  # Skip this document if already processed
    try:
        questions_raw = generate_questions(doc)
        parsed = json.loads(questions_raw)
        question_list = parsed['questions']
        if not isinstance(question_list, list):
            # A bare string here would later be iterated character by character.
            raise TypeError(
                f"'questions' should be a list, got {type(question_list).__name__}"
            )
        return (doc_id, question_list)
    except (json.JSONDecodeError, KeyError, TypeError) as e:
        # TypeError covers a reply with no text (json.loads(None)) and valid
        # JSON whose top level is not an object, e.g. a bare list.
        print(f"Error processing doc_id {doc_id}: {e}")
        return None

# Track time
t0 = time.time()

# Create a ThreadPoolExecutor with a suitable number of workers
with ThreadPoolExecutor(max_workers=3) as executor:
    # Submit all tasks. Submission returns immediately, so it is not wrapped
    # in a progress bar; progress is reported as tasks finish instead.
    futures = {executor.submit(process_document, doc): doc for doc in random_docs}

    total_tasks = len(futures)
    # Collect results as they complete. `as_completed` yields from a generator
    # with no length, so tqdm needs `total` to draw a real progress bar.
    for future in tqdm(as_completed(futures), total=total_tasks):
        try:
            result = future.result()
        except Exception as e:
            # future.result() re-raises whatever the worker raised (for
            # example a failed API call). Report it and keep collecting the
            # remaining documents instead of aborting the whole run.
            failed_doc = futures[future]
            print(f"Error processing doc_id {failed_doc['id']}: {e}")
            continue
        if result is not None:
            # Named `doc_questions` so the notebook-level `questions` string
            # from the single-prompt example is not overwritten.
            doc_id, doc_questions = result
            results[doc_id] = doc_questions

# End time
t1 = time.time()
# Print time taken
print(f"Time taken: {(t1 - t0)/60} minutes")


  0%|          | 0/200 [00:00<?, ?it/s]

0it [00:00, ?it/s]

Time taken: 1.5109469215075175 minutes


In [54]:
len(results)


200

In [55]:
# no need to use the following code

In [56]:
# import time
# t0=time.time()
# for doc in tqdm(random_docs[:1000]): 
#     doc_id = doc['id']
#     if doc_id in results:
#         continue
#     try:
#         questions_raw = generate_questions(doc)
#         questions = json.loads(questions_raw)  # If an error happens here, the next line will not run
#         results[doc_id] = questions['questions']  # This line will be skipped if there's an error in json.loads()
#     except (json.JSONDecodeError, KeyError) as e:
#         print(f"Error processing doc_id {doc_id}: {e}")
    
# t1=time.time()
# print((t0-t1)/60)

In [57]:
len(results)
type(results)

dict

In [58]:
# Flatten {doc_id: [question, ...]} into one (doc_id, question) pair per
# question. A comprehension keeps its loop variables local, so the
# notebook-level `questions` defined earlier is not rebound by this cell.
final_results = [
    (doc_id, question)
    for doc_id, doc_questions in results.items()
    for question in doc_questions
]

In [59]:
final_results[0:10]

[(608, 'What makes you say that threatening your family is not acceptable?'),
 (608, 'Why do you feel that someone needs to get a life?'),
 (608, 'Have you experienced any recent threats to your family?'),
 (608,
  'What would you consider an appropriate response to someone threatening your family?'),
 (608,
  'Have you taken any steps to address these threats towards your family?'),
 (1011, "Where can I find a thin silk floss that's unwaxed like silk thread?"),
 (1011,
  'Is there any reason to be cautious about the composition of the Zahnseide floss?'),
 (1011, 'What is the composition of Yaweco Biodegradable Dental Floss?'),
 (1011,
  'How frequently should I use a dental floss brush, floss, and waterpik?'),
 (1011, 'What is a traditional way of flossing that helps kill bacteria?')]

In [60]:
len(final_results)

1000

In [61]:
# One row per (document id, question) pair.
# NOTE: this rebinds `df`, which earlier in the notebook held the chunked
# source records loaded from CSV.
df = pd.DataFrame(final_results, columns=['id', 'question'])
df.head()

Unnamed: 0,id,question
0,608,What makes you say that threatening your famil...
1,608,Why do you feel that someone needs to get a life?
2,608,Have you experienced any recent threats to you...
3,608,What would you consider an appropriate respons...
4,608,Have you taken any steps to address these thre...


In [62]:
# Persist the (id, question) pairs; index=False keeps the pandas row index
# out of the file, so the CSV has exactly the two named columns.
df.to_csv('../data/clean_data/ground-truth-data_final.csv', index=False)

In [64]:
!head '../data/clean_data/ground-truth-data_final.csv'

id,question
608,What makes you say that threatening your family is not acceptable?
608,Why do you feel that someone needs to get a life?
608,Have you experienced any recent threats to your family?
608,What would you consider an appropriate response to someone threatening your family?
608,Have you taken any steps to address these threats towards your family?
1011,Where can I find a thin silk floss that's unwaxed like silk thread?
1011,Is there any reason to be cautious about the composition of the Zahnseide floss?
1011,What is the composition of Yaweco Biodegradable Dental Floss?
1011,"How frequently should I use a dental floss brush, floss, and waterpik?"
