### 1) Load Packages 

In [29]:
import json
import pandas as pd
import minsearch
from openai import OpenAI
from tqdm.auto  import tqdm

### 2) Setting Connection

In [2]:
# Credentials are deliberately not written into the notebook: with no explicit
# api_key the OpenAI client reads it from the OPENAI_API_KEY environment
# variable and raises immediately if that variable is unset.
open_client = OpenAI()

### 2) Load Data

In [3]:
# Load the exercise dataset from the CSV file into a DataFrame.
data_df = pd.read_csv('../data/data.csv')


# Convert to a list with one dict per row, keyed by column name.
documents = data_df.to_dict(orient='records')


# Display the first record to check what was loaded.
documents[0]

{'id': 0,
 'exercise_name': 'Push-Ups',
 'type_of_activity': 'Strength',
 'type_of_equipment': 'Bodyweight',
 'body_part': 'Upper Body',
 'type': 'Push',
 'muscle_groups_activated': 'Pectorals, Triceps, Deltoids',
 'instructions': 'Start in a high plank position with your hands under your shoulders. Lower your body until your chest nearly touches the floor. Push back up to the starting position.'}

### 3) Retrieval (Minsearch)

In [4]:
# Prompt used to ask the model for 5 user-style questions about one exercise.
# The single-brace fields ({exercise_name}, {type}, ...) are filled in with
# str.format from a record's keys; the doubled braces around the JSON example
# are escapes that render as literal "{" and "}" after formatting.
# .strip() removes the leading and trailing newlines of the triple-quoted text.
prompt_template = """
You emulate a user of our fitness assistant application.
Formulate 5 questions this user might ask based on a provided exercise.
Make the questions specific to this exercise.
The record should contain the answer to the questions, and the questions should
be complete and not too short. Use as fewer words as possible from the record. 

The record:

exercise_name: {exercise_name}
type_of_activity: {type_of_activity}
type_of_equipment: {type_of_equipment}
body_part: {body_part}
type: {type}
muscle_groups_activated: {muscle_groups_activated}
instructions: {instructions}

Provide the output in parsable JSON without using code blocks:

{{"questions": ["question1", "question2", ..., "question5"]}}
""".strip()


In [5]:
def llm(documents, model='gpt-4o-mini'):
    """Send one exercise record to the chat model and return its raw reply.

    Args:
        documents: A single record (a mapping), despite the plural name. Its
            keys are unpacked into the placeholders of ``prompt_template``.
        model: Name of the chat-completion model to call.

    Returns:
        The message content of the first choice in the response, unparsed.
    """
    user_message = {
        "role": "user",
        "content": prompt_template.format(**documents),
    }

    completion = open_client.chat.completions.create(
        model=model,
        messages=[user_message],
    )

    first_choice = completion.choices[0]
    return first_choice.message.content


In [35]:
# Maps a document id to the list of questions generated for it. It is created
# in its own cell so that re-running the generation loop below does not reset
# it: the loop skips ids that are already present here.
ground_truth = {}

In [36]:
# Generate questions for every record, one LLM call per record.
for doc in tqdm(documents):

    doc_id = doc['id']

    # Resume support: ids already in ground_truth are skipped, so re-running
    # this cell after a failure only processes the remaining records.
    if doc_id in ground_truth:
        continue

    result_raw = llm(doc)

    # The reply is free-form model output: it may be empty (None), not valid
    # JSON, or JSON without a "questions" key. Fail fast as before, but say
    # which record failed and what the model returned.
    try:
        result = json.loads(result_raw)
        questions = result['questions']
    except (TypeError, ValueError, KeyError) as exc:
        raise ValueError(
            f"Could not parse generated questions for document id {doc_id!r}; "
            f"raw model output: {result_raw!r}"
        ) from exc

    ground_truth[doc_id] = questions


100%|██████████| 207/207 [05:41<00:00,  1.65s/it]


In [37]:
# Flatten {doc_id: [question, ...]} into one (doc_id, question) pair per
# question, preserving the dict's iteration order.
final_results = [
    (doc_id, question)
    for doc_id, doc_questions in ground_truth.items()
    for question in doc_questions
]

In [38]:
# Display the first (id, question) pair as a quick sanity check.
final_results[0]

(0, 'What is the starting position for doing push-ups?')

In [39]:
# Build a two-column table from the (id, question) pairs.
df_results = pd.DataFrame(final_results, columns=['id', 'question'])

In [40]:
# Write the ground-truth table to CSV; index=False leaves out the row index.
df_results.to_csv('../data/ground-truth-retrieval.csv', index=False)