### 1) Load Packages 

In [74]:
import json
import pandas as pd
import minsearch
from openai import OpenAI
from tqdm.auto  import tqdm
import random

### 2) Setting Connection

In [31]:
# Create the OpenAI client used by `llm`. No key is written in the notebook:
# when `api_key` is omitted the client reads it from the OPENAI_API_KEY
# environment variable, so the secret never ends up in version control.
open_client = OpenAI()

### 2) Load Data

In [7]:
# Load the exercise table from CSV.
data_df = pd.read_csv('../data/data.csv')


# One dict per row (column name -> value); these records are what gets indexed.
documents = data_df.to_dict(orient='records')


# Display the first record to show which fields are available.
documents[0]

{'id': 0,
 'exercise_name': 'Push-Ups',
 'type_of_activity': 'Strength',
 'type_of_equipment': 'Bodyweight',
 'body_part': 'Upper Body',
 'type': 'Push',
 'muscle_groups_activated': 'Pectorals, Triceps, Deltoids',
 'instructions': 'Start in a high plank position with your hands under your shoulders. Lower your body until your chest nearly touches the floor. Push back up to the starting position.'}

### 3) Retrieval (Minsearch)

In [41]:
# Example question used to try out the RAG pipeline end to end.
query = 'Is the Lat Pulldown considered a strength training activity, and if so, why?'

In [10]:
# Columns of each exercise record that are searched as free text.
text_fields = [
    "exercise_name",
    "type_of_activity",
    "type_of_equipment",
    "body_part",
    "type",
    "muscle_groups_activated",
    "instructions",
]

# `id` is registered as a keyword field rather than a text field.
index = minsearch.Index(text_fields=text_fields, keyword_fields=['id'])

# Build the index over all exercise records.
index.fit(documents)

<minsearch.Index at 0x104c2ee50>

In [75]:
def search(query, boost=None, num_results=10):
    """Return the top matches for `query` from the module-level `index`.

    Args:
        query: Free-text question to search for.
        boost: Optional mapping of text-field name to boost weight, forwarded
            to the index as `boost_dict`. `None` (the default) means no boosts.
        num_results: Maximum number of documents to request from the index.

    Returns:
        Whatever `index.search` returns for these arguments.
    """
    # Build a fresh dict per call instead of sharing one mutable default
    # (`boost={}`) across every invocation.
    boost_dict = {} if boost is None else boost

    return index.search(
        query=query,
        filter_dict={},
        boost_dict=boost_dict,
        num_results=num_results,
    )

### 4) RAG Flow

In [15]:
# Prompt sent to the LLM: a role instruction followed by the question and the
# retrieved context.
# NOTE(review): "insrtuctor" is a typo for "instructor"; it is left untouched
# because the text is part of the prompt the model receives.
prompt_template = """
You're a fitness insrtuctor. Answer the QUESTION based on the CONTEXT from our exercises database.
Use only the facts from the CONTEXT when answering the QUESTION.

QUESTION: {question}

CONTEXT:
{context}
""".strip()


# How one retrieved exercise record is rendered inside the CONTEXT section.
entry_template = """
exercise_name: {exercise_name}
type_of_activity: {type_of_activity}
type_of_equipment: {type_of_equipment}
body_part: {body_part}
type: {type}
muscle_groups_activated: {muscle_groups_activated}
instructions: {instructions}
""".strip()


def build_prompt(query, search_results):
    """Render the final LLM prompt for `query` from the retrieved records.

    Each record in `search_results` is formatted with `entry_template` and
    followed by a blank line; the pieces are concatenated into the CONTEXT
    part of `prompt_template`. Surrounding whitespace is stripped from the
    finished prompt.
    """
    rendered_entries = [
        entry_template.format(**record) + "\n\n" for record in search_results
    ]
    context = "".join(rendered_entries)

    return prompt_template.format(question=query, context=context).strip()


In [43]:
def llm(prompt, model):
    """Send `prompt` as a single user message to `model` and return the reply text.

    Uses the module-level `open_client`; the returned value is the `content`
    of the first choice in the chat completion response.
    """
    user_message = {"role": "user", "content": prompt}

    completion = open_client.chat.completions.create(
        model=model,
        messages=[user_message],
    )

    first_choice = completion.choices[0]
    return first_choice.message.content


In [46]:
def rag(query, model='gpt-4o-mini'):
    """Answer `query` with retrieval-augmented generation.

    Retrieves records with `search`, turns them into a prompt with
    `build_prompt`, and returns what `llm` produces for that prompt using
    `model`.
    """
    retrieved = search(query)
    return llm(build_prompt(query, retrieved), model)




In [47]:
# Run the whole pipeline on the example question; the answer is shown as the cell output.
rag(query)

'Yes, the Lat Pulldown is considered a strength training activity. This classification is based on its primary objective to build and strengthen the upper body, specifically targeting muscle groups such as the Latissimus Dorsi and Biceps. The exercise involves using a machine to perform a pulling motion, which is typical of strength training exercises that aim to increase muscle strength and size.'

### 5) Retrieval Evaluation

In [52]:
# Ground-truth questions for retrieval evaluation, loaded from CSV.
ground_truth_df = pd.read_csv('../data/ground-truth-retrieval.csv')

# Same data as a list of per-row dicts, the form `evaluate` iterates over.
ground_truth = ground_truth_df.to_dict(orient="records")


In [67]:
def hit_rate(relevance_total):
    """Fraction of queries whose result list contains at least one hit.

    Args:
        relevance_total: One sequence of relevance flags per query, in ranked
            order; a truthy flag marks a relevant result.

    Returns:
        A float in [0, 1]. An empty `relevance_total` yields 0.0 instead of
        raising `ZeroDivisionError`.
    """
    n_queries = len(relevance_total)
    if n_queries == 0:
        return 0.0

    n_hits = sum(1 for line in relevance_total if any(line))
    return n_hits / n_queries

def mrr(relevance_total):
    """Mean reciprocal rank of the first relevant result per query.

    Args:
        relevance_total: One sequence of relevance flags per query, in ranked
            order; a truthy flag marks a relevant result.

    Returns:
        The average over queries of `1 / rank` of the first hit (rank starts
        at 1); a query with no hit contributes 0. An empty `relevance_total`
        yields 0.0 instead of raising `ZeroDivisionError`.
    """
    n_queries = len(relevance_total)
    if n_queries == 0:
        return 0.0

    total_score = 0.0
    for line in relevance_total:
        for rank, is_relevant in enumerate(line, start=1):
            if is_relevant:
                total_score += 1 / rank
                # Only the first hit counts; without this, a line with several
                # hits would add more than 1 and push the mean above 1.
                break

    return total_score / n_queries

In [71]:
def evaluate(ground_truth, search_function):
    """Score `search_function` against the ground-truth records.

    For every record, `search_function(record)` is called and each returned
    result is flagged by whether its `id` equals the record's `id`. The flag
    lists are then summarised with `hit_rate` and `mrr`.

    Returns:
        A dict with the keys `'hit_rate'` and `'mrr'`.
    """
    relevance_total = []

    for record in tqdm(ground_truth):
        expected_id = record['id']
        retrieved = search_function(record)
        relevance_total.append([hit['id'] == expected_id for hit in retrieved])

    metrics = {}
    metrics['hit_rate'] = hit_rate(relevance_total)
    metrics['mrr'] = mrr(relevance_total)
    return metrics

In [76]:
# Baseline: default search (no boosts) scored on the full ground-truth set.
evaluate(ground_truth, lambda q: search(q['question']))

100%|██████████| 1035/1035 [00:01<00:00, 599.49it/s]


{'hit_rate': 0.9352657004830918, 'mrr': 0.8134479717813055}

### 6) Finding the best parameters

In [77]:
# First 100 ground-truth rows are the validation split; the rest is the test split.
# NOTE(review): the split is positional (no shuffling), and df_test is not
# referenced by any other cell visible in this notebook.
df_validation = ground_truth_df[:100]
df_test = ground_truth_df[100:]

In [78]:
def simple_optimize(param_ranges, objective_function, n_iterations=10, seed=None):
    """Random search that maximizes `objective_function`.

    Args:
        param_ranges: Mapping of parameter name to a `(min_val, max_val)` pair.
            When both bounds are `int`, an integer is drawn with `randint`
            (both ends inclusive); otherwise a float is drawn with `uniform`.
        objective_function: Callable taking a dict of sampled parameters and
            returning a score; higher is better.
        n_iterations: Number of random parameter sets to try.
        seed: Optional seed. When given, sampling uses a private
            `random.Random(seed)` so the search is repeatable; when `None`,
            the module-level `random` functions are used, as before.

    Returns:
        `(best_params, best_score)`. With `n_iterations=0` nothing is sampled
        and the result is `(None, float('-inf'))`.
    """
    rng = random if seed is None else random.Random(seed)

    best_params = None
    best_score = float('-inf')  # maximizing, so start below any real score

    for _ in range(n_iterations):
        # Draw one random value per parameter.
        current_params = {}
        for param, (min_val, max_val) in param_ranges.items():
            if isinstance(min_val, int) and isinstance(max_val, int):
                current_params[param] = rng.randint(min_val, max_val)
            else:
                current_params[param] = rng.uniform(min_val, max_val)

        current_score = objective_function(current_params)

        # Strict '>' keeps the earliest parameter set when scores tie.
        if current_score > best_score:
            best_score = current_score
            best_params = current_params

    return best_params, best_score

In [79]:
# Validation split as a list of per-row dicts; `objective` scores against these.
gt_val = df_validation.to_dict(orient='records')

In [80]:
# Search space for the boost weights: every text field gets a float range of
# 0.0 to 3.0 (float bounds, so `simple_optimize` samples with `uniform`).
param_ranges = {
    'exercise_name': (0.0, 3.0),
    'type_of_activity': (0.0, 3.0),
    'type_of_equipment': (0.0, 3.0),
    'body_part': (0.0, 3.0),
    'type': (0.0, 3.0),
    'muscle_groups_activated': (0.0, 3.0),
    'instructions': (0.0, 3.0),
}

def objective(boost_params):
    """Score one set of boost weights: MRR of boosted search on `gt_val`."""
    metrics = evaluate(
        gt_val,
        lambda q: search(q['question'], boost_params),
    )
    return metrics['mrr']

In [84]:
# Try 20 random boost configurations; the output is (best_params, best_score).
simple_optimize(param_ranges, objective, n_iterations=20)

100%|██████████| 100/100 [00:00<00:00, 564.94it/s]
100%|██████████| 100/100 [00:00<00:00, 587.39it/s]
100%|██████████| 100/100 [00:00<00:00, 587.36it/s]
100%|██████████| 100/100 [00:00<00:00, 592.65it/s]
100%|██████████| 100/100 [00:00<00:00, 593.73it/s]
100%|██████████| 100/100 [00:00<00:00, 585.01it/s]
100%|██████████| 100/100 [00:00<00:00, 600.38it/s]
100%|██████████| 100/100 [00:00<00:00, 604.12it/s]
100%|██████████| 100/100 [00:00<00:00, 601.69it/s]
100%|██████████| 100/100 [00:00<00:00, 590.97it/s]
100%|██████████| 100/100 [00:00<00:00, 599.29it/s]
100%|██████████| 100/100 [00:00<00:00, 603.35it/s]
100%|██████████| 100/100 [00:00<00:00, 598.73it/s]
100%|██████████| 100/100 [00:00<00:00, 586.00it/s]
100%|██████████| 100/100 [00:00<00:00, 606.07it/s]
100%|██████████| 100/100 [00:00<00:00, 603.11it/s]
100%|██████████| 100/100 [00:00<00:00, 583.15it/s]
100%|██████████| 100/100 [00:00<00:00, 596.20it/s]
100%|██████████| 100/100 [00:00<00:00, 599.34it/s]
100%|██████████| 100/100 [00:00

({'exercise_name': 2.9620514273910423,
  'type_of_activity': 1.4480659199628136,
  'type_of_equipment': 0.30925442034563844,
  'body_part': 1.5572417586561653,
  'type': 0.09597121215688575,
  'muscle_groups_activated': 2.566743529930406,
  'instructions': 0.720527974160517},
 0.8705833333333333)

In [85]:
# Boost weights hard-coded for reuse in the boosted evaluation.
# NOTE(review): the search is unseeded, so a re-run will generally produce
# different values than the ones pinned here.
best_boost = {'exercise_name': 2.9620514273910423,
  'type_of_activity': 1.4480659199628136,
  'type_of_equipment': 0.30925442034563844,
  'body_part': 1.5572417586561653,
  'type': 0.09597121215688575,
  'muscle_groups_activated': 2.566743529930406,
  'instructions': 0.720527974160517}

In [86]:
# Boosted search scored on the full ground-truth set.
# NOTE(review): `ground_truth` includes the first 100 rows that were used to
# tune `best_boost`, so this score is not a held-out estimate; `df_test`
# holds the rows that were not used for tuning.
evaluate(ground_truth, lambda q: search(q['question'],best_boost))

100%|██████████| 1035/1035 [00:01<00:00, 598.13it/s]


{'hit_rate': 0.9381642512077295, 'mrr': 0.8971612606395217}

### 7) RAG Evaluation

In [97]:
# Prompt for the judge model: it receives a question and the generated answer
# and is asked for a JSON object with "Relevance" and "Explanation" keys.
# The doubled braces are literal braces that survive `str.format`.
# NOTE(review): the variable name misspells "prompt"; it is kept because the
# evaluation loop refers to it by this name.
rag_eval_promt_template = """
You are an expert evaluator for a RAG system.
Your task is to analyze the relevance of the generated answer to the given question.
Based on the relevance of the generated answer, you will classify it
as "NON_RELEVANT", "PARTLY_RELEVANT", or "RELEVANT".

Here is the data for evaluation:

Question: {question}
Generated Answer: {answer_llm}

Please analyze the content and context of the generated answer in relation to the question
and provide your evaluation in parsable JSON without using code blocks:

{{
  "Relevance": "NON_RELEVANT" | "PARTLY_RELEVANT" | "RELEVANT",
  "Explanation": "[Provide a brief explanation for your evaluation]"
}}
""".strip()

In [92]:
# Draw 200 ground-truth rows for the answer evaluation; the fixed
# random_state keeps the sample the same across runs.
data_sample = ground_truth_df.sample(n=200,random_state=1)

In [93]:
# Sampled rows as a list of per-row dicts for the evaluation loop.
sample = data_sample.to_dict(orient='records')

In [94]:
# Display one sampled record to show its keys.
sample[0]

{'id': 171,
 'question': 'What equipment do I need to perform the Banded Pull-Up?'}

In [98]:
# Collects one (record, answer, evaluation) tuple per evaluated question.
evaluations = []

In [99]:

# Start from an empty list so that re-running this cell does not append a
# second copy of every result.
evaluations = []

for doc in tqdm(sample):
    question = doc['question']
    answer_llm = rag(question)

    prompt = rag_eval_promt_template.format(
        question=question,
        answer_llm=answer_llm
    )

    raw_evaluation = llm(prompt,'gpt-4o-mini')
    try:
        evaluation = json.loads(raw_evaluation)
    except (json.JSONDecodeError, TypeError):
        # The judge's reply is free text and may not be valid JSON (or may be
        # missing). Keep the raw reply under the same keys the later cells
        # read, rather than aborting the whole run on one bad reply.
        evaluation = {'Relevance': 'UNPARSABLE', 'Explanation': raw_evaluation}

    evaluations.append((doc,answer_llm,evaluation))

100%|██████████| 200/200 [08:10<00:00,  2.45s/it]


In [100]:
# One row per evaluated question: the source record, the generated answer, and the judge's verdict dict.
df_eval = pd.DataFrame(evaluations, columns=['record', 'answer', 'evaluation'])

In [101]:
# Display the raw evaluation frame.
df_eval 

Unnamed: 0,record,answer,evaluation
0,"{'id': 171, 'question': 'What equipment do I n...","To perform the Banded Pull-Up, you need a Resi...","{'Relevance': 'RELEVANT', 'Explanation': 'The ..."
1,"{'id': 115, 'question': 'Do I need any equipme...","No, you do not need any equipment to perform J...","{'Relevance': 'RELEVANT', 'Explanation': 'The ..."
2,"{'id': 53, 'question': 'What specific body par...",The Dumbbell Lateral Raise primarily works on ...,"{'Relevance': 'RELEVANT', 'Explanation': 'The ..."
3,"{'id': 198, 'question': 'What should I do afte...",After extending the handle overhead in the Cab...,"{'Relevance': 'RELEVANT', 'Explanation': 'The ..."
4,"{'id': 19, 'question': 'How deep should my squ...","When doing Goblet Squats, you should squat dow...","{'Relevance': 'RELEVANT', 'Explanation': 'The ..."
...,...,...,...
195,"{'id': 83, 'question': 'What muscles does the ...",The Pendlay Row primarily targets the Latissim...,"{'Relevance': 'PARTLY_RELEVANT', 'Explanation'..."
196,"{'id': 94, 'question': 'What type of exercise ...",TRX Push-Ups are classified as a type of Stren...,"{'Relevance': 'RELEVANT', 'Explanation': 'The ..."
197,"{'id': 123, 'question': 'How do you return to ...",To return to the starting position after compl...,"{'Relevance': 'RELEVANT', 'Explanation': 'The ..."
198,"{'id': 76, 'question': 'Do I need any equipmen...","No, you do not need any equipment to perform F...","{'Relevance': 'RELEVANT', 'Explanation': 'The ..."


In [102]:
# Rebuild the frame from the collected tuples, then flatten the two dict
# columns into plain columns.
df_eval = pd.DataFrame(evaluations, columns=['record', 'answer', 'evaluation'])

# Fields taken from the source record.
for column in ('id', 'question'):
    df_eval[column] = df_eval.record.apply(lambda d, key=column: d[key])

# Fields taken from the judge's verdict (target column -> key in the verdict dict).
verdict_fields = {'relevance': 'Relevance', 'explanation': 'Explanation'}
for column, verdict_key in verdict_fields.items():
    df_eval[column] = df_eval.evaluation.apply(lambda d, key=verdict_key: d[key])

# The dict columns are no longer needed once flattened.
df_eval = df_eval.drop(columns=['record', 'evaluation'])

In [103]:
# Share of each relevance label among the evaluated answers.
df_eval.relevance.value_counts(normalize=True)

relevance
RELEVANT           0.875
PARTLY_RELEVANT    0.110
NON_RELEVANT       0.015
Name: proportion, dtype: float64

In [104]:
# Save the flattened evaluation results to CSV, without the index column.
df_eval.to_csv('../data/rag-eval-gpt-4o-mini.csv', index=False)