In [None]:
from pydantic import BaseModel,Field
from typing import List
import os 
import pandas as pd
from dotenv import load_dotenv
load_dotenv()

GOOGLE_API_KEY =os.getenv('Gemini_key')

In [2]:
df =pd.read_csv('../Data/data.csv')

In [3]:
# drop id column 
df.drop('id',axis=1,inplace=True)

In [4]:
df.to_csv('../Data/cleaned_data.csv')

In [5]:
from langchain_community.document_loaders import CSVLoader

file_path =r'..\Data\cleaned_data.csv'
loader =CSVLoader(file_path)
docs =loader.load()
print(docs[0].page_content)

: 0
exercise_name: Push-Ups
type_of_activity: Strength
type_of_equipment: Bodyweight
body_part: Upper Body
type: Push
muscle_groups_activated: Pectorals, Triceps, Deltoids
instructions: Start in a high plank position with your hands under your shoulders. Lower your body until your chest nearly touches the floor. Push back up to the starting position.


In [3]:
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.prompts import ChatPromptTemplate
class Reslut(BaseModel):
    questions:List[str] =Field(...,max_items =5,title ="List of all generatied questions")

llm =ChatGoogleGenerativeAI(
    model ='models/gemini-2.0-flash-lite',
    temperature=0.5,
    google_api_key=GOOGLE_API_KEY 
)

structured_llm =llm.with_structured_output(Reslut)

prompt_template ='''
    you emulate a user of out fitness assistant application
    formulate 5 questions this user might ask based on a provided excersise
    the record should contain the asnwer of the questions
    the questions should be complate and not too short
    use a fewer words as possible form the record
    record:{record}
    you must follow the pydantic structure for your generated questions
    {{"questions": ["question1", "question2", ..., "question5"]}}
    genrate only questions not questions and answers
'''

prompt =ChatPromptTemplate.from_template(prompt_template)
questions_chain =(
    prompt
    |llm
)

In [4]:
q =questions_chain.invoke({
    "record":docs[0].page_content
}).content
print(q)

```json
{
  "questions": [
    "How many push-ups should I aim for if I'm a beginner?",
    "What modifications can I do to make push-ups easier?",
    "How far apart should my hands be placed during a push-up?",
    "Can push-ups be incorporated into a full-body workout?",
    "What are some common mistakes to avoid when doing push-ups?"
  ]
}
```


In [5]:
type(q)

str

In [6]:
from json_repair import repair_json
from pprint import pprint
pprint(repair_json(q))

('{"questions": ["How many push-ups should I aim for if I\'m a beginner?", '
 '"What modifications can I do to make push-ups easier?", "How far apart '
 'should my hands be placed during a push-up?", "Can push-ups be incorporated '
 'into a full-body workout?", "What are some common mistakes to avoid when '
 'doing push-ups?"]}')


In [7]:
import json
str_q =repair_json(q)
json_q =json.loads(str_q)
type(json_q)

dict

In [8]:
json_q

{'questions': ["How many push-ups should I aim for if I'm a beginner?",
  'What modifications can I do to make push-ups easier?',
  'How far apart should my hands be placed during a push-up?',
  'Can push-ups be incorporated into a full-body workout?',
  'What are some common mistakes to avoid when doing push-ups?']}

In [13]:
try_out=[]
for key,value in json_q.items():
    for i in value:
        try_out.append((0,i))

try_out

[(0, "How many push-ups should I aim for if I'm a beginner?"),
 (0, 'What modifications can I do to make push-ups easier?'),
 (0, 'How far apart should my hands be placed during a push-up?'),
 (0, 'Can push-ups be incorporated into a full-body workout?'),
 (0, 'What are some common mistakes to avoid when doing push-ups?')]

In [14]:
t =pd.DataFrame(try_out,columns=['id','question'])
t.head()

Unnamed: 0,id,question
0,0,How many push-ups should I aim for if I'm a be...
1,0,What modifications can I do to make push-ups e...
2,0,How far apart should my hands be placed during...
3,0,Can push-ups be incorporated into a full-body ...
4,0,What are some common mistakes to avoid when do...


In [None]:
import time
def generate_questions(docs):
    all_questions= []
    id =0
    for doc in docs:
        
        qs =questions_chain.invoke({
              "record":doc.page_content
            }).content
        
        # print(qs)
        str_qs =repair_json(qs)
        json_qs =json.loads(str_qs)

        for key,value in json_qs.items():
             for question in value:
                 all_questions.append((id,question))
        id +=1

        time.sleep(3)

    return all_questions

questions= generate_questions(docs)

In [15]:
questions[0]

'How many sets and reps are generally recommended for Push-Ups to build muscle?'

In [None]:
df_result =pd.DataFrame(questions ,columns=['id','question'])
df_result.to_csv('../Data/ground-truth-retrieval.csv')