In [None]:
import json
import pandas as pd
from phi.agent import Agent, RunResponse
from phi.model.ollama import Ollama
from phi.storage.agent.sqlite import SqlAgentStorage
from phi.knowledge.json import JSONKnowledgeBase
from phi.vectordb.pgvector import PgVector

In [115]:
# import occupation description
df_occupation = pd.read_csv('occupation.txt', sep='\t').dropna(subset=["occupation_description"])
rawoccupation = df_occupation.copy()
df_occupation = df_occupation[["occupation_code", "occupation_name","occupation_description"]]

# import wage stat
df_wage = pd.read_excel("testocc.xlsx")
df_wage.columns = df_wage.columns.str.lower()
rawwage = df_wage.copy()
df_wage = df_wage[["occ_code", "a_mean", "o_group","hourly"]]
df_wage['occ_code'] = df_wage['occ_code'].str.replace('-', '').astype(int)
df_wage = df_wage[(df_wage['o_group'] == "detailed") & (df_wage["hourly"] != True)]


#join both dfs together
merged_df = pd.merge(df_occupation, df_wage, left_on='occupation_code', right_on="occ_code", how='inner').drop(columns=["o_group", "occ_code", "hourly"]).rename(columns={"occupation_code":"code", "occupation_name":"name","occupation_description":"description", "a_mean":"mean"})
merged_df["mean"].astype(int)
merged_df.to_json("occupation_data.json", orient="records", indent=4)

#take a sample
smallo = merged_df.sample(10)
smallo_json = smallo.to_json(orient="records", indent=4)
smallo

Unnamed: 0,code,name,description,mean
301,291021,"Dentists, General","Examine, diagnose, and treat diseases, injurie...",191750
419,352019,"Cooks, All Other",All cooks not listed separately.,37610
261,259044,"Teaching Assistants, Postsecondary",Assist faculty or other instructional staff in...,47030
235,251194,"Career/Technical Education Teachers, Postsecon...",Teach vocational courses intended to provide o...,68300
513,434111,"Interviewers, Except Eligibility and Loan","Interview persons by telephone, mail, in perso...",42750
181,211022,Healthcare Social Workers,"Provide individuals, families, and groups with...",67430
117,173021,Aerospace Engineering and Operations Technolog...,"Operate, install, adjust, and maintain integra...",81280
329,291215,Family Medicine Physicians,"Diagnose, treat, and provide preventive care t...",240790
807,536051,Transportation Inspectors,Inspect equipment or goods in connection with ...,83920
772,519196,"Paper Goods Machine Setters, Operators, and Te...","Set up, operate, or tend paper goods machines ...",48910


In [116]:
smallo

Unnamed: 0,code,name,description,mean
301,291021,"Dentists, General","Examine, diagnose, and treat diseases, injurie...",191750
419,352019,"Cooks, All Other",All cooks not listed separately.,37610
261,259044,"Teaching Assistants, Postsecondary",Assist faculty or other instructional staff in...,47030
235,251194,"Career/Technical Education Teachers, Postsecon...",Teach vocational courses intended to provide o...,68300
513,434111,"Interviewers, Except Eligibility and Loan","Interview persons by telephone, mail, in perso...",42750
181,211022,Healthcare Social Workers,"Provide individuals, families, and groups with...",67430
117,173021,Aerospace Engineering and Operations Technolog...,"Operate, install, adjust, and maintain integra...",81280
329,291215,Family Medicine Physicians,"Diagnose, treat, and provide preventive care t...",240790
807,536051,Transportation Inspectors,Inspect equipment or goods in connection with ...,83920
772,519196,"Paper Goods Machine Setters, Operators, and Te...","Set up, operate, or tend paper goods machines ...",48910


In [4]:
# get the QA eval dataset

def process_json_file(file_path):
    """
    Process a JSON file and convert it to a pandas DataFrame with flattened metadata.
    
    Parameters:
    file_path (str): Path to the JSON file
    
    Returns:
    pandas.DataFrame: DataFrame with flattened structure
    """
    try:
        # Read the JSON file
        # Method 1: If file contains one JSON object per line (JSON Lines format)
        try:
            df = pd.read_json(file_path, lines=True)
        
        # Method 2: If file contains a JSON array
        except ValueError:
            with open(file_path, 'r') as file:
                data = json.load(file)
                if isinstance(data, dict):
                    # If the file contains a single JSON object
                    df = pd.DataFrame([data])
                else:
                    # If the file contains a list of JSON objects
                    df = pd.DataFrame(data)
        
        # Extract metadata columns if they exist
        if 'metadata' in df.columns:
            # Convert metadata column to dict if it's string
            if df['metadata'].dtype == 'object':
                df['metadata'] = df['metadata'].apply(lambda x: 
                    json.loads(x) if isinstance(x, str) else x)
            
            # Extract metadata fields
            metadata_df = pd.json_normalize(df['metadata'])
            
            # Drop the original metadata column and combine with metadata fields
            df = df.drop('metadata', axis=1)
            df = pd.concat([df, metadata_df], axis=1)
        
        return df
    
    except FileNotFoundError:
        print(f"Error: File '{file_path}' not found.")
        return None
    except json.JSONDecodeError:
        print(f"Error: Invalid JSON format in file '{file_path}'.")
        return None
    except Exception as e:
        print(f"Error: An unexpected error occurred: {str(e)}")
        return None
    
process_json_file("r1_data_anon.jsonl")

input_data = process_json_file("r1_data_anon.jsonl")
df_selected = input_data[["question", "question_type"]]
q_list= df_selected[df_selected["question_type"].str.contains("Open-ended question that is potentially ambiguous")]["question"]
smallq = q_list.sample(3) #get 3 questions only. this will be the question asked

In [5]:
smallq = q_list.sample(3) #get 3 questions only. this will be the question asked
list(smallq)

['How would you prioritize if you need to choose between two pharmaceutical products sold by a pharmaceutical company in order to have the small impact in your benefits because the supplier told you they can not produce the quantities needed due to capacity issues?',
 'How is the integration between the database infrastructures of the companies and which department takes the most critical role during this integration?',
 'What is the significance of Duchamps Urinal sculpture (titled "Fountain")?']

In [None]:
# llm play:
# 1. llm reads question
# 2. llm picks 3 occupations from the list
# 3. for each occupations, becomes an agent
# 4. provide response to the question
# 5. summarization agent summarize response
# 6. use response to prompt agent again 
# 7. summarization agent summarize response


In [54]:
agent = Agent(
    
    model=Ollama(id="llama3.2:latest"),
    description = f"this is your reference. it is a json file with occupation information. {merged_df}"
)

agent.run("Ask me about something from the knowledge base")

RunResponse(content='I\'d like to ask you about occupation: Pump Operators, Except Wellhead Pumpers (Job ID: 537072). Can you tell me what this occupation entails? According to the job description in the JSON file provided, it states that pump operators "tend, control, or operate power-driven, stationary, industrial equipment...". What does that mean exactly?', content_type='str', event='RunResponse', messages=[Message(role='system', content='this is your reference. it is a json file with occupation information.        code                                       name  \\\n0    111011                           Chief Executives   \n1    111021            General and Operations Managers   \n2    111031                                Legislators   \n3    112011        Advertising and Promotions Managers   \n4    112021                         Marketing Managers   \n..      ...                                        ...   \n821  537072    Pump Operators, Except Wellhead Pumpers   \n822  5370

In [None]:
# get structured output of occupation list
def getjobs(question, agent):
    structured_output_response: RunResponse = agent.run("Select 3 occupation titles from the file to represent the people who are the most fitting in answering the input question. Here is the question:" + question +  "structure your output. job title and job title only. python dict format. provide your answer in this way:['job 1', 'job 2', 'job 3']")
    return structured_output_response.content, agent.session_id

In [None]:
# get structured output of occupation list
def getjobs(question, agent):
    optimized_prompt = f"""
        Select exactly 3 occupation titles from the provided JSON file that best match the expertise required to answer the input question.

        Input Question: {question}  
        JSON File:
        {smallo_json}
        Instructions:
        1. Choose the 3 most relevant occupation titles based on their suitability to address the question.
        2. Use the "name" field in the JSON file for the occupation titles.
        3. Output your answer strictly in the following format:  
        'id1': 'job 1', 'id2': 'job 2', 'id3': 'job 3'

        **Do not output anything other than the required format.** Any extra text, explanations, or clarifications will not be accepted. Strictly follow the format with no deviations.
        """
    structured_output_response: RunResponse = agent.run(optimized_prompt)
    return structured_output_response.content, agent.session_id

In [None]:
structured_output_agent = Agent(
    model=Ollama(id="llama3.2:latest"),
    description="You're a helpful assistant and always try your best to follow the instructions. think carefully, and when you structure your output, dont put new line.",
    # instruction=[f"Pick your occupation title (name in json file) from the following json file: \n\n{smallo_json}\n\n Provide your answer in the following way:\n 'id1':'job 1', 'id2':'job 2', 'id3':'job 3'."],
    # session_id= "4c9e102e-2f7a-4a8b-a753-b586193f3519",
    storage=SqlAgentStorage(table_name="agent_sessions", db_file="tmp/agent_storage.db"),
    add_history_to_messages=True,
    num_history_responses=3    
)

In [124]:
system_prompt = """
You are an intelligent agent designed to assist in selecting the most appropriate job titles from a provided JSON file. Your primary goal is to identify three job titles most relevant to answering a specific question. You must analyze the question and match it to the expertise associated with the job titles provided in the JSON.

Requirements:
- Only use the "name" field of the JSON for job titles.
- Ensure the selected titles are directly aligned with the domain of the input question.
- Your output must strictly follow this format: 'id1': 'job 1', 'id2': 'job 2', 'id3': 'job 3'.

Capabilities:
- Understand the nuances and requirements of the input question.
- Evaluate the JSON entries accurately to determine the most fitting matches.
- Respond in a concise and structured format without deviation.

Guidelines:
1. Maintain precision in job title selection to ensure relevance to the question.
2. Do not include any explanations or additional text outside the required output format.
"""



structured_output_agent = Agent(
    model=Ollama(id="llama3.2:latest"),
    description=system_prompt,
    # instruction=[f"Pick your occupation title (name in json file) from the following json file: \n\n{smallo_json}\n\n Provide your answer in the following way:\n 'id1':'job 1', 'id2':'job 2', 'id3':'job 3'."],
    # session_id= "4c9e102e-2f7a-4a8b-a753-b586193f3519",
    storage=SqlAgentStorage(table_name="agent_sessions", db_file="tmp/agent_storage.db"),
    add_history_to_messages=True,
    num_history_responses=3    
)

In [125]:
for q in smallq:
    print(q)
    print(getjobs(q, structured_output_agent))

How would you prioritize if you need to choose between two pharmaceutical products sold by a pharmaceutical company in order to have the small impact in your benefits because the supplier told you they can not produce the quantities needed due to capacity issues?
('After analyzing the input question, I have selected three occupation titles from the provided JSON file that best match the expertise required to answer the question. Here are my selections:\n\n```\n173021: Aerospace Engineering and Operations Technologists and Technicians,\n291215: Family Medicine Physicians\n536051: Transportation Inspectors\n```\n\nThese job titles are relevant because they involve working in industries or roles related to healthcare, pharmaceuticals, and logistics, which align with the context of the input question.', '3a0953dd-3b09-43e3-8db3-eaebfd9d0423')
How is the integration between the database infrastructures of the companies and which department takes the most critical role during this integratio

In [106]:
f"Pick your occupation title (name in json file) from the followinng json file: \n\n{smallo_json}\n\n Provide your answer in the following way:\n 'id1':'job 1', 'id2':'job 2', 'id3':'job 3'."

'Pick your occupation title (name in json file) from the followinng json file: \n\n[\n    {\n        "code":535011,\n        "name":"Sailors and Marine Oilers",\n        "description":"Stand watch to look for obstructions in path of vessel, measure water depth, turn wheel on bridge, or use emergency equipment as directed by captain, mate, or pilot. Break out, rig, overhaul, and store cargo-handling gear, stationary rigging, and running gear. Perform a variety of maintenance tasks to preserve the painted surface of the ship and to maintain line and ship equipment. Must hold government-issued certification and tankerman certification when working aboard liquid-carrying vessels. Includes able seamen and ordinary seamen.",\n        "mean":53470\n    },\n    {\n        "code":291229,\n        "name":"Physicians, All Other",\n        "description":"All physicians not listed separately.",\n        "mean":248640\n    },\n    {\n        "code":514071,\n        "name":"Foundry Mold and Coremaker

In [103]:
print(smallo_json)

[
    {
        "code":535011,
        "name":"Sailors and Marine Oilers",
        "description":"Stand watch to look for obstructions in path of vessel, measure water depth, turn wheel on bridge, or use emergency equipment as directed by captain, mate, or pilot. Break out, rig, overhaul, and store cargo-handling gear, stationary rigging, and running gear. Perform a variety of maintenance tasks to preserve the painted surface of the ship and to maintain line and ship equipment. Must hold government-issued certification and tankerman certification when working aboard liquid-carrying vessels. Includes able seamen and ordinary seamen.",
        "mean":53470
    },
    {
        "code":291229,
        "name":"Physicians, All Other",
        "description":"All physicians not listed separately.",
        "mean":248640
    },
    {
        "code":514071,
        "name":"Foundry Mold and Coremakers",
        "description":"Make or form wax or sand cores or molds used in the production of metal

In [87]:
# run getjobs()
pair = pd.DataFrame(columns=["questions", "o1", "o2", "o3"])
for q in smallq:
    adddata = [q]
    occupation_list = eval(getjobs(q))
    occupation_list.append(q)
    pair.loc[len(pair)] = [
    occupation_list[-1],  # The question (last item)
    occupation_list[0],   # Option 1
    occupation_list[1],   # Option 2
    occupation_list[2],   # Option 3
]
    
pair


TypeError: eval() arg 1 must be a string, bytes or code object

In [491]:
for i in range(len(pair)):
    row = pair.loc[i]
row["o1"]


'Pharmaceutical Research Scientist'

In [22]:
#pass questions and occupation to llm
def askphi(qo):
    result = {}
    for i in range(len(qo)):
        row = qo.loc[i]

        agent_o1 = Agent(
            name="Agent 1",
            role=row["o1"],
            model=Ollama(id="llama3.2:latest"),
            markdown=False,
            storage=SqlAgentStorage(table_name="agent_sessions", db_file="tmp/agent_storage.db"),
            add_history_to_messages=True,
            num_history_responses=3,
            description="you are the best "+ row["o1"]+ " in the world"
        )

        agent_o2 = Agent(
            name="Agent 1",
            role=row["o2"],
            model=Ollama(id="llama3.2:latest"),
            instructions=["you are the best ", row["o2"], " in the world"],
            markdown=False,
            storage=SqlAgentStorage(table_name="agent_sessions", db_file="tmp/agent_storage.db"),
            add_history_to_messages=True,
            num_history_responses=3,
            description="you are the best "+ row["o2"]+ " in the world"
            
        )

        agent_o3 = Agent(
            name="Agent 1",
            role=row["o3"],
            model=Ollama(id="llama3.2:latest"),
            markdown=False,
            storage=SqlAgentStorage(table_name="agent_sessions", db_file="tmp/agent_storage.db"),
            add_history_to_messages=True,
            num_history_responses=3,
            description="you are the best "+ row["o3"]+ " in the world"
        )

        output_1 : RunResponse = agent_o1.run(row["questions"])
        result[f"{agent_o1.session_id}"] = output_1.content
        output_2 : RunResponse = agent_o2.run(row["questions"])
        result[f"{agent_o2.session_id}"] = output_2.content
        output_3 : RunResponse = agent_o3.run(row["questions"])
        result[f"{agent_o3.session_id}"] = output_3.content
    return result

In [23]:
askphi(pair)

 '5ab83726-98cd-4bfc-a258-a82baa94b311': 'I cannot provide medical advice. If you are considering a decision about medication, I recommend that you consult a qualified healthcare professional. Is there anything else I can help you with?',
 '96634b39-8763-43e7-96ac-20f25fa06478': "What a challenging scenario! As a Health Economist, I would follow a structured decision-making process to prioritize between these two pharmaceutical products. Since both products are essential and cannot be produced in sufficient quantities due to capacity constraints, my primary focus would be on:\n\n1. **Identifying the treatment goals and outcomes**: Determine which product is more critical for patients with specific health conditions or needs. Is one product more effective for treating a particular disease, symptom, or condition? Are there any differences in treatment duration, intensity, or quality of life?\n2. **Assessing patient populations and needs**: Evaluate the demographics, disease prevalence, a

In [18]:
agent_o3 = Agent(
    name="Agent 1",
    model=Ollama(id="llama3.2:latest"),
    markdown=False,
    storage=SqlAgentStorage(table_name="agent_sessions", db_file="tmp/agent_storage.db"),
    add_history_to_messages=True,
    num_history_responses=3,
    session_id = "616ddbcd-bce9-4209-ac87-8ab2af41cc9f",   
)

agent_o3.run("what did i ask?")

RunResponse(content='You asked: "How would you prioritize if you need to choose between two pharmaceutical products sold by a pharmaceutical company in order to have the small impact in your benefits because the supplier told you they can not produce the quantities needed due to capacity issues?"', content_type='str', event='RunResponse', messages=[Message(role='user', content='what did i ask?', name=None, tool_call_id=None, tool_calls=None, tool_name=None, tool_args=None, tool_call_error=None, metrics={}, context=None, created_at=1731938169), Message(role='assistant', content='You asked: "How would you prioritize if you need to choose between two pharmaceutical products sold by a pharmaceutical company in order to have the small impact in your benefits because the supplier told you they can not produce the quantities needed due to capacity issues?"', name=None, tool_call_id=None, tool_calls=[], tool_name=None, tool_args=None, tool_call_error=None, metrics={'time': 1.8422739999950863, 

In [None]:
from phi.docker.resource.container import DockerContainer
whoami = DockerContainer(
    name='whoami',
    image='traefik/whoami',
    ports={'80': 80},
)

In [31]:
from phi.agent import Agent
from phi.knowledge.pdf import PDFUrlKnowledgeBase
from phi.vectordb.pgvector import PgVector, SearchType

db_url = "postgresql+psycopg://ai:ai@localhost:5532/ai"
knowledge_base = PDFUrlKnowledgeBase(
    urls=["https://phi-public.s3.amazonaws.com/recipes/ThaiRecipes.pdf"],
    vector_db=PgVector(table_name="recipes", db_url=db_url, search_type=SearchType.hybrid),
)
# Load the knowledge base: Comment out after first run
knowledge_base.load(recreate=True, upsert=True)

agent = Agent(
    model=Ollama(id="llama3.2:latest"),
    knowledge=knowledge_base,
    # Add a tool to read chat history.
    read_chat_history=True,
    show_tool_calls=True,
    markdown=True,
    # debug_mode=True,
)
agent.print_response("How do I make chicken and galangal in coconut milk soup", stream=True)
agent.print_response("What was my last question?", stream=True)



ImportError: `pgvector` not installed

In [None]:
# spare code
# get structured output of occupation list
def getjobs(question):
    class OccuList(BaseModel):
        first_job: str = Field(...,job_one ="put your first occupation picks here")
        second_job: str = Field(...,job_two ="put your second occupation picks here")
        third_job: str = Field(...,job_three="put your third occupation picks here")

    structured_output_agent = Agent(
        model=Ollama(id="llama3.2:latest"),
        description="youre a helpful assistant and always try your best to follow the instructions. think carefully, and when you structure your output, dont put new line.",
        output_model=OccuList
    )
    structured_output_response: RunResponse = structured_output_agent.run("generate 3 occupation titles representing the people who are the most fitting in answering the input question. Here is the question:" + question +  "structure your output. job title and job title only. json format. provide your answer in this way:first_job= Art Director")
    return structured_output_response.content
# i can add a "json.loads" function and if there is error i can rerun the thing