# Baseline Naive Agent

In [134]:
import pandas as pd
import json
import os 
from datetime import datetime

from dotenv import load_dotenv
from openai import OpenAI
from pathlib import Path
from sklearn.model_selection import train_test_split
import numpy as np
from tqdm import tqdm
from pathlib import Path
import time

In [2]:
!pip show openai

Name: openai
Version: 2.14.0
Summary: The official Python library for the openai API
Home-page: https://github.com/openai/openai-python
Author: 
Author-email: OpenAI <support@openai.com>
License: Apache-2.0
Location: /opt/anaconda3/lib/python3.12/site-packages
Requires: anyio, distro, httpx, jiter, pydantic, sniffio, tqdm, typing-extensions
Required-by: 


In [3]:
!pip install --upgrade openai



In [65]:
# Loading the stratified features 
X_train = pd.read_parquet('data_output/prod_features/X_train.parquet')
y_train = pd.read_parquet('data_output/prod_features/y_train.parquet')
X_test = pd.read_parquet('data_output/prod_features/X_test.parquet')
y_test = pd.read_parquet('data_output/prod_features/y_test.parquet')

In [132]:
# Partitioning data into chunks of 100 records (to monitor cost efficiency)
output_dir = 'data_output/prod_features/json_test_files/'
partition_size = 100

X_test = X_test.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

df_to_chunk ={'X_test':X_test, 'y_test':y_test}

for name, file in df_to_chunk.items():

    for chunk_id, chunk_df in iter(file.groupby(np.arange(len(file)) // partition_size)):
        file_path = f'{output_dir}{name}_part_{chunk_id}.jsonl'
        chunk_df.to_json(file_path,orient='records', lines=True)

    print(f'Partioning Complete: {chunk_id} Partitions Created')


Partioning Complete: 87 Partitions Created
Partioning Complete: 87 Partitions Created


In [133]:
# Multiple Predictions using OpenAI 
class NaiveAgent:
    def __init__(self, system_prompt, model = 'gpt-5-nano' ):
        load_dotenv()
        
        api_key = os.getenv('OPENAI_API_KEY')
        if not api_key:
            raise ValueError('OpenAI API Key not found in .env')
        
        self.client = OpenAI(api_key=api_key)
        
        self.model = model
        self.system_prompt = system_prompt
        self.user_prompt = None
        self.base_data_path = Path('data_output/prod_features/json_test_files/')
        self.output_dir = Path('data_output/agentic_outputs')
        self.input_data = None
    
    def JSON_read(self, file_name, output_dir='data_output'):
        try:
            data = []
            self.file_path = self.base_data_path / file_name
            with self.file_path.open('r', encoding='utf-8') as f:
                for line in f:
                    data.append(json.loads(line.strip()))
            return data
        except Exception as e:
            print(f'Error Loading JSON: \n{e}')
            return None
    
    def call_agent(self):   
        str_user_prompt = str(self.user_prompt)
        str_sys_prompt = str(self.system_prompt)
        
        agent_response = self.client.responses.create(
            model = self.model,
            input = [
                {
                    "role":"system",
                    "content":str_sys_prompt
                },
                {
                    "role":"user",
                    "content":str_user_prompt
                }
            ],
            text = {
                "format":{
                    "type":"json_schema",
                    "name":"agent_output",
                    "strict":True,
                    "schema": {
                        "type":"object",
                        "properties":{
                                        "price":{"type":"number"}
                                    },
                                    "required":["price"],
                                    "additionalProperties": False
                                }
                            }
                        }
    )
        output = json.loads(agent_response.output_text)
        return output
    
    def batch_predict(self, file_name, output_file='preds.jsonl'):
        data = self.JSON_read(file_name)
        if not data:
            return
        results = []
        
        for i, record in enumerate(tqdm(data, desc="Processing Test Data Predictions")):
            self.user_prompt = f'Predict the price for this item:\n{json.dumps(record,indent=2)}'
            prediction = self.call_agent()
        
            if prediction:
                result = {
                    'predicted_price': prediction['price'],
                    'pred_id':i
                }
                results.append(result)
        
        # Saving results as JSONL
        
        output_dir = Path('data_output/agentic_preds/') 
        start = Path(file_name).stem #before the .jsonl
        end = Path(file_name).suffix
        output_file_name = f'{start}_preds{end}'
        output_path = output_dir / output_file_name
        
        with output_path.open('w', encoding='utf-8') as f:
            for result in results:
                f.write(json.dumps(result) + '\n')
        
        print(f'Saved {len(results)} predictions at {str(output_path)}')

In [135]:
start_time = time.time()

agent_naive = NaiveAgent(
    system_prompt=(
    'Predict the house price in GBP based on what you know and the information provided.'
    )
    )
chunk1_response = agent_naive.batch_predict(file_name='X_test_part_0.jsonl')

end_time = time.time()
duration = end_time - start_time
print(f'Predictions were completed in {duration}')

Processing Test Data Predictions: 100%|██████████| 100/100 [17:38<00:00, 10.58s/it]

Saved 100 predictions at data_output/agentic_preds/X_test_part_0_preds.jsonl
Predictions were completed in 1058.4592680931091





To Finish: Make OpenAI make predictions for each of the test datapoints. 

Try Batching API:
Send the test set tomorrow!

Try Caching your prompts.

## 2. Multi-Agent Models

In [None]:
agent_list = {
    "1.AnalystAgent":
        """
        You're a critically thinking Data Analyst that provides helpful supporting information by creating SQL scripts focusing on:
        1HE_district
        1HE_PROPERTY TYPE 
        1HE_BUILT_FORM 
        CORE_TOTAL_FLOOR_AREA 
        CORE_EXTENSION_COUNT 
        CORE_NUMBER_OF_HABITABLE_ROOMS
        
        Generate a SQL Script to query house prices dataset and generate an aggregated supporting data relating to the input JSON and the fields above. 
        
        Output Format:
        SELECT <Criteria>
        FROM <Table Names> 
        """,
    "2.PredictionAgent":
        """
        Using data obtained from the Analyst Agent, make a prediction for the house price in GBP for the JSON input. 
        Focus on and consider the following core features from the JSON input to make your prediction:
        1HE_district
        1HE_PROPERTY TYPE 
        1HE_BUILT_FORM 
        CORE_TOTAL_FLOOR_AREA 
        CORE_EXTENSION_COUNT 
        CORE_NUMBER_OF_HABITABLE_ROOMS
        """
}

In [None]:
class MultiAgent(NaiveAgent):
    
    def __init__(self, system_prompt, user_prompt, model='gpt-5-nano'):
        super().__init__(system_prompt, user_prompt, model) # Initialising the parent class. 
        self.memory = []
        
    def initialise_agents(self, agent_list, max_agents = 5):
        self.max_agents = max_agents
        self.agent_list = agent_list
        
        if len(agent_list) > self.max_agents: # Validation to prevent high costs
            raise ValueError(f"You have more than {self.max_agents} agents, please reduce n_agents or increase max_agents.")
        
        for agent_name, system_prompt in agent_list.items():
            self.agent_name = agent_name
            self.system_prompt = system_prompt
            self.startThinking() # Should be replaced with createSingleAgent()
                
    def createSingleAgent(self):
        pass 
    
    def startThinking(self,agent):
        # Use self.memory to input into the agent!
        print(f"Agent:{self.agent_name} \nSys_Prompt:\n{self.system_prompt}!")
        
        output = None
        self.memory.append(output)
    
    def main(self):
        startThinking(agent="1.AnalystAgent")
        startThinking(agent="2.PredictionAgent")
        

In [117]:
complex_agent = MultiAgent(system_prompt='Say Hello', user_prompt='Say Hello')

In [118]:
complex_agent.create_agents(agent_list)

Agent:Analyst 
Sys_Prompt:

        You're a critically thinking Data Analyst that provides helpful supporting information by creating SQL scripts focusing on:
        1HE_district
        1HE_PROPERTY TYPE 
        1HE_BUILT_FORM 
        CORE_TOTAL_FLOOR_AREA 
        CORE_EXTENSION_COUNT 
        CORE_NUMBER_OF_HABITABLE_ROOMS
        
        Generate a SQL Script to query house prices dataset and generate an aggregated supporting data relating to the input JSON and the fields above. 
        
        Output Format:
        SELECT <Criteria>
        FROM <Table Names> 
        !
Agent:Agent B 
Sys_Prompt:

        Using data obtained from the Analyst Agent, make a prediction for the house price in GBP for the JSON input. 
        Focus on and consider the following core features from the JSON input to make your prediction:
        1HE_district
        1HE_PROPERTY TYPE 
        1HE_BUILT_FORM 
        CORE_TOTAL_FLOOR_AREA 
        CORE_EXTENSION_COUNT 
        CORE_NUMBER_OF_HABIT

Version 1:
- 2-agent mode with a critique given average/summary stats. 
    - Tool calling perhaps?
- n-agent with querying the dataset using SQL? 
    - Must either use pandassql or set up a SQL server. The former might be a quick win. 
    - Create an MasterAgent() inheriting from NaiveAgent(). When initialised, this will create n-naive_agents but the prompt will be the key different. 
    - NaiveAgent class may need to be refactored to be generalisable.