# Baseline Naive Agent

In [14]:
import pandas as pd
import json
import os 
from datetime import datetime

from dotenv import load_dotenv
from openai import OpenAI
from pathlib import Path
from sklearn.model_selection import train_test_split
import numpy as np

In [2]:
!pip show openai

Name: openai
Version: 2.14.0
Summary: The official Python library for the openai API
Home-page: https://github.com/openai/openai-python
Author: 
Author-email: OpenAI <support@openai.com>
License: Apache-2.0
Location: /opt/anaconda3/lib/python3.12/site-packages
Requires: anyio, distro, httpx, jiter, pydantic, sniffio, tqdm, typing-extensions
Required-by: 


In [3]:
!pip install --upgrade openai



In [4]:
# Loading the stratified features 
X_train = pd.read_parquet('data_output/prod_features/X_train.parquet')
y_train = pd.read_parquet('data_output/prod_features/y_train.parquet')
X_test = pd.read_parquet('data_output/prod_features/X_test.parquet')
y_test = pd.read_parquet('data_output/prod_features/y_test.parquet')

In [6]:
X_train.head()

Unnamed: 0,1HE_district,1HE_CURRENT_ENERGY_RATING,1HE_POTENTIAL_ENERGY_RATING,1HE_PROPERTY_TYPE,1HE_BUILT_FORM,1HE_ENERGY_TARIFF,1HE_MAINS_GAS_FLAG,1HE_GLAZED_AREA,1HE_HOT_WATER_ENERGY_EFF,1HE_HOT_WATER_ENV_EFF,...,CORE_HEATING_COST_POTENTIAL,CORE_HOT_WATER_COST_CURRENT,CORE_HOT_WATER_COST_POTENTIAL,CORE_TOTAL_FLOOR_AREA,CORE_MULTI_GLAZE_PROPORTION,CORE_EXTENSION_COUNT,CORE_NUMBER_HABITABLE_ROOMS,CORE_NUMBER_HEATED_ROOMS,CORE_LOW_ENERGY_LIGHTING,CORE_NUMBER_OPEN_FIREPLACES
3090,BUCKINGHAMSHIRE,D,C,House,End-Terrace,Unknown,Not Available,Normal,Good,Good,...,327.0,118.0,103.0,88.24,100.0,0.0,5.0,5.0,0.0,0.0
28196,BUCKINGHAMSHIRE,C,C,House,Mid-Terrace,Single,Y,Normal,Good,Good,...,282.0,85.0,74.0,48.794,100.0,0.0,3.0,3.0,0.0,0.0
16362,SLOUGH,E,C,House,Semi-Detached,Single,Y,Normal,Good,Good,...,511.0,85.0,54.0,73.0,13.0,2.0,5.0,5.0,83.0,1.0
22322,BUCKINGHAMSHIRE,D,D,House,Mid-Terrace,Single,Y,Normal,Good,Good,...,589.0,93.0,82.0,72.7,0.0,0.0,4.0,4.0,78.0,1.0
24254,WINDSOR AND MAIDENHEAD,F,C,House,Mid-Terrace,Single,Y,Normal,Average,Average,...,648.0,102.0,81.0,84.0,5.0,2.0,4.0,2.0,0.0,1.0


In [33]:
# Partitioning data into chunks of 100 records (to monitor cost efficiency)
output_dir = 'data_output/prod_features/json_test_files/'
partition_size = 100

X_test = X_test.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

df_to_chunk ={'X_test':X_test, 
              'y_test':y_test}

for name, file in df_to_chunk.items():

    for chunk_id, chunk_df in iter(X_test.groupby(np.arange(len(file)) // partition_size)):
        file_path = f'{output_dir}{name}_part_{chunk_id}.parquet'
        chunk_df.to_parquet(file_path,index=False)

    print(f'Partioning of {name.title()} Complete: {chunk_id} Partitions Created')


Partioning of X_Test Complete: 87 Partitions Created
Partioning of Y_Test Complete: 87 Partitions Created


In [38]:
# Partitioning data into chunks of 100 records (to monitor cost efficiency)
output_dir = 'data_output/prod_features/json_test_files/'
partition_size = 100

X_test = X_test.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

df_to_chunk ={'X_test':X_test, 
              'y_test':y_test}

for name, file in df_to_chunk.items():

    for chunk_id, chunk_df in iter(X_test.groupby(np.arange(len(X_test)) // partition_size)):
        file_path = f'{output_dir}{name}_part_{chunk_id}.JSON'
        chunk_df.to_json(file_path,orient='records', indent=4)

    print(f'Partioning Complete: {chunk_id} Partitions Created')


Partioning Complete: 87 Partitions Created
Partioning Complete: 87 Partitions Created


In [55]:
class NaiveAgent:
    def __init__(self, system_prompt, user_prompt, model = 'gpt-5-nano' ):
        load_dotenv()
        
        api_key = os.getenv('OPENAI_API_KEY')
        if not api_key:
            raise ValueError('OpenAI API Key not found in .env')
        
        self.client = OpenAI(api_key=api_key)
        
        self.model = model
        self.system_prompt = system_prompt
        self.user_prompt = user_prompt
        self.input_path = Path('data_output/X_train.json')
        self.output_dir = Path('data_output')
        self.input_data = None
    
    def JSON_read(self, output_dir='data_output'):
        with self.input_path.open('r', encoding='utf-8') as f:
            input_data = json.load(f)
    
    def call_agent(self):   
        str_user_prompt = str(self.user_prompt)
        str_sys_prompt = str(self.system_prompt)
        
        agent_response = self.client.responses.create(
            model = self.model,
            input = [
                {
                    "role":"system",
                    "content":str_sys_prompt
                },
                {
                    "role":"user",
                    "content":str_user_prompt
                }
            ],
            text = {
                "format":{
                    "type":"json_schema",
                    "name":"agent_output",
                    "strict":True,
                    "schema": {
                        "type":"object",
                        "properties":{
                            "price":{"type":"number"}
                        },
                        "required":["price"],
                        "additionalProperties": False
                        }
                    }
                }
        )
        output = json.loads(agent_response.output_text)
        print(json.dumps(output, indent=3))

In [37]:
with open('data_output/X_train.json', 'r') as f:
    data = json.load(f)

first_entry = data[0]
print(first_entry)

{'1HE_district': 'BUCKINGHAMSHIRE', '1HE_CURRENT_ENERGY_RATING': 'D', '1HE_POTENTIAL_ENERGY_RATING': 'B', '1HE_PROPERTY_TYPE': 'House', '1HE_BUILT_FORM': 'Semi-Detached', '1HE_ENERGY_TARIFF': 'Single', '1HE_MAINS_GAS_FLAG': 'Y', '1HE_GLAZED_AREA': 'Normal', '1HE_HOT_WATER_ENERGY_EFF': 'Good', '1HE_HOT_WATER_ENV_EFF': 'Good', '1HE_WINDOWS_ENERGY_EFF': 'Good', '1HE_WINDOWS_ENV_EFF': 'Good', '1HE_WALLS_ENERGY_EFF': 'Very Poor', '1HE_WALLS_ENV_EFF': 'Very Poor', '1HE_ROOF_ENERGY_EFF': 'Very Poor', '1HE_MAINHEAT_ENERGY_EFF': 'Good', '1HE_MAINHEATC_ENERGY_EFF': 'Good', '1HE_LIGHTING_ENERGY_EFF': 'Good', '1HE_MECHANICAL_VENTILATION': 'natural', '1HE_TENURE': 'Owner Occupied', 'CORE_match_confidence': 92.5925925926, 'CORE_BUILDING_REFERENCE_NUMBER': 5175796468.0, 'CORE_CURRENT_ENERGY_EFFICIENCY': 67.0, 'CORE_POTENTIAL_ENERGY_EFFICIENCY': 81.0, 'CORE_ENVIRONMENT_IMPACT_CURRENT': 60.0, 'CORE_ENVIRONMENT_IMPACT_POTENTIAL': 75.0, 'CORE_ENERGY_CONSUMPTION_CURRENT': 199.0, 'CORE_ENERGY_CONSUMPTION_P

In [47]:
agent2 = NaiveAgent(
    system_prompt='Predict the house price in GBP based on what you know and the information provided about houses in the UK.',
    user_prompt=first_entry
    )
agent2.call_agent()

{
   "price": 625000
}


In [53]:
X_test.shape
X_train.shape

(20417, 44)

To Finish: Make OpenAI make predictions for each of the test datapoints. 

Try Batching API:
Send the test set tomorrow!

Try Caching your prompts.

## 2. Multi-Agent Models

In [None]:
agent_list = {
    "1.AnalystAgent":
        """
        You're a critically thinking Data Analyst that provides helpful supporting information by creating SQL scripts focusing on:
        1HE_district
        1HE_PROPERTY TYPE 
        1HE_BUILT_FORM 
        CORE_TOTAL_FLOOR_AREA 
        CORE_EXTENSION_COUNT 
        CORE_NUMBER_OF_HABITABLE_ROOMS
        
        Generate a SQL Script to query house prices dataset and generate an aggregated supporting data relating to the input JSON and the fields above. 
        
        Output Format:
        SELECT <Criteria>
        FROM <Table Names> 
        """,
    "2.PredictionAgent":
        """
        Using data obtained from the Analyst Agent, make a prediction for the house price in GBP for the JSON input. 
        Focus on and consider the following core features from the JSON input to make your prediction:
        1HE_district
        1HE_PROPERTY TYPE 
        1HE_BUILT_FORM 
        CORE_TOTAL_FLOOR_AREA 
        CORE_EXTENSION_COUNT 
        CORE_NUMBER_OF_HABITABLE_ROOMS
        """
}

In [None]:
class MultiAgent(NaiveAgent):
    
    def __init__(self, system_prompt, user_prompt, model='gpt-5-nano'):
        super().__init__(system_prompt, user_prompt, model) # Initialising the parent class. 
        self.memory = []
        
    def initialise_agents(self, agent_list, max_agents = 5):
        self.max_agents = max_agents
        self.agent_list = agent_list
        
        if len(agent_list) > self.max_agents: # Validation to prevent high costs
            raise ValueError(f"You have more than {self.max_agents} agents, please reduce n_agents or increase max_agents.")
        
        for agent_name, system_prompt in agent_list.items():
            self.agent_name = agent_name
            self.system_prompt = system_prompt
            self.startThinking() # Should be replaced with createSingleAgent()
                
    def createSingleAgent(self):
        pass 
    
    def startThinking(self,agent):
        # Use self.memory to input into the agent!
        print(f"Agent:{self.agent_name} \nSys_Prompt:\n{self.system_prompt}!")
        
        output = None
        self.memory.append(output)
    
    def main(self):
        startThinking(agent="1.AnalystAgent")
        startThinking(agent="2.PredictionAgent")
        

In [117]:
complex_agent = MultiAgent(system_prompt='Say Hello', user_prompt='Say Hello')

In [118]:
complex_agent.create_agents(agent_list)

Agent:Analyst 
Sys_Prompt:

        You're a critically thinking Data Analyst that provides helpful supporting information by creating SQL scripts focusing on:
        1HE_district
        1HE_PROPERTY TYPE 
        1HE_BUILT_FORM 
        CORE_TOTAL_FLOOR_AREA 
        CORE_EXTENSION_COUNT 
        CORE_NUMBER_OF_HABITABLE_ROOMS
        
        Generate a SQL Script to query house prices dataset and generate an aggregated supporting data relating to the input JSON and the fields above. 
        
        Output Format:
        SELECT <Criteria>
        FROM <Table Names> 
        !
Agent:Agent B 
Sys_Prompt:

        Using data obtained from the Analyst Agent, make a prediction for the house price in GBP for the JSON input. 
        Focus on and consider the following core features from the JSON input to make your prediction:
        1HE_district
        1HE_PROPERTY TYPE 
        1HE_BUILT_FORM 
        CORE_TOTAL_FLOOR_AREA 
        CORE_EXTENSION_COUNT 
        CORE_NUMBER_OF_HABIT

Version 1:
- 2-agent mode with a critique given average/summary stats. 
    - Tool calling perhaps?
- n-agent with querying the dataset using SQL? 
    - Must either use pandassql or set up a SQL server. The former might be a quick win. 
    - Create an MasterAgent() inheriting from NaiveAgent(). When initialised, this will create n-naive_agents but the prompt will be the key different. 
    - NaiveAgent class may need to be refactored to be generalisable.