In [3]:
# Standard libraries
import os
import sys
import json
from dotenv import load_dotenv

# Third-party libraries
import pandas as pd
from IPython.display import HTML

# Add project root to sys.path
project_root = os.path.dirname(os.getcwd())
sys.path.append(project_root)

# Local modules
from src.classifiers import is_uae_real_estate_query, llm_classifier
from src.tools import extract_data_intent, safe_dataframe_tool, create_plotly_code
from src.geo_tools import generate_google_maps_html

load_dotenv()

True

In [4]:
query = "create bar chart of average price per square meter for 2 bedroom flats in Dubai by area"

data_intent = extract_data_intent.invoke(query)

    # Step 3: Use PandasAI to retrieve relevant data
data_json_str = safe_dataframe_tool.invoke(data_intent)
data_dict = json.loads(data_json_str)


viz_input = json.dumps({"data": data_dict["result"], "query": query})

Dataset loaded successfully.


In [6]:
create_plotly_code(viz_input)

  create_plotly_code(viz_input)


'{"success": true, "result": "import pandas as pd\\nimport plotly.graph_objs as go\\n\\n# Your data\\ndata = [\\n    [\'Dubai Marina\', 62749.367424],\\n    [\'Dubai Sports City\', 23636.363636],\\n    [\'Al Barari\', 131818.181818],\\n    [\'Al Wasl\', 60818.181818],\\n    [\'Al Furjan\', 52272.727273],\\n    [\'Business Bay\', 81587.121212],\\n    [\'Palm Jumeirah\', 183080.805556],\\n    [\'City Walk\', 104829.545455],\\n    [\'International City\', 15454.545455],\\n    [\'Dubai Residence Complex\', 27240.909091],\\n    [\'Town Square\', 29545.454545],\\n    [\'Downtown Dubai\', 85833.333333],\\n    [\'Zabeel\', 108712.121212],\\n    [\'Arjan\', 32681.818182],\\n    [\'Jumeirah Beach Residence\', 169409.090909],\\n    [\'Dubai Harbour\', 148863.636364],\\n    [\'Hills\', 75000.000000],\\n    [\'Views\', 60000.000000],\\n    [\'Al Quoz\', 32954.545455],\\n    [\'Dubai Silicon Oasis\', 37954.545455],\\n    [\'Springs\', 74983.762987],\\n    [\'Culture Village\', 85227.272727],\\n    [

In [None]:
def main_agent(query: str):
    # Step 1: Check if query is relevant
    if not is_uae_real_estate_query(query):
        return {"type": "output", "data": "This is an irrelevant question to UAE property."}

    # Step 2: Extract data intent from user query
    data_intent = extract_data_intent.invoke(query)

    # Step 3: Use PandasAI to retrieve relevant data
    data_json_str = safe_dataframe_tool.invoke(data_intent)
    data_dict = json.loads(data_json_str)

    if not data_dict.get("success"):
        return {"type": "error", "error": data_dict.get("error"), "solution": data_dict.get("solution")}

    # Step 4: Classify the user's goal
    action = llm_classifier(query)
    

    # Step 5: Take action based on user intent
    if action == "output":
        return {"type": "data", "data": data_dict["result"]}
    
    
    elif action == "plot_stats":
       
        viz_input = json.dumps({"data": data_dict["result"], "query": query})
        result_json_str = create_plotly_code.invoke(viz_input)
        result = json.loads(result_json_str)
        return {"type": "plot", **result}

    elif action == "geospatial_plot":
        
        html = generate_google_maps_html(data_dict["result"])
        return {"type": "html", "content": html}

    else:
        return {"type": "error", "error": f"Unknown action '{action}' from classifier."}


In [6]:
data_intent = extract_data_intent.invoke("create time series plot of the number of properties added in dubai per month")

    # Step 3: Use PandasAI to retrieve relevant data
data_json_str = safe_dataframe_tool.invoke(data_intent)
data_dict = json.loads(data_json_str)

Dataset loaded successfully.


In [8]:
data_dict

{'success': False,
 'result': None,
 'error': 'Object of type Timestamp is not JSON serializable',
 'solution': 'Check your query syntax or the dataset structure.'}

In [30]:
import streamlit as st
import pandas as pd
import openai
from typing import Optional

def get_openai_llm():
    """Initialize OpenAI client"""
    return openai.OpenAI(api_key=st.secrets["OPENAI_API_KEY"])

def extract_python_code(text: str) -> Optional[str]:
    """
    Extract Python code from markdown code blocks.
    Returns None if no code found.
    """
    if '```python' in text:
        return text.split('```python')[1].split('```')[0].strip()
    elif '```' in text:
        return text.split('```')[1].split('```')[0].strip()
    return None

def create_plotly_code(data_dict: dict, user_input: str, verbose: bool = True):
    """Generate and execute Plotly code using your exact prompt format"""
    # Prepare data
    data = pd.DataFrame(data_dict["result"])
    
    # Create prompt (using your exact format)
    code_prompt = f"""
        Generate the code <code> for plotting the data, {data}, in plotly,
        in the format requested by: {user_input}.
        The solution should be given using plotly and only plotly.
        Do not use matplotlib. Return the code <code> in the following
        format python <code>
    """
    # Get AI response
    official_ai = get_openai_llm()
    response = official_ai.chat.completions.create(
        model="gpt-4",
        messages=[{"role": "user", "content": code_prompt}],
        max_tokens=1000
    )
    
    # Extract code
    raw_response = response.choices[0].message.content
    code = extract_python_code(raw_response)

    # Modify code for Streamlit
    code = code.replace("fig.show()", "")
    code += "\nst.plotly_chart(fig, use_container_width=True)"
    
    return code

In [5]:
data_intent = extract_data_intent.invoke("make bar plot of the average property prices in Jumeirah by the number of rooms")

    # Step 3: Use PandasAI to retrieve relevant data
data_json_str = safe_dataframe_tool.invoke(data_intent)
data_dict = json.loads(data_json_str)

Dataset loaded successfully.


In [None]:
from src.tools import get

In [6]:
data_intent

'Extract average property prices in Jumeirah grouped by the number of rooms.'

In [6]:
result

{'type': 'plot',
 'success': True,
 'result': {'code': 'import pandas as pd\nimport plotly.express as px\n\ndata = [\n  {\n    "bedrooms": 0,\n    "avg_price": 6823900.0\n  },\n  {\n    "bedrooms": 2,\n    "avg_price": 6170666.619047619\n  },\n  {\n    "bedrooms": 1,\n    "avg_price": 2423169.0434782607\n  },\n  {\n    "bedrooms": 3,\n    "avg_price": 15367307.692307692\n  },\n  {\n    "bedrooms": 4,\n    "avg_price": 15542356.6\n  }\n]\ndf = pd.DataFrame(data)\nfig = px.bar(df, x=\'bedrooms\', y=\'avg_price\', title=\'Average Property Prices in Jumeirah by Number of Rooms\')\nfig.update_layout(xaxis_title=\'Number of Rooms\', yaxis_title=\'Average Price\')',
  'image_base64': 'iVBORw0KGgoAAAANSUhEUgAAArwAAAH0CAYAAADfWf7fAAAgAElEQVR4Xu29CZQV1dW+v0HEWVFEnIfEDyODihglahIQh0icEcGoOEBQgyGIc4woSBSIgoBGRUEFMUYGURSnEBWNGo2QgDitqPkIg0FRQRGFAL/1nu9f/b/ddsu93b3vrVP3qbVYQHfVrn2e95x93zp1qm6D9evXrzc2CEAAAhCAAAQgAAEIZJRAAwxvRpWlWRCAAAQgAAEIQAACgQCGl44AAQhAAAIQgAAEIJBpAhjeTMtL4yAAAQhAAAIQgAAEMLz0AQhAA

In [3]:
DATA_DIR = r"../data/uae_real_estate_2024_geo_ready2.parquet"
df =  pd.read_parquet(DATA_DIR)

In [6]:
df.drop(columns="addedOn", inplace=True)

In [7]:
df.columns

Index(['title', 'displayAddress', 'bathrooms', 'bedrooms', 'type', 'price',
       'verified', 'priceDuration', 'sizeMin', 'furnishing', 'description',
       'latitude', 'longitude', 'City'],
      dtype='object')

In [11]:
df.columns[df.isna().any()].tolist()


['bathrooms', 'bedrooms', 'sizeMin', 'furnishing']

In [15]:
df['bathrooms'].fillna(0, inplace=True)
df['bedrooms'].fillna(0, inplace=True)
df['furnishing'] = df['furnishing'].cat.add_categories('Unknown')
df['furnishing'] = df['furnishing'].fillna('Unknown')


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['bathrooms'].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['bedrooms'].fillna(0, inplace=True)


In [16]:
df["sizeMin"] = df["bedrooms"].apply(lambda x: 20 + 12 * x)

In [18]:
df['addedOn'] = df['addedOn'].dt.strftime('%Y-%m-%dT%H:%M:%S')

In [22]:
df["displayAddress"]

0         Binghatti Canal, Business Bay, Dubai
1      La Vie, Jumeirah Beach Residence, Dubai
2                    East 40, Al Furjan, Dubai
3                    Meadows 3, Meadows, Dubai
4          Princess Tower, Dubai Marina, Dubai
                        ...                   
558           Zulal 1, Zulal, The Lakes, Dubai
559           Cayan Tower, Dubai Marina, Dubai
560            West Yas, Yas Island, Abu Dhabi
561           Al Marjan Island, Ras Al Khaimah
562           Oia Residence, Motor City, Dubai
Name: displayAddress, Length: 563, dtype: string

In [19]:
df.columns[df.isna().any()].tolist()


[]

In [20]:
df.to_csv(r"../data/uae_real_estate_2024_geo_ready2.csv", index=False)

In [21]:
import pandasai as pai

companies = pai.create(
  path="my-org/properties3",
  df=pai.read_csv("../data/uae_real_estate_2024_geo_ready2.csv"),
  description="UAE properties dataset 2"
)

Dataset saved successfully to path: my-org/properties3


In [7]:
result = companies.chat("Retrieve properties located within a 2-mile radius of Dubai Eye.")