In [None]:
# NOTE(review): this cell has no execution count (never run in the saved notebook).
# A bare `ls` presumably relies on IPython to list the working directory;
# confirm whether this scratch cell can be dropped.
ls

In [1]:
from pydantic_ai import Agent
import os
import requests
from datetime import datetime, timedelta
import matplotlib.image as mpimg
import matplotlib.pyplot as plt
from requests.models import PreparedRequest
from urllib.request import Request, urlopen
import numpy as np
from dateutil.relativedelta import relativedelta
from typing import Optional

import json
import plotly.express as px
import pandas as pd
from pydantic import Field
import sys
# Add the local hackathon folder to the import path so the `codes` module below resolves.
# NOTE(review): absolute, user-specific path — this only works on the author's machine.
sys.path.append('/Users/williamharrigan/Desktop/hackathon/')
# NOTE(review): star import; which names (if any) this notebook takes from `codes`
# cannot be told from here — confirm it is still needed.
from codes import *
import pytz
from enum import Enum

from typing import Optional
from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta
import pytz
from pydantic import BaseModel

from enum import Enum
from typing import Optional, Dict, Any, List, Tuple
from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta
import pytz
import requests
import pandas as pd
import plotly.express as px
from pydantic import BaseModel

# Directory where generated artifacts are written (ClimateAPI.plot_timeseries saves
# its Plotly HTML file here). Set the `hcdp_output_dir` environment variable to
# override it; when unset, the original hard-coded location is used.
output_dir = os.environ.get("hcdp_output_dir", '/Users/williamharrigan/Desktop/hackathon/')


In [2]:
# System prompt passed to the agent: it frames the assistant as the HCDP helper,
# asks it to plan its answer around the available tools, and to reply with the
# final answer only. The text is sent to the model verbatim.
prompt_process_query = """You are an AI assistant for the Hawai'i Climate Data Portal. 
You are responsible for answering the user query based on the results you get back from the tools.

When a user asks a query, carefully analyze the query and determine the best way to respond to the following questions:
1. What are the specific information the user is asking for?
2. What tools do you need to use to get the information?
3. What is the best way to format the response?

Plan your response step by step using the available tools and respond with the final answer only.

Output Format:
Only respond with the final answer. Do not lead the user to more conversation.
"""


# prompt_process_query = """You are a Concierge AI assistant for the Hawai'i Climate Data Portal (HCDP). Your primary job is to provide users accurate and reliable answers backed by the data extracted from the available tools. 

# Users may ask for two types of information:
# 1. Climate data for a specific location and time period. You have access to the HCDP API, which provides climate data for various locations in Hawaii. You can use the API to get climate data such as temperature and rainfall for specific locations and time periods.
# 2. General information about HCDP, including its purpose and how to access data. You can also provide general information about the HCDP web portal, including its purpose and how to access certain data.

# When a user asks a query, carefully classify the query into one of the two categories mentioned above. If you are unable to classify the query, ask the user to clarify the question.

# if the query is type 1: about climate data, fill in the following template:

# aggregation: "min" | "max" | "mean", 
# period: "day" | "month" | "year",
# location name: str,
# start_date: Optional[str],
# end_date: Optional[str]

# Once you have the answers to the questions, plan your response step by step using the available tools and respond with the final answer only.

# Output Format:
# Only respond with the final answer in an easy to understand language. Do not lead the user to more conversation.

# """

In [3]:
# The agent that answers user queries. It uses the model id
# "groq:deepseek-r1-distill-llama-70b", returns plain strings, is primed with
# `prompt_process_query`, and runs at temperature 0.0 (presumably to keep answers
# as repeatable as possible). Tools are attached to it further down through the
# `@prompt_process_agent.tool_plain` decorators.
prompt_process_agent = Agent(  
    "groq:deepseek-r1-distill-llama-70b",
    result_type=str,    
    system_prompt=prompt_process_query,
    model_settings={'temperature': 0.0}    
)

In [None]:

# prompt_process_agent = Agent(  
#     "google:gemini-1.5-flash",  # Gemini model
#     result_type=str,
#     system_prompt=prompt_process_query,
#     model_settings={'temperature': 0.0}    
# )


In [4]:
# HCDP API token. It is read from the `hcdp_api_token` environment variable
# (a KeyError is raised if the variable is not set), so the secret itself is
# never stored in the notebook.
hcdp_api_token = os.environ["hcdp_api_token"]
# Please input your email address. This will be used for user logging or distributing data packages
# NOTE(review): `email` is not referenced anywhere else in the visible notebook.
email = "INSERT_EMAIL_ADDRESS_HERE"

# Base URL that every endpoint path used in this notebook is appended to.
api_base_url = "https://api.hcdp.ikewai.org"
# Authorization header (bearer token) passed with the API requests below.
header = {
  "Authorization": f"Bearer {hcdp_api_token}"
}

In [None]:
## Functions from the HCDP documentation

def display_raster(params, title, cmap = plt.cm.viridis.reversed(), nodata_color = "#f0f0f0"):
    """Download a TIFF from the ``/raster`` endpoint and draw it with a color bar.

    Args:
        params: Query parameters for the ``/raster`` endpoint; they are encoded
            into the request URL.
        title: Title shown above the plot.
        cmap: Matplotlib colormap used for the data values. The function works
            on a copy, so neither the caller's colormap nor the shared default
            (created once when the function is defined) is modified.
        nodata_color: Color used to draw masked (no-data) cells.

    Returns:
        None. The figure is created as a side effect.
    """
    import copy

    #construct raster endpoint url base
    raster_ep = "/raster"
    url = f"{api_base_url}{raster_ep}"
    #construct url with params
    url_constructor = PreparedRequest()
    url_constructor.prepare_url(url, params)
    full_url = url_constructor.url
    print(f"Constructed API request URL: {full_url}")
    #create request object for use with urlopen
    req = Request(full_url, headers = header)
    #set up plot
    fig, ax = plt.subplots(figsize=(20, 10), facecolor = "#e0e0e0")
    #remove axis ticks (displays row, col numbers, not super helpful)
    ax.axes.get_xaxis().set_ticks([])
    ax.axes.get_yaxis().set_ticks([])
    #set plot title
    ax.set_title(title, fontsize = 20)
    #set nodata color on a private copy: set_bad() mutates the colormap in place,
    #which would otherwise leak into later calls through the shared default argument
    cmap = copy.copy(cmap)
    cmap.set_bad(nodata_color)
    #open data stream from API
    with urlopen(req) as raster:
        #read tiff image
        img = mpimg.imread(raster, format = "tiff")
        #mask nodata values (the value of the top-left pixel is used as the nodata marker)
        masked = np.ma.masked_equal(img, img[0][0])
        #plot the first band with the color schema and add color bar
        imgplot = ax.imshow(masked[:, :, 0], cmap = cmap)
        fig.colorbar(imgplot, ax = ax)
        
        
def query_stations(values, name, limit = 10000, offset = 0):
    """Query the ``/stations`` endpoint and return the ``"value"`` of each result.

    Args:
        values: Mapping of field name to required value. Each key is sent as
            ``value.<key>`` inside the JSON-encoded query.
        name: Value of the query's ``name`` field.
        limit: Sent as the ``limit`` request parameter.
        offset: Sent as the ``offset`` request parameter.

    Returns:
        A list holding the ``"value"`` entry of every item in the response's
        ``"result"`` list.

    Raises:
        requests.HTTPError: If the response has an error status.
    """
    # The filter itself travels as one JSON document in the "q" parameter.
    query = {"name": name, **{f"value.{field}": values[field] for field in values}}
    request_params = {
        "q": json.dumps(query),
        "limit": limit,
        "offset": offset,
    }

    print(request_params)

    url = f"{api_base_url}/stations"
    response = requests.get(url, request_params, headers = header)
    response.raise_for_status()
    print(f"Constructed API request URL: {response.url}")
    return [entry["value"] for entry in response.json()["result"]]

def get_station_metadata():
    """Fetch the station metadata documents and index them by their identifier.

    Returns:
        A dict mapping each document's identifier to the full document. The
        identifier is the value stored under the field that the document's own
        ``"id_field"`` entry names.
    """
    documents = query_stations({}, name = "hcdp_station_metadata")
    return {doc[doc["id_field"]]: doc for doc in documents}

def get_station_data(values, metadata = None, limit = 10000, offset = 0):
    """Fetch station value documents, optionally merged with station metadata.

    Args:
        values: Field filters forwarded to ``query_stations``.
        metadata: Optional mapping of station id to metadata dict. When given,
            each value document is merged with the metadata of its
            ``"station_id"``; documents without metadata are dropped.
        limit: Forwarded to ``query_stations``.
        offset: Forwarded to ``query_stations``.

    Returns:
        The raw value documents when ``metadata`` is None, otherwise the list
        of merged dicts (metadata keys win on conflicts).
    """
    records = query_stations(values, name = "hcdp_station_value", limit = limit, offset = offset)
    if metadata is None:
        return records

    # Pair every record with its station's metadata, then keep only the matches.
    paired = ((record, metadata.get(record["station_id"])) for record in records)
    return [record | info for record, info in paired if info is not None]

In [5]:
# Climate variables that can be requested. The string value is what ends up in
# the `datatype` field of ClimateDataParams and therefore in the request.
class DataType(str, Enum):
    TEMPERATURE = "temperature"
    RAINFALL = "rainfall"

# Statistic requested for temperature series. Only the temperature tool sets it;
# the rainfall tool leaves the `aggregation` field unset.
class Aggregation(str, Enum):
    MIN = "min"
    MAX = "max"
    MEAN = "mean"

# Rainfall product selector, used in the `production` field of rainfall requests.
# NOTE(review): the docstring below may be surfaced to the model through the tool
# parameter schema — confirm before rewording it.
class Production(str, Enum):
    '''
    Production can be "new" or "legacy". Legacy rainfall maps are available from 1920-2012, whereas new rainfall maps are available from 1990-present
    '''
    NEW = "new"
    LEGACY = "legacy"

# Value of the `period` request field.
# NOTE(review): presumably the temporal resolution of the returned series (one
# value per day / month / year) — confirm against the API documentation.
class Period(str, Enum):
    DAY = "day"
    MONTH = "month"
    YEAR = "year"

# Spatial extent codes used in the `extent` request field.
# NOTE(review): the docstring describes the members as islands while the inline
# comments describe them as counties — confirm which wording is accurate. The
# docstring may also be surfaced to the model through the tool parameter schema,
# so check before rewording it.
class Extent(str, Enum):
    '''
    HAWAII = Big Island, Hawai'i
    KAUAI = Kauai island, Hawai'i
    HONOLULU = O'ahu island, Hawai'i
    MAUI = Maui island
    STATEWIDE = all of the Hawaiian islands
    '''
    STATEWIDE = "statewide"  # Data for the whole state
    HAWAII = "bi"            # Hawaii county
    KAUAI = "ka"             # Kauai county
    MAUI = "mn"              # Maui county
    HONOLULU = "oa"          # Honolulu county

# Query parameters for one raster timeseries request. ClimateAPI.get_timeseries_data
# dumps this model, drops the None-valued fields and sends the rest as the query string.
class ClimateDataParams(BaseModel):
    datatype: DataType
    period: Period
    # Bounds of the requested range; the tools' default values are formatted as YYYY-MM-DD.
    start: str
    end: str
    extent: Extent
    # Coordinates of the point of interest. Optional in the model, but both tools always pass them.
    lat: Optional[float] = None
    lng: Optional[float] = None
    # Optional fields that differ between temperature and rainfall
    # (the tools set `aggregation` for temperature and `production` for rainfall).
    aggregation: Optional[Aggregation] = None
    production: Optional[Production] = None

In [6]:

class ClimateAPI:
    """Small client for the raster timeseries endpoint, plus a Plotly line-plot helper."""

    def __init__(self, api_base_url: str, header: Dict[str, str]):
        """Remember where and how to call the API.

        Args:
            api_base_url: Base URL of the API; the endpoint path is appended to it.
            header: HTTP headers sent with each request.
        """
        self.api_base_url = api_base_url
        self.header = header
        self.raster_timeseries_ep = "/raster/timeseries"

    @staticmethod
    def _temperature_aggregation(params: ClimateDataParams) -> Aggregation:
        """Return the aggregation of a temperature request, or raise if it is missing."""
        if params.aggregation is None:
            raise ValueError(
                "Temperature requests require an aggregation (min, max or mean)."
            )
        return params.aggregation

    def get_timeseries_data(self, params: ClimateDataParams) -> pd.DataFrame:
        """Get timeseries data from the API based on provided parameters.

        Args:
            params: Request parameters; fields that are None are not sent.

        Returns:
            A DataFrame sorted by "Date" with two columns: "Date" (the keys of
            the JSON response) and a value column whose label depends on the
            datatype, e.g. "Mean Temperature (°C)" or "Rainfall (mm)".

        Raises:
            ValueError: If a temperature request carries no aggregation.
            requests.HTTPError: If the API answers with an error status.
        """
        # Work out the value column label first so that an invalid request
        # fails before any network traffic happens.
        if params.datatype == DataType.TEMPERATURE:
            aggregation = self._temperature_aggregation(params)
            value_col = f"{aggregation.value.capitalize()} {params.datatype.value.capitalize()} (°C)"
        else:  # For rainfall
            value_col = f"{params.datatype.value.capitalize()} (mm)"

        url = f"{self.api_base_url}{self.raster_timeseries_ep}"
        # Remove None values from params
        params_dict = {k: v for k, v in params.model_dump().items() if v is not None}

        res = requests.get(url, params_dict, headers=self.header)
        res.raise_for_status()
        print(f"Constructed API request URL: {res.url}")

        # The response is a JSON object whose items become (Date, value) rows.
        df = pd.DataFrame(list(res.json().items()), columns=["Date", value_col])
        return df.sort_values(by="Date")

    def plot_timeseries(self, df: pd.DataFrame, params: ClimateDataParams) -> Any:
        """Draw a Plotly line plot of the series and save it as an HTML file.

        Args:
            df: Frame with a "Date" column and the values in its second column,
                as returned by ``get_timeseries_data``.
            params: The request the data came from; used to build the title.

        Returns:
            The Plotly figure. It is also written to ``test_new.html`` inside
            the module-level ``output_dir``.

        Raises:
            ValueError: If a temperature request carries no aggregation.
        """
        value_col = df.columns[1]  # Second column contains the values

        # Create appropriate title based on params
        if params.datatype == DataType.TEMPERATURE:
            aggregation = self._temperature_aggregation(params)
            title = f"Summary of {aggregation.value} {params.datatype.value} from {params.period.value}"
        else:
            title = f"Summary of {params.datatype.value} from {params.period.value}"

        if params.lat is not None and params.lng is not None:
            title += f" for location Latitude: {params.lat}, Longitude: {params.lng}"

        fig = px.line(df, title=title, x="Date", y=value_col)
        fig.write_html(f"{output_dir}/test_new.html")
        return fig

# Number of rows taken from each end of a long series for the preview.
_PREVIEW_ROWS = 5


def _resolve_date_range(start_date: Optional[str], end_date: Optional[str]) -> Tuple[str, str]:
    """Return (start, end) strings, filling in missing bounds with defaults.

    Defaults are computed in the US/Hawaii timezone: the start falls back to one
    year before today, the end falls back to yesterday, both as YYYY-MM-DD.
    """
    today = datetime.now(pytz.timezone("US/Hawaii"))
    default_start = (today - relativedelta(years=1)).strftime("%Y-%m-%d")
    default_end = (today - timedelta(days=1)).strftime("%Y-%m-%d")
    return start_date or default_start, end_date or default_end


def _run_timeseries_query(params: ClimateDataParams, lat: float, lng: float) -> Dict[str, Any]:
    """Fetch the series for `params`, plot it when non-empty, and build the tool result."""
    api = ClimateAPI(api_base_url=api_base_url, header=header)
    print("API initialized:", api)
    print("Query parameters:", params)

    df = api.get_timeseries_data(params)
    has_data = not df.empty and "Date" in df.columns

    # Short series are returned whole: head + tail would repeat rows whenever
    # the frame has no more than 2 * _PREVIEW_ROWS rows.
    if len(df) > 2 * _PREVIEW_ROWS:
        preview = (
            df.head(_PREVIEW_ROWS).to_dict(orient="records")
            + df.tail(_PREVIEW_ROWS).to_dict(orient="records")
        )
    else:
        preview = df.to_dict(orient="records")

    # Statistics of an empty series would be NaN; report them as missing instead.
    values = df.iloc[:, 1]
    if df.empty:
        stats = {"mean": None, "min": None, "max": None}
    else:
        stats = {"mean": values.mean(), "min": values.min(), "max": values.max()}

    # Plot the time series if data exists and has a "Date" column
    if has_data:
        api.plot_timeseries(df, params)
        print("Time series plot has been generated.")
    else:
        print("No time series data available for plotting.")

    # Return structured result with data preview + summary
    return {
        "data_preview": preview,
        "summary": {
            **stats,
            "location": {"lat": lat, "lng": lng},
            "period": f"{params.start} to {params.end}",
        },
        # Lets the model tell the user whether a plot file was actually written.
        "plot_generated": has_data,
    }


@prompt_process_agent.tool_plain
def get_temperature_timeseries(
    aggregation: Aggregation,
    period: Period,
    lat: float,
    lng: float,
    start_date: Optional[str] = None,
    end_date: Optional[str] = None,
    extent: Optional[Extent] = Extent.STATEWIDE
) -> Dict[str, Any]:
    """Return temperature timeseries data for the specified location, period and aggregation.

    A line plot of the series is also saved as an HTML file when data is available.

    Args:
        aggregation: Temperature statistic to retrieve: min, max or mean.
        period: Resolution of the series: day, month or year.
        lat: Latitude of the location in decimal degrees.
        lng: Longitude of the location in decimal degrees.
        start_date: First date of the range, formatted like "2020-08-01".
            Defaults to one year before today (Hawaii time).
        end_date: Last date of the range, formatted like "2022-08-31".
            Defaults to yesterday (Hawaii time).
        extent: Currently ignored; the statewide dataset is always queried.

    Returns:
        A dict with "data_preview" (all rows of a short series, otherwise the
        first and last five rows), "summary" (mean, min and max in °C, the
        location and the period) and "plot_generated".
    """
    start_str, end_str = _resolve_date_range(start_date, end_date)

    # Create params for temperature (requires aggregation).
    # NOTE(review): `extent` is deliberately not forwarded — the original code had
    # it disabled in favour of "statewide". Confirm whether it should be honoured.
    params = ClimateDataParams(
        datatype=DataType.TEMPERATURE,
        aggregation=aggregation,
        period=period,
        start=start_str,
        end=end_str,
        extent="statewide",
        lat=lat,
        lng=lng
    )

    return _run_timeseries_query(params, lat, lng)


@prompt_process_agent.tool_plain
def get_rainfall_timeseries(
    period: Period,
    lat: float,
    lng: float,
    start_date: Optional[str] = None,
    end_date: Optional[str] = None,
    production: Production = Production.NEW,
    extent: Optional[Extent] = Extent.STATEWIDE
) -> Dict[str, Any]:
    """Return the max, mean and min of the rainfall data. Keep it concise.

    A line plot of the series is also saved as an HTML file when data is available.

    Args:
        period: Resolution of the series: day, month or year.
        lat: Latitude of the location in decimal degrees.
        lng: Longitude of the location in decimal degrees.
        start_date: First date of the range, formatted like "2020-08-01".
            Defaults to one year before today (Hawaii time).
        end_date: Last date of the range, formatted like "2022-08-31".
            Defaults to yesterday (Hawaii time).
        production: Rainfall product to query: new or legacy.
        extent: Spatial extent to query; statewide when omitted.

    Returns:
        A dict with "data_preview" (all rows of a short series, otherwise the
        first and last five rows), "summary" (mean, min and max in mm, the
        location and the period) and "plot_generated".
    """
    start_str, end_str = _resolve_date_range(start_date, end_date)

    # Create params for rainfall (requires production instead of aggregation).
    # The caller's `production` is honoured (it used to be overridden with "new"),
    # and a missing extent falls back to statewide because the model requires one.
    params = ClimateDataParams(
        datatype=DataType.RAINFALL,
        production=production,
        period=period,
        start=start_str,
        end=end_str,
        extent=extent if extent is not None else Extent.STATEWIDE,
        lat=lat,
        lng=lng
    )

    return _run_timeseries_query(params, lat, lng)

In [9]:
# Rainfall scenario: explicit date range, daily values, explicit coordinates, plus a plot request.
r = await prompt_process_agent.run("What was the daily rainfall from August 2020 to August 2022 at the coordinates lat:21.301035061407028,lng:-157.81837141983394? Plot the rainfall timeseries.")    

In [10]:
r.data

'The daily rainfall data from August 1, 2020, to August 31, 2022, at the coordinates lat: 21.301035061407028, lng: -157.81837141983394 is as follows:\n\n### Data Preview:\n| Date                | Rainfall (mm) |\n|---------------------|---------------|\n| 2020-08-01T00:00:00| 0.00267953    |\n| 2020-08-02T00:00:00| 0.803978      |\n| 2020-08-03T00:00:00| 1.65966       |\n| 2020-08-04T00:00:00| 0.134655      |\n| 2020-08-05T00:00:00| 0.147128      |\n| 2022-08-27T00:00:00| 0.306203      |\n| 2022-08-28T00:00:00| 0.501638      |\n| 2022-08-29T00:00:00| 0.0931039     |\n| 2022-08-30T00:00:00| 0.192087      |\n| 2022-08-31T00:00:00| 0.534026      |\n\n### Summary:\n- **Mean Rainfall**: 2.2506904307838838 mm\n- **Minimum Rainfall**: 0.0 mm\n- **Maximum Rainfall**: 142.995 mm\n- **Location**: lat: 21.301035061407028, lng: -157.81837141983394\n- **Period**: 2020-08-01 to 2022-08-31'

In [11]:
# Temperature scenario: mean temperature over an explicit range, with a request for a daily plot.
# NOTE(review): the recorded answer below tells the user to plot the data themselves;
# check that the HTML plot was actually produced and is surfaced to the user.
r = await prompt_process_agent.run("What was the mean temperature from August 2020 to August 2022 at the coordinates lat:21.301035061407028,lng:-157.81837141983394? Plot how the mean temperature changed everyday within that time range.")    

In [12]:
r.data

'The mean temperature at the coordinates lat: 21.301035061407028, lng: -157.81837141983394 from August 1, 2020, to August 31, 2022, was **24.79°C**. The temperature fluctuated between a minimum of **19.2°C** and a maximum of **27.8°C** during this period.\n\nTo visualize how the mean temperature changed daily, you can plot the timeseries data. The data preview shows that temperatures generally remained consistent, with slight variations. For example:\n- On August 1, 2020, the mean temperature was **26.3°C**.\n- On August 31, 2022, the mean temperature was **25.5°C**.\n\nYou can use the full dataset to create a line graph where the x-axis represents the date and the y-axis represents the mean temperature in degrees Celsius. This will provide a clear visual representation of the daily temperature changes over the specified time range.'

In [13]:
# Same question as the previous run, asked for a second set of coordinates.
r = await prompt_process_agent.run("What was the mean temperature from August 2020 to August 2022 at the coordinates lat:21.501947,lng:-157.966537? Plot how the mean temperature changed everyday within that time range.")    

In [14]:
r.data

'The mean temperature from August 1, 2020, to August 31, 2022, at the coordinates lat: 21.501947, lng: -157.966537 was **22.32°C**. \n\nHere’s a summary of the temperature data:\n- **Mean Temperature**: 22.32°C\n- **Minimum Temperature**: 16.7°C\n- **Maximum Temperature**: 26.1°C\n\nBelow is a preview of the daily mean temperature data:\n\n| Date       | Mean Temperature (°C) |\n|------------|-------------------------|\n| 2020-08-01 | 23.1                   |\n| 2020-08-02 | 23.1                   |\n| 2020-08-03 | 23.7                   |\n| 2020-08-04 | 23.6                   |\n| 2020-08-05 | 24.5                   |\n| 2022-08-27 | 24.8                   |\n| 2022-08-28 | 23.8                   |\n| 2022-08-29 | 24.0                   |\n| 2022-08-30 | 24.0                   |\n| 2022-08-31 | 23.8                   |\n\nTo visualize how the mean temperature changed over time, you can plot the data using tools like Excel or Python libraries such as matplotlib.'

In [15]:
# Monthly max-temperature scenario over a ten-year range; coordinates are given as a bracketed pair.
# NOTE(review): the recorded output below is cut off mid-sentence.
r = await prompt_process_agent.run("Give me the max temperature monthly from August 1990 to August 2000 in coordinates [21.3069, -157.8583]?")

In [16]:
r.data

'Here is the monthly maximum temperature data from August 1990 to August 2000 at the coordinates [21.3069, -157.8583]:\n\n- **Date**: 1990-08-01T00:00:00 | **Max Temperature (°C)**: 29.9217  \n- **Date**: 1990-09-01T00:00:00 | **Max Temperature (°C)**: 29.9504  \n- **Date**: 1990-10-01T00:00:00 | **Max Temperature (°C)**: 29.032  \n- **Date**: 1990-11-01T00:00:00 | **Max Temperature (°C)**: 27.7978  \n- **Date**: 1990-12-01T00:00:00 | **Max Temperature (°C)**: 26.3513  \n- **Date**: 2000-04-01T00:00:00 | **Max Temperature (°C)**: 26.5754  \n- **Date**: 2000-05-01T00:00:00 | **Max Temperature (°C)**: 28.5216  \n- **Date**: 2000-06-01T00:00:00 | **Max Temperature (°C)**: 29.5135  \n- **Date**: 2000-07-01T00:00:00 | **Max Temperature (°C)**: 29.3669  \n- **Date**: 2000-08-01T00:00:00 | **Max Temperature (°C)**: 29.8944  \n\n### Summary:\n- **Mean Temperature**: 27.830229752066117°C  \n- **Minimum Temperature**: 25.0281°C  \n- **Maximum Temperature**: 30.3468°C  \n- **Location**: [21.3069,

In [21]:
# Open-ended scenario: no coordinates are supplied, so the model has to choose them itself.
# NOTE(review): the recorded answer below is in °F although the temperature tool labels
# its values in °C — verify that the answer actually came from tool data.
r = await prompt_process_agent.run("What was the max temperature yesterday at random lat/long coordinates on Kauai Island, Hawai'i?")

In [22]:
r.data

"The maximum temperature yesterday at the specified coordinates on Kauai Island, Hawai'i was 75°F."

In [19]:
# Relative-date scenario ("today"): the tools default the end date to yesterday when
# none is passed, so the model has to supply today's date itself.
r = await prompt_process_agent.run("What was the max temperature today at 19.5, -155.5?")

In [20]:
r.data

'The maximum temperature at 19.5, -155.5 on April 4, 2025, was 13.7°C.'