# AI tests: indicator suggestions. 
This notebook displays initial tests for an AI indicator suggestion system. The goal is to evaluate how well an AI model can suggest relevant indicators based on given data or context.  
The process involves providing sample data and instructions to an AI agent, and analyzing the responses for accuracy and relevance.

## 1. Setup. 
Import necessary libraries and load environment variables.

In [1]:
import json
import os

import dotenv
from langchain import tools
from langchain.agents import create_agent
from langchain_openai import ChatOpenAI
from typing_extensions import TypedDict


In [2]:
# Load variables from a local .env file into the process environment
# (presumably including the OpenAI API key needed by the model calls below).
dotenv.load_dotenv()


True

## 2. Simple test.  
Run a small test with mock data, passing it directly to the LLM.

In [7]:
# Mock indicator catalogue for the direct-LLM test. Every record shares the same
# region and year; only the identifier and its description vary, so the records
# are generated from (indicator_id, description) pairs.
test_data = [
    {
        "indicator_id": indicator_id,
        "region": "Amazonas",
        "year": 2022,
        "description": description,
    }
    for indicator_id, description in (
        ("deforestation_rate", "Rate of deforestation in square kilometers"),
        ("biodiversity_index", "Biodiversity index score indicating species richness and rarity"),
        ("carbon_emissions", "Total carbon emissions from deforestation in metric tons"),
        ("forest_cover", "Total forest cover in square kilometers"),
        ("carbon_sequestration", "Total carbon sequestration from forested areas in metric tons"),
        ("fire_severity", "Average fire severity index for wildfires"),
        ("burned_area_1990_2020", "Total area burned from 1990 to 2020 in square kilometers"),
        ("protected_species_count", "Number of protected species in the region"),
    )
]

# Pretty-printed JSON version of the catalogue, embedded in the system prompt.
test_data_json = json.dumps(test_data, indent=2)

In [8]:
# System prompt for the direct-LLM test: embeds the mock indicator list (as JSON)
# and instructs the model to answer with a list of indicator_ids only.
system_template = f"""
You are an intelligent assistant that recomends indicators from the provided dataset list
 based on user's interest stated on the prompt.

DATA:
{test_data_json}

Be accurate and only use the information from the dataset above.
Provide as response just a list of indicator_ids.
"""



Set up a simple model and invoke it with a prompt to check basic functionality.

In [None]:
# Chat model for the direct test: gpt-4o-mini with temperature 0 and a 7000-token cap on replies.
model = ChatOpenAI(model="gpt-4o-mini", temperature=0, max_tokens=7000)


In [7]:
# Send the system instructions plus a single user request, then show the reply text.
prompt = "I want to monitor the effect of tree cover loss in the Amazonas region."
response = model.invoke(
    [
        ("system", system_template),
        ("user", prompt),
    ]
)
print(f"Recommended Indicators: {response.content}")

Recommended Indicators: ["deforestation_rate", "carbon_emissions", "forest_cover", "carbon_sequestration", "burned_area_1990_2020"]


## 3. Agent test with real data.
Make a test using an AI agent storing the instructions, to see if it improves results.  
Get the real list of indicators by parsing the indicators.json file to simplify it.  
Improve by adding capabilities later.

### 3.1 Load and prepare real indicator data


In [3]:
# Load the full indicators data.
indicators_path = '../../client/datum/indicators.json'
# A context manager closes the file handle as soon as parsing finishes, and the
# explicit UTF-8 encoding keeps non-ASCII text intact regardless of the platform default.
with open(indicators_path, 'r', encoding='utf-8') as indicators_file:
    indicator_data = json.load(indicators_file)


In [4]:
# Reduce every raw record to its English name and short description.
indicator_list_clean = [
    {
        'name': entry.get('name_en', ''),
        'description': entry.get('description_short_en', ''),
    }
    for entry in indicator_data
]
print(f'Total indicators: {len(indicator_list_clean)}')

# Drop every entry whose name contains any of these terms (substring match, not equality).
exclude_indicators =['Total area', 'Amazonia', 'Countries', 'States', 'Municipalities', 'Administrative Capitals']
indicator_list_clean = [
    entry
    for entry in indicator_list_clean
    if all(term not in entry['name'] for term in exclude_indicators)
]
print(f'Total indicators after exclusion: {len(indicator_list_clean)}')

# JSON text of the filtered list, embedded later in the agent's system prompt.
indicator_list_json = json.dumps(indicator_list_clean, indent=2)


Total indicators: 158
Total indicators after exclusion: 151


### 3.2 Set utilities to process outputs. 
- To export a list of indicator ids from the AI response.  
- To get a dictionary with token usage.

In [17]:
# Structured-output schema holding the selected indicators as a flat list of strings.
# NOTE(review): the docstring is left untouched because it may be forwarded to the
# model as the schema description when the class is used as a response format — confirm.
class IndicatorSelection(TypedDict):
    """Represents the selection of indicators as a list."""
    # The recorded runs in this notebook fill this field with indicator names
    # (e.g. 'Protected Areas'), not separate identifiers.
    indicator_ids: list[str]

In [21]:
# Schema intended for the agent's response_format parameter, to request structured output.
# This class describes ONE recommended indicator; the system prompt asks for a list of such dictionaries.
class IndicatorInfo(TypedDict):
    """Represents detailed information about recommended indicators."""
    name: str  # name of the recommended indicator
    explanation: str  # brief reason why the indicator is of interest
    visualization: list[str]  # e.g., ['🌎 Map', '📊 Chart', '🔢 Number']

In [7]:
def get_token_info(result):
    """Extract token usage information from the agent result.

    The usage figures are read from the ``response_metadata`` of the second
    message in ``result['messages']`` (the first model reply after the user
    message). Reading the metadata mapping directly, instead of parsing the
    message's string representation, keeps this working when the reply
    content itself contains braces or commas (e.g. a JSON answer).

    Args:
        result: Mapping returned by the agent's ``invoke`` call. It must hold
            a ``'messages'`` sequence whose item at index 1 exposes
            ``response_metadata['token_usage']``.

    Returns:
        dict: ``completion_tokens``, ``prompt_tokens`` and ``total_tokens``
        (those that are present), with values rendered as strings, matching
        the format this helper has always returned.

    Raises:
        IndexError: If ``result['messages']`` has fewer than two messages.
        KeyError: If the message metadata carries no ``'token_usage'`` entry.
    """
    first_reply = result['messages'][1]
    usage = first_reply.response_metadata['token_usage']
    wanted_keys = ('completion_tokens', 'prompt_tokens', 'total_tokens')
    # Values are stringified to stay backward-compatible with earlier output.
    return {key: str(usage[key]) for key in wanted_keys if key in usage}

### 3.3 Set up agent with data and instructions.

In [6]:
# System prompt for the agent. It embeds the cleaned indicator list (JSON with the
# keys 'name' and 'description') and describes the expected reply: a list of
# dictionaries with 'name', 'explanation' and 'visualization'.
# The wording refers to the keys that actually exist in DATA and describes the
# reply as the *output* format, so the instructions match the data the model sees.
system_template_full = f"""
You are an intelligent assistant that recommends indicators from the provided dataset list
    based on user's input. The suggestions must be relevant in the context of environmental, social and economic monitoring in the Amazonia region.
    The output must be in the form of a list of dictionary objects with the following 3 fields:
    -name: the name of the indicator
    -explanation: brief explanation of why the indicator is of interest
    -visualization: an array with the suggested type or types of visualization for the data, selecting one or many between '🌎 Map', '📊 Chart' or '🔢 Number'
    If the prompt is not related to the data or to environmental and social monitoring in the Amazonia region,
    respond with an empty list.
    Take into account all the available indicators, basing the selection on the fields: 'name', 'description'.

    If the user's input is a sentence or text, you must return a list of recommended indicators that match the user's interest.
    If the user's prompt is a list of indicators, return a list of indicators from
    the dataset that are different from those provided, and related to the context of those included in the list.
    Recommend at least 3 indicators and at most 15 to match the user's interest when the input is a text.
    If the input is a list of indicators, recommend at least 3 and at most 8 different but related indicators.


    DATA:
    {indicator_list_json}

    Be accurate and only use the information from the dataset above.
    If the prompt is not related to the data, respond with an empty list.
"""

In [7]:
# Chat model used by the agent (gpt-4o-mini, temperature 0). This rebinds `model`.
model = ChatOpenAI(model="gpt-4o-mini", temperature=0)

# Agent with no tools: it only carries the full system prompt with the indicator list.
agent_full = create_agent(model,
                     tools=[],
                     system_prompt=system_template_full,
                     # Structured output is disabled here, so the reply is read from the
                     # text content of the last message (parsed as JSON further below).
                     #response_format=IndicatorInfo
                     )

### Full example

In [8]:
# Ask interactively for a monitoring interest and run the agent on it.
prompt = input("Enter your monitoring interest: ")
full_result = agent_full.invoke({'messages': [{"role": "user", "content": prompt}]})
print(prompt)
# Disabled: the lines below read `structured_response`, which presumably requires an
# agent created with a response_format (agent_full has none) — confirm before enabling.
#print(full_result['structured_response']['indicator_ids'])
#print(get_token_info(full_result))

            id = uuid7()
Future versions will require UUID v7.
  input_data = validator(cls_, input_data)


deforestation and biodiversity with fires risk


In [20]:
# Dump the raw agent result (the message objects with their metadata) for inspection.
print(full_result)

{'messages': [HumanMessage(content='areas of biodiversity importance with deforestation and extractive activities or agriculture', additional_kwargs={}, response_metadata={}, id='6875a8f6-040f-49f5-95be-896e121aa73b'), AIMessage(content='', additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 63, 'prompt_tokens': 8719, 'total_tokens': 8782, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 8064}}, 'model_provider': 'openai', 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_50906f2aac', 'id': 'chatcmpl-CgC4f3RxjtlRB1RYb3Wr6eb41IHGX', 'service_tier': 'default', 'finish_reason': 'tool_calls', 'logprobs': None}, id='lc_run--32cc56b5-6803-40ea-bb11-b6f0c95b05a7-0', tool_calls=[{'name': 'IndicatorSelection', 'args': {'indicator_ids': ['Protected Areas', 'Indigenous Territories', 'Forest 

In [10]:
# Show the text of the agent's final message under a heading line.
print("AI Response:", full_result['messages'][-1].content, sep="\n")

AI Response:
[
    {
        "name": "Forest Cover Loss",
        "explanation": "This indicator tracks the loss of forest cover in the Amazon from 2000 to 2024, providing insights into deforestation trends and their impact on biodiversity.",
        "visualization": ["ðŸŒŽ Map", "ðŸ“Š Chart", "ðŸ”¢ Number"]
    },
    {
        "name": "Fire Frequency",
        "explanation": "This indicator tracks the frequency and distribution of fires across the region, both natural and human-caused, which can significantly impact biodiversity and forest health.",
        "visualization": ["ðŸŒŽ Map", "ðŸ“Š Chart"]
    },
    {
        "name": "Species Richness",
        "explanation": "Quantifies the number of documented species across taxonomic groups in different areas of Amazonia, providing a baseline measure of biodiversity distribution and concentration, which is affected by deforestation and fire events.",
        "visualization": ["ðŸŒŽ Map", "ðŸ“Š Chart"]
    },
    {
        "name": "Key 

In [15]:
# Parse the agent's final message, which is expected to be a JSON array of
# indicator dictionaries, into a list of Python dictionaries.
response_text = full_result['messages'][-1].content
list_of_dicts = json.loads(response_text)

# The system prompt allows an empty list for unrelated prompts, so guard the preview
# instead of indexing unconditionally.
print(list_of_dicts[0] if list_of_dicts else "No indicators returned.")

{'name': 'Forest Cover Loss', 'explanation': 'This indicator tracks the loss of forest cover in the Amazon from 2000 to 2024, providing insights into deforestation trends and their impact on biodiversity.', 'visualization': ['ðŸŒŽ Map', 'ðŸ“Š Chart', 'ðŸ”¢ Number']}


In [14]:
# This cell reads `structured_response`, which the agent result only carries when the
# agent is created with a response_format. `agent_full` is built without one, so a
# dedicated agent returning an IndicatorSelection is created here; this keeps the
# cell working on a fresh kernel.
agent_selection = create_agent(
    model,
    tools=[],
    system_prompt=system_template_full,
    response_format=IndicatorSelection,
)

prompt = input("Enter your monitoring interest: ")
full_result = agent_selection.invoke({'messages': [{"role": "user", "content": prompt}]})
print(prompt)
print(full_result['structured_response']['indicator_ids'])
print(get_token_info(full_result))

['Protected Areas', 'Indigenous Territories', 'Forest Cover Loss', 'Forest Cover', 'Land cover', 'Species Richness', 'Key Biodiversity Areas']
['Main watersheds in the Amazon', 'Carbon Storage', 'Fire Frequency', 'Ecosystem Types', 'Biome Types', 'Drought Risk', 'Riverine Flood Risk', 'Climate Zones', 'Yearly Rainfall', 'Mean Biomass']
{'completion_tokens': '73', 'prompt_tokens': '8831', 'total_tokens': '8904'}


In [None]:
# Preview only the first few cleaned indicators; displaying the whole list floods the output.
indicator_list_clean[:5]

[{'name': 'Altitude range',
  'description': 'Layers of polygons representing altitude ranges for the entire Amazon'},
 {'name': 'Slope',
  'description': 'Raster layer that classifies terrain slope into 7 categories from flat terrain to very steep rocky walls.'},
 {'name': 'Main watersheds in the Amazon',
  'description': 'This polygon layer represents the 38 main watersheds and their sub-watersheds in the Amazon region. It provides a reference for the hydrographic systems present and the general direction of runoff. This information is crucial as water resource management is a priority for supplying populations, supporting agricultural and industrial activities, and enabling fluvial communication.'},
 {'name': 'Principals Amazon Rivers',
  'description': 'Named rivers and streams in the Amazon obtained from OpenStreetMap, showing the most known hydrographic network identified on OSM in the region.'},
 {'name': 'Coastal Influence Zones',
  'description': ' Classification of areas base

In [20]:
# Alphabetically sorted indicator names, displayed as the cell output.
l = sorted(entry['name'] for entry in indicator_list_clean)
l

['1975 population',
 '2000 population',
 '2015 population',
 '2025 population',
 '2030 population',
 'Altitude range',
 'Arabic Coffee Production ',
 'Arabic Coffee Production ',
 'Average altitude per cell',
 'Average slope per cell',
 'Bare / sparse vegetation',
 'Barriers and Dams',
 'Barriers and Dams',
 'Bioeconomy Centers',
 'Biome Types',
 'Border Infrastructure and Control Facilities',
 'Built Environments',
 'Built Environments',
 'Canopy height',
 'Canopy height',
 'Carbon Storage',
 'Cassava Production',
 'Cassava Production',
 'Children Under Five Distribution',
 'Children Under Five Distribution',
 'Climate Zones',
 'Close to a river',
 'Coastal Influence Zones',
 'Coastal influence',
 'Cocoa Production ',
 'Cocoa Production ',
 'Conflict Locations',
 'Conservation Priority',
 'Corn Production ',
 'Corn Production ',
 'Croplands',
 'Croplands',
 'Deprivation Levels',
 'Deprivation Levels',
 'Distance Military Inf.',
 'Distance to Border Inf. and Control Facilities',
 'Dist

In [15]:
# Membership test for 'Water Bodies' against `l`.
# NOTE(review): `l` is assumed to be the sorted name list from the previous cell
# (exact-match test); if `l` were a string this would be a substring test — confirm.
'Water Bodies' in l

False