In [265]:
import os
from dotenv import load_dotenv
import pandas as pd
import pickle
import numpy as np
from S1_model import ZeroShot
from S2_model import rag_model 
from s3_model import agent_model 
from utils import LLMResponse
from sklearn.metrics import auc,roc_auc_score, accuracy_score, average_precision_score
import ast
import re
import math

import setup

from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.messages.ai import AIMessage
from langchain_core.rate_limiters import InMemoryRateLimiter

In [8]:
load_dotenv()

os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY")
os.environ["TAVILY_API_KEY"] = os.getenv("TAVILY_API_KEY")
os.environ["INATURALIST_API_KEY"] = os.getenv("INATURALIST_API_KEY")
os.environ["GOOGLE_API_KEY"] = os.getenv("GOOGLE_GEN_AI_API_KEY")

In [9]:
def gather_responses(model_system,system_prompt, queries, params, rate_limiter):

    print("Intialising model")
    match model_system:
        case 's1':
            model = ZeroShot(system_template=system_prompt,llm_choice = params['llm_choice'], model = params['model'],temperature=params['temperature'], rate_limiter = rate_limiter)
        case 's2':
            model =  rag_model(dossier_path='../data/retrieval_dossier/wikipedia-en-dwca-species-descriptions.csv', system_prompt= system_prompt,
                llm_choice = params['llm_choice'], model = params['model'], temperature=params['temperature'], persist_directory=params['persist_directory'], rate_limiter = rate_limiter)
        case 's3':
            model = agent_model(system_prompt = system_prompt,llm_choice = params['llm_choice'], model = params['model'], temperature=params['temperature'], rate_limiter = rate_limiter)

    responses = {}
    
    for i, question in enumerate(queries):
        
        print(question)
        
        responses[str(i)] = [model.invoke_response(question) for i in range(params['Nreplicates'])]
    
    
    out_file = open(params['pkl_out'])
    pickle.dump(responses, out_file)
    out_file.close()

    return responses


In [88]:
file_path = '../eval/species_point_presence_absence.csv'
sp_pa_df = pd.read_csv(file_path)


In [92]:
sp_pa_df['common']

0            Sulawesi Fruit Bat
1            Sulawesi Fruit Bat
2            Sulawesi Fruit Bat
3            Sulawesi Fruit Bat
4            Sulawesi Fruit Bat
                 ...           
1595    Paruline À Ailes Bleues
1596    Paruline À Ailes Bleues
1597    Paruline À Ailes Bleues
1598    Paruline À Ailes Bleues
1599    Paruline À Ailes Bleues
Name: common, Length: 1600, dtype: object

In [154]:

#System prompt for presence/absences
system_prompt = "You are a Foundational Nature AI capable of informing questions about biodiversity and conservation relevant for real-world decisions."
# List to hold all questions filled with data

filled_questions = []

# Fill the question template with data from each row
for binomial, group in sp_pa_df.groupby('binomial'):
    result_string = f"Can you tell me if the {group['common'].iloc[0]} ({binomial}) can be found at the following decimal degree latitude, longitude locations:\n"
    for index, row in group.iterrows():
        # Append each row's binomial, x, y, presence in the desired format
        result_string += f"{row['y']},{row['x']}\n"
    #filled_question = result_string + "Ensure that you only respond with a score between 0 and 1 (where 1 indicates that you think the species is very likely to be present there and 0 indicates the species is highly unlikely to be found there) then the justification separated by a comma."
    filled_question = result_string + "Please respond with a score for each location between 0 and 1, where 1 indicates that you think the species is very likely to be present there. Provide your answer as a python list in the following format [[coordinates, likelihood of species being present at location 1], ...]"
    filled_questions.append(filled_question)



In [136]:
(filled_questions[1])

'Can you tell me if the mosaic sea snake (Aipysurus mosaicus) can be found at the following decimal degree latitude, longitude locations:\n-13.42,-90.71\n-19.56,-78.4\n-31.01,-69.41\n-0.11,-59.71\n23.18,-52.74\n19.0,-48.16\n-8.82,-18.28\n17.92,45.09\n4.56,74.79\n-7.15,125.65\n-20.81,116.07\n-20.27,117.34\n-18.7,121.45\n-16.23,122.15\n-14.55,136.09\n-10.29,142.36\n-11.68,143.02\n-14.25,143.75\n-19.42,148.64\n-24.21,152.12\nPlease respond with a score for each location between 0 and 1, where 1 indicates that you think the species is very likely to be present there. Provide your answer as a python list in the following format [Likelihood of species being present at location 1, ...]'

In [229]:
####  Create model approaches ####
#Get the parameters
params = setup.get_evaluation_parameters()
#TBD: Create log files and pass into the models as they are initialised



In [None]:

rate_limiter = InMemoryRateLimiter(
    requests_per_second=0.2,  # <-- Super slow! We can only make a request once every 10 seconds!!
    check_every_n_seconds=0.1,  # Wake up every 100 ms to check whether allowed to make a request,
    max_bucket_size=10,  # Controls the maximum burst size.
)
model_system='s1'
params['pkl_out'] = f"../output/species_point_pres_abs_{params['llm_choice']}_{params['model']}_{model_system}_Q_responses.pkl"
s1_sp_p_a_responses = gather_responses(model_system=model_system,system_prompt=system_prompt,queries=filled_questions,params=params, rate_limiter = rate_limiter)


In [None]:
rate_limiter = InMemoryRateLimiter(
    requests_per_second=0.1,  # <-- Super slow! We can only make a request once every 10 seconds!!
    check_every_n_seconds=0.1,  # Wake up every 100 ms to check whether allowed to make a request,
    max_bucket_size=10,  # Controls the maximum burst size.
)
model_system='s2'
params['pkl_out'] = f"../output/species_point_pres_abs_{params['llm_choice']}_{params['model']}_{model_system}_Q_responses.pkl"
s2_sp_p_a_responses = gather_responses(model_system=model_system,system_prompt=system_prompt,queries=filled_questions,params=params, rate_limiter = rate_limiter)


In [None]:
rate_limiter = InMemoryRateLimiter(
    requests_per_second=0.005,  # <-- Super slow! We can only make a request once every 10 seconds!!
    check_every_n_seconds=0.1,  # Wake up every 100 ms to check whether allowed to make a request,
    max_bucket_size=10,  # Controls the maximum burst size.
)
s3_sp_p_a_responses = gather_responses(model_system='s3',system_prompt=system_prompt,queries=filled_questions,params=params, rate_limiter = rate_limiter)


In [196]:
# def extract_point_predictions(responses):
#     loc_responses = filter_response(responses).split('\n')

#     loc_preds = []

#     for r in loc_responses:
#         parts = r.strip().split(',', 1)
#         #Check that there are 2 parts and that it doesn't start with "Note:"
#         if len(parts) == 2:
#             numeric_value = float(parts[0].strip())  # Convert the numeric part to float
#             text_description = parts[1].strip()
#             loc_preds.append(numeric_value)
    
#     return loc_preds

def extract_python_list(content):
    """
    Extracts a Python list from the content string, handling different formats.
    """
    # Case 1: Content inside triple backticks (```python ...```)
    triple_backtick_match = re.search(r"```python\n(.*?)\n```", content, re.DOTALL)
    
    if triple_backtick_match:
        # Extract content between triple backticks
        python_code_str = triple_backtick_match.group(1)
        print("Extracted code block with backticks.")
    else:
        # Case 2 & 3: Strings without triple backticks
        # Remove surrounding quotes if present, and clean up potential issues
        python_code_str = content.strip("'").strip('"')
        print("Extracted list-like string without backticks.")
    
    # Step 2: Ensure brackets and commas are well-formed
    # Clean up any trailing or leading whitespace
    python_code_str = python_code_str.strip()

    # Optional: You can add more advanced cleaning here if needed, e.g., fixing malformed brackets
    
    # Step 3: Try to convert to a Python list using ast.literal_eval()
    try:
        python_list = ast.literal_eval(python_code_str)
        
        # Step 4: Extract the last value from each sublist
        preds = [sublist[-1] for sublist in python_list if isinstance(sublist, list)]
        return preds
    except (SyntaxError, ValueError) as e:
        print(f"Error converting the string to a Python list: {e}")
        return None


def extract_point_predictions(response):
    
    re.search(r"```python\n(.*?)\n```", response, re.DOTALL)
    points_preds = ast.literal_eval(group(1))
    
    df_pp = pd.DataFrame(points_preds, columns=['y','x','presence'])
    return df_pp


In [248]:
def calculate_metrics(y_true, y_pred, threshold):
    if y_pred == None:
        auc = np.NaN
        accuracy = np.NaN
    else:
        auc = roc_auc_score(y_true,y_pred)
        accuracy = accuracy_score(y_true, [1 if p >= threshold else 0 for p in y_pred])

    return auc, accuracy

In [249]:
s1_auc = {}
s1_accuracy = {}
s2_auc = {}
s2_accuracy = {}
random_auc = {}
random_accuracy = {}
always_zero_auc = {}
always_zero_accuracy = {}

response_counter = 0
for binomial, group in sp_pa_df.groupby('binomial'):
    print(response_counter)

    s1_preds = extract_python_list(filter_response(s1_sp_p_a_responses[str(response_counter)][0]))
    s1_auc[binomial],s1_accuracy[binomial] = calculate_metrics(y_true = group['presence'],y_pred = s1_preds, threshold= 0.5)

    s2_preds = extract_python_list(filter_response(s2_sp_p_a_responses[str(response_counter)][0]))
    s2_auc[binomial],s2_accuracy[binomial] = calculate_metrics(y_true = group['presence'],y_pred = s2_preds, threshold= 0.5)

    random_preds = np.random.random(len(s1_preds))
    random_auc[binomial] = roc_auc_score(group['presence'],random_preds)
    random_accuracy[binomial] = accuracy_score(group['presence'],[1 if p >= 0.5 else 0 for p in random_preds])

    
    always_zero_auc[binomial] = roc_auc_score(group['presence'],np.zeros(shape = len(s1_preds)))
    always_zero_accuracy[binomial] = accuracy_score(group['presence'],[1 if p >= 0.5 else 0 for p in np.zeros(shape = len(s1_preds))])


    response_counter += 1


0
Extracted list-like string without backticks.
Extracted list-like string without backticks.
1
Extracted code block with backticks.
Extracted list-like string without backticks.
Error converting the string to a Python list: invalid syntax (<unknown>, line 1)
2
Extracted list-like string without backticks.
Extracted list-like string without backticks.
Error converting the string to a Python list: invalid syntax (<unknown>, line 1)
3
Extracted list-like string without backticks.
Extracted list-like string without backticks.
Error converting the string to a Python list: unterminated string literal (detected at line 1) (<unknown>, line 1)
4
Extracted list-like string without backticks.
Extracted list-like string without backticks.
Error converting the string to a Python list: invalid syntax (<unknown>, line 1)
5
Extracted list-like string without backticks.
Extracted list-like string without backticks.
Error converting the string to a Python list: invalid syntax (<unknown>, line 1)
6
Extr

In [284]:
df_metrics = pd.DataFrame([s1_auc,s1_accuracy,s2_auc,s2_accuracy,random_auc,
              random_accuracy,always_zero_auc,always_zero_accuracy]).transpose()
df_metrics.columns = ['s1_auc','s1_accuracy','s2_auc','s2_accuracy',
                    'random_auc','random_accuracy','always_zero_auc','always_zero_accuracy']

In [286]:
df_metrics.to_csv(f'../output/accuracy_precision_metrics_{params['llm_choice']}_{params['model']}.csv')

In [269]:
def mean_auc_accuracy(auc, accuracy, label):
    auc_mean = np.nanmean(list(auc.values()))
    print(f'{label} AUC = {auc_mean}' )
    accuracy_mean = np.nanmean(list(accuracy.values()))
    print(f'{label} accuracy = {accuracy_mean}' )
    nan_count = sum(math.isnan(x) for x in list(auc.values()))
    print(f'Invalid responses = {nan_count}')
    return auc_mean, accuracy_mean

In [271]:
mean_auc_accuracy(s1_auc,s1_accuracy,'s1')
mean_auc_accuracy(s2_auc,s2_accuracy,'s2')
mean_auc_accuracy(random_auc,random_accuracy,'random')
mean_auc_accuracy(always_zero_auc,always_zero_accuracy,'zeros')



s1 AUC = 0.7345625
s1 accuracy = 0.729375
Invalid responses = 0
s2 AUC = 0.7301315789473685
s2 accuracy = 0.7263157894736842
Invalid responses = 42
random AUC = 0.48425
random accuracy = 0.48250000000000004
Invalid responses = 0
zeros AUC = 0.5
zeros accuracy = 0.5
Invalid responses = 0


(0.5, 0.5)

In [220]:
mean_auc_accuracy(s1_auc,s1_accuracy,'s1')

auc_mean = np.mean(list(auc.values()))
print(f's1 AUC = {auc_mean}' )
accuracy_mean = np.mean(list(accuracy.values()))
print(f's1 accuracy = {accuracy_mean}' )


rand_auc_mean = np.mean(list(random_auc.values()))
print(f'rand AUC = {rand_auc_mean}' )
rand_accuracy_mean = np.mean(list(random_accuracy.values()))
print(f'rand accuracy = {rand_accuracy_mean}' )

always_zero_auc_mean = np.mean(list(always_zero_auc.values()))
print(f'Zeros AUC = {always_zero_auc_mean}' )
always_zero_accuracy_mean = np.mean(list(always_zero_accuracy.values()))
print(f'Zeros accuracy = {always_zero_accuracy_mean}' )


s1 AUC = 0.7345625
s1 accuracy = 0.729375
rand AUC = 0.5237499999999999
rand accuracy = 0.5075000000000001
Zeros AUC = 0.5
Zeros accuracy = 0.5


In [163]:
def filter_response(response):
    if isinstance(response,AIMessage):
        filtered_response = response.content
    else:
        if {'answer'} <= response.keys():
            filtered_response = response['answer']
        elif {'output'} <= response.keys():
            filtered_response = response['output']
        
    return filtered_response

In [None]:
filter_response(response)

In [None]:
filter_response(response_s3)

In [None]:
with open(f'../output/Q0_s1.txt', 'r') as f:
    print(pd.read_table(f, skiprows=0))
    # for line in f:
    #     line.to_dict
    for index, row in pd.read_table(f, skiprows=0).iterrows():
        response = row
        print(response)

In [None]:
import pickle

params['pkl_out'] = f"output/{params['llm_choice']}_All_Model_Q_responses.pkl"

In [None]:
in_file = open('../'+params['pkl_out'], 'rb')
responses = pickle.load(in_file)
in_file.close()

In [None]:
def quantitative_species_presence_metric(responses,i):
    df = read_and_process_species_responses(responses)
    diffs = df['Value'] - data['Value'][i]
    mean_diff = np.mean(diffs)
    return df['Value'], diffs, mean_diff



def read_and_process_species_responses(responses):
    """
    Reads a text file and splits each line into a numeric value and a text description.
    Assumes each line starts with a numeric value followed by a comma, then the text.

    Parameters:
    responses (dict): dictionary of .
    
    Returns:
    DataFrame: A pandas DataFrame with two columns: 'Numeric' and 'Text'.
    """
    data = []  # List to store the split data

    # Open the file and process each line
    #with open(file_path, 'r', encoding='utf-8') as file:
    for r in responses:
        # Strip whitespace and split the line at the first comma
        parts = r.strip().split(',', 1)
        #Check that there are 2 parts and that it doesn't start with "Note:"
        if len(parts) == 2 and not parts[0].startswith("Note:"):
            numeric_value = float(parts[0].strip())  # Convert the numeric part to float
            text_description = parts[1].strip()
            data.append([numeric_value, text_description])

    # Convert the list of data into a DataFrame
    df = pd.DataFrame(data, columns=['Value', 'Justification'])
    return df


In [None]:
def read_query_file_and_construct_questions(file_path):
    """
    Reads a custom-formatted file where the first line contains a question with placeholders,
    and the subsequent lines contain data in CSV format. This function replaces the placeholders
    in the question with data from each row.

    Parameters:
    file_path (str): The path to the file to be read.
    
    Returns:
    list: A list of questions with data inserted from each row.
    """
    with open(file_path, 'r') as file:
        # Extract the first line and get the question template
        first_line = file.readline().strip()
        if first_line.startswith('#System_prompt: '):
            system_prompt = first_line[len('"#System_prompt:'):].strip()
        else:
            raise ValueError("The file does not start with a proper system prompt header.")
        
        second_line = file.readline().strip()
        if second_line.startswith('#Prompt: '):
            question_template = second_line[len('#Prompt: '):].strip()
        else:
            raise ValueError("The file does not start with a proper question header.")

        # Read the CSV data that follows
        data = pd.read_csv(file, skiprows=0)  # We already read the first and second lines, so the cursor is at the second line

    # print(repr(question_template))
    # q_data_columns = re.findall(r'\{([^}]+)\}',question_template)
    # print(q_data_columns)


    # List to hold all questions filled with data
    filled_questions = []

    # Fill the question template with data from each row
    for index, row in data.iterrows():
        filled_question = question_template.format(**row.to_dict())
        filled_questions.append(filled_question)
    
    return system_prompt,filled_questions, data

def filter_response(response):
    if {'answer'} <= response.keys():
        filtered_response = response['answer']
    elif {'output'} <= response.keys():
        filtered_response = response['output']

    return filtered_response

In [None]:
#Species locations
file_path = '../eval/species_geospatial_queries_responses - species_presences.csv'  # Replace this with the path to your file
try:
    system_prompt, queries, data = read_query_file_and_construct_questions(file_path)
    print(system_prompt)

except Exception as e:
    print("Error reading queries:", e)

In [None]:
params['pkl_out'] = f"../output/{params['llm_choice']}_All_Model_Q_responses.pkl"
models = ['s1','s2','s3']
diffs = {}
vals = {}
mean_diff = {}
filtered_responses = {}
responses = {}

if not responses:
    print(f'Loading responses from: {params['pkl_out']}')
    in_file = open(params['pkl_out'], 'rb')
    responses = pickle.load(in_file)
    in_file.close()


In [None]:
responses['0']['s2']

In [None]:
i = 2
model= 's1'
[filter_response(r) for r in responses[str(i)][model]]

In [None]:


for i, question in enumerate(queries):
    
    diffs[str(i)] = {}
    mean_diff[str(i)] = {}
    vals[str(i)] = {}
    for model in models:
        filtered_responses = [filter_response(r) for r in responses[str(i)][model]]
 
        vals[str(i)]['vals_'+model],diffs[str(i)]['diffs_'+model], mean_diff[str(i)][model] = quantitative_species_presence_metric(filtered_responses,i)
        


In [None]:
k = '5'
pdf_details = pd.concat([pd.DataFrame(vals[k]),pd.DataFrame(diffs[k])], axis = 1)

np_diffs = np.array(pd.DataFrame(diffs[k]))
print(np_diffs)
rmse = np.sqrt(np.mean(np.square(np_diffs),axis = 0))
print(f"rmse = {rmse}. mean = {np.mean(np_diffs,axis = 0)}")

In [None]:
np_diffs