In [None]:
import LLMUtils
import Datasets

import time
import datetime
from tqdm import tqdm

import torch
import gc

import os
import json

from google import genai
from google.genai import types

In [None]:
# Credentials are read from the environment so that secrets never have to be
# written into (and committed with) the notebook. An unset variable falls back
# to the empty string, which is the value this cell used before.
HUGGINGFACE_TOKEN = os.environ.get("HF_TOKEN", "")  # Needed for restricted LLMs with EULAs (Llama...)
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY", "")  # Needed to interact with Google's LLM API

# Run the tests
Note that, depending on the model, this test may run for up to 12 hours. Progress and the estimated remaining time are logged to `progress.log`.

## Local models

In [None]:
# Shared dataset instance: the cells below iterate over its `test_samples`
# and build their prompts through its `get_prompts` method.
dataset = Datasets.WebNLGDataset()

In [None]:
# Prompts the LLM and saves all results to a file
def get_and_save_results(n_samples, LLM, dataset, avoid_explanations):
    """Run `LLM` over every test sample and write the extracted triples to disk.

    Progress is written to `progress.log` (overwritten on every call). After
    all samples have been processed, a report is written to
    `results_llm_testing/results_<model name>_webnlg_<n_samples>_samples.txt`
    with, for each sample, its text, its ground-truth triples and the triples
    returned by the model.

    Args:
        n_samples: Forwarded to `dataset.get_prompts`; also used in the
            progress description and in the output file name.
        LLM: Object exposing `get_triples(system_prompt, user_prompt)` and
            `get_human_readable_model_name()`.
        dataset: Object exposing `test_samples` (a sized sequence of
            `(text, ground_truth_triples)` pairs) and `get_prompts(...)`.
        avoid_explanations: Forwarded unchanged to `dataset.get_prompts`.

    Returns:
        None. The results are only written to disk.
    """
    output_dir = 'results_llm_testing'
    # Create the output directory before doing any work: otherwise a missing
    # directory would only surface as a FileNotFoundError after every sample
    # has already been processed, discarding the whole run.
    os.makedirs(output_dir, exist_ok=True)

    test_samples = dataset.test_samples
    total = len(test_samples)

    generated_triples = []
    invalid_responses = 0

    # Both the log file and the progress bar are context-managed so the file
    # handle is closed even when a sample raises.
    with open("progress.log", "w") as progress_log, tqdm(
        enumerate(test_samples), file=progress_log, total=total
    ) as pbar:
        # The ground-truth triples are not needed while generating; they are
        # read again from `test_samples` when the report is written.
        for i, (text, _) in pbar:
            pbar.set_description(f"\r{n_samples} test: Obtained sample {i+1}\\{total} (invalid responses: {invalid_responses})")

            system_prompt, user_prompt = dataset.get_prompts(None, text, n_samples, avoid_explanations)
            sample_triples = LLM.get_triples(system_prompt, user_prompt)

            # All samples contain triples, so an empty result is counted as an
            # invalid response, whether it comes from a detected error or
            # from the model believing there are no relations to extract. It
            # is still stored, and is penalized later during evaluation.
            if len(sample_triples) == 0:
                invalid_responses += 1

            generated_triples.append(sample_triples)

    with open(f'{output_dir}/results_{LLM.get_human_readable_model_name()}_webnlg_{n_samples}_samples.txt', 'w') as file:
        for (text, ground_truth_triples), gen_triples in zip(test_samples, generated_triples):
            file.write(f'Sample: {text}\n')
            file.write(f'Ground truth: {ground_truth_triples}\n')
            file.write(f'Generated triples: {gen_triples}\n\n\n')

In [None]:
# Evaluate every local model listed in `LLMUtils.LLM.models_for_testing`.
for llm_name in LLMUtils.LLM.models_for_testing:
    # Drop the reference to the previous iteration's model *before* collecting:
    # while `LLM` still points at it, neither the garbage collector nor
    # `empty_cache()` can release anything it holds.
    # NOTE(review): assumes LLMUtils.LLM is what keeps the model weights alive - confirm.
    LLM = None
    # Collect first so the freed tensors go back to PyTorch's caching
    # allocator, then hand the now-unused cached blocks back to the GPU.
    gc.collect()
    torch.cuda.empty_cache()

    LLM = LLMUtils.LLM(llm_name, hf_token=HUGGINGFACE_TOKEN)

    avoid_explanations = True

    # One full pass over the test set per `n_samples` setting.
    for n_samples in (5, 8, 16):
        get_and_save_results(n_samples, LLM, dataset, avoid_explanations)

## Gemini

In [None]:
def get_and_save_results_gemini(n_samples, dataset, avoid_explanations):
    """Query `gemini-2.0-flash` for every test sample and write the extracted triples to disk.

    Responses are cached in `gemini_results_<n_samples>.json`, keyed by sample
    text and rewritten after every new response, so an interrupted run resumes
    where it stopped. Once every sample has a result, a report is written to
    `results_llm_testing/results_Gemini_2.0_Flash_webnlg_<n_samples>_samples.txt`.

    Requests are throttled to 15 per one-minute window. Any exception raised
    by the API call is printed and the request is retried after a 60-second
    pause, with no upper bound on the number of retries.

    Args:
        n_samples: Forwarded to `dataset.get_prompts`; also used in the
            progress description and in the cache/output file names.
        dataset: Object exposing `test_samples` (a sized sequence of
            `(text, ground_truth_triples)` pairs) and `get_prompts(...)`.
        avoid_explanations: Forwarded unchanged to `dataset.get_prompts`.

    Returns:
        None. The results are only written to disk.
    """
    client = genai.Client(api_key=GEMINI_API_KEY)

    cache_path = f"gemini_results_{n_samples}.json"
    output_dir = 'results_llm_testing'
    # Fail early (or succeed silently) on the output directory instead of
    # discovering it is missing only after every request has been made.
    os.makedirs(output_dir, exist_ok=True)

    if os.path.exists(cache_path):
        with open(cache_path) as f:
            gemini_results = json.load(f)
    else:
        gemini_results = dict()

    # Created on first use and then reused: it only parses response text, so
    # there is no need to rebuild it for every sample.
    response_parser = None

    requests_made = 0  # Counter for requests made in the current minute
    start_time = time.time()  # Track the start of the 1-minute window

    for i, (text, _) in (pbar := tqdm(enumerate(dataset.test_samples), total=len(dataset.test_samples))):
        pbar.set_description(f"\r{n_samples} test: Obtained sample {i+1}\\{len(dataset.test_samples)}")

        # Already answered in a previous (possibly interrupted) run.
        if text in gemini_results:
            continue

        # Check if we've hit the 15 RPM limit of the API
        if requests_made >= 15:
            elapsed_time = time.time() - start_time
            if elapsed_time < 60:
                time.sleep(60 - elapsed_time)  # Wait until the 1-minute window resets
            requests_made = 0
            start_time = time.time()

        system_prompt, user_prompt = dataset.get_prompts(None, text, n_samples, avoid_explanations)
        while True:
            try:
                raw_response = client.models.generate_content(
                    model="gemini-2.0-flash",
                    config=types.GenerateContentConfig(system_instruction=system_prompt),
                    contents=[user_prompt]
                )

                requests_made += 1

                break
            except Exception as e:
                print(e)
                requests_made = 0
                time.sleep(60) # Wait a full minute, it may be a hidden limit
                # A fresh window starts after the back-off. Keeping the stale
                # start time would make the next 15-request check see more
                # than 60 s elapsed and skip its wait.
                start_time = time.time()

        if response_parser is None:
            response_parser = LLMUtils.LLM(None)

        # `.text` can be None when the response carries no text part (for
        # example a blocked or empty candidate); parse it as an empty response.
        # NOTE(review): assumes the parser returns no triples for "" - confirm.
        response_text = raw_response.text or ""
        generated_triples = response_parser.get_triples_from_existing_response(response_text)

        gemini_results[text] = generated_triples
        # Write to a temporary file and swap it in, so an interruption in the
        # middle of the write cannot leave a truncated, unreadable cache.
        tmp_cache_path = f"{cache_path}.tmp"
        with open(tmp_cache_path, "w") as f:
            json.dump(gemini_results, f)
        os.replace(tmp_cache_path, cache_path)

    with open(f'{output_dir}/results_Gemini_2.0_Flash_webnlg_{n_samples}_samples.txt', 'w') as file:
        for text, ground_truth_triples in dataset.test_samples:
            file.write(f'Sample: {text}\n')
            file.write(f'Ground truth: {ground_truth_triples}\n')
            file.write(f'Generated triples: {gemini_results[text]}\n\n\n')

In [None]:
# Run the Gemini evaluation once for each `n_samples` setting, in the same
# order as the local models (5, 8, then 16).
for n_samples in (5, 8, 16):
    get_and_save_results_gemini(n_samples, dataset, True)