# Load 100 question dataset.

In [1]:
import json
import torch

# The dataset is JSON; decode it explicitly as UTF-8 so the result does not depend
# on the platform's default locale encoding.
with open('/kaggle/input/100-geo-questions-uri/100_Sub_Dataset_URI.json', 'r', encoding='utf-8') as file:
    original_dataset = json.load(file)

# NER Pipeline

* Load model

In [None]:
import os
from huggingface_hub import login

# Read the Hugging Face access token from the HF_TOKEN environment variable instead
# of hardcoding it in the notebook. When the variable is unset, `token` is None and
# `login` is expected to fall back to its interactive prompt (see huggingface_hub docs).
login(token=os.environ.get('HF_TOKEN'))

* Mistral it

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoModel

# Both the tokenizer and the weights come from the same instruction-tuned checkpoint.
MISTRAL_CHECKPOINT = "mistralai/Mistral-7B-Instruct-v0.2"

tokenizer = AutoTokenizer.from_pretrained(MISTRAL_CHECKPOINT)
# Weights are loaded in half precision.
model = AutoModelForCausalLM.from_pretrained(
    MISTRAL_CHECKPOINT,
    torch_dtype=torch.float16,
)

* chat template inference function

In [5]:
import torch

def run_chat_inference(model, tokenizer, system_role, question, max_tokens=15):
    """
    Runs a single system + user chat turn through the model and returns the reply text.

    :param model: causal language model exposing `.to(device)` and `.generate(...)`
    :param tokenizer: tokenizer exposing `apply_chat_template` and `batch_decode`
    :param system_role: text of the system message
    :param question: text of the user message
    :param max_tokens: maximum number of new tokens to generate (default 15)
    :return: decoded output with the system message and the question removed when they
             appear verbatim in it. Chat-template markers such as "[/INST]" are NOT
             removed here; callers strip them.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    messages = [
        {"role": "system", "content": system_role},
        {"role": "user", "content": question}
    ]

    model_inputs = tokenizer.apply_chat_template(messages, return_tensors="pt").to(device)

    generated_ids = model.generate(
        model_inputs,
        # Honour the caller's budget; this used to be hard-coded to 15, silently
        # ignoring the `max_tokens` argument.
        max_new_tokens=max_tokens,
        do_sample=True,
    )

    # The decoded sequence contains the prompt followed by the generated reply.
    generated_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

    # Remove the system message
    if system_role in generated_text:
        generated_text = generated_text.split(system_role)[-1].strip()

    # Remove the user message from the output to get only the assistant's response
    if question in generated_text:
        generated_text = generated_text.split(question)[-1].strip()

    # Release cached GPU blocks. The model itself stays alive: the caller still holds
    # a reference to it, so deleting the local name here would free nothing.
    torch.cuda.empty_cache()

    return generated_text

* NER cleanup

In [6]:
import re

def ner_cleanup(results):
    """
    Extracts the comma-separated toponym list from a raw NER model reply.

    The NER prompt asks the model to terminate its answer with a semicolon, so only
    the text before the first ';' is kept.

    :param results: raw text returned by the model (may contain "[/INST]")
    :return: the toponym list as a string, or '' when the reply has no ';' or is the
             "No toponyms found" sentinel the prompt asks for
    """
    if ';' not in results:
        return ''

    cleaned = results.replace("[/INST]", "").strip()
    # Keep everything before the first semicolon.
    entities = cleaned.split(';', 1)[0].strip()

    # The prompt tells the model to answer "No toponyms found;" when there is nothing
    # to extract; that sentinel must not be treated as a place name.
    if entities.lower() == "no toponyms found":
        return ''

    return entities

* Function to retrieve URIs of a certain toponym.

In [7]:
import requests
import pandas as pd
from io import StringIO

def graphdb_send_request(entities, endpoint_url="end_url", accept_format='application/sparql-results+json', timeout=60):
    """
    Searches the knowledge graph for resources whose name contains the given text.

    Builds a SPARQL query over the yago2geo `has*_Name` properties, keeps names that
    contain `entities` (case-insensitive), counts the triples each match takes part in
    and orders the matches by that count, descending.

    :param entities: text to look for inside resource names
    :param endpoint_url: URL of the GraphDB SPARQL endpoint
    :param accept_format: Desired response format (default is JSON)
    :param timeout: seconds to wait for the endpoint before giving up
    :return: CSV text (header row first) for the JSON format, the raw response body
             for other formats, or None when the request fails
    """
    import os

    # Escape the characters that would terminate or corrupt the SPARQL string literal,
    # so a name containing a quote or backslash cannot break (or alter) the query.
    escaped_entities = (
        str(entities)
        .replace('\\', '\\\\')
        .replace('"', '\\"')
        .replace('\n', '\\n')
        .replace('\r', '\\r')
    )

    query = f"""SELECT ?s ?name (COUNT(?related) AS ?count) WHERE {{
  {{
    SELECT ?s ?name WHERE {{
      {{
        ?s <http://kr.di.uoa.gr/yago2geo/ontology/hasOSI_Name> ?name .
      }} UNION {{
        ?s <http://kr.di.uoa.gr/yago2geo/ontology/hasOSM_Name> ?name .
      }} UNION {{
        ?s <http://kr.di.uoa.gr/yago2geo/ontology/hasGADM_Name> ?name .
      }} UNION {{
        ?s <http://kr.di.uoa.gr/yago2geo/ontology/hasOS_Name> ?name .
      }} UNION {{
        ?s <http://kr.di.uoa.gr/yago2geo/ontology/hasGAG_Name> ?name .
      }} UNION {{
        ?s <http://kr.di.uoa.gr/yago2geo/ontology/hasOSNI_Name> ?name .
      }}
      FILTER(CONTAINS(LCASE(?name), LCASE("{escaped_entities}")))
    }}
  }}
  {{
    {{ ?s ?p ?related }} UNION {{ ?related ?p ?s }}
  }}
}}
GROUP BY ?s ?name
ORDER BY DESC(?count)"""

    headers = {
        'Accept': accept_format,
        'Content-Type': 'application/x-www-form-urlencoded'
    }

    data = {
        'query': query
    }

    # Credentials can be supplied through the environment; the previous placeholder
    # values remain the fallback so existing setups behave as before.
    auth = requests.auth.HTTPBasicAuth(
        os.environ.get('GRAPHDB_USER', 'username'),
        os.environ.get('GRAPHDB_PASSWORD', 'password'),
    )

    try:
        response = requests.post(endpoint_url, headers=headers, data=data, auth=auth, timeout=timeout)

        if response.status_code != 200:
            # Raises HTTPError for 4xx/5xx; any other non-200 status yields no result.
            response.raise_for_status()
            return None

        if accept_format == 'application/sparql-results+json':
            return convert_json_to_csv(response.json())
        return response.text
    except requests.exceptions.HTTPError:
        print("HTTP error (most likely invalid query)")
    except Exception as err:
        print(err)
        print("Endpoint error ENDPOINT DOWN")

    # Every failure path ends here.
    return None

def convert_json_to_csv(json_data):
    """
    Renders a SPARQL JSON result document as CSV text.

    ASK results (a top-level 'boolean' key) become a single 'value' column with one
    row. SELECT results use the variables in head.vars as columns, one row per
    binding, with '' for variables a binding does not carry.

    :param json_data: parsed SPARQL JSON response
    :return: CSV text, header row first
    """
    if 'boolean' in json_data:
        columns = ['value']
        records = [[json_data['boolean']]]
    else:
        columns = json_data['head']['vars']
        records = []
        for binding in json_data['results']['bindings']:
            record = {}
            for var in columns:
                # Unbound variables are simply absent from the binding.
                record[var] = binding.get(var, {}).get('value', '')
            records.append(record)

    buffer = StringIO()
    pd.DataFrame(records, columns=columns).to_csv(buffer, index=False)
    return buffer.getvalue()

* ner system role

In [8]:
# System prompt for the toponym-extraction (NER) step. It asks the model for a
# comma-separated list of place names terminated by a semicolon, or the sentinel
# "No toponyms found;" when there are none. `ner_cleanup` relies on that semicolon.
ner_system_role = """You are a specialized Named Entity Recognition (NER) system focused on identifying and extracting toponyms (place names) from the given text. Your task is to recognize and list all geographical entities, including but not limited to:

- Countries
- Cities
- States/Provinces
- Regions
- Mountains
- Rivers
- Oceans/Seas
- Lakes
- Islands
- Continents

For each input, provide a list of extracted toponyms, separated by commas. If no toponyms are found, respond with "No toponyms found;" After completing the analysis, end your response with a semicolon.

Examples:
Q: Where is Swansea located?
A: Swansea;

Q: Which Greek regions have between 500000 and 1000000 inhabitants?
A: Greece;

Q: Is Doolin to the south of Dublin?
A: Doolin, Dublin;

Q: What's the capital of France and how far is it from the Mediterranean Sea?
A: France, Mediterranean Sea;

Q: What is the biggest island in the world?
A: No toponyms found;

Now, analyze the following text. Remember to split the toponyms by commas so I can recognize them individualy."""

* prompt to select best uri candidate.

In [9]:
# NOTE(review): a second `select_uri_system_role` is defined in the next cell and
# replaces this one once that cell runs. This variant's example lists a
# human-readable name after each URI; the later variant lists URIs only.
def select_uri_system_role(question):
    """Build the system prompt that asks the model to pick one URI for `question`.

    :param question: the user question the candidate URIs relate to
    :return: the prompt text, ending right before the candidate list is expected
    """
    uri_select_system_role = f"""You perform disambiguation, this means selecting the most relevant URI for a toponym or entity. You will be supplied with a question and a list of URIs. You have to select one URI out of the provided. 
    The list might have multiple toponyms but the supplied URIs will be relevant to only one. Strictly respond with the URI id and no other information or explanation.
    
    For example:
    Q: Consider this question: Is New York south of Florida?
    Choose the best URI from the provided list: 
    1. yago:New_York, New York City
    2. yago:New_York_Municipality, Municipality of New York
    3. geof:York_Museum, Historical Museum of York
    
    A: 1
    
    Now consider this question: {question}
    And choose the best URI from the provided list:"""
    return uri_select_system_role

In [10]:
def select_uri_system_role(question):
    """Return the URI-disambiguation system prompt for `question`.

    The prompt shows one worked example whose candidates are bare URIs (no names)
    and ends right where the caller's candidate list is expected to follow.

    :param question: the user question the candidate URIs relate to
    :return: the prompt text
    """
    return f"""You perform disambiguation, this means selecting the most relevant URI for a toponym or entity. You will be supplied with a question and a list of URIs. You have to select one URI out of the provided. 
    The list might have multiple toponyms but the supplied URIs will be relevant to only one. Strictly respond with the URI id and no other information or explanation.
    
    For example:
    Q: Consider this question: Is New York south of Florida?
    Choose the best URI from the provided list: 
    1. yago:New_York
    2. yago:New_York_Municipality
    3. geof:York_Museum
    
    A: 1
    
    Now consider this question: {question}
    And choose the best URI from the provided list:"""

In [11]:
# Namespace IRI -> prefix. Full URIs are shortened to this prefixed form before being
# shown to the model, to help it read them and to reduce mistakes.
prefix_map = {
    "http://www.opengis.net/ont/geosparql#": "geo:",
    "http://www.opengis.net/def/function/geosparql/": "geof:",
    "http://www.w3.org/1999/02/22-rdf-syntax-ns#": "rdf:",
    "http://www.w3.org/2000/01/rdf-schema#": "rdfs:",
    "http://www.w3.org/2001/XMLSchema#": "xsd:",
    "http://yago-knowledge.org/resource/": "yago:",
    "http://kr.di.uoa.gr/yago2geo/resource/": "y2geor:",
    "http://kr.di.uoa.gr/yago2geo/ontology/": "y2geoo:",
    "http://strdf.di.uoa.gr/ontology#": "strdf:",
    "http://www.opengis.net/def/uom/OGC/1.0/": "uom:",
    "http://www.w3.org/2002/07/owl#": "owl:",
}

* Concept Identifier.

In [None]:
!pip install stanza
!pip install rapidfuzz

In [None]:
import stanza
import nltk
import csv
from nltk.util import ngrams
from rapidfuzz.distance import Levenshtein
from rapidfuzz import fuzz

# Download the NLTK 'punkt' resource.
nltk.download('punkt')

# Download the English Stanza models and build the Stanza pipeline (tokenisation,
# POS tagging, lemmatisation, dependency parsing). `nlps` is the pipeline that
# Concept_Identifier runs on each question.
stanza.download('en')
nlps = stanza.Pipeline('en', processors='tokenize,pos,lemma,depparse')

def read_file(filepath):
    """
    Loads a "label,URI" CSV file into a dictionary.

    :param filepath: path of the CSV file
    :return: dict mapping each lower-cased label (first column) to its URI (second
             column); rows with fewer than two columns, including blank lines, are
             skipped
    """
    data = {}
    # newline='' is what the csv module documents for files handed to csv.reader.
    with open(filepath, 'r', newline='') as file:
        for row in csv.reader(file):
            if len(row) < 2:
                # Blank or malformed line: there is no URI to store.
                continue
            data[row[0].lower()] = row[1]
    return data

def compute_similarity(ngram, label):
    """
    Scores how closely an n-gram matches a class label.

    Multi-word n-grams are scored with rapidfuzz's WRatio (scaled from 0-100 down to
    0-1); single words use a length-normalised Levenshtein similarity.

    :param ngram: sequence of tokens; they are joined with single spaces
    :param label: label text to compare against
    :return: similarity score, higher means more similar; 0.0 when both the joined
             n-gram and the label are empty
    """
    ngram_str = ' '.join(ngram)
    if ' ' in ngram_str:
        # It's a bigram (or higher order n-gram) if it contains a space
        return fuzz.WRatio(ngram_str, label) / 100

    # It's a unigram
    longest = max(len(ngram_str), len(label))
    if longest == 0:
        # Two empty strings: avoid dividing by zero and never report a match, since
        # an empty label is not a usable concept.
        return 0.0
    return 1 - (Levenshtein.distance(ngram_str, label) / longest)

# Label -> URI mappings already loaded, keyed by file path, so the label file is read
# once rather than on every question.
_CONCEPT_LABEL_CACHE = {}


def _load_concept_labels(filepath):
    """Load a "label,URI" CSV file into a {lower-cased label: URI} dict (cached per path)."""
    if filepath not in _CONCEPT_LABEL_CACHE:
        labels = {}
        with open(filepath, 'r', newline='') as file:
            for row in csv.reader(file):
                if len(row) >= 2:
                    labels[row[0].lower()] = row[1]
        _CONCEPT_LABEL_CACHE[filepath] = labels
    return _CONCEPT_LABEL_CACHE[filepath]


def Concept_Identifier(question, classes_path='/kaggle/input/yagoclasses1/YAGO2geoClasses.txt', threshold=0.7):
    """
    Finds the class URIs whose labels match noun phrases of the question.

    The question is parsed with the Stanza pipeline `nlps`. Every maximal run of
    consecutive noun tokens (NN, NNS, NNP, NNPS) is lemmatised, lower-cased and joined
    into one phrase. The phrase is compared with every label via
    `compute_similarity`, and the URI of the best-scoring label is kept if its score
    exceeds `threshold`.

    The label file is loaded by a helper private to this block on purpose: the
    notebook defines `read_file` twice with different return types, so calling it
    here would depend on which cell ran last.

    :param question: question text
    :param classes_path: path of the "label,URI" file
    :param threshold: minimum similarity (exclusive) for a label to count as a match
    :return: list of matched URIs with spaces removed, without duplicates, in order
             of first appearance
    """
    label_to_uri = _load_concept_labels(classes_path)
    noun_tags = {"NN", "NNS", "NNP", "NNPS"}

    uris = []

    # Process the question using the NLP pipeline
    doc = nlps(question)

    for sentence in doc.sentences:
        words = sentence.words
        i = 0
        while i < len(words):
            if words[i].xpos not in noun_tags:
                i += 1  # Move to the next word if it doesn't match the POS tags
                continue

            # Collect the whole run of consecutive noun tokens as one phrase.
            lemmas = []
            while i < len(words) and words[i].xpos in noun_tags:
                # Fall back to the surface form when no lemma is available.
                lemmas.append((words[i].lemma or words[i].text).lower())
                i += 1
            phrase = ' '.join(lemmas)

            max_similarity = 0
            best_uri = None
            for label, uri in label_to_uri.items():
                similarity = compute_similarity([phrase], label)
                if similarity > max_similarity and similarity > threshold:
                    max_similarity = similarity
                    best_uri = uri

            if best_uri:
                uris.append(best_uri.replace(" ", ""))

    # De-duplicate while keeping a deterministic order.
    return list(dict.fromkeys(uris))

# Example usage: print the class URIs detected in a sample question.
# This assigns the notebook-level names `question` and `uris`.
question = "Which bays intersect with county councils that border with County Mayo?"
uris = Concept_Identifier(question)
print(uris)

In [15]:
!pip install -q jellyfish Levenshtein

In [None]:
!pip install -q spacy
!python -m spacy download en_core_web_sm

In [None]:
import os
import spacy
from jellyfish import jaro_winkler_similarity
import Levenshtein as lev

# Load spaCy's small English model. `nlp` is the spaCy pipeline that lemmatize_word
# calls; it is a different object from the Stanza pipeline `nlps`.
nlp = spacy.load('en_core_web_sm')

def read_file(file_path):
    """Return the lines of a text file as a list, line terminators included.

    Note: this rebinds the name `read_file`, which an earlier cell defines as a
    CSV-to-dict loader; after this cell runs, `read_file` returns a list of lines.
    """
    with open(file_path, 'r') as handle:
        lines = list(handle)
    return lines

def levenshtein_similarity(str1, str2):
    """
    Length-normalised Levenshtein similarity of two strings.

    :param str1: first string
    :param str2: second string
    :return: 1 - distance / max(len(str1), len(str2)); 0.0 when both strings are
             empty, so that an empty pair never counts as a match
    """
    max_len = max(len(str1), len(str2))
    if max_len == 0:
        # Nothing to compare; also avoids dividing by zero.
        return 0.0
    distance = lev.distance(str1, str2)
    return 1 - (distance / max_len)  # Normalize the distance to get a similarity measure

def lemmatize_word(word):
    """
    Returns the lemma of the first spaCy token found in `word`.

    :param word: text to lemmatise; when spaCy splits it into several tokens (for
                 example a word with trailing punctuation), only the first token's
                 lemma is returned
    :return: the lemma, or `word` unchanged when spaCy produces no tokens (empty or
             whitespace-only input)
    """
    doc = nlp(word)
    if len(doc) == 0:
        return word
    return doc[0].lemma_

def Plain_Concept_Identifier(question, threshold=0.99, classes_path='/kaggle/input/yagoclasses1/YAGO2geoClasses.txt'):
    """
    Finds class URIs whose labels occur (almost) verbatim in the question.

    The question is lower-cased, split on whitespace and lemmatised word by word.
    Single-word labels are compared with every word using `levenshtein_similarity`;
    multi-word labels are compared with every window of the same number of words
    using Jaro-Winkler similarity.

    The label file is opened directly rather than through `read_file`, because that
    name is bound to two different functions in this notebook.

    :param question: question text
    :param threshold: minimum similarity (exclusive) for a label to count as a match
    :param classes_path: path of the "label,URI" file, one pair per line
    :return: list of matched URIs without duplicates, in order of first appearance
    """
    with open(classes_path, 'r') as file:
        raw_lines = file.readlines()

    words = [lemmatize_word(word) for word in question.lower().split()]  # Lemmatize each word
    num_words = len(words)

    uris = []

    for raw_line in raw_lines:
        fields = raw_line.strip().split(',')
        if len(fields) < 2:
            # Blank or malformed line: no URI to report.
            continue

        label_text = fields[0].lower()  # Lowercase label
        uri = fields[1]
        label_length = len(label_text.split())
        if label_length == 0:
            # An empty label cannot identify a concept.
            continue

        if label_length == 1:
            # Check similarity for single-word labels using Levenshtein distance
            for word in words:
                if levenshtein_similarity(word, label_text) > threshold:
                    uris.append(uri)
        else:
            # Check similarity for multi-word labels using Jaro-Winkler similarity
            for i in range(num_words - label_length + 1):
                word_sequence = " ".join(words[i:i + label_length])
                if jaro_winkler_similarity(word_sequence, label_text) > threshold:
                    uris.append(uri)

    # De-duplicate while keeping a deterministic order.
    return list(dict.fromkeys(uris))

# Example usage: run the spaCy-based identifier on a sample question and print the
# matched URIs. This rebinds the notebook-level name `question`.
question = "Which bays intersect with county councils that border with County Mayo?"

print(Plain_Concept_Identifier(question))

In [18]:
def read_file_as_string(filepath):
    """Return the entire content of the text file at `filepath` as one string."""
    with open(filepath, 'r') as handle:
        contents = handle.read()
    return contents

# Read both class-label files in full. Only `file1_data` is interpolated into the
# prompt below; `file2_data` is loaded but not referenced in this cell.
file1_data = read_file_as_string('/kaggle/input/yagoclasses1/YAGO2geoClasses.txt')
file2_data = read_file_as_string('/kaggle/input/yagoclasses1/YAGOClasses.txt')

# System prompt for the LLM-based concept identifier: it embeds the whole
# "label,URI" knowledge base and three worked examples.
# NOTE(review): "expect" and "seperated" in the prompt look like typos for "expert"
# and "separated"; they are left untouched here because the text is sent to the model.
concept_identifier_role = f"""You are an expect concept identifier. You are given a knowledge base of URIs that represent various concepts.
Each URI is associated by a descriptive label. The format is "label,URI". Your job is to identify concepts within the user-supplied questions and return only the URIs that correspond to them.
If a specific concept is not mentioned you do not report it, even if it is semantically relevant.
Your answers include only the relevant URIs seperated by commas. If no URI is present you do not answer anything. You do not provide any explanations.

Your knowledge base is the following: {file1_data}

For example: 
Q: "Which bays intersect with county councils that border with County Mayo?"
A: "y2geoo:OSI_City_and_County_Council,y2geoo:OSM_bay"
Q: "Is Doolin to the south of Dublin?"
A: ""
Q: "Which forests are within baronies in the Republic of Ireland?"
A: "y2geoo:OSM_forest,y2geoo:OSI_Barony"

Now based on your knowledge base answer the user question.
"""

In [None]:
def LLM_Concept_Identifier(question):
    """
    Asks the LLM which knowledge-base class URIs the question mentions.

    :param question: question text
    :return: list of prefixed URIs (prefix:local_name) found in the model's reply,
             without duplicates, in order of first appearance
    """
    result = run_chat_inference(model, tokenizer, concept_identifier_role, question, max_tokens=20)
    result = result.replace("[/INST]", "").strip()
    # Drop backslashes the model may emit (for example escaped underscores).
    result = result.replace("\\", "").strip()
    uri_pattern = r'\b\w+:[\w_]+\b'

    # Find all URIs that match the pattern
    uris = re.findall(uri_pattern, result)

    # The model can repeat a URI; report each one once, like the other concept
    # identifiers do, so repeated URIs are not counted several times downstream.
    return list(dict.fromkeys(uris))
    
# Example call; as the last expression of the cell its return value is displayed.
LLM_Concept_Identifier("Which forests are within baronies in the Republic of Ireland?")

* GeoQA Instance Identifier system:

In [22]:
import requests, json

class WATAnnotation:
    """One entity annotation taken from a WAT entity-linking response."""

    def __init__(self, d):
        """Copy the fields of interest out of one raw annotation dict `d`."""
        # Character span of the mention: `start` is included, `end` is not.
        self.start, self.end = d['start'], d['end']

        # Annotation accuracy.
        self.rho = d['rho']

        # Spot-entity probability, nested inside the explanation payload.
        prior_explanation = d['explanation']['prior_explanation']
        self.prior_prob = prior_explanation['entity_mention_probability']

        # Annotated text.
        self.spot = d['spot']

        # Wikipedia entity info.
        self.wiki_id, self.wiki_title = d['id'], d['title']

    def json_dict(self):
        """Return a plain dict with the annotation's fields (the spot text is not included)."""
        exported = ('wiki_title', 'wiki_id', 'start', 'end', 'rho', 'prior_prob')
        return {field: getattr(self, field) for field in exported}
    
def wat_entity_linking(text, timeout=30):
    """
    Annotates `text` with the WAT entity-linking service.

    The gcube token is taken from a notebook-level `MY_GCUBE_TOKEN` variable when one
    is defined, otherwise from the MY_GCUBE_TOKEN environment variable.

    :param text: text to annotate
    :param timeout: seconds to wait for the service before giving up
    :return: list of WATAnnotation objects (empty for blank text or when the response
             carries no annotations)
    :raises RuntimeError: when no gcube token is configured
    :raises requests.exceptions.RequestException: on network failures or HTTP errors
    """
    import os

    if not text or not text.strip():
        # Nothing to annotate; skip the network round trip.
        return []

    token = globals().get('MY_GCUBE_TOKEN') or os.environ.get('MY_GCUBE_TOKEN')
    if not token:
        raise RuntimeError(
            "No WAT gcube token configured: define MY_GCUBE_TOKEN in the notebook "
            "or set the MY_GCUBE_TOKEN environment variable."
        )

    # Main method, text annotation with WAT entity linking system
    wat_url = 'https://wat.d4science.org/wat/tag/tag'
    payload = [("gcube-token", token),
               ("text", text),
               ("lang", 'en'),
               ("tokenizer", "nlp4j"),
               ('debug', 9),
               ("method",
                "spotter:includeUserHint=true:includeNamedEntity=true:includeNounPhrase=true,prior:k=50,filter-valid,centroid:rescore=true,topk:k=5,voting:relatedness=lm,ranker:model=0046.model,confidence:model=pruner-wiki.linear")]

    response = requests.get(wat_url, params=payload, timeout=timeout)
    # Surface HTTP failures as such instead of as a JSON/KeyError further down.
    response.raise_for_status()
    return [WATAnnotation(a) for a in response.json().get('annotations', [])]

In [23]:
import requests

def test_uri(uri, endpoint_url="endp", accept_format='application/sparql-results+json'):
    """
    Sends a SPARQL ASK query to a GraphDB endpoint to check if a URI exists.

    :param uri: URI to be checked
    :param endpoint_url: URL of the GraphDB SPARQL endpoint
    :param accept_format: Desired response format (default is JSON)
    :return: Boolean indicating if the URI exists
    """
    query = f"""ASK WHERE {{
  <{uri}> ?p ?o .
}}"""
    headers = {
        'Accept': accept_format,
        'Content-Type': 'application/x-www-form-urlencoded'
    }

    data = {
        'query': query
    }
    
    try:
        response = requests.post(endpoint_url, headers=headers, data=data, auth=requests.auth.HTTPBasicAuth('user', 'pass'))

        if response.status_code == 200:
            if accept_format == 'application/sparql-results+json':
                json_response = response.json()
                return json_response.get('boolean', False)  # Return True if URI exists, otherwise False
            else:
                return False  # If not using JSON format, default to False (or handle differently if necessary)
        else:
            response.raise_for_status()
    except requests.exceptions.HTTPError as err:
        print("HTTP error (most likely invalid query):", err)
        return False
    except Exception as err:
        print("Endpoint error or endpoint is down:", err)
        return False


In [24]:
def GeoQAInstanceIdentifier(question):
    """
    Links the nouns of a question to YAGO resources that exist in the knowledge graph.

    Every noun token (NN, NNS, NNP, NNPS) is sent to WAT; each returned Wikipedia
    title is turned into a YAGO resource URI, and the URI is kept only if `test_uri`
    confirms that it exists.

    :param question: question text
    :return: list of prefixed URIs (e.g. "yago:Dublin"), in order of discovery
    """
    # `nlps` is the Stanza pipeline. The loop below reads Stanza's API
    # (doc.sentences / sentence.words / word.xpos), which the spaCy `nlp` object
    # does not provide.
    doc = nlps(question)

    noun_tags = {"NN", "NNS", "NNP", "NNPS"}
    yago_link = "http://yago-knowledge.org/resource/"
    uris = []

    for sentence in doc.sentences:
        for word in sentence.words:
            if word.xpos not in noun_tags:
                continue

            for annotation in wat_entity_linking(word.text):
                # Normalise the title first so that the URI checked against the
                # endpoint is exactly the URI that gets reported.
                resource = annotation.wiki_title.replace("&amp;", "&").replace(" ", "_")
                full_uri = yago_link + resource

                if not test_uri(full_uri):
                    continue

                # Shorten the uris down to prefixes.
                target_uri = full_uri
                for uri_map, prefix in prefix_map.items():
                    target_uri = target_uri.replace(uri_map, prefix)
                uris.append(target_uri)

    return uris

* LLM-powered NER Pipeline function

In [None]:
import re

def extract_first_integer(s):
    """Return the first run of digits in `s` as an int, or None when `s` has no digit."""
    found = re.search(r'\d+', s)
    return int(found.group(0)) if found else None

In [None]:
import csv
from io import StringIO

def _extract_toponyms(question):
    """Ask the LLM for the toponyms in `question`; return them as stripped, non-empty strings."""
    raw = ner_cleanup(run_chat_inference(model, tokenizer, ner_system_role, question))
    return [name.strip() for name in raw.split(',') if name.strip()]


def _kg_candidates(name):
    """Search the knowledge graph for `name`; return (prefixed uri, name, count) tuples in result order."""
    csv_text = graphdb_send_request(name)
    if not csv_text:
        # The request failed or returned nothing.
        return []

    # Use StringIO to treat the CSV string as a file
    reader = csv.reader(StringIO(csv_text))
    # Skip the header row; the default avoids StopIteration on an empty result.
    next(reader, None)

    candidates = []
    for row in reader:
        if len(row) < 3:
            continue
        uri, label, count = row[0], row[1], row[2]
        # Shorten the uris down to prefixes.
        for uri_map, prefix in prefix_map.items():
            uri = uri.replace(uri_map, prefix)
        candidates.append((uri, label, count))
    return candidates


def _uris_via_wat(toponyms):
    """Link each toponym with WAT; fall back to the best-connected knowledge-graph match."""
    yago_link = "http://yago-knowledge.org/resource/"
    uris = []
    for toponym in toponyms:
        for annotation in wat_entity_linking(toponym):
            title = annotation.wiki_title
            # Normalise before the existence check so that the URI that is tested is
            # the URI that is reported.
            full_uri = yago_link + title.replace("&amp;", "&").replace(" ", "_")

            if test_uri(full_uri):
                target_uri = full_uri
                for uri_map, prefix in prefix_map.items():
                    target_uri = target_uri.replace(uri_map, prefix)
                uris.append(target_uri)
            else:
                ##### SEARCH IN KG #####
                candidates = _kg_candidates(title)
                # Popular uri choice: results are ordered by connection count.
                if candidates:
                    uris.append(candidates[0][0])
    return uris


def _uris_via_llm(question, toponyms):
    """Search the knowledge graph per toponym and let the LLM choose among the top three matches."""
    uri_select_system_role = select_uri_system_role(question)
    uris = []
    for toponym in toponyms:
        # Limit the model to the 3 most connected candidates (usually the first is the target).
        candidates = _kg_candidates(toponym)[:3]
        if not candidates:
            continue

        # Only the URIs are shown: the names stored in the KG are sometimes odd and
        # misleading, and showing them made the selection worse.
        supplied_uris = "\n".join(f"{i}. {uri}" for i, (uri, _, _) in enumerate(candidates))

        # Prompt model again, but this time to choose the correct uri to be used.
        result = run_chat_inference(model, tokenizer, uri_select_system_role, supplied_uris)
        # Model might return some characters along the number e.g. "A: 0". Simply extract the int from the string.
        target = extract_first_integer(result)

        if target is not None and 0 <= target < len(candidates):
            uris.append(candidates[target][0])
    return uris


def retrieve_uris(question, disambiguation="wat"):
    """
    Extracts the toponyms of a question with the LLM and resolves them to URIs.

    :param question: question text
    :param disambiguation: "wat" (default, the behaviour of the original function)
        links each toponym with WAT, keeps the YAGO URI when it exists in the
        knowledge graph and otherwise takes the best-connected name match;
        "llm" searches the knowledge graph by name and asks the LLM to choose among
        the top three matches. The "llm" strategy was previously unreachable code
        placed after the function's `return`.
    :return: list of prefixed URIs
    :raises ValueError: for an unknown `disambiguation` value
    """
    if disambiguation not in ("wat", "llm"):
        raise ValueError(f"Unknown disambiguation strategy: {disambiguation!r}")

    # Extract toponyms from the questions using NER.
    toponyms = _extract_toponyms(question)

    if disambiguation == "llm":
        return _uris_via_llm(question, toponyms)
    return _uris_via_wat(toponyms)

* Evaluation and comparison

In [None]:
# Total number of ground-truth URIs across all dataset entries.
count = sum(len(entry['URI']) for entry in original_dataset.values())

print(count)

In [None]:
def _tally(predicted, ground_truth):
    """Return (hits, misses): ground-truth URIs present in `predicted`, and predicted URIs absent from the ground truth."""
    if not predicted:
        return 0, 0
    hits = sum(1 for uri in ground_truth if uri in predicted)
    misses = sum(1 for uri in predicted if uri not in ground_truth)
    return hits, misses


# Per-question ratios and running totals for the LLM pipeline ...
scores, fp_scores = [], []
c_count = w_count = 0

# ... and for the comparison system (the "w"-prefixed names).
w_scores, wfp_scores = [], []
wc_count = ww_count = 0

for key, entry in original_dataset.items():
    # Get the dataset question and dataset uris.
    question = entry['Question']
    ground_truth_uris = entry['URI']

    # The LLM pipeline is switched off in this run, so its predictions are empty.
    # To evaluate it, build this list from retrieve_uris(question) extended with
    # LLM_Concept_Identifier(question).
    generated_uris = []
    print(f"LLM: {generated_uris}")

    # System under comparison. GeoQAInstanceIdentifier(question) is the alternative
    # that was used for the instance-level comparison.
    wat_uris = Concept_Identifier(question)
    print(f"generated: {wat_uris}")
    print(f"gt: {ground_truth_uris}")

    # Save the results to the dataset.
    entry['Gen_URI'] = wat_uris

    # Evaluate both systems against the ground truth uris.
    correct, wrong = _tally(generated_uris, ground_truth_uris)
    w_correct, w_wrong = _tally(wat_uris, ground_truth_uris)

    # Both ratios are relative to the number of ground-truth URIs; questions without
    # ground truth contribute to the totals only.
    total = len(ground_truth_uris)
    if total != 0:
        accuracy = correct / total
        fp_perc = wrong / total
        print(accuracy)
        scores.append(accuracy)
        fp_scores.append(fp_perc)

        ###### SAME FOR WAT. ######
        accuracy = w_correct / total
        fp_perc = w_wrong / total
        print(accuracy)
        w_scores.append(accuracy)
        wfp_scores.append(fp_perc)

    c_count += correct
    w_count += wrong

    wc_count += w_correct
    ww_count += w_wrong

# Print average of scores.
average_accuracy = sum(scores) / len(scores) if scores else 0
fp_perc = sum(fp_scores) / len(fp_scores) if fp_scores else 0
print(f"Average accuracy: {average_accuracy:.2f}. Total corrects: {c_count}")
print(f"False Positive rate percentage: {fp_perc:.2f}. Total mistakes: {w_count}")

# Print average of scores for wat.
average_accuracy = sum(w_scores) / len(w_scores) if w_scores else 0
fp_perc = sum(wfp_scores) / len(wfp_scores) if wfp_scores else 0
print(f"GeoQA Average accuracy: {average_accuracy:.2f}. Total corrects: {wc_count}")
print(f"GeoQA False Positive rate percentage: {fp_perc:.2f}. Total mistakes: {ww_count}")

# Save the JSON data to a file
with open('concepts_dataset.json', 'w') as json_file:
    json.dump(original_dataset, json_file, indent=4)