Define the inference functions (the models themselves are loaded further below)

In [1]:
import torch

def run_inference(model, tokenizer, prompt):
    """Generate a completion for ``prompt`` and return it in a one-element list.

    Two modes are supported:

    * ``tokenizer is None``: ``model`` is called directly with the prompt
      string and must return a string. The result is returned with a
      ``"Generated text:"`` prefix.
    * otherwise: ``model`` is moved to the GPU when one is available (CPU
      otherwise), switched to eval mode, and ``model.generate`` is run on the
      tokenized prompt. Only the newly generated continuation is returned.

    Args:
        model: A callable (tokenizer-less mode) or an object exposing ``to``,
            ``eval`` and ``generate``.
        tokenizer: Tokenizer providing ``__call__``, ``decode``,
            ``eos_token_id`` and ``pad_token_id``, or ``None``.
        prompt: The full prompt string.

    Returns:
        list[str]: A list holding exactly one generated string.
    """
    results = []

    if tokenizer is None:
        with torch.no_grad():
            outputs = model(prompt)
        results.append("Generated text:" + outputs)
    else:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        model.to(device)
        model.eval()  # disable dropout etc. for inference

        inputs = tokenizer(prompt, return_tensors="pt").to(device)

        # Some tokenizers define no pad token; fall back to EOS so that
        # generate() is not handed pad_token_id=None.
        pad_token_id = tokenizer.pad_token_id
        if pad_token_id is None:
            pad_token_id = tokenizer.eos_token_id

        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_length=1800,         # upper bound on prompt + generated tokens
                repetition_penalty=1.2,  # penalty on repeating tokens
                eos_token_id=tokenizer.eos_token_id,
                pad_token_id=pad_token_id,
            )

        # Drop the prompt at the token level. Slicing the decoded string by
        # len(prompt) is unreliable because decoding does not necessarily
        # reproduce the prompt character for character.
        # Assumes a decoder-only model whose output starts with the input ids.
        prompt_token_count = inputs["input_ids"].shape[1]
        generated_text = tokenizer.decode(
            outputs[0][prompt_token_count:], skip_special_tokens=True
        )
        results.append(generated_text)

    # Release cached GPU blocks. The model itself stays alive: the caller still
    # holds a reference, so deleting the local name here would free nothing.
    torch.cuda.empty_cache()

    return results

In [2]:
import torch

def Quantized_Inference(model, tokenizer, prompt):
    """Generate a completion with an already-placed (e.g. quantized) model.

    Unlike ``run_inference`` the model is NOT moved to a device here; only the
    tokenized inputs are sent to the GPU when one is available (CPU otherwise).

    Args:
        model: Object exposing ``eval`` and ``generate``.
        tokenizer: Tokenizer providing ``__call__``, ``decode``,
            ``eos_token_id`` and ``pad_token_id``.
        prompt: The full prompt string.

    Returns:
        list[str]: One element, ``"Generated text:"`` followed by the newly
        generated continuation (the prompt is removed).
    """
    results = []

    # Device for the inputs only; the model is left wherever it was loaded.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.eval()  # disable dropout etc. for inference

    inputs = tokenizer(prompt, return_tensors="pt").to(device)

    # Some tokenizers define no pad token; fall back to EOS so that
    # generate() is not handed pad_token_id=None.
    pad_token_id = tokenizer.pad_token_id
    if pad_token_id is None:
        pad_token_id = tokenizer.eos_token_id

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=350,      # upper bound on newly generated tokens
            repetition_penalty=1.2,  # penalty on repeating tokens
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=pad_token_id,
        )

    # Drop the prompt at the token level. Slicing the decoded string by
    # len(prompt) is unreliable because decoding does not necessarily
    # reproduce the prompt character for character.
    # Assumes a decoder-only model whose output starts with the input ids.
    prompt_token_count = inputs["input_ids"].shape[1]
    generated_text = tokenizer.decode(
        outputs[0][prompt_token_count:], skip_special_tokens=True
    )
    results.append("Generated text:" + generated_text)

    # Release cached GPU blocks. The model itself stays alive: the caller still
    # holds a reference, so deleting the local name here would free nothing.
    torch.cuda.empty_cache()

    return results

Define two functions for extracting URIs from the dataset queries, one with direct extraction and the other with the ability to expand prefixes.

In [3]:
import re

def extract_uris(query):
    """Return every URI mentioned in ``query``, in order of appearance.

    Full URIs written as ``<...>`` are returned without the angle brackets;
    prefixed names are returned unchanged as ``prefix:local``. Duplicates are
    kept.
    """
    token_re = re.compile(r'<([^>]+)>|(\b[a-zA-Z0-9_]+):([a-zA-Z0-9_]+)')
    return [
        full_uri if full_uri else f"{prefix}:{local}"
        for full_uri, prefix, local in token_re.findall(query)
    ]

def expand_uris(query, prefix_dict):
    """Return the URIs used in ``query`` with known prefixes expanded.

    Args:
        query: SPARQL query text.
        prefix_dict: Mapping from prefix (without the colon) to namespace URI.

    Returns:
        list[str]: First all prefixed names in order of appearance (expanded
        to ``namespace + local`` when the prefix is in ``prefix_dict``,
        otherwise left as ``prefix:local``), then all ``<...>`` URIs in order
        of appearance, without the angle brackets. Duplicates are kept.
    """
    prefixed_pattern = r'(\b[a-zA-Z0-9_]+):([a-zA-Z0-9_]+)'
    expanded_pattern = r'<([^>]+)>'

    # Blank out full IRIs before looking for prefixed names; otherwise parts
    # such as "host:8080" or "urn:isbn" inside <...> are reported as prefixed
    # names. Only whitespace-free <...> spans are blanked, so a "<" / ">"
    # comparison pair in a FILTER cannot swallow real prefixed names.
    query_without_iris = re.sub(r'<[^<>\s]+>', ' ', query)

    expanded_uris = []
    for prefix, suffix in re.findall(prefixed_pattern, query_without_iris):
        if prefix in prefix_dict:
            expanded_uris.append(f"{prefix_dict[prefix]}{suffix}")
        else:
            expanded_uris.append(f"{prefix}:{suffix}")

    # URIs that were already written in full.
    expanded_uris.extend(re.findall(expanded_pattern, query))

    return expanded_uris

# Prefix -> namespace table for expand_uris(). It also covers the prefixes that
# the dataset queries and few-shot examples use (rdf, yago, y2geoo, strdf, ...),
# with the same namespaces that format_query() declares, so those names can be
# expanded as well.
prefix_dict = {
    'geo': 'http://www.opengis.net/ont/geosparql#',
    'osm': 'http://www.openstreetmap.org/ontology#',
    'xsd': 'http://www.w3.org/2001/XMLSchema#',
    'geof': 'http://www.opengis.net/def/function/geosparql/',
    'uom': 'http://www.opengis.net/def/uom/OGC/1.0/',
    'rdf': 'http://www.w3.org/1999/02/22-rdf-syntax-ns#',
    'rdfs': 'http://www.w3.org/2000/01/rdf-schema#',
    'yago': 'http://yago-knowledge.org/resource/',
    'y2geor': 'http://kr.di.uoa.gr/yago2geo/resource/',
    'y2geoo': 'http://kr.di.uoa.gr/yago2geo/ontology/',
    'strdf': 'http://strdf.di.uoa.gr/ontology#',
    'owl': 'http://www.w3.org/2002/07/owl#',
}

In [None]:
# Log in to the Hugging Face Hub from inside the notebook.
# NOTE(review): presumably required to download the checkpoints loaded in the
# following cells — confirm which of them actually need authentication.
from huggingface_hub import notebook_login
notebook_login()

In [None]:
!pip install accelerate
!pip install -i https://pypi.org/simple/ bitsandbytes

In [None]:
# Load Llama 3 8B Instruct with 8-bit weights. The quantization settings are
# passed through a BitsAndBytesConfig object instead of the deprecated
# `load_in_8bit=True` keyword of from_pretrained().
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct")
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Meta-Llama-3-8B-Instruct",
    device_map="auto",
    quantization_config=BitsAndBytesConfig(load_in_8bit=True),
)

In [None]:
# Load Mistral 7B v0.2 in half precision. Relies on `torch` having been
# imported by an earlier cell. No device_map is given here; run_inference()
# moves the model to the GPU (or CPU) itself.
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoModel

model = AutoModelForCausalLM.from_pretrained("alpindale/Mistral-7B-v0.2-hf", torch_dtype=torch.float16)
tokenizer = AutoTokenizer.from_pretrained("alpindale/Mistral-7B-v0.2-hf")

In [None]:
# Load Gemma 2B (instruction-tuned) in full float32 precision with automatic
# device placement. Relies on `torch` having been imported by an earlier cell.
# Rebinds the global `model` / `tokenizer` used by the inference cells below.
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("google/gemma-2b-it")
model = AutoModelForCausalLM.from_pretrained("google/gemma-2b-it", device_map="auto", torch_dtype=torch.float32)

Run inference on the entire dataset and store the generated queries for evaluation

In [None]:
# Run inference and cleanup.
def generate_query(model, tokenizer, user_prompt):
    """Generate a SPARQL query for ``user_prompt`` with a zero-shot prompt.

    Args:
        model: Model passed through to ``run_inference``.
        tokenizer: Tokenizer passed through to ``run_inference`` (may be None).
        user_prompt: The natural-language question.

    Returns:
        str: The generated text up to the closing code fence, with surrounding
        whitespace, the ``"Generated text:"`` label and a leading ``SPARQL``
        language tag (any casing) removed.
    """
    prompt = f"""Generator is an expert SPARQL query generator. For each question that the user supplies, the generator will convert it into a valid SPARQL query that can be used to answer the question. The query will be based on the DBpedia knowledge graph. The query should be enclosed by three backticks on new lines, denoting that it is a code block.
Human: {user_prompt}
Generator: ```"""

    results = run_inference(model, tokenizer, prompt)
    completion = results[0]

    # run_inference prepends this label when it is called without a tokenizer;
    # it is not part of the query.
    label = "Generated text:"
    if completion.startswith(label):
        completion = completion[len(label):]

    # Keep only the text before the closing code fence (all of it when the
    # model never closes the block).
    query = completion.split("```", 1)[0].strip()

    # Drop a leading code-fence language tag such as "SPARQL" or "sparql".
    if query[:6].lower() == "sparql":
        query = query[6:].lstrip()

    return query

In [None]:
# Run inference and cleanup.
def generate_query(model, tokenizer, user_prompt):
    """Generate a SPARQL query for ``user_prompt`` with a three-example few-shot prompt.

    Args:
        model: Model passed through to ``run_inference``.
        tokenizer: Tokenizer passed through to ``run_inference`` (may be None).
        user_prompt: The natural-language question.

    Returns:
        str: The generated text up to the closing code fence, with surrounding
        whitespace, the ``"Generated text:"`` label and a leading ``SPARQL``
        language tag (any casing) removed.
    """
    # NOTE(review): the instruction names DBpedia while the examples use
    # yago:/y2geoo: terms — confirm the wording is intentional.
    prompt = f"""Generator is an expert SPARQL query generator. For each question that the user supplies, the generator will convert it into a valid SPARQL query that can be used to answer the question. The query will be based on the DBpedia knowledge graph. The query should be enclosed by three backticks on new lines, denoting that it is a code block.
Human: In Breckland district, which forests are south of streams?
Generator: ```SELECT DISTINCT ?forest WHERE {{ yago:Breckland_District geo:hasGeometry ?o1 . ?o1 geo:asWKT ?geoWKT1 . ?forest rdf:type y2geoo:OSM_forest . ?forest geo:hasGeometry ?o2 . ?o2 geo:asWKT ?geoWKT2 . ?stream rdf:type y2geoo:OSM_stream . ?stream geo:hasGeometry ?o3 . ?o3 geo:asWKT ?geoWKT3 . FILTER (strdf:within(?geoWKT2, ?geoWKT1) && strdf:within(?geoWKT3, ?geoWKT1) && strdf:below(?geoWKT2, ?geoWKT3)) }}```
Human: How many streams intersect with lakes?
Generator: ```SELECT (COUNT (DISTINCT ?p1) as ?streams) WHERE {{ ?p1 rdf:type y2geoo:OSM_stream; geo:hasGeometry ?p1geo. ?p1geo geo:asWKT ?p1WKT. ?p2 rdf:type y2geoo:OSM_lake; geo:hasGeometry ?p2geo. ?p2geo geo:asWKT ?p2WKT. FILTER(geof:sfIntersects(?p1WKT, ?p2WKT)) }}```
Human: Which Municipalities are on Thessaly's border?
Generator: ```SELECT distinct ?rg where {{ yago:Thessaly geo:hasGeometry ?tgeo . ?tgeo geo:asWKT ?tgWKT . ?rg rdf:type y2geoo:GAG_Municipality . ?rg geo:hasGeometry ?rggeo . ?rggeo geo:asWKT ?rgWKT . FILTER (strdf:touches(?tgWKT,?rgWKT)) . }}```
Human: {user_prompt}
Generator: ```"""

    results = run_inference(model, tokenizer, prompt)
    completion = results[0]

    # run_inference prepends this label when it is called without a tokenizer;
    # it is not part of the query.
    label = "Generated text:"
    if completion.startswith(label):
        completion = completion[len(label):]

    # Keep only the text before the closing code fence (all of it when the
    # model never closes the block).
    query = completion.split("```", 1)[0].strip()

    # Drop a leading code-fence language tag such as "SPARQL" or "sparql".
    if query[:6].lower() == "sparql":
        query = query[6:].lstrip()

    return query

In [None]:
# Run llama 3 inference and cleanup.
def generate_quantized_query(model, tokenizer, user_prompt):
    """Generate a SPARQL query with ``Quantized_Inference`` and a few-shot prompt.

    Args:
        model: Model passed through to ``Quantized_Inference``.
        tokenizer: Tokenizer passed through to ``Quantized_Inference``.
        user_prompt: The natural-language question.

    Returns:
        str: The generated text up to the closing code fence, with surrounding
        whitespace, the ``"Generated text:"`` label and a leading ``SPARQL``
        language tag (any casing) removed.
    """
    prompt = f"""Generator is an expert SPARQL query generator. For each question that the user supplies, the generator will convert it into a valid SPARQL query that can be used to answer the question. The query will be based on the DBpedia knowledge graph. The query should be enclosed by three backticks on new lines, denoting that it is a code block.
Human: In Breckland district, which forests are south of streams?
Generator: ```SELECT DISTINCT ?forest WHERE {{ yago:Breckland_District geo:hasGeometry ?o1 . ?o1 geo:asWKT ?geoWKT1 . ?forest rdf:type y2geoo:OSM_forest . ?forest geo:hasGeometry ?o2 . ?o2 geo:asWKT ?geoWKT2 . ?stream rdf:type y2geoo:OSM_stream . ?stream geo:hasGeometry ?o3 . ?o3 geo:asWKT ?geoWKT3 . FILTER (strdf:within(?geoWKT2, ?geoWKT1) && strdf:within(?geoWKT3, ?geoWKT1) && strdf:below(?geoWKT2, ?geoWKT3)) }}```
Human: How many streams intersect with lakes?
Generator: ```SELECT (COUNT (DISTINCT ?p1) as ?streams) WHERE {{ ?p1 rdf:type y2geoo:OSM_stream; geo:hasGeometry ?p1geo. ?p1geo geo:asWKT ?p1WKT. ?p2 rdf:type y2geoo:OSM_lake; geo:hasGeometry ?p2geo. ?p2geo geo:asWKT ?p2WKT. FILTER(geof:sfIntersects(?p1WKT, ?p2WKT)) }}```
Human: Which Municipalities are on Thessaly's border?
Generator: ```SELECT distinct ?rg where {{ yago:Thessaly geo:hasGeometry ?tgeo . ?tgeo geo:asWKT ?tgWKT . ?rg rdf:type y2geoo:GAG_Municipality . ?rg geo:hasGeometry ?rggeo . ?rggeo geo:asWKT ?rgWKT . FILTER (strdf:touches(?tgWKT,?rgWKT)) . }}```
Human: {user_prompt}
Generator: ```"""

    results = Quantized_Inference(model, tokenizer, prompt)
    completion = results[0]

    # Quantized_Inference prepends this label to its result; without removing
    # it every stored query would start with "Generated text:".
    label = "Generated text:"
    if completion.startswith(label):
        completion = completion[len(label):]

    # Keep only the text before the closing code fence (all of it when the
    # model never closes the block).
    query = completion.split("```", 1)[0].strip()

    # Drop a leading code-fence language tag such as "SPARQL" or "sparql".
    if query[:6].lower() == "sparql":
        query = query[6:].lstrip()

    return query

Save the results as a JSON file in the same format as the dataset.

In [None]:
import json

# Read the ground-truth dataset (a mapping from key to a record with a 'Question').
with open('/kaggle/input/geoqa200/200_Sub_Dataset.json', 'r') as file:
    original_dataset = json.load(file)

# Generate one query per question, keyed exactly like the input dataset.
new_dataset = {}
total = len(original_dataset.items())
i = 0
for i, (key, item) in enumerate(original_dataset.items(), start=1):
    question = item['Question']
    query = generate_query(model, tokenizer, question)
    new_dataset[key] = {'Question': question, 'Query': query}
    print (f"{i}/{total}")  # progress counter

# Persist the generated queries.
with open('/kaggle/working/generated_dataset1.json', 'w') as file:
    json.dump(new_dataset, file, indent=4)

In [None]:
import json

# Read the ground-truth dataset (a mapping from key to a record with a 'Question').
with open('/kaggle/input/80geoqa/80_Sub_Dataset.json', 'r') as file:
    original_dataset = json.load(file)

# Generate one query per question with the quantized pipeline, keyed like the input.
new_dataset = {}
total = len(original_dataset.items())
i = 0
for i, (key, item) in enumerate(original_dataset.items(), start=1):
    question = item['Question']
    query = generate_quantized_query(model, tokenizer, question)
    new_dataset[key] = {'Question': question, 'Query': query}
    print (f"{i}/{total}")  # progress counter

# Persist the generated queries (same output path as the non-quantized run).
with open('/kaggle/working/generated_dataset1.json', 'w') as file:
    json.dump(new_dataset, file, indent=4)

Alternatively, run inference with the URIs extracted from the ground-truth query injected into the prompt.

In [None]:
# Run inference and cleanup.
def generate_URI_injected_query(model, tokenizer, user_prompt, gt_query):
    """Generate a SPARQL query with the ground-truth URIs injected (zero-shot).

    Args:
        model: Model passed through to ``run_inference``.
        tokenizer: Tokenizer passed through to ``run_inference`` (may be None).
        user_prompt: The natural-language question.
        gt_query: Ground-truth query; its URIs (via ``extract_uris``) are
            listed in the prompt.

    Returns:
        str: The generated text up to the closing code fence, with surrounding
        whitespace, the ``"Generated text:"`` label and a leading ``SPARQL``
        language tag (any casing) removed.
    """
    # Direct extraction, NOTE: try expanded extraction.
    uris = extract_uris(gt_query)

    prompt = f"""Generator is an expert SPARQL query generator. For each question that the user supplies, the generator will convert it into a valid SPARQL query that can be used to answer the question. The query will be based on the DBpedia knowledge graph. The query should be enclosed by three backticks on new lines, denoting that it is a code block.
Human: {user_prompt}
The generator must use these URIs to answer the question: {uris}
Generator: ```"""

    results = run_inference(model, tokenizer, prompt)
    completion = results[0]

    # run_inference prepends this label when it is called without a tokenizer;
    # it is not part of the query.
    label = "Generated text:"
    if completion.startswith(label):
        completion = completion[len(label):]

    # Keep only the text before the closing code fence (all of it when the
    # model never closes the block).
    query = completion.split("```", 1)[0].strip()

    # Drop a leading code-fence language tag such as "SPARQL" or "sparql".
    if query[:6].lower() == "sparql":
        query = query[6:].lstrip()

    return query

Also try supplying the model with a description of the GeoSPARQL ontology.

In [None]:
# Run inference and cleanup.
def generate_URI_injected_query(model, tokenizer, user_prompt, gt_query):
    """Generate a SPARQL query with injected URIs and a GeoSPARQL ontology summary.

    Args:
        model: Model passed through to ``run_inference``.
        tokenizer: Tokenizer passed through to ``run_inference`` (may be None).
        user_prompt: The natural-language question.
        gt_query: Ground-truth query; its URIs (via ``extract_uris``) are
            listed in the prompt.

    Returns:
        str: The generated text up to the closing code fence, with surrounding
        whitespace, the ``"Generated text:"`` label and a leading ``SPARQL``
        language tag (any casing) removed.
    """
    # Direct extraction, NOTE: try expanded extraction.
    uris = extract_uris(gt_query)

    prompt = f"""Generator is an expert SPARQL query generator. For each question that the user supplies, the generator will convert it into a valid SPARQL query that can be used to answer the question. The query will be based on the DBpedia knowledge graph. The query should be enclosed by three backticks on new lines, denoting that it is a code block.
The resulting query may have to be in GeoSPARQL. The GeoSPARQL ontology is defined by:
URI: http://www.opengis.net/ont/geosparql
Classes: Feature, Feature Collection, Geometry, Geometry Collection, Spatial Object, Spatial Object Collection
Object Properties: default geometry, contains, covered by, covers, disjoint, equals, inside, meet, overlap, has area, has bounding box, has centroid, has default geometry, has geometry, has length, has perimeter length, has size, has spatial accuracy, has spatial resolution, has volume, disconnected, externally connected, equals, non-tangential proper part, non-tangential proper part inverse, partially overlapping, tangential proper part, tangential proper part inverse, contains, crosses, disjoint, equals, intersects, overlaps, touches, within
Datatype Properties: as DGGS, as GML, as GeoJSON, as KML, as WKT, coordinate dimension, dimension, has area in square meters, has length in meters, has perimeter length in meters, has metric size, has spatial accuracy in meters, has spatial resolution in meters, has volume in cubic meters, has serialization, is empty, is simple, spatial dimension
Human: {user_prompt}
The generator must use these URIs to answer the question: {uris}
Generator: ```"""

    results = run_inference(model, tokenizer, prompt)
    completion = results[0]

    # run_inference prepends this label when it is called without a tokenizer;
    # it is not part of the query.
    label = "Generated text:"
    if completion.startswith(label):
        completion = completion[len(label):]

    # Keep only the text before the closing code fence (all of it when the
    # model never closes the block).
    query = completion.split("```", 1)[0].strip()

    # Drop a leading code-fence language tag such as "SPARQL" or "sparql".
    if query[:6].lower() == "sparql":
        query = query[6:].lstrip()

    return query

Try few-shot prompting.

In [None]:
# Run inference and cleanup.
def generate_URI_injected_query(model, tokenizer, user_prompt, gt_query):
    """Generate a SPARQL query with injected URIs and three few-shot examples.

    Args:
        model: Model passed through to ``run_inference``.
        tokenizer: Tokenizer passed through to ``run_inference`` (may be None).
        user_prompt: The natural-language question.
        gt_query: Ground-truth query; its URIs (via ``extract_uris``) are
            listed in the prompt.

    Returns:
        str: The generated text up to the closing code fence, with surrounding
        whitespace, the ``"Generated text:"`` label and a leading ``SPARQL``
        language tag (any casing) removed.
    """
    # Direct extraction, NOTE: try expanded extraction.
    uris = extract_uris(gt_query)

    prompt = f"""Generator is an expert SPARQL query generator. For each question that the user supplies, the generator will convert it into a valid SPARQL query that can be used to answer the question. The query will be based on the DBpedia knowledge graph. The query should be enclosed by three backticks on new lines, denoting that it is a code block.
Human: In Breckland district, which forests are south of streams?
The generator must use these URIs to answer the question: ['yago:Breckland_District', 'geo:hasGeometry', 'geo:asWKT', 'rdf:type', 'y2geoo:OSM_forest', 'geo:hasGeometry', 'geo:asWKT', 'rdf:type', 'y2geoo:OSM_stream', 'geo:hasGeometry', 'geo:asWKT', 'strdf:within', 'strdf:within', 'strdf:below']
Generator: ```SELECT DISTINCT ?forest WHERE {{ yago:Breckland_District geo:hasGeometry ?o1 . ?o1 geo:asWKT ?geoWKT1 . ?forest rdf:type y2geoo:OSM_forest . ?forest geo:hasGeometry ?o2 . ?o2 geo:asWKT ?geoWKT2 . ?stream rdf:type y2geoo:OSM_stream . ?stream geo:hasGeometry ?o3 . ?o3 geo:asWKT ?geoWKT3 . FILTER (strdf:within(?geoWKT2, ?geoWKT1) && strdf:within(?geoWKT3, ?geoWKT1) && strdf:below(?geoWKT2, ?geoWKT3)) }}```
Human: How many streams intersect with lakes?
The generator must use these URIs to answer the question: ['rdf:type', 'y2geoo:OSM_stream', 'geo:hasGeometry', 'geo:asWKT', 'rdf:type', 'y2geoo:OSM_lake', 'geo:hasGeometry', 'geo:asWKT', 'geof:sfIntersects']
Generator: ```SELECT (COUNT (DISTINCT ?p1) as ?streams) WHERE {{ ?p1 rdf:type y2geoo:OSM_stream; geo:hasGeometry ?p1geo. ?p1geo geo:asWKT ?p1WKT. ?p2 rdf:type y2geoo:OSM_lake; geo:hasGeometry ?p2geo. ?p2geo geo:asWKT ?p2WKT. FILTER(geof:sfIntersects(?p1WKT, ?p2WKT)) }}```
Human: Which Municipalities are on Thessaly's border?
The generator must use these URIs to answer the question: ['yago:Thessaly', 'geo:hasGeometry', 'geo:asWKT', 'rdf:type', 'y2geoo:GAG_Municipality', 'geo:hasGeometry', 'geo:asWKT', 'strdf:touches']
Generator: ```SELECT distinct ?rg where {{ yago:Thessaly geo:hasGeometry ?tgeo . ?tgeo geo:asWKT ?tgWKT . ?rg rdf:type y2geoo:GAG_Municipality . ?rg geo:hasGeometry ?rggeo . ?rggeo geo:asWKT ?rgWKT . FILTER (strdf:touches(?tgWKT,?rgWKT)) . }}```
Human: {user_prompt}
The generator must use these URIs to answer the question: {uris}
Generator: ```"""

    results = run_inference(model, tokenizer, prompt)
    completion = results[0]

    # run_inference prepends this label when it is called without a tokenizer;
    # it is not part of the query.
    label = "Generated text:"
    if completion.startswith(label):
        completion = completion[len(label):]

    # Keep only the text before the closing code fence (all of it when the
    # model never closes the block).
    query = completion.split("```", 1)[0].strip()

    # Drop a leading code-fence language tag such as "SPARQL" or "sparql".
    if query[:6].lower() == "sparql":
        query = query[6:].lstrip()

    return query

Let's try with even more examples.

In [None]:
# Run inference and cleanup.
def generate_URI_injected_query(model, tokenizer, user_prompt, gt_query):
    """Generate a SPARQL query with injected URIs and six few-shot examples.

    Args:
        model: Model passed through to ``run_inference``.
        tokenizer: Tokenizer passed through to ``run_inference`` (may be None).
        user_prompt: The natural-language question.
        gt_query: Ground-truth query; its URIs (via ``extract_uris``) are
            listed in the prompt.

    Returns:
        str: The generated text up to the closing code fence, with surrounding
        whitespace, the ``"Generated text:"`` label and a leading ``SPARQL``
        language tag (any casing) removed.
    """
    # Direct extraction, NOTE: try expanded extraction.
    uris = extract_uris(gt_query)

    prompt = f"""Generator is an expert SPARQL query generator. For each question that the user supplies, the generator will convert it into a valid SPARQL query that can be used to answer the question. The query will be based on the DBpedia knowledge graph. The query should be enclosed by three backticks on new lines, denoting that it is a code block.
Human: In Breckland district, which forests are south of streams?
The generator must use these URIs to answer the question: ['yago:Breckland_District', 'geo:hasGeometry', 'geo:asWKT', 'rdf:type', 'y2geoo:OSM_forest', 'geo:hasGeometry', 'geo:asWKT', 'rdf:type', 'y2geoo:OSM_stream', 'geo:hasGeometry', 'geo:asWKT', 'strdf:within', 'strdf:within', 'strdf:below']
Generator: ```SELECT DISTINCT ?forest WHERE {{ yago:Breckland_District geo:hasGeometry ?o1 . ?o1 geo:asWKT ?geoWKT1 . ?forest rdf:type y2geoo:OSM_forest . ?forest geo:hasGeometry ?o2 . ?o2 geo:asWKT ?geoWKT2 . ?stream rdf:type y2geoo:OSM_stream . ?stream geo:hasGeometry ?o3 . ?o3 geo:asWKT ?geoWKT3 . FILTER (strdf:within(?geoWKT2, ?geoWKT1) && strdf:within(?geoWKT3, ?geoWKT1) && strdf:below(?geoWKT2, ?geoWKT3)) }}```
Human: How many streams intersect with lakes?
The generator must use these URIs to answer the question: ['rdf:type', 'y2geoo:OSM_stream', 'geo:hasGeometry', 'geo:asWKT', 'rdf:type', 'y2geoo:OSM_lake', 'geo:hasGeometry', 'geo:asWKT', 'geof:sfIntersects']
Generator: ```SELECT (COUNT (DISTINCT ?p1) as ?streams) WHERE {{ ?p1 rdf:type y2geoo:OSM_stream; geo:hasGeometry ?p1geo. ?p1geo geo:asWKT ?p1WKT. ?p2 rdf:type y2geoo:OSM_lake; geo:hasGeometry ?p2geo. ?p2geo geo:asWKT ?p2WKT. FILTER(geof:sfIntersects(?p1WKT, ?p2WKT)) }}```
Human: Which Municipalities are on Thessaly's border?
The generator must use these URIs to answer the question: ['yago:Thessaly', 'geo:hasGeometry', 'geo:asWKT', 'rdf:type', 'y2geoo:GAG_Municipality', 'geo:hasGeometry', 'geo:asWKT', 'strdf:touches']
Generator: ```SELECT distinct ?rg where {{ yago:Thessaly geo:hasGeometry ?tgeo . ?tgeo geo:asWKT ?tgWKT . ?rg rdf:type y2geoo:GAG_Municipality . ?rg geo:hasGeometry ?rggeo . ?rggeo geo:asWKT ?rgWKT . FILTER (strdf:touches(?tgWKT,?rgWKT)) . }}```
Human: Which is the largest island in Ireland?
The generator must use these URIs to answer the question: ['strdf:area', 'yago:Republic_of_Ireland', 'geo:hasGeometry', 'geo:asWKT', 'y2geoo:OSM_island', 'geo:hasGeometry', 'geo:asWKT', 'geof:sfContains']
Generator: ```select distinct ?x (strdf:area(?lWKT) as ?area) where {{ yago:Republic_of_Ireland geo:hasGeometry ?geom . ?geom geo:asWKT ?mWKT . ?lake a y2geoo:OSM_island . ?lake geo:hasGeometry ?geol . ?geol geo:asWKT ?lWKT . FILTER (geof:sfContains(?mWKT, ?lWKT)) }} ORDER BY (?area) LIMIT 1```
Human: Is Crete south of Thessaly?
The generator must use these URIs to answer the question: ['http://yago-knowledge.org/resource/Crete', 'geo:hasGeometry', 'http://yago-knowledge.org/resource/Thessaly', 'geo:hasGeometry', 'geo:asWKT', 'geo:asWKT', 'strdf:below']
Generator: ```ASK {{ <http://yago-knowledge.org/resource/Crete> geo:hasGeometry ?geo1 . <http://yago-knowledge.org/resource/Thessaly> geo:hasGeometry ?geo2 . ?geo1 geo:asWKT ?geoWKT1 . ?geo2 geo:asWKT ?geoWKT2 . FILTER(strdf:below(?geoWKT1, ?geoWKT2)) }}```
Human: What is the population of Northern Ireland?
The generator must use these URIs to answer the question: ['xsd:integer', 'yago:Northern_Ireland', 'yago:hasPopulation']
Generator: ```SELECT (xsd:integer (?population) as ?pop) WHERE {{ yago:Northern_Ireland yago:hasPopulation ?population. }}```
Human: {user_prompt}
The generator must use these URIs to answer the question: {uris}
Generator: ```"""

    results = run_inference(model, tokenizer, prompt)
    completion = results[0]

    # run_inference prepends this label when it is called without a tokenizer;
    # it is not part of the query.
    label = "Generated text:"
    if completion.startswith(label):
        completion = completion[len(label):]

    # Keep only the text before the closing code fence (all of it when the
    # model never closes the block).
    query = completion.split("```", 1)[0].strip()

    # Drop a leading code-fence language tag such as "SPARQL" or "sparql".
    if query[:6].lower() == "sparql":
        query = query[6:].lstrip()

    return query

How is it without URI injection?

In [None]:
# Run inference and cleanup.
def generate_URI_injected_query(model, tokenizer, user_prompt, gt_query):
    """Few-shot baseline WITHOUT URI injection, under the URI-injection signature.

    Args:
        model: Model passed through to ``run_inference``.
        tokenizer: Tokenizer passed through to ``run_inference`` (may be None).
        user_prompt: The natural-language question.
        gt_query: Accepted for signature compatibility with the URI-injecting
            variants; not used by this variant.

    Returns:
        str: The generated text up to the closing code fence, with surrounding
        whitespace, the ``"Generated text:"`` label and a leading ``SPARQL``
        language tag (any casing) removed.
    """
    prompt = f"""Generator is an expert SPARQL query generator. For each question that the user supplies, the generator will convert it into a valid SPARQL query that can be used to answer the question. The query will be based on the DBpedia knowledge graph. The query should be enclosed by three backticks on new lines, denoting that it is a code block.
Human: In Breckland district, which forests are south of streams?
Generator: ```SELECT DISTINCT ?forest WHERE {{ yago:Breckland_District geo:hasGeometry ?o1 . ?o1 geo:asWKT ?geoWKT1 . ?forest rdf:type y2geoo:OSM_forest . ?forest geo:hasGeometry ?o2 . ?o2 geo:asWKT ?geoWKT2 . ?stream rdf:type y2geoo:OSM_stream . ?stream geo:hasGeometry ?o3 . ?o3 geo:asWKT ?geoWKT3 . FILTER (strdf:within(?geoWKT2, ?geoWKT1) && strdf:within(?geoWKT3, ?geoWKT1) && strdf:below(?geoWKT2, ?geoWKT3)) }}```
Human: How many streams intersect with lakes?
Generator: ```SELECT (COUNT (DISTINCT ?p1) as ?streams) WHERE {{ ?p1 rdf:type y2geoo:OSM_stream; geo:hasGeometry ?p1geo. ?p1geo geo:asWKT ?p1WKT. ?p2 rdf:type y2geoo:OSM_lake; geo:hasGeometry ?p2geo. ?p2geo geo:asWKT ?p2WKT. FILTER(geof:sfIntersects(?p1WKT, ?p2WKT)) }}```
Human: Which Municipalities are on Thessaly's border?
Generator: ```SELECT distinct ?rg where {{ yago:Thessaly geo:hasGeometry ?tgeo . ?tgeo geo:asWKT ?tgWKT . ?rg rdf:type y2geoo:GAG_Municipality . ?rg geo:hasGeometry ?rggeo . ?rggeo geo:asWKT ?rgWKT . FILTER (strdf:touches(?tgWKT,?rgWKT)) . }}```
Human: {user_prompt}
Generator: ```"""

    results = run_inference(model, tokenizer, prompt)
    completion = results[0]

    # run_inference prepends this label when it is called without a tokenizer;
    # it is not part of the query.
    label = "Generated text:"
    if completion.startswith(label):
        completion = completion[len(label):]

    # Keep only the text before the closing code fence (all of it when the
    # model never closes the block).
    query = completion.split("```", 1)[0].strip()

    # Drop a leading code-fence language tag such as "SPARQL" or "sparql".
    if query[:6].lower() == "sparql":
        query = query[6:].lstrip()

    return query

In [None]:
import json

# Load the ground-truth dataset; each record provides a 'Question' and a 'Query'.
with open('/kaggle/input/geoqa200/200_Sub_Dataset.json', 'r') as file:
    original_dataset = json.load(file)

# Create a new dataset with questions and generated queries.
# Uses whichever generate_URI_injected_query definition was executed last,
# plus the global `model` / `tokenizer` from the loading cells.
new_dataset = {}
for key, item in original_dataset.items():
    question = item['Question']
    gt_query = item['Query']  # ground truth, handed to the generator for URI extraction
    query = generate_URI_injected_query(model, tokenizer, question, gt_query)
    new_dataset[key] = {'Question': question, 'Query': query}
    #print (f"{key}/{len(original_dataset.items())}")

# Save the new dataset to a JSON file (same path as the other generation cells,
# so an earlier result file is overwritten).
with open('/kaggle/working/generated_dataset1.json', 'w') as file:
    json.dump(new_dataset, file, indent=4)

Evaluate the generated SPARQL queries.

In [None]:
!pip install rdflib

In [None]:
import re
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rdflib.plugins.sparql.parser import parseQuery

def normalize_variables(query):
    """Rename SPARQL variables to ``?var0``, ``?var1``, ... in order of first use.

    Every distinct variable gets exactly one canonical name, numbered
    consecutively by first appearance, and all of its occurrences are renamed
    consistently. Whole variable tokens are matched, so ``?p1`` and ``?p1geo``
    are treated as different variables.

    Args:
        query: SPARQL query text, or ``None``.

    Returns:
        str: The query with variables renamed; ``""`` when ``query`` is None.
    """
    if query is None:
        return ""

    canonical_names = {}

    def _rename(match):
        name = match.group(0)
        if name not in canonical_names:
            canonical_names[name] = f"?var{len(canonical_names)}"
        return canonical_names[name]

    # A single regex pass: avoids the substring clobbering and index gaps that
    # repeated str.replace() calls over the raw match list produce.
    return re.sub(r"\?\w+", _rename, query)

def is_parsable(query):
    """Return True when ``parseQuery`` accepts ``query``, False otherwise.

    Any exception raised while parsing is printed and reported as False.
    """
    try:
        parseQuery(query)
    except Exception as parse_error:
        print(f"Error parsing query: {parse_error}")
        return False
    return True

def extract_tokens(query):
    """Split ``query`` into its runs of word characters.

    Punctuation and whitespace act as separators and are dropped.
    Returns an empty list when ``query`` is None.
    """
    return [] if query is None else re.findall(r'\b\w+\b', query)

def mod_jaccard_similarity(list1, list2):
    """Multiset Jaccard similarity of two token lists.

    Token frequencies are respected: the intersection takes the minimum count
    of each token and the union the maximum. Returns 0.0 when both lists are
    empty.
    """
    counts_a, counts_b = Counter(list1), Counter(list2)

    union_size = sum((counts_a | counts_b).values())
    if union_size == 0:
        return 0.0

    shared_size = sum((counts_a & counts_b).values())
    return shared_size / union_size

def cosine_similarity_score(generated_query, reference_query):
    """Cosine similarity between the bag-of-words vectors of two queries.

    Both queries are variable-normalized and tokenized first; the token lists
    are then joined with spaces and counted with ``CountVectorizer``.
    """
    documents = [
        ' '.join(extract_tokens(normalize_variables(text)))
        for text in (generated_query, reference_query)
    ]

    # Rows: [generated, reference]; columns: vocabulary term counts.
    count_matrix = CountVectorizer().fit_transform(documents).toarray()

    # Off-diagonal entry of the 2x2 pairwise matrix = generated vs. reference.
    return cosine_similarity(count_matrix)[0, 1]

def bleu_score(generated_query, reference_query):
    """Sentence-level BLEU of the generated query against one reference.

    Both queries are variable-normalized and tokenized first; smoothing
    method 4 of ``SmoothingFunction`` is applied.
    """
    hypothesis = extract_tokens(normalize_variables(generated_query))
    reference = extract_tokens(normalize_variables(reference_query))

    return sentence_bleu(
        [reference],
        hypothesis,
        smoothing_function=SmoothingFunction().method4,
    )

def calculate_similarity_score(generated_query, reference_query):
    """Score a generated query against its reference.

    Returns:
        tuple: ``(bleu, hybrid_bleu)`` where ``hybrid_bleu`` is the weighted
        sum 0.2 * multiset Jaccard + 0.2 * cosine + 0.05 * parsability
        + 0.55 * BLEU. Parsability is 1 when ``is_parsable`` accepts the
        generated query, otherwise 0.
    """
    # Token-level overlap on variable-normalized queries.
    m_j_s = mod_jaccard_similarity(
        extract_tokens(normalize_variables(generated_query)),
        extract_tokens(normalize_variables(reference_query)),
    )
    # These helpers normalize and tokenize internally.
    cos_s = cosine_similarity_score(generated_query, reference_query)
    bleu = bleu_score(generated_query, reference_query)
    parsability_score = int(is_parsable(generated_query))

    hybrid_bleu = 0.2 * m_j_s + 0.2 * cos_s + 0.05 * parsability_score + 0.55 * bleu

    return bleu, hybrid_bleu

In [None]:
def evaluate_generations(dataset, generated_queries):
    """Average BLEU and hybrid BLEU of the generated queries over a dataset.

    Args:
        dataset: Mapping from key to a record whose ``'Query'`` is the
            ground-truth query.
        generated_queries: Mapping with the same keys whose ``'Query'`` is the
            generated query.

    Returns:
        tuple: ``(average_bleu, average_hybrid_bleu)``; both are 0 when the
        dataset is empty.

    Raises:
        KeyError: If a key of ``dataset`` is missing from ``generated_queries``.
    """
    bleus = []
    hybrid_bleus = []

    for key in dataset:
        reference_query = dataset[key]['Query']
        generated_query = generated_queries[key]['Query']

        # calculate_similarity_score expects (generated, reference). Passing
        # them the other way round scores the ground truth as the hypothesis
        # and measures parsability on the reference instead of the generation.
        bleu, hbleu = calculate_similarity_score(generated_query, reference_query)

        bleus.append(bleu)
        hybrid_bleus.append(hbleu)

    average_bleu = sum(bleus) / len(bleus) if bleus else 0
    average_hybrid_bleu = sum(hybrid_bleus) / len(hybrid_bleus) if hybrid_bleus else 0

    return average_bleu, average_hybrid_bleu

In [None]:
# Score the queries currently held in the global `new_dataset` against the
# 80-question ground truth. `new_dataset` must have been produced from this
# same dataset, because evaluate_generations() looks up every key in it.
# Relies on `json` imported by an earlier cell.
with open('/kaggle/input/80geoqa/80_Sub_Dataset.json', 'r') as file:
    original_dataset = json.load(file)

avg_bleu, avg_hbleu = evaluate_generations(original_dataset, new_dataset)
print(avg_bleu, avg_hbleu)

In [None]:
# Score the queries currently held in the global `new_dataset` against the
# 200-question ground truth. `new_dataset` must have been produced from this
# same dataset, because evaluate_generations() looks up every key in it.
# Relies on `json` imported by an earlier cell.
with open('/kaggle/input/geoqa200/200_Sub_Dataset.json', 'r') as file:
    original_dataset = json.load(file)

avg_bleu, avg_hbleu = evaluate_generations(original_dataset, new_dataset)
print(avg_bleu, avg_hbleu)

Now evaluate the models not by their query-generation ability but by comparing the results returned by the original and the generated queries. This test therefore gives the final accuracy score of the models.

In [None]:
import re

def format_query(query):
    """Prepare a dataset query for execution on the SPARQL endpoint.

    The query is prefixed with the namespace declarations it may rely on, the
    stSPARQL (``strdf:``) spatial functions are mapped to GeoSPARQL
    (``geof:``) functions, and ``strdf:buffer(?var, <distance>, uom:<unit>)``
    calls are replaced by the bare ``?var`` (i.e. the buffer is dropped).

    Args:
        query: SPARQL query text without PREFIX declarations.

    Returns:
        str: The rewritten query.
    """
    PREFIXES = """PREFIX geo: <http://www.opengis.net/ont/geosparql#>
PREFIX geof: <http://www.opengis.net/def/function/geosparql/>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
PREFIX yago: <http://yago-knowledge.org/resource/>
PREFIX y2geor: <http://kr.di.uoa.gr/yago2geo/resource/>
PREFIX y2geoo: <http://kr.di.uoa.gr/yago2geo/ontology/>
PREFIX strdf: <http://strdf.di.uoa.gr/ontology#>
PREFIX uom: <http://www.opengis.net/def/uom/OGC/1.0/>
PREFIX owl: <http://www.w3.org/2002/07/owl#>"""

    query = PREFIXES + ' ' + query

    # stSPARQL -> GeoSPARQL function names. The distance function is
    # geof:distance: GeoSPARQL defines no "sfDistance" (the sf* family only
    # contains the topological relations).
    function_map = {
        'strdf:within': 'geof:sfWithin',
        'strdf:contains': 'geof:sfContains',
        'strdf:overlaps': 'geof:sfOverlaps',
        'strdf:distance': 'geof:distance',
    }
    for strdf_name, geof_name in function_map.items():
        query = query.replace(strdf_name, geof_name)

    # Replace strdf:buffer(?var, distance, uom:unit) with ?var. The distance
    # may be an integer or a decimal number.
    query = re.sub(r'strdf:buffer\((\?\w+),\s*\d+(?:\.\d+)?,\s*uom:\w+\)', r'\1', query)

    return query

In [None]:
def gost_materialize_query(query: str, api_url="materialize-api", timeout=120):
    """Send ``query`` to the materialize API and return the rewritten query.

    The query is POSTed as JSON (``{"query": ...}``). On any failure (a
    transport error, a timeout, or a non-200 response) a message is printed
    and the original query is returned unchanged, so callers can always
    continue with a usable query string.

    Args:
        query: SPARQL query text.
        api_url: URL of the materialize endpoint (placeholder by default).
        timeout: Seconds to wait for the service before giving up.

    Returns:
        str: The response body on HTTP 200, otherwise ``query``.
    """
    # Imported locally so the function does not depend on a later cell having
    # been executed first.
    import json
    import requests

    payload = json.dumps({"query": query})
    headers = {'Content-Type': 'application/json'}

    try:
        response = requests.post(api_url, headers=headers, data=payload, timeout=timeout)
    except requests.exceptions.RequestException as err:
        # Same fallback as for a non-200 answer instead of aborting the run.
        print("Materialize failed:", err)
        return query

    if response.status_code == 200:
        return response.text

    print("Materialize failed:", response.text)
    return query

In [None]:
def graphdb_send_request(query, endpoint_url="endp_url", accept_format='application/sparql-results+json',
                         username=None, password=None, timeout=120):
    """Format, materialize and execute ``query`` against the SPARQL endpoint.

    Args:
        query: Raw dataset query (without PREFIX declarations).
        endpoint_url: URL of the SPARQL endpoint (placeholder by default).
        accept_format: Value of the ``Accept`` header; also selects whether the
            response is returned parsed (JSON) or as text.
        username: Basic-auth user. Defaults to the ``GRAPHDB_USERNAME``
            environment variable, then to the placeholder ``'username'``.
        password: Basic-auth password. Defaults to the ``GRAPHDB_PASSWORD``
            environment variable, then to the placeholder ``'password'``.
        timeout: Seconds to wait for the endpoint before giving up.

    Returns:
        The parsed JSON body when ``accept_format`` is
        ``'application/sparql-results+json'``, the response text for any other
        format, or ``None`` when the request failed (the error and both query
        versions are printed).
    """
    # Imported locally so the function does not depend on a later cell having
    # been executed first.
    import os
    import requests

    # Credentials come from the caller or the environment rather than being
    # baked into the notebook; the placeholders are kept only as a last resort.
    if username is None:
        username = os.environ.get("GRAPHDB_USERNAME", 'username')
    if password is None:
        password = os.environ.get("GRAPHDB_PASSWORD", 'password')

    # Format the query, this means add the correct prefixes and fix some endpoint issues with regex.
    query = format_query(query)
    original_query = query
    query = gost_materialize_query(query)

    headers = {
        'Accept': accept_format
    }

    params = {
        'query': query,
        'infer': 'true',
        'sameAs': 'true'
    }

    try:
        response = requests.get(
            endpoint_url,
            headers=headers,
            params=params,
            auth=requests.auth.HTTPBasicAuth(username, password),
            timeout=timeout,
        )
        response.raise_for_status()

        if accept_format == 'application/sparql-results+json':
            return response.json()
        return response.text
    except requests.exceptions.HTTPError as err:
        print(f"HTTP error occurred: {err}")
        print(f"Original query: {original_query} \n\n gost: {query}")
    except Exception as err:
        print(f"An error occurred: {err}")
        print(f"Original query: {original_query} \n\n gost: {query}")

    # Reached only after an error was reported above.
    return None

In [None]:
import json, requests

# Load the 100-question dataset and send every ground-truth query to the
# endpoint.
with open('/kaggle/input/100geoquestions/100_Sub_Dataset.json', 'r') as file:
    original_dataset = json.load(file)
    
for key in original_dataset:
    query = original_dataset[key]['Query']
    #print(query)
    # The return value is discarded here: only the error messages printed by
    # graphdb_send_request() are visible, and no comparison with generated
    # queries takes place in this cell.
    graphdb_send_request(query)