In [23]:
from google import genai
from google.genai import types

client = genai.Client()

def get_response(prompt, instruction):
    response = client.models.generate_content(
    model="gemini-2.5-flash-lite",
    config=types.GenerateContentConfig(
        system_instruction=instruction),
    contents=prompt
)
    return response.text


In [48]:
instruction = """
You are an expert in entity recognition and relation extraction. 
You will be given a question and you need to identify and extract all the entities and properties mentioned in the question. 
Please provide a list of entities and properties found in the following JSON format. Make sure to use double quotes for both the keys and string values. Make sure to use the raw json output instead of a markdown code block.

{
  "Entities": ["entity1", "entity2", "..."],
  "Properties": ["property1", "property2", "..."]
}

If no entities or properties are found, please return empty lists.
"""

prompt = "What data format does CHEMDNER corpus have?"
# prompt = "Where did the study with maximal geographic scale take place?"

response = get_response(prompt, instruction)
print(response)

```json
{
  "Entities": ["CHEMDNER corpus"],
  "Properties": ["data format"]
}
```


In [None]:


# Example usage
prompt = "What data format does CHEMDNER corpus have?"
response = get_response(prompt, instruction)  # <-- LLM output
print("Raw LLM output:", response)
print("Is response valid?", validate_response(response))



Raw LLM output: {
  "Entities": ["CHEMDNER corpus"],
  "Properties": ["data format"]
}
Is response valid? True


In [41]:
def build_sparql_query_entity(text: str) -> str:
    """
    Build a SPARQL query that filters resources by requiring all words
    in `text` to appear in the label (case-insensitive).
    """
    # Split text into lowercase tokens
    tokens = text.lower().split()

    # Build FILTER expression: require all tokens (AND logic)
    filter_parts = [f'CONTAINS(LCASE(STR(?label)), "{t}")' for t in tokens]
    # filter_expr = " && ".join(filter_parts)  # AND: all words must appear
    filter_expr = " || ".join(filter_parts)  # OR logic: any word can appear

    query = f"""
        PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>

        SELECT DISTINCT ?resource ?label
        WHERE {{
        ?resource ?p ?o .
        ?resource rdfs:label ?label .
        FILTER ( {filter_expr} )
        }}
        """
    return query

query = build_sparql_query_entity("CHEMDNER corpus")
print(f"query: {query}")

from helper import run_sparql_query
results = run_sparql_query(sparql_text = query, SPARQLPATH="http://localhost:8890/sparql")
print(results)



query: 
        PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>

        SELECT DISTINCT ?resource ?label
        WHERE {
        ?resource ?p ?o .
        ?resource rdfs:label ?label .
        FILTER ( CONTAINS(LCASE(STR(?label)), "chemdner") || CONTAINS(LCASE(STR(?label)), "corpus") )
        }
        
{"head": {"vars": ["resource", "label"]}, "results": {"bindings": [{"resource": {"type": "uri", "value": "http://orkg.org/orkg/resource/R36091"}, "label": {"type": "typed-literal", "datatype": "http://www.w3.org/2001/XMLSchema#string", "value": "A Large Public Corpus of Web Tables containing Time and Context Metadata"}}, {"resource": {"type": "uri", "value": "http://orkg.org/orkg/predicate/P59232"}, "label": {"type": "typed-literal", "datatype": "http://www.w3.org/2001/XMLSchema#string", "value": " Computational Linguistics & Corpus Annotation"}}, {"resource": {"type": "uri", "value": "http://orkg.org/orkg/resource/R44348"}, "label": {"type": "typed-literal", "datatype": "http://

In [37]:
def build_sparql_query_properties(text: str) -> str:
    """
    Build a SPARQL query that filters resources by requiring all words
    in `text` to appear in the label (case-insensitive).
    """
    # Split text into lowercase tokens
    tokens = text.lower().split()

    # Build FILTER expression: require all tokens (AND logic)
    filter_parts = [f'CONTAINS(LCASE(STR(?label)), "{t}")' for t in tokens]
    # filter_expr = " && ".join(filter_parts)  # AND: all words must appear
    filter_expr = " || ".join(filter_parts)  # OR logic: any word can appear

    query = f"""
        PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>

        SELECT DISTINCT ?p ?label
        WHERE {{
        ?s ?p ?o .
        ?p rdfs:label ?label .
        FILTER ( {filter_expr} )
        }}
        """
    return query

# query = build_sparql_query("CHEMDNER corpus")
query = build_sparql_query_properties("maximal geographic scale")
print(f"query: {query}")

from helper import run_sparql_query
results = run_sparql_query(sparql_text = query, SPARQLPATH="http://localhost:8890/sparql")
print(results)



query: 
        PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>

        SELECT DISTINCT ?p ?label
        WHERE {
        ?s ?p ?o .
        ?p rdfs:label ?label .
        FILTER ( CONTAINS(LCASE(STR(?label)), "maximal") || CONTAINS(LCASE(STR(?label)), "geographic") || CONTAINS(LCASE(STR(?label)), "scale") )
        }
        
{"head": {"vars": ["p", "label"]}, "results": {"bindings": [{"p": {"type": "uri", "value": "http://orkg.org/orkg/predicate/P44135"}, "label": {"type": "typed-literal", "datatype": "http://www.w3.org/2001/XMLSchema#string", "value": "Evaluates how the benefit of working with smaller vocabularies for numbers and geographic locations"}}, {"p": {"type": "uri", "value": "http://orkg.org/orkg/predicate/P15132"}, "label": {"type": "typed-literal", "datatype": "http://www.w3.org/2001/XMLSchema#string", "value": "Activity\u2019s size (small scale of operat.)"}}, {"p": {"type": "uri", "value": "http://orkg.org/orkg/predicate/P37256"}, "label": {"type": "typed-literal

In [1]:
# read a csv file with columns: "p","label" into a pandas dataframe
import pandas as pd
property_df = pd.read_csv("/Users/sherrypan/GitHub/GAR_SKGQA/datasets/sciqa/project_data/property-labels.csv")
print(property_df.head())

                                                  p               label
0     http://www.w3.org/2002/07/owl#equivalentClass     equivalentClass
1  http://www.w3.org/2002/07/owl#equivalentProperty  equivalentProperty
2        http://www.w3.org/2002/07/owl#complementOf        complementOf
3             http://www.w3.org/2002/07/owl#unionOf             unionOf
4             http://www.w3.org/2002/07/owl#imports             imports


In [2]:
property_ls = property_df["label"].tolist()

In [3]:
len(property_ls)

6892

In [18]:
## entity embedding 
# Requires transformers>=4.51.0
# Requires sentence-transformers>=2.7.0
import time
from sentence_transformers import SentenceTransformer

# Load the model
model = SentenceTransformer("Qwen/Qwen3-Embedding-0.6B")

In [8]:
candidate_properties = property_ls
print("Start encoding candidate properties...")
start = time.time()
candidate_embeddings = model.encode(candidate_properties)
print("Finish encoding candidate properties...")
end = time.time()
print(f"Time taken to encode {len(candidate_properties)} candidate properties: {end - start} seconds")

Start encoding candidate properties...
Finish encoding candidate properties...
Time taken to encode 6892 candidate properties: 19.817121028900146 seconds


In [22]:
source_properties = [
    "maximal geographic scale",
    "data format"
]
source_embeddings = model.encode(source_properties, prompt_name="query")
# Compute the (cosine) similarity between the source and candidates embeddings
similarity = model.similarity(source_embeddings, candidate_embeddings)

# print the top 5 most similar candidate properties for each source property
import torch
similarity = torch.tensor(similarity)
top_k = 5
values, indices = torch.topk(similarity, k=top_k, dim=1)
top_candidates = []
top_scores = []
top_sources = []
for i, source_property in enumerate(source_properties):
    # print(f"Source property: {source_property}")
    for j in range(top_k):
        candidate_property = candidate_properties[indices[i][j]]
        sim_score = values[i][j].item()
        # print(f"  Candidate property: {candidate_property}, similarity score: {sim_score:.4f}")
        top_candidates.append(candidate_property)
        top_scores.append(f"{sim_score:.4f}")
        top_sources.append(source_property)
top_df = pd.DataFrame({
    "source_property": top_sources,
    "candidate_property": top_candidates,
    "similarity_score": top_scores
})
top_df.head(10)

  similarity = torch.tensor(similarity)


Unnamed: 0,source_property,candidate_property,similarity_score
0,maximal geographic scale,Geographic scale (Km²),0.5569
1,maximal geographic scale,Geographical scope,0.5455
2,maximal geographic scale,geographical coverage,0.4905
3,maximal geographic scale,Scale factor,0.4757
4,maximal geographic scale,Scale economies evaluated in the approximatio...,0.473
5,data format,Data formats,0.6974
6,data format,Data format,0.6402
7,data format,text data format,0.5891
8,data format,dataset format,0.5265
9,data format,file format,0.5095


In [30]:
import json
from pydantic import BaseModel, ValidationError

# Define a Pydantic model for the expected JSON structure
class User(BaseModel):
    name: str
    age: int
    city: str

def validate_llm_output_with_pydantic(llm_output):
    """
    Checks if a string is valid JSON and conforms to the Pydantic schema.

    Args:
        llm_output (str): The string to validate.

    Returns:
        bool: True if validation is successful, False otherwise.
    """
    try:
        # First, parse the string as JSON
        data = json.loads(llm_output)

        # Then, validate the parsed data against the Pydantic model
        _ = User(**data)

        # If both steps succeed, the output is valid
        print("Validation successful!")
        return True

    except (json.JSONDecodeError, ValidationError) as e:
        # Catch errors from both json.loads() and Pydantic validation
        print(f"Validation failed with error: {e}")
        return False

# --- Example Usage ---

# Example 1: Valid JSON that matches the schema
valid_output = '{"name": "Alice", "age": 30, "city": "New York"}'
print(f"Testing valid output: {valid_output}")
validate_llm_output_with_pydantic(valid_output)
# Expected output: Validation successful!

print("-" * 20)

# Example 2: Invalid JSON format
invalid_format_output = 'This is not JSON.'
print(f"Testing invalid format output: {invalid_format_output}")
validate_llm_output_with_pydantic(invalid_format_output)
# Expected output: Validation failed with error: JSONDecodeError

print("-" * 20)

# Example 3: Valid JSON, but fails Pydantic schema (missing a key)
invalid_schema_output_1 = '{"name": "Bob", "age": 25}'
print(f"Testing invalid schema output (missing key): {invalid_schema_output_1}")
validate_llm_output_with_pydantic(invalid_schema_output_1)
# Expected output: Validation failed with error: ValidationError

print("-" * 20)

# Example 4: Valid JSON, but fails Pydantic schema (wrong data type)
invalid_schema_output_2 = '{"name": "Charlie", "age": "35", "city": "London"}'
print(f"Testing invalid schema output (wrong data type): {invalid_schema_output_2}")
validate_llm_output_with_pydantic(invalid_schema_output_2)
# Expected output: Validation failed with error: ValidationError

Testing valid output: {"name": "Alice", "age": 30, "city": "New York"}
Validation successful!
--------------------
Testing invalid format output: This is not JSON.
Validation failed with error: Expecting value: line 1 column 1 (char 0)
--------------------
Testing invalid schema output (missing key): {"name": "Bob", "age": 25}
Validation failed with error: 1 validation error for User
city
  Field required [type=missing, input_value={'name': 'Bob', 'age': 25}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.11/v/missing
--------------------
Testing invalid schema output (wrong data type): {"name": "Charlie", "age": "35", "city": "London"}
Validation successful!


True

In [None]:
import json
from pydantic import BaseModel, ValidationError, conlist



# --- Example Usage ---

# 1. Valid output with content
valid_output = '{"Entities": ["entity1", "entity2"], "Properties": ["property1"]}'
print(f"Validating: {valid_output}")
validate_llm_output_with_pydantic(valid_output)

# 2. Valid output with empty lists
valid_output_empty = '{"Entities": [], "Properties": []}'
print(f"\nValidating: {valid_output_empty}")
validate_llm_output_with_pydantic(valid_output_empty)

# 3. Invalid output (missing a key)
invalid_output_missing_key = '{"Entities": ["entity1"]}'
print(f"\nValidating: {invalid_output_missing_key}")
validate_llm_output_with_pydantic(invalid_output_missing_key)

# 4. Invalid output (wrong data type for a list item)
invalid_output_wrong_type = '{"Entities": [123, "entity2"], "Properties": ["property1"]}'
print(f"\nValidating: {invalid_output_wrong_type}")
validate_llm_output_with_pydantic(invalid_output_wrong_type)

Validating: {"Entities": ["entity1", "entity2"], "Properties": ["property1"]}
✅ Validation successful!

Validating: {"Entities": [], "Properties": []}
✅ Validation successful!

Validating: {"Entities": ["entity1"]}
❌ Validation failed: 1 validation error for EntityProperty
Properties
  Field required [type=missing, input_value={'Entities': ['entity1']}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.11/v/missing

Validating: {"Entities": [123, "entity2"], "Properties": ["property1"]}
❌ Validation failed: 1 validation error for EntityProperty
Entities.0
  Input should be a valid string [type=string_type, input_value=123, input_type=int]
    For further information visit https://errors.pydantic.dev/2.11/v/string_type


False