In [32]:
from google import genai
from google.genai import types

# Shared Gen AI client, created once at module level and reused by get_response.
# It is constructed with no arguments; presumably credentials are picked up
# from the environment — TODO confirm.
client = genai.Client()

def get_response(prompt, instruction):
    """Send one request to the Gemini model and return the reply text.

    Args:
        prompt: Content passed as `contents` of the request.
        instruction: Text passed as the system instruction of the request.

    Returns:
        The `text` attribute of the object returned by
        `client.models.generate_content`.
    """
    reply = client.models.generate_content(
        model="gemini-2.5-flash-lite",
        # The system instruction travels inside the generation config.
        config=types.GenerateContentConfig(system_instruction=instruction),
        contents=prompt,
    )
    return reply.text


In [38]:
# System instruction for the entity/property extraction request.
# The literal must open with exactly three quotes: a fourth quote becomes a
# stray `"` character at the very start of the prompt text.
instruction = """
You are an expert in entity recognition and relation extraction. 
You will be given a question and you need to identify and extract all the entities and properties mentioned in the question. 
Please provide a list of entities and properties found in the following json format: 
{
  "Entities": [entity1, entity2, ...],
  "Properties": [property1, property2, ...]
}   
If no entities or properties are found, please return empty lists.
"""
# Question sent to the model; the commented-out line is an alternative question.
prompt = "What data format does CHEMDNER corpus have?"
# prompt = "Where did the study with maximal geographic scale take place?"

# Send the question together with the system instruction and print the reply text.
response = get_response(prompt, instruction)
print(response)

ServerError: 500 INTERNAL. {'error': {'code': 500, 'message': 'An internal error has occurred. Please retry or report in https://developers.generativeai.google/guide/troubleshooting', 'status': 'INTERNAL'}}

In [None]:
def build_sparql_query_entity(text: str, match_all: bool = False) -> str:
    """
    Build a SPARQL query that selects resources whose `rdfs:label` contains
    words from `text` (case-insensitive substring match).

    Args:
        text: Free text; it is lower-cased and split on whitespace into tokens.
        match_all: If False (default), a label matches when it contains ANY
            token (OR logic). If True, it must contain EVERY token (AND logic).

    Returns:
        The SPARQL query string selecting distinct `?resource ?label` pairs.

    Raises:
        ValueError: If `text` contains no tokens, because an empty FILTER
            expression would not be valid SPARQL.
    """
    # Split text into lowercase tokens
    tokens = text.lower().split()
    if not tokens:
        raise ValueError("text must contain at least one non-whitespace token")

    # Escape backslashes first, then double quotes, so a token can never
    # terminate the SPARQL string literal it is embedded in.
    safe_tokens = [t.replace("\\", "\\\\").replace('"', '\\"') for t in tokens]

    # One CONTAINS test per token, combined with AND or OR as requested.
    filter_parts = [f'CONTAINS(LCASE(STR(?label)), "{t}")' for t in safe_tokens]
    joiner = " && " if match_all else " || "
    filter_expr = joiner.join(filter_parts)

    query = f"""
        PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>

        SELECT DISTINCT ?resource ?label
        WHERE {{
        ?resource ?p ?o .
        ?resource rdfs:label ?label .
        FILTER ( {filter_expr} )
        }}
        """
    return query

# NOTE(review): `build_sparql_query` is not defined in the visible cells —
# presumably an older name for build_sparql_query_entity; confirm before re-enabling.
# query = build_sparql_query("CHEMDNER corpus")
query = build_sparql_query_entity("maximal geographic scale")
print(f"query: {query}")

# run_sparql_query comes from the project-local `helper` module; presumably it
# submits the query to the given endpoint and returns the response — confirm.
from helper import run_sparql_query
results = run_sparql_query(sparql_text = query, SPARQLPATH="http://localhost:8890/sparql")
print(results)



query: 
        PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>

        SELECT DISTINCT ?resource ?label
        WHERE {
        ?resource rdfs:label ?label .
        FILTER ( CONTAINS(LCASE(STR(?label)), "maximal") || CONTAINS(LCASE(STR(?label)), "geographic") || CONTAINS(LCASE(STR(?label)), "scale") )
        }
        
{"head": {"vars": ["resource", "label"]}, "results": {"bindings": [{"resource": {"type": "uri", "value": "http://orkg.org/orkg/resource/R57231"}, "label": {"type": "typed-literal", "datatype": "http://www.w3.org/2001/XMLSchema#string", "value": "Predicting the landscape-scale distribution of alien plants and their threat to plant diversity"}}, {"resource": {"type": "uri", "value": "http://orkg.org/orkg/resource/R53243"}, "label": {"type": "typed-literal", "datatype": "http://www.w3.org/2001/XMLSchema#string", "value": "Associations between a highly invasive species and native macrophytes differ across spatial scales"}}, {"resource": {"type": "uri", "value": "htt

In [37]:
def build_sparql_query_properties(text: str, match_all: bool = False) -> str:
    """
    Build a SPARQL query that selects predicates whose `rdfs:label` contains
    words from `text` (case-insensitive substring match).

    Only predicates that occur in the predicate position of at least one
    triple (`?s ?p ?o`) are matched.

    Args:
        text: Free text; it is lower-cased and split on whitespace into tokens.
        match_all: If False (default), a label matches when it contains ANY
            token (OR logic). If True, it must contain EVERY token (AND logic).

    Returns:
        The SPARQL query string selecting distinct `?p ?label` pairs.

    Raises:
        ValueError: If `text` contains no tokens, because an empty FILTER
            expression would not be valid SPARQL.
    """
    # Split text into lowercase tokens
    tokens = text.lower().split()
    if not tokens:
        raise ValueError("text must contain at least one non-whitespace token")

    # Escape backslashes first, then double quotes, so a token can never
    # terminate the SPARQL string literal it is embedded in.
    safe_tokens = [t.replace("\\", "\\\\").replace('"', '\\"') for t in tokens]

    # One CONTAINS test per token, combined with AND or OR as requested.
    filter_parts = [f'CONTAINS(LCASE(STR(?label)), "{t}")' for t in safe_tokens]
    joiner = " && " if match_all else " || "
    filter_expr = joiner.join(filter_parts)

    query = f"""
        PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>

        SELECT DISTINCT ?p ?label
        WHERE {{
        ?s ?p ?o .
        ?p rdfs:label ?label .
        FILTER ( {filter_expr} )
        }}
        """
    return query

# NOTE(review): `build_sparql_query` is not defined in the visible cells —
# presumably an older name for build_sparql_query_properties; confirm before re-enabling.
# query = build_sparql_query("CHEMDNER corpus")
query = build_sparql_query_properties("maximal geographic scale")
print(f"query: {query}")

# run_sparql_query comes from the project-local `helper` module; presumably it
# submits the query to the given endpoint and returns the response — confirm.
from helper import run_sparql_query
results = run_sparql_query(sparql_text = query, SPARQLPATH="http://localhost:8890/sparql")
print(results)



query: 
        PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>

        SELECT DISTINCT ?p ?label
        WHERE {
        ?s ?p ?o .
        ?p rdfs:label ?label .
        FILTER ( CONTAINS(LCASE(STR(?label)), "maximal") || CONTAINS(LCASE(STR(?label)), "geographic") || CONTAINS(LCASE(STR(?label)), "scale") )
        }
        
{"head": {"vars": ["p", "label"]}, "results": {"bindings": [{"p": {"type": "uri", "value": "http://orkg.org/orkg/predicate/P44135"}, "label": {"type": "typed-literal", "datatype": "http://www.w3.org/2001/XMLSchema#string", "value": "Evaluates how the benefit of working with smaller vocabularies for numbers and geographic locations"}}, {"p": {"type": "uri", "value": "http://orkg.org/orkg/predicate/P15132"}, "label": {"type": "typed-literal", "datatype": "http://www.w3.org/2001/XMLSchema#string", "value": "Activity\u2019s size (small scale of operat.)"}}, {"p": {"type": "uri", "value": "http://orkg.org/orkg/predicate/P37256"}, "label": {"type": "typed-literal

In [None]:
## entity embedding
# Sentence-embedding demo: encode sample queries and documents, then compare them.
# NOTE(review): the sample sentences below are generic and unrelated to the
# entities/properties handled in the earlier cells — presumably a placeholder
# to be replaced with real labels; confirm.
# Requires transformers>=4.51.0
# Requires sentence-transformers>=2.7.0

from sentence_transformers import SentenceTransformer

# Load the model
# NOTE(review): the checkpoint name suggests an 8B-parameter model, so loading
# it may need substantial memory and download time — confirm before Run All.
model = SentenceTransformer("Qwen/Qwen3-Embedding-8B")

# We recommend enabling flash_attention_2 for better acceleration and memory saving,
# together with setting `padding_side` to "left":
# model = SentenceTransformer(
#     "Qwen/Qwen3-Embedding-8B",
#     model_kwargs={"attn_implementation": "flash_attention_2", "device_map": "auto"},
#     tokenizer_kwargs={"padding_side": "left"},
# )

# The queries and documents to embed
queries = [
    "What is the capital of China?",
    "Explain gravity",
]
documents = [
    "The capital of China is Beijing.",
    "Gravity is a force that attracts two bodies towards each other. It gives weight to physical objects and is responsible for the movement of planets around the sun.",
]

# Encode the queries and documents. Note that queries benefit from using a prompt
# Here we use the prompt called "query" stored under `model.prompts`, but you can
# also pass your own prompt via the `prompt` argument
query_embeddings = model.encode(queries, prompt_name="query")
# Documents are encoded without a prompt name.
document_embeddings = model.encode(documents)

# Compute the (cosine) similarity between the query and document embeddings
similarity = model.similarity(query_embeddings, document_embeddings)
print(similarity)
# Expected output as quoted in the original comment (not re-verified here):
# tensor([[0.7493, 0.0751],
#         [0.0880, 0.6318]])
