In [None]:
import requests
import json
from collections import Counter
import pandas as pd
import regex as re

# AGROVOC SPARQL endpoint
AGROVOC_ENDPOINT = "http://agrovoc.fao.org/sparql"

def execute_sparql_query(query, endpoint=AGROVOC_ENDPOINT):
    """Execute a SPARQL query and return results as JSON"""
    headers = {
        'Accept': 'application/sparql-results+json',
        'Content-Type': 'application/x-www-form-urlencoded'
    }
    
    response = requests.post(endpoint, data={'query': query}, headers=headers)
    
    if response.status_code == 200:
        return response.json()
    else:
        print(f"Error: {response.status_code}")
        return None

# Query 1: Overview of concept types and their counts
query_concept_types = """
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX skos: <http://www.w3.org/2004/02/skos/core#>

SELECT ?type (COUNT(?concept) as ?count)
WHERE {
  ?concept rdf:type ?type .
  FILTER(STRSTARTS(STR(?concept), "http://aims.fao.org/aos/agrovoc/"))
}
GROUP BY ?type
ORDER BY DESC(?count)
"""

# Query 2: Sample concepts with their basic attributes
query_sample_concepts = """
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX skos: <http://www.w3.org/2004/02/skos/core#>

SELECT ?concept ?prefLabel ?altLabel ?definition ?broader ?narrower
WHERE {
  ?concept rdf:type skos:Concept .
  ?concept skos:prefLabel ?prefLabel .
  FILTER(LANG(?prefLabel) = "en")
  
  OPTIONAL { ?concept skos:altLabel ?altLabel . FILTER(LANG(?altLabel) = "en") }
  OPTIONAL { ?concept skos:definition ?definition . FILTER(LANG(?definition) = "en") }
  OPTIONAL { ?concept skos:broader ?broader }
  OPTIONAL { ?concept skos:narrower ?narrower }
}
LIMIT 50
"""

# Query 3: All properties used in AGROVOC
query_all_properties = """
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>

SELECT DISTINCT ?property (COUNT(?concept) as ?usage_count)
WHERE {
  ?concept ?property ?value .
  FILTER(STRSTARTS(STR(?concept), "http://aims.fao.org/aos/agrovoc/"))
}
GROUP BY ?property
ORDER BY DESC(?usage_count)
"""

# Query 4: Language distribution
query_languages = """
PREFIX skos: <http://www.w3.org/2004/02/skos/core#>

SELECT ?language (COUNT(?label) as ?count)
WHERE {
  ?concept skos:prefLabel ?label .
  BIND(LANG(?label) as ?language)
  FILTER(?language != "")
}
GROUP BY ?language
ORDER BY DESC(?count)
"""

# Query 5: Hierarchical structure depth analysis
query_hierarchy_depth = """
PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>

SELECT ?concept ?prefLabel ?broader_count ?narrower_count
WHERE {
  ?concept rdf:type skos:Concept .
  ?concept skos:prefLabel ?prefLabel .
  FILTER(LANG(?prefLabel) = "en")
  
  OPTIONAL {
    SELECT ?concept (COUNT(?broader) as ?broader_count) {
      ?concept skos:broader ?broader
    }
    GROUP BY ?concept
  }
  
  OPTIONAL {
    SELECT ?concept (COUNT(?narrower) as ?narrower_count) {
      ?concept skos:narrower ?narrower
    }
    GROUP BY ?concept
  }
}
ORDER BY DESC(?broader_count) DESC(?narrower_count)
LIMIT 100
"""

# Query 6: Top-level concepts (concepts without broader terms)
query_top_concepts = """
PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>

SELECT ?concept ?prefLabel ?narrower_count
WHERE {
  ?concept rdf:type skos:Concept .
  ?concept skos:prefLabel ?prefLabel .
  FILTER(LANG(?prefLabel) = "en")
  
  FILTER NOT EXISTS { ?concept skos:broader ?broader }
  
  OPTIONAL {
    SELECT ?concept (COUNT(?narrower) as ?narrower_count) {
      ?concept skos:narrower ?narrower
    }
    GROUP BY ?concept
  }
}
ORDER BY DESC(?narrower_count)
"""

# Query 7: Concepts with external mappings
query_external_mappings = """
PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>

SELECT ?concept ?prefLabel ?mapping_property ?external_concept
WHERE {
  ?concept rdf:type skos:Concept .
  ?concept skos:prefLabel ?prefLabel .
  FILTER(LANG(?prefLabel) = "en")
  
  ?concept ?mapping_property ?external_concept .
  FILTER(?mapping_property IN (skos:exactMatch, skos:closeMatch, skos:broadMatch, skos:narrowMatch, skos:relatedMatch))
}
LIMIT 100
"""

def analyze_agrovoc():
    """Run all exploratory queries and analyze results"""
    
    print("🔍 AGROVOC Dataset Exploration")
    print("=" * 50)
    
    # 1. Concept Types Analysis
    print("\n1. CONCEPT TYPES AND COUNTS:")
    print("-" * 30)
    results = execute_sparql_query(query_concept_types)
    if results:
        for binding in results['results']['bindings']:
            type_uri = binding['type']['value']
            count = binding['count']['value']
            print(f"{type_uri.split('/')[-1]}: {count}")
    
    # 2. Sample Concepts
    print("\n2. SAMPLE CONCEPTS WITH ATTRIBUTES:")
    print("-" * 40)
    results = execute_sparql_query(query_sample_concepts)
    if results:
        for i, binding in enumerate(results['results']['bindings'][:10]):  # Show first 10
            concept = binding['concept']['value'].split('/')[-1]
            pref_label = binding['prefLabel']['value']
            alt_label = binding.get('altLabel', {}).get('value', 'N/A')
            definition = binding.get('definition', {}).get('value', 'N/A')
            print(f"\nConcept {i+1}: {concept}")
            print(f"  Preferred Label: {pref_label}")
            print(f"  Alternative Label: {alt_label}")
            print(f"  Definition: {definition[:100]}..." if len(definition) > 100 else f"  Definition: {definition}")
    
    # 3. All Properties
    print("\n3. PROPERTIES USED IN AGROVOC:")
    print("-" * 35)
    results = execute_sparql_query(query_all_properties)
    if results:
        for binding in results['results']['bindings'][:20]:  # Show top 20
            prop = binding['property']['value']
            count = binding['usage_count']['value']
            prop_name = prop.split('/')[-1] if '/' in prop else prop.split('#')[-1]
            print(f"{prop_name}: {count}")
    
    # 4. Languages
    print("\n4. LANGUAGE DISTRIBUTION:")
    print("-" * 25)
    results = execute_sparql_query(query_languages)
    if results:
        for binding in results['results']['bindings'][:15]:  # Show top 15
            lang = binding['language']['value']
            count = binding['count']['value']
            print(f"{lang}: {count}")
    
    # 5. Top-level Concepts
    print("\n5. TOP-LEVEL CONCEPTS (ROOT CATEGORIES):")
    print("-" * 45)
    results = execute_sparql_query(query_top_concepts)
    if results:
        for binding in results['results']['bindings'][:15]:  # Show top 15
            concept = binding['concept']['value'].split('/')[-1]
            pref_label = binding['prefLabel']['value']
            narrower_count = binding.get('narrower_count', {}).get('value', '0')
            print(f"{pref_label} ({concept}): {narrower_count} subconcepts")
    
    # 6. External Mappings Sample
    print("\n6. EXTERNAL MAPPINGS (SAMPLE):")
    print("-" * 32)
    results = execute_sparql_query(query_external_mappings)
    if results:
        for binding in results['results']['bindings'][:10]:  # Show first 10
            concept = binding['concept']['value'].split('/')[-1]
            pref_label = binding['prefLabel']['value']
            mapping_prop = binding['mapping_property']['value'].split('#')[-1]
            external = binding['external_concept']['value']
            print(f"{pref_label} --{mapping_prop}--> {external}")

def get_concept_details(concept_uri):
    """Get detailed information about a specific concept"""
    
    query = f"""
    PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
    PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
    
    SELECT ?property ?value ?language
    WHERE {{
      <{concept_uri}> ?property ?value .
      OPTIONAL {{ BIND(LANG(?value) as ?language) }}
    }}
    """
    
    results = execute_sparql_query(query)
    if results:
        print(f"\nDetailed information for: {concept_uri}")
        print("-" * 50)
        
        for binding in results['results']['bindings']:
            prop = binding['property']['value']
            value = binding['value']['value']
            lang = binding.get('language', {}).get('value', '')
            
            prop_name = prop.split('/')[-1] if '/' in prop else prop.split('#')[-1]
            
            if lang:
                print(f"{prop_name} ({lang}): {value}")
            else:
                print(f"{prop_name}: {value}")

def search_concepts(search_term, limit=20):
    """Search for concepts containing a specific term"""
    
    query = f"""
    PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
    PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
    
    SELECT ?concept ?prefLabel ?altLabel ?definition
    WHERE {{
      ?concept rdf:type skos:Concept .
      ?concept skos:prefLabel ?prefLabel .
      FILTER(LANG(?prefLabel) = "en")
      FILTER(CONTAINS(LCASE(?prefLabel), LCASE("{search_term}")))
      
      OPTIONAL {{ ?concept skos:altLabel ?altLabel . FILTER(LANG(?altLabel) = "en") }}
      OPTIONAL {{ ?concept skos:definition ?definition . FILTER(LANG(?definition) = "en") }}
    }}
    ORDER BY ?prefLabel
    LIMIT {limit}
    """
    
    results = execute_sparql_query(query)
    if results:
        print(f"\nSearch results for '{search_term}':")
        print("-" * 40)
        
        for binding in results['results']['bindings']:
            concept = binding['concept']['value']
            pref_label = binding['prefLabel']['value']
            alt_label = binding.get('altLabel', {}).get('value', '')
            definition = binding.get('definition', {}).get('value', '')
            
            print(f"\n• {pref_label}")
            print(f"  URI: {concept}")
            if alt_label:
                print(f"  Alternative: {alt_label}")
            if definition:
                print(f"  Definition: {definition[:200]}..." if len(definition) > 200 else f"  Definition: {definition}")

if __name__ == "__main__":
    # Run the main analysis
    analyze_agrovoc()
    
    # Example usage:
    print("\n" + "="*50)
    print("EXAMPLE USAGE:")
    print("="*50)
    
    # Search for concepts related to "agriculture"
    search_concepts("agriculture", limit=10)
    
    # Get details for a specific concept (example)
    # get_concept_details("http://aims.fao.org/aos/agrovoc/c_49890")  # Example concept URI

🔍 AGROVOC Dataset Exploration

1. CONCEPT TYPES AND COUNTS:
------------------------------
skos-xl#Label: 1219844
core#Concept: 41447
Image: 60
core#Collection: 3
owl#ObjectProperty: 1
owl#Ontology: 1

2. SAMPLE CONCEPTS WITH ATTRIBUTES:
----------------------------------------

Concept 1: c_4788
  Preferred Label: methods
  Alternative Label: ways of doing
  Definition: N/A

Concept 2: c_4788
  Preferred Label: methods
  Alternative Label: ways of doing
  Definition: N/A

Concept 3: c_4788
  Preferred Label: methods
  Alternative Label: ways of doing
  Definition: N/A

Concept 4: c_4788
  Preferred Label: methods
  Alternative Label: ways of doing
  Definition: N/A

Concept 5: c_4788
  Preferred Label: methods
  Alternative Label: ways of doing
  Definition: N/A

Concept 6: c_4788
  Preferred Label: methods
  Alternative Label: ways of doing
  Definition: N/A

Concept 7: c_4788
  Preferred Label: methods
  Alternative Label: ways of doing
  Definition: N/A

Concept 8: c_4788
  Preferr

In [21]:
import os
import json
import string
import pandas as pd
from SPARQLWrapper import SPARQLWrapper, JSON
from sentence_transformers import SentenceTransformer, util
import regex as re

In [22]:
# Initialize SPARQL endpoint and model
sparql = SPARQLWrapper("http://agrovoc.fao.org/sparql")
model = SentenceTransformer('all-MiniLM-L6-v2')

CACHE_PATH = "agro_cache2.json"

In [None]:
def load_cache():
    if not os.path.exists(CACHE_PATH):
        return {}
    with open(CACHE_PATH, "r", encoding="utf-8") as f:
        cache = json.load(f)
        return cache

def save_cache(cache):
    with open(CACHE_PATH, "w", encoding="utf-8") as f:
        json.dump(cache, f, indent=2)
        

def clean_cache():
    if not os.path.exists(CACHE_PATH):
        return
    with open(CACHE_PATH, "r", encoding="utf-8") as f:
        cache = json.load(f)

    cleaned_cache = {
        key: value
        for key, value in cache.items()
        if value.get("label") is not None
    }

    with open(CACHE_PATH, "w", encoding="utf-8") as f:
        json.dump(cleaned_cache, f, indent=2)
    print("The cache has been cleaned from all empty entries!")

def norm_cache():
    if not os.path.exists(CACHE_PATH):
        return
    with open(CACHE_PATH, "r", encoding="utf-8") as f:
        cache = json.load(f)

        for entry in cache:
            entry = entry.lower().replace(",", "")
            entry["altLabel"] = []


In [32]:
# Initialize SPARQL endpoint and model
def semantic_best_match(value, candidates):
    input_embedding = model.encode(value, convert_to_tensor=True)
    candidate_texts = []
    for c in candidates:
        candidate_texts.append(c["label"])
    candidate_embeddings = model.encode(candidate_texts, convert_to_tensor=True)
    scores = util.cos_sim(input_embedding, candidate_embeddings)[0]
    best_idx = scores.argmax().item()
    return candidates[best_idx]


def query_agrovoc(value, cache):

    norm_value = value.lower().replace(",", "")

    if norm_value in cache:
        print(f"Cache hit: '{value}' found in cache")
        return cache[norm_value]
    
    for entry in cache.values():
        if "altLabels" in entry and value in entry["altLabels"]:
            print(f"Cache hit: '{value}' found in cache")
            return entry
    
    
    print(f"Cache miss: '{value}' not in cache, querying AGROVOC endpoint")
    # Run SPARQL query
    query = f"""
        PREFIX skos: <http://www.w3.org/2004/02/skos/core#>

        SELECT ?concept ?label (GROUP_CONCAT(?altLabel; separator=" | ") AS ?altLabels) WHERE {{
        ?concept skos:prefLabel ?label .
        OPTIONAL {{
            ?concept skos:altLabel ?altLabel .
            FILTER(LANG(?altLabel) = "en")
        }}

        FILTER(LANG(?label) = "en")

        FILTER(
            CONTAINS(LCASE(?label), "{norm_value}") ||
            (BOUND(?altLabel) && CONTAINS(LCASE(?altLabel), "{norm_value}"))
        )
        }}
        GROUP BY ?concept ?label

    """
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    results = sparql.query().convert()
    candidates = []
    for result in results["results"]["bindings"]:
        label = result["label"]["value"]
        uri = result["concept"]["value"]
        altlabels_str = result["altLabels"]["value"] if "altLabels" in result else ""
        altlabels = [al.strip().lower() for al in altlabels_str.split("|") if al.strip()]
        candidate = {"label": label, "uri": uri, "altLabels": altlabels}
        candidates.append(candidate)
        
    if not candidates:
        match = {"label": None, "uri": None, "altLabels": []}
    else:
        best = semantic_best_match(value, candidates)
        match = best
    # Cache the result
    cache[norm_value] = match
    save_cache(cache)
    return match


def enrich_with_agrovoc(df, column_name):
    cache = load_cache()
    labels = []
    uris = []
    for val in df[column_name]:
        match = query_agrovoc(val, cache)
        labels.append(match["label"])
        uris.append(match["uri"])
    df["AGROVOC_label"] = labels
    df["AGROVOC_uri"] = uris
    return df

In [33]:
clean_cache()
food_consumption_data = pd.read_csv("data/chronic_consumption_gday_allsubjects.csv", encoding="utf-16")

# Group by 'Exposure hierarchy (L7)', sum the 'Mean' values, and get the top 15
food_consumption_data_nowater = food_consumption_data[
    (food_consumption_data["Exposure hierarchy (L7)"] != "Natural mineral water") &
    (food_consumption_data["Exposure hierarchy (L7)"] != "Tap water") &
    (food_consumption_data["Exposure hierarchy (L7)"] != "Filtered tap water")]

top = (
    food_consumption_data_nowater
    .groupby('Exposure hierarchy (L7)', as_index=False)['Mean']
    .sum()
    .sort_values('Mean', ascending=False)
    
)

top_food_consumption_data_agrovoc = enrich_with_agrovoc(top, "Exposure hierarchy (L7)")
top_food_consumption_data_agrovoc.to_csv(r"final_data/top_food_consumption_agrovoc.csv")

Cache miss: 'Cow milk, semi skimmed (half fat)' not in cache, querying AGROVOC endpoint
Cache miss: 'Coffee (average strength) beverage' not in cache, querying AGROVOC endpoint
Cache hit: 'Apples' found in cache
Cache hit: 'Potatoes' found in cache
Cache miss: 'Wheat bread and rolls, white (refined flour)' not in cache, querying AGROVOC endpoint
Cache miss: 'Wine, red' not in cache, querying AGROVOC endpoint
Cache miss: 'Chicken fresh meat' not in cache, querying AGROVOC endpoint
Cache miss: 'Cow milk, whole' not in cache, querying AGROVOC endpoint
Cache miss: 'Dried durum pasta' not in cache, querying AGROVOC endpoint
Cache miss: 'Olive oil, virgin or extra-virgin' not in cache, querying AGROVOC endpoint
Cache miss: 'Coffee espresso (beverage)' not in cache, querying AGROVOC endpoint
Cache miss: 'Beer' not in cache, querying AGROVOC endpoint
Cache miss: 'Cow, ox or bull fresh meat' not in cache, querying AGROVOC endpoint
Cache miss: 'Cola beverages, caffeinic' not in cache, querying A

KeyboardInterrupt: 