Our LOD integration with AGROVOC allows us to efficently associate the terms and concepts that we study to URIs provided by an authorative source.

In [None]:
import pandas as pd
from SPARQLWrapper import SPARQLWrapper, JSON
from sentence_transformers import SentenceTransformer, util

# Load sentence transformer for semantic matching
model = SentenceTransformer('all-MiniLM-L6-v2')

# Setup SPARQL endpoint
sparql = SPARQLWrapper("http://agrovoc.fao.org/sparql")

def query_agrovoc(value):
    query = f"""
    PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
    SELECT ?concept ?label WHERE {{
        ?concept skos:prefLabel ?label .
        FILTER(LANG(?label) = "en" && REGEX(?label, "{value}", "i"))
    }}
    """
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    results = sparql.query().convert()
    matches = []

    for result in results["results"]["bindings"]:
        label = result.get("label", {}).get("value")
        concept = result.get("concept", {}).get("value")
        
        if label and concept:
            matches.append((label, concept))
            
    print(matches)     
    return matches


def best_match(value, candidates):
    if not candidates:
        return None, None
    input_embedding = model.encode(value, convert_to_tensor=True)
    candidate_texts = [label for label, _ in candidates]
    candidate_embeddings = model.encode(candidate_texts, convert_to_tensor=True)
    scores = util.cos_sim(input_embedding, candidate_embeddings)[0]
    best_idx = scores.argmax().item()
    return candidates[best_idx]

def int_df_agrovoc(df, column):
    agrovoc_labels = []
    agrovoc_uris = []

    for val in df[column]:
        matches = query_agrovoc(val)
        best_label, best_uri = best_match(val, matches)
        agrovoc_labels.append(best_label)
        agrovoc_uris.append(best_uri)

    df["AGROVOC_label"] = agrovoc_labels
    df["AGROVOC_uri"] = agrovoc_uris
    return df


In [13]:
import pandas as pd

landuse_data = pd.read_csv("data/FAOSTAT_land_use_data.csv", encoding="utf-8")
landuse_data.head(10)

Unnamed: 0,Domain Code,Domain,Area Code (M49),Area,Element Code,Element,Item Code,Item,Year Code,Year,Unit,Value,Flag,Flag Description,Note
0,RL,Land Use,1,World,5110,Area,6601,Land area,2016,2016,1000 ha,13031770.0,E,Estimated value,
1,RL,Land Use,1,World,5110,Area,6601,Land area,2017,2017,1000 ha,13031240.0,E,Estimated value,
2,RL,Land Use,1,World,5110,Area,6601,Land area,2018,2018,1000 ha,13031520.0,B,Time series break,
3,RL,Land Use,1,World,5110,Area,6602,Agriculture,2016,2016,1000 ha,4831958.0,E,Estimated value,
4,RL,Land Use,1,World,5110,Area,6602,Agriculture,2017,2017,1000 ha,4855317.0,E,Estimated value,
5,RL,Land Use,1,World,5110,Area,6602,Agriculture,2018,2018,1000 ha,4841329.0,B,Time series break,
6,RL,Land Use,1,World,5110,Area,6610,Agricultural land,2016,2016,1000 ha,4785695.0,E,Estimated value,
7,RL,Land Use,1,World,5110,Area,6610,Agricultural land,2017,2017,1000 ha,4815944.0,E,Estimated value,
8,RL,Land Use,1,World,5110,Area,6610,Agricultural land,2018,2018,1000 ha,4804410.0,E,Estimated value,
9,RL,Land Use,1,World,5110,Area,6620,Cropland,2016,2016,1000 ha,1557773.0,E,Estimated value,


In [14]:
agro_enrich = int_df_agrovoc(landuse_data, "Item")
agro_enrich

[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]


Unnamed: 0,Domain Code,Domain,Area Code (M49),Area,Element Code,Element,Item Code,Item,Year Code,Year,Unit,Value,Flag,Flag Description,Note,AGROVOC_label,AGROVOC_uri
0,RL,Land Use,1,World,5110,Area,6601,Land area,2016,2016,1000 ha,13031770.0,E,Estimated value,,,
1,RL,Land Use,1,World,5110,Area,6601,Land area,2017,2017,1000 ha,13031240.0,E,Estimated value,,,
2,RL,Land Use,1,World,5110,Area,6601,Land area,2018,2018,1000 ha,13031520.0,B,Time series break,,,
3,RL,Land Use,1,World,5110,Area,6602,Agriculture,2016,2016,1000 ha,4831958.0,E,Estimated value,,,
4,RL,Land Use,1,World,5110,Area,6602,Agriculture,2017,2017,1000 ha,4855317.0,E,Estimated value,,,
5,RL,Land Use,1,World,5110,Area,6602,Agriculture,2018,2018,1000 ha,4841329.0,B,Time series break,,,
6,RL,Land Use,1,World,5110,Area,6610,Agricultural land,2016,2016,1000 ha,4785695.0,E,Estimated value,,,
7,RL,Land Use,1,World,5110,Area,6610,Agricultural land,2017,2017,1000 ha,4815944.0,E,Estimated value,,,
8,RL,Land Use,1,World,5110,Area,6610,Agricultural land,2018,2018,1000 ha,4804410.0,E,Estimated value,,,
9,RL,Land Use,1,World,5110,Area,6620,Cropland,2016,2016,1000 ha,1557773.0,E,Estimated value,,,
