### AGROVOC Integration and Semantic Matching

In [2]:
import os
import json
import string
import pandas as pd
from SPARQLWrapper import SPARQLWrapper, JSON
from sentence_transformers import SentenceTransformer, util

* **SPARQLWrapper**: Used to connect and query the AGROVOC SPARQL endpoint (`http://agrovoc.fao.org/sparql`).
* **SentenceTransformer**: An 'all-MiniLM-L6-v2' model is initialized to perform semantic similarity calculations. This is crucial for finding the best match from AGROVOC candidates when a direct exact match is not found.
* **Caching Mechanism**: Functions `load_cache()`and `save_cache()` are defined to store and retrieve AGROVOC query results. This prevents redundant queries to the external endpoint, improving efficiency.`clean_cache()` is designed to clear the cache from empty entries (in the case the SPARQL query did not return anything) by creating a `cleaned_cache` dictionary, including only entries where the `value` dictionary contains a "label" key with a non-`None` value. 

In [3]:
# Initialize SPARQL endpoint and model
sparql = SPARQLWrapper("http://agrovoc.fao.org/sparql")
model = SentenceTransformer('all-MiniLM-L6-v2')

CACHE_PATH = "agro_cache.json"

In [4]:
def load_cache():
    if not os.path.exists(CACHE_PATH):
        return {}
    with open(CACHE_PATH, "r", encoding="utf-8") as f:
        cache = json.load(f)
        return cache

def save_cache(cache):
    with open(CACHE_PATH, "w", encoding="utf-8") as f:
        json.dump(cache, f, indent=2)

def clean_cache():
    if not os.path.exists(CACHE_PATH):
        return
    with open(CACHE_PATH, "r", encoding="utf-8") as f:
        cache = json.load(f)

    cleaned_cache = {
        key: value
        for key, value in cache.items()
        if value.get("label") is not None
    }

    with open(CACHE_PATH, "w", encoding="utf-8") as f:
        json.dump(cleaned_cache, f, indent=2)

* **`semantic_best_match(value, candidates)`**: This function takes an input `value` and a list of `candidates` (from AGROVOC). It encodes both the input and candidates into embeddings and then uses cosine similarity to find the candidate that is semantically closest to the input.

In [51]:
def semantic_best_match(value, candidates):
    input_embedding = model.encode(value, convert_to_tensor=True)
    candidate_texts = []
    for c in candidates:
        candidate_texts.append(c["label"])
    candidate_embeddings = model.encode(candidate_texts, convert_to_tensor=True)
    scores = util.cos_sim(input_embedding, candidate_embeddings)[0]
    best_idx = scores.argmax().item()
    return candidates[best_idx]

* **`query_agrovoc(value, cache)`**: This function first checks if the `value` or its alternative labels are present in the local `cache`. If not, it constructs a SPARQL query to search for the `value` in AGROVOC's `skos:prefLabel` and `skos:altLabel` (alternative labels). The results are then passed to `semantic_best_match` to identify the most relevant concept, which is then stored in the cache for future use.

In [52]:
def query_agrovoc(value, cache):

    norm_value = value.lower().replace(",", "")

    if norm_value in cache:
        print(f"Cache hit: '{value}' found in cache")
        return cache[norm_value]
    
    for entry in cache.values():
        if "altLabels" in entry and value in entry["altLabels"]:
            print(f"Cache hit: '{value}' found in cache")
            return entry
    
    
    print(f"Cache miss: '{value}' not in cache, querying AGROVOC endpoint")
    # Run SPARQL query
    query = f"""
        PREFIX skos: <http://www.w3.org/2004/02/skos/core#>

        SELECT ?concept ?label (GROUP_CONCAT(?altLabel; separator=" | ") AS ?altLabels) WHERE {{
        ?concept skos:prefLabel ?label .
        OPTIONAL {{
            ?concept skos:altLabel ?altLabel .
            FILTER(LANG(?altLabel) = "en")
        }}

        FILTER(LANG(?label) = "en")

        FILTER(
            CONTAINS(LCASE(?label), "{norm_value}") ||
            (BOUND(?altLabel) && CONTAINS(LCASE(?altLabel), "{norm_value}"))
        )
        }}
        GROUP BY ?concept ?label

    """
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    results = sparql.query().convert()
    candidates = []
    for result in results["results"]["bindings"]:
        label = result["label"]["value"]
        uri = result["concept"]["value"]
        altlabels_str = result["altLabels"]["value"] if "altLabels" in result else ""
        altlabels = [al.strip().lower() for al in altlabels_str.split("|") if al.strip()]
        candidate = {"label": label, "uri": uri, "altLabels": altlabels}
        candidates.append(candidate)
        
    if not candidates:
        match = {"label": None, "uri": None, "altLabels": []}
    else:
        best = semantic_best_match(value, candidates)
        match = best
    # Cache the result
    cache[norm_value] = match
    save_cache(cache)
    return match

* **`enrich_with_agrovoc(df, column_name)`**: This function iterates through a specified `column_name` in a DataFrame, queries AGROVOC for each value, and adds two new columns to the DataFrame: `AGROVOC_label` (the preferred label from AGROVOC) and `AGROVOC_uri` (the URI of the AGROVOC concept).

In [53]:
def enrich_with_agrovoc(df, column_name):
    cache = load_cache()
    labels = []
    uris = []
    for val in df[column_name]:
        match = query_agrovoc(val, cache)
        labels.append(match["label"])
        uris.append(match["uri"])
    df["AGROVOC_label"] = labels
    df["AGROVOC_uri"] = uris
    return df

### How much water is used to produce food?
- the FAO AQUASTAT database for information about worldwide water withdrawal

In [54]:
world_water_data = pd.read_csv("data/aquastat_world_water_data.csv", encoding="utf-8")
# Define the variables of interest
variables_of_interest = [
    "Agricultural water withdrawal",
    "Industrial water withdrawal",
    "Municipal water withdrawal"
]
# Filter the dataframe
filtered_water = world_water_data[
    (world_water_data["Variable"].isin(variables_of_interest)) &
    (world_water_data["Year"] == 2022) &
    (world_water_data["Area"] == "World")
]

filtered_water_agrovoc = enrich_with_agrovoc(filtered_water,"Variable")
# filtered_water_agrovoc.to_csv(r"final_data\filtered_water_agrovoc.csv")


Cache hit: 'Agricultural water withdrawal' found in cache
Cache hit: 'Industrial water withdrawal' found in cache
Cache hit: 'Municipal water withdrawal' found in cache


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["AGROVOC_label"] = labels
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["AGROVOC_uri"] = uris


### How much land is it used for food production?
- land use data from FAOSTAT

In [55]:
landuse_data = pd.read_csv("data/FAOSTAT_land_use_data.csv", encoding="utf-8")

In [57]:

filtered_landuse = landuse_data[
    (landuse_data["Area"] == "World") &
    (landuse_data["Year"] == 2018) &
    (landuse_data["Element"] == "Area") &
    (landuse_data["Item"].isin(["Country land",
                                "Land area", 
                                "Agriculture", 
                                "Forest land", 
                                "Land used for aquaculture", 
                                "Inland waters used for aquac. or holding facilities", 
                                "Inland waters used for capture fishes", 
                                "Coastal waters used for aquac. or holding facilities",
                                "Coastal waters used for capture fishes",
                                "EEZ used for aquac. or holding facilities",
                                "EEZ used for capture fishes"]))
]

filtered_landuse = filtered_landuse[["Area", "Year", "Item", "Value"]]
filtered_landuse_agrovoc = enrich_with_agrovoc(filtered_landuse, "Item")
filtered_landuse_agrovoc.to_csv(r"site/final_data/filtered_landuse_agrovoc.csv")

Cache hit: 'Land area' found in cache
Cache hit: 'Agriculture' found in cache
Cache hit: 'Forest land' found in cache


### What are the emissions of the agricultural sector compared to other sectors?
- greenhouse gas emissions data from Climate Watch + FAOSTAT
#TO DO

In [58]:
ghg_emissions_data = pd.read_csv("data/historical_emissions.csv", encoding="utf-8")
fao_greenhouse_gas_emissions = pd.read_csv("data/FAOSTAT_emissions_total.csv", encoding="utf-8")

In [59]:
aggregation_map = {
    # Crop Production
    'Crop Residues': 'Crop Production',
    'Rice Cultivation': 'Crop Production',
    'Burning - Crop residues': 'Crop Production',
    'Synthetic Fertilizers': 'Crop Production',
    'Drained organic soils': 'Crop Production',
    'Drained organic soils (CO2)': 'Crop Production',
    'Drained organic soils (N2O)': 'Crop Production',
    'Manure applied to Soils': 'Crop Production',

    # Livestock Production
    'Enteric Fermentation': 'Livestock Production',
    'Manure Management': 'Livestock Production',
    'Manure left on Pasture': 'Livestock Production',

    # Land-Use Change
    'Forestland': 'Land-Use Change',
    'Net Forest conversion': 'Land-Use Change',
    'Savanna fires': 'Land-Use Change',
    'Fires in organic soils': 'Land-Use Change',
    'Forest fires': 'Land-Use Change',
    'Fires in humid tropical forests': 'Land-Use Change',

    # On-Farm Energy
    'On-farm energy use': 'On-Farm Energy',

    # Agricultural Inputs
    'Fertilizers Manufacturing': 'Ag. Inputs Manufacturing',
    'Pesticides Manufacturing': 'Ag. Inputs Manufacturing',

    # Processing & Packaging
    'Food Processing': 'Processing & Packaging',
    'Food Packaging': 'Processing & Packaging',

    # Transport & Retail
    'Food Transport': 'Transport & Retail',
    'Food Retail': 'Transport & Retail',

    # Consumption
    'Food Household Consumption': 'Consumption',

    # Waste
    'Agrifood Systems Waste Disposal': 'Waste',

    # Exclude or label as "Other" if unrelated
    'Energy': 'Other',
    'IPPU': 'Other',
    'Waste': 'Other',
    'International bunkers': 'Other',
    'Other': 'Other'
}

fao_greenhouse_gas_emissions['Category'] = fao_greenhouse_gas_emissions['Item'].map(aggregation_map)

useless_drained_organic_soil = [
    "Drained organic soils (CO2)", "Drained organic soils (N2O)"
]

# Select rows for agriculture sectors emissions
agriculture_sectors_emissions = fao_greenhouse_gas_emissions[(fao_greenhouse_gas_emissions["Category"] != "Other") & 
                                                             (fao_greenhouse_gas_emissions["Area"] == "World") &
                                                                (fao_greenhouse_gas_emissions["Year"] == 2022) &
                                                             (fao_greenhouse_gas_emissions["Element"] == "Emissions (CO2eq) (AR5)")]

# Remove rows with "Dried organic soils (CO2)" and "Dried organic soils (N2O)"
agriculture_sectors_emissions = agriculture_sectors_emissions[~agriculture_sectors_emissions["Item"].isin(useless_drained_organic_soil)] # Pietro use this
# Group by Category and sum the values
agriculture_sectors_emissions = agriculture_sectors_emissions.groupby("Category").agg({
    "Domain": "first",  # Keep the first value (assuming all rows have the same value)
    "Area": "first",   # Keep the first value (assuming all rows have the same value)
    "Element": "first",  # Keep the first value (assuming all rows have the same value)
    "Year": "first",  # Keep the first value (assuming all rows have the same value)
    "Unit": "first",  # Keep the first value (assuming all rows have the same value)
    "Value": "sum",  # Sum the values
    "Flag Description": "first"  # Keep the first value (assuming all rows have the same value)
}).reset_index()

# Rename category column as "Item"
agriculture_sectors_emissions.rename(columns={"Category": "Item"}, inplace=True)

# Add a new column category column filled with "Agrifood System"
agriculture_sectors_emissions["Category"] = "Agrifood System"

# Filter for other sectors emissions
Other_sectors_emissions = fao_greenhouse_gas_emissions[(fao_greenhouse_gas_emissions["Category"] == "Other") &
                                                        (fao_greenhouse_gas_emissions["Area"] == "World") &
                                                        (fao_greenhouse_gas_emissions["Year"] == 2022) &
                                                        (fao_greenhouse_gas_emissions["Element"] == "Emissions (CO2eq) (AR5)")]

# Remove columns that are not needed
Other_sectors_emissions = Other_sectors_emissions[["Item", "Domain", "Area", "Element", "Year", "Unit", "Value", "Flag Description", "Category"]]

In [60]:
emissions_sectors = pd.concat([Other_sectors_emissions, agriculture_sectors_emissions], ignore_index=True)
emissions_sectors_agrovoc = enrich_with_agrovoc(emissions_sectors, "Item")
# emissions_sectors_agrovoc.to_csv(r"site/final_data/emissions_sectors_agrovoc")


Cache hit: 'Energy' found in cache
Cache hit: 'IPPU' found in cache
Cache hit: 'Waste' found in cache
Cache hit: 'International bunkers' found in cache
Cache hit: 'Other' found in cache
Cache hit: 'Ag. Inputs Manufacturing' found in cache
Cache hit: 'Consumption' found in cache
Cache hit: 'Crop Production' found in cache
Cache hit: 'Land-Use Change' found in cache
Cache hit: 'Livestock Production' found in cache
Cache hit: 'On-Farm Energy' found in cache
Cache hit: 'Processing & Packaging' found in cache
Cache hit: 'Transport & Retail' found in cache
Cache hit: 'Waste' found in cache


## Which are the most consumed types of food in Italy?
- the EFSA (European Food Safety Authority) food consumption data for data about the consumption of different types of food in various countries
DA SISTEMARE

In [61]:
food_consumption_data = pd.read_csv("data/chronic_consumption_gday_allsubjects.csv", encoding="utf-16")

# Group by 'Exposure hierarchy (L7)', sum the 'Mean' values, and get the top 15
food_consumption_data_nowater = food_consumption_data[
    (food_consumption_data["Exposure hierarchy (L7)"] != "Natural mineral water") &
    (food_consumption_data["Exposure hierarchy (L7)"] != "Tap water") &
    (food_consumption_data["Exposure hierarchy (L7)"] != "Filtered tap water")]

top = (
    food_consumption_data_nowater
    .groupby('Exposure hierarchy (L7)', as_index=False)['Mean']
    .sum()
    .sort_values('Mean', ascending=False)
    .head(15)
)

top_food_consumption_data_agrovoc = enrich_with_agrovoc(top, "Exposure hierarchy (L7)")
# top_food_consumption_data_agrovoc.to_csv(r"final_data/top_food_consumption_agrovoc.csv")

Cache hit: 'Cow milk, semi skimmed (half fat)' found in cache
Cache hit: 'Coffee (average strength) beverage' found in cache
Cache hit: 'Apples' found in cache
Cache hit: 'Potatoes' found in cache
Cache hit: 'Wheat bread and rolls, white (refined flour)' found in cache
Cache hit: 'Wine, red' found in cache
Cache hit: 'Chicken fresh meat' found in cache
Cache hit: 'Cow milk, whole' found in cache
Cache hit: 'Dried durum pasta' found in cache
Cache hit: 'Olive oil, virgin or extra-virgin' found in cache
Cache hit: 'Coffee espresso (beverage)' found in cache
Cache hit: 'Beer' found in cache
Cache hit: 'Cow, ox or bull fresh meat' found in cache
Cache hit: 'Cola beverages, caffeinic' found in cache
Cache hit: 'Globe tomato' found in cache
