In [3]:
import os
import json
import string
import pandas as pd
from SPARQLWrapper import SPARQLWrapper, JSON
from sentence_transformers import SentenceTransformer, util

In [14]:
# Initialize SPARQL endpoint and model
sparql = SPARQLWrapper("http://agrovoc.fao.org/sparql")
model = SentenceTransformer('all-MiniLM-L6-v2')

CACHE_PATH = "agro_cache.json"

In [19]:
def load_cache():
    if not os.path.exists(CACHE_PATH):
        return {}
    with open(CACHE_PATH, "r", encoding="utf-8") as f:
        cache = json.load(f)
        return cache

def save_cache(cache):
    with open(CACHE_PATH, "w", encoding="utf-8") as f:
        json.dump(cache, f, indent=2)

def clean_cache():
    if not os.path.exists(CACHE_PATH):
        return
    with open(CACHE_PATH, "r", encoding="utf-8") as f:
        cache = json.load(f)

    cleaned_cache = {
        key: value
        for key, value in cache.items()
        if value.get("label") is not None
    }

    with open(CACHE_PATH, "w", encoding="utf-8") as f:
        json.dump(cleaned_cache, f, indent=2)

In [12]:
def semantic_best_match(value, candidates):
    input_embedding = model.encode(value, convert_to_tensor=True)
    candidate_texts = []
    for c in candidates:
        candidate_texts.append(c["label"])
    candidate_embeddings = model.encode(candidate_texts, convert_to_tensor=True)
    scores = util.cos_sim(input_embedding, candidate_embeddings)[0]
    best_idx = scores.argmax().item()
    return candidates[best_idx]

In [11]:
def query_agrovoc(value, cache):


    if value in cache:
        print(f"Cache hit: '{value}' found in cache")
        return cache[value]
    
    print(f"Cache miss: '{value}' not in cache, querying AGROVOC endpoint")
    # Run SPARQL query
    query = f"""
    PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
    SELECT ?concept ?label WHERE {{
        ?concept skos:prefLabel ?label .
        FILTER(LANG(?label) = "en" && REGEX(?label, "{value}", "i"))
    }}
    """
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    results = sparql.query().convert()
    candidates = []
    for result in results["results"]["bindings"]:
        label = result["label"]["value"]
        uri = result["concept"]["value"]
        candidate = {"label": label, "uri": uri}
        candidates.append(candidate)
    if not candidates:
        match = {"label": None, "uri": None}
    else:
        best = semantic_best_match(value, candidates)
        match = best
    # Cache the result
    cache[value] = match
    save_cache(cache)
    return match

In [10]:
def enrich_with_agrovoc(df, column_name):
    cache = load_cache()
    labels = []
    uris = []
    for val in df[column_name]:
        match = query_agrovoc(val, cache)
        labels.append(match["label"])
        uris.append(match["uri"])
    df["AGROVOC_label"] = labels
    df["AGROVOC_uri"] = uris
    return df

### How much water is used to produce food?
- the FAO AQUASTAT database for information about worldwide water withdrawal

In [33]:
world_water_data = pd.read_csv("data/aquastat_world_water_data.csv", encoding="utf-8")
# Define the variables of interest
variables_of_interest = [
    "Agricultural water withdrawal",
    "Industrial water withdrawal",
    "Municipal water withdrawal"
]
# Filter the dataframe
filtered_water = world_water_data[
    (world_water_data["Variable"].isin(variables_of_interest)) &
    (world_water_data["Year"] == 2022) &
    (world_water_data["Area"] == "World")
]

filtered_water_agrovoc = enrich_with_agrovoc(filtered_water,"Variable")
filtered_water_agrovoc.to_csv(r"final_data\filtered_water_agrovoc.csv")


Cache hit: 'Agricultural water withdrawal' found in cache
Cache hit: 'Industrial water withdrawal' found in cache
Cache hit: 'Municipal water withdrawal' found in cache


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["AGROVOC_label"] = labels
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["AGROVOC_uri"] = uris


### How much land is it used for food production?
- land use data from FAOSTAT

In [35]:
landuse_data = pd.read_csv("data/FAOSTAT_land_use_data.csv", encoding="utf-8")

filtered_landuse = landuse_data[
    (landuse_data["Area"] == "World") &
    (landuse_data["Year"] == 2018) &
    (landuse_data["Element"] == "Area") &
    (landuse_data["Item"].isin(["Country land",
                                "Land area", 
                                "Agriculture", 
                                "Forest land", 
                                "Land used for aquaculture", 
                                "Inland waters used for aquac. or holding facilities", 
                                "Inland waters used for capture fishes", 
                                "Coastal waters used for aquac. or holding facilities",
                                "Coastal waters used for capture fishes",
                                "EEZ used for aquac. or holding facilities",
                                "EEZ used for capture fishes"]))
]

filtered_landuse = filtered_landuse[["Area", "Year", "Item", "Value"]]
filtered_landuse_agrovoc = enrich_with_agrovoc(filtered_landuse, "Item")
filtered_landuse_agrovoc.to_csv(r"final_data/filtered_landuse_agrovoc.csv")

Cache hit: 'Land area' found in cache
Cache hit: 'Agriculture' found in cache
Cache hit: 'Forest land' found in cache


### What are the emissions of the agricultural sector compared to other sectors?
- greenhouse gas emissions data from Climate Watch + FAOSTAT
#TO DO

In [4]:
ghg_emissions_data = pd.read_csv("data/historical_emissions.csv", encoding="utf-8")
fao_greenhouse_gas_emissions = pd.read_csv("data/FAOSTAT_emissions_total.csv", encoding="utf-8")

In [5]:
aggregation_map = {
    # Crop Production
    'Crop Residues': 'Crop Production',
    'Rice Cultivation': 'Crop Production',
    'Burning - Crop residues': 'Crop Production',
    'Synthetic Fertilizers': 'Crop Production',
    'Drained organic soils': 'Crop Production',
    'Drained organic soils (CO2)': 'Crop Production',
    'Drained organic soils (N2O)': 'Crop Production',
    'Manure applied to Soils': 'Crop Production',

    # Livestock Production
    'Enteric Fermentation': 'Livestock Production',
    'Manure Management': 'Livestock Production',
    'Manure left on Pasture': 'Livestock Production',

    # Land-Use Change
    'Forestland': 'Land-Use Change',
    'Net Forest conversion': 'Land-Use Change',
    'Savanna fires': 'Land-Use Change',
    'Fires in organic soils': 'Land-Use Change',
    'Forest fires': 'Land-Use Change',
    'Fires in humid tropical forests': 'Land-Use Change',

    # On-Farm Energy
    'On-farm energy use': 'On-Farm Energy',

    # Agricultural Inputs
    'Fertilizers Manufacturing': 'Ag. Inputs Manufacturing',
    'Pesticides Manufacturing': 'Ag. Inputs Manufacturing',

    # Processing & Packaging
    'Food Processing': 'Processing & Packaging',
    'Food Packaging': 'Processing & Packaging',

    # Transport & Retail
    'Food Transport': 'Transport & Retail',
    'Food Retail': 'Transport & Retail',

    # Consumption
    'Food Household Consumption': 'Consumption',

    # Waste
    'Agrifood Systems Waste Disposal': 'Waste',

    # Exclude or label as "Other" if unrelated
    'Energy': 'Other',
    'IPPU': 'Other',
    'Waste': 'Other',
    'International bunkers': 'Other',
    'Other': 'Other'
}

fao_greenhouse_gas_emissions['Category'] = fao_greenhouse_gas_emissions['Item'].map(aggregation_map)



In [None]:
useless_drained_organic_soil = [
    "Drained organic soils (CO2)", "Drained organic soils (N2O)"
]

# Select rows for agriculture sectors emissions
agriculture_sectors_emissions = fao_greenhouse_gas_emissions[(fao_greenhouse_gas_emissions["Category"] != "Other") & 
                                                             (fao_greenhouse_gas_emissions["Area"] == "World") &
                                                                (fao_greenhouse_gas_emissions["Year"] == 2022) &
                                                             (fao_greenhouse_gas_emissions["Element"] == "Emissions (CO2eq) (AR5)")]

# Remove rows with "Dried organic soils (CO2)" and "Dried organic soils (N2O)"
agriculture_sectors_emissions = agriculture_sectors_emissions[~agriculture_sectors_emissions["Item"].isin(useless_drained_organic_soil)] # Pietro use this
agriculture_sectors_emissions_values = agriculture_sectors_emissions["Value"].tolist()
agriculture_sectors_emissions_items = agriculture_sectors_emissions["Item"].tolist()

agriculture_sectors_emissions # Pietro questo

In [None]:
Other_sectors_emissions = fao_greenhouse_gas_emissions[(fao_greenhouse_gas_emissions["Category"] == "Other") &
                                                        (fao_greenhouse_gas_emissions["Area"] == "World") &
                                                        (fao_greenhouse_gas_emissions["Year"] == 2022) &
                                                        (fao_greenhouse_gas_emissions["Element"] == "Emissions (CO2eq) (AR5)")] # Pietro use this
other_sectors_emissions_values = Other_sectors_emissions["Value"].tolist()
other_sectors_emissions_items = Other_sectors_emissions["Item"].tolist()

Other_sectors_emissions # Pietro questo

In [24]:
emissions_sectors = pd.concat([Other_sectors_emissions, agriculture_sectors_emissions], ignore_index=True)

emissions_sectors_agrovoc = enrich_with_agrovoc(emissions_sectors, "Item")



Cache hit: 'Energy' found in cache
Cache hit: 'IPPU' found in cache
Cache hit: 'Waste' found in cache
Cache hit: 'International bunkers' found in cache
Cache hit: 'Other' found in cache
Cache hit: 'Crop Residues' found in cache
Cache hit: 'Rice Cultivation' found in cache
Cache hit: 'Burning - Crop residues' found in cache
Cache hit: 'Enteric Fermentation' found in cache
Cache hit: 'Manure Management' found in cache
Cache hit: 'Manure left on Pasture' found in cache
Cache hit: 'Manure applied to Soils' found in cache
Cache hit: 'Synthetic Fertilizers' found in cache
Cache hit: 'Drained organic soils' found in cache
Cache hit: 'On-farm energy use' found in cache
Cache hit: 'Forestland' found in cache
Cache hit: 'Net Forest conversion' found in cache
Cache hit: 'Savanna fires' found in cache
Cache hit: 'Fires in organic soils' found in cache
Cache hit: 'Forest fires' found in cache
Cache hit: 'Fires in humid tropical forests' found in cache
Cache hit: 'Agrifood Systems Waste Disposal' fo

In [25]:
emissions_sectors_agrovoc

Unnamed: 0,Domain Code,Domain,Area Code (M49),Area,Element Code,Element,Item Code,Item,Year Code,Year,Source Code,Source,Unit,Value,Flag,Flag Description,Note,Category,AGROVOC_label,AGROVOC_uri
0,GT,Emissions totals,1,World,723113,Emissions (CO2eq) (AR5),6821,Energy,2022,2022,3050,FAO TIER 1,kt,38843790.0,E,Estimated value,,Other,energy,http://aims.fao.org/aos/agrovoc/c_2565
1,GT,Emissions totals,1,World,723113,Emissions (CO2eq) (AR5),6817,IPPU,2022,2022,3050,FAO TIER 1,kt,4500829.0,E,Estimated value,,Other,,
2,GT,Emissions totals,1,World,723113,Emissions (CO2eq) (AR5),6818,Waste,2022,2022,3050,FAO TIER 1,kt,2577948.0,E,Estimated value,,Other,wastes,http://aims.fao.org/aos/agrovoc/c_8307
3,GT,Emissions totals,1,World,723113,Emissions (CO2eq) (AR5),6820,International bunkers,2022,2022,3050,FAO TIER 1,kt,1175800.0,E,Estimated value,,Other,,
4,GT,Emissions totals,1,World,723113,Emissions (CO2eq) (AR5),6819,Other,2022,2022,3050,FAO TIER 1,kt,211179.1,E,Estimated value,,Other,,
5,GT,Emissions totals,1,World,723113,Emissions (CO2eq) (AR5),5064,Crop Residues,2022,2022,3050,FAO TIER 1,kt,195546.4,E,Estimated value,,Crop Production,crop residues,http://aims.fao.org/aos/agrovoc/c_16118
6,GT,Emissions totals,1,World,723113,Emissions (CO2eq) (AR5),5060,Rice Cultivation,2022,2022,3050,FAO TIER 1,kt,682755.0,E,Estimated value,,Crop Production,crop production,http://aims.fao.org/aos/agrovoc/c_5976
7,GT,Emissions totals,1,World,723113,Emissions (CO2eq) (AR5),5066,Burning - Crop residues,2022,2022,3050,FAO TIER 1,kt,37552.91,E,Estimated value,,Crop Production,crop production,http://aims.fao.org/aos/agrovoc/c_5976
8,GT,Emissions totals,1,World,723113,Emissions (CO2eq) (AR5),5058,Enteric Fermentation,2022,2022,3050,FAO TIER 1,kt,2905329.0,E,Estimated value,,Livestock Production,fermentation,http://aims.fao.org/aos/agrovoc/c_2855
9,GT,Emissions totals,1,World,723113,Emissions (CO2eq) (AR5),5059,Manure Management,2022,2022,3050,FAO TIER 1,kt,397672.8,E,Estimated value,,Livestock Production,manure management,http://aims.fao.org/aos/agrovoc/c_2ed1deb4


## Which are the most consumed types of food in Italy?
- the EFSA (European Food Safety Authority) food consumption data for data about the consumption of different types of food in various countries
DA SISTEMARE

In [1]:
food_consumption_data = pd.read_csv("data/chronic_consumption_gday_allsubjects.csv", encoding="utf-16")

# Group by 'Exposure hierarchy (L7)', sum the 'Mean' values, and get the top 15
food_consumption_data_nowater = food_consumption_data[
    (food_consumption_data["Exposure hierarchy (L7)"] != "Natural mineral water") &
    (food_consumption_data["Exposure hierarchy (L7)"] != "Tap water") &
    (food_consumption_data["Exposure hierarchy (L7)"] != "Filtered tap water")]

top = (
    food_consumption_data_nowater
    .groupby('Exposure hierarchy (L7)', as_index=False)['Mean']
    .sum()
    .sort_values('Mean', ascending=False)
    .head(15)
)

top_food_consumption_data_agrovoc = enrich_with_agrovoc(top, "Exposure hierarchy (L7)")
top_food_consumption_data_agrovoc.to_csv(r"final_data/top_food_consumption_agrovoc.csv")

NameError: name 'pd' is not defined