In [18]:
import os
import json
import string
import pandas as pd
from SPARQLWrapper import SPARQLWrapper, JSON
from sentence_transformers import SentenceTransformer, util

In [8]:
# Initialize SPARQL endpoint and model
sparql = SPARQLWrapper("http://agrovoc.fao.org/sparql")
model = SentenceTransformer('all-MiniLM-L6-v2')

CACHE_PATH = "agro_cache.json"

In [9]:
def load_cache():
    if not os.path.exists(CACHE_PATH):
        return {}
    with open(CACHE_PATH, "r", encoding="utf-8") as f:
        cache = json.load(f)
        return cache

def save_cache(cache):
    with open(CACHE_PATH, "w", encoding="utf-8") as f:
        json.dump(cache, f, indent=2)

In [10]:
def semantic_best_match(value, candidates):
    input_embedding = model.encode(value, convert_to_tensor=True)
    candidate_texts = []
    for c in candidates:
        candidate_texts.append(c["label"])
    candidate_embeddings = model.encode(candidate_texts, convert_to_tensor=True)
    scores = util.cos_sim(input_embedding, candidate_embeddings)[0]
    best_idx = scores.argmax().item()
    return candidates[best_idx]

In [22]:
def query_agrovoc(value, cache):


    if value in cache:
        print(f"Cache hit: '{value}' found in cache")
        return cache[value]
    
    print(f"Cache miss: '{value}' not in cache, querying AGROVOC endpoint")
    # Run SPARQL query
    query = f"""
    PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
    SELECT ?concept ?label WHERE {{
        ?concept skos:prefLabel ?label .
        FILTER(LANG(?label) = "en" && REGEX(?label, "{value}", "i"))
    }}
    """
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    results = sparql.query().convert()
    candidates = []
    for result in results["results"]["bindings"]:
        label = result["label"]["value"]
        uri = result["concept"]["value"]
        candidate = {"label": label, "uri": uri}
        candidates.append(candidate)
    if not candidates:
        match = {"label": None, "uri": None}
    else:
        best = semantic_best_match(value, candidates)
        match = best
    # Cache the result
    cache[value] = match
    save_cache(cache)
    return match

In [20]:
def enrich_with_agrovoc(df, column_name):
    cache = load_cache()
    labels = []
    uris = []
    for val in df[column_name]:
        match = query_agrovoc(val, cache)
        labels.append(match["label"])
        uris.append(match["uri"])
    df["AGROVOC_label"] = labels
    df["AGROVOC_uri"] = uris
    return df

### How much water is used to produce food?
- the FAO AQUASTAT database for information about worldwide water withdrawal

In [33]:
world_water_data = pd.read_csv("data/aquastat_world_water_data.csv", encoding="utf-8")
# Define the variables of interest
variables_of_interest = [
    "Agricultural water withdrawal",
    "Industrial water withdrawal",
    "Municipal water withdrawal"
]
# Filter the dataframe
filtered_water = world_water_data[
    (world_water_data["Variable"].isin(variables_of_interest)) &
    (world_water_data["Year"] == 2022) &
    (world_water_data["Area"] == "World")
]

filtered_water_agrovoc = enrich_with_agrovoc(filtered_water,"Variable")
filtered_water_agrovoc.to_csv(r"final_data\filtered_water_agrovoc.csv")


Cache hit: 'Agricultural water withdrawal' found in cache
Cache hit: 'Industrial water withdrawal' found in cache
Cache hit: 'Municipal water withdrawal' found in cache


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["AGROVOC_label"] = labels
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["AGROVOC_uri"] = uris


### How much land is it used for food production?
- land use data from FAOSTAT

In [35]:
landuse_data = pd.read_csv("data/FAOSTAT_land_use_data.csv", encoding="utf-8")

filtered_landuse = landuse_data[
    (landuse_data["Area"] == "World") &
    (landuse_data["Year"] == 2018) &
    (landuse_data["Element"] == "Area") &
    (landuse_data["Item"].isin(["Country land",
                                "Land area", 
                                "Agriculture", 
                                "Forest land", 
                                "Land used for aquaculture", 
                                "Inland waters used for aquac. or holding facilities", 
                                "Inland waters used for capture fishes", 
                                "Coastal waters used for aquac. or holding facilities",
                                "Coastal waters used for capture fishes",
                                "EEZ used for aquac. or holding facilities",
                                "EEZ used for capture fishes"]))
]

filtered_landuse = filtered_landuse[["Area", "Year", "Item", "Value"]]
filtered_landuse_agrovoc = enrich_with_agrovoc(filtered_landuse, "Item")
filtered_landuse_agrovoc.to_csv(r"final_data/filtered_landuse_agrovoc.csv")

Cache hit: 'Land area' found in cache
Cache hit: 'Agriculture' found in cache
Cache hit: 'Forest land' found in cache


### What are the emissions of the agricultural sector compared to other sectors?
- greenhouse gas emissions data from Climate Watch + FAOSTAT
#TO DO

In [4]:
ghg_emissions_data = pd.read_csv("data/historical_emissions.csv", encoding="utf-8")
fao_greenhouse_gas_emissions = pd.read_csv("data/FAOSTAT_emissions_total.csv", encoding="utf-8")

In [6]:
aggregation_map = {
    # Crop Production
    'Crop Residues': 'Crop Production',
    'Rice Cultivation': 'Crop Production',
    'Burning - Crop residues': 'Crop Production',
    'Synthetic Fertilizers': 'Crop Production',
    'Drained organic soils': 'Crop Production',
    'Drained organic soils (CO2)': 'Crop Production',
    'Drained organic soils (N2O)': 'Crop Production',
    'Manure applied to Soils': 'Crop Production',

    # Livestock Production
    'Enteric Fermentation': 'Livestock Production',
    'Manure Management': 'Livestock Production',
    'Manure left on Pasture': 'Livestock Production',

    # Land-Use Change
    'Forestland': 'Land-Use Change',
    'Net Forest conversion': 'Land-Use Change',
    'Savanna fires': 'Land-Use Change',
    'Fires in organic soils': 'Land-Use Change',
    'Forest fires': 'Land-Use Change',
    'Fires in humid tropical forests': 'Land-Use Change',

    # On-Farm Energy
    'On-farm energy use': 'On-Farm Energy',

    # Agricultural Inputs
    'Fertilizers Manufacturing': 'Ag. Inputs Manufacturing',
    'Pesticides Manufacturing': 'Ag. Inputs Manufacturing',

    # Processing & Packaging
    'Food Processing': 'Processing & Packaging',
    'Food Packaging': 'Processing & Packaging',

    # Transport & Retail
    'Food Transport': 'Transport & Retail',
    'Food Retail': 'Transport & Retail',

    # Consumption
    'Food Household Consumption': 'Consumption',

    # Waste
    'Agrifood Systems Waste Disposal': 'Waste',

    # Exclude or label as "Other" if unrelated
    'Energy': 'Other',
    'IPPU': 'Other',
    'Waste': 'Other',
    'International bunkers': 'Other',
    'Other': 'Other'
}

fao_greenhouse_gas_emissions['Category'] = fao_greenhouse_gas_emissions['Item'].map(aggregation_map)



In [13]:
fao_greenhouse_gas_emissions_agrovoc = enrich_with_agrovoc(fao_greenhouse_gas_emissions, "Category")
fao_greenhouse_gas_emissions_agrovoc

Cache miss: 'Crop Production' not in cache, querying AGROVOC endpoint
Cache hit: 'Crop Production' found in cache
Cache hit: 'Crop Production' found in cache
Cache hit: 'Crop Production' found in cache
Cache hit: 'Crop Production' found in cache
Cache hit: 'Crop Production' found in cache
Cache hit: 'Crop Production' found in cache
Cache hit: 'Crop Production' found in cache
Cache hit: 'Crop Production' found in cache
Cache hit: 'Crop Production' found in cache
Cache hit: 'Crop Production' found in cache
Cache hit: 'Crop Production' found in cache
Cache hit: 'Crop Production' found in cache
Cache hit: 'Crop Production' found in cache
Cache hit: 'Crop Production' found in cache
Cache hit: 'Crop Production' found in cache
Cache hit: 'Crop Production' found in cache
Cache hit: 'Crop Production' found in cache
Cache hit: 'Crop Production' found in cache
Cache hit: 'Crop Production' found in cache
Cache hit: 'Crop Production' found in cache
Cache hit: 'Crop Production' found in cache
Cache 

Unnamed: 0,Domain Code,Domain,Area Code (M49),Area,Element Code,Element,Item Code,Item,Year Code,Year,Source Code,Source,Unit,Value,Flag,Flag Description,Note,Category,AGROVOC_label,AGROVOC_uri
0,GT,Emissions totals,1,World,7234,Direct emissions (N2O),5064,Crop Residues,1999,1999,3050,FAO TIER 1,kt,425.3989,E,Estimated value,,Crop Production,crop production,http://aims.fao.org/aos/agrovoc/c_5976
1,GT,Emissions totals,1,World,7236,Indirect emissions (N2O),5064,Crop Residues,1999,1999,3050,FAO TIER 1,kt,95.7152,E,Estimated value,,Crop Production,crop production,http://aims.fao.org/aos/agrovoc/c_5976
2,GT,Emissions totals,1,World,7230,Emissions (N2O),5064,Crop Residues,1999,1999,3050,FAO TIER 1,kt,521.1148,E,Estimated value,,Crop Production,crop production,http://aims.fao.org/aos/agrovoc/c_5976
3,GT,Emissions totals,1,World,724313,Emissions (CO2eq) from N2O (AR5),5064,Crop Residues,1999,1999,3050,FAO TIER 1,kt,138095.4220,E,Estimated value,,Crop Production,crop production,http://aims.fao.org/aos/agrovoc/c_5976
4,GT,Emissions totals,1,World,723113,Emissions (CO2eq) (AR5),5064,Crop Residues,1999,1999,3050,FAO TIER 1,kt,138095.4220,E,Estimated value,,Crop Production,crop production,http://aims.fao.org/aos/agrovoc/c_5976
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3619,GT,Emissions totals,1,World,7230,Emissions (N2O),6819,Other,2022,2022,3050,FAO TIER 1,kt,601.7354,E,Estimated value,,Other,Other perciformes,http://aims.fao.org/aos/agrovoc/c_44954
3620,GT,Emissions totals,1,World,7273,Emissions (CO2),6819,Other,2022,2022,3050,FAO TIER 1,kt,47500.3188,E,Estimated value,,Other,Other perciformes,http://aims.fao.org/aos/agrovoc/c_44954
3621,GT,Emissions totals,1,World,724413,Emissions (CO2eq) from CH4 (AR5),6819,Other,2022,2022,3050,FAO TIER 1,kt,4218.9272,E,Estimated value,,Other,Other perciformes,http://aims.fao.org/aos/agrovoc/c_44954
3622,GT,Emissions totals,1,World,724313,Emissions (CO2eq) from N2O (AR5),6819,Other,2022,2022,3050,FAO TIER 1,kt,159459.8941,E,Estimated value,,Other,Other perciformes,http://aims.fao.org/aos/agrovoc/c_44954


## Which are the most consumed types of food in Italy?
- the EFSA (European Food Safety Authority) food consumption data for data about the consumption of different types of food in various countries

In [17]:
food_consumption_data = pd.read_csv("data/chronic_consumption_gday_allsubjects.csv", encoding="utf-16")
food_consumption_data
# Group by 'Exposure hierarchy (L7)', sum the 'Mean' values, and get the top 15
# food_consumption_data_nowater = food_consumption_data[
#     (food_consumption_data["Exposure hierarchy (L7)"] != "Natural mineral water") &
#     (food_consumption_data["Exposure hierarchy (L7)"] != "Tap water") &
#     (food_consumption_data["Exposure hierarchy (L7)"] != "Filtered tap water")]

# top = (
#     food_consumption_data_nowater
#     .groupby('Exposure hierarchy (L7)', as_index=False)['Mean']
#     .sum()
#     .sort_values('Mean', ascending=False)
    
# )
# food_consumption_data_agrovoc = enrich_with_agrovoc(food_consumption_data, "Exposure hierarchy (L7)")

Unnamed: 0,Survey's country,Survey start year,Survey name,Population Group (L2),Exposure hierarchy (L1),Exposure hierarchy (L2),Exposure hierarchy (L3),Exposure hierarchy (L4),Exposure hierarchy (L5),Exposure hierarchy (L6),...,Number of consumers,Mean,Standard Deviation,5th percentile,10th percentile,Median,95th percentile,97.5th percentile,99th percentile,Comment
0,Italy,2018,Italian national dietary survey on adult popul...,Adolescents,Alcoholic beverages,Beer and beer-like beverage,Beer,Beer,Beer,Beer,...,9,8.40,79.01,0.0,0.0,0.0,0.00,7.88,165.00,"'Number of observations lower than 298, the 99..."
1,Italy,2018,Italian national dietary survey on adult popul...,Adolescents,Alcoholic beverages,Unsweetened spirits and liqueurs,Liqueurs,Liqueurs,Liqueurs,Liqueurs,...,2,0.00,0.01,0.0,0.0,0.0,0.00,0.00,0.00,"'Number of observations lower than 298, the 99..."
2,Italy,2018,Italian national dietary survey on adult popul...,Adolescents,Alcoholic beverages,Unsweetened spirits and liqueurs,Unsweetened spirits,Spirits from fruit,Spirits made from fruits other than stone fruits,Spirits made from fruits other than stone fruits,...,1,0.05,0.75,0.0,0.0,0.0,0.00,0.00,0.00,"'Number of observations lower than 298, the 99..."
3,Italy,2018,Italian national dietary survey on adult popul...,Adolescents,Alcoholic beverages,Wine and wine-like drinks,Wine,Sparkling wine,Sparkling wine,Sparkling wine,...,1,0.08,1.28,0.0,0.0,0.0,0.00,0.00,0.00,"'Number of observations lower than 298, the 99..."
4,Italy,2018,Italian national dietary survey on adult popul...,Adolescents,Alcoholic beverages,Wine and wine-like drinks,Wine,"Wine, red","Wine, red","Wine, red",...,16,1.19,15.21,0.0,0.0,0.0,0.09,0.12,2.96,"'Number of observations lower than 298, the 99..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1484,Italy,2018,Italian national dietary survey on adult popul...,Elderly,Water and water-based beverages,Water based beverages,Soft drinks,Soft drinks with minor amounts of fruits or fl...,"Soft drink, flavoured, no fruit",Soft drink with bitter principle,...,2,0.58,5.11,0.0,0.0,0.0,0.00,0.00,42.20,"'Number of observations lower than 180, the 97..."
1485,Italy,2018,Italian national dietary survey on adult popul...,Elderly,Water and water-based beverages,Water based beverages,Soft drinks,Soft drinks with minor amounts of fruits or fl...,"Soft drink, flavoured, no fruit","Soft drink, flavoured, no fruit",...,1,1.03,12.81,0.0,0.0,0.0,0.00,0.00,0.00,"'Number of observations lower than 180, the 97..."
1486,Italy,2018,Italian national dietary survey on adult popul...,Elderly,Water and water-based beverages,Water based beverages,Soft drinks,Soft drinks with minor amounts of fruits or fl...,"Soft drink, flavoured, no fruit","Soft drink, lemon flavour",...,1,0.90,11.21,0.0,0.0,0.0,0.00,0.00,0.00,"'Number of observations lower than 180, the 97..."
1487,Italy,2018,Italian national dietary survey on adult popul...,Elderly,Water and water-based beverages,Water based beverages,Soft drinks,Soft drinks with minor amounts of fruits or fl...,"Soft drink, with fruit juice (fruit content be...","Fruit soft drink, orange",...,2,0.91,8.14,0.0,0.0,0.0,0.00,0.00,64.95,"'Number of observations lower than 180, the 97..."
