# Import packages:

In [1]:
from pathlib import Path

import geopandas as gpd
import numpy as np
import pandas as pd

# Load data:

This dictionary maps the province and community names used in the files downloaded from the Spanish Ministry of Agriculture to the corresponding names used in the map of Spain.

In [2]:
# Lookup table used to normalize the place names found in the raw spreadsheets.
# Keys are spellings as they appear in the source files (accent-less variants,
# Spanish-only names, abbreviations and upper-case region labels); values are
# the replacement spellings, intended to match the names used in the Spain map.
# Names without an entry here are left untouched by `normalize_province_name`.
province_name_mapping = {
    "Alava": "Araba/Álava",
    "Álava": "Araba/Álava",
    "PAÍS VASCO": "País Vasco/Euskadi",
    "PAIS VASCO": "País Vasco/Euskadi",
    "NAVARRA": "Navarra",
    "LA RIOJA": "La Rioja",
    "ARAGÓN": "Aragón",
    "ARAGON": "Aragón",
    "CATALUÑA": "Cataluña/Catalunya",
    "BALEARES": "Illes Balears",
    "Avila": "Ávila",
    "CASTILLA Y LEÓN": "Castilla y León",
    "CASTILLA Y LEON": "Castilla y León",
    "MADRID": "Madrid",
    "CASTILLA-LA MANCHA": "Castilla-La Mancha",
    "C. VALENCIANA": "Comunitat Valenciana",
    "R. DE MURCIA": "Murcia",
    "EXTREMADURA": "Extremadura",
    "ANDALUCÍA": "Andalucía",
    "ANDALUCIA": "Andalucía",
    "CANARIAS": "Canarias",
    "Guipúzcoa": "Gipuzkoa",
    "GALICIA": "Galicia",
    "P. DE ASTURIAS": "Asturias",
    "CANTABRIA": "Cantabria",
    "Vizcaya": "Bizkaia",
    "S.C. de Tenerife": "Santa Cruz de Tenerife",
    "Alicante": "Alacant/Alicante",
    "Castellón": "Castelló/Castellón",
    "Valencia": "València/Valencia",
}

In [3]:
def normalize_province_name(input_province_name: str) -> str:
    """Return the normalized spelling of a province/community name.

    The name is looked up in ``province_name_mapping``; a name that has no
    entry there is returned unchanged.
    """
    return province_name_mapping.get(input_province_name, input_province_name)

In [4]:
def read_excel_file(file_path):
    """Load one raw olive-cultivation spreadsheet into a typed DataFrame.

    The file name without its extension must look like ``<crop_type>_<year>``;
    both parts are stored as extra columns on every row.

    Parameters
    ----------
    file_path : str or pathlib.Path
        Location of the Excel file to read.

    Returns
    -------
    pd.DataFrame
        One row per entry of the sheet with the columns ``province``, the ten
        numeric measures (``float32``), ``crop_type`` and ``year``. Missing
        numeric values are replaced by zero and rows without a province name
        are dropped.

    Raises
    ------
    ValueError
        If the file name contains no underscore or the part after the last
        underscore is not an integer.
    """
    file_path = Path(file_path)

    # Single source of truth for the measure columns: reused for the header,
    # the NA filling and the dtype cast so the three cannot drift apart.
    numeric_columns = [
        "rainfed_plantation_area",
        "irrigated_plantation_area",
        "total_plantation_area",
        "rainfed_production_area",
        "irrigated_production_area",
        "n_scattered_trees",
        "rainfed_yield",
        "irrigated_yield",
        "scattered_trees_yield",
        "total_production",
    ]
    column_names = ["province", *numeric_columns]

    # Parse the file name before the (slower) read so a badly named file fails
    # fast. Splitting on the LAST underscore lets the crop type itself contain
    # underscores.
    crop_type, separator, year = file_path.stem.rpartition("_")
    if not separator:
        raise ValueError(
            f"Cannot parse '<crop_type>_<year>' from file name: {file_path.name}"
        )
    year = int(year)

    # The first 8 rows are skipped and no header row is read (presumably the
    # sheet's title/header block - confirm against the raw files). An en dash
    # marks a missing value in the source.
    df = pd.read_excel(file_path, skiprows=8, na_values="–", header=None)
    df.columns = column_names

    # add crop type and year columns
    df["crop_type"] = crop_type
    df["year"] = year

    # normalize values in province column
    df["province"] = df["province"].str.strip().apply(normalize_province_name)

    # fill NA values in numeric columns with zero
    df[numeric_columns] = df[numeric_columns].fillna(0.0)

    # After the fill only `province` can still be missing, so this drops the
    # rows that carry no province name (e.g. empty rows).
    df = df.dropna()

    # fix dtype ("int" is the platform default integer, kept as in the
    # original; its width may differ between platforms)
    return df.astype(
        {
            "province": "string[pyarrow]",
            **dict.fromkeys(numeric_columns, "float32"),
            "crop_type": "string[pyarrow]",
            "year": "int",
        }
    )

In [5]:
olive_cultivation_system_path = Path("../data/raw/olive_cultivation_system/")

In [7]:
# Read every spreadsheet in the raw-data folder.
# - The paths are sorted because `iterdir()` yields entries in arbitrary order,
#   which would make the row order of the combined table vary between runs.
# - Sub-folders, hidden files (".DS_Store", ...) and Office lock files
#   ("~$...") are skipped: they are not workbooks and would make the read fail.
df_list = [
    read_excel_file(file_path)
    for file_path in sorted(olive_cultivation_system_path.iterdir())
    if file_path.is_file() and not file_path.name.startswith((".", "~$"))
]

In [9]:
len(df_list)

50

In [10]:
olive_cultivation_system_data = pd.concat(df_list, ignore_index=True)

In [11]:
olive_cultivation_system_data.head()

Unnamed: 0,province,rainfed_plantation_area,irrigated_plantation_area,total_plantation_area,rainfed_production_area,irrigated_production_area,n_scattered_trees,rainfed_yield,irrigated_yield,scattered_trees_yield,total_production,crop_type,year
0,Araba/Álava,92.0,0.0,92.0,87.0,0.0,0.0,2356.0,0.0,0.0,205.0,oil,1998
1,País Vasco/Euskadi,92.0,0.0,92.0,87.0,0.0,0.0,2356.0,0.0,0.0,205.0,oil,1998
2,Navarra,1635.0,1159.0,2794.0,1594.0,944.0,78.0,2476.0,3067.0,13.0,6843.0,oil,1998
3,La Rioja,2197.0,494.0,2691.0,1862.0,346.0,2268.0,1276.0,1396.0,4.0,2868.0,oil,1998
4,Huesca,8442.0,2022.0,10464.0,8442.0,2022.0,24200.0,430.0,1037.0,0.0,5727.0,oil,1998


In [12]:
olive_cultivation_system_data.shape

(2254, 13)

In [13]:
olive_cultivation_system_data.dtypes

province                     string[pyarrow]
rainfed_plantation_area              float32
irrigated_plantation_area            float32
total_plantation_area                float32
rainfed_production_area              float32
irrigated_production_area            float32
n_scattered_trees                    float32
rainfed_yield                        float32
irrigated_yield                      float32
scattered_trees_yield                float32
total_production                     float32
crop_type                    string[pyarrow]
year                                   int32
dtype: object

# Validate province mapping

In [15]:
spain_map_path = Path("../data/processed/maps/")

In [16]:
spain_map = gpd.read_file(spain_map_path)

In [19]:
spain_map.head()

Unnamed: 0,community,province,geometry
0,País Vasco/Euskadi,Araba/Álava,"POLYGON ((-2.76808 42.61408, -2.76863 42.61334..."
1,Castilla-La Mancha,Albacete,"POLYGON ((-2.55212 38.08501, -2.55207 38.08517..."
2,Comunitat Valenciana,Alacant/Alicante,"MULTIPOLYGON (((-0.75223 37.88691, -0.75225 37..."
3,Andalucía,Almería,"MULTIPOLYGON (((-3.03624 35.93791, -3.03637 35..."
4,Castilla y León,Ávila,"POLYGON ((-5.43382 40.24491, -5.4343 40.24276,..."


In [20]:
def is_province(province_name: str) -> bool:
    """Tell whether a name is known to the Spain map.

    Despite the function's name, the check succeeds when ``province_name``
    equals an entry of either the ``province`` or the ``community`` column of
    ``spain_map``.

    The comparison results are reduced with the vectorized ``Series.any()``
    (which ignores missing values) instead of Python's builtin ``any()``,
    which walks the Series element by element and raises on ``pd.NA``.
    """
    matches_province = spain_map["province"].eq(province_name).any()
    matches_community = spain_map["community"].eq(province_name).any()
    # bool(...) converts the numpy boolean into the annotated built-in type.
    return bool(matches_province or matches_community)

In [22]:
provinces_from_data = olive_cultivation_system_data["province"].unique()

In [23]:
# List every label from the data that matches neither a province nor a
# community of the map; known names are skipped silently.
for province in provinces_from_data:
    if is_province(province):
        continue
    print(f"{province}")

ESPAÑA
(1) Incluye a ceituna de mesa con destino almazara
(1) Incluyendo aceitunas de mesa con destino almazara
(1) Incluyendo aceitunas de almazara con destino mesa
Ciudades Autónomas (*)
(*) Fuente: Censo Agrario, 1999. I.N.E.


Some rows in the dataset represent aggregations at the community level. We should remove these rows.

To do so, for each value in the `province` column we need to know whether it is a province or a community.

In [24]:
spain_provinces = spain_map["province"].unique()

In [25]:
# Names of the communities that occur on exactly one row of the map
# (i.e. single-province communities, assuming one map row per province).
single_province_communities = (
    spain_map["community"]
    .value_counts()
    .loc[lambda counts: counts.eq(1)]
    .index.values
)

In [26]:
# Sorted, de-duplicated union of the single-province community names and all
# province names: the labels whose rows are kept in the dataset.
provinces_to_keep = np.union1d(single_province_communities, spain_provinces)

In [27]:
print(provinces_to_keep)

['A Coruña' 'Alacant/Alicante' 'Albacete' 'Almería' 'Araba/Álava'
 'Asturias' 'Badajoz' 'Barcelona' 'Bizkaia' 'Burgos' 'Cantabria'
 'Castelló/Castellón' 'Ceuta' 'Ciudad Autónoma de Ceuta'
 'Ciudad Autónoma de Melilla' 'Ciudad Real' 'Comunidad Foral de Navarra'
 'Comunidad de Madrid' 'Cuenca' 'Cáceres' 'Cádiz' 'Córdoba' 'Gipuzkoa'
 'Girona' 'Granada' 'Guadalajara' 'Huelva' 'Huesca' 'Illes Balears' 'Jaén'
 'La Rioja' 'Las Palmas' 'León' 'Lleida' 'Lugo' 'Madrid' 'Melilla'
 'Murcia' 'Málaga' 'Navarra' 'Ourense' 'Palencia' 'Pontevedra'
 'Principado de Asturias' 'Región de Murcia' 'Salamanca'
 'Santa Cruz de Tenerife' 'Segovia' 'Sevilla' 'Soria' 'Tarragona' 'Teruel'
 'Toledo' 'Valladolid' 'València/Valencia' 'Zamora' 'Zaragoza' 'Ávila']


In [28]:
# Keep only the rows whose label is in `provinces_to_keep`, then renumber the
# index from zero.
olive_cultivation_system_data = olive_cultivation_system_data.loc[
    lambda frame: frame["province"].isin(provinces_to_keep)
].reset_index(drop=True)

In [29]:
olive_cultivation_system_data.head()

Unnamed: 0,province,rainfed_plantation_area,irrigated_plantation_area,total_plantation_area,rainfed_production_area,irrigated_production_area,n_scattered_trees,rainfed_yield,irrigated_yield,scattered_trees_yield,total_production,crop_type,year
0,Araba/Álava,92.0,0.0,92.0,87.0,0.0,0.0,2356.0,0.0,0.0,205.0,oil,1998
1,Navarra,1635.0,1159.0,2794.0,1594.0,944.0,78.0,2476.0,3067.0,13.0,6843.0,oil,1998
2,La Rioja,2197.0,494.0,2691.0,1862.0,346.0,2268.0,1276.0,1396.0,4.0,2868.0,oil,1998
3,Huesca,8442.0,2022.0,10464.0,8442.0,2022.0,24200.0,430.0,1037.0,0.0,5727.0,oil,1998
4,Teruel,28426.0,1461.0,29887.0,28068.0,1436.0,214.0,490.0,1015.0,10.0,15213.0,oil,1998


In [32]:
olive_cultivation_system_data.groupby(by="province").agg(n_year=("year", "nunique"))

Unnamed: 0_level_0,n_year
province,Unnamed: 1_level_1
A Coruña,4
Alacant/Alicante,25
Albacete,25
Almería,25
Araba/Álava,25
Asturias,4
Badajoz,25
Barcelona,25
Bizkaia,4
Burgos,11


Not all provinces have the same number of years.

# Add Features:

In [33]:
def add_total_production_area(
    input_data: pd.DataFrame, rainfed_prod_col: str, irrigated_prod_col: str
) -> pd.DataFrame:
    """Return a new frame with an added ``total_production_area`` column.

    The column is the element-wise sum of the columns named by
    ``rainfed_prod_col`` and ``irrigated_prod_col``; ``input_data`` itself is
    not modified.
    """
    rainfed_area = input_data[rainfed_prod_col]
    irrigated_area = input_data[irrigated_prod_col]
    return input_data.assign(total_production_area=rainfed_area.add(irrigated_area))

In [34]:
def add_production_area_percentage(
    input_data: pd.DataFrame,
    total_plantation_area_col: str,
    total_production_area_col: str,
) -> pd.DataFrame:
    """Return a new frame with an added ``production_area_percentage`` column.

    The column holds ``production area / plantation area * 100``.
    ``input_data`` itself is not modified.

    Rows whose plantation area is zero have no defined percentage; they get a
    missing value (NaN) instead of the ``inf`` that a plain division would
    produce when the production area is non-zero.
    """
    plantation_area = input_data[total_plantation_area_col]
    # Mask zero denominators so the division yields NaN rather than +/-inf.
    safe_plantation_area = plantation_area.where(plantation_area != 0)
    return input_data.assign(
        production_area_percentage=(
            input_data[total_production_area_col] / safe_plantation_area
        )
        * 100.0
    )

In [35]:
def add_rainfed_production_area_percentage(
    input_data: pd.DataFrame,
    rainfed_production_area_col: str,
    total_production_area_col: str,
) -> pd.DataFrame:
    """Return a new frame with a ``rainfed_production_area_percentage`` column.

    The column holds ``rainfed production area / total production area * 100``.
    ``input_data`` itself is not modified.
    """
    rainfed_share = input_data[rainfed_production_area_col].div(
        input_data[total_production_area_col]
    )
    return input_data.assign(
        rainfed_production_area_percentage=rainfed_share.mul(100.0)
    )

In [36]:
def add_rainfed_total_production(
    input_data: pd.DataFrame, rainfed_production_area_col: str, rainfed_yield_col: str
) -> pd.DataFrame:
    """Return a new frame with an added ``rainfed_total_production`` column.

    The column is ``rainfed production area * rainfed yield / 1000``
    (the division by 1000 is presumably a unit conversion - confirm against
    the data source). ``input_data`` itself is not modified.
    """
    area = input_data[rainfed_production_area_col]
    crop_yield = input_data[rainfed_yield_col]
    return input_data.assign(rainfed_total_production=area.mul(crop_yield).div(1000.0))

In [37]:
def add_irrigated_total_production(
    input_data: pd.DataFrame,
    irrigated_production_area_col: str,
    irrigated_yield_col: str,
) -> pd.DataFrame:
    """Return a new frame with an added ``irrigated_total_production`` column.

    The column is ``irrigated production area * irrigated yield / 1000``
    (the division by 1000 is presumably a unit conversion - confirm against
    the data source). ``input_data`` itself is not modified.
    """
    area_times_yield = input_data[irrigated_production_area_col].mul(
        input_data[irrigated_yield_col]
    )
    return input_data.assign(irrigated_total_production=area_times_yield.div(1000.0))

In [42]:
# Derive the extra feature columns one step at a time. Column names are passed
# by keyword so each step states which input feeds which role. The closing
# `fillna` replaces any remaining missing value (e.g. from a 0/0 ratio) by 0.
olive_cultivation_system_data = (
    olive_cultivation_system_data.pipe(
        add_total_production_area,
        rainfed_prod_col="rainfed_production_area",
        irrigated_prod_col="irrigated_production_area",
    )
    .pipe(
        add_production_area_percentage,
        total_plantation_area_col="total_plantation_area",
        total_production_area_col="total_production_area",
    )
    .pipe(
        add_rainfed_production_area_percentage,
        rainfed_production_area_col="rainfed_production_area",
        total_production_area_col="total_production_area",
    )
    .pipe(
        add_rainfed_total_production,
        rainfed_production_area_col="rainfed_production_area",
        rainfed_yield_col="rainfed_yield",
    )
    .pipe(
        add_irrigated_total_production,
        irrigated_production_area_col="irrigated_production_area",
        irrigated_yield_col="irrigated_yield",
    )
    .fillna(0.0)
)

In [46]:
output_path = Path("../data/processed/olive_cultivation_system/")

In [47]:
# `to_parquet` does not create missing folders, so make sure the target
# directory exists before writing (no-op when it is already there).
output_path.mkdir(parents=True, exist_ok=True)
olive_cultivation_system_data.to_parquet(
    output_path / "olive_cultivation_system.parquet", index=False
)