## Extracting the biomass value from the dataset https://catalogue.ceda.ac.uk/uuid/bf535053562141c6bb7ad831f5998d77/
1. Download the data to your PC from https://catalogue.ceda.ac.uk/uuid/bf535053562141c6bb7ad831f5998d77/ (a total of about 300 GB of disk space is required for the "2015 to 2016" data).

2. Read our polygon data, overlay it on each year's raster files, and extract the biomass values.

In [1]:
import geopandas as gpd
import rasterio
import os
import pandas as pd
import numpy as np

# Polygons for which the biomass values are extracted
polygons_path = "../input/new_df_reforestation_with_precipitation_biomass.geojson"
polygons_gdf = gpd.read_file(polygons_path)

# Root folder of the rasters downloaded from
# https://catalogue.ceda.ac.uk/uuid/bf535053562141c6bb7ad831f5998d77/
# (process_in_chunks appends the year to this path)
base_raster_dir = "../dap.ceda.ac.uk/neodc/esacci/biomass/data/agb/maps/v5.0/geotiff"

# Years to process/extract data from (all available data to date): 2015-2020 inclusive
years = list(range(2015, 2021))

# Extracting the raster (biomass) values at a set of points
def extract_raster_values(raster_path, centroids):
    """Read the band-1 pixel value of a raster under each point.

    Parameters
    ----------
    raster_path : str
        Path of the raster file to open with rasterio.
    centroids : geopandas.GeoSeries
        Point geometries with a CRS set; they are reprojected to the
        raster's CRS before the lookup.

    Returns
    -------
    list
        One entry per point, in input order: the pixel value read from
        band 1, or ``np.nan`` when the point falls outside the raster.
    """
    with rasterio.open(raster_path) as src:
        transformed_centroids = centroids.to_crs(src.crs)

        # The band is read at most once, and only if some point actually lies
        # inside this raster; reading it again for every point is very costly.
        band = None
        values = []
        for point in transformed_centroids.geometry:
            row, col = src.index(point.x, point.y)
            if (0 <= row < src.height) and (0 <= col < src.width):
                if band is None:
                    band = src.read(1)
                values.append(band[row, col])
            else:
                values.append(np.nan)
        return values

# Processing the GeoDataFrame in chunks (due to its size) for a given year
def process_in_chunks(gdf, chunk_size, year):
    """Add a ``Biomass_<year>`` column sampled at each polygon's centroid.

    Parameters
    ----------
    gdf : geopandas.GeoDataFrame
        Polygons to process; a CRS must be set.
    chunk_size : int
        Number of rows handled per iteration.
    year : int
        Year whose rasters are read from ``{base_raster_dir}/{year}``.

    Returns
    -------
    geopandas.GeoDataFrame or pandas.DataFrame
        The rows of ``gdf`` with the extra ``Biomass_<year>`` column. When the
        input CRS is geographic the returned geometries are in EPSG:3395,
        because the chunks are reprojected before the centroids are computed.
        An empty ``pandas.DataFrame`` is returned for an empty input.

    Notes
    -----
    The values found in the individual ``.tif`` files of the year folder are
    summed per centroid; a centroid that lies outside every raster keeps 0.
    """
    if len(gdf) == 0:
        return pd.DataFrame()

    raster_dir = f"{base_raster_dir}/{year}"
    # The list of rasters is the same for every chunk, so build it once.
    raster_paths = [
        os.path.join(raster_dir, raster_file)
        for raster_file in os.listdir(raster_dir)
        if raster_file.endswith(".tif")
    ]

    processed_chunks = []
    for start in range(0, len(gdf), chunk_size):
        chunk = gdf.iloc[start:start + chunk_size]
        if chunk.crs.is_geographic:
            # to_crs returns a new object, so the caller's frame is untouched
            chunk = chunk.to_crs("EPSG:3395")
        else:
            # Work on a copy instead of writing a column onto a slice of gdf
            chunk = chunk.copy()
        centroids = chunk.geometry.centroid

        biomass_values = [0] * len(chunk)
        for raster_path in raster_paths:
            values = extract_raster_values(raster_path, centroids)
            # NaN marks "point not inside this raster": keep the running value
            biomass_values = [x + y if not np.isnan(y) else x for x, y in zip(biomass_values, values)]

        chunk[f'Biomass_{year}'] = biomass_values
        processed_chunks.append(chunk)

    # One concat at the end instead of re-copying the result for every chunk
    return pd.concat(processed_chunks)


# Run the extraction once per year: the first year's result becomes the base
# table and each later year contributes only its Biomass_<year> column.
for year in years:
    yearly_result = process_in_chunks(polygons_gdf, 10000, year)
    if year == years[0]:
        final_gdf = yearly_result
        continue
    temp_gdf = yearly_result
    biomass_column = f'Biomass_{year}'
    final_gdf[biomass_column] = temp_gdf[biomass_column]


# Write the table with all yearly biomass columns to GeoJSON
output_path = "/home/idisc02/Forest_Monitoring/output/reforestation_with_precipitation_biomass_updated.geojson"
final_gdf.to_file(output_path, driver='GeoJSON')

### Climate data Extraction
1. Download the precipitation raster data to your PC from https://www.worldclim.org/
2. Overlay the polygons and extract the rainfall from the centroid pixel of each polygon (site).

### Precipitation

In [None]:
import rasterio
import geopandas as gpd
import numpy as np
from rasterio.mask import mask
import pandas as pd
import gc
import os
from shapely.geometry import Point

# Polygon dataset (GeoJSON) that the precipitation values are attached to
geojson_path = "../updated_with_description_reforestation_projects_with_ndvi.geojson"
# Folder that receives the processed GeoJSON files
output_folder = "../input/centroid_prec"
# Folder holding the precipitation .tif rasters
# (presumably the WorldClim download described in this section - confirm)
tif_folder = "../climate_data_2.5m/"

gdf = gpd.read_file(geojson_path)

# Stop early when the raster folder holds no .tif files at all
tif_files = [name for name in os.listdir(tif_folder) if name.endswith(".tif")]
if not tif_files:
    raise FileNotFoundError("No .tif files found in the directory.")

# Use the first listed raster as the CRS reference and bring the polygons
# into that CRS when they differ
tif_path = os.path.join(tif_folder, tif_files[0])
with rasterio.open(tif_path) as src:
    tif_crs = src.crs
if gdf.crs != tif_crs:
    gdf = gdf.to_crs(tif_crs)

# Extracting the year of interest from the polygon "planting date" feature
def extract_year(date_str):
    """Return the calendar year contained in a planting-date value.

    Parameters
    ----------
    date_str : object
        A date-like value: a date string, a bare year (``"2015"``, ``2015``
        or ``2015.0``), a timestamp, or a missing value.

    Returns
    -------
    int or float
        The year as an ``int``, or ``np.nan`` when the value is missing or
        no year can be derived from it.
    """
    if pd.isna(date_str):
        return np.nan

    # A bare numeric year has to be caught before pd.to_datetime: plain
    # numbers are read there as nanoseconds since the epoch, which would turn
    # 2015 into the year 1970.
    is_number = isinstance(date_str, (int, float, np.integer, np.floating))
    if is_number and not isinstance(date_str, bool):
        if 1000 <= date_str <= 9999 and float(date_str).is_integer():
            return int(date_str)

    try:
        date_parsed = pd.to_datetime(date_str, errors='coerce')
    except (ValueError, TypeError, OverflowError):
        # Unparseable input counts as "no date" instead of stopping the run
        date_parsed = pd.NaT
    if pd.notna(date_parsed):
        return date_parsed.year

    # Four-digit year strings that pandas could not turn into a timestamp
    if isinstance(date_str, str) and date_str.isdigit() and len(date_str) == 4:
        return int(date_str)

    return np.nan

# Planting year per polygon, stored as nullable integers ('Int64')
gdf['planting_year'] = gdf['planting_date_reported'].apply(extract_year).astype('Int64')

# Number of rows processed per chunk, for resource management
chunk_size = 5000


# Index the rasters as {year: {month: file name}}. The "<year>-<month>" token
# is the text after the last "_" of the file name, up to the first ".".
tif_files_by_year = {}
for tif_file in tif_files:
    year_month = tif_file.rsplit("_", 1)[-1].partition(".")[0]
    date_parts = year_month.split("-")
    year, month = int(date_parts[0]), int(date_parts[1])
    tif_files_by_year.setdefault(year, {})[month] = tif_file


# Output column for each "years after planting" offset
precip_columns_by_offset = {
    0: "avg_precip_planting_year",
    1: "avg_precip_1_year_after",
    2: "avg_precip_2_years_after",
    5: "avg_precip_5_years_after",
}

# The chunk files are written into this folder, so make sure it exists
os.makedirs(output_folder, exist_ok=True)

for i in range(0, len(gdf), chunk_size):
    gdf_chunk = gdf.iloc[i:i + chunk_size].copy()

    # Create the result columns up front so every chunk file has the same
    # columns, even when none of its polygons has a planting year
    for column in precip_columns_by_offset.values():
        gdf_chunk[column] = np.nan

    # Collect the lookups grouped by calendar year:
    # year -> [(row label, year offset, (x, y) of the polygon centroid), ...]
    # Grouping lets each raster be opened once per chunk instead of once per polygon.
    requests_by_year = {}
    for idx, polygon in gdf_chunk.iterrows():
        planting_year = polygon['planting_year']
        geometry = polygon['geometry']
        # Rows without a planting year or without a usable geometry keep NaN
        if pd.isna(planting_year) or geometry is None or geometry.is_empty:
            continue

        # The value is taken at the centroid instead of over the whole polygon
        centroid = geometry.centroid
        centroid_point = (centroid.x, centroid.y)

        # Relevant years: planting year, year+1, year+2, year+5
        for year_offset in precip_columns_by_offset:
            current_year = int(planting_year) + year_offset
            if current_year in tif_files_by_year:
                requests_by_year.setdefault(current_year, []).append((idx, year_offset, centroid_point))

    # (row label, year offset) -> [sum of valid monthly values, number of valid months]
    precipitation_by_years_after_planting = {}
    for current_year, requests in requests_by_year.items():
        points = [point for _, _, point in requests]
        monthly_files = tif_files_by_year[current_year]

        for month in range(1, 13):
            tif_file = monthly_files.get(month)
            if tif_file is None:
                continue
            tif_path = os.path.join(tif_folder, tif_file)

            try:
                with rasterio.open(tif_path) as src:
                    nodata = src.nodata
                    samples = [val[0] for val in src.sample(points)]
            except Exception as e:
                # Best effort: a raster that cannot be read is reported and
                # that month is left out for the affected polygons
                print(f"Error processing {tif_file}: {e}")
                continue

            for (idx, year_offset, _), value in zip(requests, samples):
                # Skip NaN and the raster's nodata marker so that "no data"
                # pixels are not added up as if they were rainfall
                if np.isnan(value) or (nodata is not None and value == nodata):
                    continue
                totals = precipitation_by_years_after_planting.setdefault((idx, year_offset), [0.0, 0])
                totals[0] += float(value)
                totals[1] += 1

    # Write the monthly average per polygon and offset. The sum is divided by
    # the number of months that actually had data, so a missing month does not
    # pull the average down; with all 12 months present this equals total / 12.
    for (idx, year_offset), (total, valid_months) in precipitation_by_years_after_planting.items():
        gdf_chunk.at[idx, precip_columns_by_offset[year_offset]] = total / valid_months

    # Saving each chunk's results to its own GeoJSON file
    output_geojson_path = os.path.join(output_folder, f"df_reforestation_chunk_{i}.geojson")
    gdf_chunk.to_file(output_geojson_path, driver="GeoJSON")

    # Release the chunk before the next one is loaded
    del gdf_chunk, precipitation_by_years_after_planting
    gc.collect()

    print(f"Processed and saved chunk {i} to {output_geojson_path}")


In [None]:
# Combining all precipitation chunks into a single file
output_folder = "../input/centroid_prec"
combined_file_name = "df_reforestation_with_precipitation.geojson"

# Only the per-chunk files ("df_reforestation_chunk_<start>.geojson") are
# combined. The combined file is written into the same folder, so taking every
# .geojson would read it back in and duplicate all rows on a second run.
chunk_prefix = "df_reforestation_chunk_"
chunk_suffix = ".geojson"

numbered_chunks = []
for file in os.listdir(output_folder):
    if file.startswith(chunk_prefix) and file.endswith(chunk_suffix):
        chunk_start = file[len(chunk_prefix):-len(chunk_suffix)]
        if chunk_start.isdigit():
            numbered_chunks.append((int(chunk_start), file))

# os.listdir gives no ordering guarantee; sort by the chunk's start row so the
# combined rows follow the original order
numbered_chunks.sort()
geojson_files = [os.path.join(output_folder, file) for _, file in numbered_chunks]
if not geojson_files:
    raise FileNotFoundError(f"No chunk files found in {output_folder}")

gdfs = [gpd.read_file(file) for file in geojson_files]

combined_gdf = gpd.GeoDataFrame(pd.concat(gdfs, ignore_index=True))

combined_gdf.to_file(os.path.join(output_folder, combined_file_name), driver='GeoJSON')

### Temperature Extraction