### Climate data Extraction
1. Download the precipitation raster data to your PC from https://www.worldclim.org/

2. Overlay the polygons and extract rainfall from the centroid pixel of each polygon (site).

### Precipitation extraction

In [None]:
import geopandas as gpd
import rasterio
import os
import numpy as np
import rasterio
from rasterio.mask import mask
import pandas as pd
import gc
from shapely.geometry import Point
from helper_functions import extract_year

In [None]:
# Precipitation extraction.
#
# For every site polygon that has a planting year, sample the monthly
# precipitation rasters at the polygon centroid and store the mean of the
# valid monthly values for the planting year and for 1, 2 and 5 years after
# it. Results are written chunk by chunk and then merged back into
# `combined_output_path` (which is the same file the sites are read from).
geojson_path = "../input/Updated_Reforestation_Data.geojson"
output_folder = "../input/precipitation_output"
tif_folder = "../climate_precipitation/"
combined_output_path = "../input/Updated_Reforestation_Data.geojson"

# The chunk files are written into this folder; create it up front so the
# first to_file() call cannot fail on a missing directory.
os.makedirs(output_folder, exist_ok=True)

gdf = gpd.read_file(geojson_path)

# Checking for the available .tif files
tif_files = [f for f in os.listdir(tif_folder) if f.endswith(".tif")]
if not tif_files:
    raise FileNotFoundError("No .tif files found in the directory.")

# The first raster's CRS is used as the reference CRS for sampling.
tif_path = os.path.join(tif_folder, tif_files[0])
with rasterio.open(tif_path) as src:
    tif_crs = src.crs

# Reprojecting GeoDataFrame if CRS doesn't match
if gdf.crs != tif_crs:
    gdf = gdf.to_crs(tif_crs)

gdf['planting_year'] = gdf['planting_date_reported'].apply(extract_year).astype('Int64')

# Organizing the .tif files by year and month. The year and month are parsed
# from the last "_"-separated token of the file name, which must look like
# "<year>-<month>.<ext>".
tif_files_by_year = {}
for tif_file in tif_files:
    year_month = tif_file.split("_")[-1].split(".")[0]
    parts = year_month.split("-")
    year, month = int(parts[0]), int(parts[1])
    tif_files_by_year.setdefault(year, {})[month] = tif_file

# Output column for each "years after planting" offset.
offset_columns = {
    0: "avg_precipitation_planting_year",
    1: "avg_precipitation_1_year_after",
    2: "avg_precipitation_2_years_after",
    5: "avg_precipitation_5_years_after",
}

# Setting chunk size for processing
chunk_size = 200
chunk_paths = []

for i in range(0, len(gdf), chunk_size):
    gdf_chunk = gdf.iloc[i:i + chunk_size].copy()

    for idx, polygon in gdf_chunk.iterrows():
        planting_year = polygon['planting_year']
        if pd.isna(planting_year):
            continue

        geometry = polygon['geometry']
        if geometry is None or geometry.is_empty:
            # No location to sample: record "no data" instead of aborting
            # the whole run on a single bad feature.
            for column in offset_columns.values():
                gdf_chunk.at[idx, column] = np.nan
            continue

        centroid = geometry.centroid
        centroid_point = [(centroid.x, centroid.y)]

        for year_offset, column in offset_columns.items():
            monthly_files = tif_files_by_year.get(planting_year + year_offset, {})
            monthly_values = []

            for month in range(1, 13):
                tif_file = monthly_files.get(month)
                if tif_file is None:
                    continue
                tif_path = os.path.join(tif_folder, tif_file)

                try:
                    with rasterio.open(tif_path) as src:
                        # masked=True returns nodata cells as masked values,
                        # so the raster's nodata sentinel is never mistaken
                        # for a real measurement.
                        for val in src.sample(centroid_point, masked=True):
                            value = val[0]
                            if np.ma.is_masked(value) or np.isnan(value):
                                continue
                            monthly_values.append(float(value))
                except Exception as e:
                    # Best effort: a broken raster only drops that month.
                    print(f"Error processing {tif_file}: {e}")

            # Average over the months that actually produced a value, so a
            # missing or nodata month does not drag the mean towards zero.
            gdf_chunk.at[idx, column] = (
                sum(monthly_values) / len(monthly_values) if monthly_values else np.nan
            )

    output_geojson_path = os.path.join(output_folder, f"df_reforestation_chunk_{i}.geojson")
    gdf_chunk.to_file(output_geojson_path, driver="GeoJSON")
    chunk_paths.append(output_geojson_path)

    del gdf_chunk
    gc.collect()

    print(f"Processed and saved chunk {i} to {output_geojson_path}")

# Combining all chunks into a single GeoDataFrame (one concat instead of
# repeatedly re-copying a growing frame).
combined_gdf = pd.concat(
    [gpd.read_file(chunk_path) for chunk_path in chunk_paths],
    ignore_index=True,
)

# Saving the combined GeoDataFrame to a single file
combined_gdf.to_file(combined_output_path, driver="GeoJSON")
print(f"Combined all chunks into {combined_output_path}")

# Deleting individual chunk files to free up space
for chunk_path in chunk_paths:
    os.remove(chunk_path)
    print(f"Deleted chunk file: {chunk_path}")

### Maximum and Minimum Temperature
1. Download the tmax (maximum temperature) and tmin (minimum temperature) raster data at 2.5-minute resolution to your PC from https://www.worldclim.org/
2. Overlay the polygons and extract the temperature from the centroid pixel of each polygon (site).

In [None]:

# Maximum temperature (tmax) extraction.
#
# For every site polygon that has a planting year, sample the monthly tmax
# rasters at the polygon centroid and store the mean of the valid monthly
# values for the planting year and for 1, 2 and 5 years after it. Results
# are written chunk by chunk and then merged back into
# `combined_output_path` (which is the same file the sites are read from).
geojson_path = "../input/Updated_Reforestation_Data.geojson"
output_folder = "../input/tmax_output"
tif_folder = "../climate_tmax/"
combined_output_path = "../input/Updated_Reforestation_Data.geojson"

# The chunk files are written into this folder; create it up front so the
# first to_file() call cannot fail on a missing directory.
os.makedirs(output_folder, exist_ok=True)

gdf = gpd.read_file(geojson_path)

# Checking for the available .tif files
tif_files = [f for f in os.listdir(tif_folder) if f.endswith(".tif")]
if not tif_files:
    raise FileNotFoundError("No .tif files found in the directory.")

# The first raster's CRS is used as the reference CRS for sampling.
tif_path = os.path.join(tif_folder, tif_files[0])
with rasterio.open(tif_path) as src:
    tif_crs = src.crs

# Reprojecting GeoDataFrame if CRS doesn't match
if gdf.crs != tif_crs:
    gdf = gdf.to_crs(tif_crs)

gdf['planting_year'] = gdf['planting_date_reported'].apply(extract_year).astype('Int64')

# Organizing the .tif files by year and month. The year and month are parsed
# from the last "_"-separated token of the file name, which must look like
# "<year>-<month>.<ext>".
tif_files_by_year = {}
for tif_file in tif_files:
    year_month = tif_file.split("_")[-1].split(".")[0]
    parts = year_month.split("-")
    year, month = int(parts[0]), int(parts[1])
    tif_files_by_year.setdefault(year, {})[month] = tif_file

# Output column for each "years after planting" offset.
offset_columns = {
    0: "avg_tmax_planting_year",
    1: "avg_tmax_1_year_after",
    2: "avg_tmax_2_years_after",
    5: "avg_tmax_5_years_after",
}

# Setting chunk size for processing
chunk_size = 200
chunk_paths = []

for i in range(0, len(gdf), chunk_size):
    gdf_chunk = gdf.iloc[i:i + chunk_size].copy()

    for idx, polygon in gdf_chunk.iterrows():
        planting_year = polygon['planting_year']
        if pd.isna(planting_year):
            continue

        geometry = polygon['geometry']
        if geometry is None or geometry.is_empty:
            # No location to sample: record "no data" instead of aborting
            # the whole run on a single bad feature.
            for column in offset_columns.values():
                gdf_chunk.at[idx, column] = np.nan
            continue

        centroid = geometry.centroid
        centroid_point = [(centroid.x, centroid.y)]

        for year_offset, column in offset_columns.items():
            monthly_files = tif_files_by_year.get(planting_year + year_offset, {})
            monthly_values = []

            for month in range(1, 13):
                tif_file = monthly_files.get(month)
                if tif_file is None:
                    continue
                tif_path = os.path.join(tif_folder, tif_file)

                try:
                    with rasterio.open(tif_path) as src:
                        # masked=True returns nodata cells as masked values,
                        # so the raster's nodata sentinel is never mistaken
                        # for a real measurement.
                        for val in src.sample(centroid_point, masked=True):
                            value = val[0]
                            if np.ma.is_masked(value) or np.isnan(value):
                                continue
                            monthly_values.append(float(value))
                except Exception as e:
                    # Best effort: a broken raster only drops that month.
                    print(f"Error processing {tif_file}: {e}")

            # Average over the months that actually produced a value, so a
            # missing or nodata month does not drag the mean towards zero.
            gdf_chunk.at[idx, column] = (
                sum(monthly_values) / len(monthly_values) if monthly_values else np.nan
            )

    output_geojson_path = os.path.join(output_folder, f"df_reforestation_chunk_{i}.geojson")
    gdf_chunk.to_file(output_geojson_path, driver="GeoJSON")
    chunk_paths.append(output_geojson_path)

    del gdf_chunk
    gc.collect()

    print(f"Processed and saved chunk {i} to {output_geojson_path}")

# Combining all chunks into a single GeoDataFrame (one concat instead of
# repeatedly re-copying a growing frame).
combined_gdf = pd.concat(
    [gpd.read_file(chunk_path) for chunk_path in chunk_paths],
    ignore_index=True,
)

# Saving the combined GeoDataFrame to a single file
combined_gdf.to_file(combined_output_path, driver="GeoJSON")
print(f"Combined all chunks into {combined_output_path}")

# Deleting individual chunk files to free up space
for chunk_path in chunk_paths:
    os.remove(chunk_path)
    print(f"Deleted chunk file: {chunk_path}")

In [None]:

# Minimum temperature (tmin) extraction.
#
# For every site polygon that has a planting year, sample the monthly tmin
# rasters at the polygon centroid and store the mean of the valid monthly
# values for the planting year and for 1, 2 and 5 years after it. Results
# are written chunk by chunk and then merged back into
# `combined_output_path` (which is the same file the sites are read from).
geojson_path = "../input/Updated_Reforestation_Data.geojson"
output_folder = "../input/tmin_output"
tif_folder = "../climate_tmin/"
combined_output_path = "../input/Updated_Reforestation_Data.geojson"

# The chunk files are written into this folder; create it up front so the
# first to_file() call cannot fail on a missing directory.
os.makedirs(output_folder, exist_ok=True)

gdf = gpd.read_file(geojson_path)

# Checking for the available .tif files
tif_files = [f for f in os.listdir(tif_folder) if f.endswith(".tif")]
if not tif_files:
    raise FileNotFoundError("No .tif files found in the directory.")

# The first raster's CRS is used as the reference CRS for sampling.
tif_path = os.path.join(tif_folder, tif_files[0])
with rasterio.open(tif_path) as src:
    tif_crs = src.crs

# Reprojecting GeoDataFrame if CRS doesn't match
if gdf.crs != tif_crs:
    gdf = gdf.to_crs(tif_crs)

# Derive the planting year here as the precipitation and tmax cells do, so
# this cell does not depend on a 'planting_year' column that a previous cell
# happened to write into the input file.
gdf['planting_year'] = gdf['planting_date_reported'].apply(extract_year).astype('Int64')

# Organizing the .tif files by year and month. The year and month are parsed
# from the last "_"-separated token of the file name, which must look like
# "<year>-<month>.<ext>".
tif_files_by_year = {}
for tif_file in tif_files:
    year_month = tif_file.split("_")[-1].split(".")[0]
    parts = year_month.split("-")
    year, month = int(parts[0]), int(parts[1])
    tif_files_by_year.setdefault(year, {})[month] = tif_file

# Output column for each "years after planting" offset.
offset_columns = {
    0: "avg_tmin_planting_year",
    1: "avg_tmin_1_year_after",
    2: "avg_tmin_2_years_after",
    5: "avg_tmin_5_years_after",
}

# Setting chunk size for processing
chunk_size = 200
chunk_paths = []

for i in range(0, len(gdf), chunk_size):
    gdf_chunk = gdf.iloc[i:i + chunk_size].copy()

    for idx, polygon in gdf_chunk.iterrows():
        planting_year = polygon['planting_year']
        if pd.isna(planting_year):
            continue

        geometry = polygon['geometry']
        if geometry is None or geometry.is_empty:
            # No location to sample: record "no data" instead of aborting
            # the whole run on a single bad feature.
            for column in offset_columns.values():
                gdf_chunk.at[idx, column] = np.nan
            continue

        centroid = geometry.centroid
        centroid_point = [(centroid.x, centroid.y)]

        for year_offset, column in offset_columns.items():
            monthly_files = tif_files_by_year.get(planting_year + year_offset, {})
            monthly_values = []

            for month in range(1, 13):
                tif_file = monthly_files.get(month)
                if tif_file is None:
                    continue
                tif_path = os.path.join(tif_folder, tif_file)

                try:
                    with rasterio.open(tif_path) as src:
                        # masked=True returns nodata cells as masked values,
                        # so the raster's nodata sentinel is never mistaken
                        # for a real measurement.
                        for val in src.sample(centroid_point, masked=True):
                            value = val[0]
                            if np.ma.is_masked(value) or np.isnan(value):
                                continue
                            monthly_values.append(float(value))
                except Exception as e:
                    # Best effort: a broken raster only drops that month.
                    print(f"Error processing {tif_file}: {e}")

            # Average over the months that actually produced a value, so a
            # missing or nodata month does not drag the mean towards zero.
            gdf_chunk.at[idx, column] = (
                sum(monthly_values) / len(monthly_values) if monthly_values else np.nan
            )

    output_geojson_path = os.path.join(output_folder, f"df_reforestation_chunk_{i}.geojson")
    gdf_chunk.to_file(output_geojson_path, driver="GeoJSON")
    chunk_paths.append(output_geojson_path)

    del gdf_chunk
    gc.collect()

    print(f"Processed and saved chunk {i} to {output_geojson_path}")

# Combining all chunks into a single GeoDataFrame (one concat instead of
# repeatedly re-copying a growing frame).
combined_gdf = pd.concat(
    [gpd.read_file(chunk_path) for chunk_path in chunk_paths],
    ignore_index=True,
)

# Saving the combined GeoDataFrame to a single file
combined_gdf.to_file(combined_output_path, driver="GeoJSON")
print(f"Combined all chunks into {combined_output_path}")

# Deleting individual chunk files to free up space
for chunk_path in chunk_paths:
    os.remove(chunk_path)
    print(f"Deleted chunk file: {chunk_path}")