In [1]:
import numpy as np
import pandas as pd
import geopandas as gpd
from sklearn.cluster import DBSCAN
from tqdm import tqdm
from shapely import wkb
import matplotlib.pyplot as plt
from helper_functions import  clean_data



In [2]:


file_path = "/Users/angela/Documents/GFW/Forest_Monitoring/midsave/consolidated_reforestation_projects.parquet"


projects = gpd.read_parquet(file_path)


projects .head()

Unnamed: 0,site_id_created,project_id_reported,site_id_reported,site_description_reported,site_sqkm,trees_planted_reported,country,project_description_reported,planting_date_reported,survival_rate_reported,host_name,url,species_count_reported,species_planted_reported,geometry,Creator,project_id_created
0,0,proj_ezpAp1POh20dBnYpx0BjhU35,site_W97pqKxXURFOA1E,Farm for the Future demonstration plot,0.013591,313.0,br,This project will be implemented at Farm of th...,,80.0,Planet for the Planet,https://www.plant-for-the-planet.org/,,,"POLYGON ((-49.95883 -9.35107, -49.95976 -9.351...",,0
1,1,proj_ZCspL8JYmUu0OXcx6O73I1j0,site_NekKEGqkIO4rZ5C,The area to be reforested is around the Tinguá...,0.631388,3418.0,br,"ITPA was born in 1998, from the initiative of ...",2012.0,90.0,Planet for the Planet,https://www.plant-for-the-planet.org/,,,"POLYGON ((-43.4725 -22.48945, -43.47236 -22.48...",,1
2,2,proj_ZCspL8JYmUu0OXcx6O73I1j0,site_Wl3hF91IBkei1Xy,The area to be reforested is around the Tinguá...,3.076566,3418.0,br,"ITPA was born in 1998, from the initiative of ...",2012.0,90.0,Planet for the Planet,https://www.plant-for-the-planet.org/,,,"POLYGON ((-43.462 -22.4779, -43.46583 -22.4875...",,1
3,3,proj_ZCspL8JYmUu0OXcx6O73I1j0,site_qHUXswEmePqou5T,The area to be reforested is around the Tinguá...,0.30486,3418.0,br,"ITPA was born in 1998, from the initiative of ...",2012.0,90.0,Planet for the Planet,https://www.plant-for-the-planet.org/,,,"POLYGON ((-43.46833 -22.4919, -43.46834 -22.49...",,1
4,4,proj_nXBzA2sbX2tm1D75p7bfJ81Z,site_2ITLGnOa3jbDUFa,Plant-for-Ghana is a pioneer reforestation pro...,10.375493,43814.0,gh,Plant-for-Ghana is a hybrid restoration agrofo...,2021.0,93.0,Planet for the Planet,https://www.plant-for-the-planet.org/,,,"POLYGON ((-2.01902 8.21743, -2.02027 8.2264, -...",,2


# Checking for nested Polygons and creating column "Nested_in"

In [3]:
projects.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 1229175 entries, 0 to 1229174
Data columns (total 17 columns):
 #   Column                        Non-Null Count    Dtype   
---  ------                        --------------    -----   
 0   site_id_created               1229175 non-null  int64   
 1   project_id_reported           1229175 non-null  object  
 2   site_id_reported              1229175 non-null  object  
 3   site_description_reported     1696 non-null     object  
 4   site_sqkm                     1229175 non-null  float64 
 5   trees_planted_reported        4349 non-null     float64 
 6   country                       5030 non-null     object  
 7   project_description_reported  1228611 non-null  object  
 8   planting_date_reported        4821 non-null     float64 
 9   survival_rate_reported        2514 non-null     float64 
 10  host_name                     1229175 non-null  object  
 11  url                           1229175 non-null  object  
 12  specie

In [5]:

invalid_geometries = projects[~projects.is_valid]

# Fixing any invalid geometries
projects['geometry'] = projects['geometry'].buffer(0)

# Initializing the 'Nested_in' column
projects['Nested_in'] = [[] for _ in range(len(projects))]

filtered_projects = projects

# Performing spatial join with the correct predicate argument
possible_matches = gpd.sjoin(filtered_projects, filtered_projects, how='left', predicate='within')

# Filtering out self-joins
possible_matches = possible_matches[possible_matches.index != possible_matches.index_right]

# Grouping by index and create a list of nested project IDs
nested_in_mapping = possible_matches.groupby(possible_matches.index)['site_id_created_right'].apply(list)

# Updating the 'Nested_in' column with the nested project IDs
for index, nested_in in nested_in_mapping.items():
    filtered_projects.at[index, 'Nested_in'] = nested_in


filtered_projects['Nested_in'] = filtered_projects['Nested_in'].apply(lambda x: ', '.join(map(str, x)) if x else None)


print(filtered_projects.head())

   site_id_created            project_id_reported      site_id_reported  \
0                0  proj_ezpAp1POh20dBnYpx0BjhU35  site_W97pqKxXURFOA1E   
1                1  proj_ZCspL8JYmUu0OXcx6O73I1j0  site_NekKEGqkIO4rZ5C   
2                2  proj_ZCspL8JYmUu0OXcx6O73I1j0  site_Wl3hF91IBkei1Xy   
3                3  proj_ZCspL8JYmUu0OXcx6O73I1j0  site_qHUXswEmePqou5T   
4                4  proj_nXBzA2sbX2tm1D75p7bfJ81Z  site_2ITLGnOa3jbDUFa   

                           site_description_reported  site_sqkm  \
0             Farm for the Future demonstration plot   0.013591   
1  The area to be reforested is around the Tinguá...   0.631388   
2  The area to be reforested is around the Tinguá...   3.076566   
3  The area to be reforested is around the Tinguá...   0.304860   
4  Plant-for-Ghana is a pioneer reforestation pro...  10.375493   

   trees_planted_reported country  \
0                   313.0      br   
1                  3418.0      br   
2                  3418.0      br  

In [6]:
filtered_projects.head()

Unnamed: 0,site_id_created,project_id_reported,site_id_reported,site_description_reported,site_sqkm,trees_planted_reported,country,project_description_reported,planting_date_reported,survival_rate_reported,host_name,url,species_count_reported,species_planted_reported,geometry,Creator,project_id_created,Nested_in
0,0,proj_ezpAp1POh20dBnYpx0BjhU35,site_W97pqKxXURFOA1E,Farm for the Future demonstration plot,0.013591,313.0,br,This project will be implemented at Farm of th...,,80.0,Planet for the Planet,https://www.plant-for-the-planet.org/,,,"POLYGON ((-49.95883 -9.35107, -49.95866 -9.352...",,0,983967.0
1,1,proj_ZCspL8JYmUu0OXcx6O73I1j0,site_NekKEGqkIO4rZ5C,The area to be reforested is around the Tinguá...,0.631388,3418.0,br,"ITPA was born in 1998, from the initiative of ...",2012.0,90.0,Planet for the Planet,https://www.plant-for-the-planet.org/,,,"POLYGON ((-43.4725 -22.48945, -43.47236 -22.48...",,1,900848.0
2,2,proj_ZCspL8JYmUu0OXcx6O73I1j0,site_Wl3hF91IBkei1Xy,The area to be reforested is around the Tinguá...,3.076566,3418.0,br,"ITPA was born in 1998, from the initiative of ...",2012.0,90.0,Planet for the Planet,https://www.plant-for-the-planet.org/,,,"POLYGON ((-43.462 -22.4779, -43.46583 -22.4875...",,1,900848.0
3,3,proj_ZCspL8JYmUu0OXcx6O73I1j0,site_qHUXswEmePqou5T,The area to be reforested is around the Tinguá...,0.30486,3418.0,br,"ITPA was born in 1998, from the initiative of ...",2012.0,90.0,Planet for the Planet,https://www.plant-for-the-planet.org/,,,"POLYGON ((-43.46833 -22.4919, -43.46834 -22.49...",,1,900848.0
4,4,proj_nXBzA2sbX2tm1D75p7bfJ81Z,site_2ITLGnOa3jbDUFa,Plant-for-Ghana is a pioneer reforestation pro...,10.375493,43814.0,gh,Plant-for-Ghana is a hybrid restoration agrofo...,2021.0,93.0,Planet for the Planet,https://www.plant-for-the-planet.org/,,,"POLYGON ((-2.01902 8.21743, -2.02027 8.2264, -...",,2,


In [7]:

non_null_nested_in = filtered_projects[filtered_projects['Nested_in'].notnull()]


non_null_nested_in.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Index: 430267 entries, 0 to 1229171
Data columns (total 18 columns):
 #   Column                        Non-Null Count   Dtype   
---  ------                        --------------   -----   
 0   site_id_created               430267 non-null  int64   
 1   project_id_reported           430267 non-null  object  
 2   site_id_reported              430267 non-null  object  
 3   site_description_reported     201 non-null     object  
 4   site_sqkm                     430267 non-null  float64 
 5   trees_planted_reported        506 non-null     float64 
 6   country                       939 non-null     object  
 7   project_description_reported  430125 non-null  object  
 8   planting_date_reported        572 non-null     float64 
 9   survival_rate_reported        311 non-null     float64 
 10  host_name                     430267 non-null  object  
 11  url                           430267 non-null  object  
 12  species_count_reported    

In [8]:

pd.options.display.float_format = '{:,.3f}'.format

bins = [0, 10, 50, 100, 500, 1000, 2000, 5000, filtered_projects['site_sqkm'].max()]
labels = ['<10', '10-50', '50-100', '100-500', '500-1000', '1000-2000', '2000-5000', '>5000']

filtered_projects['site_sqkm_range'] = pd.cut(filtered_projects['site_sqkm'], bins=bins, labels=labels, right=False)
frequency_table = filtered_projects['site_sqkm_range'].value_counts().sort_index().to_frame(name='Count')

total_count = frequency_table['Count'].sum()
frequency_table['Percentage'] = (frequency_table['Count'] / total_count) * 100

nested_areas = []
total_areas = []
for range_label in labels:
    range_projects = filtered_projects[filtered_projects['site_sqkm_range'] == range_label]
    nested_projects = range_projects[range_projects['Nested_in'].notnull()]
    nested_area = nested_projects['site_sqkm'].sum()
    nested_areas.append(nested_area)
    total_area = range_projects['site_sqkm'].sum()
    total_areas.append(total_area)

frequency_table['Nested_area'] = nested_areas
frequency_table['Total_area'] = total_areas

# Calculating the percentage of total area for each range
total_area_sum = sum(total_areas)
frequency_table['Percentage_area'] = (frequency_table['Total_area'] / total_area_sum) * 100

# Calculating the percentage of nested area for each range
frequency_table['Percentage_nested_area'] = (frequency_table['Nested_area'] / frequency_table['Total_area']) * 100

frequency_table

Unnamed: 0_level_0,Count,Percentage,Nested_area,Total_area,Percentage_area,Percentage_nested_area
site_sqkm_range,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
<10,1225267,99.682,6858.107,53188.845,1.166,12.894
10-50,1423,0.116,4302.275,28885.354,0.633,14.894
50-100,232,0.019,2799.136,16500.936,0.362,16.963
100-500,636,0.052,24145.68,156765.923,3.438,15.402
500-1000,254,0.021,1908.861,183050.17,4.014,1.043
1000-2000,637,0.052,0.0,851289.896,18.668,0.0
2000-5000,488,0.04,0.0,1622356.473,35.577,0.0
>5000,237,0.019,0.0,1648053.725,36.141,0.0


### Intersecting Polgons checking and adding the area of the polygon intersecting with the other

In [11]:
filtered_projects = filtered_projects.dropna(subset=['geometry'])

spatial_index = filtered_projects.sindex

filtered_projects['Intersecting_with'] = None
filtered_projects['Intersection_Area_sqkm'] = None
intersections_dict = {i: [] for i in filtered_projects.index}
num_intersecting_pairs = 0

def find_intersections(row):
    global num_intersecting_pairs
    if row.geometry is None or not row.geometry.is_valid:
        return [], []
    
    possible_matches_index = list(spatial_index.intersection(row.geometry.bounds))
    intersecting_ids = []
    intersecting_areas = []
    
    for j in possible_matches_index:
        if row.name >= j:
            continue  
        
        other_row = filtered_projects.iloc[j]
        if other_row.geometry is None or not other_row.geometry.is_valid:
            continue
        
        if row.geometry.intersects(other_row.geometry):
            intersecting_ids.append(other_row['site_id_created'])
            intersections_dict[j].append(row['site_id_created'])
      
            intersection_geom = row.geometry.intersection(other_row.geometry)
            intersection_area_sqkm = intersection_geom.area / 1e6 
            intersecting_areas.append(intersection_area_sqkm)
            
            num_intersecting_pairs += 1
    return intersecting_ids, intersecting_areas

filtered_projects[['Intersecting_with', 'Intersection_Area_sqkm']] = filtered_projects.apply(
    lambda row: pd.Series(find_intersections(row)), axis=1)

for idx, intersecting_ids in filtered_projects['Intersecting_with'].items():
    if intersecting_ids:  # Checking if the list is not empty
        intersections_dict[idx] += intersecting_ids

filtered_projects['Intersection_Area_sqkm'] = filtered_projects['Intersection_Area_sqkm'].apply(
    lambda x: ', '.join(map(str, x)) if x else None)

print(filtered_projects[['site_id_created', 'Intersecting_with', 'Intersection_Area_sqkm']].head())
print(f"Number of intersecting pairs: {num_intersecting_pairs}")

   site_id_created Intersecting_with  \
0                0          [983967]   
1                1       [900848, 2]   
2                2       [900848, 3]   
3                3          [900848]   
4                4                []   

                           Intersection_Area_sqkm  
0                           1.082189494094502e-12  
1   4.707428100000711e-11, 4.8000000258555815e-17  
2  2.2939758700001258e-10, 1.4999999917151865e-17  
3                          2.2730217500009304e-11  
4                                            None  
Number of intersecting pairs: 1238071


In [12]:
filtered_projects.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Index: 1229174 entries, 0 to 1229174
Data columns (total 21 columns):
 #   Column                        Non-Null Count    Dtype   
---  ------                        --------------    -----   
 0   site_id_created               1229174 non-null  int64   
 1   project_id_reported           1229174 non-null  object  
 2   site_id_reported              1229174 non-null  object  
 3   site_description_reported     1695 non-null     object  
 4   site_sqkm                     1229174 non-null  float64 
 5   trees_planted_reported        4348 non-null     float64 
 6   country                       5029 non-null     object  
 7   project_description_reported  1228610 non-null  object  
 8   planting_date_reported        4821 non-null     float64 
 9   survival_rate_reported        2513 non-null     float64 
 10  host_name                     1229174 non-null  object  
 11  url                           1229174 non-null  object  
 12  species_cou