In [1]:
import numpy as np
import pandas as pd
import geopandas as gpd
from sklearn.cluster import DBSCAN
from tqdm import tqdm
import matplotlib.pyplot as plt
from helper_functions import  clean_data

In [5]:
# Read the reforestation GeoJSON file into `projects`.
# The path is relative, so it resolves against the kernel's current working directory.
projects = gpd.read_file("../input/Updated_Reforestation_Data.geojson")

# Checking for nested polygons and creating the "Nested_in" column

In [7]:


projects['Nested_in'] = [[] for _ in range(len(projects))]

filtered_projects = projects

# Performing spatial join with the correct predicate argument
possible_matches = gpd.sjoin(filtered_projects, filtered_projects, how='left', predicate='within')

# Filtering out self-joins
possible_matches = possible_matches[possible_matches.index != possible_matches.index_right]

# Groupping by index and create a list of nested project IDs
nested_in_mapping = possible_matches.groupby(possible_matches.index)['created_site_ids_right'].apply(list)

# Updating the 'Nested_in' column with the nested project IDs
for index, nested_in in nested_in_mapping.items():
    filtered_projects.at[index, 'Nested_in'] = nested_in


filtered_projects['Nested_in'] = filtered_projects['Nested_in'].apply(lambda x: ', '.join(map(str, x)) if x else None)

### Optional step: filtering out all nesting polygons with site_sqkm >= 100

In [8]:


# site_sqkm_lookup = filtered_projects.set_index('created_site_ids')['site_sqkm'].to_dict()


# nested_ids = filtered_projects['Nested_in'].dropna().apply(lambda x: x.split(', ')).explode().unique()


# nested_ids_set = set(nested_ids)


# rows_to_drop = filtered_projects[
#     filtered_projects['created_site_ids'].isin(nested_ids_set) & 
#     (filtered_projects['site_sqkm'] >= 100)
# ].index


# filtered_projects = filtered_projects.drop(index=rows_to_drop).reset_index(drop=True)



### Optional cleanup and filtering step for polygons nested in multiple polygons

In [9]:

# def clean_data(value):
#     if isinstance(value, dict):
#         return str(value)
#     if isinstance(value, list):
#         return str(value)
#     return value if not pd.isna(value) else None


# for column in filtered_projects.columns:
 
#     filtered_projects[column] = filtered_projects[column].apply(clean_data)

# for column in filtered_projects.columns:
#     non_string_mask = filtered_projects[column].apply(lambda x: isinstance(x, dict) or not isinstance(x, (str, int, float, type(None))))
#     if non_string_mask.any():
#         print(f"Column {column} contains problematic entries after cleanup:")
#         print(filtered_projects[non_string_mask])
#     else:
#         print(f"Column {column} is clean.")


# single_or_no_nested_rows = filtered_projects[
#     filtered_projects['Nested_in'].apply(lambda x: isinstance(x, str) and len(x.split(', ')) == 1 or pd.isna(x))
# ]
# print(f"Rows with single or no 'Nested_in': {single_or_no_nested_rows.shape[0]}")


# multi_nested_rows = filtered_projects[
#     filtered_projects['Nested_in'].apply(lambda x: isinstance(x, str) and len(x.split(', ')) > 1)
# ]
# print(f"Rows with multiple values in 'Nested_in': {multi_nested_rows.shape[0]}")


# multi_nested_ids = multi_nested_rows['Nested_in'].str.split(', ').explode().unique()


# relevant_rows = filtered_projects[
#     filtered_projects['created_site_ids'].isin(multi_nested_ids)
# ]


# max_site_sqkm_rows = relevant_rows.groupby('created_site_ids').apply(
#     lambda x: x.loc[x['site_sqkm'].idxmax()]
# ).reset_index(drop=True)


# try:
#     rows_to_keep_from_multi_nested = multi_nested_rows.merge(
#         max_site_sqkm_rows[['created_site_ids']],
#         on='created_site_ids'
#     ).drop_duplicates()
#     print(f"Rows to keep from multi-nested rows: {rows_to_keep_from_multi_nested.shape[0]}")
# except Exception as e:
#     print(f"Error during merge: {e}")
#     rows_to_keep_from_multi_nested = pd.DataFrame()

# try:
#     filtered_projects = pd.concat([single_or_no_nested_rows, rows_to_keep_from_multi_nested]).drop_duplicates().reset_index(drop=True)
#     print(f"Final number of rows: {filtered_projects.shape[0]}")
# except Exception as e:
#     print(f"Error during concatenation: {e}")


Column id is clean.
Column country is clean.
Column description_reported is clean.
Column host_name is clean.
Column planting_date_reported is clean.
Column project_id_reported is clean.
Column site_id_reported is clean.
Column site_sqkm is clean.
Column survival_rate_reported is clean.
Column trees_planted_reported is clean.
Column url is clean.
Column geometry_reported is clean.
Column Top_Three_NDVI_Months is clean.
Column species_planted_reported is clean.
Column species_count_reported is clean.
Column built_area is clean.
Column intersecting_roads_count is clean.
Column total_road_length_km is clean.
Column loss_pre_5 is clean.
Column loss_post_3 is clean.
Column loss_post_5 is clean.
Column tree_cover_area_2020 is clean.
Column tree_cover_area_2015 is clean.
Column tree_cover_area_2010 is clean.
Column tree_cover_area_2005 is clean.
Column tree_cover_area_2000 is clean.
Column cropland gain from trees is clean.
Column cropland loss to tree is clean.
Column permanent water is clea

  max_site_sqkm_rows = relevant_rows.groupby('created_site_ids').apply(


Rows to keep from multi-nested rows: 228
Final number of rows: 772494


### Checking for intersecting polygons and recording the area of each pairwise intersection

In [14]:



spatial_index = filtered_projects.sindex

filtered_projects['Intersecting_with'] = None
filtered_projects['Intersection_Area_sqkm'] = None

intersections_dict = {i: [] for i in filtered_projects.index}

def find_intersections(row):
    global num_intersecting_pairs
    possible_matches_index = list(spatial_index.intersection(row.geometry.bounds))
    intersecting_ids = []
    intersecting_areas = []
    
    for j in possible_matches_index:
        if row.name >= j:
            continue  
        
        other_row = filtered_projects.iloc[j]
        if row.geometry.intersects(other_row.geometry):
            intersecting_ids.append(other_row['created_site_ids'])
            intersections_dict[j].append(row['created_site_ids'])
      
            intersection_geom = row.geometry.intersection(other_row.geometry)
            intersection_area_sqkm = intersection_geom.area / 1e6 
            intersecting_areas.append(intersection_area_sqkm)
            
            num_intersecting_pairs += 1
    return intersecting_ids, intersecting_areas


filtered_projects[['Intersecting_with', 'Intersection_Area_sqkm']] = filtered_projects.apply(
    lambda row: pd.Series(find_intersections(row)), axis=1)


for idx, (intersecting_ids, _) in enumerate(zip(filtered_projects['Intersecting_with'], filtered_projects['Intersection_Area_sqkm'])):
    intersections_dict[idx] += intersecting_ids


filtered_projects['Intersection_Area_sqkm'] = filtered_projects['Intersection_Area_sqkm'].apply(
    lambda x: ', '.join(map(str, x)) if x else None)


print(filtered_projects[['created_site_ids', 'Intersecting_with', 'Intersection_Area_sqkm']].head())
print(f"Number of intersecting pairs: {num_intersecting_pairs}")


  created_site_ids  Intersecting_with Intersection_Area_sqkm
0  reforest_site_1  [reforest_site_2]   0.054014828436071013
1  reforest_site_2                 []                   None
2  reforest_site_3                 []                   None
3  reforest_site_4                 []                   None
4  reforest_site_5                 []                   None
Number of intersecting pairs: 1131480


In [None]:
# Print a summary of the frame (columns, dtypes, non-null counts), including the columns added above.
filtered_projects.info()