In [1]:
import numpy as np
import pandas as pd
import geopandas as gpd
from sklearn.cluster import DBSCAN
from tqdm import tqdm
import matplotlib.pyplot as plt

In [None]:
# Read the reforestation project polygons into a GeoDataFrame.
projects_geojson_path = '/home/idisc02/Forest_Monitoring/src/df_reforestation.geojson'
projects = gpd.read_file(projects_geojson_path)

# Checking for nested Polygons and creating column "Nested_in"

In [None]:
# Record, for every project polygon, the ids of the other polygons it lies within.
# Result: a text column 'Nested_in' holding ', '-joined ids, or None when the
# polygon is not within any other polygon.

# Placeholder column, created before the join as in the earlier version of this
# cell so the join output keeps the same set of columns.
projects['Nested_in'] = [[] for _ in range(len(projects))]

# NOTE: this is an alias, not a copy -- the 'Nested_in' assignment below is also
# visible through `projects`.
filtered_projects = projects

# Self-join: one output row per (polygon, polygon it is within) pair.
# The `op=` keyword was renamed to `predicate=` in geopandas 0.10 and removed in 1.0.
try:
    possible_matches = gpd.sjoin(filtered_projects, filtered_projects, how='left', predicate='within')
except TypeError:
    # Older geopandas releases only accept the legacy keyword.
    possible_matches = gpd.sjoin(filtered_projects, filtered_projects, how='left', op='within')

# Keep only genuine pairs:
#   * a left join leaves index_right = NaN for rows without any partner; those
#     would otherwise survive the "!=" test and end up as 'nan' in 'Nested_in';
#   * every polygon matches itself, which is not nesting.
# Plain arrays are used for the mask so no index alignment is involved (the joined
# frame repeats index labels when a polygon has several partners).
right_index = possible_matches['index_right'].to_numpy()
has_partner = possible_matches['index_right'].notna().to_numpy()
is_other_polygon = possible_matches.index.to_numpy() != right_index
possible_matches = possible_matches[has_partner & is_other_polygon]

# index label -> list of ids of the polygons that contain it
nested_in_mapping = possible_matches.groupby(possible_matches.index)['created_site_ids_right'].apply(list)

# Render each id list as text; polygons missing from the mapping get None.
nested_in_text = {idx: ', '.join(map(str, ids)) for idx, ids in nested_in_mapping.items()}
filtered_projects['Nested_in'] = [nested_in_text.get(idx) for idx in filtered_projects.index]

## Filtering out all Nesting polygons with site_sqkm>=100

In [None]:


# Drop every polygon that contains at least one other polygon and whose
# 'site_sqkm' is 100 or more.

# id -> site_sqkm lookup. Not read in this cell; kept because other cells may use it.
site_sqkm_lookup = filtered_projects.set_index('created_site_ids')['site_sqkm'].to_dict()

# 'Nested_in' stores ', '-joined ids rendered as text, so the ids collected here
# are strings.
# NOTE(review): an id that itself contains ', ' would be split apart here -- confirm
# ids never contain that separator.
nested_ids = filtered_projects['Nested_in'].dropna().str.split(', ').explode().unique()
nested_ids_set = set(nested_ids)

# Compare ids as text as well: with a non-string 'created_site_ids' column the
# membership test against string ids would never match and nothing would be dropped.
is_nesting_polygon = filtered_projects['created_site_ids'].astype(str).isin(nested_ids_set)
is_large = filtered_projects['site_sqkm'] >= 100
rows_to_drop = filtered_projects[is_nesting_polygon & is_large].index

# NOTE(review): 'Nested_in' of the remaining rows is not updated, so it can still
# name polygons removed here.
filtered_projects = filtered_projects.drop(index=rows_to_drop).reset_index(drop=True)



### For Multiple nesting polygons

In [None]:



# Split the rows by how many containers 'Nested_in' lists, then rebuild
# `filtered_projects` from the rows with at most one container plus a subset of
# the rows with several containers.

# id -> site_sqkm lookup. Not read in this cell; kept because other cells may use it.
site_sqkm_lookup = filtered_projects.set_index('created_site_ids')['site_sqkm'].to_dict()


def _count_containers(value):
    """Return how many ids a 'Nested_in' cell lists (0 when the cell is not text)."""
    return len(value.split(', ')) if isinstance(value, str) else 0


container_counts = filtered_projects['Nested_in'].apply(_count_containers)

single_or_no_nested_rows = filtered_projects[container_counts <= 1]

print(f"Rows with single or no 'Nested_in': {single_or_no_nested_rows.shape[0]}")

multi_nested_rows = filtered_projects[container_counts > 1]

# Ids (as text) of every polygon named as a container by a multi-nested row.
multi_nested_ids = multi_nested_rows['Nested_in'].str.split(', ').explode().unique()

print(f"Rows with multiple values in 'Nested_in': {multi_nested_rows.shape[0]}")

# Rows describing those containers. Ids are compared as text because
# 'Nested_in' stores them as text.
relevant_rows = filtered_projects[
    filtered_projects['created_site_ids'].astype(str).isin(multi_nested_ids)
]

# One row per container id: the row with the largest 'site_sqkm'.
# Selecting through idxmax avoids groupby.apply, whose handling of the grouping
# column is deprecated; once that column is excluded from the applied frame the
# 'created_site_ids' column needed by the merge below would be missing.
largest_row_labels = relevant_rows.groupby('created_site_ids')['site_sqkm'].idxmax()
max_site_sqkm_rows = relevant_rows.loc[largest_row_labels].reset_index(drop=True)

# NOTE(review): only the id column of max_site_sqkm_rows is used, so this keeps the
# multi-nested rows whose OWN id is also a container id of some multi-nested row;
# every other multi-nested row is left out of the final frame. The area maximum
# does not influence the result. Confirm this is the intended selection.
rows_to_keep_from_multi_nested = multi_nested_rows.merge(
    max_site_sqkm_rows[['created_site_ids']],
    on='created_site_ids'
).drop_duplicates()

print(f"Rows to keep from multi-nested rows: {rows_to_keep_from_multi_nested.shape[0]}")

filtered_projects = pd.concat([single_or_no_nested_rows, rows_to_keep_from_multi_nested]).drop_duplicates().reset_index(drop=True)

print(f"Final number of rows: {filtered_projects.shape[0]}")


In [None]:

# Turn list-valued cells into their text representation, then write the frame
# to a GeoPackage file.


def _list_to_text(value):
    """Return str(value) for lists and the value itself for anything else."""
    if isinstance(value, list):
        return str(value)
    return value


# Every non-geometry column that holds at least one list.
columns_with_lists = []
for col in filtered_projects.columns:
    if col == 'geometry':
        continue
    contains_list = filtered_projects[col].apply(lambda cell: isinstance(cell, list)).any()
    if contains_list:
        columns_with_lists.append(col)

for col in columns_with_lists:
    filtered_projects[col] = filtered_projects[col].apply(_list_to_text)

output_path = '/home/idisc02/Forest_Monitoring/new_filtered_nested_mult.gpkg'

filtered_projects.to_file(output_path, driver="GPKG")

### Checking for intersecting polygons

In [None]:
# Find which polygons intersect each other.
#   * 'Intersecting_with' (column): ', '-joined ids of the intersecting polygons that
#     come LATER in the frame, or None when there are none.
#   * intersections_dict: row position -> ids of all intersecting polygons, in both
#     directions.
#   * num_intersecting_pairs: number of intersecting pairs, each counted once.

# A fresh 0..n-1 index makes row labels equal to row positions. The code below
# relies on that: the spatial index reports positions, while row.name and the
# keys of intersections_dict are labels.
filtered_projects = gpd.GeoDataFrame(filtered_projects, geometry='geometry').reset_index(drop=True)

spatial_index = filtered_projects.sindex

intersections_dict = {i: [] for i in filtered_projects.index}

# Must exist before find_intersections runs: the function increments it through
# `global`, which raised NameError on a fresh kernel when it was never initialised.
num_intersecting_pairs = 0


def find_intersections(row):
    """Return the ids of the polygons after `row` that intersect it.

    Parameters
    ----------
    row : pandas.Series
        One row of `filtered_projects`; `row.name` is its position in the frame.

    Returns
    -------
    list
        'created_site_ids' of every intersecting polygon with a larger position.
        Empty when the row has no usable geometry.

    Side effects
    ------------
    Appends this row's id to `intersections_dict[j]` for every hit `j` and adds
    one to the global `num_intersecting_pairs` per hit.
    """
    global num_intersecting_pairs

    geometry = row.geometry
    if geometry is None or geometry.is_empty:
        # Nothing to query the spatial index with.
        return []

    intersecting_ids = []
    # Bounding-box candidates first, exact geometry test afterwards.
    for j in spatial_index.intersection(geometry.bounds):
        if row.name >= j:
            # Skip the row itself and pairs already handled from the other side.
            continue

        other_row = filtered_projects.iloc[j]
        if geometry.intersects(other_row.geometry):
            intersecting_ids.append(other_row['created_site_ids'])
            intersections_dict[j].append(row['created_site_ids'])
            num_intersecting_pairs += 1
    return intersecting_ids


# Explicit loop instead of DataFrame.apply(axis=1): apply may reshape list results
# instead of returning one list per row.
forward_matches = [find_intersections(row) for _, row in filtered_projects.iterrows()]

# Complete the symmetric view with each row's own forward matches.
for idx, intersecting_ids in enumerate(forward_matches):
    intersections_dict[idx] += intersecting_ids

# map(str, ...) so non-string ids do not break the join.
filtered_projects['Intersecting_with'] = [
    ', '.join(map(str, ids)) if ids else None for ids in forward_matches
]

print(filtered_projects[['created_site_ids', 'Intersecting_with']].head())
print(f"Number of intersecting pairs: {num_intersecting_pairs}")