In [None]:
import geopandas as gpd
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from helper_functions import extract_planting_date,get_nearest_tree_cover,check_intersection
# Relative path of the consolidated reforestation-projects parquet file.
# NOTE(review): "cicular" looks like a misspelling of "circular"; the string
# has to match the file name on disk, so it is left untouched here.
file_path = "../midsave/newest_consolidated_reforestation_projects_with_cicular.parquet"


# Read the parquet file with geopandas; the cells below derive their frames from merged_df.
merged_df= gpd.read_parquet(file_path)


In [None]:
# The vegetation-index columns all follow the pattern <index>_<zone>_<period>.
# Only the <period> suffix is rewritten, so the 30 old -> new pairs are
# generated from the three components instead of being listed one by one.
_period_labels = {
    "atplanting": "At_Planting",
    "1yr_before": "1YearBefore",
    "1yr_after": "1YearAfter",
    "2yr_after": "2YearsAfter",
    "5yr_after": "5YearsAfter",
}
_column_renames = {
    f"{index_name}_{zone}_{old_suffix}": f"{index_name}_{zone}_{new_suffix}"
    for index_name in ("NDVI", "NDRE", "SAVI")
    for zone in ("Polygon", "Buffer")
    for old_suffix, new_suffix in _period_labels.items()
}

# Columns that are absent from the frame are ignored by rename().
merged_df = merged_df.rename(columns=_column_renames)

In [None]:
# NDVI columns that must all be populated for a row to be kept.
columns_to_check = [
    "NDVI_Polygon_At_Planting",
    "NDVI_Polygon_1YearAfter",
    "NDVI_Polygon_2YearsAfter",
    # "NDVI_Polygon_5YearsAfter"
]

# Build an explicit row mask: True only where every required column is non-null.
has_all_readings = merged_df[columns_to_check].notna().all(axis=1)
filtered_data = merged_df[has_all_readings]
filtered_data.info()

### Columns considered in the data quality framework


1. Site_sqkm
2. Road presence
3. Nested_in
4. Intersecting_with
5. Built area presence
6. Forest at planting
7. Presence of other land cover classes
8. Administrative area overlap
9. Circular shape indicator
10. Invalid geometries indicator
11. Elevation Indicator
12. Difference between buffer change and polygon area change




In [None]:

# Work on a copy so that merged_df keeps the data as loaded and renamed.
merged_data = merged_df.copy()

# Each reported planting date is first passed through the project helper
# extract_planting_date, then coerced to timezone-aware (UTC) timestamps.
# Values that cannot be parsed become NaT because of errors='coerce'.
parsed_planting_dates = pd.to_datetime(
    merged_data['planting_date_reported'].apply(extract_planting_date),
    errors='coerce',
    utc=True,
)
merged_data['planting_date_reported'] = parsed_planting_dates

# Calendar year of planting, taken from the parsed timestamps.
merged_data['PlantingYear'] = parsed_planting_dates.dt.year

merged_data.info()


In [None]:
# NOTE(review): `years` is not referenced by any cell visible in this notebook.
# Presumably these are the years for which tree-cover layers exist — confirm
# whether the list is still needed.
years = [2000, 2005, 2010, 2015, 2020]




# Call the project helper get_nearest_tree_cover once per row (axis=1) and store
# its result. How the helper picks a value is defined in helper_functions and is
# not visible here.
merged_data['treecover_atplanting'] = merged_data.apply(get_nearest_tree_cover, axis=1)

merged_data.head()


In [None]:
# Columns needed for the quality assessment: site/project identifiers, the raw
# measurements each indicator is derived from, and the geometry itself.
columns_of_interest = [
    'site_id_created', 'site_id_reported', 'project_id_reported',
    'site_sqkm', 'trees_planted_reported',
    'Intersecting_with', 'Nested_in',
    'built_area_2018', 'total_road_length_km',
    "loss_post_3", "loss_post_5", "loss_pre_5",
    "planting_date_reported", "other_land_cover_area_2020",
    'treecover_atplanting', "mean_elevation",
    "geometry", "Polygon_acircle_oval_95", "exact-admin_area",
]

# Take an explicit copy. The following cells add indicator columns to
# filtered_df, and assigning into a bare column subset of merged_data raises
# pandas' SettingWithCopyWarning (and leaves it ambiguous which frame is
# being modified).
filtered_df = merged_data[columns_of_interest].copy()
filtered_df.info()



# Intersecting and nested polygons as indicators of data quality and completeness

In [None]:
def _normalize_site_id(raw_id):
    """Return a site ID in canonical integer-string form, or None if it is not numeric.

    "12", "12.0" and "1.2e1" all normalise to "12", so IDs compare equal
    regardless of whether they were stored as integers, floats or text.
    """
    try:
        # Exact path for plain integer text; avoids float rounding on large IDs.
        return str(int(raw_id))
    except (ValueError, TypeError, OverflowError):
        pass
    try:
        return str(int(float(raw_id)))
    except (ValueError, TypeError, OverflowError):
        # Non-numeric text, empty tokens, "nan" and "inf" all end up here.
        return None


# Collect every ID mentioned in the comma-separated 'Nested_in' column.
nested_values = set()
for val in filtered_df['Nested_in'].dropna():
    nested_values.update(token.strip() for token in str(val).split(','))

# Canonicalise the collected IDs; tokens that are not numeric (for example the
# empty string left by a trailing comma) are dropped.
converted_nested = {
    normalized
    for normalized in map(_normalize_site_id, nested_values)
    if normalized is not None
}


def _canonical_or_raw(site_id_text):
    """Canonicalise a site ID, falling back to the raw text when it is not numeric."""
    normalized = _normalize_site_id(site_id_text)
    return site_id_text if normalized is None else normalized


# Put the site IDs through the SAME normalisation before the lookup. A plain
# astype(str) turns a float-typed ID into "12.0", which would never match the
# "12" stored in converted_nested.
site_ids_canonical = filtered_df['site_id_created'].astype(str).map(_canonical_or_raw)

# 'Yes' when at least one other site lists this site in its 'Nested_in' value.
filtered_df['Contains_small_polygon'] = site_ids_canonical.apply(
    lambda site_id: 'Yes' if site_id in converted_nested else 'No'
)

print(filtered_df['Contains_small_polygon'].value_counts())

In [None]:


# Intersection indicator: produced by the project helper check_intersection,
# applied to each 'Intersecting_with' value.
filtered_df['Intersecting Polygon'] = filtered_df['Intersecting_with'].apply(check_intersection)

# Nesting indicator: 1 when the site does NOT contain a smaller polygon, 0 when it does.
contains_no_nested = filtered_df['Contains_small_polygon'].eq('No')
filtered_df['Nesting Polygon'] = contains_no_nested.astype(int)

filtered_df['Intersecting Polygon'].value_counts()


# Presence of Forest at Planting
Vegetation already present at the reported planting date suggests that either the planting date or the geometry boundaries were reported incorrectly.

In [None]:
# Forest-at-planting indicator, evaluated in priority order:
#   NaN -> tree cover at planting is unknown
#   0   -> tree cover at planting is at least 0.2 * site_sqkm
#   1   -> otherwise
tree_cover = filtered_df["treecover_atplanting"]
already_forested = tree_cover >= 0.2 * filtered_df["site_sqkm"]

filtered_df["forest_at_planting_glad"] = np.select(
    [tree_cover.isna(), already_forested],
    [np.nan, 0],
    default=1,
)
filtered_df["forest_at_planting_glad"].value_counts()


# Presence of Other Land Cover Score

In [None]:

# Other-land-cover indicator:
#   NaN -> other-land-cover area is unknown
#   0   -> other land cover is at least 0.2 * site_sqkm
#   1   -> otherwise
# NOTE(review): an earlier comment on this cell described the threshold as 10%,
# but the code uses 0.2 — confirm which value is intended.
other_cover_area = filtered_df['other_land_cover_area_2020']
exceeds_share = other_cover_area >= 0.2 * filtered_df['site_sqkm']

filtered_df['other_landcover_score'] = np.where(
    other_cover_area.isna(),
    np.nan,
    np.where(exceeds_share, 0, 1),
)
filtered_df['other_landcover_score'].value_counts()


# Infrastructure Presence
Roads and built area

In [None]:
def _road_presence_score(road_length_km):
    """Score one road length: 0 if any road is present, 1 if none, NaN otherwise."""
    if road_length_km > 0:
        return 0
    if road_length_km == 0:
        return 1
    # Reached for NaN (every comparison is False) and for negative lengths.
    return np.nan


filtered_df["Road_presence"] = filtered_df["total_road_length_km"].apply(_road_presence_score)

filtered_df["Road_presence"].value_counts()

Built Area Presence

In [None]:

built_area = filtered_df["built_area_2018"]
site_area = filtered_df["site_sqkm"]

# Built-up area divided by the site area.
ratio = built_area / site_area

# Built-area indicator, evaluated in priority order:
#   NaN -> either input is missing
#   0   -> built-up area is at least 10% of the site
#   1   -> otherwise
either_missing = built_area.isna() | site_area.isna()
filtered_df["Built_area_presence"] = np.select(
    [either_missing, ratio >= 0.10],
    [np.nan, 0],
    default=1,
)

filtered_df["Built_area_presence"].value_counts()

# Invalid Project Geometries

In [None]:

# Per-row validity flag as reported by geopandas.
geometry_is_valid = filtered_df.geometry.is_valid

# Boolean flag: True where the geometry is NOT valid.
filtered_df['project_geometries_invalid'] = ~geometry_is_valid

print(filtered_df['project_geometries_invalid'].value_counts())

# Score form of the same information: 1 for a valid geometry, 0 for an invalid one.
filtered_df['geometry validity'] = geometry_is_valid.astype(int)
filtered_df['geometry validity'].value_counts()

## Circularity

In [None]:

# 1 when 'Polygon_acircle_oval_95' is exactly 'No', 0 for any other value
# (including missing values).
not_flagged_as_circle = filtered_df['Polygon_acircle_oval_95'].eq('No')
filtered_df['perfect_circle_indicator'] = not_flagged_as_circle.astype(int)
filtered_df['perfect_circle_indicator'].value_counts()

# Exact Admin area

In [None]:

# 1 when 'exact-admin_area' is exactly 'No', 0 for any other value
# (including missing values).
not_exact_admin_area = filtered_df['exact-admin_area'].eq('No')
filtered_df['admin_area_indicator'] = not_exact_admin_area.astype(int)
filtered_df['admin_area_indicator'].value_counts()

## Elevation Indicator

In [None]:
# Elevation cut-off; mean elevations strictly below it score 1, everything else 0.
threshold = 1500

# NOTE(review): a missing mean_elevation compares False and therefore scores 0,
# unlike the road and built-area indicators, which keep NaN — confirm that is intended.
below_threshold = filtered_df["mean_elevation"].lt(threshold)
filtered_df['Elevation_indicator'] = below_threshold.astype(int)
filtered_df['Elevation_indicator'].value_counts()

In [None]:

# Indicator columns that are summed into the quality score (nine in total).
# The elevation indicator is deliberately left out.
score_columns = [
    'forest_at_planting_glad',
    # "Elevation_indicator",
    "other_landcover_score",
    'Built_area_presence',
    'Road_presence',
    "geometry validity",
    'admin_area_indicator',
    'perfect_circle_indicator',
    'Intersecting Polygon',
    'Nesting Polygon',
]



In [None]:



# Row-wise total of the indicator columns, rounded to two decimals.
# DataFrame.sum skips NaN by default, so an indicator that is missing for a
# site adds nothing to that site's score (the same as an indicator of 0).
indicator_totals = filtered_df[score_columns].sum(axis=1)
filtered_df['quality_score'] = indicator_totals.round(2)
filtered_df.head()


In [None]:
# Keep the rows whose site_sqkm is at most 5 (rows with a missing value are dropped).
# NOTE(review): despite the variable name, this filter removes the LARGE sites
# and keeps the smaller ones — confirm the name or the comparison.
is_within_size_limit = filtered_df["site_sqkm"] <= 5
filtered_large_sites = filtered_df[is_within_size_limit]
filtered_large_sites.info()

In [None]:
# Histogram of the quality score for the size-filtered sites, one bar per whole score.

# Score range shown on the x axis.
# NOTE(review): a score of 0 is possible when every indicator is 0; it lies
# below the first bin edge and would not be drawn — confirm this is acceptable.
global_min = 1
global_max = 9

# Bin edges sit on the half-integers so each bar is centred on a whole score.
bins = np.arange(global_min - 0.5, global_max + 1.5, 1)

fig, ax = plt.subplots(figsize=(8, 5))
ax.hist(
    filtered_large_sites['quality_score'],
    bins=bins,
    color="#08519c",
    edgecolor='black',
    alpha=0.6,
)

# plt.title('Distribution of Quality Scores', fontsize=16)
ax.set_xlabel('LDIS', fontsize=16)
ax.set_ylabel('Frequency', fontsize=16)

# One tick per whole score, with half a unit of padding on each side.
ax.set_xlim(global_min - 0.5, global_max + 0.5)
plt.xticks(np.arange(global_min, global_max + 1), fontsize=14)
plt.yticks(fontsize=14)

# plt.grid(axis='y', linestyle='--', alpha=0.7)
# Drop the top and right frame lines for a cleaner look.
for side in ('top', 'right'):
    ax.spines[side].set_visible(False)
plt.show()
