# Consolidating and preprocessing source data

In this script, we combine all the sites from the different reforestation projects from different organization to one dataset and do some initial filtering.

In [1]:
import geopandas as gpd
import pandas as pd
import numpy as np
import pycountry
from shapely import wkt
from shapely.geometry import MultiPolygon, Polygon
import geopandas as gpd

### Importing provider data

#### Plant_planet_data


We extracted the data from https://www.plant-for-the-planet.org as described in the script 'Plant_Planet_Meta_Data_preprocessing.ipynb'.

In [2]:
df_plant = gpd.read_file("../midsave/plant_for_planet.gpkg")
df_plant.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 3382 entries, 0 to 3381
Data columns (total 17 columns):
 #   Column                        Non-Null Count  Dtype   
---  ------                        --------------  -----   
 0   project_id_reported           3382 non-null   object  
 1   site_id_reported              3382 non-null   object  
 2   site_description_reported     1972 non-null   object  
 3   site_status_reported          3380 non-null   object  
 4   site_sqkm                     3382 non-null   float64 
 5   trees_planted_reported        3367 non-null   object  
 6   country                       3382 non-null   object  
 7   project_description_reported  3382 non-null   object  
 8   planting_date_reported        2889 non-null   float64 
 9   survival_rate_reported        2445 non-null   object  
 10  host_name                     3382 non-null   object  
 11  url                           3382 non-null   object  
 12  species_count_reported        0 non-null

#### Tree_Nation

We extracted and filtered  the data from https://tree-nation.com/projects as described in the script 'Tree_Nation-meta_data_pre.ipynb'.

In [3]:
df_tn = gpd.read_file("../midsave/tree_nation.gpkg")
df_tn.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 1273 entries, 0 to 1272
Data columns (total 14 columns):
 #   Column                        Non-Null Count  Dtype   
---  ------                        --------------  -----   
 0   planting_date_reported        1273 non-null   int64   
 1   project_description_reported  1273 non-null   object  
 2   site_id_reported              1273 non-null   int64   
 3   url                           1273 non-null   object  
 4   project_id_reported           1273 non-null   int64   
 5   site_sqkm                     1273 non-null   float64 
 6   planting_date_type            1273 non-null   object  
 7   host_name                     1273 non-null   object  
 8   species_count_reported        0 non-null      object  
 9   species_planted_reported      0 non-null      object  
 10  country                       0 non-null      object  
 11  survival_rate_reported        0 non-null      object  
 12  project_geometries_invalid    1273 non-n

#### Open_Forest_protocol

We extracted and filtered  the data from https://atlas.openforestprotocol.org/  as described in the script  "open_forest_projests_Data_filtering.ipynb".

In [4]:
df_atlas = gpd.read_file("../midsave/atlas.gpkg")
df_atlas.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 268 entries, 0 to 267
Data columns (total 14 columns):
 #   Column                        Non-Null Count  Dtype   
---  ------                        --------------  -----   
 0   project_id_reported           268 non-null    object  
 1   site_id_reported              268 non-null    object  
 2   project_description_reported  268 non-null    object  
 3   site_sqkm                     268 non-null    float64 
 4   country                       267 non-null    object  
 5   host_name                     268 non-null    object  
 6   url                           268 non-null    object  
 7   species_count_reported        0 non-null      object  
 8   species_planted_reported      0 non-null      object  
 9   planting_date_reported        0 non-null      object  
 10  survival_rate_reported        0 non-null      object  
 11  trees_planted_reported        0 non-null      object  
 12  project_geometries_invalid    268 non-null

#### Verra

We extracted and filtered  the data from https://registry.verra.org  as described in the script  "extracting_verra_sites.ipynb".

In [5]:
df_verra = gpd.read_parquet("../midsave/verra.parquet")
df_verra.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 1225162 entries, 0 to 1225161
Data columns (total 15 columns):
 #   Column                        Non-Null Count    Dtype   
---  ------                        --------------    -----   
 0   site_id_reported              1225162 non-null  int64   
 1   project_id_reported           1225162 non-null  int64   
 2   project_description_reported  1225162 non-null  object  
 3   site_sqkm                     1225162 non-null  float64 
 4   status_reported               1225162 non-null  object  
 5   country                       0 non-null        object  
 6   url                           1225162 non-null  object  
 7   host_name                     1225162 non-null  object  
 8   species_count_reported        0 non-null        object  
 9   species_planted_reported      0 non-null        object  
 10  survival_rate_reported        0 non-null        object  
 11  geometry                      1225159 non-null  geometry
 12  planti

In [6]:
df_verra.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 1225162 entries, 0 to 1225161
Data columns (total 15 columns):
 #   Column                        Non-Null Count    Dtype   
---  ------                        --------------    -----   
 0   site_id_reported              1225162 non-null  int64   
 1   project_id_reported           1225162 non-null  int64   
 2   project_description_reported  1225162 non-null  object  
 3   site_sqkm                     1225162 non-null  float64 
 4   status_reported               1225162 non-null  object  
 5   country                       0 non-null        object  
 6   url                           1225162 non-null  object  
 7   host_name                     1225162 non-null  object  
 8   species_count_reported        0 non-null        object  
 9   species_planted_reported      0 non-null        object  
 10  survival_rate_reported        0 non-null        object  
 11  geometry                      1225159 non-null  geometry
 12  planti

In [7]:
df_verra["project_geometries_invalid"].value_counts()

project_geometries_invalid
False    1225159
True           3
Name: count, dtype: int64

In [8]:
folder_path="../midsave/Date_evaluation"

#### Restor.eco

We extracted and filtered  the data from  https://restor.eco/?lat=10.743821093825016&lng=4.473759981496621&zoom=4 as described in the script  "restor.ipynb".

In [9]:
df_restor = gpd.read_file("../midsave/restor_eco.gpkg")
df_restor.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 54880 entries, 0 to 54879
Data columns (total 14 columns):
 #   Column                        Non-Null Count  Dtype   
---  ------                        --------------  -----   
 0   project_id_reported           54880 non-null  object  
 1   project_description_reported  34283 non-null  object  
 2   planting_date_reported        53032 non-null  object  
 3   url                           54880 non-null  object  
 4   site_sqkm                     54880 non-null  float64 
 5   country                       54880 non-null  object  
 6   project_geometries_invalid    54880 non-null  bool    
 7   site_id_reported              54880 non-null  object  
 8   host_name                     54880 non-null  object  
 9   species_count_reported        0 non-null      object  
 10  species_planted_reported      0 non-null      object  
 11  survival_rate_reported        0 non-null      object  
 12  trees_planted_reported        0 non-nu

#### Explorer Data

This data is extracted from the projects website https://explorer.land/x/projects as described in the script 'explorer_land.ipynb'. The column names were manually edited.

In [10]:
df_ex = gpd.read_file("../midsave/explorer_land.gpkg")
df_ex.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 3243 entries, 0 to 3242
Data columns (total 10 columns):
 #   Column                        Non-Null Count  Dtype   
---  ------                        --------------  -----   
 0   site_id_created               3243 non-null   int64   
 1   site_id_reported              3243 non-null   object  
 2   project_id_reported           3243 non-null   object  
 3   site_description_reported     1801 non-null   object  
 4   project_description_reported  3241 non-null   object  
 5   host_name                     3243 non-null   object  
 6   url                           3243 non-null   object  
 7   site_sqkm                     3243 non-null   float64 
 8   project_geometries_invalid    3243 non-null   bool    
 9   geometry                      3243 non-null   geometry
dtypes: bool(1), float64(1), geometry(1), int64(1), object(6)
memory usage: 231.3+ KB


In [11]:
df_ex["project_geometries_invalid"].value_counts()

project_geometries_invalid
False    3243
Name: count, dtype: int64

#### Face the Future

We extracted  the data from  https://facethefuture.com/#projects described in the script 'Face_Future_metadata_prepro.ipynb'.

In [12]:
df_ftf = gpd.read_file("../midsave/face_the_future.gpkg")
df_ftf.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 568 entries, 0 to 567
Data columns (total 17 columns):
 #   Column                        Non-Null Count  Dtype   
---  ------                        --------------  -----   
 0   project_id_reported           94 non-null     object  
 1   site_id_reported              568 non-null    int64   
 2   trees_planted_reported        94 non-null     float64 
 3   site_sqkm_reported            94 non-null     float64 
 4   planting_date_reported        94 non-null     float64 
 5   project_description_reported  2 non-null      object  
 6   Creator                       94 non-null     object  
 7   site_sqkm                     568 non-null    float64 
 8   host_name                     568 non-null    object  
 9   url                           568 non-null    object  
 10  project_geometries_invalid    568 non-null    bool    
 11  planting_date_type            568 non-null    object  
 12  species_count_reported        0 non-null  

#### Climate Partner Impact
We extracted the data as described in the script 'climate_partner_impact.ipynb'.

In [13]:
df_cpi = gpd.read_file("../midsave/climate_partner_impact.gpkg")
df_cpi.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 7 entries, 0 to 6
Data columns (total 13 columns):
 #   Column                        Non-Null Count  Dtype   
---  ------                        --------------  -----   
 0   site_id_reported              0 non-null      object  
 1   project_id_reported           7 non-null      object  
 2   country                       0 non-null      object  
 3   url                           7 non-null      object  
 4   host_name                     7 non-null      object  
 5   project_description_reported  0 non-null      object  
 6   site_sqkm                     0 non-null      object  
 7   species_count_reported        0 non-null      object  
 8   species_planted_reported      0 non-null      object  
 9   survival_rate_reported        0 non-null      object  
 10  trees_planted_reported        0 non-null      object  
 11  planting_date_reported        0 non-null      object  
 12  geometry                      7 non-null      

#### Forest Trends
We extracted the data as described in the script 'forest_trends.ipynb'.

In [14]:
df_ft = gpd.read_file("../midsave/forest_trends.gpkg")
df_ft.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 277 entries, 0 to 276
Data columns (total 15 columns):
 #   Column                        Non-Null Count  Dtype   
---  ------                        --------------  -----   
 0   project_id_reported           277 non-null    object  
 1   site_sqkm_reported            246 non-null    float64 
 2   project_description_reported  277 non-null    object  
 3   country                       277 non-null    object  
 4   host_name                     277 non-null    object  
 5   url                           277 non-null    object  
 6   project_geometries_invalid    277 non-null    bool    
 7   site_sqkm                     0 non-null      object  
 8   species_count_reported        0 non-null      object  
 9   species_planted_reported      0 non-null      object  
 10  survival_rate_reported        0 non-null      object  
 11  trees_planted_reported        0 non-null      object  
 12  site_id_reported              0 non-null  

#### One Tree Planted
We extracted the data as described in the script 'one_tree_planted.ipynb'.

In [15]:
df_otp = gpd.read_file("../midsave/one_tree_planted.gpkg")
df_otp.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 819 entries, 0 to 818
Data columns (total 14 columns):
 #   Column                        Non-Null Count  Dtype   
---  ------                        --------------  -----   
 0   project_id_reported           819 non-null    int64   
 1   project_description_reported  0 non-null      object  
 2   url                           819 non-null    object  
 3   host_name                     819 non-null    object  
 4   project_geometries_invalid    819 non-null    bool    
 5   site_sqkm                     0 non-null      object  
 6   species_count_reported        0 non-null      object  
 7   species_planted_reported      0 non-null      object  
 8   survival_rate_reported        0 non-null      object  
 9   trees_planted_reported        0 non-null      object  
 10  planting_date_reported        0 non-null      object  
 11  country                       0 non-null      object  
 12  site_id_reported              0 non-null  

#### Reforestaction
We extracted the data as described in the script 'reforestaction.ipynb'.

In [16]:
df_reforestaction = gpd.read_file("../midsave/reforestaction.gpkg")
df_reforestaction.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 1100 entries, 0 to 1099
Data columns (total 14 columns):
 #   Column                        Non-Null Count  Dtype   
---  ------                        --------------  -----   
 0   project_id_reported           1100 non-null   int64   
 1   site_id_reported              1100 non-null   int64   
 2   host_name                     1100 non-null   object  
 3   url                           1100 non-null   object  
 4   project_geometries_invalid    1100 non-null   bool    
 5   project_description_reported  0 non-null      object  
 6   site_sqkm                     0 non-null      object  
 7   species_count_reported        0 non-null      object  
 8   species_planted_reported      0 non-null      object  
 9   survival_rate_reported        0 non-null      object  
 10  trees_planted_reported        0 non-null      object  
 11  planting_date_reported        0 non-null      object  
 12  country                       0 non-null

#### Reforestum
We extracted the data as described in the script 'reforestum.ipynb'.

In [17]:
df_reforestum = gpd.read_file("../midsave/reforestum.gpkg")
df_reforestum.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 3691 entries, 0 to 3690
Data columns (total 14 columns):
 #   Column                        Non-Null Count  Dtype   
---  ------                        --------------  -----   
 0   project_id_reported           3691 non-null   object  
 1   project_description_reported  2082 non-null   object  
 2   country                       0 non-null      object  
 3   site_sqkm                     0 non-null      object  
 4   url                           3691 non-null   object  
 5   host_name                     3691 non-null   object  
 6   project_geometries_invalid    3691 non-null   bool    
 7   species_count_reported        0 non-null      object  
 8   species_planted_reported      0 non-null      object  
 9   survival_rate_reported        0 non-null      object  
 10  trees_planted_reported        0 non-null      object  
 11  planting_date_reported        0 non-null      object  
 12  site_id_reported              0 non-null

#### Trees.org
We extracted the data as described in the script 'trees_org.ipynb'.

In [18]:
df_to = gpd.read_file("../midsave/trees_org.gpkg")
df_to.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 86 entries, 0 to 85
Data columns (total 15 columns):
 #   Column                        Non-Null Count  Dtype   
---  ------                        --------------  -----   
 0   project_id_reported           86 non-null     int64   
 1   project_description_reported  86 non-null     object  
 2   country                       86 non-null     object  
 3   planting_date_reported        86 non-null     int32   
 4   url                           86 non-null     object  
 5   planting_date_type            86 non-null     object  
 6   host_name                     86 non-null     object  
 7   project_geometries_invalid    86 non-null     bool    
 8   site_sqkm                     0 non-null      object  
 9   species_count_reported        0 non-null      object  
 10  species_planted_reported      0 non-null      object  
 11  survival_rate_reported        0 non-null      object  
 12  trees_planted_reported        0 non-null    

#### Veritree
We extracted the data as described in the script 'verritree.ipynb'.

In [19]:
df_vt = gpd.read_file("../midsave/veritree.gpkg")
df_vt.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 122 entries, 0 to 121
Data columns (total 14 columns):
 #   Column                        Non-Null Count  Dtype   
---  ------                        --------------  -----   
 0   site_id_reported              122 non-null    int64   
 1   project_id_reported           122 non-null    int64   
 2   project_description_reported  122 non-null    object  
 3   country                       122 non-null    object  
 4   survival_rate_reported        122 non-null    int64   
 5   species_planted_reported      122 non-null    object  
 6   host_name                     122 non-null    object  
 7   project_geometries_invalid    122 non-null    bool    
 8   url                           122 non-null    object  
 9   site_sqkm                     122 non-null    float64 
 10  species_count_reported        0 non-null      object  
 11  trees_planted_reported        0 non-null      object  
 12  planting_date_reported        0 non-null  

In [20]:
df_vt["site_sqkm"].sum()

1436972.5645987687

#### Zero CO2
We extracted the data as described in the script 'zero_co2.ipynb'.

In [21]:
df_zc = gpd.read_file("../midsave/zero_co2.gpkg")
df_zc.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 7 entries, 0 to 6
Data columns (total 15 columns):
 #   Column                        Non-Null Count  Dtype   
---  ------                        --------------  -----   
 0   project_id_reported           7 non-null      object  
 1   site_id_reported              7 non-null      object  
 2   project_description_reported  7 non-null      object  
 3   country                       0 non-null      object  
 4   planting_date_reported        7 non-null      int32   
 5   url                           7 non-null      object  
 6   host_name                     7 non-null      object  
 7   project_geometries_invalid    7 non-null      bool    
 8   site_sqkm                     0 non-null      object  
 9   species_count_reported        0 non-null      object  
 10  species_planted_reported      0 non-null      object  
 11  survival_rate_reported        0 non-null      object  
 12  trees_planted_reported        0 non-null      

In [22]:
df_zc["geometry"]

0     POINT (-89.7276 16.6798)
1       POINT (-91.603 14.097)
2      POINT (36.5974 -3.3363)
3      POINT (12.5324 41.9028)
4       POINT (9.6131 40.9965)
5    POINT (-69.1935 -12.5892)
6    POINT (-71.3322 -42.9407)
Name: geometry, dtype: geometry

 #### Gold Standards
We extracted the data as described in the script 'gold_standards.ipynb'.

In [23]:
df_gs = gpd.read_file('../midsave/gold_standards.gpkg')
df_gs.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 71 entries, 0 to 70
Data columns (total 14 columns):
 #   Column                        Non-Null Count  Dtype   
---  ------                        --------------  -----   
 0   project_id_reported           71 non-null     int64   
 1   country                       71 non-null     object  
 2   project_description_reported  71 non-null     object  
 3   project_pdf_available         71 non-null     bool    
 4   url                           71 non-null     object  
 5   host_name                     71 non-null     object  
 6   site_sqkm                     0 non-null      object  
 7   species_count_reported        0 non-null      object  
 8   species_planted_reported      0 non-null      object  
 9   survival_rate_reported        0 non-null      object  
 10  trees_planted_reported        0 non-null      object  
 11  planting_date_reported        0 non-null      object  
 12  site_id_reported              0 non-null    

 #### American Carbon Registry
We extracted the data as described in the script 'american_carbon_registry.ipynb'.

In [24]:
df_acr= gpd.read_file('../midsave/american_carbon_registry.gpkg')
df_acr.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 7 entries, 0 to 6
Data columns (total 13 columns):
 #   Column                    Non-Null Count  Dtype   
---  ------                    --------------  -----   
 0   project_id_created        7 non-null      int64   
 1   project_id_reported       7 non-null      object  
 2   country                   7 non-null      object  
 3   planting_date_reported    0 non-null      object  
 4   project_pdf_available     7 non-null      bool    
 5   planting_date_type        7 non-null      object  
 6   site_id_created           7 non-null      int64   
 7   site_sqkm                 0 non-null      object  
 8   species_count_reported    0 non-null      object  
 9   species_planted_reported  0 non-null      object  
 10  survival_rate_reported    0 non-null      object  
 11  trees_planted_reported    0 non-null      object  
 12  geometry                  0 non-null      geometry
dtypes: bool(1), geometry(1), int64(2), object(9)
m

 #### Climate Action Reserve
We extracted the data as described in the script 'climate_reserve.ipynb'.

In [54]:
df_car= gpd.read_file('../midsave/climate_action_reserve.gpkg')
df_car.info()
# Ensure the geometry column is not None
if df_car.geometry.isna().any():
    print("There are missing geometries in the dataframe.")
else:
    # Apply the lambda function to get geometry types
    geometry_types = df_car.geometry.apply(lambda geom: geom.geom_type)
    geometry_counts = geometry_types.value_counts()

    print(geometry_counts)

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 513 entries, 0 to 512
Data columns (total 12 columns):
 #   Column                      Non-Null Count  Dtype   
---  ------                      --------------  -----   
 0   site_id_created             513 non-null    int64   
 1   project_id_reported         513 non-null    object  
 2   planting_date_reported      513 non-null    object  
 3   site_sqkm                   504 non-null    float64 
 4   project_geometries_invalid  504 non-null    object  
 5   url                         513 non-null    object  
 6   project_pdf_available       513 non-null    bool    
 7   species_count_reported      0 non-null      object  
 8   species_planted_reported    0 non-null      object  
 9   survival_rate_reported      0 non-null      object  
 10  trees_planted_reported      0 non-null      object  
 11  geometry                    504 non-null    geometry
dtypes: bool(1), float64(1), geometry(1), int64(1), object(8)
memory usage:

In [56]:
df_car["site_sqkm"].sum()

1683.1142200452955

### Combining the datasets

In [26]:

geo_dfs = [df_plant, df_tn, df_verra, df_atlas, df_restor, df_ftf, df_ex, df_cpi, df_ft, df_otp, df_reforestaction, df_reforestum, df_to, df_vt, df_zc, df_gs, df_acr, df_car]


target_crs = 'EPSG:4326'

for i, gdf in enumerate(geo_dfs):
    if gdf.crs != target_crs:
        geo_dfs[i] = gdf.to_crs(target_crs)


gdf = pd.concat(geo_dfs, ignore_index=True).drop(columns=['site_status_reported', 'status_reported'])

gdf.info()

  gdf = pd.concat(geo_dfs, ignore_index=True).drop(columns=['site_status_reported', 'status_reported'])


<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 1295476 entries, 0 to 1295475
Data columns (total 21 columns):
 #   Column                        Non-Null Count    Dtype   
---  ------                        --------------    -----   
 0   project_id_reported           1295002 non-null  object  
 1   site_id_reported              1290005 non-null  object  
 2   site_description_reported     3773 non-null     object  
 3   site_sqkm                     1289402 non-null  float64 
 4   trees_planted_reported        3461 non-null     object  
 5   country                       59092 non-null    object  
 6   project_description_reported  1270256 non-null  object  
 7   planting_date_reported        1249358 non-null  object  
 8   survival_rate_reported        2567 non-null     object  
 9   host_name                     1294956 non-null  object  
 10  url                           1295469 non-null  object  
 11  species_count_reported        0 non-null        object  
 12  specie

In [27]:
# Filter rows with null values in the 'geometry' column
null_geometry_rows = gdf[gdf['geometry'].isnull()]

null_geometry_rows.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Index: 149 entries, 617 to 1295475
Data columns (total 21 columns):
 #   Column                        Non-Null Count  Dtype   
---  ------                        --------------  -----   
 0   project_id_reported           149 non-null    object  
 1   site_id_reported              62 non-null     object  
 2   site_description_reported     4 non-null      object  
 3   site_sqkm                     62 non-null     float64 
 4   trees_planted_reported        4 non-null      object  
 5   country                       82 non-null     object  
 6   project_description_reported  133 non-null    object  
 7   planting_date_reported        67 non-null     object  
 8   survival_rate_reported        4 non-null      object  
 9   host_name                     133 non-null    object  
 10  url                           142 non-null    object  
 11  species_count_reported        0 non-null      object  
 12  species_planted_reported      0 non-null 

### Data cleanup

In [28]:
gdf_filtered = gdf.copy().reset_index(drop = True)
gdf_filtered.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 1295476 entries, 0 to 1295475
Data columns (total 21 columns):
 #   Column                        Non-Null Count    Dtype   
---  ------                        --------------    -----   
 0   project_id_reported           1295002 non-null  object  
 1   site_id_reported              1290005 non-null  object  
 2   site_description_reported     3773 non-null     object  
 3   site_sqkm                     1289402 non-null  float64 
 4   trees_planted_reported        3461 non-null     object  
 5   country                       59092 non-null    object  
 6   project_description_reported  1270256 non-null  object  
 7   planting_date_reported        1249358 non-null  object  
 8   survival_rate_reported        2567 non-null     object  
 9   host_name                     1294956 non-null  object  
 10  url                           1295469 non-null  object  
 11  species_count_reported        0 non-null        object  
 12  specie

Drop duplicate geographies

In [29]:
gdf_filtered.drop_duplicates('geometry', inplace = True)
gdf_filtered.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Index: 1133170 entries, 0 to 1295470
Data columns (total 21 columns):
 #   Column                        Non-Null Count    Dtype   
---  ------                        --------------    -----   
 0   project_id_reported           1132698 non-null  object  
 1   site_id_reported              1128303 non-null  object  
 2   site_description_reported     3695 non-null     object  
 3   site_sqkm                     1127724 non-null  float64 
 4   trees_planted_reported        3409 non-null     object  
 5   country                       56586 non-null    object  
 6   project_description_reported  1110631 non-null  object  
 7   planting_date_reported        1110475 non-null  object  
 8   survival_rate_reported        2522 non-null     object  
 9   host_name                     1132666 non-null  object  
 10  url                           1133170 non-null  object  
 11  species_count_reported        0 non-null        object  
 12  species_pla

Clean up country names

In [30]:
gdf_filtered.country.unique()

array(['BR', 'GH', 'VN', 'CO', 'UG', 'MZ', 'US', 'MW', 'ES', 'AE', 'KE',
       'MG', 'GB', 'MX', 'NP', 'ET', 'PH', 'BO', 'NA', 'IN', 'AR', 'HN',
       'EG', 'CA', 'TZ', 'IR', 'ID', 'UA', 'TH', 'AU', 'AM', 'IE', 'BD',
       'CL', 'NG', 'CI', 'JO', 'CD', 'CM', 'EC', 'CR', 'DE', 'GT', 'TW',
       'BE', 'ZA', 'HT', 'CZ', 'NI', 'PA', 'FR', 'BI', 'IT', 'SL', 'PE',
       'DK', 'ZM', 'TG', 'MN', 'ZW', 'NO', None, 'us', 'in', 'ke', 'gt',
       'ch', 'gb', 'tz', 'pt', 'lt', 'co', 'cr', 'cg', 'mx', 'au', 'th',
       'it', 'de', 'pe', 'ni', 'rs', 'ug', 'za', 'mg', 'bi', 'vn', 'br',
       'mw', 'gh', 'KI', 'GL', 'DO', 'BZ', 'VE', 'PR', 'JM', 'BQ', 'BS',
       'BB', 'SV', 'CU', 'AN', 'AW', 'PY', 'UY', 'PT', 'IS', 'FO', 'GN',
       'LR', 'GM', 'MR', 'SN', 'BF', 'ML', 'MA', 'SE', 'RO', 'RU', 'AT',
       'GE', 'TR', 'PL', 'SI', 'BA', 'HU', 'SK', 'HR', 'RS', 'BG', 'CH',
       'EE', 'NL', 'NE', 'TD', 'LB', 'GR', 'DZ', 'SS', 'BJ', 'SA', 'IL',
       'IQ', 'MT', 'SD', 'TN', 'YE', 'ER', 'CF', 'C

In [31]:
country_to_iso2 = {}

for country in pycountry.countries:
    country_to_iso2[country.alpha_2.lower()] = country.alpha_2.lower()
    country_to_iso2[country.name.lower()] = country.alpha_2.lower()

gdf_filtered['country'] = gdf_filtered['country'].apply(lambda country: country_to_iso2.get(country.lower(), None) if pd.notna(country) else None)

Fix datatypes

In [32]:
gdf_filtered['site_id_reported'] = gdf_filtered['site_id_reported'].astype(str)
gdf_filtered['project_id_reported'] = gdf_filtered['project_id_reported'].astype(str)
gdf_filtered['trees_planted_reported'] = gdf_filtered['trees_planted_reported'].astype(float)
gdf_filtered['survival_rate_reported'] = gdf_filtered['survival_rate_reported'].astype(float)


In [33]:
gdf_filtered["geometry"][:400]

0      POLYGON ((-49.95883 -9.35107, -49.95976 -9.351...
1      POLYGON ((-43.4725 -22.48945, -43.47236 -22.48...
2      POLYGON ((-43.462 -22.4779, -43.46583 -22.4875...
3      POLYGON ((-43.46833 -22.4919, -43.46834 -22.49...
4      POLYGON ((-1.99088 8.22041, -1.99354 8.23531, ...
                             ...                        
399    POLYGON ((-99.79492 20.06158, -99.79477 20.061...
400    POLYGON ((-99.52344 19.92507, -99.52355 19.925...
401    POLYGON ((-100.12657 19.33874, -100.12582 19.3...
402    POLYGON ((-99.93197 19.20798, -99.93201 19.208...
403    POLYGON ((-100.1359 19.44393, -100.13585 19.44...
Name: geometry, Length: 400, dtype: geometry

In [34]:


def split_multipolygon(row):
    geom = row['geometry']
    if geom is None:
        return [{**row.to_dict(), 'geometry': None}]
    if isinstance(geom, str):
        geom = wkt.loads(geom)
    if isinstance(geom, MultiPolygon):
        return [{**row.to_dict(), 'geometry': poly.wkt} for poly in geom.geoms]
    else:
        return [{**row.to_dict(), 'geometry': geom.wkt}]

expanded_rows = gdf_filtered.apply(split_multipolygon, axis=1).explode().reset_index(drop=True)
expanded_gdf = gpd.GeoDataFrame(expanded_rows.tolist())

expanded_gdf.head()

Unnamed: 0,project_id_reported,site_id_reported,site_description_reported,site_sqkm,trees_planted_reported,country,project_description_reported,planting_date_reported,survival_rate_reported,host_name,...,species_count_reported,species_planted_reported,planting_date_type,project_geometries_invalid,geometry,site_sqkm_reported,Creator,site_id_created,project_pdf_available,project_id_created
0,proj_ezpAp1POh20dBnYpx0BjhU35,site_W97pqKxXURFOA1E,Farm for the Future demonstration plot,0.013591,313.0,br,This project will be implemented at Farm of th...,,80.0,Planet for the Planet,...,,,planting_date,False,POLYGON ((-49.958833158016205 -9.3510733412391...,,,,,
1,proj_ZCspL8JYmUu0OXcx6O73I1j0,site_NekKEGqkIO4rZ5C,The area to be reforested is around the Tinguá...,0.631388,3573.0,br,"ITPA was born in 1998, from the initiative of ...",2012.0,90.0,Planet for the Planet,...,,,planting_date,False,"POLYGON ((-43.472501 -22.489448, -43.472365 -2...",,,,,
2,proj_ZCspL8JYmUu0OXcx6O73I1j0,site_Wl3hF91IBkei1Xy,The area to be reforested is around the Tinguá...,3.076566,3573.0,br,"ITPA was born in 1998, from the initiative of ...",2012.0,90.0,Planet for the Planet,...,,,planting_date,False,"POLYGON ((-43.462002 -22.477901, -43.46583 -22...",,,,,
3,proj_ZCspL8JYmUu0OXcx6O73I1j0,site_qHUXswEmePqou5T,The area to be reforested is around the Tinguá...,0.30486,3573.0,br,"ITPA was born in 1998, from the initiative of ...",2012.0,90.0,Planet for the Planet,...,,,planting_date,False,"POLYGON ((-43.468334 -22.491903, -43.468336 -2...",,,,,
4,proj_nXBzA2sbX2tm1D75p7bfJ81Z,site_2ITLGnOa3jbDUFa,Plant-for-Ghana is a pioneer reforestation pro...,2.720114,49319.0,gh,Plant-for-Ghana is a hybrid restoration agrofo...,2021.0,93.0,Planet for the Planet,...,,,planting_date,False,POLYGON ((-1.9908797494838661 8.22041325747265...,,,,,


Create unique site and project ids

In [35]:

expanded_gdf['project_id_created'] = pd.factorize(expanded_gdf['project_id_reported'].astype(str) + '-' + expanded_gdf['host_name'].astype(str))[0]

In [36]:

if 'site_id_created' in expanded_gdf.columns:
    expanded_gdf.drop(columns=['site_id_created'], inplace=True)


expanded_gdf.reset_index(names='site_id_created', inplace=True)

In [37]:
expanded_gdf.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 1289644 entries, 0 to 1289643
Data columns (total 21 columns):
 #   Column                        Non-Null Count    Dtype  
---  ------                        --------------    -----  
 0   site_id_created               1289644 non-null  int64  
 1   project_id_reported           1289644 non-null  object 
 2   site_id_reported              1289644 non-null  object 
 3   site_description_reported     3695 non-null     object 
 4   site_sqkm                     1284198 non-null  float64
 5   trees_planted_reported        3409 non-null     float64
 6   country                       56606 non-null    object 
 7   project_description_reported  1267105 non-null  object 
 8   planting_date_reported        1266869 non-null  object 
 9   survival_rate_reported        2578 non-null     float64
 10  host_name                     1289140 non-null  object 
 11  url                           1289644 non-null  object 
 12  species_count_report

In [38]:

expanded_gdf['geometry'] = expanded_gdf['geometry'].apply(wkt.loads)  

In [39]:

is_geodataframe = isinstance(expanded_gdf, gpd.GeoDataFrame)

print(f"Is expanded_gdf a GeoDataFrame? {is_geodataframe}")

Is expanded_gdf a GeoDataFrame? True


In [40]:


expanded_gdf = expanded_gdf[expanded_gdf['geometry'].notnull()]


points_gdf = expanded_gdf[expanded_gdf['geometry'].apply(lambda geom: geom.type == 'Point')]

# points_gdf['derived_geometry'] = points_gdf['geometry'].buffer(100)


points_gdf.info()

Given a GeoSeries 's', you can use '~s.is_empty & s.notna()' to get back the old behaviour.

  return self.notna()
  points_gdf = expanded_gdf[expanded_gdf['geometry'].apply(lambda geom: geom.type == 'Point')]


<class 'geopandas.geodataframe.GeoDataFrame'>
Index: 87814 entries, 16912 to 1289139
Data columns (total 21 columns):
 #   Column                        Non-Null Count  Dtype   
---  ------                        --------------  -----   
 0   site_id_created               87814 non-null  int64   
 1   project_id_reported           87814 non-null  object  
 2   site_id_reported              87814 non-null  object  
 3   site_description_reported     0 non-null      object  
 4   site_sqkm                     82368 non-null  float64 
 5   trees_planted_reported        0 non-null      float64 
 6   country                       320 non-null    object  
 7   project_description_reported  84805 non-null  object  
 8   planting_date_reported        82383 non-null  object  
 9   survival_rate_reported        0 non-null      float64 
 10  host_name                     87814 non-null  object  
 11  url                           87814 non-null  object  
 12  species_count_reported        0 non-n

In [41]:

points_gdf = points_gdf.set_geometry("geometry")


points_gdf = points_gdf.set_crs("EPSG:4326", allow_override=True)


points_gdf.to_parquet("../midsave/Onlypoints_data.parquet")

In [42]:
print(points_gdf.crs)

EPSG:4326


In [43]:
expanded_gdf.survival_rate_reported.unique()

array([ 80.,  90.,  93.,  83.,  95.,  78.,  98.,  nan,  92.,  50.,  70.,
        89.,  94.,  85.,  99.,  87., 100.,  73.,  75.,  60.,  91.,  82.,
        68.,  69.,  65.,  88.,  97.,   0.])

In [44]:
expanded_gdf.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Index: 1289643 entries, 0 to 1289643
Data columns (total 21 columns):
 #   Column                        Non-Null Count    Dtype   
---  ------                        --------------    -----   
 0   site_id_created               1289643 non-null  int64   
 1   project_id_reported           1289643 non-null  object  
 2   site_id_reported              1289643 non-null  object  
 3   site_description_reported     3694 non-null     object  
 4   site_sqkm                     1284197 non-null  float64 
 5   trees_planted_reported        3408 non-null     float64 
 6   country                       56605 non-null    object  
 7   project_description_reported  1267104 non-null  object  
 8   planting_date_reported        1266869 non-null  object  
 9   survival_rate_reported        2577 non-null     float64 
 10  host_name                     1289139 non-null  object  
 11  url                           1289643 non-null  object  
 12  species_cou

In [46]:

is_geodataframe = isinstance(expanded_gdf, gpd.GeoDataFrame)
print(f"Is expanded_gdf a GeoDataFrame? {is_geodataframe}")

if is_geodataframe:
   
    expanded_gdf['planting_date_reported'] = pd.to_datetime(expanded_gdf['planting_date_reported'], errors='coerce')
    
 
    print(expanded_gdf['project_geometries_invalid'].dtype)
    
   
    expanded_gdf['project_geometries_invalid'] = expanded_gdf['project_geometries_invalid'].astype(bool, errors='ignore')
    
 
    expanded_gdf = expanded_gdf.set_geometry("geometry")
    

    expanded_gdf.to_parquet("../midsave/consolidated_reforestation_projects.parquet")

Is expanded_gdf a GeoDataFrame? True
object


In [47]:
# import geopandas as gpd
# expanded_gdf= gpd.read_parquet("../midsave/consolidated_reforestation_projects.parquet")
# expanded_gdf.info()

In [48]:
# df = gpd.read_parquet("/Users/angela/Downloads/newest_consolidated_reforestation_projects_with_cicular.parquet")
df=gpd.read_parquet("/Users/angela/Downloads/new_updated_reforestation_projects 1.parquet")
df.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 1229172 entries, 0 to 1229171
Data columns (total 66 columns):
 #   Column                        Non-Null Count    Dtype   
---  ------                        --------------    -----   
 0   site_id_created               1229172 non-null  int64   
 1   project_id_reported           1229172 non-null  object  
 2   site_description_reported     1695 non-null     object  
 3   site_sqkm                     1229172 non-null  float64 
 4   trees_planted_reported        4348 non-null     float64 
 5   country                       5029 non-null     object  
 6   project_description_reported  1228608 non-null  object  
 7   planting_date_reported        1229172 non-null  object  
 8   survival_rate_reported        2513 non-null     float64 
 9   host_name                     1229172 non-null  object  
 10  url                           1229172 non-null  object  
 11  species_count_reported        0 non-null        float64 
 12  specie

In [50]:

filtered_expanded_df = expanded_gdf.merge(df[['site_id_reported']], on='site_id_reported', how='left', indicator=True)
filtered_expanded_df = filtered_expanded_df[filtered_expanded_df['_merge'] == 'left_only'].drop(columns=['_merge'])

# Display the filtered dataframe
filtered_expanded_df.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Index: 59365 entries, 5 to 66717563
Data columns (total 21 columns):
 #   Column                        Non-Null Count  Dtype         
---  ------                        --------------  -----         
 0   site_id_created               59365 non-null  int64         
 1   project_id_reported           59365 non-null  object        
 2   site_id_reported              59365 non-null  object        
 3   site_description_reported     2004 non-null   object        
 4   site_sqkm                     54995 non-null  float64       
 5   trees_planted_reported        296 non-null    float64       
 6   country                       51582 non-null  object        
 7   project_description_reported  39205 non-null  object        
 8   planting_date_reported        49881 non-null  datetime64[ns]
 9   survival_rate_reported        62 non-null     float64       
 10  host_name                     58861 non-null  object        
 11  url                   

In [51]:
filtered_expanded_df.to_parquet("../midsave/additional_reforestation_projects.parquet")