# Consolidating and preprocessing source data

In this script, we combine all the sites from the different reforestation projects from different organization to one dataset and do some initial filtering.

In [2]:
import geopandas as gpd
import pandas as pd
import numpy as np
import pycountry
from shapely import wkt
from shapely.geometry import MultiPolygon, Polygon
import geopandas as gpd

### Importing provider data

#### Plant_planet_data


We extracted the data from https://www.plant-for-the-planet.org as described in the script 'Plant_Planet_Meta_Data_preprocessing.ipynb'.

In [3]:
df_plant = gpd.read_file("../midsave/plant_for_planet.gpkg")
df_plant.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 3060 entries, 0 to 3059
Data columns (total 15 columns):
 #   Column                        Non-Null Count  Dtype   
---  ------                        --------------  -----   
 0   project_id_reported           3060 non-null   object  
 1   site_id_reported              3060 non-null   object  
 2   site_description_reported     1726 non-null   object  
 3   site_status_reported          3058 non-null   object  
 4   site_sqkm                     3060 non-null   float64 
 5   trees_planted_reported        3045 non-null   object  
 6   country                       3060 non-null   object  
 7   project_description_reported  3060 non-null   object  
 8   planting_date_reported        2591 non-null   float64 
 9   survival_rate_reported        2358 non-null   object  
 10  host_name                     3060 non-null   object  
 11  url                           3060 non-null   object  
 12  species_count_reported        0 non-null

#### Tree_Nation

We extracted and filtered  the data from https://tree-nation.com/projects as described in the script 'Tree_Nation-meta_data_pre.ipynb'.

In [4]:
df_tn = gpd.read_file("../midsave/tree_nation.gpkg")
df_tn.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 1238 entries, 0 to 1237
Data columns (total 13 columns):
 #   Column                        Non-Null Count  Dtype   
---  ------                        --------------  -----   
 0   planting_date_reported        1238 non-null   int32   
 1   project_description_reported  1238 non-null   object  
 2   site_id_reported              1238 non-null   int64   
 3   url                           1238 non-null   object  
 4   project_id_reported           1238 non-null   int64   
 5   trees_planted_reported        1238 non-null   int64   
 6   site_sqkm                     1238 non-null   float64 
 7   host_name                     1238 non-null   object  
 8   species_count_reported        0 non-null      object  
 9   species_planted_reported      0 non-null      object  
 10  country                       0 non-null      object  
 11  survival_rate_reported        0 non-null      object  
 12  geometry                      1178 non-n

#### Open_Forest_protocol

We extracted and filtered  the data from https://atlas.openforestprotocol.org/  as described in the script  "open_forest_projests_Data_filtering.ipynb".

In [5]:
df_atlas = gpd.read_file("../midsave/atlas.gpkg")
df_atlas.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 112 entries, 0 to 111
Data columns (total 13 columns):
 #   Column                        Non-Null Count  Dtype   
---  ------                        --------------  -----   
 0   project_id_reported           112 non-null    int64   
 1   site_id_reported              112 non-null    int64   
 2   project_description_reported  112 non-null    object  
 3   site_sqkm                     112 non-null    float64 
 4   country                       111 non-null    object  
 5   host_name                     112 non-null    object  
 6   url                           112 non-null    object  
 7   species_count_reported        0 non-null      object  
 8   species_planted_reported      0 non-null      object  
 9   planting_date_reported        0 non-null      object  
 10  survival_rate_reported        0 non-null      object  
 11  trees_planted_reported        0 non-null      object  
 12  geometry                      103 non-null

#### Verra

We extracted and filtered  the data from https://registry.verra.org  as described in the script  "extracting_verra_sites.ipynb".

In [6]:
df_verra = gpd.read_parquet("../midsave/verra.parquet")
df_verra.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 1225618 entries, 0 to 1225617
Data columns (total 13 columns):
 #   Column                        Non-Null Count    Dtype   
---  ------                        --------------    -----   
 0   site_id_reported              1225618 non-null  int64   
 1   project_id_reported           1225618 non-null  int64   
 2   project_description_reported  1225618 non-null  object  
 3   geometry                      1225618 non-null  geometry
 4   site_sqkm                     1225618 non-null  float64 
 5   status_reported               1225618 non-null  object  
 6   country                       0 non-null        object  
 7   planting_date_reported        0 non-null        object  
 8   url                           1225618 non-null  object  
 9   host_name                     1225618 non-null  object  
 10  species_count_reported        0 non-null        object  
 11  species_planted_reported      0 non-null        object  
 12  surviv

In [7]:
# using all verra datacsv downloaded from the verra registry to fill column planting_date_reported with the date of project registration
verra_data=pd.read_csv("../midsave/all_verra_projects.csv")
verra_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4512 entries, 0 to 4511
Data columns (total 13 columns):
 #   Column                                Non-Null Count  Dtype 
---  ------                                --------------  ----- 
 0   ID                                    4512 non-null   int64 
 1   Name                                  4511 non-null   object
 2   Proponent                             4510 non-null   object
 3   Project Type                          4512 non-null   object
 4   AFOLU Activities                      1531 non-null   object
 5   Methodology                           4488 non-null   object
 6   Status                                4512 non-null   object
 7   Country/Area                          4512 non-null   object
 8   Estimated Annual Emission Reductions  4512 non-null   object
 9   Region                                4273 non-null   object
 10  Project Registration Date             2503 non-null   object
 11  Crediting Period Start Date   

In [8]:
# column ID in verra_data has same values as column planting_date_reported in df_verra
registration_date_mapping = verra_data.set_index('ID')['Project Registration Date']
crediting_period_start_date_mapping = verra_data.set_index('ID')['Crediting Period Start Date']


df_verra['planting_date_reported'] = df_verra['project_id_reported'].map(registration_date_mapping)

df_verra['planting_date_reported'] = df_verra['planting_date_reported'].fillna(
    df_verra['project_id_reported'].map(crediting_period_start_date_mapping)
)

In [9]:
# # column ID in verra_data has same values as column planting_date_reported in df_verra
# registration_date_mapping = verra_data.set_index('ID')['Project Registration Date']
# crediting_period_start_date_mapping = verra_data.set_index('ID')['Crediting Period Start Date']

# # Generate 'registration_date' column
# df_verra['registration_date'] = df_verra['project_id_reported'].map(registration_date_mapping)

# # Generate 'crediting_start_period' column
# df_verra['crediting_start_period'] = df_verra['project_id_reported'].map(crediting_period_start_date_mapping)

# # Fill 'registration_date' with 'crediting_start_period' if NaN
# #df_verra['registration_date'] = df_verra['registration_date'].fillna(df_verra['crediting_start_period'])

In [10]:
df_verra.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 1225618 entries, 0 to 1225617
Data columns (total 13 columns):
 #   Column                        Non-Null Count    Dtype   
---  ------                        --------------    -----   
 0   site_id_reported              1225618 non-null  int64   
 1   project_id_reported           1225618 non-null  int64   
 2   project_description_reported  1225618 non-null  object  
 3   geometry                      1225618 non-null  geometry
 4   site_sqkm                     1225618 non-null  float64 
 5   status_reported               1225618 non-null  object  
 6   country                       0 non-null        object  
 7   planting_date_reported        1205774 non-null  object  
 8   url                           1225618 non-null  object  
 9   host_name                     1225618 non-null  object  
 10  species_count_reported        0 non-null        object  
 11  species_planted_reported      0 non-null        object  
 12  surviv

In [11]:
df_verra.head()

Unnamed: 0,site_id_reported,project_id_reported,project_description_reported,geometry,site_sqkm,status_reported,country,planting_date_reported,url,host_name,species_count_reported,species_planted_reported,survival_rate_reported
0,0,4107,Gansu Longnan Afforestation Project (hereinaft...,"POLYGON ((105.42801 33.28925, 105.42802 33.289...",0.088937,Under validation,,2020-05-20,https://registry.verra.org/app/projectDetail/V...,Verra,,,
1,1,4107,Gansu Longnan Afforestation Project (hereinaft...,"POLYGON ((105.42302 33.28654, 105.42323 33.286...",0.126815,Under validation,,2020-05-20,https://registry.verra.org/app/projectDetail/V...,Verra,,,
2,2,4107,Gansu Longnan Afforestation Project (hereinaft...,"POLYGON ((105.46761 33.27973, 105.46762 33.279...",0.025083,Under validation,,2020-05-20,https://registry.verra.org/app/projectDetail/V...,Verra,,,
3,3,4107,Gansu Longnan Afforestation Project (hereinaft...,"POLYGON ((105.46989 33.28532, 105.4699 33.2853...",0.060386,Under validation,,2020-05-20,https://registry.verra.org/app/projectDetail/V...,Verra,,,
4,4,4107,Gansu Longnan Afforestation Project (hereinaft...,"POLYGON ((105.46846 33.28361, 105.4687 33.2839...",0.064132,Under validation,,2020-05-20,https://registry.verra.org/app/projectDetail/V...,Verra,,,


In [12]:
folder_path="../midsave/Date_evaluation"

#### Restor.eco

We extracted and filtered  the data from  https://restor.eco/?lat=10.743821093825016&lng=4.473759981496621&zoom=4 as described in the script  "restor.ipynb".

In [13]:
df_restor = gpd.read_file("../midsave/restor_eco.gpkg")
df_restor.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 1741 entries, 0 to 1740
Data columns (total 13 columns):
 #   Column                        Non-Null Count  Dtype   
---  ------                        --------------  -----   
 0   project_id_reported           1741 non-null   object  
 1   project_description_reported  1741 non-null   object  
 2   planting_date_reported        960 non-null    float64 
 3   url                           1741 non-null   object  
 4   site_sqkm                     1741 non-null   float64 
 5   country                       1741 non-null   object  
 6   site_id_reported              1741 non-null   object  
 7   host_name                     1741 non-null   object  
 8   species_count_reported        0 non-null      object  
 9   species_planted_reported      0 non-null      object  
 10  survival_rate_reported        0 non-null      object  
 11  trees_planted_reported        0 non-null      object  
 12  geometry                      1741 non-n

#### Explorer Data

This data is extracted from the projects website https://explorer.land/x/projects as described in the script 'explorer_land.ipynb'. The column names were manually edited.

In [14]:
df_ex = gpd.read_file("../midsave/explorer_land.gpkg")
df_ex.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 36 entries, 0 to 35
Data columns (total 13 columns):
 #   Column                        Non-Null Count  Dtype   
---  ------                        --------------  -----   
 0   project_id_reported           0 non-null      float64 
 1   country                       0 non-null      float64 
 2   planting_date_reported        35 non-null     float64 
 3   species_count_reported        0 non-null      float64 
 4   species_planted_reported      0 non-null      float64 
 5   survival_rate_reported        0 non-null      float64 
 6   site_sqkm                     36 non-null     float64 
 7   trees_planted_reported        34 non-null     float64 
 8   site_id_reported              36 non-null     object  
 9   project_description_reported  36 non-null     object  
 10  host_name                     36 non-null     object  
 11  url                           36 non-null     object  
 12  geometry                      36 non-null   

#### Face the Future

We extracted  the data from  https://facethefuture.com/#projects described in the script 'Face_Future_metadata_prepro.ipynb'.

In [15]:
df_ftf = gpd.read_file("../midsave/face_the_future.gpkg")
df_ftf.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 566 entries, 0 to 565
Data columns (total 14 columns):
 #   Column                        Non-Null Count  Dtype   
---  ------                        --------------  -----   
 0   project_id_reported           94 non-null     object  
 1   site_id_reported              566 non-null    int64   
 2   trees_planted_reported        94 non-null     float64 
 3   site_sqkm                     566 non-null    float64 
 4   planting_date_reported        94 non-null     float64 
 5   project_description_reported  2 non-null      object  
 6   Creator                       94 non-null     object  
 7   host_name                     566 non-null    object  
 8   url                           566 non-null    object  
 9   species_count_reported        0 non-null      object  
 10  country                       0 non-null      object  
 11  species_planted_reported      0 non-null      object  
 12  survival_rate_reported        0 non-null  

#### Climate Partner Impact
We extracted the data as described in the script 'climate_partner_impact.ipynb'.

In [16]:
df_cpi = gpd.read_file("../midsave/climate_partner_impact.gpkg")
df_cpi.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 7 entries, 0 to 6
Data columns (total 13 columns):
 #   Column                        Non-Null Count  Dtype   
---  ------                        --------------  -----   
 0   site_id_reported              0 non-null      object  
 1   project_id_reported           7 non-null      object  
 2   country                       0 non-null      object  
 3   url                           7 non-null      object  
 4   host_name                     7 non-null      object  
 5   project_description_reported  0 non-null      object  
 6   site_sqkm                     0 non-null      object  
 7   species_count_reported        0 non-null      object  
 8   species_planted_reported      0 non-null      object  
 9   survival_rate_reported        0 non-null      object  
 10  trees_planted_reported        0 non-null      object  
 11  planting_date_reported        0 non-null      object  
 12  geometry                      7 non-null      

#### Forest Trends
We extracted the data as described in the script 'forest_trends.ipynb'.

In [17]:
df_ft = gpd.read_file("../midsave/forest_trends.gpkg")
df_ft.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 4587 entries, 0 to 4586
Data columns (total 13 columns):
 #   Column                        Non-Null Count  Dtype   
---  ------                        --------------  -----   
 0   project_id_reported           4587 non-null   object  
 1   project_description_reported  4587 non-null   object  
 2   country                       4587 non-null   object  
 3   host_name                     4587 non-null   object  
 4   url                           4587 non-null   object  
 5   site_sqkm                     0 non-null      object  
 6   species_count_reported        0 non-null      object  
 7   species_planted_reported      0 non-null      object  
 8   survival_rate_reported        0 non-null      object  
 9   trees_planted_reported        0 non-null      object  
 10  site_id_reported              0 non-null      object  
 11  planting_date_reported        0 non-null      object  
 12  geometry                      4587 non-n

#### One Tree Planted
We extracted the data as described in the script 'one_tree_planted.ipynb'.

In [18]:
df_otp = gpd.read_file("../midsave/one_tree_planted.gpkg")
df_otp.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 695 entries, 0 to 694
Data columns (total 13 columns):
 #   Column                        Non-Null Count  Dtype   
---  ------                        --------------  -----   
 0   project_id_reported           695 non-null    int64   
 1   project_description_reported  0 non-null      object  
 2   url                           695 non-null    object  
 3   host_name                     695 non-null    object  
 4   site_sqkm                     0 non-null      object  
 5   species_count_reported        0 non-null      object  
 6   species_planted_reported      0 non-null      object  
 7   survival_rate_reported        0 non-null      object  
 8   trees_planted_reported        0 non-null      object  
 9   planting_date_reported        0 non-null      object  
 10  country                       0 non-null      object  
 11  site_id_reported              0 non-null      object  
 12  geometry                      693 non-null

#### Reforestaction
We extracted the data as described in the script 'reforestaction.ipynb'.

In [19]:
df_reforestaction = gpd.read_file("../midsave/reforestaction.gpkg")
df_reforestaction.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 1100 entries, 0 to 1099
Data columns (total 13 columns):
 #   Column                        Non-Null Count  Dtype   
---  ------                        --------------  -----   
 0   project_id_reported           1100 non-null   int64   
 1   site_id_reported              1100 non-null   int64   
 2   host_name                     1100 non-null   object  
 3   url                           1100 non-null   object  
 4   project_description_reported  0 non-null      object  
 5   site_sqkm                     0 non-null      object  
 6   species_count_reported        0 non-null      object  
 7   species_planted_reported      0 non-null      object  
 8   survival_rate_reported        0 non-null      object  
 9   trees_planted_reported        0 non-null      object  
 10  planting_date_reported        0 non-null      object  
 11  country                       0 non-null      object  
 12  geometry                      1100 non-n

#### Reforestum
We extracted the data as described in the script 'reforestum.ipynb'.

In [20]:
df_reforestum = gpd.read_file("../midsave/reforestum.gpkg")
df_reforestum.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 3691 entries, 0 to 3690
Data columns (total 13 columns):
 #   Column                        Non-Null Count  Dtype   
---  ------                        --------------  -----   
 0   project_id_reported           3691 non-null   object  
 1   project_description_reported  2082 non-null   object  
 2   country                       0 non-null      object  
 3   site_sqkm                     0 non-null      object  
 4   url                           3691 non-null   object  
 5   host_name                     3691 non-null   object  
 6   species_count_reported        0 non-null      object  
 7   species_planted_reported      0 non-null      object  
 8   survival_rate_reported        0 non-null      object  
 9   trees_planted_reported        0 non-null      object  
 10  planting_date_reported        0 non-null      object  
 11  site_id_reported              0 non-null      object  
 12  geometry                      3691 non-n

#### Trees.org
We extracted the data as described in the script 'trees_org.ipynb'.

In [21]:
df_to = gpd.read_file("../midsave/trees_org.gpkg")
df_to.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 86 entries, 0 to 85
Data columns (total 13 columns):
 #   Column                        Non-Null Count  Dtype   
---  ------                        --------------  -----   
 0   project_id_reported           86 non-null     int64   
 1   project_description_reported  86 non-null     object  
 2   country                       86 non-null     object  
 3   planting_date_reported        86 non-null     int32   
 4   url                           86 non-null     object  
 5   host_name                     86 non-null     object  
 6   site_sqkm                     0 non-null      object  
 7   species_count_reported        0 non-null      object  
 8   species_planted_reported      0 non-null      object  
 9   survival_rate_reported        0 non-null      object  
 10  trees_planted_reported        0 non-null      object  
 11  site_id_reported              0 non-null      object  
 12  geometry                      86 non-null   

#### Veritree
We extracted the data as described in the script 'verritree.ipynb'.

In [22]:
df_vt = gpd.read_file("../midsave/veritree.gpkg")
df_vt.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 122 entries, 0 to 121
Data columns (total 13 columns):
 #   Column                        Non-Null Count  Dtype   
---  ------                        --------------  -----   
 0   site_id_reported              122 non-null    int64   
 1   project_id_reported           122 non-null    int64   
 2   project_description_reported  122 non-null    object  
 3   country                       122 non-null    object  
 4   survival_rate_reported        122 non-null    int64   
 5   species_planted_reported      122 non-null    object  
 6   host_name                     122 non-null    object  
 7   url                           122 non-null    object  
 8   site_sqkm                     122 non-null    float64 
 9   species_count_reported        0 non-null      object  
 10  trees_planted_reported        0 non-null      object  
 11  planting_date_reported        0 non-null      object  
 12  geometry                      122 non-null

#### Zero CO2
We extracted the data as described in the script 'zero_co2.ipynb'.

In [23]:
df_zc = gpd.read_file("../midsave/zero_co2.gpkg")
df_zc.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 7 entries, 0 to 6
Data columns (total 13 columns):
 #   Column                        Non-Null Count  Dtype   
---  ------                        --------------  -----   
 0   project_id_reported           7 non-null      object  
 1   site_id_reported              7 non-null      object  
 2   project_description_reported  7 non-null      object  
 3   country                       0 non-null      object  
 4   planting_date_reported        7 non-null      int32   
 5   url                           7 non-null      object  
 6   host_name                     7 non-null      object  
 7   site_sqkm                     0 non-null      object  
 8   species_count_reported        0 non-null      object  
 9   species_planted_reported      0 non-null      object  
 10  survival_rate_reported        0 non-null      object  
 11  trees_planted_reported        0 non-null      object  
 12  geometry                      7 non-null      

 #### Gold Standards
We extracted the data as described in the script 'gold_standards.ipynb'.

In [47]:
df_gs = gpd.read_file('../midsave/gold_standards.gpkg')
df_gs.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 71 entries, 0 to 70
Data columns (total 13 columns):
 #   Column                        Non-Null Count  Dtype   
---  ------                        --------------  -----   
 0   project_id_reported           71 non-null     int64   
 1   country                       71 non-null     object  
 2   project_description_reported  0 non-null      object  
 3   url                           71 non-null     object  
 4   host_name                     71 non-null     object  
 5   site_sqkm                     0 non-null      object  
 6   species_count_reported        0 non-null      object  
 7   species_planted_reported      0 non-null      object  
 8   survival_rate_reported        0 non-null      object  
 9   trees_planted_reported        0 non-null      object  
 10  planting_date_reported        0 non-null      object  
 11  site_id_reported              0 non-null      object  
 12  geometry                      0 non-null    

### Combining the datasets

In [48]:
gdf = pd.concat([df_plant,df_tn,df_verra,df_atlas,df_restor,df_ftf,df_ex, df_cpi, df_ft, df_otp, df_reforestaction, df_reforestum, df_to, df_vt, df_zc,df_gs]
                , ignore_index = True).drop(columns = ['site_status_reported', 'status_reported'])
gdf.info()

  gdf = pd.concat([df_plant,df_tn,df_verra,df_atlas,df_restor,df_ftf,df_ex, df_cpi, df_ft, df_otp, df_reforestaction, df_reforestum, df_to, df_vt, df_zc,df_gs]


<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 1242737 entries, 0 to 1242736
Data columns (total 15 columns):
 #   Column                        Non-Null Count    Dtype   
---  ------                        --------------    -----   
 0   project_id_reported           1242229 non-null  object  
 1   site_id_reported              1233600 non-null  object  
 2   site_description_reported     1726 non-null     object  
 3   site_sqkm                     1232493 non-null  float64 
 4   trees_planted_reported        4411 non-null     object  
 5   country                       9778 non-null     object  
 6   project_description_reported  1238691 non-null  object  
 7   planting_date_reported        1210785 non-null  object  
 8   survival_rate_reported        2480 non-null     object  
 9   host_name                     1242737 non-null  object  
 10  url                           1242737 non-null  object  
 11  species_count_reported        0 non-null        object  
 12  specie

In [49]:
# Filter rows with null values in the 'geometry' column
null_geometry_rows = gdf[gdf['geometry'].isnull()]

null_geometry_rows.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Index: 142 entries, 3060 to 1242736
Data columns (total 15 columns):
 #   Column                        Non-Null Count  Dtype   
---  ------                        --------------  -----   
 0   project_id_reported           142 non-null    object  
 1   site_id_reported              69 non-null     object  
 2   site_description_reported     0 non-null      object  
 3   site_sqkm                     69 non-null     float64 
 4   trees_planted_reported        60 non-null     object  
 5   country                       80 non-null     object  
 6   project_description_reported  69 non-null     object  
 7   planting_date_reported        60 non-null     object  
 8   survival_rate_reported        0 non-null      object  
 9   host_name                     142 non-null    object  
 10  url                           142 non-null    object  
 11  species_count_reported        0 non-null      object  
 12  species_planted_reported      0 non-null

### Data cleanup

Filter polygons with 0 < site_sqkm < 10000

In [61]:
gdf_filtered = gdf[gdf['site_sqkm'] < 10000].copy().reset_index(drop = True)
gdf_filtered.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 1232217 entries, 0 to 1232216
Data columns (total 15 columns):
 #   Column                        Non-Null Count    Dtype   
---  ------                        --------------    -----   
 0   project_id_reported           1231709 non-null  object  
 1   site_id_reported              1232217 non-null  object  
 2   site_description_reported     1725 non-null     object  
 3   site_sqkm                     1232217 non-null  float64 
 4   trees_planted_reported        4407 non-null     object  
 5   country                       4999 non-null     object  
 6   project_description_reported  1231653 non-null  object  
 7   planting_date_reported        1210431 non-null  object  
 8   survival_rate_reported        2475 non-null     object  
 9   host_name                     1232217 non-null  object  
 10  url                           1232217 non-null  object  
 11  species_count_reported        0 non-null        object  
 12  specie

Drop duplicate geographies

In [62]:
gdf_filtered.drop_duplicates('geometry', inplace = True)
gdf_filtered.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Index: 1072682 entries, 0 to 1232215
Data columns (total 15 columns):
 #   Column                        Non-Null Count    Dtype   
---  ------                        --------------    -----   
 0   project_id_reported           1072176 non-null  object  
 1   site_id_reported              1072682 non-null  object  
 2   site_description_reported     1682 non-null     object  
 3   site_sqkm                     1072682 non-null  float64 
 4   trees_planted_reported        4295 non-null     object  
 5   country                       4935 non-null     object  
 6   project_description_reported  1072120 non-null  object  
 7   planting_date_reported        1069515 non-null  object  
 8   survival_rate_reported        2430 non-null     object  
 9   host_name                     1072682 non-null  object  
 10  url                           1072682 non-null  object  
 11  species_count_reported        0 non-null        object  
 12  species_pla

Clean up country names

In [63]:
gdf_filtered.country.unique()

array(['BR', 'GH', 'VN', 'CO', 'UG', 'MZ', 'US', 'MW', 'ES', 'AE', 'KE',
       'MG', 'GB', 'MX', 'NP', 'ET', 'PH', 'BO', 'NA', 'IN', 'AR', 'HN',
       'EG', 'CA', 'TZ', 'IR', 'ID', 'UA', 'TH', 'AU', 'AM', 'IE', 'BD',
       'CL', 'NG', 'CI', 'JO', 'CD', 'CM', 'EC', 'CR', 'DE', 'GT', 'TW',
       'BE', 'ZA', 'HT', 'CZ', 'NI', 'PA', 'FR', 'BI', 'IT', 'SL', 'PE',
       'DK', 'ZM', 'TG', 'MN', 'ZW', 'NO', None, 'us', 'in', 'gt', 'ch',
       'gb', 'tz', 'pt', 'lt', 'co', 'cr', 'ke', 'cg', 'mx', 'au', 'th',
       'it', 'de', 'pe', 'ni', 'KM', 'RW', nan, 'Madagascar',
       'Mozambique', 'Nepal', 'Kenya', 'Canada', 'Indonesia', 'Senegal',
       'United States', 'Rwanda', 'Tanzania', 'Peru', 'Ethiopia', 'Haiti',
       'Mexico', 'Brazil', 'India', 'Uganda', 'Malawi', 'Cambodia',
       'China', 'Japan', 'Philippines', 'Honduras'], dtype=object)

In [64]:
country_to_iso2 = {}

for country in pycountry.countries:
    country_to_iso2[country.alpha_2.lower()] = country.alpha_2.lower()
    country_to_iso2[country.name.lower()] = country.alpha_2.lower()

gdf_filtered['country'] = gdf_filtered['country'].apply(lambda country: country_to_iso2.get(country.lower(), None) if pd.notna(country) else None)

Fix datatypes

In [66]:
gdf_filtered['site_id_reported'] = gdf_filtered['site_id_reported'].astype(str)
gdf_filtered['project_id_reported'] = gdf_filtered['project_id_reported'].astype(str)
gdf_filtered['trees_planted_reported'] = gdf_filtered['trees_planted_reported'].astype(float)
gdf_filtered['survival_rate_reported'] = gdf_filtered['survival_rate_reported'].astype(float)


In [67]:
gdf_filtered["geometry"][:400]

0      POLYGON ((-49.95883 -9.35107, -49.95976 -9.351...
1      POLYGON ((-43.4725 -22.48945, -43.47236 -22.48...
2      POLYGON ((-43.462 -22.4779, -43.46583 -22.4875...
3      POLYGON ((-43.46833 -22.4919, -43.46834 -22.49...
4      POLYGON ((-2.00562 8.21886, -2.00756 8.23202, ...
                             ...                        
401    POLYGON ((-88.50459 18.77865, -88.51044 18.778...
402    POLYGON ((-88.50057 18.79101, -88.49862 18.791...
403    POLYGON ((-88.48006 18.83761, -88.47991 18.837...
404    POLYGON ((-88.48776 18.82797, -88.48743 18.828...
405    POLYGON ((-88.48975 18.81903, -88.48951 18.819...
Name: geometry, Length: 400, dtype: geometry

In [68]:


def split_multipolygon(row):
    geom = row['geometry']
    if geom is None:
        return [{**row.to_dict(), 'geometry': None}]
    if isinstance(geom, str):
        geom = wkt.loads(geom)
    if isinstance(geom, MultiPolygon):
        return [{**row.to_dict(), 'geometry': poly.wkt} for poly in geom.geoms]
    else:
        return [{**row.to_dict(), 'geometry': geom.wkt}]

expanded_rows = gdf_filtered.apply(split_multipolygon, axis=1).explode().reset_index(drop=True)
expanded_gdf = gpd.GeoDataFrame(expanded_rows.tolist())

expanded_gdf.head()

Unnamed: 0,project_id_reported,site_id_reported,site_description_reported,site_sqkm,trees_planted_reported,country,project_description_reported,planting_date_reported,survival_rate_reported,host_name,url,species_count_reported,species_planted_reported,geometry,Creator
0,proj_ezpAp1POh20dBnYpx0BjhU35,site_W97pqKxXURFOA1E,Farm for the Future demonstration plot,0.013591,313.0,br,This project will be implemented at Farm of th...,,80.0,Planet for the Planet,https://web.plant-for-the-planet.org/en/farmof...,,,POLYGON ((-49.958833158016205 -9.3510733412391...,
1,proj_ZCspL8JYmUu0OXcx6O73I1j0,site_NekKEGqkIO4rZ5C,The area to be reforested is around the Tinguá...,0.631388,3418.0,br,"ITPA was born in 1998, from the initiative of ...",2012.0,90.0,Planet for the Planet,https://web.plant-for-the-planet.org/en/forest...,,,"POLYGON ((-43.472501 -22.489448, -43.472365 -2...",
2,proj_ZCspL8JYmUu0OXcx6O73I1j0,site_Wl3hF91IBkei1Xy,The area to be reforested is around the Tinguá...,3.076566,3418.0,br,"ITPA was born in 1998, from the initiative of ...",2012.0,90.0,Planet for the Planet,https://web.plant-for-the-planet.org/en/forest...,,,"POLYGON ((-43.462002 -22.477901, -43.46583 -22...",
3,proj_ZCspL8JYmUu0OXcx6O73I1j0,site_qHUXswEmePqou5T,The area to be reforested is around the Tinguá...,0.30486,3418.0,br,"ITPA was born in 1998, from the initiative of ...",2012.0,90.0,Planet for the Planet,https://web.plant-for-the-planet.org/en/forest...,,,"POLYGON ((-43.468334 -22.491903, -43.468336 -2...",
4,proj_nXBzA2sbX2tm1D75p7bfJ81Z,site_XZlBcSdlO42ErL3,Plant-for-Ghana Phase II,2.226044,48792.0,gh,Plant-for-Ghana is a hybrid restoration agrofo...,2021.0,93.0,Planet for the Planet,https://web.plant-for-the-planet.org/en/plant-...,,,POLYGON ((-2.005615225307139 8.218860927120645...,


In [69]:
expanded_gdf.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 1229230 entries, 0 to 1229229
Data columns (total 15 columns):
 #   Column                        Non-Null Count    Dtype  
---  ------                        --------------    -----  
 0   project_id_reported           1229230 non-null  object 
 1   site_id_reported              1229230 non-null  object 
 2   site_description_reported     1696 non-null     object 
 3   site_sqkm                     1229230 non-null  float64
 4   trees_planted_reported        4405 non-null     float64
 5   country                       5085 non-null     object 
 6   project_description_reported  1228666 non-null  object 
 7   planting_date_reported        1225975 non-null  object 
 8   survival_rate_reported        2570 non-null     float64
 9   host_name                     1229230 non-null  object 
 10  url                           1229230 non-null  object 
 11  species_count_reported        0 non-null        float64
 12  species_planted_repo

Create unique site and project ids

In [70]:

expanded_gdf['project_id_created'] = pd.factorize(expanded_gdf['project_id_reported'].astype(str) + '-' + expanded_gdf['host_name'].astype(str))[0]

In [71]:
expanded_gdf.reset_index(names = 'site_id_created', inplace = True)

In [72]:
expanded_gdf.head(3)

Unnamed: 0,site_id_created,project_id_reported,site_id_reported,site_description_reported,site_sqkm,trees_planted_reported,country,project_description_reported,planting_date_reported,survival_rate_reported,host_name,url,species_count_reported,species_planted_reported,geometry,Creator,project_id_created
0,0,proj_ezpAp1POh20dBnYpx0BjhU35,site_W97pqKxXURFOA1E,Farm for the Future demonstration plot,0.013591,313.0,br,This project will be implemented at Farm of th...,,80.0,Planet for the Planet,https://web.plant-for-the-planet.org/en/farmof...,,,POLYGON ((-49.958833158016205 -9.3510733412391...,,0
1,1,proj_ZCspL8JYmUu0OXcx6O73I1j0,site_NekKEGqkIO4rZ5C,The area to be reforested is around the Tinguá...,0.631388,3418.0,br,"ITPA was born in 1998, from the initiative of ...",2012.0,90.0,Planet for the Planet,https://web.plant-for-the-planet.org/en/forest...,,,"POLYGON ((-43.472501 -22.489448, -43.472365 -2...",,1
2,2,proj_ZCspL8JYmUu0OXcx6O73I1j0,site_Wl3hF91IBkei1Xy,The area to be reforested is around the Tinguá...,3.076566,3418.0,br,"ITPA was born in 1998, from the initiative of ...",2012.0,90.0,Planet for the Planet,https://web.plant-for-the-planet.org/en/forest...,,,"POLYGON ((-43.462002 -22.477901, -43.46583 -22...",,1


In [73]:

expanded_gdf['geometry'] = expanded_gdf['geometry'].apply(wkt.loads)  

In [74]:

is_geodataframe = isinstance(expanded_gdf, gpd.GeoDataFrame)

print(f"Is expanded_gdf a GeoDataFrame? {is_geodataframe}")

Is expanded_gdf a GeoDataFrame? True


In [75]:
expanded_gdf.survival_rate_reported.unique()

array([ 80.,  90.,  93.,  83.,  95.,  78.,  98.,  nan,  92.,  50.,  70.,
        89.,  94.,  85.,  99.,  87., 100.,  73.,  75.,  60.,  91.,  82.,
        68.,  69.,  65.,  88.,  97.,   0.])

In [76]:
import pandas as pd

is_geodataframe = isinstance(expanded_gdf, gpd.GeoDataFrame)
print(f"Is expanded_gdf a GeoDataFrame? {is_geodataframe}")

if is_geodataframe:
 
    expanded_gdf['planting_date_reported'] = pd.to_datetime(expanded_gdf['planting_date_reported'], errors='coerce')
    
    expanded_gdf = expanded_gdf.set_geometry("geometry")
    expanded_gdf.to_parquet("../midsave/consolidated_reforestation_projects.parquet")

Is expanded_gdf a GeoDataFrame? True
