# Consolidating and preprocessing source data

In this script, we combine the sites from the reforestation projects of the different organizations into a single dataset and apply some initial filtering.

In [319]:
import geopandas as gpd
import pandas as pd
import pycountry

### Importing provider data

#### Plant_planet_data


We extracted the data from https://www.plant-for-the-planet.org as described in the script 'Plant_Planet_Meta_Data_preprocessing.ipynb'.

In [323]:
# Load the Plant-for-the-Planet GeoPackage and print its column summary
# (dtypes and non-null counts) as a quick schema check.
df_plant = gpd.read_file(filename="../midsave/plant_for_planet.gpkg")
df_plant.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 3003 entries, 0 to 3002
Data columns (total 15 columns):
 #   Column                        Non-Null Count  Dtype   
---  ------                        --------------  -----   
 0   project_id_reported           3003 non-null   object  
 1   site_id_reported              3003 non-null   object  
 2   site_description_reported     1724 non-null   object  
 3   site_status_reported          3001 non-null   object  
 4   site_sqkm                     3003 non-null   float64 
 5   trees_planted_reported        2988 non-null   object  
 6   country                       3003 non-null   object  
 7   project_description_reported  3003 non-null   object  
 8   planting_date_reported        2533 non-null   float64 
 9   survival_rate_reported        2300 non-null   object  
 10  host_name                     3003 non-null   object  
 11  url                           3003 non-null   object  
 12  species_count_reported        0 non-null

#### Tree_Nation

We extracted and filtered the data from https://tree-nation.com/projects as described in the script 'Tree_Nation-meta_data_pre.ipynb'.

In [326]:
# Load the Tree-Nation GeoPackage and print its column summary.
df_tn = gpd.read_file(filename="../midsave/tree_nation.gpkg")
df_tn.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 1238 entries, 0 to 1237
Data columns (total 13 columns):
 #   Column                        Non-Null Count  Dtype   
---  ------                        --------------  -----   
 0   planting_date_reported        1238 non-null   int64   
 1   project_description_reported  1238 non-null   object  
 2   site_id_reported              1238 non-null   int64   
 3   url                           1238 non-null   object  
 4   project_id_reported           1238 non-null   int64   
 5   trees_planted_reported        1238 non-null   int64   
 6   site_sqkm                     1238 non-null   float64 
 7   host_name                     1238 non-null   object  
 8   species_count_reported        0 non-null      object  
 9   species_planted_reported      0 non-null      object  
 10  country                       0 non-null      object  
 11  survival_rate_reported        0 non-null      object  
 12  geometry                      1178 non-n

#### Open_Forest_protocol

We extracted and filtered the data from https://atlas.openforestprotocol.org/ as described in the script "open_forest_projests_Data_filtering.ipynb".

In [329]:
# Load the Open Forest Protocol (Atlas) GeoPackage and print its column summary.
df_atlas = gpd.read_file(filename="../midsave/atlas.gpkg")
df_atlas.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 112 entries, 0 to 111
Data columns (total 13 columns):
 #   Column                        Non-Null Count  Dtype   
---  ------                        --------------  -----   
 0   project_id_reported           112 non-null    int64   
 1   site_id_reported              112 non-null    int64   
 2   project_description_reported  112 non-null    object  
 3   site_sqkm                     112 non-null    float64 
 4   country                       111 non-null    object  
 5   host_name                     112 non-null    object  
 6   url                           112 non-null    object  
 7   species_count_reported        0 non-null      object  
 8   species_planted_reported      0 non-null      object  
 9   planting_date_reported        0 non-null      object  
 10  survival_rate_reported        0 non-null      object  
 11  trees_planted_reported        0 non-null      object  
 12  geometry                      103 non-null

#### Verra

We extracted and filtered the data from https://registry.verra.org as described in the script "extracting_verra_sites.ipynb".

In [332]:
# Load the Verra GeoPackage and print its column summary.
df_verra = gpd.read_file(filename="../midsave/verra.gpkg")
df_verra.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 1225162 entries, 0 to 1225161
Data columns (total 13 columns):
 #   Column                        Non-Null Count    Dtype   
---  ------                        --------------    -----   
 0   site_id_reported              1225162 non-null  int64   
 1   project_id_reported           1225162 non-null  int64   
 2   project_description_reported  1225162 non-null  object  
 3   site_sqkm                     1225162 non-null  float64 
 4   status_reported               1225162 non-null  object  
 5   country                       0 non-null        object  
 6   url                           1225162 non-null  object  
 7   host_name                     1225162 non-null  object  
 8   species_count_reported        0 non-null        object  
 9   species_planted_reported      0 non-null        object  
 10  survival_rate_reported        0 non-null        object  
 11  planting_date_reported        0 non-null        object  
 12  geomet

#### Restor.eco

We extracted and filtered the data from https://restor.eco/?lat=10.743821093825016&lng=4.473759981496621&zoom=4 as described in the script "restor.ipynb".

In [335]:
# Load the Restor.eco GeoPackage and print its column summary.
df_restor = gpd.read_file(filename="../midsave/restor_eco.gpkg")
df_restor.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 1741 entries, 0 to 1740
Data columns (total 13 columns):
 #   Column                        Non-Null Count  Dtype   
---  ------                        --------------  -----   
 0   project_id_reported           1741 non-null   object  
 1   project_description_reported  1741 non-null   object  
 2   planting_date_reported        960 non-null    float64 
 3   url                           1741 non-null   object  
 4   site_sqkm                     1741 non-null   float64 
 5   country                       1741 non-null   object  
 6   site_id_reported              1741 non-null   object  
 7   host_name                     1741 non-null   object  
 8   species_count_reported        0 non-null      object  
 9   species_planted_reported      0 non-null      object  
 10  survival_rate_reported        0 non-null      object  
 11  trees_planted_reported        0 non-null      object  
 12  geometry                      1741 non-n

#### Explorer Data

This data is extracted from the projects website https://explorer.land/x/projects as described in the script 'explorer_land.ipynb'. The column names were manually edited.

In [338]:
# Load the explorer.land GeoPackage and print its column summary.
df_ex = gpd.read_file(filename="../midsave/explorer_land.gpkg")
df_ex.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 36 entries, 0 to 35
Data columns (total 13 columns):
 #   Column                        Non-Null Count  Dtype   
---  ------                        --------------  -----   
 0   project_id_reported           0 non-null      object  
 1   country                       0 non-null      object  
 2   planting_date_reported        35 non-null     float64 
 3   species_count_reported        0 non-null      object  
 4   species_planted_reported      0 non-null      object  
 5   survival_rate_reported        0 non-null      object  
 6   site_sqkm                     36 non-null     float64 
 7   trees_planted_reported        34 non-null     float64 
 8   site_id_reported              36 non-null     object  
 9   project_description_reported  36 non-null     object  
 10  host_name                     36 non-null     object  
 11  url                           36 non-null     object  
 12  geometry                      36 non-null   

#### Face the Future

We extracted the data from https://facethefuture.com/#projects as described in the script 'Face_Future_metadata_prepro.ipynb'.

In [341]:
# Load the Face the Future GeoPackage and print its column summary.
df_ftf = gpd.read_file(filename="../midsave/face_the_future.gpkg")
df_ftf.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 566 entries, 0 to 565
Data columns (total 14 columns):
 #   Column                        Non-Null Count  Dtype   
---  ------                        --------------  -----   
 0   project_id_reported           94 non-null     object  
 1   site_id_reported              566 non-null    int64   
 2   trees_planted_reported        94 non-null     float64 
 3   site_sqkm                     566 non-null    float64 
 4   planting_date_reported        94 non-null     float64 
 5   project_description_reported  2 non-null      object  
 6   Creator                       94 non-null     object  
 7   host_name                     566 non-null    object  
 8   url                           566 non-null    object  
 9   species_count_reported        0 non-null      object  
 10  country                       0 non-null      object  
 11  species_planted_reported      0 non-null      object  
 12  survival_rate_reported        0 non-null  

#### Climate Partner Impact
We extracted the data as described in the script 'climate_partner_impact.ipynb'.

In [343]:
# Load the Climate Partner Impact GeoPackage and print its column summary.
df_cpi = gpd.read_file(filename="../midsave/climate_partner_impact.gpkg")
df_cpi.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 7 entries, 0 to 6
Data columns (total 13 columns):
 #   Column                        Non-Null Count  Dtype   
---  ------                        --------------  -----   
 0   site_id_reported              0 non-null      object  
 1   project_id_reported           7 non-null      object  
 2   country                       0 non-null      object  
 3   url                           7 non-null      object  
 4   host_name                     7 non-null      object  
 5   project_description_reported  0 non-null      object  
 6   site_sqkm                     0 non-null      object  
 7   species_count_reported        0 non-null      object  
 8   species_planted_reported      0 non-null      object  
 9   survival_rate_reported        0 non-null      object  
 10  trees_planted_reported        0 non-null      object  
 11  planting_date_reported        0 non-null      object  
 12  geometry                      7 non-null      

#### Forest Trends
We extracted the data as described in the script 'forest_trends.ipynb'.

In [345]:
# Load the Forest Trends GeoPackage and print its column summary.
df_ft = gpd.read_file(filename="../midsave/forest_trends.gpkg")
df_ft.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 4587 entries, 0 to 4586
Data columns (total 13 columns):
 #   Column                        Non-Null Count  Dtype   
---  ------                        --------------  -----   
 0   project_id_reported           4587 non-null   object  
 1   project_description_reported  4587 non-null   object  
 2   country                       4587 non-null   object  
 3   host_name                     4587 non-null   object  
 4   url                           4587 non-null   object  
 5   site_sqkm                     0 non-null      object  
 6   species_count_reported        0 non-null      object  
 7   species_planted_reported      0 non-null      object  
 8   survival_rate_reported        0 non-null      object  
 9   trees_planted_reported        0 non-null      object  
 10  site_id_reported              0 non-null      object  
 11  planting_date_reported        0 non-null      object  
 12  geometry                      4587 non-n

#### One Tree Planted
We extracted the data as described in the script 'one_tree_planted.ipynb'.

In [347]:
# Load the One Tree Planted GeoPackage and print its column summary.
df_otp = gpd.read_file(filename="../midsave/one_tree_planted.gpkg")
df_otp.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 695 entries, 0 to 694
Data columns (total 13 columns):
 #   Column                        Non-Null Count  Dtype   
---  ------                        --------------  -----   
 0   project_id_reported           695 non-null    int64   
 1   project_description_reported  0 non-null      object  
 2   url                           695 non-null    object  
 3   host_name                     695 non-null    object  
 4   site_sqkm                     0 non-null      object  
 5   species_count_reported        0 non-null      object  
 6   species_planted_reported      0 non-null      object  
 7   survival_rate_reported        0 non-null      object  
 8   trees_planted_reported        0 non-null      object  
 9   planting_date_reported        0 non-null      object  
 10  country                       0 non-null      object  
 11  site_id_reported              0 non-null      object  
 12  geometry                      693 non-null

#### Reforestaction
We extracted the data as described in the script 'reforestaction.ipynb'.

In [349]:
# Load the Reforestaction GeoPackage and print its column summary.
df_reforestaction = gpd.read_file(filename="../midsave/reforestaction.gpkg")
df_reforestaction.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 1100 entries, 0 to 1099
Data columns (total 13 columns):
 #   Column                        Non-Null Count  Dtype   
---  ------                        --------------  -----   
 0   project_id_reported           1100 non-null   int64   
 1   site_id_reported              1100 non-null   int64   
 2   host_name                     1100 non-null   object  
 3   url                           1100 non-null   object  
 4   project_description_reported  0 non-null      object  
 5   site_sqkm                     0 non-null      object  
 6   species_count_reported        0 non-null      object  
 7   species_planted_reported      0 non-null      object  
 8   survival_rate_reported        0 non-null      object  
 9   trees_planted_reported        0 non-null      object  
 10  planting_date_reported        0 non-null      object  
 11  country                       0 non-null      object  
 12  geometry                      1100 non-n

#### Reforestum
We extracted the data as described in the script 'reforestum.ipynb'.

In [351]:
# Load the Reforestum GeoPackage and print its column summary.
df_reforestum = gpd.read_file(filename="../midsave/reforestum.gpkg")
df_reforestum.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 3691 entries, 0 to 3690
Data columns (total 13 columns):
 #   Column                        Non-Null Count  Dtype   
---  ------                        --------------  -----   
 0   project_id_reported           3691 non-null   object  
 1   project_description_reported  2082 non-null   object  
 2   country                       0 non-null      object  
 3   site_sqkm                     0 non-null      object  
 4   url                           3691 non-null   object  
 5   host_name                     3691 non-null   object  
 6   species_count_reported        0 non-null      object  
 7   species_planted_reported      0 non-null      object  
 8   survival_rate_reported        0 non-null      object  
 9   trees_planted_reported        0 non-null      object  
 10  planting_date_reported        0 non-null      object  
 11  site_id_reported              0 non-null      object  
 12  geometry                      3691 non-n

#### Trees.org
We extracted the data as described in the script 'trees_org.ipynb'.

In [353]:
# Load the Trees.org GeoPackage and print its column summary.
df_to = gpd.read_file(filename="../midsave/trees_org.gpkg")
df_to.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 86 entries, 0 to 85
Data columns (total 13 columns):
 #   Column                        Non-Null Count  Dtype   
---  ------                        --------------  -----   
 0   project_id_reported           86 non-null     int64   
 1   project_description_reported  86 non-null     object  
 2   country                       86 non-null     object  
 3   planting_date_reported        86 non-null     int64   
 4   url                           86 non-null     object  
 5   host_name                     86 non-null     object  
 6   site_sqkm                     0 non-null      object  
 7   species_count_reported        0 non-null      object  
 8   species_planted_reported      0 non-null      object  
 9   survival_rate_reported        0 non-null      object  
 10  trees_planted_reported        0 non-null      object  
 11  site_id_reported              0 non-null      object  
 12  geometry                      86 non-null   

#### Veritree
We extracted the data as described in the script 'verritree.ipynb'.

In [355]:
# Load the Veritree GeoPackage and print its column summary.
df_vt = gpd.read_file(filename="../midsave/veritree.gpkg")
df_vt.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 122 entries, 0 to 121
Data columns (total 13 columns):
 #   Column                        Non-Null Count  Dtype   
---  ------                        --------------  -----   
 0   site_id_reported              122 non-null    int64   
 1   project_id_reported           122 non-null    int64   
 2   project_description_reported  122 non-null    object  
 3   country                       122 non-null    object  
 4   survival_rate_reported        122 non-null    int64   
 5   species_planted_reported      122 non-null    object  
 6   host_name                     122 non-null    object  
 7   url                           122 non-null    object  
 8   site_sqkm                     122 non-null    float64 
 9   species_count_reported        0 non-null      object  
 10  trees_planted_reported        0 non-null      object  
 11  planting_date_reported        0 non-null      object  
 12  geometry                      122 non-null

#### Zero CO2
We extracted the data as described in the script 'zero_co2.ipynb'.

In [357]:
# Load the Zero CO2 GeoPackage and print its column summary.
df_zc = gpd.read_file(filename="../midsave/zero_co2.gpkg")
df_zc.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 7 entries, 0 to 6
Data columns (total 13 columns):
 #   Column                        Non-Null Count  Dtype   
---  ------                        --------------  -----   
 0   project_id_reported           7 non-null      object  
 1   site_id_reported              7 non-null      object  
 2   project_description_reported  7 non-null      object  
 3   country                       0 non-null      object  
 4   planting_date_reported        7 non-null      int64   
 5   url                           7 non-null      object  
 6   host_name                     7 non-null      object  
 7   site_sqkm                     0 non-null      object  
 8   species_count_reported        0 non-null      object  
 9   species_planted_reported      0 non-null      object  
 10  survival_rate_reported        0 non-null      object  
 11  trees_planted_reported        0 non-null      object  
 12  geometry                      7 non-null      

### Combining the datasets

In [359]:
# Stack all provider frames row-wise (the order determines the row order of the
# combined frame) and renumber the index from 0.
provider_frames = [
    df_plant, df_tn, df_verra, df_atlas, df_restor, df_ftf, df_ex, df_cpi,
    df_ft, df_otp, df_reforestaction, df_reforestum, df_to, df_vt, df_zc,
]
gdf = pd.concat(provider_frames, ignore_index=True)

# The two status columns are not carried into the consolidated dataset.
gdf = gdf.drop(columns=['site_status_reported', 'status_reported'])
gdf.info()

  gdf = pd.concat([df_plant,df_tn,df_verra,df_atlas,df_restor,df_ftf,df_ex, df_cpi, df_ft, df_otp, df_reforestaction, df_reforestum, df_to, df_vt, df_zc]


<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 1242153 entries, 0 to 1242152
Data columns (total 15 columns):
 #   Column                        Non-Null Count    Dtype   
---  ------                        --------------    -----   
 0   project_id_reported           1241645 non-null  object  
 1   site_id_reported              1233087 non-null  object  
 2   site_description_reported     1724 non-null     object  
 3   site_sqkm                     1231980 non-null  float64 
 4   trees_planted_reported        4354 non-null     object  
 5   country                       9650 non-null     object  
 6   project_description_reported  1238178 non-null  object  
 7   planting_date_reported        4953 non-null     float64 
 8   survival_rate_reported        2422 non-null     object  
 9   host_name                     1242153 non-null  object  
 10  url                           1242153 non-null  object  
 11  species_count_reported        0 non-null        object  
 12  specie

### Data cleanup

Filter polygons with 0 < site_sqkm < 10000

In [362]:
# Keep only sites whose area lies strictly between 0 and MAX_SITE_SQKM km²,
# as stated in the markdown above. Previously only the upper bound was applied,
# so sites with a zero (or negative) area slipped through.
# Rows with a missing site_sqkm fail both comparisons and are dropped as before.
MAX_SITE_SQKM = 10000
area_in_range = (gdf['site_sqkm'] > 0) & (gdf['site_sqkm'] < MAX_SITE_SQKM)

# Copy so later column assignments do not touch `gdf`, and renumber the rows.
gdf_filtered = gdf[area_in_range].copy().reset_index(drop=True)
gdf_filtered.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 1231704 entries, 0 to 1231703
Data columns (total 15 columns):
 #   Column                        Non-Null Count    Dtype   
---  ------                        --------------    -----   
 0   project_id_reported           1231196 non-null  object  
 1   site_id_reported              1231704 non-null  object  
 2   site_description_reported     1723 non-null     object  
 3   site_sqkm                     1231704 non-null  float64 
 4   trees_planted_reported        4350 non-null     object  
 5   country                       4942 non-null     object  
 6   project_description_reported  1231140 non-null  object  
 7   planting_date_reported        4834 non-null     float64 
 8   survival_rate_reported        2417 non-null     object  
 9   host_name                     1231704 non-null  object  
 10  url                           1231704 non-null  object  
 11  species_count_reported        0 non-null        object  
 12  specie

Drop sites with duplicate geometries

In [364]:
# Keep only the first row for each distinct value in the 'geometry' column.
# The index is left as-is here, so it keeps the labels of the surviving rows.
gdf_filtered = gdf_filtered.drop_duplicates(subset='geometry')
gdf_filtered.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Index: 1072466 entries, 0 to 1231702
Data columns (total 15 columns):
 #   Column                        Non-Null Count    Dtype   
---  ------                        --------------    -----   
 0   project_id_reported           1071960 non-null  object  
 1   site_id_reported              1072466 non-null  object  
 2   site_description_reported     1682 non-null     object  
 3   site_sqkm                     1072466 non-null  float64 
 4   trees_planted_reported        4239 non-null     object  
 5   country                       4880 non-null     object  
 6   project_description_reported  1071904 non-null  object  
 7   planting_date_reported        4733 non-null     float64 
 8   survival_rate_reported        2374 non-null     object  
 9   host_name                     1072466 non-null  object  
 10  url                           1072466 non-null  object  
 11  species_count_reported        0 non-null        object  
 12  species_pla

Clean up country names

In [366]:
# Inspect the distinct raw country values before normalising them.
gdf_filtered['country'].unique()

array(['BR', 'GH', 'VN', 'CO', 'UG', 'MZ', 'US', 'MW', 'ES', 'AE', 'KE',
       'MG', 'GB', 'MX', 'NP', 'ET', 'PH', 'BO', 'NA', 'IN', 'AR', 'HN',
       'EG', 'CA', 'TZ', 'IR', 'ID', 'UA', 'TH', 'AU', 'AM', 'IE', 'BD',
       'CL', 'NG', 'CI', 'JO', 'CD', 'CM', 'EC', 'CR', 'DE', 'GT', 'TW',
       'BE', 'ZA', 'HT', 'CZ', 'NI', 'PA', 'FR', 'BI', 'IT', 'SL', 'PE',
       'DK', 'ZM', 'TG', 'MN', 'ZW', 'NO', None, 'us', 'in', 'gt', 'ch',
       'gb', 'tz', 'pt', 'lt', 'co', 'cr', 'ke', 'cg', 'mx', 'au', 'th',
       'it', 'de', 'pe', 'ni', 'KM', 'RW', 'Madagascar', 'Mozambique',
       'Nepal', 'Kenya', 'Canada', 'Indonesia', 'Senegal',
       'United States', 'Rwanda', 'Tanzania', 'Peru', 'Ethiopia', 'Haiti',
       'Mexico', 'Brazil', 'India', 'Uganda', 'Malawi', 'Cambodia',
       'China', 'Japan', 'Philippines', 'Honduras'], dtype=object)

In [367]:
# Lookup from lower-cased ISO 3166-1 alpha-2 codes and country names to the
# lower-cased alpha-2 code.
country_to_iso2 = {}

# First pass: the alpha-2 code itself and the primary pycountry name.
for country in pycountry.countries:
    country_to_iso2[country.alpha_2.lower()] = country.alpha_2.lower()
    country_to_iso2[country.name.lower()] = country.alpha_2.lower()

# Second pass: also register alternative names. pycountry's `name` is the
# formal ISO name, so a short form such as "Tanzania" may exist only as
# `common_name` and would otherwise silently map to None.
# `setdefault` ensures no mapping created in the first pass is overwritten.
# NOTE(review): depending on the pycountry version, a missing `common_name` /
# `official_name` presumably either raises AttributeError (covered by the
# getattr default) or falls back to `name` with a warning - confirm.
for country in pycountry.countries:
    for attr in ('common_name', 'official_name'):
        alt_name = getattr(country, attr, None)
        if alt_name:
            country_to_iso2.setdefault(alt_name.lower(), country.alpha_2.lower())


def _to_iso2(value):
    """Return the lower-cased alpha-2 code for a country code or name.

    Missing values, non-string values and unknown countries yield None.
    Surrounding whitespace and letter case are ignored.
    """
    if not isinstance(value, str):
        return None
    return country_to_iso2.get(value.strip().lower(), None)


gdf_filtered['country'] = gdf_filtered['country'].apply(_to_iso2)

Create unique site and project ids

In [369]:
# Build one key per row from the reported project id and the host name, then
# number the distinct keys in order of first appearance.
# NOTE(review): missing project ids are stringified by astype(str), so all such
# rows of one host share a single created id - confirm this is intended.
project_keys = gdf_filtered['project_id_reported'].astype(str) + '-' + gdf_filtered['host_name'].astype(str)
project_codes = pd.factorize(project_keys)[0]
gdf_filtered['project_id_created'] = project_codes

In [370]:
# Move the current index into a 'site_id_created' column and renumber the rows.
# The ids are the index labels that survived deduplication, so they are unique
# but not necessarily contiguous.
gdf_filtered = gdf_filtered.reset_index(names='site_id_created')

In [371]:
# Preview the first three consolidated rows.
gdf_filtered.iloc[:3]

Unnamed: 0,site_id_created,project_id_reported,site_id_reported,site_description_reported,site_sqkm,trees_planted_reported,country,project_description_reported,planting_date_reported,survival_rate_reported,host_name,url,species_count_reported,species_planted_reported,geometry,Creator,project_id_created
0,0,proj_ezpAp1POh20dBnYpx0BjhU35,site_W97pqKxXURFOA1E,Farm for the Future demonstration plot,0.013591,313,br,This project will be implemented at Farm of th...,,80,Planet for the Planet,https://www.plant-for-the-planet.org/,,,"POLYGON ((-49.95883 -9.35107, -49.95976 -9.351...",,0
1,1,proj_ZCspL8JYmUu0OXcx6O73I1j0,site_NekKEGqkIO4rZ5C,The area to be reforested is around the Tinguá...,0.631388,3418,br,"ITPA was born in 1998, from the initiative of ...",2012.0,90,Planet for the Planet,https://www.plant-for-the-planet.org/,,,"POLYGON ((-43.47250 -22.48945, -43.47237 -22.4...",,1
2,2,proj_ZCspL8JYmUu0OXcx6O73I1j0,site_Wl3hF91IBkei1Xy,The area to be reforested is around the Tinguá...,3.076566,3418,br,"ITPA was born in 1998, from the initiative of ...",2012.0,90,Planet for the Planet,https://www.plant-for-the-planet.org/,,,"POLYGON ((-43.46200 -22.47790, -43.46583 -22.4...",,1


In [None]:
# Write the consolidated, cleaned sites to a GeoJSON file in the midsave folder.
gdf_filtered.to_file(
    filename="../midsave/consolidated_reforestation_projects.geojson",
    driver="GeoJSON",
)