In this notebook, we combine the sites of the reforestation projects collected from different organizations into one dataset and apply some initial filtering.

In [7]:
import geopandas as gpd
from shapely.geometry import shape
import json
import pandas as pd
import numpy as np
from shapely import wkt

#### Plant_planet_data


We extracted the data from https://www.plant-for-the-planet.org as described in the script 'Plant_Planet_Meta_Data_preprocessing.ipynb'.

In [66]:
# Read the Plant-for-the-Planet sites from the intermediate GeoPackage
# and print a summary of its columns.
plant_gpkg = "../midsave/plant_for_planet.gpkg"
df_plant = gpd.read_file(plant_gpkg)
df_plant.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 3003 entries, 0 to 3002
Data columns (total 15 columns):
 #   Column                        Non-Null Count  Dtype   
---  ------                        --------------  -----   
 0   project_id_reported           3003 non-null   object  
 1   site_id_reported              3003 non-null   object  
 2   site_description_reported     1724 non-null   object  
 3   site_status_reported          3001 non-null   object  
 4   site_sqkm                     3003 non-null   float64 
 5   trees_planted_reported        2988 non-null   object  
 6   country                       3003 non-null   object  
 7   project_description_reported  3003 non-null   object  
 8   planting_date_reported        2533 non-null   object  
 9   survival_rate_reported        2300 non-null   object  
 10  host_name                     3003 non-null   object  
 11  url                           3003 non-null   object  
 12  species_count_reported        0 non-null

#### Tree_Nation

We extracted and filtered  the data from https://tree-nation.com/projects as described in the script 'Tree_Nation-meta_data_pre.ipynb'.

In [68]:
# Read the Tree-Nation sites from the intermediate GeoPackage
# and print a summary of its columns.
tree_nation_gpkg = "../midsave/tree_nation.gpkg"
df_tn = gpd.read_file(tree_nation_gpkg)
df_tn.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 1238 entries, 0 to 1237
Data columns (total 13 columns):
 #   Column                        Non-Null Count  Dtype              
---  ------                        --------------  -----              
 0   planting_date_reported        1238 non-null   datetime64[ns, UTC]
 1   project_description_reported  1238 non-null   object             
 2   site_id_reported              1238 non-null   int64              
 3   url                           1238 non-null   object             
 4   project_id_reported           1238 non-null   int64              
 5   trees_planted_reported        1238 non-null   int64              
 6   site_sqkm                     1238 non-null   float64            
 7   host_name                     1238 non-null   object             
 8   species_count_reported        0 non-null      object             
 9   species_planted_reported      0 non-null      object             
 10  country                     

  as_dt = pd.to_datetime(df[k], errors="ignore")


#### Open_Forest_protocol

We extracted and filtered  the data from https://atlas.openforestprotocol.org/  as described in the script  "open_forest_projests_Data_filtering.ipynb".

In [70]:
# Read the Open Forest Protocol (Atlas) sites from the intermediate GeoPackage
# and print a summary of its columns.
atlas_gpkg = "../midsave/atlas.gpkg"
df_atlas = gpd.read_file(atlas_gpkg)
df_atlas.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 112 entries, 0 to 111
Data columns (total 13 columns):
 #   Column                        Non-Null Count  Dtype   
---  ------                        --------------  -----   
 0   project_id_reported           112 non-null    int64   
 1   site_id_reported              112 non-null    int64   
 2   project_description_reported  112 non-null    object  
 3   site_sqkm                     112 non-null    float64 
 4   country                       111 non-null    object  
 5   host_name                     112 non-null    object  
 6   url                           112 non-null    object  
 7   species_count_reported        0 non-null      object  
 8   species_planted_reported      0 non-null      object  
 9   planting_date_reported        0 non-null      object  
 10  survival_rate_reported        0 non-null      object  
 11  trees_planted_reported        0 non-null      object  
 12  geometry                      103 non-null

#### Verra

We extracted and filtered  the data from https://registry.verra.org  as described in the script  "extracting_verra_sites.ipynb".

In [20]:
# Read the Verra registry sites from the intermediate GeoPackage
# and print a summary of its columns.
verra_gpkg = "../midsave/verra.gpkg"
df_verra = gpd.read_file(verra_gpkg)
df_verra.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 1225162 entries, 0 to 1225161
Data columns (total 8 columns):
 #   Column               Non-Null Count    Dtype   
---  ------               --------------    -----   
 0   site_id              1225162 non-null  int64   
 1   project_id           1225162 non-null  int64   
 2   project_description  1225162 non-null  object  
 3   sites_sqkm           1225162 non-null  float64 
 4   project_name         1225162 non-null  object  
 5   status_reported      1225162 non-null  object  
 6   country_reported     1225162 non-null  object  
 7   geometry             1225159 non-null  geometry
dtypes: float64(1), geometry(1), int64(2), object(4)
memory usage: 74.8+ MB


#### Restor.eco

We extracted and filtered  the data from  https://restor.eco/?lat=10.743821093825016&lng=4.473759981496621&zoom=4 as described in the script  "restor.ipynb".

In [72]:
# Read the Restor.eco sites from the intermediate GeoPackage
# and print a summary of its columns.
restor_gpkg = "../midsave/restor_eco.gpkg"
df_restor = gpd.read_file(restor_gpkg)
df_restor.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 1741 entries, 0 to 1740
Data columns (total 13 columns):
 #   Column                        Non-Null Count  Dtype   
---  ------                        --------------  -----   
 0   project_id_reported           1741 non-null   object  
 1   project_description_reported  1741 non-null   object  
 2   planting_date_reported        1741 non-null   object  
 3   url                           1741 non-null   object  
 4   site_sqkm                     1741 non-null   float64 
 5   country                       1741 non-null   object  
 6   site_id_reported              1741 non-null   object  
 7   host_name                     1741 non-null   object  
 8   species_count_reported        0 non-null      object  
 9   species_planted_reported      0 non-null      object  
 10  survival_rate_reported        0 non-null      object  
 11  trees_planted_reported        0 non-null      object  
 12  geometry                      1741 non-n

#### Explorer Data

This data is extracted from the projects website https://explorer.land/x/projects as described in the script 'explorer_land.ipynb'. The column names were manually edited.

In [74]:
# Read the explorer.land sites from the intermediate GeoPackage
# and print a summary of its columns.
explorer_gpkg = "../midsave/explorer_land.gpkg"
df_ex = gpd.read_file(explorer_gpkg)
df_ex.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 36 entries, 0 to 35
Data columns (total 13 columns):
 #   Column                        Non-Null Count  Dtype   
---  ------                        --------------  -----   
 0   project_id_reported           0 non-null      object  
 1   country                       0 non-null      object  
 2   planting_date_reported        35 non-null     float64 
 3   species_count_reported        0 non-null      object  
 4   species_planted_reported      0 non-null      object  
 5   survival_rate_reported        0 non-null      object  
 6   site_sqkm                     36 non-null     float64 
 7   trees_planted_reported        34 non-null     float64 
 8   site_id_reported              36 non-null     object  
 9   project_description_reported  36 non-null     object  
 10  host_name                     36 non-null     object  
 11  url                           36 non-null     object  
 12  geometry                      36 non-null   

#### Face the Future

We extracted the data from https://facethefuture.com/#projects as described in the script 'Face_Future_metadata_prepro.ipynb'.

In [84]:
# Read the Face the Future sites from the intermediate GeoPackage
# and print a summary of its columns.
face_the_future_gpkg = "../midsave/face_the_future.gpkg"
df_ftf = gpd.read_file(face_the_future_gpkg)
df_ftf.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 566 entries, 0 to 565
Data columns (total 13 columns):
 #   Column                        Non-Null Count  Dtype   
---  ------                        --------------  -----   
 0   project_id_reported           94 non-null     object  
 1   site_id_reported              566 non-null    int64   
 2   trees_planted_reported        94 non-null     float64 
 3   site_sqkm                     566 non-null    float64 
 4   planting_date_reported        94 non-null     float64 
 5   project_description_reported  2 non-null      object  
 6   host_name                     94 non-null     object  
 7   url                           566 non-null    object  
 8   species_count_reported        0 non-null      object  
 9   country                       0 non-null      object  
 10  species_planted_reported      0 non-null      object  
 11  survival_rate_reported        0 non-null      object  
 12  geometry                      566 non-null

### Combining the datasets

In [87]:
# Stack the per-source frames row-wise under a fresh 0..n-1 index.
# NOTE(review): the printed schema of df_verra uses different column names
# (site_id, project_id, sites_sqkm, country_reported, ...) from the other sources,
# so its rows land in separate columns here -- confirm whether they should be renamed first.
source_frames = [df_plant, df_tn, df_verra, df_atlas, df_restor, df_ftf, df_ex]
df = pd.concat(source_frames, ignore_index=True)

In [89]:
# Print the dtypes and non-null counts of the combined frame.
df.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 1231858 entries, 0 to 1231857
Data columns (total 22 columns):
 #   Column                        Non-Null Count    Dtype   
---  ------                        --------------    -----   
 0   project_id_reported           6188 non-null     object  
 1   site_id_reported              6696 non-null     object  
 2   site_description_reported     1724 non-null     object  
 3   site_status_reported          3001 non-null     object  
 4   site_sqkm                     6696 non-null     float64 
 5   trees_planted_reported        4354 non-null     object  
 6   country                       4855 non-null     object  
 7   project_description_reported  6132 non-null     object  
 8   planting_date_reported        5641 non-null     object  
 9   survival_rate_reported        2300 non-null     object  
 10  host_name                     6224 non-null     object  
 11  url                           6696 non-null     object  
 12  specie

In [None]:

# Take the column order of the first frame as the template and select the
# same columns, in that order, from every frame.
# NOTE(review): `dfs` is not assigned in any cell visible in this notebook --
# presumably the list of per-source frames; confirm it is defined before this cell runs.
reference_order = list(dfs[0].columns)
reordered_dfs = [frame[reference_order] for frame in dfs]

In [None]:
# Stack the column-aligned frames row-wise under a fresh 0..n-1 index,
# then print the dtypes and non-null counts of the result.
df_reforestation = pd.concat(objs=reordered_dfs, axis=0, ignore_index=True)
df_reforestation.info()

### Filtering only polygon/Multipolygon rows

In [None]:


def is_polygon_or_multipolygon(geometry):
    """Report whether a geometry string describes a Polygon or MultiPolygon.

    The string is first decoded as JSON. If the decoded value is an object
    with a 'type' key, that key decides the answer. Otherwise (invalid JSON,
    or JSON without a 'type' key) the string is parsed as WKT.

    Parameters
    ----------
    geometry : object
        Candidate geometry. Anything that is not a ``str`` is rejected.

    Returns
    -------
    bool
        True for Polygon / MultiPolygon geometries; False for any other
        geometry type, for non-string input, and for strings that cannot
        be parsed.
    """
    accepted_types = ['Polygon', 'MultiPolygon']

    if not isinstance(geometry, str):
        return False

    # First attempt: GeoJSON-style text.
    try:
        decoded = json.loads(geometry)
    except json.JSONDecodeError:
        pass
    else:
        if isinstance(decoded, dict) and 'type' in decoded:
            return decoded['type'] in accepted_types

    # Second attempt: WKT text. Any parsing failure counts as "not a polygon".
    try:
        return wkt.loads(geometry).geom_type in accepted_types
    except Exception:
        return False

# Keep only the rows whose reported geometry is a Polygon or MultiPolygon,
# then print the dtypes and non-null counts of what is left.
polygon_rows = df_reforestation['geometry_reported'].map(is_polygon_or_multipolygon)
df_reforestation = df_reforestation[polygon_rows]

df_reforestation.info()

### Keeping only polygons smaller than 10,000 km²

In [None]:


def parse_geometry(geometry):
    """Convert a reported geometry (GeoJSON or WKT text) into a shapely geometry.

    Parameters
    ----------
    geometry : str or None
        Geometry text. GeoJSON is tried first, then WKT.

    Returns
    -------
    shapely geometry or None
        The parsed geometry, or None when the value is missing or cannot be
        parsed in either format.
    """
    if pd.isnull(geometry):
        return None

    # GeoJSON attempt. The catch is deliberately broad: besides invalid JSON,
    # text that decodes to something other than a GeoJSON geometry (e.g. "123")
    # or non-string input makes json.loads/shape raise other exception types.
    # Those cases fall through to the WKT attempt instead of aborting `.apply`.
    try:
        return shape(json.loads(geometry))
    except Exception:
        pass

    # WKT attempt. Unparseable input yields None.
    try:
        return wkt.loads(geometry)
    except Exception:
        return None

# Parse the reported geometry strings into shapely objects. `assign` returns a
# new frame rather than writing a column into the filtered slice in place.
df_reforestation = df_reforestation.assign(
    geometry=df_reforestation['geometry_reported'].apply(parse_geometry)
)

# Label the coordinates as EPSG:4326 (no reprojection), as the original cell did.
gdf = gpd.GeoDataFrame(df_reforestation, geometry='geometry').set_crs(
    "EPSG:4326", allow_override=True
)

# Fill missing areas (km²) from the polygons. The area is measured in an
# equal-area projection (EPSG:6933); World Mercator (EPSG:3395) is not
# equal-area and inflates areas with increasing latitude.
missing_area = gdf['site_sqkm'].isnull()
gdf.loc[missing_area, 'site_sqkm'] = (
    gdf.geometry[missing_area].to_crs(epsg=6933).area / 1e6
)

# The frame itself is still reprojected to EPSG:3395, as before.
gdf = gdf.to_crs(epsg=3395)

In [None]:



# Upper bound on site area in km²; sites at or above it are dropped.
MAX_SITE_SQKM = 10000

# Area values that cannot be read as numbers become NaN, so they fail the
# comparison below and are dropped as well.
gdf['site_sqkm'] = pd.to_numeric(gdf['site_sqkm'], errors='coerce')

# .copy() makes gdf_filtered an independent frame, so the later cells that
# modify it do not operate on a slice of gdf.
gdf_filtered = gdf[gdf['site_sqkm'] < MAX_SITE_SQKM].copy()
gdf_filtered.info()

### Dropping duplicates

In [None]:
# Keep the first row for each distinct reported geometry string.
# The result is reassigned instead of using inplace=True, which would mutate a
# frame derived from a boolean filter (pandas may flag that with SettingWithCopyWarning).
gdf_filtered = gdf_filtered.drop_duplicates(subset='geometry_reported')

In [None]:
# Print the dtypes and non-null counts after removing duplicates.
gdf_filtered.info()

#### Creating unique site and project IDs

In [None]:

# Site ids: one running number per row, starting at 1.
make_site_id = 'reforest_site_{}'.format
gdf_filtered['created_site_ids'] = [
    make_site_id(row_number) for row_number in range(1, len(gdf_filtered) + 1)
]

# Project ids: one running number per distinct reported project id, numbered
# in order of first appearance. Rows sharing a reported id share the created id.
unique_project_ids = gdf_filtered['project_id_reported'].unique()
project_id_mapping = {}
for position, reported_id in enumerate(unique_project_ids, start=1):
    project_id_mapping[reported_id] = 'reforest_proj_{}'.format(position)

gdf_filtered['created_project_ids'] = gdf_filtered['project_id_reported'].map(project_id_mapping)

In [None]:
# Show summary statistics for the filtered frame.
gdf_filtered.describe()

In [None]:
# Optional CSV export (disabled):
# gdf_filtered.to_csv("../input/consolidated_reforestation_projects.csv", index=False)

# Write the consolidated sites to GeoJSON.
# NOTE(review): the frame was reprojected to EPSG:3395 earlier in the notebook, so the
# coordinates written here are in that CRS -- confirm downstream readers expect this.
output_geojson = "../input/consolidated_reforestation_projects.geojson"
gdf_filtered.to_file(output_geojson, driver="GeoJSON")