# Consolidating and preprocessing source data

In this script, we combine the sites of the reforestation projects from the different organizations into one dataset and do some initial filtering.

In [None]:
import geopandas as gpd
import pandas as pd
import pycountry

### Importing provider data

#### Plant_planet_data


We extracted the data from https://www.plant-for-the-planet.org as described in the script 'Plant_Planet_Meta_Data_preprocessing.ipynb'.

In [None]:
# Load the Plant-for-the-Planet sites from the intermediate GeoPackage and
# print a summary of the resulting frame.
plant_gpkg = "../midsave/plant_for_planet.gpkg"
df_plant = gpd.read_file(plant_gpkg)
df_plant.info()

#### Tree_Nation

We extracted and filtered  the data from https://tree-nation.com/projects as described in the script 'Tree_Nation-meta_data_pre.ipynb'.

In [None]:
# Load the Tree-Nation sites from the intermediate GeoPackage and print a
# summary of the resulting frame.
tree_nation_gpkg = "../midsave/tree_nation.gpkg"
df_tn = gpd.read_file(tree_nation_gpkg)
df_tn.info()

#### Open_Forest_protocol

We extracted and filtered  the data from https://atlas.openforestprotocol.org/  as described in the script  "open_forest_projests_Data_filtering.ipynb".

In [None]:
# Load the Open Forest Protocol (atlas) sites from the intermediate
# GeoPackage and print a summary of the resulting frame.
atlas_gpkg = "../midsave/atlas.gpkg"
df_atlas = gpd.read_file(atlas_gpkg)
df_atlas.info()

#### Verra

We extracted and filtered  the data from https://registry.verra.org  as described in the script  "extracting_verra_sites.ipynb".

In [None]:
# Load the Verra registry sites from the intermediate GeoPackage and print a
# summary of the resulting frame.
verra_gpkg = "../midsave/verra.gpkg"
df_verra = gpd.read_file(verra_gpkg)
df_verra.info()

#### Restor.eco

We extracted and filtered  the data from  https://restor.eco/?lat=10.743821093825016&lng=4.473759981496621&zoom=4 as described in the script  "restor.ipynb".

In [None]:
# Load the Restor.eco sites from the intermediate GeoPackage and print a
# summary of the resulting frame.
restor_gpkg = "../midsave/restor_eco.gpkg"
df_restor = gpd.read_file(restor_gpkg)
df_restor.info()

#### Explorer Data

This data is extracted from the projects website https://explorer.land/x/projects as described in the script 'explorer_land.ipynb'. The column names were manually edited.

In [None]:
# Load the explorer.land sites from the intermediate GeoPackage and print a
# summary of the resulting frame.
explorer_gpkg = "../midsave/explorer_land.gpkg"
df_ex = gpd.read_file(explorer_gpkg)
df_ex.info()

#### Face the Future

We extracted the data from https://facethefuture.com/#projects as described in the script 'Face_Future_metadata_prepro.ipynb'.

In [None]:
# Load the Face the Future sites from the intermediate GeoPackage and print a
# summary of the resulting frame.
face_the_future_gpkg = "../midsave/face_the_future.gpkg"
df_ftf = gpd.read_file(face_the_future_gpkg)
df_ftf.info()

#### Climate Partner Impact
We extracted the data as described in the script 'climate_partner_impact.ipynb'.

In [None]:
# Load the Climate Partner Impact sites from the intermediate GeoPackage and
# print a summary of the resulting frame.
climate_partner_gpkg = "../midsave/climate_partner_impact.gpkg"
df_cpi = gpd.read_file(climate_partner_gpkg)
df_cpi.info()

#### Forest Trends
We extracted the data as described in the script 'forest_trends.ipynb'.

In [None]:
# Load the Forest Trends sites from the intermediate GeoPackage and print a
# summary of the resulting frame.
forest_trends_gpkg = "../midsave/forest_trends.gpkg"
df_ft = gpd.read_file(forest_trends_gpkg)
df_ft.info()

#### One Tree Planted
We extracted the data as described in the script 'one_tree_planted.ipynb'.

In [None]:
# Load the One Tree Planted sites from the intermediate GeoPackage and print
# a summary of the resulting frame.
one_tree_planted_gpkg = "../midsave/one_tree_planted.gpkg"
df_otp = gpd.read_file(one_tree_planted_gpkg)
df_otp.info()

#### Reforestaction
We extracted the data as described in the script 'reforestaction.ipynb'.

In [None]:
# Load the Reforestaction sites from the intermediate GeoPackage and print a
# summary of the resulting frame.
reforestaction_gpkg = "../midsave/reforestaction.gpkg"
df_reforestaction = gpd.read_file(reforestaction_gpkg)
df_reforestaction.info()

#### Reforestum
We extracted the data as described in the script 'reforestum.ipynb'.

In [None]:
# Load the Reforestum sites from the intermediate GeoPackage and print a
# summary of the resulting frame.
reforestum_gpkg = "../midsave/reforestum.gpkg"
df_reforestum = gpd.read_file(reforestum_gpkg)
df_reforestum.info()

#### Trees.org
We extracted the data as described in the script 'trees_org.ipynb'.

In [None]:
# Load the Trees.org sites from the intermediate GeoPackage and print a
# summary of the resulting frame.
trees_org_gpkg = "../midsave/trees_org.gpkg"
df_to = gpd.read_file(trees_org_gpkg)
df_to.info()

#### Veritree
We extracted the data as described in the script 'verritree.ipynb'.

In [None]:
# Load the Veritree sites from the intermediate GeoPackage and print a
# summary of the resulting frame.
veritree_gpkg = "../midsave/veritree.gpkg"
df_vt = gpd.read_file(veritree_gpkg)
df_vt.info()

#### Zero CO2
We extracted the data as described in the script 'zero_co2.ipynb'.

In [None]:
# Load the Zero CO2 sites from the intermediate GeoPackage and print a
# summary of the resulting frame.
zero_co2_gpkg = "../midsave/zero_co2.gpkg"
df_zc = gpd.read_file(zero_co2_gpkg)
df_zc.info()

### Combining the datasets

In [None]:
# Stack every provider table into a single frame with a fresh 0..n-1 index.
# The list order is kept exactly as before because it fixes the row order of
# the combined frame.
provider_frames = [
    df_plant, df_tn, df_verra, df_atlas, df_restor, df_ftf, df_ex, df_cpi,
    df_ft, df_otp, df_reforestaction, df_reforestum, df_to, df_vt, df_zc,
]
gdf = pd.concat(provider_frames, ignore_index=True)

# The two reported-status columns are not carried into the combined dataset.
gdf = gdf.drop(columns=['site_status_reported', 'status_reported'])
gdf.info()

### Data cleanup

Filter polygons with 0 < site_sqkm < 10000

In [None]:
# Keep only rows with 0 < site_sqkm < 10000, as stated in the heading above.
# Previously only the upper bound was enforced, so rows with a zero or
# negative site_sqkm slipped through. Rows with a missing site_sqkm fail both
# comparisons and are dropped, exactly as before.
area = gdf['site_sqkm']
area_in_range = (area > 0) & (area < 10000)

# copy() detaches the result from gdf; the index is renumbered from 0.
gdf_filtered = gdf[area_in_range].copy().reset_index(drop=True)
gdf_filtered.info()

Drop duplicate geometries

In [None]:
# Rows sharing an identical geometry count as duplicates; the first occurrence
# of each geometry is kept. The index is intentionally not renumbered here.
gdf_filtered = gdf_filtered.drop_duplicates(subset='geometry', keep='first')
gdf_filtered.info()

Clean up country names

In [None]:
# Show the distinct raw country values before they are normalised.
gdf_filtered['country'].unique()

In [None]:
# Lookup from a lower-cased country spelling to its lower-cased ISO 3166-1
# alpha-2 code. Besides the alpha-2 code and the ISO name, alpha-3 codes and
# pycountry's optional common/official names are accepted, so fewer reported
# spellings are silently turned into None.
country_to_iso2 = {}

# Pass 1: secondary spellings. setdefault keeps the first mapping if two
# countries ever share an alias.
for country in pycountry.countries:
    iso2 = country.alpha_2.lower()
    aliases = (
        getattr(country, 'alpha_3', None),
        # Only some pycountry entries carry these two attributes.
        getattr(country, 'common_name', None),
        getattr(country, 'official_name', None),
    )
    for alias in aliases:
        if alias:
            country_to_iso2.setdefault(alias.lower(), iso2)

# Pass 2: the original keys (alpha-2 code and ISO name) are written last so
# they always take precedence over any alias.
for country in pycountry.countries:
    iso2 = country.alpha_2.lower()
    country_to_iso2[iso2] = iso2
    country_to_iso2[country.name.lower()] = iso2


def _country_to_iso2(value):
    """Return the alpha-2 code for *value*, or None if missing or unknown."""
    if pd.isna(value):
        return None
    # str() guards against non-string entries; strip() tolerates stray
    # whitespace around the reported name.
    return country_to_iso2.get(str(value).strip().lower())


gdf_filtered['country'] = gdf_filtered['country'].apply(_country_to_iso2)

Create unique site and project ids

In [None]:
# A project is identified by its reported id together with its host: both are
# cast to str, joined with '-', and each distinct key gets an integer code.
project_key = (
    gdf_filtered['project_id_reported'].astype(str)
    + '-'
    + gdf_filtered['host_name'].astype(str)
)
project_codes = pd.factorize(project_key)[0]
gdf_filtered['project_id_created'] = project_codes

In [None]:
# Turn the current row index into the 'site_id_created' column and give the
# frame a fresh default index.
gdf_filtered = gdf_filtered.reset_index(names='site_id_created')

In [None]:
# Preview the first three rows of the consolidated frame.
gdf_filtered.iloc[:3]

In [None]:
# Write the consolidated sites to the intermediate folder as GeoJSON.
consolidated_path = "../midsave/consolidated_reforestation_projects.geojson"
gdf_filtered.to_file(consolidated_path, driver="GeoJSON")