In [2]:
import requests
import pandas as pd
import geopandas as gpd
from bs4 import BeautifulSoup
from shapely.ops import transform
from shapely.geometry import MultiPolygon, MultiLineString, MultiPoint, Polygon, LineString, Point
from io import BytesIO
from tqdm import tqdm
import time
from random import uniform
import zipfile
import os

from helper_functions import kmz_to_kml, fetch_kml, parse_kml, process_kml_uris, convert_3d_to_2d



### Read project list

Project lists were downloaded from the Verra registry on July 8th and 9th, 2024, using the registry's export-to-CSV functionality for bulk download:

- Verified Carbon Standard (VCS): https://registry.verra.org/app/search/VCS/All%20Projects
- Climate, Community & Biodiversity Standards (CCB): https://registry.verra.org/app/search/CCB/All%20Projects
- Sustainable Development Verified Impact Standard (VISta): https://registry.verra.org/app/search/SDVISTA/All%20Projects

In [None]:
# Load the three registry exports (VCS, CCB, VISta) in one pass; the tuple order
# matches the order of the target names on the left-hand side.
projects_vcs, projects_ccb, projects_vista = map(
    pd.read_csv,
    (
        '../input/Verra/allprojects_vcs.csv',
        '../input/Verra/allprojects_ccb.csv',
        '../input/Verra/allprojects_vista.csv',
    ),
)

In [None]:
# Remember which registry export each row came from; the label is used further
# down to build the project URL.
for _registry_label, _registry_frame in (
    ('VCS', projects_vcs),
    ('CCB', projects_ccb),
    ('VISTA', projects_vista),
):
    _registry_frame['registry_name'] = _registry_label

In [None]:
# (rows, columns) of each registry export, in the order VCS, CCB, VISta.
print(*(frame.shape for frame in (projects_vcs, projects_ccb, projects_vista)))

In [None]:
# Discard rows whose activity / project type is missing: the substring filters
# applied to these columns afterwards cannot handle NaN values.
projects_vcs = projects_vcs.dropna(subset=['AFOLU Activities'])
projects_ccb = projects_ccb.dropna(subset=['CCB Project Type'])
projects_vista = projects_vista.dropna(subset=['Project Type'])

In [None]:
# (rows, columns) of each registry table at this point, in the order VCS, CCB, VISta.
print(*(frame.shape for frame in (projects_vcs, projects_ccb, projects_vista)))

In [None]:
# Build one boolean mask per registry, then pull the IDs of the matching rows.
# NOTE(review): the VISta filter matches the whole AFOLU category rather than
# ARR specifically - confirm this is intended.
is_selected_vcs = projects_vcs['AFOLU Activities'].str.contains("ARR")
is_selected_ccb = projects_ccb['CCB Project Type'].str.contains("Afforestation, Reforestation and Revegetation")
is_selected_vista = projects_vista['Project Type'].str.contains("Agriculture Forestry and Other Land Use")

project_list_vcs = projects_vcs.loc[is_selected_vcs, 'ID'].tolist()
project_list_ccb = projects_ccb.loc[is_selected_ccb, 'ID'].tolist()
project_list_vista = projects_vista.loc[is_selected_vista, 'ID'].tolist()

List of unique project IDs

In [None]:
# A project may be listed under several standards; the set union keeps each ID once.
project_list = list(set().union(project_list_vcs, project_list_ccb, project_list_vista))

In [None]:
# Number of unique IDs overall, followed by the per-registry counts (VCS, CCB, VISta).
print(*map(len, (project_list, project_list_vcs, project_list_ccb, project_list_vista)))

### Extract geometries per project

In [None]:
# gdf collects the per-project geometry frames; no_geom_list collects the IDs of
# projects for which no KML document was found. Both are filled by the
# extraction loop and must survive re-runs of that loop, so they live in their
# own cell.
gdf, no_geom_list = pd.DataFrame(), []

In [None]:
# Folder that receives the downloaded project description PDFs. It is relative
# to the notebook directory, like the other input/output paths in this
# notebook, so the cell does not depend on one user's home directory.
PD_DIR = os.path.join('..', 'midsave', 'project_descriptions')
os.makedirs(PD_DIR, exist_ok=True)

# Upper bound (seconds) for a single registry request, so that a stalled
# connection cannot hang the whole run.
REQUEST_TIMEOUT = 60


def _iter_documents(summary):
    """Yield every document entry from all document groups of a resource summary."""
    for group in summary.get('documentGroups', []):
        yield from group.get('documents', [])


def _linestring_to_polygon(geom):
    """Convert a LineString outline into a Polygon; return anything else unchanged.

    Open lines are closed by repeating their first vertex. Lines that would
    yield a ring with fewer than four coordinates cannot form a polygon and are
    returned as-is (the geometry-type filter later removes them) instead of
    raising and aborting the whole run.
    """
    if not isinstance(geom, LineString):
        return geom
    ring = list(geom.coords)
    if ring and not geom.is_closed:
        ring.append(ring[0])
    if len(ring) < 4:
        return geom
    return Polygon(ring)


for project_id in tqdm(project_list):

    try:
        response = requests.get(
            f'https://registry.verra.org/uiapi/resource/resourceSummary/{project_id}',
            timeout=REQUEST_TIMEOUT,
        )
    except Exception as e:
        print(f"Error with project {project_id}: {e}")
        continue

    if response.status_code != 200:
        print(f"Request failed with status code: {response.status_code}")
        continue

    data = response.json()

    # Extract KML URIs (a set removes duplicates). Missing/None fields are
    # treated as empty strings and the extension check is case-insensitive.
    kml_uris = set()
    for document in _iter_documents(data):
        doc_type = (document.get('documentType') or '').lower()
        doc_name = (document.get('documentName') or '').lower()
        if doc_type == 'kml file' or doc_name.endswith('.kml'):
            kml_uris.add(document['uri'])

    if not kml_uris:
        # Without this `continue` the code below would either fail on an
        # undefined `geometries` or silently reuse the previous project's
        # geometries for this project.
        no_geom_list.append(project_id)
        print(f'No geometries available for project: {project_id}')
        time.sleep(uniform(0, 5.0))
        continue

    try:
        # Process the KML URIs to get geometries
        geometries = process_kml_uris(list(kml_uris))
    except Exception as e:
        # Project ends up neither in gdf nor in no_geom_list, so a re-run of
        # this cell will try it again.
        print(f"Error querying the geometry of project {project_id}: {e}")
        continue

    # Convert geometries to GeoPandas DataFrame
    temp = gpd.GeoDataFrame(geometry=geometries)

    # Assign CRS. Centroid ordinates beyond 180 cannot be degrees, so such
    # files are read as Web Mercator and reprojected.
    # NOTE(review): heuristic only - valid latitudes end at 90, so values
    # between 90 and 180 are still treated as degrees; confirm the threshold.
    if abs(temp.geometry.centroid.y).max() > 180:
        temp = temp.set_crs(3857).to_crs(4326)
    else:
        temp = temp.set_crs(4326)

    # Project-level flag: was any geometry invalid as delivered (before repair)?
    invalid_geom = bool((~temp.geometry.is_valid).any())

    # 3D to 2D geometries (drop the optional z ordinate)
    temp['geometry'] = temp['geometry'].apply(
        lambda geometry: transform(lambda x, y, z=None: (x, y), geometry))

    # Explode deeply nested geometries and keep only polygons. make_valid can
    # itself emit new MultiPolygons, hence the loop.
    # NOTE(review): the loop body (including the LineString -> Polygon
    # conversion) only runs while a MultiPolygon is present - confirm that
    # projects consisting solely of LineStrings/GeometryCollections are meant
    # to be filtered out below.
    while 'MultiPolygon' in temp.geometry.geom_type.unique().tolist():
        temp = temp.explode(index_parts=False)
        temp['geometry'] = temp['geometry'].make_valid()
        temp['geometry'] = temp['geometry'].apply(_linestring_to_polygon)

    temp['geometry'] = temp['geometry'].make_valid()
    temp = temp.loc[temp.geometry.geom_type.isin(['Polygon', 'Point'])].copy()

    # Download the project description unless it is already on disk. An
    # existing file only skips the download; the project's geometries are
    # still added to the output.
    pdf_path = os.path.join(PD_DIR, f"pd_verra_{project_id}.pdf")
    pd_available = os.path.exists(pdf_path)
    if not pd_available:
        pd_uris = [
            document['uri']
            for document in _iter_documents(data)
            if (document.get('documentType') or '').lower() == 'project description'
        ]
        for uri in pd_uris:
            # fetch_kml is reused here as a generic downloader returning the
            # raw response body (falsy on failure - presumably; see helper).
            uri_content = fetch_kml(uri)
            if uri_content:
                # If several descriptions exist the last successful one wins.
                with open(pdf_path, "wb") as pdf_file:
                    pdf_file.write(uri_content)
                pd_available = True

    # Assign identifiers
    temp['project_id_reported'] = project_id
    temp['project_geometries_invalid'] = invalid_geom
    temp['project_pdf_available'] = pd_available
    # Empty or missing descriptions are stored as None.
    temp['project_description_reported'] = data.get('description') or None
    temp = temp.reset_index(drop = True).reset_index().rename(columns={'index': 'site_id_reported'})

    # Add project to output. Appending per project keeps everything fetched so
    # far in gdf if the run is interrupted.
    gdf = pd.concat([gdf, temp], ignore_index=True)

    # Delay to avoid excess request responses
    time.sleep(uniform(0, 5.0))


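Check which project IDs have not been processed yet, i.e. are neither in the extracted geometries nor in the list of projects without geometries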

In [None]:
# IDs still outstanding: requested, but neither extracted nor recorded as
# having no KML document.
set(project_list).difference(gdf['project_id_reported'], no_geom_list)

In [None]:
# Narrow the work list to the outstanding IDs so that re-running the extraction
# loop only retries those.
project_list = set(project_list).difference(gdf['project_id_reported'], no_geom_list)

!! Important: Re-run the extraction loop above, followed by this check, until no project IDs remain, to ensure all projects have been queried !!

In [None]:
# Number of distinct projects that contributed at least one geometry.
gdf['project_id_reported'].nunique()

In [None]:
# Column dtypes and non-null counts of the extracted geometries.
gdf.info()

### Compute site area

In [None]:
# Site area in square kilometres. Areas must be measured in an equal-area
# projection: Web Mercator (EPSG:3857) inflates areas increasingly with
# distance from the equator, so EPSG:6933 (WGS 84 / NSIDC EASE-Grid 2.0
# Global, a cylindrical equal-area CRS with metre units) is used instead.
# Point geometries get an area of 0.
gdf['site_sqkm'] = gdf.to_crs(6933).area / 1e6

In [None]:
# Summary statistics of the per-site area, as a quick plausibility check.
gdf['site_sqkm'].describe()

### Add project-level metadata

In [None]:
# Stack the metadata columns of the three registry tables. Only the VCS export
# contributes a crediting period start date; for the other registries that
# column is filled with NaN by the concatenation.
shared_columns = ['ID', 'Status', 'Country/Area', 'registry_name']
registry_frames = [
    projects_vcs[['ID', 'Status', 'Country/Area', 'Crediting Period Start Date', 'registry_name']],
    projects_ccb[shared_columns],
    projects_vista[shared_columns],
]

# A project listed under several standards keeps its first occurrence, i.e. the
# VCS row wins over CCB, which wins over VISta.
projects_df = pd.concat(registry_frames).drop_duplicates(subset = 'ID')

# Switch to the harmonised column names used in the output.
projects_df = projects_df.rename(columns = {
    'ID': 'project_id_reported',
    'Status': 'status_reported',
    'Country/Area': 'country',
    'Crediting Period Start Date': 'planting_date_reported',
})

In [None]:
# Reduce the crediting period start date to its calendar year.
# Caution: running this cell twice on the same frame re-parses the year
# numbers as timestamps; rebuild projects_df first.
planting_dates = pd.to_datetime(projects_df['planting_date_reported'])
projects_df['planting_date_reported'] = planting_dates.dt.year

In [None]:
# Attach the project-level metadata to every site row. The left join keeps all
# geometries even when a project has no metadata row.
# Caution: re-running this cell merges the same columns again and produces
# suffixed duplicates (_x / _y).
gdf = gdf.merge(projects_df, on = 'project_id_reported', how = 'left')

In [None]:
# Column dtypes and non-null counts after the metadata merge.
gdf.info()

### Harmonize nomenclature

In [14]:
# gdf= gpd.read_parquet("/Users/angela/Documents/Forest_Monitoring/midsave/verra.parquet")
# gdf.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 1225162 entries, 0 to 1225161
Data columns (total 14 columns):
 #   Column                        Non-Null Count    Dtype   
---  ------                        --------------    -----   
 0   site_id_reported              1225162 non-null  int64   
 1   project_id_reported           1225162 non-null  int64   
 2   project_description_reported  1225162 non-null  object  
 3   site_sqkm                     1225162 non-null  float64 
 4   status_reported               1225162 non-null  object  
 5   country                       0 non-null        object  
 6   url                           1225162 non-null  object  
 7   host_name                     1225162 non-null  object  
 8   species_count_reported        0 non-null        object  
 9   species_planted_reported      0 non-null        object  
 10  survival_rate_reported        0 non-null        object  
 11  geometry                      1225159 non-null  geometry
 12  planti

In [15]:
# Show the coordinate reference system attached to the geometry column.
print(gdf.crs)

{"$schema": "https://proj.org/schemas/v0.7/projjson.schema.json", "type": "GeographicCRS", "name": "WGS 84", "datum_ensemble": {"name": "World Geodetic System 1984 ensemble", "members": [{"name": "World Geodetic System 1984 (Transit)"}, {"name": "World Geodetic System 1984 (G730)"}, {"name": "World Geodetic System 1984 (G873)"}, {"name": "World Geodetic System 1984 (G1150)"}, {"name": "World Geodetic System 1984 (G1674)"}, {"name": "World Geodetic System 1984 (G1762)"}, {"name": "World Geodetic System 1984 (G2139)"}], "ellipsoid": {"name": "WGS 84", "semi_major_axis": 6378137, "inverse_flattening": 298.257223563}, "accuracy": "2.0", "id": {"authority": "EPSG", "code": 6326}}, "coordinate_system": {"subtype": "ellipsoidal", "axis": [{"name": "Geodetic latitude", "abbreviation": "Lat", "direction": "north", "unit": "degree"}, {"name": "Geodetic longitude", "abbreviation": "Lon", "direction": "east", "unit": "degree"}]}, "scope": "Horizontal component of 3D system.", "area": "World.", "bb

In [16]:
# Preview of the geometry column.
gdf["geometry"]

0          POLYGON ((105.42801 33.28925, 105.42802 33.289...
1          POLYGON ((105.42302 33.28654, 105.42323 33.286...
2          POLYGON ((105.46761 33.27973, 105.46762 33.279...
3          POLYGON ((105.46989 33.28532, 105.4699 33.2853...
4          POLYGON ((105.46846 33.28361, 105.4687 33.2839...
                                 ...                        
1225157    POLYGON ((-41.3239 -16.65141, -41.3239 -16.651...
1225158    POLYGON ((-41.33726 -16.65268, -41.3372 -16.65...
1225159    POLYGON ((-41.33997 -16.64192, -41.3407 -16.64...
1225160    POLYGON ((-41.3364 -16.63705, -41.33559 -16.63...
1225161    POLYGON ((-41.33486 -16.62492, -41.33446 -16.6...
Name: geometry, Length: 1225162, dtype: geometry

In [17]:
# Row-wise validity flag: True wherever the geometry is not valid.
invalid_geom = ~gdf.geometry.is_valid
gdf = gdf.assign(project_geometries_invalid=invalid_geom)


In [18]:
# Preview of the validity flag.
gdf['project_geometries_invalid']

0          False
1          False
2          False
3          False
4          False
           ...  
1225157    False
1225158    False
1225159    False
1225160    False
1225161    False
Name: project_geometries_invalid, Length: 1225162, dtype: bool

In [19]:
# Add the harmonised provenance columns and drop the helper column that was
# only needed to build the URL.
#  - project_id_reported is numeric, so it must be cast to str before it can be
#    concatenated into the URL; str + int Series concatenation raises TypeError.
#  - planting_date_type records that the "planting date" is really the
#    crediting period start date taken from the registry export.
#  - species / survival columns are placeholders set to None here.
gdf = (gdf
       .assign(
           url='https://registry.verra.org/app/projectDetail/'
               + gdf.registry_name + '/' + gdf.project_id_reported.astype(str),
           host_name='Verra',
           planting_date_type="Crediting Period Start Date",
           species_count_reported=None,
           species_planted_reported=None,
           survival_rate_reported=None)
       .drop(columns = ['registry_name']))

In [20]:
# Final check of column dtypes and non-null counts before saving.
gdf.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 1225162 entries, 0 to 1225161
Data columns (total 15 columns):
 #   Column                        Non-Null Count    Dtype   
---  ------                        --------------    -----   
 0   site_id_reported              1225162 non-null  int64   
 1   project_id_reported           1225162 non-null  int64   
 2   project_description_reported  1225162 non-null  object  
 3   site_sqkm                     1225162 non-null  float64 
 4   status_reported               1225162 non-null  object  
 5   country                       0 non-null        object  
 6   url                           1225162 non-null  object  
 7   host_name                     1225162 non-null  object  
 8   species_count_reported        0 non-null        object  
 9   species_planted_reported      0 non-null        object  
 10  survival_rate_reported        0 non-null        object  
 11  geometry                      1225159 non-null  geometry
 12  planti

### Save it

In [22]:
# Write the harmonised table to the shared midsave folder as Parquet.
gdf.to_parquet("../midsave/verra.parquet")