In [1]:
import requests
import pandas as pd
import geopandas as gpd
from bs4 import BeautifulSoup
from shapely.ops import transform
from shapely.geometry import MultiPolygon, MultiLineString, MultiPoint, Polygon, LineString, Point
from io import BytesIO
from tqdm import tqdm
import time
from random import uniform
import zipfile

from helper_functions import kmz_to_kml, fetch_kml, parse_kml, process_kml_uris

### Read project list

Project lists were downloaded from the Verra registry on July 8 and 9, 2024, using the registry's export-to-CSV function for bulk download:

- Verified Carbon Standard (VCS): https://registry.verra.org/app/search/VCS/All%20Projects
- Climate, Community & Biodiversity Standards (CCB): https://registry.verra.org/app/search/CCB/All%20Projects
- Sustainable Development Verified Impact Standard (VISta): https://registry.verra.org/app/search/SDVISTA/All%20Projects

In [3]:
# Directory holding the three CSV exports downloaded from the Verra registry.
# It is defined once so the notebook can be pointed at another location by
# editing a single line instead of three separate absolute paths.
VERRA_INPUT_DIR = '/Users/angela/Documents/GFW/Forest_Monitoring/input/Verra'

# One project list per Verra programme (VCS, CCB, SD VISta).
projects_vcs = pd.read_csv(f'{VERRA_INPUT_DIR}/allprojects_vcs.csv')
projects_ccb = pd.read_csv(f'{VERRA_INPUT_DIR}/allprojects_ccb.csv')
projects_vista = pd.read_csv(f'{VERRA_INPUT_DIR}/allprojects_vista.csv')

In [4]:
# (rows, columns) of the VCS, CCB and VISta project lists as loaded
print(*(frame.shape for frame in (projects_vcs, projects_ccb, projects_vista)))

(4418, 13) (463, 9) (200, 10)


In [5]:
# Keep only the rows whose project-type column is filled in; each registry
# export stores that information under a different column name.
projects_vcs = projects_vcs.dropna(subset=['AFOLU Activities'])
projects_ccb = projects_ccb.dropna(subset=['CCB Project Type'])
projects_vista = projects_vista.dropna(subset=['Project Type'])

In [6]:
# (rows, columns) of the three project lists after dropping rows without a project type
print(*(frame.shape for frame in (projects_vcs, projects_ccb, projects_vista)))

(1475, 13) (463, 9) (200, 10)


In [7]:
# Boolean masks selecting the rows whose project type mentions the activity of interest
is_arr_vcs = projects_vcs['AFOLU Activities'].str.contains("ARR")
is_arr_ccb = projects_ccb['CCB Project Type'].str.contains("Afforestation, Reforestation and Revegetation")
is_afolu_vista = projects_vista['Project Type'].str.contains("Agriculture Forestry and Other Land Use")

# Project IDs of the selected rows, one list per registry export
project_list_vcs = projects_vcs.loc[is_arr_vcs, 'ID'].tolist()
project_list_ccb = projects_ccb.loc[is_arr_ccb, 'ID'].tolist()
project_list_vista = projects_vista.loc[is_afolu_vista, 'ID'].tolist()

List of unique project IDs

In [8]:
# Merge the three ID lists and drop IDs that appear in more than one export
project_list = list({*project_list_vcs, *project_list_ccb, *project_list_vista})

In [9]:
# Number of IDs: combined unique list first, then VCS, CCB and VISta individually
print(*map(len, (project_list, project_list_vcs, project_list_ccb, project_list_vista)))

631 511 201 65


### Extract geometries per project

In [10]:
# Accumulators filled by the extraction loop: `gdf` collects the per-project
# geometry tables, `no_geom_list` the IDs of projects without any KML document.
gdf, no_geom_list = pd.DataFrame(), []

In [11]:
# Seconds to wait for the registry before abandoning a request. Without a
# timeout, `requests.get` can wait indefinitely on a stalled connection.
REQUEST_TIMEOUT_S = 60

for project_id in tqdm(project_list):

    # 1. Query the registry's summary endpoint for this project
    try:
        response = requests.get(
            f'https://registry.verra.org/uiapi/resource/resourceSummary/{project_id}',
            timeout=REQUEST_TIMEOUT_S,
        )

    except Exception as e:
        print(f"Error with project {project_id}: {e}")
        continue

    if response.status_code != 200:
        print(f"Request failed with status code: {response.status_code}")
        continue

    data = response.json()

    # Delay to avoid excess request responses. It runs right after every
    # successful registry call so the early `continue`s below cannot skip it.
    time.sleep(uniform(0, 2.0))

    # 2. Extract KML URIs (a set removes duplicates across document groups)
    kml_uris = set()
    for group in data.get('documentGroups') or []:
        for document in group.get('documents') or []:
            # Treat missing / null fields as empty strings instead of raising
            document_type = (document.get('documentType') or '').lower()
            document_name = document.get('documentName') or ''
            if document_type == 'kml file' or document_name.endswith('.kml'):
                kml_uris.add(document['uri'])

    if not kml_uris:
        no_geom_list.append(project_id)
        print(f'No geometries available for project: {project_id}')
        # Skip the rest of the iteration: there is nothing to convert, and
        # falling through would reuse `geometries` from the previous project
        # (or raise NameError on the very first one).
        continue

    # 3. Process the KML URIs to get geometries
    try:
        geometries = process_kml_uris(list(kml_uris))

    except Exception as e:
        print(f"Error querying the geometry of project {project_id}: {e}")
        continue

    # 4. Convert geometries to GeoPandas DataFrame
    temp = gpd.GeoDataFrame(geometry=geometries)

    # Assign CRS: centroid y-values beyond 180 cannot be degrees, so such
    # coordinates are interpreted as EPSG:3857 and reprojected to EPSG:4326.
    if abs(temp.geometry.centroid.y).max() > 180:
        temp = temp.set_crs(3857).to_crs(4326)
    else:
        temp = temp.set_crs(4326)

    # Explode MultiPolygons into individual Polygons
    temp = temp.explode(index_parts=False)

    # 3D to 2D geometries (drop the z coordinate when present)
    temp['geometry'] = temp['geometry'].apply(lambda geometry: transform(lambda x, y, z=None: (x, y), geometry))

    # 5. Assign identifiers; an empty or missing description is stored as None
    temp['project_id_reported'] = project_id
    temp['project_description_reported'] = data.get('description') or None
    # Running number of each geometry within the project
    temp = temp.reset_index(drop = True).reset_index().rename(columns={'index': 'site_id_reported'})

    # 6. Add project to output. Appending per project keeps everything fetched
    # so far in `gdf` if the loop is interrupted.
    gdf = pd.concat([gdf, temp], ignore_index=True)


  0%|          | 0/631 [01:10<?, ?it/s]


KeyboardInterrupt: 

Check which project IDs have not been handled yet, i.e. appear neither in `gdf` nor in `no_geom_list`

In [None]:
# IDs from the project list that have neither geometries in `gdf` nor an entry in `no_geom_list`
unqueried_ids = set(project_list) - set(gdf['project_id_reported'])
unqueried_ids = unqueried_ids - set(no_geom_list)
unqueried_ids

In [None]:
# Shrink the work list to the IDs that still need to be queried
pending_ids = set(project_list) - set(gdf['project_id_reported'])
project_list = pending_ids - set(no_geom_list)

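!! Important: Re-run the extraction loop above (the `for project_id in tqdm(project_list)` cell) with the reduced `project_list`, and repeat until no unqueried project IDs remain !!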

In [None]:
# Number of distinct projects that contributed at least one geometry
gdf['project_id_reported'].nunique()

In [None]:
# Summary of columns, dtypes and non-null counts for the collected geometries
gdf.info()

### Fix geometries

In [None]:
# Repair invalid geometries before any further geometric processing
repaired_geometries = gdf['geometry'].make_valid()
gdf['geometry'] = repaired_geometries

In [None]:
# Split multi-part geometries into single parts; the second pass also unpacks
# parts that were themselves nested inside a collection.
exploded_once = gdf.explode(index_parts=False)
exploded_twice = exploded_once.explode(index_parts=False)
gdf = exploded_twice.reset_index(drop=True)

Turn linestrings into polygons

In [None]:
def _linestring_to_polygon(geom):
    """Convert a LineString into a Polygon by using it as the exterior ring.

    Open lines are closed by appending their first coordinate. Geometries
    that are not LineStrings are returned unchanged. LineStrings with too few
    vertices to form a ring (fewer than four coordinates once closed) are also
    returned unchanged, because building a Polygon from them raises an error
    that would abort the conversion of every other row.
    """
    if not isinstance(geom, LineString):
        return geom

    ring = list(geom.coords)
    if ring and not geom.is_closed:
        # Close the ring explicitly with the starting coordinate
        ring = ring + [ring[0]]

    # A polygon shell needs at least three vertices plus the closing point
    if len(ring) < 4:
        return geom

    return Polygon(ring)


gdf['geometry'] = gdf['geometry'].apply(_linestring_to_polygon)

In [None]:
# Polygons built from linestrings may be invalid (e.g. self-intersecting), so repair again
revalidated_geometries = gdf['geometry'].make_valid()
gdf['geometry'] = revalidated_geometries

In [None]:
# Site area in square kilometres.
# The area is measured in EPSG:6933 (WGS 84 / NSIDC EASE-Grid 2.0 Global), an
# equal-area projection. Web Mercator (EPSG:3857) is not equal-area: it
# increasingly overstates areas with distance from the equator, so it is
# unsuitable for comparing site sizes across countries.
EQUAL_AREA_CRS = 6933
gdf['site_sqkm'] = gdf.to_crs(EQUAL_AREA_CRS).area / 1e6

In [None]:
# Distribution of site areas (km²) as a quick plausibility check
gdf['site_sqkm'].describe()

### Add project-level metadata

In [None]:
# Metadata columns taken from each registry export; only the VCS export
# contributes the crediting-period start date.
vcs_metadata = projects_vcs[['ID', 'Status', 'Country/Area', 'Crediting Period Start Date']]
ccb_metadata = projects_ccb[['ID', 'Status', 'Country/Area']]
vista_metadata = projects_vista[['ID', 'Status', 'Country/Area']]

# Mapping from registry column names to the names used in the output table
harmonized_names = {
    'ID': 'project_id_reported',
    'Status': 'status_reported',
    'Country/Area': 'country',
    'Crediting Period Start Date': 'planting_date_reported',
}

# Stack the three tables (VCS first), keep the first row per project ID, then rename
projects_df = pd.concat([vcs_metadata, ccb_metadata, vista_metadata])
projects_df = projects_df.drop_duplicates(subset='ID')
projects_df = projects_df.rename(columns=harmonized_names)

In [None]:
# Reduce the reported date to its calendar year
parsed_dates = pd.to_datetime(projects_df['planting_date_reported'])
projects_df['planting_date_reported'] = parsed_dates.dt.year

In [None]:
# Attach the project-level metadata to every site; the left join keeps sites
# whose project ID has no metadata row.
gdf = gdf.merge(
    projects_df,
    how='left',
    on='project_id_reported',
)

In [None]:
# Summary of columns, dtypes and non-null counts after joining the project-level metadata
gdf.info()

### Harmonize nomenclature

In [None]:
# Source identifiers shared by every row of this dataset
gdf['url'] = 'https://registry.verra.org'
gdf['host_name'] = 'Verra'

# Columns expected in the harmonized output. A column is filled with None only
# when it does not exist yet: `country` and `planting_date_reported` are
# populated by the project-metadata merge, and unconditionally assigning None
# would discard those values.
for harmonized_column in (
    'species_count_reported',
    'species_planted_reported',
    'country',
    'survival_rate_reported',
    'planting_date_reported',
):
    if harmonized_column not in gdf.columns:
        gdf[harmonized_column] = None

### Save it

In [None]:
# Persist the harmonized Verra sites as an intermediate Parquet file
output_path = "../midsave/verra.parquet"
gdf.to_parquet(output_path)