In [1]:
import pandas as pd
import geopandas as gpd
from shapely.geometry import shape, Polygon, MultiPolygon
import numpy as np
import requests
import json
import time
import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
from helper_functions import *

In [2]:


# Defining  longitude ranges (left to right) in 45-degree increments to extract the ids from the Restor website
longitude_lefts = [-180 + i * 45 for i in range(8)]

#Defining the latitude steps following the web mercator projection as the Restor website uses this projection
latitude_steps = [
    85.0511,  # The Maximum latitude in Web Mercator
    79.17133464081945,
    66.51326044311186,
    40.97989806962013,
    0,
    -40.97989806962013,
    -66.51326044311186,
    -79.17133464081945,
    -85.0511  # Minimum latitude in Web Mercator
]

# extracting all urls for the Restor website using the longitude and latitude stepsas per the restor website
urls = []
for left in longitude_lefts:
    right = left + 45
    for i in range(len(latitude_steps) - 1):
        top = latitude_steps[i]
        bottom = latitude_steps[i + 1]
        if top > bottom:  
            url = (
                f"https://restor2-prod-1-api.restor.eco/sites/3/center-points/"
                f"?bottom={bottom}&left={left}&right={right}&top={top}&visibility=PUBLIC"
            )
            urls.append(url)

# Collect all ids from all urls in the Restor website
all_data = []

for url in urls:
    try:
        response = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'})
        response.raise_for_status()  
        data = response.json()
        if isinstance(data, list):
            all_data.extend(data)
            print(f"Fetched {len(data)} items from {url}")
        else:
            print(f"Unexpected data format from {url}")
    except Exception as e:
        print(f"Error fetching {url}: {e}")
    time.sleep(1)  

# Saving all collected ids to a json file
with open('../input/Restor_Eco/all_restor_data.json', 'w') as f:
    json.dump(all_data, f, indent=2)

print(f"Data collection complete. Saved {len(all_data)} items to all_restor_data.json.")

Fetched 0 items from https://restor2-prod-1-api.restor.eco/sites/3/center-points/?bottom=79.17133464081945&left=-180&right=-135&top=85.0511&visibility=PUBLIC
Fetched 0 items from https://restor2-prod-1-api.restor.eco/sites/3/center-points/?bottom=66.51326044311186&left=-180&right=-135&top=79.17133464081945&visibility=PUBLIC
Fetched 8 items from https://restor2-prod-1-api.restor.eco/sites/3/center-points/?bottom=40.97989806962013&left=-180&right=-135&top=66.51326044311186&visibility=PUBLIC
Fetched 29 items from https://restor2-prod-1-api.restor.eco/sites/3/center-points/?bottom=0&left=-180&right=-135&top=40.97989806962013&visibility=PUBLIC
Fetched 0 items from https://restor2-prod-1-api.restor.eco/sites/3/center-points/?bottom=-40.97989806962013&left=-180&right=-135&top=0&visibility=PUBLIC
Fetched 0 items from https://restor2-prod-1-api.restor.eco/sites/3/center-points/?bottom=-66.51326044311186&left=-180&right=-135&top=-40.97989806962013&visibility=PUBLIC
Fetched 0 items from https://r

In [3]:

with open('../input/Restor_Eco/all_restor_data.json', 'r') as f:
    data = json.load(f)

df = pd.DataFrame(data)

result_list = []

for id in df['id']:
    data = fetch_data(id)
    if data is not None:
        result_list.append(data)

    time.sleep(1)  

# Converting the list of dictionaries to DataFrame
if result_list:
    final_df = pd.DataFrame(result_list)
    final_df.reset_index(drop=True, inplace=True)
    print(final_df)

    final_df.to_csv('../input/Restor_Eco/final_restor_data.csv', index=False)
else:
    print("No data was retrieved")

Error fetching data for id e8378894-7ceb-4fa0-8abd-f678bc7c4dac: HTTPSConnectionPool(host='restor2-prod-1-api.restor.eco', port=443): Max retries exceeded with url: /sites/3/e8378894-7ceb-4fa0-8abd-f678bc7c4dac (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x344836210>: Failed to resolve 'restor2-prod-1-api.restor.eco' ([Errno 8] nodename nor servname provided, or not known)"))
Error fetching data for id 1b87ce33-8d90-40fb-ae01-82003603acca: HTTPSConnectionPool(host='restor2-prod-1-api.restor.eco', port=443): Max retries exceeded with url: /sites/3/1b87ce33-8d90-40fb-ae01-82003603acca (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x344835bd0>: Failed to resolve 'restor2-prod-1-api.restor.eco' ([Errno 8] nodename nor servname provided, or not known)"))
                                         id  \
0      db1554a2-636c-42d2-bdd9-c5b6321bfdb4   
1      aa80c579-5626-4451-aec8-30d7305bb854   
2      f10cefc0-1ca4-4325-90fd-f4

In [4]:
df = final_df[final_df["siteType"] == "RESTORATION"]
df.info()


<class 'pandas.core.frame.DataFrame'>
Index: 27259 entries, 8 to 59898
Data columns (total 32 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   id                         27259 non-null  object 
 1   name                       27259 non-null  object 
 2   polygon                    27259 non-null  object 
 3   boundingBox                27259 non-null  object 
 4   siteType                   27259 non-null  object 
 5   siteVisibility             27259 non-null  object 
 6   surfaceAreaKm2             27259 non-null  float64
 7   countryCode                27259 non-null  object 
 8   interventionStartYear      26385 non-null  float64
 9   stage                      27259 non-null  object 
 10  interventionType           27259 non-null  object 
 11  goals                      27259 non-null  object 
 12  supportSought              27259 non-null  object 
 13  website                    12919 non-null  object 


### Fix geometries

In [5]:
# Converting string to dictionary
df['polygon'] = df['polygon'].apply(ast.literal_eval)

df['geometry'] = df['polygon'].apply(lambda x: shape(x) if isinstance(x, dict) else Polygon())


ValueError: malformed node or string: {'type': 'MultiPolygon', 'coordinates': [[[[-156.0355582, 19.71438405], [-156.03455464, 19.71421748], [-156.0343196, 19.71560723], [-156.03536541, 19.7155774], [-156.0355582, 19.71438405]]]]}

In [None]:
gdf = gpd.GeoDataFrame(df, geometry = 'geometry', crs = 'EPSG:4326')
gdf['geometry'] = gdf['geometry'].make_valid()

In [None]:
gdf = gdf.explode(index_parts = False)

In [None]:
gdf = gdf[(gdf.geometry.geom_type == 'Polygon') | (gdf.geometry.geom_type == 'MultiPolygon')]
gdf = gdf.explode(index_parts = False)
gdf = gdf.reset_index(drop = True)

### Harmonize nomenclature

In [None]:
gdf['site_sqkm'] = gdf['geometry'].to_crs(3857).area / 1e6
gdf['site_sqkm'].describe()

count    5.488000e+04
mean     5.656300e+01
std      1.730234e+03
min      6.279266e-13
25%      1.811333e-03
50%      1.075112e-02
75%      5.809420e-02
max      2.362557e+05
Name: site_sqkm, dtype: float64

In [None]:
gdf['country'] = gdf['countryCode'].apply(lambda x: x if isinstance(x, str) else '')

In [None]:
gdf.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 54880 entries, 0 to 54879
Data columns (total 35 columns):
 #   Column                     Non-Null Count  Dtype   
---  ------                     --------------  -----   
 0   id                         54880 non-null  object  
 1   name                       54785 non-null  object  
 2   polygon                    54880 non-null  object  
 3   boundingBox                54880 non-null  object  
 4   siteType                   54880 non-null  object  
 5   siteVisibility             54880 non-null  object  
 6   surfaceAreaKm2             54880 non-null  float64 
 7   countryCode                54860 non-null  object  
 8   interventionStartYear      53032 non-null  float64 
 9   stage                      54880 non-null  object  
 10  interventionType           54880 non-null  object  
 11  goals                      54880 non-null  object  
 12  supportSought              54880 non-null  object  
 13  website                

In [None]:
# Renaming the columns to follow our naming format in the paper columns section
columns_rename_mapping = {
    'id': 'project_id_reported',
    'description': 'project_description_reported',
    'interventionStartYear': 'planting_date_reported',
    'website': 'url'
}
gdf.rename(columns=columns_rename_mapping, inplace=True)

In [None]:
gdf["planting_date_reported"]

0        2022.0
1        2020.0
2        2022.0
3        2016.0
4        2021.0
          ...  
54875    2015.0
54876    2022.0
54877       NaN
54878    2020.0
54879    2021.0
Name: planting_date_reported, Length: 54880, dtype: float64

In [None]:
gdf['project_description_reported'] = gdf['project_description_reported'].apply(lambda x: x[0] if isinstance(x, list) and len(x) > 0 else x)
gdf['planting_date_reported'] = gdf['planting_date_reported'].apply(lambda x: x[0] if isinstance(x, list) and len(x) > 0 else x)
gdf['url'] = gdf['url'].apply(lambda x: x[0] if isinstance(x, list) and len(x) > 0 else x)

In [None]:
columns_to_keep = [
     'project_id_reported',
    'project_description_reported',
     'planting_date_reported',
     'geometry',
     'url',
     'site_sqkm',
     'country'
]
gdf=gdf[columns_to_keep]

In [None]:
gdf['planting_date_reported'] = gdf['planting_date_reported'].replace('', np.nan).astype(float)
gdf['planting_date_reported'] = gdf['planting_date_reported'].astype(object)

In [None]:
gdf['site_id_reported'] = ['restor_site_{}'.format(i) for i in range(1, len(gdf) + 1)]
gdf ['host_name'] = 'Restor Eco'
gdf['url'] = 'https://restor.eco/sites/' + gdf['project_id_reported'].astype(str)

gdf = gdf.assign(species_count_reported=None, species_planted_reported=None, survival_rate_reported=None,trees_planted_reported=None)
gdf.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 54880 entries, 0 to 54879
Data columns (total 14 columns):
 #   Column                        Non-Null Count  Dtype   
---  ------                        --------------  -----   
 0   project_id_reported           54880 non-null  object  
 1   project_description_reported  34283 non-null  object  
 2   planting_date_reported        53032 non-null  object  
 3   geometry                      54880 non-null  geometry
 4   url                           54880 non-null  object  
 5   site_sqkm                     54880 non-null  float64 
 6   country                       54880 non-null  object  
 7   project_geometries_invalid    54880 non-null  bool    
 8   site_id_reported              54880 non-null  object  
 9   host_name                     54880 non-null  object  
 10  species_count_reported        0 non-null      object  
 11  species_planted_reported      0 non-null      object  
 12  survival_rate_reported        0 non-nu

In [None]:
invalid_geom = False

if len(gdf[gdf.geometry.is_valid == False]) > 0:
    invalid_geom = True
gdf['project_geometries_invalid'] = invalid_geom
gdf["planting_date_type"]="Intrvation Start Year"

### Save it

In [None]:
gdf.to_file('../midsave/restor_eco.gpkg')