# climatecasechart, fetch metadata and download documents

In [18]:
#@title Collect overview  (~1')

import re
import os
import json
import time
import pickle
import requests
import subprocess
import numpy as np
import pandas as pd
from tqdm.cli import tqdm
from bs4 import BeautifulSoup
from multiprocessing import Pool
from collections import defaultdict

try:
  import pypdfium2 as pdfium
except ImportError:
  !pip3 install pypdfium2
  import pypdfium2 as pdfium


# Register tqdm's `progress_apply` on pandas objects; it is used further down
# to show progress while fetching and parsing the case pages.
tqdm.pandas()

# Working directory of the experiment: every relative `data/...` path used
# below (caches, downloaded documents) is resolved against it.
# os.chdir('/slow-data/unitednationsclimatehealth/')
os.chdir('/ipfs-storage/ipfs/herbert/unitednationsclimatehealth/')

# Overview page whose `.entry-content li > a` links are followed, one per
# jurisdiction (see `extract_all_cases`).
base_url = 'https://climatecasechart.com/non-us-jurisdiction/' #@param {"type": "string"}
# Prefix of all cache files and output directories written under `data/`;
# change it to start a fresh run instead of re-using the pickled results.
experiment_date = '2025-03-28-run-01' #@param {"type": "string"}

def extract_case(article):
  """Turn one search hit (an `article` element) into a dict.

  Returns None when the article is the site's "Not found, error 404"
  placeholder, otherwise a dict with the keys `title`, `description` (text of
  the first highlighted meta item, or None when there is none) and `link`.
  If a mandatory element is missing the article's markup is printed and the
  exception is re-raised."""
  heading = article.select('header.entry-header h1.entry-title')
  if heading and heading[0].text.strip() == 'Not found, error 404':
    return None
  try:
    title = article.select('h2.entry-title')[0].text
    highlights = article.select('.entry-meta-item .highlight')
    description = highlights[0].text if highlights else None
    link = article.select('a.read-more')[0].attrs['href']
  except:
    # Show the offending markup to ease debugging, then fail as before.
    print(article.prettify())
    raise
  return {"title": title, "description": description, "link": link}

def extract_cases(doc):
  """Parse every hit on one result page of climatecasechart.

  Returns a list with one entry per `main.content article` element, in page
  order; each entry is whatever `extract_case` returns for it (a dict with
  title, description and link, or None for a "404" placeholder)."""
  parsed = []
  for article in doc.select('main.content article'):
    parsed.append(extract_case(article))
  return parsed

def extract_country_cases(country_url):
  """Collect the cases of one jurisdiction across all of its result pages.

  The first page announces the total ("N cases found"); further pages are
  requested as `{country_url}/page/{k}` until that many cases have been
  collected, or until a page contributes no case at all.

  Args:
    country_url: url of the jurisdiction's first result page.

  Returns:
    List of case dicts as produced by `extract_case`. "404" placeholders
    (None) are dropped, so the list can be shorter than announced; a message
    is printed in that case.

  Raises:
    Any exception raised while fetching or parsing is re-raised after
    printing `country_url` for context.
  """
  try:
    doc = BeautifulSoup(requests.get(country_url).text)
    header = doc.select('h3.search-results-header')[0].text.lower()
    n_cases = int(header.replace('cases found', '').replace('case found', '').strip())
    page = 1
    cases = []
    while True:
      # Drop the None placeholders: counting them as cases would end the
      # pagination with invalid rows that break the caller.
      found = [case for case in extract_cases(doc) if case is not None]
      cases.extend(found)
      if len(cases) >= n_cases:
        return cases
      if not found:
        # Nothing new on this page: stop instead of paging forever.
        print(f'Expected {n_cases} cases but found {len(cases)}:', country_url)
        return cases
      page += 1
      doc = BeautifulSoup(requests.get(f'{country_url}/page/{page}').text)
  except Exception:
    print(country_url)
    raise

def extract_all_cases(base_url):
  """Crawl every jurisdiction linked from `base_url`.

  Returns a dataframe with one row per case: the fields delivered by
  `extract_country_cases` plus a `country` column holding the link text of
  the jurisdiction on the overview page."""
  overview = BeautifulSoup(requests.get(base_url).text)
  countries = {}
  for anchor in overview.select('.entry-content li > a'):
    countries[anchor.text] = anchor.attrs['href']

  rows = []
  for country, country_url in tqdm(countries.items(), position=0):
    for article in extract_country_cases(country_url):
      rows.append(dict(**article, country=country))
  return pd.DataFrame(rows)


# Re-use the pickled overview of this experiment when present, otherwise
# crawl the site and store the result for the next run.
cases_path = f'data/{experiment_date}-cases.p3'
if os.path.exists(cases_path):
  cases = pd.read_pickle(cases_path)
else:
  cases = extract_all_cases(base_url)
  cases.to_pickle(cases_path)

In [19]:
#@markdown #Definitions of WHO53 and in-scope bodies (Arbitral Tribunal, etc) and explicitly in/excluded cases

#@title Included and excluded cases (email Monique van Cauwenberghe, 2025-03-27)

# Case urls that are selected regardless of the jurisdiction they are listed
# under (see the `cases['link'].isin(included_cases)` filter below).
# One url per line; the `[1:-1]` slice removes the newline right after the
# opening and right before the closing quotes so that `split('\n')` yields no
# empty entries.
included_cases = set('''
https://climatecasechart.com/non-us-case/nz-students-for-climate-solutions-and-uk-youth-climate-coalition-v-board-of-bp/
https://climatecasechart.com/non-us-case/request-for-an-advisory-opinion-on-the-obligations-of-states-with-respect-to-climate-change/
https://climatecasechart.com/non-us-case/18416/
https://climatecasechart.com/non-us-case/engels-and-others-v-germany/
https://climatecasechart.com/non-us-case/the-norwegian-grandparents-climate-campaign-and-others-v-norway/
https://climatecasechart.com/non-us-case/plan-bearth-and-others-v-united-kingdom/
https://climatecasechart.com/non-us-case/factory-farming-v-uk/
https://climatecasechart.com/non-us-case/soubeste-and-others-v-austria-and-11-other-states/
https://climatecasechart.com/non-us-case/de-conto-v-italy-and-32-other-states/
https://climatecasechart.com/non-us-case/uricchio-v-italy-and-32-other-states/
https://climatecasechart.com/non-us-case/careme-v-france/
https://climatecasechart.com/non-us-case/greenpeace-nordic-assn-v-ministry-of-petroleum-and-energy-ecthr/
https://climatecasechart.com/non-us-case/union-of-swiss-senior-women-for-climate-protection-v-swiss-federal-council-and-others/
https://climatecasechart.com/non-us-case/mex-m-v-austria/
https://climatecasechart.com/non-us-case/youth-for-climate-justice-v-austria-et-al/
https://climatecasechart.com/non-us-case/declic-v-euro-sun-mining-through-its-subsidiary-samax-romania/
https://climatecasechart.com/non-us-case/communications-to-saudi-arabia-japan-france-usa-and-the-uk-and-13-financial-institutions-concerning-saudi-aramcos-business-activities-in-the-fossil-fuel-sector/
https://climatecasechart.com/non-us-case/communication-to-france-about-the-continued-development-of-mega-basin-projects/
https://climatecasechart.com/non-us-case/communication-to-the-united-kingdom-about-the-arrest-and-imprisonment-of-climate-activists/
https://climatecasechart.com/non-us-case/a-sud-ecologia-e-cooperazione-odv-ets-v-italy/
https://climatecasechart.com/non-us-case/communication-to-the-oecds-arrangement-on-efforts-to-expand-the-coal-fired-electricity-generation-sector/
https://climatecasechart.com/non-us-case/violations-of-human-rights-by-to-federation-of-bosnia-herzegovina-bih-and-china-due-to-coal-fired-plants-in-bih/
https://climatecasechart.com/non-us-case/childrens-petition-to-the-united-nations-secretary-general-to-declare-a-climate-emergency/
https://climatecasechart.com/non-us-case/non-compliance-procedure-of-croatia-under-the-kyoto-protocol/
https://climatecasechart.com/non-us-case/non-compliance-procedure-of-canada-under-the-kyoto-protocol/
https://climatecasechart.com/non-us-case/non-compliance-procedure-of-kazakhstan-under-the-kyoto-protocol/
https://climatecasechart.com/non-us-case/non-compliance-procedure-of-monaco-under-the-kyoto-protocol/
https://climatecasechart.com/non-us-case/non-compliance-procedure-of-ukraine-under-the-kyoto-protocol-ii/
https://climatecasechart.com/non-us-case/non-compliance-procedure-of-slovakia-under-the-kyoto-protocol/
https://climatecasechart.com/non-us-case/non-compliance-procedure-of-lithuania-under-the-kyoto-protocol/
https://climatecasechart.com/non-us-case/non-compliance-procedure-of-ukraine-under-the-kyoto-protocol/
https://climatecasechart.com/non-us-case/non-compliance-procedure-of-romania-under-the-kyoto-protocol/
https://climatecasechart.com/non-us-case/non-compliance-procedure-of-bulgaria-under-the-kyoto-protocol/
https://climatecasechart.com/non-us-case/sacchi-et-al-v-argentina-et-al/
https://climatecasechart.com/non-us-case/non-compliance-procedure-of-greece-under-the-kyoto-protocol/
https://climatecasechart.com/non-us-case/the-philippine-movement-for-climate-justice-et-al-vs-standard-chartered/
https://climatecasechart.com/non-us-case/divest-invest-protect-indigenous-peoples-law-and-policy-program-and-womens-earth-and-climate-action-network-vs-credit-suisse-group/
https://climatecasechart.com/non-us-case/vu-climate-change-and-sustainability-law-clinic-et-al-vs-one-dyas/
https://climatecasechart.com/non-us-case/indigenous-leaders-of-aidesep-and-feconau-et-al-vs-louis-dreyfus-company-bv/
https://climatecasechart.com/non-us-case/greenpeace-luxembourg-vs-fonds-de-compensation-de-la-securite-sociale-sicav-fis/
https://climatecasechart.com/non-us-case/complaint-against-virgin-atlantic-and-british-airways-under-the-oecd-guidelines-brought-by-possible/
https://climatecasechart.com/non-us-case/complaint-against-drax-group-plc-under-the-oecd-guidelines-brought-by-the-lifescape-project-and-others/
https://climatecasechart.com/non-us-case/survival-international-italia-on-behalf-of-ayoreo-totobiegosode-indigenous-people-v-pasubio-italian-company-in-the-leather-sector/
https://climatecasechart.com/non-us-case/specific-instance-to-the-uk-ncp-under-the-oecd-guidelines-for-multinational-enterprises-filed-by-global-witness-against-uk-export-finance/
https://climatecasechart.com/non-us-case/focsiv-and-others-v-fca-italy-stellantis-nv/
https://climatecasechart.com/non-us-case/rete-legalita-per-il-clima-legality-for-climate-network-and-others-v-eni/
https://climatecasechart.com/non-us-case/rete-legalita-per-il-clima-legality-for-climate-network-and-others-v-intensive-livestock-farming-multinational-companies-operating-in-italy/
https://climatecasechart.com/non-us-case/development-yes-open-pit-mines-no-v-group-pzu-sa/
https://climatecasechart.com/non-us-case/germanwatch-vs-volkswagen/
https://climatecasechart.com/non-us-case/norwegian-climate-network-et-al-vs-statoil/
https://climatecasechart.com/non-us-case/banktrack-et-al-vs-ing-bank/
https://climatecasechart.com/non-us-case/specific-instance-under-the-oecd-guidelines-for-multinational-enterprises-submitted-to-the-slovenian-and-uk-national-contact-point-for-the-oecd-guidelines/
https://climatecasechart.com/non-us-case/complaint-against-bp-in-respect-of-violations-of-the-oecd-guidelines/
'''[1:-1].split('\n'))

# Case urls that are removed from the selection even when their jurisdiction
# or `included_cases` would select them (see the
# `~cases['link'].isin(excluded_cases)` filter below).
# One url per line; the `[1:-1]` slice removes the newline right after the
# opening and right before the closing quotes so that `split('\n')` yields no
# empty entries.
excluded_cases = set('''
https://climatecasechart.com/non-us-case/request-for-an-advisory-opinion-on-the-scope-of-the-state-obligations-for-responding-to-the-climate-emergency/
https://climatecasechart.com/non-us-case/the-planet-v-bolsonaro/
https://climatecasechart.com/non-us-case/petition-to-the-inter-american-commission-on-human-rights-seeking-to-redress-violations-of-the-rights-of-children-in-cite-soleil-haiti/
https://climatecasechart.com/non-us-case/center-for-food-and-adequate-living-rights-et-al-v-tanzania-and-uganda/
https://climatecasechart.com/non-us-case/hearing-on-climate-change-before-the-inter-american-commission-on-human-rights/
https://climatecasechart.com/non-us-case/certain-activities-carried-out-by-nicaragua-in-the-border-area-costa-rica-v-nicaragua/
https://climatecasechart.com/non-us-case/request-advisory-opinion-inter-american-court-human-rights-concerning-interpretation-article-11-41-51-american-convention-human-rights/
https://climatecasechart.com/non-us-case/petition-inter-american-commission-human-rights-seeking-relief-violations-rights-arctic-athabaskan-peoples-resulting-rapid-arctic-warming-melting-caused-emissions/
https://climatecasechart.com/non-us-case/pulp-mills-on-the-river-uruguay-argentina-v-uruguay/
https://climatecasechart.com/non-us-case/petition-to-the-inter-american-commission-on-human-rights-seeking-relief-from-violations-resulting-from-global-warming-caused-by-acts-and-omissions-of-the-united-states/
https://climatecasechart.com/non-us-case/communications-to-australia-and-three-australian-companies-concerning-the-fossil-fuel-activities-of-woodside-energy/
https://climatecasechart.com/non-us-case/communication-to-the-government-of-colombia-about-the-humanitarian-and-environmental-crisis-suffered-by-the-afro-descendant-raizal-people/
https://climatecasechart.com/non-us-case/communication-to-pakistan-concerning-the-ongoing-forced-evictions-and-home-demolitions-along-karachis-waterways-nullahs/
https://climatecasechart.com/non-us-case/thailands-large-scale-reforestation-strategy-to-achieve-ghg-emission-targets-threatens-the-human-rights-of-14-forest-dependent-isan-minority-members-in-sab-wai-village-who-are-also-land-ri/
https://climatecasechart.com/non-us-case/environmental-justice-australia-eja-v-australia/
https://climatecasechart.com/non-us-case/rights-of-indigenous-people-in-addressing-climate-forced-displacement/
https://climatecasechart.com/non-us-case/un-human-rights-committee-views-adopted-on-teitiota-communication/
https://climatecasechart.com/non-us-case/petition-of-torres-strait-islanders-to-the-united-nations-human-rights-committee-alleging-violations-stemming-from-australias-inaction-on-climate-change/
https://climatecasechart.com/non-us-case/inclusive-development-international-et-al-vs-marsh/
https://climatecasechart.com/non-us-case/clientearth-vs-cargill/
https://climatecasechart.com/non-us-case/friends-of-the-earth-us-vs-export-import-bank-of-the-united-states/
https://climatecasechart.com/non-us-case/specific-instance-filed-to-the-australian-national-contact-point-under-the-oecd-guidelines-for-multinational-enterprises-by-foe-australia-and-others-v-anz-bank-group-limited/
https://climatecasechart.com/non-us-case/market-forces-v-smbc-mufg-and-mizuho/
https://climatecasechart.com/non-us-case/world-council-of-churches/
'''[1:-1].split('\n'))


# Guard against typos or renamed pages: every explicitly listed url must occur
# in the scraped overview.
known_links = set(cases['link'])
assert excluded_cases.issubset(known_links), "Some excluded cases not found"
assert included_cases.issubset(known_links), "Some included cases not found"

# Values of the `country` column that denote a body rather than a country;
# cases listed under them are selected just like the countries below.
organisations = [
  'Arbitral Tribunal',
  'European Committee on Social Rights',
  'European Union',
  'International Courts & Tribunals',
  'World Trade Organization',
  'United Nations',
  'OECD',
]

# Values of the `country` column that are in scope ("WHO53" - presumably the
# 53 member states of the WHO European Region; confirm with the project
# definition).
# This list may not be complete: it only contains the WHO53 countries that
# already occur in our dataset.
who53 = [
  'Austria',
  'Belgium',
  'Bulgaria',
  'Czech Republic',
  'Denmark',
  'Estonia',
  'Finland',
  'France',
  'Germany',
  'Hungary',
  'Ireland',
  'Italy',
  'Luxembourg',
  'Netherlands',
  'Northern Ireland',
  'Norway',
  'Poland',
  'Portugal',
  'Romania',
  'Russia',
  'Spain',
  'Sweden',
  'Switzerland',
  'Turkey',
  'Ukraine',
  'United Kingdom',
  'Vatican',
]

# A case is in scope when it is listed under a WHO53 country or one of the
# organisations, or when it was explicitly included - unless it was
# explicitly excluded.
in_scope = cases['country'].isin(who53 + organisations) | cases['link'].isin(included_cases)
is_excluded = cases['link'].isin(excluded_cases)
selected_cases = cases[in_scope & ~is_excluded].copy()

# The slug (last path segment of the url) names the per-case download
# directory, so it must identify a single url.
selected_cases['slug'] = selected_cases['link'].str.rstrip('/ ').str.split('/').str[-1]
assert selected_cases.groupby('slug')['link'].apply(lambda x: len(set(x))).max() <= 1

selected_cases

Unnamed: 0,title,description,link,country,slug
160,"Fréderic, Marco, Sven v. Public Prosecutor",Whether the climate crisis can constitute an e...,https://climatecasechart.com/non-us-case/frede...,Belgium,frederic-marco-sven-v-public-prosecutor
161,"Hugues Falys, FIAN, Greenpeace, Ligue des droi...",Whether a farmer's claim that is premised upon...,https://climatecasechart.com/non-us-case/hugue...,Belgium,hugues-falys-fian-greenpeace-ligue-des-droits-...
162,Belgische Federatie der Brandstoffenhandelaars...,Whether the court should annul the Flemish dec...,https://climatecasechart.com/non-us-case/belgi...,Belgium,belgische-federatie-der-brandstoffenhandelaars...
163,Carbon Market Watch v. FIFA,Whether FIFA's advertising of the 2022 World C...,https://climatecasechart.com/non-us-case/carbo...,Belgium,carbon-market-watch-v-fifa
164,Lauwrys A.O. v. The Province of Antwerp,Whether GABRIËLS & CO l.c.'s new gas station p...,https://climatecasechart.com/non-us-case/lauwr...,Belgium,lauwrys-ao-v-the-province-of-antwerp
...,...,...,...,...,...
1046,Non-compliance Procedure of Ukraine under the ...,Ukraine’s failure to establish an initial repo...,https://climatecasechart.com/non-us-case/non-c...,United Nations,non-compliance-procedure-of-ukraine-under-the-...
1047,Non-compliance Procedure of Romania under the ...,Romania’s failure to establish an initial repo...,https://climatecasechart.com/non-us-case/non-c...,United Nations,non-compliance-procedure-of-romania-under-the-...
1048,Non-compliance Procedure of Bulgaria under the...,Bulgaria’s failure to establish an initial rep...,https://climatecasechart.com/non-us-case/non-c...,United Nations,non-compliance-procedure-of-bulgaria-under-the...
1051,"Sacchi, et al. v. Argentina, et al.",Whether respondents violated children’s rights...,https://climatecasechart.com/non-us-case/sacch...,United Nations,sacchi-et-al-v-argentina-et-al


In [20]:
#@title Collect details (~20')

def url_get(url):
  """Fetch `url`, retrying with exponential back-off until it answers 200.

  At most five attempts are made. Before attempt i (counting from 0) the
  function sleeps 2**i seconds (1, 2, 4, 8, 16), so the first request is
  delayed by one second as well, which throttles successive calls.
  Connection errors and timeouts count as failed attempts instead of
  aborting the whole crawl.

  Args:
    url: address to fetch.

  Returns:
    The `requests.Response` with status code 200, or None when all attempts
    failed (the url is printed in that case).
  """
  for attempt in range(5):
    time.sleep(2 ** attempt)
    try:
      # The timeout keeps a stalled connection from blocking the run forever.
      response = requests.get(url, timeout=60)
    except requests.RequestException as error:
      print('error:', url, error)
      continue
    if response.status_code == 200:
      return response
  print('failed:', url)
  return None

# Fetch the detail page of every selected case once and keep the raw
# responses on disk; later runs of this experiment re-use the pickle.
responses_path = f'data/{experiment_date}-climatecasechart-responses.p3'
if os.path.exists(responses_path):
  responses = pd.read_pickle(responses_path)
else:
  responses = selected_cases['link'].progress_apply(url_get)
  responses.to_pickle(responses_path)

# Overview of the obtained status codes (displayed as cell output).
status_codes = responses.apply(lambda response: response.status_code)
status_codes.value_counts()

link
200    515
Name: count, dtype: int64

In [21]:
#@title Parse detail pages (~1')

def get_value(value):
  """Return the content of a table cell: the target of its link when it
  contains one `a[href]` (urls starting with '/' are prefixed with
  'https://tbinternet.ohchr.org'), otherwise its stripped text.
  A cell with more than one link trips an assertion."""
  anchors = value.select('a[href]')
  if not anchors:
    return value.text.strip()
  assert len(anchors) == 1
  href = anchors[0].attrs['href']
  return 'https://tbinternet.ohchr.org' + href if href.startswith('/') else href


def find_one_or_none(element, selector, attr=None, raw=False):
  """Look up the single (grand)child of `element` matching `selector`.

  Returns None when nothing matches and raises ValueError when more than one
  element matches. For a single match the result is the value of attribute
  `attr` when `attr` is given, else the element itself when `raw` is true,
  else the element's text."""
  matches = element.select(selector)
  if not matches:
    return None
  if len(matches) > 1:
    raise ValueError('Found more than 1 item')
  match = matches[0]
  if attr is not None:
    return match.attrs[attr]
  if raw:
    return match
  return match.text


def extract_taxonomy(ul):
  """Follow a nested taxonomy list from its top-most `ul` downwards.

  On every level the first `li` child is taken, its first link is recorded
  as {'url', 'title'} and the walk continues in the `ul` directly below that
  `li`, if any. Returns the recorded dicts from the top level down."""
  trail = []
  current = ul
  while current is not None:
    first_item = next(child for child in current.children if child.name == 'li')
    anchor = first_item.select('a')[0]
    trail.append({'url': anchor.attrs['href'], 'title': anchor.text})
    current = next((child for child in first_item.children if child.name == 'ul'), None)
  return trail


def extract_labeled(label):
  """Split a `.label` element from the content it labels.

  Returns a tuple (label text without surrounding spaces and colons, parent
  element). The label is replaced by an empty string in the tree, so the
  parent that is returned no longer contains the label itself."""
  # Grab the parent first: once the label is replaced it is detached.
  container = label.parent
  name = label.text.strip(' :')
  label.replace_with('')
  return name, container


def parse_detail_page(article):
  """Parses the `article` element of a case detail page, as shown e.g. here

  https://climatecasechart.com/non-us-case/in-re-climate-resilience-bill/

  into a dict with the keys:

  - 'title' and 'non-english title' (None when absent),
  - one key per labeled entry in the header meta block (stripped text),
  - one key per taxonomy label (list of paths from `extract_taxonomy`),
  - one key per labeled entry in the case content (list of non-empty texts),
  - 'documents': one dict per row of the document tables, mapping the column
    headers to the cell values from `get_value`.

  Note that `article` is modified: `extract_labeled` replaces the label
  elements by empty strings. As the parts are merged with `dict(**a, **b)`,
  a label text that occurs in more than one part (or equals 'title',
  'non-english title' or 'documents') raises a TypeError."""

  return dict(**{
      # Titles from the page header.
      'title': find_one_or_none(article, 'header.entry-header h1.entry-title'),
      'non-english title': find_one_or_none(article, 'header.entry-header h3.entry-title-non-english'),
    }, **{
      # Header meta entries: label -> remaining text of the label's parent.
      field: parent.text.strip()
      for meta in article.select('header.entry-header > .entry-meta > div > span.label')
      for field, parent in [extract_labeled(meta)] # single-item loop, used to bind two names
    }, **{
      # Taxonomies: label -> one path (list of {'url', 'title'}) per box.
      taxonomy.text.strip(' :'): [
          extract_taxonomy(tax)
          for tax in taxonomy.parent.select('.entry-taxonomy-items > .entry-taxonomy-item > ul.taxonomy-box')
      ]
      for taxonomy in article.select('.entry-taxonomy > div.label')
  }, **{
      # Case content entries: label -> non-empty texts of the parent's children.
      field: [text for element in parent.children if (text := element.text.strip()) != '']
      for content in article.select('.entry-content >.case-content > div > span.label')
      for field, parent in [extract_labeled(content)] # single-item loop, used to bind two names
  }, documents = [
      # One dict per table row; `header` is bound once per table.
      dict(zip(header, [get_value(cell) for cell in row.select('td')]))
      for table in article.select('.entry-documents > table')
      for header in [[cell.text.strip() for cell in table.select('thead > tr > th')]]
      for row in table.select('tbody > tr')
  ])


details_path = f'data/{experiment_date}-case-details.p3'

if os.path.exists(details_path):
  case_details = pd.read_pickle(details_path)
else:
  docs = responses.progress_apply(lambda response: BeautifulSoup(response.text))
  articles = docs.apply(lambda doc: doc.select('main.content article')[0])
  parsed = articles.progress_apply(parse_detail_page).apply(pd.Series)
  # Columns present in both frames (e.g. the title) are taken from the
  # detail page, hence dropped from the overview first.
  shared_columns = list(set(selected_cases.columns) & set(parsed.columns))
  case_details = pd.concat([selected_cases.drop(shared_columns, axis=1), parsed], axis=1)

  # The filing year of a case is not always reported, so it is estimated as
  # - the first run of 4 digits in Reporter Info,
  # - or else the smallest 4-character suffix of its documents' filing dates.
  reported_year = case_details['Reporter Info'].str.extract('([0-9]{4})')[0]
  # year[case_details['slug'] == 'decision-no-287110-of-feb-8-2007-re-societe-arcelor-atlantique-lorraine-and-others'] = '2007'
  undated_documents = case_details[reported_year.isna()]['documents']
  earliest_filing_year = undated_documents.apply(
      # `documents == documents` is False for NaN, i.e. for a missing list.
      lambda documents: min((document['Filing Date'][-4:] for document in documents), default=None)
      if documents == documents else None)
  year = reported_year.fillna(earliest_filing_year)
  case_details['year'] = year.astype(float)

  # Keep only the cases with a known year within the study period.
  case_details = case_details.dropna(subset=['year'])
  in_period = (case_details['year'] >= 2011) & (case_details['year'] <= 2024)
  case_details = case_details[in_period]
  case_details.to_pickle(details_path)

In [None]:
#@title Download documents (~15')

always_run = False #@param {"type": "boolean"}

# Number of additional attempts after a failed download.
max_retries = 10

# Download every document of every case to data/<experiment>-pdf/<slug>/,
# named after the last path segment of its url. Existing files are skipped
# unless `always_run` is set.
for (slug, link), documents in tqdm(case_details.groupby(['slug', 'link'])['documents'], position=0):
  os.makedirs(f'data/{experiment_date}-pdf/{slug}/', exist_ok=True)
  if documents.str.len().sum() == 0:
    continue
  documents_ = documents.explode().apply(pd.Series).dropna(subset=['File'])
  documents_ = documents_[documents_['File'] != '']
  documents_['filename'] = documents_['File'].str.split('/').str[-1].str.strip().replace('', np.nan)
  # Urls without a usable filename (e.g. ending in '/') cannot be stored;
  # dropping them here also keeps the check below from failing on NaN.
  documents_ = documents_.dropna(subset=['filename'])
  if len(documents_) == 0:
    continue
  # Two different urls must never map onto the same file.
  assert documents_.groupby('filename')['File'].apply(set).apply(len).max() <= 1

  for filename, url in documents_.groupby('filename')['File'].first().items():
    filepath = f'data/{experiment_date}-pdf/{slug}/{filename}'
    if not always_run and os.path.exists(filepath):
      continue
    for attempt in range(max_retries + 1):
      response = requests.get(url)
      if response.status_code == 200:
        with open(filepath, 'wb') as f:
          f.write(response.content)
        break
      if response.status_code == 403:
        # Retrying does not help when access is denied.
        print()
        print('Access denied (403):', url)
        break
      if attempt < max_retries:
        time.sleep(1)
    else:
      # Every attempt failed with a status other than 200/403.
      print()
      print(f'Failed ({response.status_code})', url)

In [None]:
#@title Fix filenames for surya compatibility (~1')

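# Surya identifies a file by the part of its name before the first '.'. The
# idea is to cut off the extension only, but for names such as 'No. 12.pdf'
# it usually cuts off more than that, so that several files of one case end up
# with the same identifier. Hence we remove the dot from 'No.' / 'NO.' in the
# filenames of such cases to avoid ambiguity.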

for slug, documents in case_details.groupby('slug')['documents']:
  # Expected filenames of this case; .docx files are listed under the .pdf
  # name. `document == document` skips NaN, i.e. cases without documents.
  filenames = {
      document['File'].split('/')[-1].replace('.docx', '.pdf')
      for document in documents.explode()
      if document == document
  }
  filenames.discard('')
  if not filenames:
    continue

  # Only touch cases in which several files share the prefix before the
  # first '.'.
  prefix_counts = pd.Series([filename.split('.')[0] for filename in filenames]).value_counts()
  if prefix_counts.max() <= 1:
    continue

  for filename in filenames:
    fixed_filename = filename.replace('No.', 'No').replace('NO.', 'No')
    if fixed_filename == filename:
      # Nothing to rename; such a file must not trip the clash check below.
      continue
    assert fixed_filename not in filenames, f'{fixed_filename} clashes with an existing file of {slug}'
    source = f"data/{experiment_date}-pdf/{slug}/{filename}"
    target = f"data/{experiment_date}-pdf/{slug}/{fixed_filename}"
    # Rename without a shell: the names stem from scraped urls and may hold
    # quotes or other characters a shell would interpret.
    if os.path.exists(source) and not os.path.exists(target):
      os.rename(source, target)