In [20]:
import concurrent.futures
import logging
import re
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
# Pattern for e-mail addresses: a local part, "@", a domain made of letters,
# digits, dots and hyphens, and a final dot followed by two or more letters.
EMAIL_REGEX = r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}"

In [24]:
def get_contact_page_urls(
    main_url: str,
    contact_page_keywords: list[str] = ["contact"],
    timeout: float = 10.0,
) -> list[str]:
    """Collect absolute URLs of likely contact pages linked from ``main_url``.

    The page at ``main_url`` is downloaded and every ``<a href=...>`` whose
    lower-cased href contains one of ``contact_page_keywords`` is kept.

    Args:
        main_url: Address of the site's landing page.
        contact_page_keywords: Substrings to look for in the lower-cased href.
            The list is only read, never modified.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        De-duplicated ``http(s)`` URLs in the order they appear on the page.
        An empty list is returned when the page cannot be fetched or parsed;
        the failure is logged as a warning.
    """
    try:
        # Without a timeout a single unresponsive site blocks its worker forever.
        response = requests.get(main_url, timeout=timeout)
        soup = BeautifulSoup(response.content, "html.parser")

        # A dict is used as an insertion-ordered set so each page is listed once.
        contact_urls: dict[str, None] = {}
        for link in soup.find_all("a", href=True):
            href = link["href"]
            if not any(keyword in href.lower() for keyword in contact_page_keywords):
                continue
            # Resolve relative hrefs ("/contact", "anbi.html") against the URL
            # that was actually served, i.e. after any redirects.
            absolute_url = urljoin(response.url, href)
            # Links such as "mailto:info@..." match keywords but are not pages.
            if absolute_url.lower().startswith(("http://", "https://")):
                contact_urls[absolute_url] = None
        return list(contact_urls)
    except Exception as exc:
        # Best effort: one broken site must not abort the whole crawl.
        logging.getLogger(__name__).warning(
            "Could not collect contact pages from %r: %s", main_url, exc
        )
        return []


def extract_emails_from_page(
    url: str, email_regex: str, timeout: float = 10.0
) -> list[str]:
    """Return the e-mail addresses found in the visible text of a page.

    Args:
        url: Address of the page to download.
        email_regex: Regular expression applied to the page text.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The matches of ``email_regex`` in order of first appearance, each
        listed once. An empty list is returned when the page cannot be
        fetched or parsed; the failure is logged as a warning.
    """
    try:
        # Without a timeout a single unresponsive site blocks its worker forever.
        response = requests.get(url, timeout=timeout)
        soup = BeautifulSoup(response.content, "html.parser")
        matches = re.findall(email_regex, soup.text)
    except Exception as exc:
        # Best effort: a failing page simply contributes no addresses.
        logging.getLogger(__name__).warning(
            "Could not extract e-mails from %r: %s", url, exc
        )
        return []

    # Pages often repeat an address (header, footer); keep the first occurrence.
    return list(dict.fromkeys(matches))



# Number of directory entries scraped per run; raise it to cover more churches.
MAX_KERKEN = 50

# Download the church directory. The timeout stops the notebook from hanging
# on an unresponsive server, and raise_for_status() surfaces an HTTP error
# directly instead of as a JSON decoding failure.
kerkzoeker_response = requests.get(
    "https://protestantsekerk.nl/kerkzoeker/?json", timeout=30
)
kerkzoeker_response.raise_for_status()
pkn_kerken = kerkzoeker_response.json()["churches"]

# Substrings searched for in the lower-cased href of each link on a church's
# landing page.
# NOTE(review): "over ons" contains a space, so it presumably never matches an
# href - confirm whether "over-ons" or matching on link text was intended.
contact_page_keywords = [
    "contact",
    "about",
    "over ons",
    "anbi",
    "gegevens",
    "info",
    "wie",
    "geven",
    "give",
    "gift",
    "donatie",
    "doneren",
    "doneer",
]

# Every directory entry is indexed by "website" below, so each one is assumed
# to carry that key.
kerk_urls = [kerk["website"] for kerk in pkn_kerken[:MAX_KERKEN]]


In [25]:

def process_kerk_url(kerk_url):
    """Gather the e-mail addresses reachable from one church website.

    The contact pages of ``kerk_url`` are looked up with the module-level
    ``contact_page_keywords`` and each page is scanned with ``EMAIL_REGEX``.

    Args:
        kerk_url: Landing-page URL of the church website.

    Returns:
        A ``(kerk_url, emails)`` tuple. ``emails`` lists every distinct entry
        once, in order of first appearance.
    """
    contact_pages = get_contact_page_urls(kerk_url, contact_page_keywords)

    emails = []
    for page in contact_pages:
        if isinstance(page, str):
            emails.extend(extract_emails_from_page(page, EMAIL_REGEX))
        else:
            # A non-string entry (such as an error object placed in the list)
            # is not a URL. Requesting it would only fail again and hide the
            # original value, so it is carried over unchanged.
            emails.append(page)

    # The same address is commonly printed on several contact pages.
    return kerk_url, list(dict.fromkeys(emails))


# Scrape all church sites concurrently; executor.map yields results in input
# order and tqdm shows progress as they are consumed.
with concurrent.futures.ThreadPoolExecutor() as executor:
    scraped = executor.map(process_kerk_url, kerk_urls)
    results = list(tqdm(scraped, total=len(kerk_urls)))

# Map each church URL to the list of entries collected for it.
kerk_emails = dict(results)

  0%|          | 0/50 [00:00<?, ?it/s]Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
100%|██████████| 50/50 [00:34<00:00,  1.47it/s]


In [None]:
# Flatten the URL -> entries mapping into (URL, entry) pairs; a URL without
# any entries still contributes one pair with None.
rows = [
    (url, entry)
    for url, found in kerk_emails.items()
    for entry in (found or [None])
]

# Build the table, then keep only rows whose "Email" value is a string
# matching the e-mail pattern (None and other non-string values are dropped).
all_pairs = pd.DataFrame(rows, columns=["URL", "Email"])
is_email = all_pairs["Email"].str.contains(EMAIL_REGEX, na=False)
df = all_pairs[is_email]

In [28]:
# Save the filtered URL/e-mail table as an Excel workbook; the path is
# relative, so the file lands in the current working directory.
df.to_excel("kerk_emails_via_contactpagina.xlsx")