In [1]:
import requests
from bs4 import BeautifulSoup
import glob 
from tqdm import tqdm 
import os
from openai import OpenAI
import glob
import pandas as pd
import json

In [2]:
# Authentication: copy the site cookie from your web browser into auth_cookie.txt.
# The first line of that file should look something like this:
# kerkspot=<some characters>; ident=<some characters>; s=1
with open("auth_cookie.txt", 'r') as cookie_file:
    # Only the first line is used; surrounding whitespace/newline is dropped.
    auth_cookie = cookie_file.readline().strip()

In [3]:
# Set to True to re-download the agenda pages into ./data; when False, the
# scrape cell is skipped and the HTML files already on disk are used.
RERUN_SCRAPE = False
# Set to True to re-query the OpenAI API and rewrite api_responses.json; when
# False, the API cell is skipped and the cached responses in that file are used.
RERUN_OPENAI = False

In [4]:
def get_page(id, timeout=30):
    """Fetch the agenda page for one event ID, sending the stored auth cookie.

    Args:
        id: Event ID; it is interpolated into the agenda URL. (The name
            shadows the ``id`` builtin but is kept so existing callers work.)
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, so a single stalled
            connection would hang the whole scrape.

    Returns:
        The ``requests.Response``. The status code is not checked here;
        callers decide how to treat non-200 replies.

    Raises:
        requests.exceptions.Timeout: If the server does not answer in time.
    """
    url = f"https://www.immanuelkerkdelft.nl/agenda/{id}/"
    headers = {'Cookie': auth_cookie}
    return requests.get(url, headers=headers, timeout=timeout)

In [5]:
# The event IDs are not contiguous, so the scrape probes windows of candidate IDs.
# The anchor IDs were found by hand at
# https://www.immanuelkerkdelft.nl/go/gemeente/agenda/?d=4|2024&g=
# (change the year / month in the URL, click an event and take the ID from there).
# Each entry is (first_id, number_of_ids); IDs are probed downwards from first_id.
id_windows = [
    (244491 + 20, 40),
    # (254149 + 20, 40),
    # (254250, 30),
    # (254250, 20),
    # (258150, 50),
    # (260500, 50),
    # (268180, 50),
]
ids_to_search = [
    candidate
    for first_id, count in id_windows
    for candidate in range(first_id, first_id - count, -1)
]


In [6]:
if RERUN_SCRAPE:
    # Make sure the output directory exists before the first page is written.
    os.makedirs("./data", exist_ok=True)

    found_ids = []
    for event_id in tqdm(ids_to_search):
        response = get_page(event_id)
        # Only IDs that answer with HTTP 200 are kept and saved to disk.
        if response.status_code == 200:
            found_ids.append(event_id)
            with open(f"./data/{event_id}.html", 'w') as out:
                out.write(response.text)

    print(f"Found ids: {found_ids}")
    # max()/min() raise ValueError on an empty list, so only report the range
    # when at least one page was found.
    if found_ids:
        print(f"Max: {max(found_ids)}, min: {min(found_ids)}")

In [7]:
# Every scraped page on disk; glob.glob already returns a list of paths.
file_list = glob.glob("./data/*.html")

len(file_list)

59

In [8]:
def get_content(soup):
    """Return the text of the ``agendaContent`` element without blank lines.

    The element's text is extracted with a newline between text fragments;
    lines that are empty or whitespace-only are then dropped. Lines that are
    kept are not stripped.
    """
    raw_text = soup.find(id="agendaContent").get_text(separator='\n')
    kept_lines = []
    for line in raw_text.split('\n'):
        if line.strip():
            kept_lines.append(line)
    return '\n'.join(kept_lines)

def get_date(soup):
    """Return the stripped text of the ``<div class="agendaDatum">`` element."""
    date_div = soup.find("div", {"class": "agendaDatum"})
    return date_div.get_text().strip()

In [9]:
# Collect one (file, date, content) record per scraped page.
parsed = []

for html_path in file_list:
    with open(html_path, 'r') as handle:
        soup = BeautifulSoup(handle.read(), 'html.parser')

    page_date = get_date(soup)
    page_content = get_content(soup)
    parsed.append((html_path, page_date, page_content))

In [10]:
# Sort by file name and renumber the rows 0..n-1. sort_values alone keeps the
# pre-sort index labels, and those follow the order in which glob returned the
# files (which is arbitrary), so label-based lookups of a given row number
# would otherwise not be reproducible across machines.
df = (
    pd.DataFrame(parsed, columns=["file", "date", "content"])
    .sort_values("file")
    .reset_index(drop=True)
)

In [11]:
# Peek at the raw text of the row labelled 10 to see what the pages look like.
df.loc[10, "content"]

"De kerkdienst wordt gehouden in de Immanuelkerk en wordt ook uitgezonden via Youtube\nKlik \nhier\n om de dienst van deze week te bekijken\nLiturgie\n:\nWelkom in de dienst\n Lied: - Kom nu is de tijd - Opwekking 539\n Votum en zegengroet\n Lied. Still my soul be still / stil mijn ziel wees stil - Opwekking 717\n Verootmoediging/leefregel\n Lied. Here's my heart Lord - Opwekking 802\n Gebed\n Lantaarn aansteken\n Lezing: Matteus 18:1-11\n Lied: Fundament - Opwekking 785\n Preek\n Lied: Heer wijs mij Uw weg - Sela / Opwekking 687\n Gebed\n Collecte + lied band: Build my life - Pat Barrett\n Lied: God van licht / My lighthouse - Opwekking 807\n Zegen\nRooster\n Houd me op de hoogte\n\xa0\xa0Delen\n\xa0\xa0Tweeten\n\xa0\xa0LinkedIn\n\xa0\xa0E-mail\n Opslaan in mijn agenda\n Outlook\n Google Kalender\n iCalendar"

In [12]:
# Build the client only when an API key is available. The OpenAI constructor
# raises when no key is supplied, which would break a cached run
# (RERUN_OPENAI = False) on a machine without OPENAI_API_KEY set, even though
# the API is never called in that case.
_openai_api_key = os.environ.get("OPENAI_API_KEY")
client = OpenAI(api_key=_openai_api_key) if _openai_api_key else None

# Fail early, with a clear message, only when the API is actually needed.
if RERUN_OPENAI and client is None:
    raise RuntimeError(
        "RERUN_OPENAI is True but the OPENAI_API_KEY environment variable is not set"
    )

def extract_songs_from_liturgy(text, model="gpt-3.5-turbo"):
    """Ask the chat model to list the songs in a page's 'Liturgie' section.

    Args:
        text: Plain text of one agenda page; sent as the user message.
        model: Chat-completion model to query. Defaults to the model this
            notebook has always used, so existing calls are unaffected.

    Returns:
        The text of the first completion choice (the prompt asks for
        Markdown). An empty string is returned when the reply has no text
        content, so callers can always treat the result as a string.
    """
    system_prompt = (
        "Extract the section 'Liturgie' from the text below and only reply "
        "with the songs ('Lied'). Reply in Markdown. Leave out anything else."
    )
    chat_completion = client.chat.completions.create(
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": text},
        ],
        model=model,
    )
    # message.content is optional in the API response; normalise None to "".
    return chat_completion.choices[0].message.content or ""


In [13]:
if RERUN_OPENAI:
    responses = {}
    # iterrows() is a generator, so tqdm needs an explicit total to show
    # progress as a fraction instead of a bare counter.
    for _, row in tqdm(df.iterrows(), total=len(df)):
        responses[row.file] = extract_songs_from_liturgy(row.content)

    # Cache the replies (keyed by file path) so later runs can load them from
    # disk with RERUN_OPENAI = False.
    with open("api_responses.json", 'w') as o:
        json.dump(responses, o)

In [14]:
# Load the cached model replies: a mapping of file path -> Markdown song list.
with open("api_responses.json", 'r') as cache_file:
    responses = json.load(cache_file)

# Spot-check the reply for one service.
responses['./data/244485.html']

'- Lied: Dank u voor deze nieuwe morgen\n- Lied: Wij zijn als adem, U was er altijd - opw 558\n- Lied: Licht Aan (Deze dingen wil God graag) - Schrijvers voor gerechtigheid\n- Lied: Mijn Jezus, mijn redder (Juich voor de Heer) - opw 461\n- Slotlied: The Blessing - Cody Carnes, Kari Jobe'

In [15]:
# One row per cached reply: the file path and the model's song list.
ai_summaries = pd.DataFrame(list(responses.items()), columns=["file", "songs_ai"])
# "file" is the only column the two frames share, so this is the same inner
# join that merge() performs by default; it is just spelled out.
combined = ai_summaries.merge(df, on="file", how="inner")

In [16]:
# One row per song line: split every reply on newlines (vectorised, so a
# missing reply yields a missing value instead of raising) and explode the
# resulting lists. assign() returns a new frame, leaving `combined` untouched.
song_lines = combined["songs_ai"].str.split("\n")
exploded = combined.assign(songs_ai=song_lines).explode("songs_ai")

# Markdown replies can contain blank separator lines; those are not songs.
is_blank = exploded["songs_ai"].str.strip().eq("")
exploded = exploded.loc[~is_blank.to_numpy(), ["file", "songs_ai", "date"]]
exploded

Unnamed: 0,file,songs_ai,date
0,./data/244485.html,- Lied: Dank u voor deze nieuwe morgen,16 juli 2023 10:00 tot 11:00
0,./data/244485.html,"- Lied: Wij zijn als adem, U was er altijd - o...",16 juli 2023 10:00 tot 11:00
0,./data/244485.html,- Lied: Licht Aan (Deze dingen wil God graag) ...,16 juli 2023 10:00 tot 11:00
0,./data/244485.html,"- Lied: Mijn Jezus, mijn redder (Juich voor de...",16 juli 2023 10:00 tot 11:00
0,./data/244485.html,"- Slotlied: The Blessing - Cody Carnes, Kari Jobe",16 juli 2023 10:00 tot 11:00
...,...,...,...
58,./data/268180.html,- Lied: **Psalm 121: 4:5 - Een bod’ge van Zijn...,14 juli 2024 10:00 tot 11:00
58,./data/268180.html,- Lied: **Psalm 33: 7 en 9 - God’s oog rust op...,14 juli 2024 10:00 tot 11:00
58,./data/268180.html,"- Lied: **Psalm 33: 10:11 - Hij kwam, Hij komt",14 juli 2024 10:00 tot 11:00
58,./data/268180.html,- Lied: **Psalm 84: 6: en 7 -’t Is naad’ren to...,14 juli 2024 10:00 tot 11:00


In [17]:
# Export one row per service (file, AI song list, date, raw content).
combined.to_excel(excel_writer="zangdiensten.xlsx")

In [18]:
# Export one row per song line (file, song, date).
exploded.to_excel(excel_writer="liedjes.xlsx")