In [1]:
from bs4 import BeautifulSoup
import requests
import re

# Web scraping from the radar AvroTros website
search_link = "https://radar.avrotros.nl/forum/energie-f51/index-s"

forum_url = BeautifulSoup(requests.get(search_link).content, 'html.parser')

page_index = 0

dict_topics = {}

for i in range(0, 100, 25):
    page_index = i

    forum_url = BeautifulSoup(requests.get(search_link + str(page_index) + '.html').content, 'html.parser')

    items = forum_url.find_all(class_=lambda x: x and x.startswith("row bg"))

    for item in items:
        title = item.find('a', class_='topictitle')
        topic_title = title.text

        topic_link = "https://radar.avrotros.nl/forum" + "/" + title['href'].lstrip('./..').split('.html')[0] + '.html'

        date_element = item.find("div", class_="topic-poster responsive-hide left-box").find("time")
        date = date_element.text

        code_link = topic_link[-11:]
        code = code_link[:6]

        reactions_element = item.find("dd", class_="posts")
        reactions = reactions_element.text
        # Remove the 'Reacties' text
        reactions = reactions[:-9]
        reactions = int(reactions)

        dict_topics[topic_title] = topic_link, date, code, reactions

# Convert dict to dataframe
import pandas as pd

df_topics = pd.DataFrame.from_dict(dict_topics, orient='index', columns=['link', 'date', 'code', 'reactions'])

df_topics

Unnamed: 0,link,date,code,reactions
De energiemarkt en salderen: hoe komt de prijsvorming tot stand?,https://radar.avrotros.nl/forum/energie-f51/de...,22 okt 2021 14:18,197725,48
"Woningen met verkeerde energielabels, wie heeft daar ervaring mee?",https://radar.avrotros.nl/forum/energie-f51/wo...,13 feb 2023 19:48,200283,36
slimme meter meet niet altijd correct,https://radar.avrotros.nl/forum/energie-f51/sl...,14 feb 2023 11:37,200287,3
Energieleverancier kiezen met zonnepanelen,https://radar.avrotros.nl/forum/energie-f51/en...,05 jan 2021 16:57,195768,100
Hogere tarieven Vattenfall ontlopen,https://radar.avrotros.nl/forum/energie-f51/ho...,23 dec 2022 16:42,199987,9
...,...,...,...,...
Een energiecontract zonder mijn toestemming?,https://radar.avrotros.nl/forum/energie-f51/ee...,09 mei 2022 17:04,198866,6
Vattenfall wil Warmtelink plaatsen voor digitaal uitlezen verbruik,https://radar.avrotros.nl/forum/energie-f51/va...,02 jun 2021 17:18,196961,47
Energierekening opsplitsen in Netbeheer en Levering,https://radar.avrotros.nl/forum/energie-f51/en...,03 mar 2022 09:41,198540,17
Faillisement Energie Welkom: wat kan / mag ik?,https://radar.avrotros.nl/forum/energie-f51/fa...,15 nov 2021 09:41,197882,31


In [2]:
# Get first ten rows within df
topics = df_topics.head(2)


dict_messages = {}
message_id = 1

for topic, row in topics.iterrows():
    page_index = 0
    while True:
        if page_index > row['reactions']:
            break
        page_link = "https://radar.avrotros.nl/forum/viewtopic.php?t=" + row['code'] + "&start=" + str(page_index)
        print(page_link)
        page = requests.get(page_link)
        if page.status_code != 200:
            break
        soup = BeautifulSoup(page.content, 'html.parser')
        posts = soup.find_all('div', class_=lambda x: x and x.startswith("post has-profile bg"))
        for post in posts:
            content = post.find("div", class_="content")
            # Remove blockquotes to prevent double counting
            for blockquote in content.find_all("blockquote"):
                blockquote.decompose()
            content = content.text

            date = post.find('p', class_="author").find('time').text

            dict_messages[message_id] = topic, content, date
            message_id += 1

        page_index += 20


df_messages = pd.DataFrame.from_dict(dict_messages, orient='index', columns=['topic', 'content', 'date'])

# Removing newlines and tabs from content cells.
df_messages['content'] = df_messages['content'].str.replace('\n', '')
df_messages['content'] = df_messages['content'].str.replace('\t', '')

# Convert dates to datetime
from datetime import datetime
import locale
locale.setlocale(locale.LC_ALL, "nl_NL")

df_messages["date"] = df_messages["date"].str.split(" ").apply(lambda x: " ".join([i.capitalize() for i in x]))

print(datetime.strptime("22 Okt 2021 14:32", "%d %b %Y %H:%M"))

df_messages['date'] = df_messages['date'].apply(lambda x: datetime.strptime(x, "%d %b %Y %H:%M"))

df_messages



https://radar.avrotros.nl/forum/viewtopic.php?t=197725&start=0
https://radar.avrotros.nl/forum/viewtopic.php?t=197725&start=20
https://radar.avrotros.nl/forum/viewtopic.php?t=197725&start=40
https://radar.avrotros.nl/forum/viewtopic.php?t=200283&start=0
https://radar.avrotros.nl/forum/viewtopic.php?t=200283&start=20
2021-10-22 14:32:00


Unnamed: 0,topic,content,date
1,De energiemarkt en salderen: hoe komt de prijs...,"Goedendag,In navolging van het inmiddels geslo...",2021-10-22 14:18:00
2,De energiemarkt en salderen: hoe komt de prijs...,3.U verbruikt en produceert stroomDezelfde pri...,2021-10-22 14:52:00
3,De energiemarkt en salderen: hoe komt de prijs...,Heel leerzaam artikel.,2021-10-22 17:23:00
4,De energiemarkt en salderen: hoe komt de prijs...,Dank voor de uitgebreide uitleg.,2021-10-22 17:57:00
5,De energiemarkt en salderen: hoe komt de prijs...,Ik heb de mods gevraagd dit topic sticky te ma...,2021-10-22 18:11:00
...,...,...,...
82,"Woningen met verkeerde energielabels, wie heef...",Zit er een kras op uw plaat? Kom dan eens met ...,2023-02-15 08:55:00
83,"Woningen met verkeerde energielabels, wie heef...",Dan heb je er dus niks aan. Niet om de hoogte ...,2023-02-15 09:07:00
84,"Woningen met verkeerde energielabels, wie heef...",Voor mij is dat duidelijk uit de bijdragen te ...,2023-02-15 09:13:00
85,"Woningen met verkeerde energielabels, wie heef...","@MM, de persoon waar ik op reageerde (Jurrien)...",2023-02-15 10:30:00


In [3]:
for index, row in df_messages.iterrows():
    print(index)
    print(row['date'])

1
2021-10-22 14:18:00
2
2021-10-22 14:52:00
3
2021-10-22 17:23:00
4
2021-10-22 17:57:00
5
2021-10-22 18:11:00
6
2021-10-23 11:48:00
7
2021-10-27 15:17:00
8
2021-10-28 09:25:00
9
2021-10-28 12:15:00
10
2021-10-28 18:59:00
11
2021-10-28 22:34:00
12
2021-10-28 23:24:00
13
2021-10-29 02:18:00
14
2021-10-29 02:23:00
15
2021-10-29 07:43:00
16
2021-10-29 11:40:00
17
2021-10-29 12:00:00
18
2021-10-30 11:31:00
19
2021-10-30 12:47:00
20
2021-10-30 16:14:00
21
2021-10-30 17:05:00
22
2021-10-30 18:15:00
23
2021-10-30 20:11:00
24
2021-10-30 21:53:00
25
2021-10-31 11:00:00
26
2021-10-31 11:07:00
27
2021-10-31 11:13:00
28
2021-10-31 15:47:00
29
2021-10-31 17:17:00
30
2022-11-27 15:13:00
31
2022-11-27 15:18:00
32
2022-11-27 15:29:00
33
2022-12-12 23:18:00
34
2022-12-12 23:28:00
35
2022-12-13 02:35:00
36
2022-12-13 09:07:00
37
2022-12-13 09:09:00
38
2022-12-13 09:15:00
39
2022-12-13 09:22:00
40
2022-12-13 09:36:00
41
2022-12-13 09:57:00
42
2022-12-13 10:50:00
43
2022-12-13 11:40:00
44
2022-12-13 14:35:

In [4]:
import torch
from transformers import RobertaTokenizer, RobertaForTokenClassification

tokenizer = RobertaTokenizer.from_pretrained('pdelobelle/robbert-v2-dutch-ner')
model = RobertaForTokenClassification.from_pretrained('pdelobelle/robbert-v2-dutch-ner', return_dict=True)
model.eval()
print("RobBERT model loaded")

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/465M [00:00<?, ?B/s]

  t = torch.tensor([], dtype=storage.dtype, device=storage.untyped().device)


RobBERT model loaded


In [7]:
inputs = tokenizer.batch_encode_plus(
    ["Jan ging naar de bakker in Leuven en kocht een brood.",
     "Bedrijven zoals Google en Microsoft doen ook heel veel onderzoek naar NLP.",
     "Men moet een gegeven paard niet in de bek kijken.",
     "Hallo, mijn naam is RobBERT."],
    return_tensors="pt", padding=True)
for key, value in inputs.items():
    print("{}:\n\t{}".format(key, value))
print("Tokens:\n\t{}".format(tokenizer.convert_ids_to_tokens(inputs['input_ids'][0]) ))
print("\t{}".format(tokenizer.convert_ids_to_tokens(inputs['input_ids'][1]) ))

input_ids:
	tensor([[    0,  6079,   499,    38,     5, 13292,    11,  6422,     8,  7010,
             9,  2617,     4,     2,     1],
        [    0, 25907,   129,  1283,     8,  3971,   113,    28,   118,    71,
           435,    38, 27600,     4,     2],
        [    0,  9396,    89,     9,   797,  2877,    22,    11,     5,  4290,
           445,     4,     2,     1,     1],
        [    0,  7751,     6,    74,   458,    12,  3663, 14334,   342,     4,
             2,     1,     1,     1,     1]])
attention_mask:
	tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0]])
Tokens:
	['<s>', 'Jan', 'Ġging', 'Ġnaar', 'Ġde', 'Ġbakker', 'Ġin', 'ĠLeuven', 'Ġen', 'Ġkocht', 'Ġeen', 'Ġbrood', '.', '</s>', '<pad>']
	['<s>', 'Bedrijven', 'Ġzoals', 'ĠGoogle', 'Ġen', 'ĠMicrosoft', 'Ġdoen', 'Ġook', 'Ġheel', 'Ġveel', 'Ġonderzoek', 'Ġnaa

In [8]:
print(model.config.id2label)


{0: 'B-PER', 1: 'B-ORG', 2: 'B-LOC', 3: 'B-MISC', 4: 'I-PER', 5: 'I-ORG', 6: 'I-LOC', 7: 'I-MISC', 8: 'O'}


In [9]:
with torch.no_grad():
    results = model(**inputs)
    for i, input in enumerate(inputs['input_ids']):
        print(f"Sentence {i}")
        [print("{:12}".format(token), end="") for token in tokenizer.convert_ids_to_tokens(input) ]
        print('\n')
        [print("{:12}".format(model.config.id2label[item.item()]), end="") for item in results.logits[i].argmax(axis=1)]
        print('\n')

Sentence 0
<s>         Jan         Ġging       Ġnaar       Ġde         Ġbakker     Ġin         ĠLeuven     Ġen         Ġkocht      Ġeen        Ġbrood      .           </s>        <pad>       

O           B-PER       O           O           O           O           O           B-LOC       O           O           O           O           O           O           O           

Sentence 1
<s>         Bedrijven   Ġzoals      ĠGoogle     Ġen         ĠMicrosoft  Ġdoen       Ġook        Ġheel       Ġveel       Ġonderzoek  Ġnaar       ĠNLP        .           </s>        

O           O           O           B-ORG       O           B-ORG       O           O           O           O           O           O           B-MISC      O           O           

Sentence 2
<s>         Men         Ġmoet       Ġeen        Ġgegeven    Ġpaard      Ġniet       Ġin         Ġde         Ġbek        Ġkijken     .           </s>        <pad>       <pad>       

O           O           O           O           O        

In [17]:
inputs = tokenizer.batch_encode_plus(["Op het moment heb ik de Kolere thermostaat op 20 graden staan, maar ik krijg het nog steeds koud thuis."], return_tensors="pt", padding=True)

for key, value in inputs.items():
    print("{}:\n\t{}".format(key, value))
print("Tokens:\n\t{}".format(tokenizer.convert_ids_to_tokens(inputs['input_ids'][0]) ))

with torch.no_grad():
    results = model(**inputs)
    for i, input in enumerate(inputs['input_ids']):
        print(f"Sentence {i}")
        [print("{:12}".format(token), end="") for token in tokenizer.convert_ids_to_tokens(input) ]
        print('\n')
        [print("{:12}".format(model.config.id2label[item.item()]), end="") for item in results.logits[i].argmax(axis=1)]
        print('\n')


input_ids:
	tensor([[    0,   375,    10,   396,    88,    29,     5, 10608,   827, 18655,
            13,   395,  2464,   224,     6,    37,    29,   944,    10,    49,
           237,  3201,   511,     4,     2]])
attention_mask:
	tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1]])
Tokens:
	['<s>', 'Op', 'Ġhet', 'Ġmoment', 'Ġheb', 'Ġik', 'Ġde', 'ĠKol', 'ere', 'Ġthermostaat', 'Ġop', 'Ġ20', 'Ġgraden', 'Ġstaan', ',', 'Ġmaar', 'Ġik', 'Ġkrijg', 'Ġhet', 'Ġnog', 'Ġsteeds', 'Ġkoud', 'Ġthuis', '.', '</s>']
Sentence 0
<s>         Op          Ġhet        Ġmoment     Ġheb        Ġik         Ġde         ĠKol        ere         ĠthermostaatĠop         Ġ20         Ġgraden     Ġstaan      ,           Ġmaar       Ġik         Ġkrijg      Ġhet        Ġnog        Ġsteeds     Ġkoud       Ġthuis      .           </s>        

O           O           O           O           O           O           O           B-MISC      O           O           O           O      