In [27]:
! pip install bertopic pymupdf bertopic nltk
! pip install ipykernel



In [28]:
from bertopic import BERTopic
from sklearn.datasets import fetch_20newsgroups
from umap import UMAP
from sklearn.feature_extraction.text import CountVectorizer

# read pdfs (folder: Wortprotokolle) content to docs string list
import os
import fitz  # PyMuPDF
import re
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
import nltk

In [86]:
nltk.download('stopwords')

german_stop_words = stopwords.words('german')
# add stopwords from stopwords.txt
with open('stopwords.txt', 'r') as file:
    german_stop_words += file.read().splitlines()
    
german_stop_words += ['köln', 'schon', 'jahr', 'herr', 'frau', 'antrag', 'beifall', 'ja', 'top', 'seite', 'seiten', 'anlage', 'anlagen', 'drucksache', 'drucksachen', 'stadt', 'sitzung', 'ausschuss', 'ausschusses', 'ausschüsse']
vectorizer_model = CountVectorizer(stop_words=german_stop_words)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/timomurmann/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [80]:
from bertopic.representation import ZeroShotClassification
from bertopic import BERTopic

# Create your representation model
candidate_topics = ["Wohnen", "Verkehr", "Sport", "Kultur", "Wirtschaft", "Gesundheit", "Umwelt", "Bildung", "Technologie"]
representation_model = ZeroShotClassification(candidate_topics, model="MoritzLaurer/mDeBERTa-v3-base-xnli-multilingual-nli-2mil7")

In [68]:
# get all pdfs in folder
pdf_folder = "Wortprotokolle"
pdf_files = [f for f in os.listdir(pdf_folder) if f.endswith(".pdf")]

# read all pdfs
docs = []

for pdf_file in pdf_files:
    # open pdf
    pdf_path = os.path.join(pdf_folder, pdf_file)
    doc = fitz.open(pdf_path)

    # read all pages
    for page in doc:
        text = page.get_text()
        docs.append(text)

    doc.close()

In [87]:
topic_model = BERTopic(language="german", verbose=True,
                        embedding_model="all-mpnet-base-v2",
                       vectorizer_model=vectorizer_model, representation_model=representation_model)

topic, prob = topic_model.fit_transform(docs)

2024-03-11 20:13:45,953 - BERTopic - Embedding - Transforming documents to embeddings.


Batches: 100%|██████████| 59/59 [00:34<00:00,  1.70it/s]
2024-03-11 20:14:22,931 - BERTopic - Embedding - Completed ✓
2024-03-11 20:14:22,931 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-03-11 20:14:26,366 - BERTopic - Dimensionality - Completed ✓
2024-03-11 20:14:26,367 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-03-11 20:14:26,400 - BERTopic - Cluster - Completed ✓
2024-03-11 20:14:26,401 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-03-11 20:14:48,383 - BERTopic - Representation - Completed ✓


In [88]:
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,776,-1_spd_reker_henriette_gibt,"[spd, reker, henriette, gibt, fraktion, dank, ...",[Ó \n \n \n \n \n \n15. Sitzung vom 17. März 2...
1,0,293,0_rat_spd_grünen_90,"[rat, spd, grünen, 90, liebe, 2023, reker, heu...",[ \n \n Rat der Stadt Köln \n26. Sitzung vom 1...
2,1,138,1_enthaltungen_gegenstimmen_tagesordnungspunkt...,"[enthaltungen, gegenstimmen, tagesordnungspunk...",[ \n \n10. Sitzung vom 16. September 2021 Se...
3,2,95,2_fraktion_enthaltungen_spd_gibt,"[fraktion, enthaltungen, spd, gibt, afd, fdp, ...",[Ó \n \n \n \n \n \n15. Sitzung vom 17. März 2...
4,3,82,3_menschen_afd_spd_immer,"[menschen, afd, spd, immer, deutschland, liebe...",[ \n \n \n 9. Sitzung vom 23. August 2021 \nSe...
5,4,54,4_schulen_kinder_schule_eltern,"[schulen, kinder, schule, eltern, gesamtschule...",[ \n \n Rat der Stadt Köln \n21. Sitzung vom ...
6,5,47,5_spd_reker_90_grünen,"[spd, reker, 90, grünen, bündnis, henriette, r...",[ \n \n Rat der Stadt Köln \n29. Sitzung vom 7...
7,6,44,6_liebe_henriette_reker_kolleginnen,"[liebe, henriette, reker, kolleginnen, kollege...",[ \n \n Rat der Stadt Köln \n23. Sitzung vom 9...
8,7,37,7_Sport___,"[Sport, , , , , , , , , ]",[ \nÓ \n \n \n \n \n \n \n 5. Sitzung vom 23. ...
9,8,30,8_rates_damen_heute_herren,"[rates, damen, heute, herren, tagesordnung, he...",[Ó \n \n \n \n \n \n \n16. Sitzung vom 5. Mai ...


In [89]:
topic_model.visualize_topics()

In [90]:
topic_model.visualize_barchart()


In [91]:
topic_model.visualize_heatmap()

In [75]:
topic_model.visualize_term_rank(log_scale=True)