In [27]:
! pip install ipykernel
! pip install --upgrade nbformat
! pip install plotly



In [28]:
from bertopic import BERTopic
from sklearn.datasets import fetch_20newsgroups
from umap import UMAP
from sklearn.feature_extraction.text import CountVectorizer

# read pdfs (folder: Wortprotokolle) content to docs string list
import os
import fitz  # PyMuPDF
import re
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
import nltk

In [79]:
nltk.download('stopwords')

german_stop_words = stopwords.words('german')
# add stopwords from stopwords.txt
with open('stopwords.txt', 'r') as file:
    german_stop_words += file.read().splitlines()
    
german_stop_words += ['köln', 'schon', 'jahr', 'herr', 'frau', 'antrag', 'beifall', 'ja', 'top', 'seite', 'seiten', 'anlage', 'anlagen', 'drucksache', 'drucksachen', 'stadt', 'sitzung', 'ausschuss', 'ausschusses', 'ausschüsse']
vectorizer_model = CountVectorizer(stop_words=german_stop_words)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/timomurmann/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [76]:
from bertopic.representation import ZeroShotClassification
from bertopic import BERTopic

# Create your representation model
candidate_topics = ["Wohnen", "Verkehr", "Sport", "Kultur", "Wirtschaft", "Gesundheit", "Umwelt", "Bildung", "Technologie"]
representation_model = ZeroShotClassification(candidate_topics, model="MoritzLaurer/mDeBERTa-v3-base-xnli-multilingual-nli-2mil7")

In [68]:
# get all pdfs in folder
pdf_folder = "Wortprotokolle"
pdf_files = [f for f in os.listdir(pdf_folder) if f.endswith(".pdf")]

# read all pdfs
docs = []

for pdf_file in pdf_files:
    # open pdf
    pdf_path = os.path.join(pdf_folder, pdf_file)
    doc = fitz.open(pdf_path)

    # read all pages
    for page in doc:
        text = page.get_text()
        docs.append(text)

    doc.close()

In [69]:
topic_model = BERTopic(language="german", verbose=True,
                        embedding_model="all-mpnet-base-v2",
                       vectorizer_model=vectorizer_model, representation_model=representation_model)

topic, prob = topic_model.fit_transform(docs)

2024-03-11 19:57:25,011 - BERTopic - Embedding - Transforming documents to embeddings.
Batches: 100%|██████████| 59/59 [00:34<00:00,  1.71it/s]
2024-03-11 19:58:02,884 - BERTopic - Embedding - Completed ✓
2024-03-11 19:58:02,884 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-03-11 19:58:06,569 - BERTopic - Dimensionality - Completed ✓
2024-03-11 19:58:06,570 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-03-11 19:58:06,605 - BERTopic - Cluster - Completed ✓
2024-03-11 19:58:06,607 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-03-11 19:58:27,193 - BERTopic - Representation - Completed ✓


In [71]:
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,874,-1_oberbürgermeisterin_spd_reker_gibt,"[oberbürgermeisterin, spd, reker, gibt, henrie...",[ \n \n Rat der Stadt Köln \n22. Sitzung vom ...
1,0,190,0_rat_oberbürgermeisterin_spd_90,"[rat, oberbürgermeisterin, spd, 90, heute, müs...",[ \n \n Rat der Stadt Köln \n21. Sitzung vom ...
2,1,132,1_enthaltungen_gegenstimmen_tagesordnungspunkt...,"[enthaltungen, gegenstimmen, tagesordnungspunk...",[ \n \n10. Sitzung vom 16. September 2021 Se...
3,2,90,2_menschen_afd_oberbürgermeisterin_spd,"[menschen, afd, oberbürgermeisterin, spd, imme...",[Ó \n \n \n \n \n \n \n 13. Sitzung vom 14. De...
4,3,53,3_schulen_kinder_schule_eltern,"[schulen, kinder, schule, eltern, gesamtschule...",[ \n \n Rat der Stadt Köln \n24. Sitzung vom 2...
5,4,44,4_fraktion_2021_enthaltungen_gibt,"[fraktion, 2021, enthaltungen, gibt, abstimmen...",[Ó \n \n \n \n \n \n15. Sitzung vom 17. März 2...
6,5,38,5_oberbürgermeisterin_liebe_henriette_reker,"[oberbürgermeisterin, liebe, henriette, reker,...",[Ó \n \n \n \n \n \n \n16. Sitzung vom 5. Mai ...
7,6,36,6_Sport___,"[Sport, , , , , , , , , ]",[ \n \n Rat der Stadt Köln \n24. Sitzung vom 2...
8,7,30,7_uhr_rates_damen_heute,"[uhr, rates, damen, heute, herren, tagesordnun...",[Ó \n \n \n \n \n \n \n16. Sitzung vom 5. Mai ...
9,8,28,8_oberbürgermeisterin_dank_henriette_verwaltung,"[oberbürgermeisterin, dank, henriette, verwalt...",[ \n \n10. Sitzung vom 16. September 2021 Se...


In [72]:
topic_model.visualize_topics()

In [73]:
topic_model.visualize_barchart()


In [74]:
topic_model.visualize_heatmap()

In [75]:
topic_model.visualize_term_rank(log_scale=True)