# Notebook: LDA

## 1. Load Packages

In [1]:
from gensim.models.ldamodel import LdaModel
from nltk.tokenize import word_tokenize
from bertopic import BERTopic
from gensim import corpora
import pandas as pd

  @numba.jit()
  @numba.jit()
  @numba.jit()
  from .autonotebook import tqdm as notebook_tqdm
  @numba.jit()


## 2. Constants / Setup

In [2]:
# Topic-count setting. It is not referenced by any cell visible in this part of
# the notebook — presumably intended for the LDA model; TODO confirm where it is used.
N_TOPICS = 20

In [3]:
# Relative paths (resolved against the notebook's working directory) of the two
# CSV files that are read with pd.read_csv in the "Load Dataset" section.
# Read into df_mentions:
FILTERED_DATASET_MENTIONS_PATH = "../Datasets/filtered_mentions.csv"
# Read into df_politicians:
FILTERED_DATASET_POLITICIANS_PATH = "../Datasets/filtered_politicians.csv"

## 3. Load Dataset

In [4]:
# Read the filtered mentions CSV into a DataFrame.
# NOTE(review): the rendered frame contains "Unnamed: 0.1" and "Unnamed: 0"
# columns, which look like index columns written out by earlier to_csv calls —
# confirm and consider dropping them or reading with index_col.
df_mentions = pd.read_csv(FILTERED_DATASET_MENTIONS_PATH)
# Bare last expression so the notebook renders the (truncated) frame as cell output.
df_mentions

Unnamed: 0.1,Unnamed: 0,text,source_party,source_account,date,sentiment,clean_text
0,0,Wichtige wissenschaftliche Erkenntnis- nun mus...,SPD,KarambaDiaby,2021-01-09 19:35:29,0,erkenntnis umsetzung
1,1,@KarambaDiaby @HalleSpd @SPD_LSA Ich gratulier...,SPD,KarambaDiaby,2021-01-09 17:09:28,0,linken
2,2,@KarambaDiaby @HalleSpd @SPD_LSA Herzlichen Gl...,SPD,KarambaDiaby,2021-01-09 13:16:13,0,glückwunsch erfolg
3,3,@KarambaDiaby @HalleSpd @SPD_LSA Wann werden k...,SPD,KarambaDiaby,2021-01-09 12:32:40,1,steuern sozialabgaben rente senkung renteneint...
4,4,@KarambaDiaby @HalleSpd @SPD_LSA Glückwunsch.,SPD,KarambaDiaby,2021-01-09 12:13:06,0,glückwunsch
...,...,...,...,...,...,...,...
707236,707236,@b_riexinger Klima oder Verkehr fast gleich......,LINKE,b_riexinger,2021-12-17 08:19:23,1,klima verkehr pöstchen
707237,707237,@b_riexinger @Linksfraktion Na ob das noch lan...,LINKE,b_riexinger,2021-12-17 08:18:07,1,gruppierungen querdenker linke bürger mitte sy...
707238,707238,@b_riexinger Ich wünsch Dir viel Erfolg.,LINKE,b_riexinger,2021-12-17 07:47:59,0,erfolg
707239,707239,"@b_riexinger Nun, da gibt es ja genügend zu tu...",LINKE,b_riexinger,2021-12-17 02:07:26,2,paris


In [5]:
# Read the filtered politicians CSV into a DataFrame.
# NOTE(review): as with the mentions frame, the rendered output shows
# "Unnamed: 0.1" / "Unnamed: 0" columns that look like stray saved indices —
# confirm and consider dropping them.
df_politicians = pd.read_csv(FILTERED_DATASET_POLITICIANS_PATH)
# Bare last expression so the notebook renders the (truncated) frame as cell output.
df_politicians

Unnamed: 0.1,Unnamed: 0,UserScreenName,source_account,date,Text,text,Emojis,Comments,Likes,Retweets,Image link,Tweet URL,id,source_party,sentiment,clean_text
0,0,AfD Berlin,AfDBerlin,2021-03-26 21:07:22,AfD Berlin\n@AfDBerlin\n·\n26. März,AfD wirkt.\n\nSchluss mit dem #Gendergaga\nMDR...,,4.0,28.0,132.0,['https://pbs.twimg.com/profile_images/1037343...,https://twitter.com/AfDBerlin/status/137555499...,1,AfD,2,afd schluss gendergaga mdr aktuell märz gender...
1,1,AfD Berlin,AfDBerlin,2021-03-27 07:20:27,AfD Berlin\n@AfDBerlin\n·\n27. März,Im Herbst wird gewählt.\nSchluss mit den Recht...,,10.0,20.0,112.0,['https://pbs.twimg.com/card_img/1471780757332...,https://twitter.com/AfDBerlin/status/137570928...,2,AfD,2,herbst schluss rechtsbrüchen regierung bild pr...
2,2,AfD Berlin,AfDBerlin,2021-03-31 07:14:04,AfD Berlin\n@AfDBerlin\n·\n31. März,Behördenwillkür\nFlüchtlingsheime durchgedrück...,,3.0,13.0,34.0,['https://pbs.twimg.com/media/ExylKvEU8AgowSU?...,https://twitter.com/AfDBerlin/status/137715723...,3,AfD,1,behördenwillkür flüchtlingsheime vorwürfen sen...
3,3,AfD Berlin,AfDBerlin,2021-04-01 14:29:00,AfD Berlin\n@AfDBerlin\n·\n1. Apr.,Aus Raider wird jetzt Twix \n\nLeider kein #Ap...,,1.0,4.0,17.0,['https://pbs.twimg.com/card_img/1472910546907...,https://twitter.com/AfDBerlin/status/137762907...,4,AfD,2,raider twix aprilscherz bz berlin name astraze...
4,4,AfD Berlin,AfDBerlin,2021-04-01 05:02:10,AfD Berlin\n@AfDBerlin\n·\n1. Apr.,Gendern geht („*innen“)\nImpfen geht nicht.\nD...,,1.0,3.0,16.0,['https://pbs.twimg.com/profile_images/9706413...,https://twitter.com/AfDBerlin/status/137748642...,5,AfD,1,gendern impfen regierung deutschland welt abwä...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58859,58859,Tino Chrupalla,Tino_Chrupalla,2021-12-04 17:26:46,Tino Chrupalla\n@Tino_Chrupalla\n·\n4. Dez. 2021,Friedlicher Protest gegen einen #Impfzwang ist...,,265.0,122.0,578.0,[],https://twitter.com/Tino_Chrupalla/status/1467...,61802,AfD,1,protest impfzwang grundrecht schutz privatsphä...
58860,58860,Tino Chrupalla,Tino_Chrupalla,2021-12-13 16:30:19,Tino Chrupalla\n@Tino_Chrupalla\n·\n13. Dez. 2021,@OlafScholz\n muss sich endlich klar zu Nord S...,,4.0,2.0,11.0,[],https://twitter.com/Tino_Chrupalla/status/1470...,61803,AfD,0,nord stream interessen energieversorgung
58861,58861,Tino Chrupalla,Tino_Chrupalla,2021-12-17 14:02:24,Tino Chrupalla\n@Tino_Chrupalla\n·\n17. Dez. 2021,Mit \n@_FriedrichMerz\n gibt es keine konserva...,,188.0,227.0,1.008,[],https://twitter.com/Tino_Chrupalla/status/1471...,61804,AfD,1,erneuerung ausgrenzung bürger kälte management...
58862,58862,Tino Chrupalla,Tino_Chrupalla,2021-12-19 09:27:23,Tino Chrupalla\n@Tino_Chrupalla\n·\n19. Dez. 2021,Wir wünschen Ihnen und Ihrer Familie einen bes...,,449.0,346.0,2.648,['https://pbs.twimg.com/media/FG9dwrcXIAUF5BP?...,https://twitter.com/Tino_Chrupalla/status/1472...,61805,AfD,0,familie advent


## 4. Topic Modelling

In [6]:
import nltk

# Make sure the NLTK resources used below are available locally; NLTK reports
# "already up-to-date" when they have been downloaded before.
nltk.download('punkt')
nltk.download('stopwords')

# Import the corpus reader under an alias so that the `stopwords` set created
# below does not overwrite (shadow) the imported reader object.
from nltk.corpus import stopwords as nltk_stopwords
# Already imported in the first cell; kept so this cell still runs on its own.
from nltk.tokenize import word_tokenize
from bertopic import BERTopic

# German stop words as a set for O(1) membership tests. The global name
# `stopwords` is kept because remove_stopwords() reads it.
stopwords = set(nltk_stopwords.words('german'))

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/nils_hellwig/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/nils_hellwig/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
# Independent copy of the mentions frame; `df` is the frame whose `text`
# column is fed into the stop-word removal and topic model below.
df = df_mentions.copy()

In [8]:
def remove_stopwords(data, stop_words=None):
    """Remove stop words from every document in ``data``.

    Each document is split on whitespace; a token is dropped when its
    lower-cased form is contained in the stop-word collection, and the
    remaining tokens are re-joined with single spaces. Matching is done on
    whole whitespace-delimited tokens, so a token with attached punctuation
    (e.g. ``"und,"``) is not treated as a stop word.

    Args:
        data: Iterable of documents. Non-string items are converted with
            ``str()``. Missing values (``None`` or a float NaN, as produced
            by pandas for empty cells) become an empty document instead of
            the literal text ``"None"`` / ``"nan"``.
        stop_words: Optional collection of lower-case stop words. When
            omitted, the notebook-level ``stopwords`` set is used, which
            keeps existing one-argument calls working unchanged.

    Returns:
        list[str]: One cleaned string per input item, in input order, so the
        result stays aligned with the source rows.
    """
    if stop_words is None:
        # Fall back to the global set built in the NLTK setup cell.
        stop_words = stopwords

    cleaned_docs = []
    for sentence in data:
        # NaN is the only float that is not equal to itself.
        is_missing = sentence is None or (
            isinstance(sentence, float) and sentence != sentence
        )
        if is_missing:
            # Keep a placeholder so positions still line up with the input.
            cleaned_docs.append('')
            continue
        kept_tokens = [
            token for token in str(sentence).split()
            if token.lower() not in stop_words
        ]
        cleaned_docs.append(' '.join(kept_tokens))
    return cleaned_docs

In [9]:
# Stop-word-filtered documents, one per row of `df`, in row order.
# NOTE(review): this uses the raw `text` column although the frame also has a
# pre-processed `clean_text` column — confirm that this is intended.
cleaned_text = remove_stopwords(df["text"].tolist())

In [10]:
# Create an (unfitted) BERTopic model configured for German text;
# nr_topics="auto" leaves the number of topics after reduction to BERTopic
# instead of fixing a target count.
# NOTE(review): no random seed is set in the visible cells, so repeated fits
# may produce different topics — confirm whether reproducibility is required.
model = BERTopic(language="german", nr_topics="auto")

In [11]:
# Fit the topic model on the stop-word-filtered documents. Per BERTopic's
# fit_transform API this yields the topic assigned to each document and the
# corresponding probabilities. The captured output below shows the embedding
# model files being downloaded on the first run.
topics, probabilities = model.fit_transform(cleaned_text)

Downloading (…)0fe39/.gitattributes: 100%|████████████████████████████████████| 968/968 [00:00<00:00, 1.49MB/s]
Downloading (…)_Pooling/config.json: 100%|█████████████████████████████████████| 190/190 [00:00<00:00, 264kB/s]
Downloading (…)83e900fe39/README.md: 100%|████████████████████████████████| 3.79k/3.79k [00:00<00:00, 5.33MB/s]
Downloading (…)e900fe39/config.json: 100%|████████████████████████████████████| 645/645 [00:00<00:00, 1.04MB/s]
Downloading (…)ce_transformers.json: 100%|█████████████████████████████████████| 122/122 [00:00<00:00, 263kB/s]
Downloading pytorch_model.bin: 100%|████████████████████████████████████████| 471M/471M [00:43<00:00, 10.9MB/s]
Downloading (…)nce_bert_config.json: 100%|███████████████████████████████████| 53.0/53.0 [00:00<00:00, 157kB/s]
Downloading (…)tencepiece.bpe.model: 100%|████████████████████████████████| 5.07M/5.07M [00:00<00:00, 12.5MB/s]
Downloading (…)cial_tokens_map.json: 100%|█████████████████████████████████████| 239/239 [00:00<00:00, 7

In [None]:
# Recompute the topic representations using unigrams and bigrams.
# `topics` is passed by keyword so the call does not depend on the positional
# parameter order of update_topics, which differs between BERTopic releases
# (some place other parameters, e.g. `images`, before `topics`).
model.update_topics(cleaned_text, topics=topics, n_gram_range=(1, 2))

In [None]:
# Show the first 11 rows of the topic-frequency table.
# NOTE(review): 11 is presumably the outlier topic (-1) plus ten topics — confirm.
model.get_topic_freq().head(11)

In [None]:
# Show the term/score representation of topics 0-9. The loop variable is
# passed to get_topic so each iteration looks up a different topic, and the
# result is printed explicitly because Jupyter only auto-displays the last
# expression of a cell, not expressions evaluated inside a loop.
for topic_id in range(10):
    print(f"Topic {topic_id}: {model.get_topic(topic_id)}")

In [None]:
# Render BERTopic's bar-chart visualisation of the topics using its default
# settings; as the last expression, the returned figure is the cell output.
model.visualize_barchart()

In [None]:
# Render BERTopic's hierarchy visualisation, restricted to 20 topics via top_n_topics.
# NOTE(review): the literal 20 equals N_TOPICS from the setup section —
# confirm whether the constant should be reused here.
model.visualize_hierarchy(top_n_topics=20)