# Notebook: BERTopic to analyse the Topics

## 1. Load Packages

In [1]:
from plots import plot_topic_subplots, plot_timeseries_subplots, plot_timeseries_sentiment_subplots
from gensim.models.ldamodel import LdaModel
from gensim.models import LdaModel
from bertopic import BERTopic
from gensim import corpora
import pandas as pd
import numpy as np
import random
import torch
import math
import re
import os

## 2. Constants / Setup

In [2]:
# Tunable parameters for the topic-modelling run.
N_TOPICS = 25  # number of topics requested from the LDA model (passed as num_topics)
RANDOM_STATE = 42  # shared seed for random, numpy, torch and the LDA model
TOP_N_WORDS = 100  # NOTE(review): not used in the visible cells; presumably a BERTopic setting - confirm
MIN_CLUSTER_SIZE = 250  # NOTE(review): not used in the visible cells; presumably a clustering setting - confirm

In [3]:
# Input CSV locations, all relative to the notebook's working directory.
# The shared directory prefix is spelled once; the resulting path strings are unchanged.
_DATASETS_DIR = "../Datasets"

# Tweet tables
DATASET_MENTIONS_PATH = f"{_DATASETS_DIR}/tweets_mentions_cleaned.csv"
DATASET_POLITICAL_ACCOUNTS_PATH = f"{_DATASETS_DIR}/tweets_political_accounts_cleaned.csv"

# Image prediction tables
DATASET_MENTIONS_IMG_PATH = f"{_DATASETS_DIR}/img_dataset_mentions/predictions_cleaned.csv"
DATASET_POLITICAL_ACCOUNTS_IMG_PATH = f"{_DATASETS_DIR}/img_dataset_political_accounts/predictions_cleaned.csv"

In [4]:
# Seed the Python and NumPy global generators with the shared seed ...
for seed_rng in (random.seed, np.random.seed):
    seed_rng(RANDOM_STATE)
# ... and PyTorch last, so the cell still ends on the torch.manual_seed(...) expression.
torch.manual_seed(RANDOM_STATE)

<torch._C.Generator at 0x7f45bf227090>

## 3. Load Dataset

In [6]:
# Read the CSV at DATASET_POLITICAL_ACCOUNTS_PATH into a DataFrame.
# NOTE(review): this frame is not referenced again in the visible cells.
df_political_accounts = pd.read_csv(DATASET_POLITICAL_ACCOUNTS_PATH)

In [7]:
# Read the CSV at DATASET_POLITICAL_ACCOUNTS_IMG_PATH into a DataFrame.
# NOTE(review): this frame is not referenced again in the visible cells.
df_political_accounts_img = pd.read_csv(DATASET_POLITICAL_ACCOUNTS_IMG_PATH)

In [8]:
# Read the CSV at DATASET_MENTIONS_IMG_PATH into a DataFrame.
# NOTE(review): this frame is not referenced again in the visible cells.
df_mentions_img = pd.read_csv(DATASET_MENTIONS_IMG_PATH)

In [9]:
# Read the CSV at DATASET_MENTIONS_PATH into a DataFrame; this is the table analysed below.
df_mentions = pd.read_csv(DATASET_MENTIONS_PATH)

## 4. Code

#### Clean Data

In [10]:
# Work on a copy so the loaded `df_mentions` frame is not modified by the cleaning steps.
df = df_mentions.copy()

In [11]:
# Replace every non-string entry in `cleaned_text` (e.g. a missing value) with an empty string,
# so later text operations such as `.split()` only ever see strings.
# isinstance() is used instead of an exact `type(x) != str` comparison so that str subclasses
# are kept as text rather than being blanked out.
df['cleaned_text'] = df['cleaned_text'].apply(lambda text: text if isinstance(text, str) else '')
df

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,text,source_party,source_account,date,sentiment,cleaned_text
0,0,0,Wichtige wissenschaftliche Erkenntnis- nun mus...,SPD,KarambaDiaby,2021-01-09 19:35:29,0,wichtig wissenschaftlich erkenntnis schnell pr...
1,1,1,@KarambaDiaby @HalleSpd @SPD_LSA Ich gratulier...,SPD,KarambaDiaby,2021-01-09 17:09:28,0,gratulieren linke
2,2,2,@KarambaDiaby @HalleSpd @SPD_LSA Herzlichen Gl...,SPD,KarambaDiaby,2021-01-09 13:16:13,0,herzlich glückwunsch erfolg
3,3,3,@KarambaDiaby @HalleSpd @SPD_LSA Wann werden k...,SPD,KarambaDiaby,2021-01-09 12:32:40,1,wann konkret massiv steuer sozialabgabe senk...
4,4,4,@KarambaDiaby @HalleSpd @SPD_LSA Glückwunsch.,SPD,KarambaDiaby,2021-01-09 12:13:06,0,glückwunsch
...,...,...,...,...,...,...,...,...
707236,707236,707236,@b_riexinger Klima oder Verkehr fast gleich......,LINKE,b_riexinger,2021-12-17 08:19:23,1,klima verkehr fast gleich hauptsach pöstchen
707237,707237,707237,@b_riexinger @Linksfraktion Na ob das noch lan...,LINKE,b_riexinger,2021-12-17 08:18:07,1,na lange gut gruppierung querdenker linke bü...
707238,707238,707238,@b_riexinger Ich wünsch Dir viel Erfolg.,LINKE,b_riexinger,2021-12-17 07:47:59,0,wünsch erfolg
707239,707239,707239,"@b_riexinger Nun, da gibt es ja genügend zu tu...",LINKE,b_riexinger,2021-12-17 02:07:26,2,genügend tu paris machen


In [12]:
# Count whitespace-separated tokens per tweet (values are stringified first, as before)
# and keep only the rows with at least five tokens.
token_counts = df['cleaned_text'].astype(str).str.split().str.len()
mask = token_counts >= 5
df = df[mask]
df

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,text,source_party,source_account,date,sentiment,cleaned_text
0,0,0,Wichtige wissenschaftliche Erkenntnis- nun mus...,SPD,KarambaDiaby,2021-01-09 19:35:29,0,wichtig wissenschaftlich erkenntnis schnell pr...
3,3,3,@KarambaDiaby @HalleSpd @SPD_LSA Wann werden k...,SPD,KarambaDiaby,2021-01-09 12:32:40,1,wann konkret massiv steuer sozialabgabe senk...
5,5,5,@KarambaDiaby @HalleSpd @SPD_LSA Wir hatten do...,SPD,KarambaDiaby,2021-01-09 11:13:33,1,genug dunkel zeit ab heimat bevölkerung helfen
6,6,6,@KarambaDiaby @HalleSpd @SPD_LSA Glückwunsch! ...,SPD,KarambaDiaby,2021-01-09 10:41:10,0,glückwunsch pass gut genosse bestimmt viele ...
10,10,10,@KarambaDiaby @HalleSpd @SPD_LSA Ich freue mic...,SPD,KarambaDiaby,2021-01-09 09:47:36,0,freuen nächster jahr wählen dürfen
...,...,...,...,...,...,...,...,...
707233,707233,707233,@Rainer_Rehak @JoeMo38753690 @Johann_v_d_Bron ...,LINKE,b_riexinger,2021-12-17 10:15:35,2,ach gesellschaft forschung unternehmen aufko...
707235,707235,707235,@b_riexinger Warum haben sich die Leute eigent...,LINKE,b_riexinger,2021-12-17 08:23:45,1,warum leute eigentlich streik eisenbahner au...
707236,707236,707236,@b_riexinger Klima oder Verkehr fast gleich......,LINKE,b_riexinger,2021-12-17 08:19:23,1,klima verkehr fast gleich hauptsach pöstchen
707237,707237,707237,@b_riexinger @Linksfraktion Na ob das noch lan...,LINKE,b_riexinger,2021-12-17 08:18:07,1,na lange gut gruppierung querdenker linke bü...


In [13]:
# Renumber the rows from 0 and discard the previous index labels (drop=True),
# which are no longer contiguous after the row filter.
df = df.reset_index(drop=True)

In [14]:
# Parse the `date` column into datetimes, store it back, and extract the
# calendar month (1-12) of every row as a plain Python list.
parsed_dates = pd.to_datetime(df['date'])
df['date'] = parsed_dates
months = parsed_dates.dt.month.tolist()

#### Train LDA

In [15]:
# Build the bag-of-words corpus: split each cleaned tweet on whitespace and map tokens to ids.
documents = df['cleaned_text'].tolist()
tokenized_documents = [doc.split() for doc in documents]
dictionary = corpora.Dictionary(tokenized_documents)
corpus = [dictionary.doc2bow(doc) for doc in tokenized_documents]

# Fit the LDA model with N_TOPICS topics; random_state makes the fit repeatable.
lda_model = LdaModel(corpus, num_topics=N_TOPICS, id2word=dictionary, passes=10, random_state=RANDOM_STATE)

# Run document-topic inference over the corpus exactly ONCE and tally each document under
# its most probable topic. Previously this inference was re-run inside the per-topic loop,
# i.e. N_TOPICS full passes over the corpus for the same information.
# NOTE(review): with a single pass every document is counted under exactly one topic;
# whether repeated passes could assign a document differently should be confirmed against gensim.
topic_doc_counts = [0] * N_TOPICS
for doc_topics in lda_model.get_document_topics(corpus):
    if not doc_topics:
        # No topic reported for this document: nothing to count (avoids max() on an empty list).
        continue
    dominant_topic = max(doc_topics, key=lambda pair: pair[1])[0]
    topic_doc_counts[dominant_topic] += 1

# One summary row per topic: id, its top words (show_topic's default count), and document count.
topics_data = []
for topic_id in range(N_TOPICS):
    words = lda_model.show_topic(topic_id)
    topic_words = ", ".join(word for word, _ in words)
    topics_data.append([topic_id, topic_words, topic_doc_counts[topic_id]])

topics_df = pd.DataFrame(topics_data, columns=['topic_id', 'top_10_words', 'document_count'])
topics_df

Unnamed: 0,topic_id,top_10_words,document_count
0,0,"mensch, leben, sicher, meinung, tweet, gerne, ...",39525
1,1,"gut, richtig, frage, verstehen, thema, falsch,...",67581
2,2,"problem, kosten, bereits, hoffen, nutzen, ding...",10556
3,3,"stehen, aussage, na, nix, bzw, btw, seite, erw...",11693
4,4,"bitte, jemand, schön, schreiben, lieb, helfen,...",18333
5,5,"frau, herr, egal, merkel, völlig, person, schn...",23154
6,6,"dafür, land, brauchen, geld, halt, deutschland...",49600
7,7,"sein, ende, genug, heißen, sozialismus, funkti...",20329
8,8,"linke, partei, afd, wählen, fdp, cdu, grüne, s...",66047
9,9,"hoch, dabei, nennen, fallen, zahl, gelten, vie...",20082


In [16]:
# Write the topic summary table to 'lda_topics.csv' (relative to the working directory),
# omitting the DataFrame index column.
topics_df.to_csv('lda_topics.csv', index=False)