# Notebook: Calculate Overall Topics

## 1. Load Packages

In [1]:
from load_datasets import load_mentions_dataset, load_politicians_dataset
from gensim.models.ldamodel import LdaModel
from nltk.tokenize import word_tokenize
from gensim.corpora import Dictionary
from nltk.corpus import stopwords
from gensim import corpora
import pandas as pd
import gensim
import spacy
import nltk
import re
import os

## 2. Constants / Setup

In [2]:
# Output locations for the two datasets; the `to_csv` cell further down writes
# the frames here after the `clean_text` column has been added.
# Both paths are relative, so they resolve against the notebook's working directory.
FILTERED_DATASET_MENTIONS_PATH = "../Datasets/filtered_mentions.csv"
FILTERED_DATASET_POLITICIANS_PATH = "../Datasets/filtered_politicians.csv"

In [3]:
# Make sure the NLTK resources used by this notebook are available locally.
nltk.download('punkt')
nltk.download('stopwords')

# NLTK's German stop words plus hand-picked filler words and leftovers that
# look like URL / Twitter-UI fragments (e.g. "https", "co", "thread", "anzeigen").
STOPWORDS = set(stopwords.words("german")) | {
    "mehr", "heute", "https", "thread", "anzeigen", "http", "www", "co",
    "de", "html", "nehmen", "habt", "hast", "tut", "gibt", "wäre", "steht",
    "ganz", "usw", "ging", "seid", "liegt", "sagen", "finde", "denen",
    "übrigens", "trotzdem", "darauf", "hätte", "wer", "sogar", "weiß",
}

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/nils_hellwig/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/nils_hellwig/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## 3. Load Dataset

In [4]:
# Load the mentions dataset through the project's `load_datasets` helper.
df_mentions = load_mentions_dataset()

In [5]:
# Preview the mentions frame (pandas abbreviates long frames to their first and last rows).
df_mentions

Unnamed: 0,text,source_party,source_account,date,sentiment
0,Wichtige wissenschaftliche Erkenntnis- nun mus...,SPD,KarambaDiaby,2021-01-09 19:35:29,0
1,@KarambaDiaby @HalleSpd @SPD_LSA Ich gratulier...,SPD,KarambaDiaby,2021-01-09 17:09:28,0
2,@KarambaDiaby @HalleSpd @SPD_LSA Herzlichen Gl...,SPD,KarambaDiaby,2021-01-09 13:16:13,0
3,@KarambaDiaby @HalleSpd @SPD_LSA Wann werden k...,SPD,KarambaDiaby,2021-01-09 12:32:40,1
4,@KarambaDiaby @HalleSpd @SPD_LSA Glückwunsch.,SPD,KarambaDiaby,2021-01-09 12:13:06,0
...,...,...,...,...,...
707236,@b_riexinger Klima oder Verkehr fast gleich......,LINKE,b_riexinger,2021-12-17 08:19:23,1
707237,@b_riexinger @Linksfraktion Na ob das noch lan...,LINKE,b_riexinger,2021-12-17 08:18:07,1
707238,@b_riexinger Ich wünsch Dir viel Erfolg.,LINKE,b_riexinger,2021-12-17 07:47:59,0
707239,"@b_riexinger Nun, da gibt es ja genügend zu tu...",LINKE,b_riexinger,2021-12-17 02:07:26,2


In [6]:
# Load the politicians dataset through the project's `load_datasets` helper.
df_politicians = load_politicians_dataset()

In [7]:
# Preview the politicians frame (pandas abbreviates long frames to their first and last rows).
df_politicians

Unnamed: 0,UserScreenName,source_account,date,Text,text,Emojis,Comments,Likes,Retweets,Image link,Tweet URL,id,source_party,sentiment
0,AfD Berlin,AfDBerlin,2021-03-26 21:07:22,AfD Berlin\n@AfDBerlin\n·\n26. März,AfD wirkt.\n\nSchluss mit dem #Gendergaga\nMDR...,,4.0,28.0,132.0,['https://pbs.twimg.com/profile_images/1037343...,https://twitter.com/AfDBerlin/status/137555499...,1,AfD,2
1,AfD Berlin,AfDBerlin,2021-03-27 07:20:27,AfD Berlin\n@AfDBerlin\n·\n27. März,Im Herbst wird gewählt.\nSchluss mit den Recht...,,10.0,20.0,112.0,['https://pbs.twimg.com/card_img/1471780757332...,https://twitter.com/AfDBerlin/status/137570928...,2,AfD,2
2,AfD Berlin,AfDBerlin,2021-03-31 07:14:04,AfD Berlin\n@AfDBerlin\n·\n31. März,Behördenwillkür\nFlüchtlingsheime durchgedrück...,,3.0,13.0,34.0,['https://pbs.twimg.com/media/ExylKvEU8AgowSU?...,https://twitter.com/AfDBerlin/status/137715723...,3,AfD,1
3,AfD Berlin,AfDBerlin,2021-04-01 14:29:00,AfD Berlin\n@AfDBerlin\n·\n1. Apr.,Aus Raider wird jetzt Twix \n\nLeider kein #Ap...,,1.0,4.0,17.0,['https://pbs.twimg.com/card_img/1472910546907...,https://twitter.com/AfDBerlin/status/137762907...,4,AfD,2
4,AfD Berlin,AfDBerlin,2021-04-01 05:02:10,AfD Berlin\n@AfDBerlin\n·\n1. Apr.,Gendern geht („*innen“)\nImpfen geht nicht.\nD...,,1.0,3.0,16.0,['https://pbs.twimg.com/profile_images/9706413...,https://twitter.com/AfDBerlin/status/137748642...,5,AfD,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58859,Tino Chrupalla,Tino_Chrupalla,2021-12-04 17:26:46,Tino Chrupalla\n@Tino_Chrupalla\n·\n4. Dez. 2021,Friedlicher Protest gegen einen #Impfzwang ist...,,265.0,122.0,578.0,[],https://twitter.com/Tino_Chrupalla/status/1467...,61802,AfD,1
58860,Tino Chrupalla,Tino_Chrupalla,2021-12-13 16:30:19,Tino Chrupalla\n@Tino_Chrupalla\n·\n13. Dez. 2021,@OlafScholz\n muss sich endlich klar zu Nord S...,,4.0,2.0,11.0,[],https://twitter.com/Tino_Chrupalla/status/1470...,61803,AfD,0
58861,Tino Chrupalla,Tino_Chrupalla,2021-12-17 14:02:24,Tino Chrupalla\n@Tino_Chrupalla\n·\n17. Dez. 2021,Mit \n@_FriedrichMerz\n gibt es keine konserva...,,188.0,227.0,1.008,[],https://twitter.com/Tino_Chrupalla/status/1471...,61804,AfD,1
58862,Tino Chrupalla,Tino_Chrupalla,2021-12-19 09:27:23,Tino Chrupalla\n@Tino_Chrupalla\n·\n19. Dez. 2021,Wir wünschen Ihnen und Ihrer Familie einen bes...,,449.0,346.0,2.648,['https://pbs.twimg.com/media/FG9dwrcXIAUF5BP?...,https://twitter.com/Tino_Chrupalla/status/1472...,61805,AfD,0


## 4. Code

Define a function that cleans tweet text and keeps only lower-cased nouns and proper nouns

In [8]:
# Load spaCy's small German pipeline once; `clean_text` uses it for part-of-speech tagging.
# NOTE(review): `clean_text` only reads `token.pos_`. If no later cell needs the
# parser or NER, loading with `disable=[...]` would likely speed up the per-row
# calls considerably — confirm before changing.
nlp = spacy.load("de_core_news_sm")

# Patterns are compiled once when the cell runs instead of on every call;
# `clean_text` is applied row by row to several hundred thousand tweets.
_RE_MENTION = re.compile(r"@\w+")
_RE_TAGS = re.compile(r"<[^>]+>")
_RE_URL = re.compile(r"https?://\S+|www\.\S+", re.IGNORECASE)
_RE_NON_LETTER = re.compile(r"[^A-Za-zÀ-ž ]", re.IGNORECASE)
_RE_SINGLECHAR = re.compile(r"\b[A-Za-zÀ-ž]\b", re.IGNORECASE)
_RE_WSPACE = re.compile(r"\s+", re.IGNORECASE)

# Part-of-speech tags that survive the cleaning step.
_KEPT_POS = frozenset({"PROPN", "NOUN"})


def clean_text(text):
    """Reduce a tweet to its lower-cased nouns and proper nouns.

    Processing steps, in order:
      1. remove ``@mentions``,
      2. replace HTML-like tags with a space,
      3. replace URLs with a space, so that their path slugs do not end up as
         pseudo-nouns in the result,
      4. replace every character that is not a letter or a space,
      5. drop single-letter words and collapse runs of whitespace,
      6. tag the remainder with the module-level spaCy pipeline ``nlp`` and keep
         tokens tagged ``NOUN`` or ``PROPN`` whose lower-cased form is not in
         ``STOPWORDS``.

    Args:
        text: Raw tweet text. Anything that is not a ``str`` (for example the
            float NaN pandas uses for missing values) is treated as empty.

    Returns:
        The kept tokens, lower-cased and joined by single spaces. An empty
        string if the input is not a string or nothing survives the cleaning.
    """
    # Missing values arrive as float NaN / None; `re` would raise TypeError on them.
    if not isinstance(text, str):
        return ""

    text = _RE_MENTION.sub("", text)
    text = _RE_TAGS.sub(" ", text)
    text = _RE_URL.sub(" ", text)
    text = _RE_NON_LETTER.sub(" ", text)
    text = _RE_SINGLECHAR.sub(" ", text)
    text = _RE_WSPACE.sub(" ", text)

    # Nothing left to tag: skip the comparatively expensive spaCy call.
    if not text.strip():
        return ""

    kept = []
    for token in nlp(text):
        if token.pos_ not in _KEPT_POS:
            continue
        lowered = token.text.lower()
        if lowered not in STOPWORDS:
            kept.append(lowered)

    return " ".join(kept)

In [9]:
# Store the cleaned, noun-only text next to the raw tweet text in both frames.
# `clean_text` runs the spaCy pipeline once per row, so this is the slow cell.
df_politicians['clean_text'] = df_politicians['text'].map(clean_text)
df_mentions['clean_text'] = df_mentions['text'].map(clean_text)

In [10]:
# Persist both frames (now carrying `clean_text`) as CSV.
# `to_csv` runs with its defaults, so the DataFrame index is written as the first column.
df_mentions.to_csv(path_or_buf=FILTERED_DATASET_MENTIONS_PATH)
df_politicians.to_csv(path_or_buf=FILTERED_DATASET_POLITICIANS_PATH)

In [11]:
# Spot-check the cleaning on mentions whose raw text contains "katjakipping".
# `regex=False` matches the needle as a literal substring rather than a regex;
# `na=False` keeps the mask strictly boolean if any `text` value is missing
# (a NaN in the mask would make the boolean indexing raise).
df_mentions[df_mentions['text'].str.contains('katjakipping', regex=False, na=False)]

Unnamed: 0,text,source_party,source_account,date,sentiment,clean_text
4606,@OliS1401 @Ralf_Stegner @katjakipping Den Jogh...,SPD,Ralf_Stegner,2021-09-11 09:21:10,1,joghurt rente auskommen alter produktivitätsen...
7937,überwiesen. SO GEHT UMVERTEILUNG u SOZ.SPALTUN...,SPD,hubertus_heil,2021-02-03 14:19:20,2,geht umverteilung soz spaltung
10245,@Sternen84302848 @katjakipping @KuehniKev @hub...,SPD,OlafScholz,2021-01-24 13:05:38,2,linke koalitionsdisziplin lösung
33370,@ThomasPietsch6 @katjakipping @dieLinke @Heiko...,SPD,OlafScholz,2021-11-16 00:46:22,1,
38945,@katjakipping @jusos Kümmert euch mal lieber d...,SPD,jusos,2021-10-19 07:16:29,2,partei
...,...,...,...,...,...,...
706390,@obscure84 @Katina_Schubert @katjakipping @BMJ...,LINKE,b_riexinger,2021-04-13 08:02:52,1,warnhinweis haltung versklavung frauen handmai...
706391,@Katina_Schubert @BMJV_Bund @ABaerbock @EskenS...,LINKE,b_riexinger,2021-04-13 07:16:26,2,wording schwangerschaftsabbruch männer thema
707047,@MichaNeuhaus @dieLinke Man könnte euch wegen ...,LINKE,b_riexinger,2021-09-15 17:00:34,1,dingen ko argument verhältnis nato
707087,@Janine_Wissler @DietmarBartsch @SusanneHennig...,LINKE,b_riexinger,2021-10-09 12:27:32,2,anonym kenntnis umfragen
