# Notebook: Create Term Frequency Analysis 

This notebook is used to calculate term frequencies.
<br>**Contributors:** [Nils Hellwig](https://github.com/NilsHellwig/) | [Markus Bink](https://github.com/MarkusBink/)

## Packages

In [1]:
from sklearn.feature_extraction.text import CountVectorizer
from reportlab.graphics import renderPDF
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from svglib.svglib import svg2rlg
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import random
import nltk
import re
import os

## Parameters

In [2]:
PARTIES = ["SPD", "CDU_CSU", "GRUENE", "FDP", "AFD", "LINKE"]
DATASET_PATH_PREDICTIONS =  "../Datasets/complete_dataset_predictions/"
DATASET_PATH = "../Datasets/dataset/"

## Setup Packages

In [3]:
nltk.download('punkt')
nltk.download('stopwords')
STOPWORDS = set(stopwords.words("german"))
STOPWORDS.update(["mehr", "heute","https", "bundestag", "thread", "anzeigen"])

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/nils_hellwig/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/nils_hellwig/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Code

### 1. Load Data

In [4]:
df = pd.DataFrame({})

for party in PARTIES:
    for subdir, _, files in os.walk(DATASET_PATH_PREDICTIONS + party):
        for file in files:
            if file.endswith('.csv') and subdir[len(DATASET_PATH_PREDICTIONS):] in PARTIES:
                # Get username of CSV file
                username = file[:-4]
                
                # Read CSV file as pandas dataframe
                df_acc_data = pd.read_csv(DATASET_PATH + party + "/" + file)
                ids = df_acc_data["id"].values
                df_acc_data = df_acc_data[["tweet", "source_party", "source_account", "date"]].reset_index().drop(columns='index')
                
                df_pred = pd.read_csv(DATASET_PATH_PREDICTIONS + party + "/" + file)
                df_pred = df_pred[df_pred["id"].isin(ids)][["pred"]].reset_index().drop(columns='index')
                
                matched_df = pd.concat([df_acc_data, df_pred], axis=1)
                matched_df = matched_df.rename(columns={"pred": "sentiment", "tweet": "text"})
                
                df = pd.concat([df, matched_df], axis=0)

df = df.reset_index().drop(columns='index')

In [5]:
df

Unnamed: 0,text,source_party,source_account,date,sentiment
0,Wichtige wissenschaftliche Erkenntnis- nun mus...,SPD,KarambaDiaby,2021-01-09 19:35:29,0
1,@KarambaDiaby @HalleSpd @SPD_LSA Ich gratulier...,SPD,KarambaDiaby,2021-01-09 17:09:28,0
2,@KarambaDiaby @HalleSpd @SPD_LSA Herzlichen Gl...,SPD,KarambaDiaby,2021-01-09 13:16:13,0
3,@KarambaDiaby @HalleSpd @SPD_LSA Wann werden k...,SPD,KarambaDiaby,2021-01-09 12:32:40,1
4,@KarambaDiaby @HalleSpd @SPD_LSA Glückwunsch.,SPD,KarambaDiaby,2021-01-09 12:13:06,0
...,...,...,...,...,...
707236,@b_riexinger Klima oder Verkehr fast gleich......,LINKE,b_riexinger,2021-12-17 08:19:23,1
707237,@b_riexinger @Linksfraktion Na ob das noch lan...,LINKE,b_riexinger,2021-12-17 08:18:07,1
707238,@b_riexinger Ich wünsch Dir viel Erfolg.,LINKE,b_riexinger,2021-12-17 07:47:59,0
707239,"@b_riexinger Nun, da gibt es ja genügend zu tu...",LINKE,b_riexinger,2021-12-17 02:07:26,2


Clean Text

In [6]:
def remove_twitter_mentions(text):
    regex = r"@[a-zA-Z0-9_]+"
    return re.sub(regex, " ", text)

In [7]:
def clean_text(text):
    RE_WSPACE = re.compile(r"\s+", re.IGNORECASE)
    RE_TAGS = re.compile(r"<[^>]+>")
    RE_ASCII = re.compile(r"[^A-Za-zÀ-ž ]", re.IGNORECASE)
    RE_SINGLECHAR = re.compile(r"\b[A-Za-zÀ-ž]\b", re.IGNORECASE)
    
    text = re.sub(RE_TAGS, " ", text)
    text = re.sub(RE_ASCII, " ", text)
    text = re.sub(RE_SINGLECHAR, " ", text)
    text = re.sub(RE_WSPACE, " ", text)
    
    word_tokens = word_tokenize(text)
    text = [word.lower() for word in word_tokens]
    text = " ".join(text)
    
    return text

In [8]:
df["text"] = df["text"].apply(remove_twitter_mentions)

In [9]:
df["text"] = df["text"].apply(clean_text)

### 2. Calculate Word Frequencies

In [13]:
def get_word_frequencies(data):
    text = data['text']
    count_vect = CountVectorizer(max_features=100)
    tf_matrix = count_vect.fit_transform(text)
    feature_names = list(count_vect.vocabulary_.keys())

    tf_df = pd.DataFrame(tf_matrix.toarray(), columns=feature_names)
    word_counts = tf_df.sum(axis=0)
    word_counts_sorted = word_counts.sort_values(ascending=False)

    # Die ersten 50 häufigsten Wörter ausgeben
    top_words = word_counts_sorted.head(50).to_frame().reset_index()
    top_words.columns = ['word', 'count']
    top_words.style.format({'count': '{:,}'})
    return top_words

#### 2.1 Calculate Word Frequencies for Entire Dataset

In [14]:
print(get_word_frequencies(df))

        word   count
0        der  439060
1      warum  321580
2      schon  287935
3       über  245206
4        was  240565
5        mit  228379
6      keine  164298
7       mehr  162743
8      alles  158274
9         er  150885
10    können  140028
11       aus  120761
12       bei  116420
13       wir  113006
14     nicht  112205
15       vor   99765
16       war   91454
17       ein   91096
18       mir   84633
19      muss   83107
20    wieder   82911
21       zum   81077
22      wird   80559
23     jetzt   78652
24      aber   77833
25       den   77792
26  menschen   76228
27        im   74839
28        ja   74605
29       sie   74507
30     diese   71456
31       dem   70460
32       als   70269
33       die   63710
34        um   63683
35     einen   61312
36      viel   61127
37      sind   61109
38      nach   59732
39       mal   59497
40       auf   54561
41      wenn   54416
42     einer   53597
43    machen   52836
44      ihre   52263
45       des   51736
46      kann 

#### 2.2 Calculate Word Frequencies for specific time/sentiment

In [19]:
print(get_word_frequencies(df[df["source_party"] == "FDP"][pd.to_datetime(df["date"]).dt.month == 11][df["sentiment"] == 0]))

       word  count
0      muss    501
1      alle    366
2       zur    311
3        zu    268
4        du    232
5      herr    195
6       der    193
7    können    167
8       vor    151
9       ein    141
10      das    135
11    schon    132
12     wird    126
13    jetzt    121
14      von    121
15      den    114
16      fdp    114
17     kann    114
18       an    110
19    haben    107
20    https    103
21    bitte    103
22     ihre     88
23      aus     86
24     doch     83
25      wie     82
26      hat     81
27     gute     81
28     mich     77
29      des     77
30      mir     76
31      war     72
32      und     70
33     dank     68
34     wenn     66
35      ich     65
36      als     65
37       in     63
38    immer     63
39     sein     62
40    alles     59
41     habe     59
42     eine     58
43     dann     55
44     sind     54
45  lindner     53
46     aber     53
47      dem     50
48      amp     48
49   wieder     48


  print(get_word_frequencies(df[df["source_party"] == "FDP"][pd.to_datetime(df["date"]).dt.month == 11][df["sentiment"] == 0]))
