# Notebook: Create Term Frequency Analysis 

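This notebook calculates term frequencies for the tweet dataset: overall, per predicted sentiment, per sentiment and party, and per sentiment and month. The results are written as CSV files to the output directory.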
<br>**Contributors:** [Nils Hellwig](https://github.com/NilsHellwig/) | [Markus Bink](https://github.com/MarkusBink/)

## Packages

In [1]:
from sklearn.feature_extraction.text import CountVectorizer
from reportlab.graphics import renderPDF
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from svglib.svglib import svg2rlg
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import random
import nltk
import re
import os

## Parameters

In [2]:
# Party identifiers; each one is also used as the name of a per-party
# subdirectory inside the two dataset folders below.
PARTIES = [
    "SPD",
    "CDU_CSU",
    "GRUENE",
    "FDP",
    "AFD",
    "LINKE",
]

# Folder with the per-account CSV files that carry the "pred" column.
DATASET_PATH_PREDICTIONS = "../Datasets/complete_dataset_predictions/"
# Folder with the per-account CSV files that carry the tweets themselves.
DATASET_PATH = "../Datasets/dataset/"
# Folder the word-frequency CSV files are written to.
OUTPUT_DIR = "../Word Frequencies/"

## Setup Packages

In [3]:
# Download the NLTK resources this notebook relies on: the "punkt" tokenizer
# data and the stop word corpus.
for nltk_resource in ("punkt", "stopwords"):
    nltk.download(nltk_resource)

# German stop words, extended with frequent tokens that carry no meaning for
# the frequency analysis (URL fragments and a few very common words).
STOPWORDS = set(stopwords.words("german")) | {
    "mehr",
    "heute",
    "https",
    "bundestag",
    "thread",
    "anzeigen",
    "http",
    "www",
    "co",
}

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/markusbink/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/markusbink/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


## Code

### 1. Load Data

In [18]:
def _load_account_frame(tweets_csv, predictions_csv):
    """Load one account's tweets and attach the predicted sentiment by tweet id.

    Parameters
    ----------
    tweets_csv : str
        CSV file providing the columns "id", "tweet", "source_party",
        "source_account" and "date".
    predictions_csv : str
        CSV file providing the columns "id" and "pred".

    Returns
    -------
    pandas.DataFrame
        Columns "text", "source_party", "source_account", "date", "sentiment",
        one row per tweet, in the order of ``tweets_csv``. Tweets without a
        prediction keep a missing sentiment.
    """
    tweets = pd.read_csv(tweets_csv)[["id", "tweet", "source_party", "source_account", "date"]]
    # Keep a single prediction per id so the join below cannot multiply tweets.
    predictions = pd.read_csv(predictions_csv)[["id", "pred"]].drop_duplicates(subset="id")

    # Join on the tweet id instead of pasting the two frames side by side:
    # positional alignment silently attaches sentiments to the wrong tweets as
    # soon as the two files differ in row order or coverage.
    matched = tweets.merge(predictions, on="id", how="left")
    matched = matched.rename(columns={"pred": "sentiment", "tweet": "text"})
    return matched[["text", "source_party", "source_account", "date", "sentiment"]]


# Collect one frame per account and concatenate once at the end; growing a
# DataFrame with concat inside the loop copies all previous rows every time.
account_frames = []

for party in PARTIES:
    party_predictions_dir = DATASET_PATH_PREDICTIONS + party
    # Only the CSV files directly inside the party folder are used; a missing
    # folder simply contributes no files.
    _, _, files = next(os.walk(party_predictions_dir), (party_predictions_dir, [], []))
    for file in sorted(files):  # sorted for a reproducible row order
        if not file.endswith('.csv'):
            continue
        account_frames.append(
            _load_account_frame(
                DATASET_PATH + party + "/" + file,
                party_predictions_dir + "/" + file,
            )
        )

if account_frames:
    df = pd.concat(account_frames, axis=0, ignore_index=True)
else:
    # No input files found: keep the expected schema so later cells still run.
    df = pd.DataFrame(columns=["text", "source_party", "source_account", "date", "sentiment"])

In [5]:
# Display the combined dataframe built by the loading cell.
df

Unnamed: 0,text,source_party,source_account,date,sentiment
0,Wichtige wissenschaftliche Erkenntnis- nun mus...,SPD,KarambaDiaby,2021-01-09 19:35:29,0
1,@KarambaDiaby @HalleSpd @SPD_LSA Ich gratulier...,SPD,KarambaDiaby,2021-01-09 17:09:28,0
2,@KarambaDiaby @HalleSpd @SPD_LSA Herzlichen Gl...,SPD,KarambaDiaby,2021-01-09 13:16:13,0
3,@KarambaDiaby @HalleSpd @SPD_LSA Wann werden k...,SPD,KarambaDiaby,2021-01-09 12:32:40,1
4,@KarambaDiaby @HalleSpd @SPD_LSA Glückwunsch.,SPD,KarambaDiaby,2021-01-09 12:13:06,0
...,...,...,...,...,...
707236,@b_riexinger Klima oder Verkehr fast gleich......,LINKE,b_riexinger,2021-12-17 08:19:23,1
707237,@b_riexinger @Linksfraktion Na ob das noch lan...,LINKE,b_riexinger,2021-12-17 08:18:07,1
707238,@b_riexinger Ich wünsch Dir viel Erfolg.,LINKE,b_riexinger,2021-12-17 07:47:59,0
707239,"@b_riexinger Nun, da gibt es ja genügend zu tu...",LINKE,b_riexinger,2021-12-17 02:07:26,2


#### Clean Text

In [6]:
def remove_twitter_mentions(text):
    """Replace every Twitter mention in ``text`` with a single space.

    A mention is an "@" followed by one or more ASCII letters, digits or
    underscores. Each mention becomes one space, so surrounding whitespace is
    not collapsed here.

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    str
        The text with all mentions replaced by " ".
    """
    mention_pattern = re.compile(r"@[a-zA-Z0-9_]+")
    return mention_pattern.sub(" ", text)

In [7]:
def clean_text(text):
    """Normalize a tweet for term counting.

    Steps, in order:
      1. replace HTML-like tags (``<...>``) with a space,
      2. replace every character that is not a letter in A-Z, a-z, À-ž or a
         space with a space,
      3. replace words consisting of a single letter with a space,
      4. collapse runs of whitespace into one space,
      5. tokenize with ``word_tokenize``, lowercase every token and drop the
         tokens contained in ``STOPWORDS``.

    Parameters
    ----------
    text : str
        Tweet text (mentions are expected to be removed beforehand).

    Returns
    -------
    str
        The remaining lowercase tokens joined by single spaces.
    """
    # The order matters: tags must be stripped before "<" and ">" are removed
    # by the character filter.
    substitution_patterns = (
        re.compile(r"<[^>]+>"),
        re.compile(r"[^A-Za-zÀ-ž ]", re.IGNORECASE),
        re.compile(r"\b[A-Za-zÀ-ž]\b", re.IGNORECASE),
        re.compile(r"\s+", re.IGNORECASE),
    )
    for pattern in substitution_patterns:
        text = pattern.sub(" ", text)

    kept_tokens = []
    for token in word_tokenize(text):
        lowered = token.lower()
        if lowered not in STOPWORDS:
            kept_tokens.append(lowered)

    return " ".join(kept_tokens)

In [8]:
# Strip the @mentions from every tweet (element-wise over the "text" column).
df["text"] = df["text"].map(remove_twitter_mentions)

In [9]:
# Normalize every tweet with clean_text (element-wise over the "text" column).
df["text"] = df["text"].map(clean_text)

In [10]:
# Display the dataframe again to inspect the cleaned "text" column.
df

Unnamed: 0,text,source_party,source_account,date,sentiment
0,wichtige wissenschaftliche erkenntnis schnell ...,SPD,KarambaDiaby,2021-01-09 19:35:29,0
1,gratuliere linken,SPD,KarambaDiaby,2021-01-09 17:09:28,0
2,herzlichen glückwunsch erfolg,SPD,KarambaDiaby,2021-01-09 13:16:13,0
3,wann konkret massiv steuern sozialabgaben gese...,SPD,KarambaDiaby,2021-01-09 12:32:40,1
4,glückwunsch,SPD,KarambaDiaby,2021-01-09 12:13:06,0
...,...,...,...,...,...
707236,klima verkehr fast gleich hauptsache pöstchen,LINKE,b_riexinger,2021-12-17 08:19:23,1
707237,na lange gut geht gruppierungen querdenker lin...,LINKE,b_riexinger,2021-12-17 08:18:07,1
707238,wünsch erfolg,LINKE,b_riexinger,2021-12-17 07:47:59,0
707239,gibt ja genügend tuen paris macht,LINKE,b_riexinger,2021-12-17 02:07:26,2


### 2. Calculate Word Frequencies

In [11]:
def get_sentiment_as_name(sentiment_code):
    """Translate a numeric sentiment code into its label.

    Parameters
    ----------
    sentiment_code : int
        0, 1 or 2.

    Returns
    -------
    str or None
        "positive" for 0, "negative" for 1, "neutral" for 2; ``None`` for any
        other value.
    """
    labels_by_code = ((0, "positive"), (1, "negative"), (2, "neutral"))
    for code, label in labels_by_code:
        if sentiment_code == code:
            return label
    return None

In [12]:
def get_month_code(month_num):
    """Return the English month name for a month number.

    Parameters
    ----------
    month_num : int
        Month number, 1 (January) through 12 (December).

    Returns
    -------
    str or None
        The month name, or ``None`` when ``month_num`` is not in 1..12.
    """
    month_names = (
        "January", "February", "March", "April", "May", "June",
        "July", "August", "September", "October", "November", "December",
    )
    for number, name in enumerate(month_names, start=1):
        if month_num == number:
            return name
    return None

In [13]:
def get_word_frequencies(data, max_features=100):
    """Count the most frequent terms in the "text" column of ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a "text" column holding the (already cleaned) documents.
    max_features : int or None, default 100
        Passed to ``CountVectorizer``: the vocabulary is limited to this many
        terms, chosen by total term frequency.

    Returns
    -------
    pandas.DataFrame
        Columns "word" and "count", sorted by "count" in descending order,
        with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        Raised by ``CountVectorizer`` when no vocabulary can be built, e.g.
        for an empty selection.
    """
    count_vect = CountVectorizer(max_features=max_features)
    tf_matrix = count_vect.fit_transform(data['text'])

    # vocabulary_ maps term -> column index. Its key order is NOT the column
    # order, so the terms have to be ordered by their index; otherwise each
    # word is paired with the count of a different word.
    vocabulary = count_vect.vocabulary_
    feature_names = sorted(vocabulary, key=vocabulary.get)

    # Sum the sparse matrix directly instead of densifying
    # (documents x terms) first.
    word_counts = np.asarray(tf_matrix.sum(axis=0)).ravel()

    top_words = (
        pd.DataFrame({'word': feature_names, 'count': word_counts})
        .sort_values('count', ascending=False)
        .reset_index(drop=True)
    )
    return top_words

In [14]:
# Create the output folder if needed so a fresh "Run All" does not fail on
# the first to_csv call.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Term frequencies over the complete dataset.
get_word_frequencies(df).to_csv(OUTPUT_DIR + "dataset_word_freq.csv")

In [15]:
# One CSV per sentiment class (0, 1, 2), named after the sentiment label.
for sentiment in (0, 1, 2):
    sentiment_rows = df[df["sentiment"] == sentiment]
    target_csv = OUTPUT_DIR + f'dataset_word_freq_sentiment_{get_sentiment_as_name(sentiment)}.csv'
    get_word_frequencies(sentiment_rows).to_csv(target_csv)

In [16]:
# One CSV per (sentiment, party) combination.
for sentiment in (0, 1, 2):
    for party in PARTIES:
        selection_mask = (df["sentiment"] == sentiment) & (df["source_party"] == party)
        selected_rows = df[selection_mask].reset_index(drop=True)
        # NOTE(review): the extra reset_index() adds an "index" column that
        # the other exports do not have; kept so the file layout is unchanged.
        frequencies = get_word_frequencies(selected_rows).reset_index()
        target_csv = OUTPUT_DIR + f'dataset_word_freq_sentiment_{get_sentiment_as_name(sentiment)}_party_{party}.csv'
        frequencies.to_csv(target_csv)

In [17]:
# Parse the dates once; the month of each tweet does not change between
# iterations, so re-parsing the whole column for every (sentiment, month)
# pair is wasted work.
tweet_months = pd.to_datetime(df['date']).dt.month

# One CSV per (sentiment, calendar month) combination.
for sentiment in [0, 1, 2]:
    sentiment_mask = df["sentiment"] == sentiment
    for month in range(1, 13):
        selected_rows = df[sentiment_mask & (tweet_months == month)]
        target_csv = OUTPUT_DIR + f'dataset_word_freq_sentiment_{get_sentiment_as_name(sentiment)}_month_{get_month_code(month)}.csv'
        get_word_frequencies(selected_rows).to_csv(target_csv)