# Notebook: Clean Dataset Text

## 1. Load Packages

In [1]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import pandas as pd
import numpy as np
import spacy
import random
import math
import nltk 
import re
import os

## 2. Constants / Setup

In [2]:
#!python -m spacy download de_core_news_sm

In [3]:
# German spaCy pipeline; bound to `nlp`, which the lemmatization helper further down calls.
nlp = spacy.load("de_core_news_sm")

In [4]:
# Path stems (no file extension) of the datasets, relative to this notebook.
# ".csv" is appended when reading and "_cleaned.csv" when writing the results.

# Tweet text datasets.
DATASET_POLITICAL_ACCOUNTS_PATH = "../Datasets/tweets_political_accounts"
DATASET_MENTIONS_PATH = "../Datasets/tweets_mentions"

# Image prediction datasets (text extracted from tweet images).
DATASET_POLITICAL_ACCOUNTS_IMG_PATH = "../Datasets/img_dataset_political_accounts/predictions"
DATASET_MENTIONS_IMG_PATH = "../Datasets/img_dataset_mentions/predictions"

In [5]:
# Fetch the NLTK resources this notebook relies on: 'stopwords' backs the
# stopwords.words(...) calls in the next cell; 'punkt' is presumably the tokenizer
# model behind nltk.word_tokenize — TODO confirm for the installed NLTK version.
# The value of the last call is what the cell shows as its output.
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [6]:
# Read the corpus through a private alias: this cell rebinds the name `stopwords`
# to a plain set, so going through the originally imported name would raise
# AttributeError (`set` has no `.words`) as soon as the cell is executed a second time.
from nltk.corpus import stopwords as _nltk_stopwords

# German + English NLTK stopwords ...
stopwords = set(_nltk_stopwords.words("german")).union(_nltk_stopwords.words("english"))

# ... plus corpus-specific noise: URL fragments, Twitter UI text and filler words.
stopwords.update([
    "web", "zus", "ee", "en", "ja", "nein", "mal", "schon", "immer", "utm",
    "the", "to", "mehr", "com", "geht", "heute", "https", "thread", "anzeigen",
    "http", "www", "co", "de", "html", "nehmen", "habt", "hast", "tut", "gibt",
    "wäre", "steht", "ganz", "usw", "ging", "seid", "liegt", "sagen", "finde",
    "denen", "übrigens", "trotzdem", "darauf", "hätte", "wer", "sogar", "weiß",
])

## 3. Load Dataset

In [7]:
# Tweet datasets: posts by political accounts and tweets mentioning them.
df_politicians = pd.read_csv(f"{DATASET_POLITICAL_ACCOUNTS_PATH}.csv")
df_mentions = pd.read_csv(f"{DATASET_MENTIONS_PATH}.csv")
# Let's only consider Images with Text
def get_n_tokens_for_text(extracted_text):
    """Count the word tokens in a piece of extracted text.

    Args:
        extracted_text: The value to measure. A float NaN is treated as
            "no text"; any other value is converted with ``str()`` first.

    Returns:
        0 for a float NaN, otherwise the number of tokens produced by
        ``nltk.word_tokenize``.
    """
    is_missing = isinstance(extracted_text, float) and math.isnan(extracted_text)
    if is_missing:
        return 0

    tokens = nltk.word_tokenize(str(extracted_text))
    return len(tokens)
def _load_image_texts(csv_path):
    """Load an image prediction CSV and keep only the rows that contain text.

    Args:
        csv_path: Path of the CSV file; it must have an 'extracted_text' column.

    Returns:
        A DataFrame in which 'extracted_text' is renamed to 'text' and every row
        whose text yields zero tokens (see ``get_n_tokens_for_text``) is removed.
        The original row index is kept.
    """
    frame = pd.read_csv(csv_path).rename(columns={'extracted_text': 'text'})
    has_text = frame["text"].apply(get_n_tokens_for_text) != 0
    # Return an explicit copy: later cells assign new columns to this frame, which
    # would otherwise trigger pandas' SettingWithCopyWarning on the filtered result.
    return frame[has_text].copy()


df_political_accounts_img = _load_image_texts(DATASET_POLITICAL_ACCOUNTS_IMG_PATH + ".csv")
df_mentions_img = _load_image_texts(DATASET_MENTIONS_IMG_PATH + ".csv")

## 4. Code

In [8]:
# Loads the German pipeline again; the setup section already binds `nlp` to the
# same model name, so this only rebinds the variable.
nlp = spacy.load("de_core_news_sm")

#### Clean Data

In [9]:
# Parse the 'date' column of all four datasets into pandas datetimes.
for frame in (df_politicians, df_mentions, df_political_accounts_img, df_mentions_img):
    frame['date'] = pd.to_datetime(frame['date'])

In [10]:
# Seed 'cleaned_text' from 'text'; anything that is not a str (e.g. NaN for a
# missing value) becomes an empty string so the cleaning functions only see strings.
for frame in (df_politicians, df_mentions, df_political_accounts_img, df_mentions_img):
    frame["cleaned_text"] = frame["text"].apply(
        lambda value: value if type(value) == str else ""
    )

In [11]:
# Patterns are compiled once when the cell runs instead of on every call.
_RE_MENTION = re.compile(r'@\w+')
_RE_NON_LETTER = re.compile(r"[^A-Za-zÀ-ž ]", re.IGNORECASE)
_RE_SINGLE_LETTER = re.compile(r"\b[A-Za-zÀ-ž]\b", re.IGNORECASE)
_RE_WHITESPACE = re.compile(r"\s+")


def clean_text(text):
    """Reduce raw text to letter-only words separated by single spaces.

    Steps, in order:
      1. remove @mentions,
      2. replace every character that is not a letter (A-Z, a-z, À-ž) or a space
         with a space; this also removes digits, punctuation, '#' and line breaks,
      3. replace stand-alone single letters with a space,
      4. collapse runs of whitespace and strip leading/trailing spaces.

    Args:
        text: The string to clean. Must be a ``str``.

    Returns:
        The cleaned string; the original casing is kept.
    """
    text = _RE_MENTION.sub('', text)
    # Step 2 already replaces '.' and ',', so no separate punctuation pass is needed.
    text = _RE_NON_LETTER.sub(" ", text)
    text = _RE_SINGLE_LETTER.sub(" ", text)
    # Strip so that a removed leading mention or trailing punctuation mark does not
    # leave a dangling space at either end of the result.
    return _RE_WHITESPACE.sub(" ", text).strip()

In [12]:
# Run the regex-based cleaning over the 'cleaned_text' column of every dataset.
for frame in (df_politicians, df_mentions, df_political_accounts_img, df_mentions_img):
    frame["cleaned_text"] = frame["cleaned_text"].apply(clean_text)

In [13]:
def lemmatize_remove_stopwords_text(text):
    """Lemmatize ``text`` with the spaCy pipeline ``nlp`` and drop stopwords.

    Args:
        text: The string to process.

    Returns:
        The lower-cased lemmas of the kept tokens, joined by single spaces.
        A token is dropped when its lower-cased surface form is contained in the
        notebook-level ``stopwords`` collection or when it is whitespace-only.
    """
    doc = nlp(text)
    kept_lemmas = [
        token.lemma_.lower()
        for token in doc
        # Whitespace-only tokens (e.g. produced by a leading space) are never in the
        # stopword set and would otherwise show up as stray spaces in the result.
        if not token.is_space and token.text.lower() not in stopwords
    ]
    return ' '.join(kept_lemmas)

In [14]:
# Lemmatize the cleaned text of every dataset and remove stopwords.
for frame in (df_politicians, df_mentions, df_political_accounts_img, df_mentions_img):
    frame["cleaned_text"] = frame["cleaned_text"].apply(lemmatize_remove_stopwords_text)

In [15]:
# Display the politicians frame to spot-check the new 'cleaned_text' column.
df_politicians

Unnamed: 0.1,Unnamed: 0,UserScreenName,source_account,date,Text,text,Emojis,Comments,Likes,Retweets,photos,Tweet URL,id,source_party,sentiment,tweet_id,cleaned_text
0,0,AfD Berlin,AfDBerlin,2021-03-26 21:07:22,AfD Berlin\n@AfDBerlin\n·\n26. März,AfD wirkt.\n\nSchluss mit dem #Gendergaga\nMDR...,,4.0,28.0,132.0,['https://pbs.twimg.com/profile_images/1037343...,https://twitter.com/AfDBerlin/status/137555499...,1,AFD,2,1375554998461984769,afd wirken schluss gendergaga mdr aktuell märz...
1,1,AfD Berlin,AfDBerlin,2021-03-27 07:20:27,AfD Berlin\n@AfDBerlin\n·\n27. März,Im Herbst wird gewählt.\nSchluss mit den Recht...,,10.0,20.0,112.0,['https://pbs.twimg.com/card_img/1471780757332...,https://twitter.com/AfDBerlin/status/137570928...,2,AFD,2,1375709283850063876,herbst wählen schluss rechtsbrüch regierung bi...
2,2,AfD Berlin,AfDBerlin,2021-03-31 07:14:04,AfD Berlin\n@AfDBerlin\n·\n31. März,Behördenwillkür\nFlüchtlingsheime durchgedrück...,,3.0,13.0,34.0,['https://pbs.twimg.com/media/ExylKvEU8AgowSU?...,https://twitter.com/AfDBerlin/status/137715723...,3,AFD,1,1377157230630301700,behördenwillkür flüchtlingsheim durchgedrücken...
3,3,AfD Berlin,AfDBerlin,2021-04-01 14:29:00,AfD Berlin\n@AfDBerlin\n·\n1. Apr.,Aus Raider wird jetzt Twix \n\nLeider kein #Ap...,,1.0,4.0,17.0,['https://pbs.twimg.com/card_img/1472910546907...,https://twitter.com/AfDBerlin/status/137762907...,4,AFD,2,1377629070662373376,raider twix leider aprilscherz bz berlin neu n...
4,4,AfD Berlin,AfDBerlin,2021-04-01 05:02:10,AfD Berlin\n@AfDBerlin\n·\n1. Apr.,Gendern geht („*innen“)\nImpfen geht nicht.\nD...,,1.0,3.0,16.0,['https://pbs.twimg.com/profile_images/9706413...,https://twitter.com/AfDBerlin/status/137748642...,5,AFD,1,1377486425499832320,gender innen impfe regierung blamieren deutsch...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58859,58859,Tino Chrupalla,Tino_Chrupalla,2021-12-04 17:26:46,Tino Chrupalla\n@Tino_Chrupalla\n·\n4. Dez. 2021,Friedlicher Protest gegen einen #Impfzwang ist...,,265.0,122.0,578.0,[],https://twitter.com/Tino_Chrupalla/status/1467...,61802,AFD,1,1467183612034433034,friedlicher protest impfzwang grundrecht schut...
58860,58860,Tino Chrupalla,Tino_Chrupalla,2021-12-13 16:30:19,Tino Chrupalla\n@Tino_Chrupalla\n·\n13. Dez. 2021,@OlafScholz\n muss sich endlich klar zu Nord S...,,4.0,2.0,11.0,[],https://twitter.com/Tino_Chrupalla/status/1470...,61803,AFD,0,1470430897199628298,endlich klar nord stream bekennen deutsch in...
58861,58861,Tino Chrupalla,Tino_Chrupalla,2021-12-17 14:02:24,Tino Chrupalla\n@Tino_Chrupalla\n·\n17. Dez. 2021,Mit \n@_FriedrichMerz\n gibt es keine konserva...,,188.0,227.0,1.008,[],https://twitter.com/Tino_Chrupalla/status/1471...,61804,AFD,1,1471843224251740177,konservativ erneuerung ausgrenzung gesund impf...
58862,58862,Tino Chrupalla,Tino_Chrupalla,2021-12-19 09:27:23,Tino Chrupalla\n@Tino_Chrupalla\n·\n19. Dez. 2021,Wir wünschen Ihnen und Ihrer Familie einen bes...,,449.0,346.0,2.648,['https://pbs.twimg.com/media/FG9dwrcXIAUF5BP?...,https://twitter.com/Tino_Chrupalla/status/1472...,61805,AFD,0,1472498789122514945,wünschen familie besinnlich advent


In [16]:
# Write every processed frame next to its input as "<path stem>_cleaned.csv".
# to_csv runs with its defaults, so the DataFrame index is written as the first column.
# NOTE(review): the 'Unnamed: 0' / 'Unnamed: 0.1' columns in the displayed frame look
# like indexes from earlier save/load round trips — confirm whether index=False is wanted.
_frames_and_stems = (
    (df_politicians, DATASET_POLITICAL_ACCOUNTS_PATH),
    (df_mentions, DATASET_MENTIONS_PATH),
    (df_political_accounts_img, DATASET_POLITICAL_ACCOUNTS_IMG_PATH),
    (df_mentions_img, DATASET_MENTIONS_IMG_PATH),
)
for frame, path_stem in _frames_and_stems:
    frame.to_csv(path_stem + "_cleaned.csv")