# Preprocessing of Reddit

The preprocessing script contains a section with all preprocessing functions. These should be easily adaptable to working with text that isn't tweets. To accomodate for both English and Danish language some functions have additional arguments 'stops' and 'langStemmer' so we can use the same function in both cases.

In [1]:
#Importing packages 
import pandas as pd
import spacy
import re
import nltk
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize
nltk.download('punkt')

#Detect langugage
from langdetect import detect
from langdetect.lang_detect_exception import LangDetectException

# for Danish
from nltk.stem.snowball import DanishStemmer

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\stine\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# READ DATA 
data = pd.read_csv("C:/Users/stine/OneDrive/melchior_job/female_pol_data/comments_all_female_updated.csv")

### Dropping duplicates

In [3]:
# dropping ALL duplicate values
data.drop_duplicates(subset = "comment_id", #'comment_id' for comments and 'id' for posts
                     keep = 'first', inplace = True)

In [4]:
# COMMENT only: remove deleted comments
data = data[data['comment_body'] != '[deleted]']

In [5]:
# POSTS only: concatenate title and body (only for post-csvs)
data["concatenated_text"]= data["title"].str.cat(data["body"], sep =". ", na_rep = "")

KeyError: 'title'

# Language Detection 

In [6]:
for index, row in data.iterrows():
    try:
        data.loc[index, 'lang'] = detect(row['comment_body']) # 'comment_body' for comments 'concatenated_text' for posts
    except LangDetectException:
        data.loc[index, 'lang'] = None


data['lang'].value_counts()

da    3916
no     347
en      74
sl      55
de      31
sv      23
nl      13
et      12
sk      11
af      11
id      11
pt       9
sw       9
it       9
tr       8
so       8
tl       7
cy       6
hr       6
hu       5
fi       4
pl       4
fr       4
sq       4
es       3
vi       2
ca       2
ro       1
cs       1
Name: lang, dtype: int64

In [None]:
#Continue only with the scandinavian tweets 
tweets_da = data[data.lang.isin(['da', 'no', 'sv'])]

# Functions

In [4]:
def clean_text(corpus, stops):
    no_urls = [re.sub(r"http\S+", "", text) for text in corpus] # remove links
    only_letters = [re.sub("(#[A-Za-z]+)|(@[A-Za-z]+)|([^A-Za-z \t])|(\w+:\/\/\S+)", ' ' , text) for text in no_urls] # remove numbers
    only_letters = [text.replace('\n', ' ') for text in only_letters] # remove newline characters
    lowercased_tweets = [text.lower().split() for text in only_letters] # lowercase all words
    no_stopwords = [[w for w in text if not w in stops] for text in lowercased_tweets] # remove stopwords
    tweets = [" ".join(text) for text in no_stopwords] # join the lowered and cleaned words
    return tweets

In [5]:
# tokenization of text
def tokenize_text(tweets):
    tokens = [word_tokenize(text) for text in tweets] # tokenizes every tweet using nltk
    return tokens

In [6]:
# lemmatization of text using the nlp() method from SpaCy.
def lemmatize_text(sent):
    lemmas = [x.lemma_ for x in nlp(sent)]
    return lemmas

In [7]:
# stemming of text using nltk
def stem_text(sent, langStemmer):
    stemmer = langStemmer
    stem = [stemmer.stem(token) for token in sent]
    return stem

# Danish Preprocessing

In [3]:
nlp = spacy.load("da_core_news_sm")

# specify and extend stopwords
stopwords_da = stopwords.words("danish")  
stopwords_da.extend(['paa', 'saa', 'vaere',  'rt', 'ogsaa', 'faa', 'faar', 'nok', 'mt', 'gt'])


# exchange special characters æ, ø, å for their 'international' equivalents.
tweets_da['clean_text'] = [re.sub('ø', 'oe', text) for text in tweets_da['comment_body']] # concatenated_text for posts and comment_body for comments
tweets_da['clean_text'] = [re.sub('æ', 'ae', text) for text in tweets_da['clean_text']]
tweets_da['clean_text'] = [re.sub('å', 'aa', text) for text in tweets_da['clean_text']]

['og', 'i', 'jeg', 'det', 'at', 'en', 'den', 'til', 'er', 'som', 'på', 'de', 'med', 'han', 'af', 'for', 'ikke', 'der', 'var', 'mig', 'sig', 'men', 'et', 'har', 'om', 'vi', 'min', 'havde', 'ham', 'hun', 'nu', 'over', 'da', 'fra', 'du', 'ud', 'sin', 'dem', 'os', 'op', 'man', 'hans', 'hvor', 'eller', 'hvad', 'skal', 'selv', 'her', 'alle', 'vil', 'blev', 'kunne', 'ind', 'når', 'være', 'dog', 'noget', 'ville', 'jo', 'deres', 'efter', 'ned', 'skulle', 'denne', 'end', 'dette', 'mit', 'også', 'under', 'have', 'dig', 'anden', 'hende', 'mine', 'alt', 'meget', 'sit', 'sine', 'vor', 'mod', 'disse', 'hvis', 'din', 'nogle', 'hos', 'blive', 'mange', 'ad', 'bliver', 'hendes', 'været', 'thi', 'jer', 'sådan', 'paa', 'saa', 'vaere', 'rt', 'ogsaa', 'faa', 'faar', 'nok', 'mt', 'gt']


NameError: name 'tweets_da' is not defined

In [None]:
# apply cleaning function
tweets_da['clean_text'] = clean_text(tweets_da['clean_text'], stops=stopwords_da)

# apply tokenization
tweets_da['clean_text'] = tokenize_text(tweets_da['clean_text'])

# apply lemmatization
lemmas = []
for tweet in tweets_da['clean_text']:
    lemma = [lemmatize_text(x) for x in tweet]
    lemmas.append([item for sublist in lemma for item in sublist])
tweets_da['lemmatized_text'] = lemmas

# apply stemming
stems = []
for tweet in tweets_da['clean_text']:
    stems.append(stem_text(tweet, langStemmer=DanishStemmer()))
tweets_da['stemmed_text'] = stems

In [22]:
tweets_da.head()

NameError: name 'tweets_da' is not defined

In [None]:
data.to_csv("C:/Users/stine/OneDrive/melchior_job/preprocessed/comments_new_female_nodups_prepro_lang.csv")