# Data preparation and association matrix
This notebook prepares the speech data: punctuation is removed, the words are lemmatized and converted to lowercase, words with fewer than three characters are dropped, and stopwords are removed.
The association matrix is then computed from the prepared speeches.

In [1]:
import jsonlines
import json
import copy
import string
import spacy
nlp = spacy.load('de_core_news_sm')
from collections import Counter
from collections import defaultdict
import pandas as pd
import networkx as nx
import tables
import numpy as np

In [2]:
# Load the raw speeches: every line of each JSON Lines file becomes one list entry,
# with the entries of speeches_19.jsonl placed before those of speeches_20.jsonl.
speeches_19_20 = []
for source_path in ('data/speeches_19.jsonl', 'data/speeches_20.jsonl'):
    with jsonlines.open(source_path) as reader:
        speeches_19_20.extend(reader.iter())

In [8]:
# Work on a deep copy so the preprocessing cells below, which overwrite the 'text'
# entries of speeches_19_20_prep, leave the raw speeches_19_20 list untouched.
speeches_19_20_prep = copy.deepcopy(speeches_19_20)

In [5]:
# delete punctuation
# Characters are matched one at a time, so only the individual characters of this
# string matter; the multi-character pieces ('--', '––') add nothing beyond '-' and '–'.
punctuation = string.punctuation + '–' + '-' + '--' + '––'

# A translation table that maps every punctuation character to "deleted" lets
# str.translate strip them in a single pass, instead of rebuilding the text one
# character at a time and writing it back on every character.
punctuation_table = str.maketrans('', '', punctuation)
for rede in speeches_19_20_prep:
    rede['text'] = rede['text'].translate(punctuation_table)

In [6]:
# lemmatization
# Each speech is run through the spaCy pipeline and its text is replaced by the
# lemmas of all tokens, joined with single spaces.
for speech in speeches_19_20_prep:
    parsed = nlp(speech['text'])
    speech['text'] = ' '.join(tok.lemma_ for tok in parsed)

In [7]:
# lower case
# Replace the text of every speech with its lowercased form.
for speech in speeches_19_20_prep:
    current_text = speech['text']
    speech['text'] = current_text.lower()

In [8]:
#delete single and double char words
def filter_single_double_char_words(text):
    """Drop every token that is shorter than three characters.

    Args:
        text: The string to filter.

    Returns:
        The texts of all tokens with more than two characters, joined by
        single spaces (an empty string if no token qualifies).
    """
    # Only token boundaries are needed here, so run the tokenizer alone
    # (nlp.make_doc) instead of the full tagging/parsing pipeline.
    doc = nlp.make_doc(text)
    return " ".join(token.text for token in doc if len(token.text) > 2)

# apply the function to the text of each dictionary in the list
for speech in speeches_19_20_prep:
    shortened_text = filter_single_double_char_words(speech["text"])
    speech["text"] = shortened_text

In [9]:
# stopwords with stopword list from github.com/solariz/german_stopwords

# One stopword per line; the context manager closes the file even if reading fails.
with open('data/german_stopwords_full.txt', 'r', encoding='utf-8') as file_stopwords:
    stopwords_list = file_stopwords.read().splitlines()

# Membership is tested once per word of every speech, so look words up in a set;
# stopwords_list itself stays a list.
stopwords_lookup = set(stopwords_list)

# Keep only the whitespace-separated words that are not stopwords.
for rede in speeches_19_20_prep:
    rede['text'] = ' '.join(
        word for word in rede['text'].split() if word not in stopwords_lookup
    )

In [10]:
# stopwords with own stopwords identified during the work on the project

stopwords_list = ["geehrte", "damen", "herren", "kollegen", "kolleginnen", "geehrter", "geehrten", "herr", "frau"]

# Rebuild every speech text from the whitespace-separated words that are not in the list above.
for speech in speeches_19_20_prep:
    kept_words = []
    for candidate in speech['text'].split():
        if candidate not in stopwords_list:
            kept_words.append(candidate)
    speech['text'] = ' '.join(kept_words)

In [11]:
#add the tokenized texts to data (for later use)
# Only the token texts are stored, so the tokenizer alone (nlp.make_doc) is enough;
# the tagging/parsing components of the full pipeline are not needed here.
for rede in speeches_19_20_prep:
    rede['text_token'] = [token.text for token in nlp.make_doc(rede['text'])]

In [12]:
# save prepared data
file_name = 'data/speeches_19_20_prep.jsonl'

# Each speech dictionary is serialised as one JSON object per line; the file is
# opened in write mode, so an existing file is replaced.
with open(file_name, 'w') as file:
    file.writelines(json.dumps(item) + '\n' for item in speeches_19_20_prep)

In [3]:
# load prepared data
# Every line of the JSON Lines file written above becomes one list entry.
speeches_19_20_prep = []
with jsonlines.open('data/speeches_19_20_prep.jsonl') as reader:
    speeches_19_20_prep.extend(reader.iter())

In [4]:
def build_co_occurrence_matrix(corpus, window_size, vocab_size):
    """Count windowed co-occurrences among the most frequent tokens.

    Args:
        corpus: Collection of mappings that each hold a token list under the
            key 'text_token'. It is iterated twice (once to count tokens,
            once to count co-occurrences).
        window_size: Number of positions considered on each side of a token.
        vocab_size: How many of the most frequent tokens to keep; handed to
            Counter.most_common unchanged.

    Returns:
        A nested defaultdict mapping token -> neighbour -> count, with the
        counts stored as floats.
    """
    frequencies = Counter()
    for entry in corpus:
        frequencies.update(entry['text_token'])

    vocabulary = {token for token, _ in frequencies.most_common(vocab_size)}

    counts = defaultdict(lambda: defaultdict(float))

    for entry in corpus:
        # Out-of-vocabulary tokens are removed before the windows are formed,
        # so a window spans the surrounding in-vocabulary tokens only.
        kept = [token for token in entry['text_token'] if token in vocabulary]
        last = len(kept) - 1
        for position, centre in enumerate(kept):
            lo = max(0, position - window_size)
            hi = min(last, position + window_size)
            # max() keeps the slice empty instead of wrapping around when hi < lo.
            window = kept[lo:max(lo, hi + 1)]
            # Take out one occurrence of the centre token; further occurrences
            # inside the window still count as neighbours.
            window.remove(centre)
            for neighbour in window:
                counts[centre][neighbour] += 1

    return counts

In [5]:
# Co-occurrence counts over the prepared speeches: windows reach 20 tokens to each
# side and only the 2500 most frequent tokens are kept.
coo_dict=build_co_occurrence_matrix(speeches_19_20_prep,window_size=20,vocab_size=2500)

In [6]:
# Turn the nested dict into a square table: the outer keys become the columns and
# are also used as the row index; pairs that never co-occurred are filled with 0
# and the counts are cast to integers.
coo_matrix = pd.DataFrame(coo_dict,index=coo_dict.keys()).fillna(0).astype(int)

In [7]:
#calculate associations
# Total frequency of every token across all prepared speeches.
word_counter = Counter()
for text in speeches_19_20_prep:
    word_counter.update(text['text_token'])

# association(row, column) = co-occurrence * 10000 / (count(row) * count(column)).
# The whole table is normalised in one vectorised step instead of reassigning it
# column by column; float64 is used throughout so the products cannot overflow a
# narrower integer dtype.
row_counts = np.array([word_counter[word] for word in coo_matrix.index], dtype=np.float64)
column_counts = np.array([word_counter[word] for word in coo_matrix.columns], dtype=np.float64)
coo_matrix = coo_matrix.astype(np.float64) * 10000 / np.outer(row_counts, column_counts)

In [8]:
# Convert the table to 32-bit floats before it is written to disk.
coo_matrix = coo_matrix.astype(np.float32)

In [9]:
# Write the table to an HDF5 file under the key 'df'; mode='w' replaces an existing
# file, and the data is compressed with blosc at level 9.
coo_matrix.to_hdf('data/coo_matrix.h5', key='df', mode='w', complib='blosc', complevel=9)