In [15]:
import re
from collections import Counter
from datetime import date, datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from model import SentimentModel
# Turn off pandas' chained-assignment check for the whole session.
# NOTE(review): presumably here to silence SettingWithCopyWarning when a
# 'sentiment' column is added to the filtered frames further down — confirm.
pd.set_option('mode.chained_assignment', None)

First, we load the speeches from the year 2019.

In [2]:
# Read the 2019 speeches from a Feather file (path is relative to the
# notebook's working directory).
speeches = pd.read_feather(path='speeches_2019.feather')
# Display (rows, columns) of the loaded frame.
speeches.shape

(16912, 12)

Then we group the keywords by topic, as done previously.

In [3]:
# Keyword sub-groups, one list per sub-topic. The strings are used verbatim as
# search terms further down, so spelling, capitalisation and whitespace matter.

# --- technology ---
technologie = [
    'IT', 'Innovation', 'Technologisch', 'Information',
    'Technik', 'Technologische', 'Medizintechnik', 'Technologie',
]
datenschutz = [
    'Datenschutz', 'Datensicherheit', 'Transparente',
    'Transparenz', 'Anonymität', 'Privatsphäre',
]
# NOTE: 'App ' carries a trailing space that is part of the search term.
digitalisierung = [
    'Digitalisierung', 'Automatisierung', 'App ', 'Vernetzung', 'Internet',
    'Telekommunikation', 'Software', 'Hardware', 'Computer', 'Handy',
]

# --- aging ---
senioren = ['Senioren', 'Erwachsene', 'ältere Menschen', 'Familienmitglieder']
rente = ['Rente', 'Rentner', 'Rentnerin', 'Altersvorsorge', 'Elterngeld']
pflegeheim = ['Pflegeheim', 'Altenheim', 'Pfleger', 'Altenpflege', 'Pflege']

# --- health ---
gesundheit = [
    'Gesundheitswesen', 'Ernährung', 'Lebensqualität',
    'Lebenserwartung', 'Gesundheit',
]
krankheit = ['Erkrankung', 'Nebenwirkung', 'Infektion', 'Krank', 'Krankheit']
medikament = [
    'Medikament', 'Arzneimittel', 'Patient', 'Behandlung',
    'Pill', 'Antibiotika', 'Impfstoffe',
]
krankenhaus = ['Krankenhaus', 'Klinik', 'Krankenschwester', 'Krankenpfleger', 'Arzt']

In [4]:
# Topic-level keyword lists: each one is the concatenation of its sub-group
# lists, in the order given here.
keywords_tech = [*technologie, *datenschutz, *digitalisierung]
keywords_aging = [*senioren, *rente, *pflegeheim]
keywords_health = [*gesundheit, *krankenhaus, *krankheit, *medikament]

Then we filter the speeches to keep only those that contain technology-related keywords.

In [5]:
# Keep only the speeches whose text contains at least one technology keyword.
# The keywords are joined into a single alternation pattern; each one is
# escaped so it is matched literally (case-sensitive substring match), and
# `na=False` makes a missing speechContent count as "no match" instead of
# putting NaN into the boolean mask.
pattern_tech = '|'.join(re.escape(keyword) for keyword in keywords_tech)
speeches_tech = speeches[speeches.speechContent.str.contains(pattern_tech, na=False)]

In [6]:
# Display (rows, columns) of the technology-filtered speeches.
speeches_tech.shape

(2160, 12)

Now, we obtain the speeches that contain both tech-related and aging-related keywords.

In [17]:
# From the technology-related speeches, keep those that also contain at least
# one aging keyword. Keywords are escaped so they match literally and
# `na=False` treats a missing speechContent as "no match".
pattern_aging = '|'.join(re.escape(keyword) for keyword in keywords_aging)
# Explicit copy: a column is added to this frame later on, so it should be an
# independent frame rather than a slice of `speeches_tech`.
speeches_senioren = speeches_tech[
    speeches_tech.speechContent.str.contains(pattern_aging, na=False)
].copy()
speeches_senioren.shape

(244, 12)

Now, we obtain the speeches that contain both tech-related and health-related keywords.

In [18]:
# From the technology-related speeches, keep those that also contain at least
# one health keyword. Keywords are escaped so they match literally and
# `na=False` treats a missing speechContent as "no match".
pattern_health = '|'.join(re.escape(keyword) for keyword in keywords_health)
# Explicit copy: a column is added to this frame later on, so it should be an
# independent frame rather than a slice of `speeches_tech`.
speeches_health = speeches_tech[
    speeches_tech.speechContent.str.contains(pattern_health, na=False)
].copy()
speeches_health.shape

(405, 12)

Then we load the pretrained BERT model for sentiment analysis.

In [9]:
# Build the sentiment classifier from the project's `model` module, passing
# the "oliverguhr/german-sentiment-bert" model name.
model = SentimentModel(model_name="oliverguhr/german-sentiment-bert")

Now we calculate the sentiment for both sets of speeches.

In [19]:
# Predict one sentiment label per speech and store the results as a new
# 'sentiment' column. Each speech is sent to the model on its own (as a
# one-element list) and the first returned prediction is kept. Assigning the
# whole column at once is positional, so it needs no placeholder column and
# does not depend on the frame's index labels being unique.
speeches_senioren['sentiment'] = [
    model.predict_sentiment([text])[0]
    for text in speeches_senioren.speechContent
]

In [20]:
# Predict one sentiment label per speech and store the results as a new
# 'sentiment' column. Each speech is sent to the model on its own (as a
# one-element list) and the first returned prediction is kept. Assigning the
# whole column at once is positional, so it needs no placeholder column and
# does not depend on the frame's index labels being unique.
speeches_health['sentiment'] = [
    model.predict_sentiment([text])[0]
    for text in speeches_health.speechContent
]

In [21]:
# Give both frames a fresh default index; the previous row labels are moved
# into a regular column (named 'index' when the index itself has no name).
# Rebinding the names replaces the `inplace=True` mutation.
# NOTE: running this cell more than once adds a further index column each time.
speeches_health = speeches_health.reset_index()
speeches_senioren = speeches_senioren.reset_index()

Finally, we save the speeches with their sentiments for the next step.

In [22]:
# Write each sentiment-annotated frame to its own Feather file.
for frame, target in (
    (speeches_senioren, 'speeches_2019_with_sentiments_senioren.feather'),
    (speeches_health, 'speeches_2019_with_sentiments_health.feather'),
):
    frame.to_feather(target)