# Generate sentences from Speeches

In [1]:
## 01-create_examples.ipynb as a reference
import re

import pandas as pd
import spacy
from tqdm import tqdm
import plotly.express as px

# Notebook display settings: allow long text cells, cap the number of rows shown.
pd.options.display.max_colwidth = 2000
pd.options.display.max_rows = 50

# Display colour assigned to each party label.
PARTYCOLORS = {
    "AfD": "#59dcff",
    "CSU": "#525060",
    "CDU": "black",
    "CDU/CSU": "black",
    "SPD": "#e03427",
    "GRUENE": "#52d157",
    "DIE LINKE": "#a33ab5",
    "FDP": "#ffeb3b",
}

In [2]:
# Read the speeches table from the feather file.
speeches_path = "data/Bundestag1819.feather"
df = pd.read_feather(speeches_path)

# Keep only the rows whose role is "mp".
is_mp = df["role"] == "mp"
df = df[is_mp]

# Show which columns are available.
df.columns

Index(['name', 'electoral_term', 'session', 'party', 'speech', 'role', 'date',
       'url', 'speech_id'],
      dtype='object')

In [3]:
# Keep only speeches whose party has an entry in PARTYCOLORS.
# .copy() detaches the result from the unfiltered frame, so the column
# assignments in the following cells modify this frame directly instead of
# writing into a slice (which pandas reports as SettingWithCopyWarning).
df = df[df["party"].isin(list(PARTYCOLORS))].copy()

In [4]:
# Number of speeches remaining after the role and party filters.
len(df)

55359

In [5]:
# Fix all co2 compound terms: normalise every spelling ("co2", "Co 2", ...) to "CO2".
# The character classes are [Cc][Oo]; the earlier form [C|c][O|o] additionally
# accepted a literal "|" in either position.
df["speech"] = df["speech"].str.replace(r"[Cc][Oo]\s?2", "CO2 ", regex=True)
# The replacement above always appends a space; collapse the double space that
# appears when the match was already followed by one ...
df["speech"] = df["speech"].str.replace("CO2  ", "CO2 ", regex=False)
# ... and close the gap in front of a hyphen ("CO2 -" -> "CO2-").
df["speech"] = df["speech"].str.replace("CO2 -", "CO2-", regex=False)

In [6]:
# Fix speakers included in speeches: when a speech contains the speaker's own
# name directly after a word character and followed by a space, remove every
# "<name> " occurrence from that speech.
cleaned_speeches = []
for sname, speech_text in zip(df["name"].tolist(), df["speech"].tolist()):
    # Skip missing/empty names: with an empty name the pattern would match any
    # "<word char><space>" and the replacement would delete every space.
    if isinstance(sname, str) and sname:
        # re.escape keeps characters such as "." or "(" in a name literal
        # instead of letting them act as regex syntax.
        if re.search(rf"\w{re.escape(sname)} ", speech_text):
            speech_text = speech_text.replace(f"{sname} ", "")
    cleaned_speeches.append(speech_text)

# Write back by column name rather than by the hard-coded column position 4,
# and in one assignment instead of one .iloc write per row.
df["speech"] = cleaned_speeches

### Clean Text and create sentence_df

Speeches contain many line breaks and inline references to contributions of other speakers (stored in the contributions table of the database). These have to be removed.


In [7]:
# load Spacy
# if model not yet installed, run: python -m spacy download de_core_news_md
nlp = spacy.load("de_core_news_md")

# Switch off the components that the sentence extraction below does not use
# (it only reads doc.sents and the token attribute is_punct).
# select_pipes(disable=...) replaces Language.disable_pipes, which is
# deprecated since spaCy 3.0; called outside a `with` block the change is permanent.
nlp.select_pipes(
    disable=[
        "tok2vec",
        "tagger",
        "morphologizer",
        "parser",
        "lemmatizer",
        "attribute_ruler",
        "ner",
    ]
)
# Turn on the "senter" component so that doc.sents is available without the parser.
# NOTE(review): presumably "senter" ships disabled in this model - confirm.
nlp.enable_pipe("senter")

In [8]:
# Split every speech into sentences and collect one record per usable sentence.
MIN_SENTENCES_PER_SPEECH = 3  # speeches with fewer sentences are skipped entirely
MIN_TOKENS_PER_SENTENCE = 3  # non-punctuation tokens a sentence needs to be kept

sent_id = -1  # running counter; first sentence of the first processed speech gets 0
all_sents = []
for index, speech in tqdm(df.iterrows(), total=df.shape[0]):
    doc = nlp(speech["speech"])

    # Materialise the sentence spans once; both the count and the loop need them.
    sents = list(doc.sents)
    n_sentences = len(sents)

    # skip very short speeches
    if n_sentences < MIN_SENTENCES_PER_SPEECH:
        continue

    # iterate over sentences + add 1 row per sentence to all_sents
    for sent_no, sent in enumerate(sents, 1):
        # Incremented before any filtering, so the ids of kept sentences have gaps.
        sent_id += 1
        # do not use first and last sentence
        if not 1 < sent_no < n_sentences:
            continue

        # Sentence length is counted in tokens, ignoring punctuation.
        sent_length = sum(1 for tok in sent if not tok.is_punct)
        # skip very short sentences
        if sent_length < MIN_TOKENS_PER_SENTENCE:
            continue

        all_sents.append(
            {
                "sent_id": sent_id,
                "speech_id": speech.speech_id,
                # Item access: on a row Series, `.name` is the row label, not the column.
                "name": speech["name"],
                "electoral_term": speech.electoral_term,
                "party": speech.party,
                "role": speech.role,
                "date": speech.date,
                "session": speech.session,
                "sentence_no": sent_no,
                "sentence_length": sent_length,
                "sentence": str(sent),
            }
        )

# Build the frame once from the list of records.
sentence_df = pd.DataFrame(all_sents)

100%|███████████████████████████████████████████████████████████████████████████████████████████| 55359/55359 [08:15<00:00, 111.83it/s]


In [9]:
# All patterns below use non-capturing groups (?:...), so pandas does not emit
# its "pattern has match groups" warning. The cell therefore no longer needs
# %%capture, which also swallowed the two shape printouts.

# Clean bad sentence starts: strip a leading "– " or "- ".
sentence_df["sentence"] = sentence_df["sentence"].str.replace(
    r"^(?:– |- )", "", regex=True
)

# Make all references to CO2 spelled identical ("CO 2", "co-2", ... -> "CO2").
# [Cc][Oo] replaces [C|c][O|o], which also accepted a literal "|".
sentence_df["sentence"] = sentence_df["sentence"].str.replace(
    r"[Cc][Oo]\W2", "CO2", regex=True
)

# remove some sentences that have some issues
sentence_text = sentence_df["sentence"]
is_introduction = sentence_text.str.contains(
    r"^(?:(?:meine )?sehr (?:geehrt|verehrt)|liebe|(?:meine )?damen und)", case=False
)  # introductions
has_bad_ending = sentence_text.str.contains(r"(?::|;)$")  # bad endings
# NOTE(review): [a-z] does not cover lowercase umlauts or "ß" - confirm this is intended.
has_bad_start = sentence_text.str.contains(
    r"^(?:[a-z]|-|–)", case=True
)  # lowercase sent starts / dashes
sentence_df = sentence_df[~is_introduction & ~has_bad_ending & ~has_bad_start]

print(sentence_df.shape)
# Keep sentences of at least 5 non-punctuation tokens that contain no ": ".
is_long_enough = ~(sentence_df["sentence_length"] < 5)
has_inline_colon = sentence_df["sentence"].str.contains(": ", regex=False)
sentence_df = sentence_df[is_long_enough & ~has_inline_colon]
print(sentence_df.shape)

In [10]:
# Spot-check three sentences; the fixed seed makes the same rows appear on every re-run.
sentence_df.sample(3, random_state=42)

Unnamed: 0,sent_id,speech_id,name,electoral_term,party,role,date,session,sentence_no,sentence_length,sentence
1460978,1590053,65605,Rüdiger Kruse,19,CDU,mp,2021-05-06,227,39,5,Wir haben 114 Punkte zusammengetragen.
418052,451246,16830,Georg Kippels,18,CDU/CSU,mp,2016-01-14,149,29,43,"Herr Kollege Weinberg, in Ihrem Antrag heißt es, es habe ein Geschmäckle, dass das Volumen in der Ausschreibung jetzt auf 9 Millionen Euro erweitert worden ist, und sei deshalb überhaupt nicht verwunderlich, dass der neue Anbieter auch ein erhöhtes Angebotsvolumen präsentieren kann."
805282,872452,34080,Andreas Mrosek,19,AfD,mp,2018-06-27,41,9,6,Ich spreche also aus beruflicher Erfahrung.


In [11]:
# Size of the filtered sentence table as (rows, columns).
n_rows, n_cols = sentence_df.shape
(n_rows, n_cols)

(1117882, 11)

In [12]:
# Count how many sentences each party contributes, most frequent first.
party_labels = sentence_df["party"]
stc_per_party = party_labels.value_counts()
stc_per_party

party
SPD          252524
CDU/CSU      202017
GRUENE       168448
DIE LINKE    157805
CDU          129406
AfD           85574
FDP           74585
CSU           47523
Name: count, dtype: int64

In [13]:
# Replace the gappy index left by the filters with a fresh 0..n-1 range
# (the old index is discarded), then write the table to disk.
sentence_df = sentence_df.reset_index(drop=True)
sentence_df.to_feather("data/sentences_all.feather")