# Create the final Bundestag dataset by combining Open Discourse and GermaParl


In [1]:
import pandas as pd
from tqdm import tqdm
import plotly.express as px

# Notebook display configuration: allow long cell contents (speech text)
# to be shown and cap the number of rows rendered per frame.
pd.options.display.max_colwidth = 2000
pd.options.display.max_rows = 50

In [2]:
# Load the GermaParl speeches.
dfg = pd.read_feather("data/germaparl.feather")
# Keep only rows from the 19th legislative period whose speaker role is
# "mp" or "government", then order them by speech id.
in_term_19 = dfg["electoral_term"] == 19
mp_or_government = dfg["role"].isin(["mp", "government"])
dfg = dfg[in_term_19 & mp_or_government].sort_values(by="speech_id")
dfg.shape

(40075, 18)

In [3]:
# Sanity check: show every GermaParl speech whose text contains a line break.
contains_newline = dfg["speech"].str.contains("\n", regex=False)
dfg[contains_newline]

Unnamed: 0,aid,speech_id,name,position,party,role,group,who,speech,agenda_no,agenda_type,description_agenda,title,electoral_term,session,session_id,date,url


In [4]:
# Load the Open Discourse speeches (file name suggests terms 18 and 19 -- TODO confirm).
dfo = pd.read_feather("data/open_discourse_1819.feather")
# Keep only the 18th legislative period from this source.
dfo = dfo[dfo["electoral_term"] == 18]
# Drop speakers that are presidium members, guests or unidentified; the
# remaining positions are translated to GermaParl-style roles further below.
excluded_positions = ["Presidium of Parliament", "Guest", "Not found"]
dfo = dfo[~dfo["position_short"].isin(excluded_positions)]
# Normalise the speech text with NFKC rather than NFKD: both fold
# compatibility characters (e.g. non-breaking spaces), but NFKD additionally
# splits precomposed letters such as "ö" into "o" + combining diaeresis.
# The GermaParl speeches are not decomposed, so NFKD would leave the combined
# dataset with two different encodings of the same German words.
dfo = dfo.assign(speech_content=dfo["speech_content"].str.normalize("NFKC"))
dfo = dfo.sort_values(by=["id_x"])
dfo.shape

(27833, 16)

In [5]:
# convert open discourse positions to roles from GermaParl
roles = {"Member of Parliament": "mp", "Minister": "government", "Chancellor": "government", "Secretary of State": "government"}
dfo["role"] = dfo.position_short.map(roles)

In [6]:
# Align the Open Discourse column names with the GermaParl ones.
dfo = dfo.rename(columns={"speech_content": "speech", "document_url": "url"})

In [7]:
# Reduce both frames to the same eight columns, in the same order, so that
# they can be stacked into one dataset.
shared_columns = [
    "name",
    "electoral_term",
    "session",
    "party",
    "speech",
    "role",
    "date",
    "url",
]
dfo = dfo[shared_columns]
dfg = dfg[shared_columns]

In [8]:
# Stack the Open Discourse rows on top of the GermaParl rows. The row labels
# of both source frames are kept as-is at this point.
frames = [dfo, dfg]
df = pd.concat(frames)

In [10]:
# Spot-check very short speeches (fewer than 110 characters) from the
# combined dataset.
short_speeches = df[df["speech"].str.len() < 110]
# A fixed random_state makes the displayed sample reproducible across runs;
# capping n avoids a ValueError when fewer than 40 rows qualify.
short_speeches.sample(n=min(40, len(short_speeches)), random_state=42)

Unnamed: 0,name,electoral_term,session,party,speech,role,date,url
11558,Katja Dörner,18,57,GRUENE,Selbstverständlich.,mp,2014-10-09,https://dip21.bundestag.de/dip21/btp/18/18057.pdf
74846,Annalena Baerbock,19,239,GRUENE,Ich komme zum Schluss.,mp,2021-09-07,https://www.bundestag.de/resource/blob/858472/ba7188b5e684b1fb5497fa5cd5bfa1d3/19239-data.xml
11598,Horst Seehofer,19,41,CSU,Bei mir kommt es jedenfalls verstehbar nicht an.,government,2018-06-27,https://www.bundestag.de/resource/blob/562324/3ed065a2084ae77a4e45cd192f37ed29/19041-data.xml
52260,Maria Klein-Schmeink,18,243,GRUENE,"Ich danke für die Gelegenheit, Ihnen eine Frage zu stellen .",mp,2017-06-29,https://dip21.bundestag.de/dip21/btp/18/18243.pdf
47631,Gabriele Hiller-Ohm,19,158,SPD,Von der AfD?,mp,2020-05-07,https://www.bundestag.de/resource/blob/695138/c2abc3b1ff1868224f5964d9ba0bc60e/19158-data.xml
24375,Mahmut Özdemir,19,83,SPD,"Ich würde gerne testen wollen, ob sie ihren Gesetzentwurf selber gelesen hat.",mp,2019-02-21,https://www.bundestag.de/resource/blob/595300/707c0bea31cc2b599cc68cba83542996/19083-data.xml
9007,Peter Boehringer,19,34,AfD,"Nein, nicht überwiegend, sondern exklusiv, bitte.",mp,2018-05-18,https://www.bundestag.de/resource/blob/556386/36c353e5fc4b7ad2c50ea72af3b146da/19034-data.xml
19932,Katja Kipping,18,94,DIE LINKE,Gerne.,mp,2015-03-19,https://dip21.bundestag.de/dip21/btp/18/18094.pdf
12692,Heinz Joachim Barchmann,18,61,SPD,"Ja, gern.",mp,2014-10-17,https://dip21.bundestag.de/dip21/btp/18/18061.pdf
42138,Christian Kühn,19,140,GRUENE,Lassen Sie uns nun auch gemeinsam handeln. Danke schön.,mp,2020-01-16,https://www.bundestag.de/resource/blob/678012/ab217d31390525a2fdd02bbc8e44ae0f/19140-data.xml


In [44]:
# Report the shapes of the two source frames (first line) and of the
# combined frame (second line).
print(f"{dfg.shape} {dfo.shape}\n{df.shape}")

(40075, 8) (27833, 8)
(67908, 8)


In [45]:
# Convert the "date" column to pandas datetime values.
df["date"] = df["date"].pipe(pd.to_datetime)

In [46]:
# Discard the row labels inherited from the two source frames and number the
# rows 0..n-1 instead.
df.index = pd.RangeIndex(len(df))

In [47]:
# Derive a speech id from the row index, shifted by one so ids start at 1
# when the index starts at 0.
df = df.assign(speech_id=df.index + 1)

In [48]:
# Write the combined dataset to disk in Feather format, using a freshly
# reset default index for the export.
export_frame = df.reset_index(drop=True)
export_frame.to_feather("data/Bundestag1819.feather")