In [1]:
import pandas as pd
from datetime import datetime, date

We start by taking a look at the tables we have: factions, contributions (extended and simplified), politicians, electoral terms and speeches.

In [2]:
# Load the factions table from its feather file and preview the first rows.
factions = pd.read_feather('factions.feather')
factions.head()

Unnamed: 0,id,abbreviation,fullName
0,-1,not found,not found
1,0,AfD,Alternative für Deutschland
2,1,BHE,Block der Heimatvertriebenen und Entrechteten
3,2,BP,Bayernpartei
4,3,Grüne,Bündnis 90/Die Grünen


In [3]:
# (rows, columns) of the factions table.
factions.shape

(28, 3)

In [4]:
# Load the extended contributions table and preview the first rows.
contributions_extended = pd.read_feather('contributions_extended.feather')
contributions_extended.head()

Unnamed: 0,id,type,firstName,lastName,politicianId,content,speechId,textPosition,factionId
0,0,Lachen,,,-1,links,6,7,-1
1,1,Personen-Einruf,,hütter,11000979,Nein!,6,2,13
2,2,Zuruf,,,-1,§ 51!,9,3,20
3,3,Zuruf,,,-1,links,11,0,-1
4,4,Personen-Einruf,,blücher,11000202,Jawohl!,12,0,13


In [5]:
# Load the simplified contributions table and preview the first rows.
contributions_simplified = pd.read_feather('contributions_simplified.feather')
contributions_simplified.head()

Unnamed: 0,id,textPosition,speechId,content
0,0,0,2,(Der Bundespräsident leistet diesen Eid.)
1,1,4,5,(dem Bundespräsidenten die Hand reichend)
2,2,3,5,(Langanhaltender lebhafter Beifall.)
3,3,2,5,(Bundespräsident Dr. Heuss)
4,4,1,5,(Sehr richtig!)


In [6]:
# (rows, columns) of the extended contributions table.
contributions_extended.shape

(2451436, 9)

In [8]:
# (rows, columns) of the simplified contributions table.
contributions_simplified.shape

(2131773, 4)

In [9]:
# Load the politicians table and preview the first rows.
politicians = pd.read_feather('politicians.feather')
politicians.head()

Unnamed: 0,id,firstName,lastName,birthPlace,birthCountry,birthDate,deathDate,gender,profession,aristocracy,academicTitle
0,11000001,Manfred,Abelein,Stuttgart,Deutschland,1930-10-20,2008-01-17,männlich,"Rechtsanwalt, Wirtschaftsprüfer, Universitätsp...",,Prof. Dr.
1,11000002,Ernst,Achenbach,Siegen,Deutschland,1909-04-09,1991-12-02,männlich,Rechtsanwalt und Notar,,Dr.
2,11000003,Annemarie,Ackermann,Parabutsch,Jugoslawien,1913-05-26,1994-02-18,weiblich,Hilfsreferentin,,
3,11000004,Else,Ackermann,Berlin,Deutschland,1933-11-06,2019-09-14,weiblich,Ärztin,,Dr.
4,11000005,Ulrich,Adam,"Teterow, Kr. Teterow, Bezirk Neubrandenburg",Deutschland,1950-06-09,,männlich,"Mathematiker, Geschäftsführer",,


In [10]:
# (rows, columns) of the politicians table.
politicians.shape

(4102, 11)

In [11]:
# Load the electoral terms table and preview the first rows.
# NOTE(review): startDate/endDate show up as plain integers (negative for the
# early terms) — presumably Unix epoch seconds; confirm before comparing them
# with the date objects used in the speeches table.
electoral_terms = pd.read_feather('electoral_terms.feather')
electoral_terms.head()

Unnamed: 0,id,startDate,endDate
0,1,-641174400,-512524800
1,2,-512438400,-385516800
2,3,-385430400,-259113600
3,4,-259027200,-132710400
4,5,-132624000,-6393600


In [12]:
# (rows, columns) of the electoral terms table.
electoral_terms.shape

(19, 3)

In [2]:
# Load the full speeches table (the largest table here) and preview the first rows.
# NOTE(review): id-like columns (id, session, electoralTerm, politicianId,
# factionId) are displayed as floats in the preview — confirm whether that is
# intended before joining on them.
speeches = pd.read_feather('speeches.feather')
speeches.head()

Unnamed: 0,id,session,electoralTerm,firstName,lastName,politicianId,speechContent,factionId,documentUrl,positionShort,positionLong,date
0,0.0,2.0,1.0,,Köhler,11001150.0,Meine Damen und Herren! Ich eröffne die 2. Sit...,-1.0,https://dip21.bundestag.de/dip21/btp/01/01002.pdf,Presidium of Parliament,präsident,1949-09-12
1,1.0,2.0,1.0,,Arnold,-1.0,"Der Bundesrat ist versammelt, Herr Präsident.\n",-1.0,https://dip21.bundestag.de/dip21/btp/01/01002.pdf,Guest,präsident des bundesrats,1949-09-12
2,2.0,2.0,1.0,,Köhler,11001150.0,Ich danke für diese Erklärung. Ich stelle dami...,-1.0,https://dip21.bundestag.de/dip21/btp/01/01002.pdf,Presidium of Parliament,präsident,1949-09-12
3,3.0,2.0,1.0,,Heuss,-1.0,"Ja, ich habe den Wunsch.\n",-1.0,https://dip21.bundestag.de/dip21/btp/01/01002.pdf,Guest,bundespräsident,1949-09-12
4,4.0,2.0,1.0,,Köhler,11001150.0,Ich erteile dem Herrn Bundespräsidenten das Wo...,-1.0,https://dip21.bundestag.de/dip21/btp/01/01002.pdf,Presidium of Parliament,präsident,1949-09-12


In [14]:
# (rows, columns) of the speeches table.
speeches.shape

(899526, 12)

Next, we check the latest and the earliest date present in the speeches table.

In [16]:
# Latest speech date in the table.
speeches.date.max()

datetime.date(2020, 12, 17)

In [17]:
# Earliest speech date in the table.
speeches.date.min()

datetime.date(1949, 9, 12)

Next, we split the speeches table by date and save each subset to its own feather file, so that later we do not have to load the whole table each time; we are only interested in speeches from the years 2019 and 2020.

In [34]:
# Keep only the speeches held in 2020 (both bounds inclusive) and cache them.
# The index is reset by chaining instead of inplace=True, so re-running the
# cell always rebuilds the frame from `speeches`. A fresh default index is
# needed because to_feather only accepts a default index.
speeches_2020 = (
    speeches[speeches.date.between(date(2020, 1, 1), date(2020, 12, 31))]
    .reset_index(drop=True)
)
speeches_2020.to_feather('speeches_2020.feather')

In [35]:
# Keep only the speeches held in 2019 (both bounds inclusive) and cache them.
# The index is reset by chaining instead of inplace=True, so re-running the
# cell always rebuilds the frame from `speeches`. A fresh default index is
# needed because to_feather only accepts a default index.
speeches_2019 = (
    speeches[speeches.date.between(date(2019, 1, 1), date(2019, 12, 31))]
    .reset_index(drop=True)
)
speeches_2019.to_feather('speeches_2019.feather')

In [15]:
# Keep the speeches held in 2019 or 2020 (both bounds inclusive) and cache them.
# The index is reset by chaining instead of inplace=True, so re-running the
# cell always rebuilds the frame from `speeches`. A fresh default index is
# needed because to_feather only accepts a default index.
speeches_2019_2020 = (
    speeches[speeches.date.between(date(2019, 1, 1), date(2020, 12, 31))]
    .reset_index(drop=True)
)
speeches_2019_2020.to_feather('speeches_2019_2020.feather')