In [488]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import re
import pickle
import numpy as np
import unidecode

In [286]:
def _fetch_html(url, timeout=30):
    """Download `url` and return the response body as text.

    A timeout is passed because `requests` otherwise waits indefinitely on a
    stalled connection. `raise_for_status()` turns an HTTP error response into
    an exception instead of letting an error page be parsed as if it were the
    member list.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    return response.text


# Member-list pages of the FDP, CDU/CSU and SPD parliamentary groups.
# Each page is downloaded once and parsed with the built-in HTML parser.
fdp = "https://www.fdpbt.de/fraktion/abgeordnete"
source_fdp = _fetch_html(fdp)
soup_fdp = BeautifulSoup(source_fdp, 'html.parser')

cdu = "https://www.cducsu.de/hier-stellt-die-cducsu-bundestagsfraktion-ihre-abgeordneten-vor"
source_cdu = _fetch_html(cdu)
soup_cdu = BeautifulSoup(source_cdu, 'html.parser')

spd = "https://www.spdfraktion.de/abgeordnete/alle?wp=19&view=list&old=19"
source_spd = _fetch_html(spd)
soup_spd = BeautifulSoup(source_spd, 'html.parser')

# Die Linke: the Twitter link has to be read from each member's own page.
# The members are reached through alphabetical index pages ("a-bis-e/", ...),
# so build one index URL per letter range here.
linke_base = "https://www.linksfraktion.de/fraktion/abgeordnete/"
letters = [['a', 'e'], ['f', 'j'], ['k', 'o'], ['p', 't'], ['u', 'z']]
linke_name_bins = [f'{linke_base}{first}-bis-{last}/' for first, last in letters]

In [287]:
# Select the per-member HTML elements from each parsed list page.
# FDP: entries are looked up inside the first element with class 'person-list'
# (raises AttributeError if that container is not found, since find() returns None).
# presumably each selected element corresponds to one member -- TODO confirm against the live pages
all_abg_fdp = soup_fdp.find(class_ = 'person-list').find_all(class_ = 'person-item-wrapper')
all_abg_cdu = soup_cdu.find_all(class_ = 'teaser delegates')
all_abg_spd = soup_spd.find_all(class_ = 'views-row')
 
# Collect the URL of every individual Die Linke member page by walking the
# alphabetical index pages in `linke_name_bins`.
linke_path_prefix = '/fraktion/abgeordnete/'
all_abg_linke = []
for name_bin in linke_name_bins:
    # timeout: without it a stalled connection blocks the notebook indefinitely
    source = requests.get(name_bin, timeout=30).text
    soup = BeautifulSoup(source, 'html.parser')
    for abg in soup.find_all('div', attrs = {'class': 'col-xs-12 col-sm-12 col-md-6 col-lg-6'}):
        href = abg.find('h2').find('a')['href']
        # str.lstrip() removes any leading characters contained in its argument
        # (a character *set*), not a literal prefix, so it could also eat the
        # beginning of the member slug. Cut the known path prefix explicitly.
        if href.startswith(linke_path_prefix):
            extension = href[len(linke_path_prefix):]
        else:
            extension = href.lstrip('/')
        all_abg_linke.append(linke_base + extension)

In [288]:
# Accumulates one {'Partei', 'Name', 'Twitter'} record per member; 'Twitter'
# is the link's href, or "" when no Twitter link is found.
twitter_list = []

# FDP: the 'person-name' element contains the name plus a <span> whose text is
# read as the member's function; that text is removed to leave only the name.
for abg in all_abg_fdp:
    name_field = abg.find(class_ = 'person-name')
    funktion_span = name_field.find('span')
    funktion = funktion_span.text.strip() if funktion_span is not None else ""
    name = name_field.text.strip()
    # str.rstrip(funktion) would strip any trailing characters that occur in
    # `funktion` (a character set) and can truncate the name itself, so the
    # function text is removed only as an exact suffix.
    if funktion and name.endswith(funktion):
        name = name[:-len(funktion)]
    name = name.strip()
    twitter = abg.find('a', attrs = {'class': 'tw'}, href = True)
    twitter_list.append(
        {
            'Partei': "FDP",
            'Name': name,
            'Twitter': twitter['href'] if twitter is not None else ""
        }
    )
    
# CDU/CSU: the name is the <span> inside the entry's <h2>; the Twitter link is
# the first <a href> inside the element with class 'twitter', if there is one.
for abg in all_abg_cdu:
    twitter_box = abg.find(class_ = 'twitter')
    member_name = abg.find('h2').find('span').text.strip(' ')
    if twitter_box is None:
        twitter_url = ""
    else:
        twitter_url = twitter_box.find('a', href = True)['href']
    twitter_list.append({'Partei': "CDU/CSU", 'Name': member_name, 'Twitter': twitter_url})
    
# SPD: the name is the link text inside the entry's <h3>; the Twitter URL is
# the href of the element with class 'ico_twitter', if there is one.
for abg in all_abg_spd:
    twitter_link = abg.find(class_ = 'ico_twitter')
    member_name = abg.find('h3').find('a').get_text().strip(' ')
    if twitter_link is None:
        twitter_url = ""
    else:
        twitter_url = twitter_link['href']
    twitter_list.append({'Partei': "SPD", 'Name': member_name, 'Twitter': twitter_url})
    
# Die Linke: every member page is downloaded individually. The name is the
# page's <h1>; the Twitter URL is the href of the link whose text contains
# 'Twitter-Profil', or "" if no such link exists.
for abg in all_abg_linke:
    # One request per member: a timeout keeps a single stalled connection from
    # hanging the whole loop indefinitely.
    abg_source = requests.get(abg, timeout=30).text
    abg_soup = BeautifulSoup(abg_source, 'html.parser')
    twitter = abg_soup.find('a', text = re.compile('Twitter-Profil'))
    twitter_list.append(
        {
            'Partei': "Die Linke",
            'Name': abg_soup.find('h1').text.strip(' '),
            'Twitter': twitter['href'] if twitter is not None else ""
        }
    )

In [79]:
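# NOTE: disabled draft for reducing the Twitter URLs to bare handles. It would not
# work as written: the column is named 'Twitter' (capital T), and str.lstrip()
# removes any leading characters from the given set rather than a literal prefix,
# so it can also remove the first characters of the handle itself.
# twitter_df = pd.DataFrame(twitter_list)
# twitter_df['twitter'] = twitter_df['twitter'].apply(lambda x: x.lstrip('http://twitter.com/'))
# twitter_df['twitter'] = twitter_df['twitter'].apply(lambda x: x.lstrip('https://twitter.com/'))
# twitter_df['twitter'] = twitter_df['twitter'].apply(lambda x: x.strip(''))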

In [489]:
# Load the previously saved members DataFrame.
# NOTE(review): pickle.load executes arbitrary code from the file -- only use
# this on a pickle produced by this project, never on an untrusted file.
with open('abg_df.pickle', 'rb') as pickle_file:
    df = pickle.load(pickle_file)

In [490]:
# Build a normalised "First Last" key for every member in `df`, used to match
# against the scraped Twitter records.
# Raw strings are used so that the backslashes reach the regex engine without
# relying on Python's handling of invalid string escapes such as "\(".
bracketed_re = re.compile(r"[\(\[].*?[\)\]]")       # "(...)" / "[...]" additions
leading_titles_re = re.compile(r'^(?:\w{1,6}\. ?)+')  # run of leading "Dr. ", "Prof. ", ...

matching_list = []
for name in df['Name']:
    # "Last, First" -> "First Last": reverse the comma-separated parts
    first_last = ' '.join(name.split(',')[::-1])
    interim = bracketed_re.sub("", first_last).strip(' ')
    # One pass removes every leading abbreviated title; the pattern repeats
    # itself instead of the substitution being applied a fixed number of times.
    interim = leading_titles_re.sub('', interim)
    # transliterate to ASCII (e.g. umlauts) so both sources compare equal
    interim = unidecode.unidecode(interim).strip(' ')
    # collapse runs of spaces left behind by the removals
    matching_list.append(re.sub(' +', ' ', interim))
df['Name_matching'] = matching_list

In [491]:
# AfD records come from a semicolon-separated, ISO-8859-1 encoded file.
# Its three columns are renamed and then reordered to match the
# ['Partei', 'Name', 'Twitter'] layout of the scraped records.
afd_raw = pd.read_csv('AFD.csv', sep=';', encoding="ISO-8859-1")
afd_raw.columns = ['Name', 'Partei', 'Twitter']
columns_titles = ['Partei', 'Name', 'Twitter']
afd_df = afd_raw.reindex(columns=columns_titles)

In [492]:
# Combine the scraped records with the AfD records into one frame.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and
# removed in pandas 2.0. As with append, the original row labels are kept
# (no ignore_index).
twitter_df = pd.DataFrame(twitter_list)
twitter_df = pd.concat([twitter_df, afd_df])

In [493]:
# Build the normalised name key for the Twitter records. Names are used in the
# order given; the same clean-up is applied as for df['Name_matching'] so that
# the two keys are comparable.
# Raw strings are used so that the backslashes reach the regex engine without
# relying on Python's handling of invalid string escapes such as "\(".
twitter_bracketed_re = re.compile(r"[\(\[].*?[\)\]]")       # "(...)" / "[...]" additions
twitter_leading_titles_re = re.compile(r'^(?:\w{1,6}\. ?)+')  # run of leading "Dr. ", "Prof. ", ...

matching_list_twitter = []
for name in twitter_df['Name']:
    interim = twitter_bracketed_re.sub("", name).strip(' ')
    # One pass removes every leading abbreviated title, however many there are,
    # so this key cannot keep a title that the other key has already dropped.
    interim = twitter_leading_titles_re.sub('', interim)
    # transliterate to ASCII (e.g. umlauts) so both sources compare equal
    interim = unidecode.unidecode(interim).strip(' ')
    # collapse runs of spaces left behind by the removals
    matching_list_twitter.append(re.sub(' +', ' ', interim))
twitter_df['Name_matching_control'] = matching_list_twitter

In [494]:
# Left-join the Twitter records onto the members frame via the normalised name
# keys. Every row of `df` is kept; columns of `twitter_df` whose names collide
# with existing ones receive the suffix '_right'.
df = df.merge(
    twitter_df,
    how='left',
    left_on='Name_matching',
    right_on='Name_matching_control',
    suffixes=('', '_right'),
)

In [495]:
# non_matches = []
# for i in range(len(df)):
#     if (df['Partei'][i] != df['Partei_right'][i] and df['Partei'][i] in ('SPD', 'Die Linke', 'CDU/CSU', 'FDP', 'AFD')):
#         non_matches.append(i)
        
# non_matches_twitter = []
# for i in non_matches:
#     non_match = df['Name_matching'][i].split()[-1]
#     for name in twitter_df['Name_matching_control']:
#         if non_match in name:
#             non_matches_twitter.append([i, twitter_df.loc[twitter_df['Name_matching_control'] == name].index[0]])

# for i, j in non_matches_twitter:
#     df['Name_matching'][i] = twitter_df['Name_matching_control'][j]
#     df['Twitter'][i] = twitter_df['Twitter'][j]

In [507]:
# Store each row's Twitter value under the 'Twitter' key of that row's
# "Soziale Medien" entry (the entry is modified in place).
for social_media, twitter_value in zip(df['Soziale Medien'], df['Twitter']):
    social_media['Twitter'] = twitter_value

# The remaining columns were only needed for the merge; remove them again.
merge_only_columns = ['Name_matching', 'Partei_right', 'Name_right', 'Twitter', 'Name_matching_control']
df = df.drop(columns=merge_only_columns)

In [510]:
# Write the enriched frame back to disk using the highest pickle protocol.
# Note: this is the same file name the frame was loaded from, so the input
# file is overwritten.
with open('abg_df.pickle', 'wb') as pickle_file:
    pickle.dump(df, pickle_file, protocol=pickle.HIGHEST_PROTOCOL)