In [659]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import re
import pickle
import numpy as np
import unidecode

In [514]:
fdp = "https://www.fdpbt.de/fraktion/abgeordnete"
source_fdp = requests.get(fdp).text
soup_fdp = BeautifulSoup(source_fdp, 'html.parser')

cdu = "https://www.cducsu.de/hier-stellt-die-cducsu-bundestagsfraktion-ihre-abgeordneten-vor"
source_cdu = requests.get(cdu).text
soup_cdu = BeautifulSoup(source_cdu, 'html.parser')

spd = "https://www.spdfraktion.de/abgeordnete/alle?wp=19&view=list&old=19"
source_spd = requests.get(spd).text
soup_spd = BeautifulSoup(source_spd, 'html.parser')

gruene = "https://www.gruene-bundestag.de/abgeordnete"
source_gruene = requests.get(gruene).text
soup_gruene = BeautifulSoup(source_gruene, 'html.parser')

# for Die Linke, one needs to extract the twitter info from each individual MdB website
linke_base = "https://www.linksfraktion.de/fraktion/abgeordnete/"
letters = [['a', 'e'], ['f', 'j'], ['k', 'o'], ['p', 't'], ['u', 'z']]
linke_name_bins = []

for letter in letters:
    extension = f'{letter[0]}-bis-{letter[1]}/' 
    linke_name_bins.append(linke_base + extension)

In [515]:
all_abg_fdp = soup_fdp.find(class_ = 'person-list').find_all(class_ = 'person-item-wrapper')
all_abg_cdu = soup_cdu.find_all(class_ = 'teaser delegates')
all_abg_spd = soup_spd.find_all(class_ = 'views-row')

extensions_gruene = soup_gruene.find_all('a', class_="abgeordneteTeaser__wrapper")
urlbase_gruene = 'https://www.gruene-bundestag.de'
all_abg_gruene = []
for a in extensions_gruene:
    extension = a['href']
    link = urlbase_gruene + str(extension)
    all_abg_gruene.append(link)
    
all_abg_linke = []
for name_bin in linke_name_bins:
    source = requests.get(name_bin).text
    soup = BeautifulSoup(source, 'html.parser')
    for abg in soup.find_all('div', attrs = {'class': 'col-xs-12 col-sm-12 col-md-6 col-lg-6'}):
        extension = abg.find('h2').find('a')['href'].lstrip('/fraktion/abgeordnete/')
        all_abg_linke.append(linke_base + extension)

In [516]:
twitter_list = []
for abg in all_abg_fdp:
    name_field = abg.find(class_ = 'person-name')
    funktion = name_field.find('span').text.strip()
    name = name_field.text.strip('\n').strip().rstrip(funktion).strip('\n').strip()
    twitter = abg.find('a', attrs = {'class': 'tw'}, href = True)
    twitter_list.append(
        {
        'Partei': "FDP",
        'Name': name,
        'Twitter': twitter['href'] if twitter is not None else ""
            }
        )
    
for abg in all_abg_cdu:
    twitter = abg.find(class_ = 'twitter')
    twitter_list.append(
        {
        'Partei': "CDU/CSU",
        'Name': abg.find('h2').find('span').text.strip(' '),
        'Twitter': twitter.find('a', href = True)['href'] if twitter is not None else ""
            }
        )
    
for abg in all_abg_spd:
    twitter = abg.find(class_ = 'ico_twitter')
    twitter_list.append(
        {
        'Partei': "SPD",
        'Name': abg.find('h3').find('a').get_text().strip(' '),
        'Twitter': twitter['href'] if twitter is not None else ""
            }
        )
    
for abg in all_abg_gruene:
    abg_source = requests.get(abg).text
    abg_soup = BeautifulSoup(abg_source, 'html.parser')
    hrefss = []
    twitter = ""
    for x in abg_soup.find_all(class_="weitereInfoTeaser"):
        for y in x.find_all('a', href = True):
            z = y['href']
            hrefss.append(z)
            for i in hrefss:
                if "twitter" not in i:
                    continue 
                else:
                    twitter = i
                    
    twitter_list.append(           
        {
        'Partei': "Bündnis 90/Die Grünen",
        'Name': abg_soup.find('h1').text,
        'Twitter': twitter
            }
        )
    
for abg in all_abg_linke:
    abg_source = requests.get(abg).text
    abg_soup = BeautifulSoup(abg_source, 'html.parser')
    twitter = abg_soup.find('a', text = re.compile('Twitter-Profil'))
    twitter_list.append(
        {
        'Partei': "Die Linke",
        'Name': abg_soup.find('h1').text.strip(' '),
        'Twitter': twitter['href'] if twitter is not None else ""
            }
        )

In [826]:
afd_df = pd.read_csv('AFD.csv', encoding = "ISO-8859-1", delimiter = ';')
afd_df.columns = ['Name', 'Partei', 'Twitter']
columns_titles = ['Partei', 'Name', 'Twitter']
afd_df=afd_df.reindex(columns=columns_titles)

In [827]:
twitter_df = pd.DataFrame(twitter_list)
twitter_df = twitter_df.append(afd_df)

In [828]:
def name_prep_twitter(name):
    interim = re.sub("[\(\[].*?[\)\]]", "", name).strip(' ')
    interim = re.sub(r'(^\w{1,6}\. ?)', r'', interim)
    interim = re.sub(r'(^\w{1,6}\. ?)', r'', interim)
    interim = re.sub(r'(^\w{1,6}\. ?)', r'', interim)
    interim = re.sub(r'(^\w{1,6}\. ?)', r'', interim)
    interim = re.sub(r'(^\w{1,6}\. ?)', r'', interim)
    interim = unidecode.unidecode(interim).strip(' ')
    interim = re.sub(' +', ' ', interim)
    if len(interim.split()) > 2:
        if interim.split()[0].endswith(('.', 'med', 'forest')):
            first_name = interim.split()[1]
        else:
            first_name = interim.split()[0]   
        last_name = interim.split()[-1]
        return (first_name + ' ' + last_name)
    else:
        return interim

In [829]:
twitter_df['Name_matching'] = twitter_df['Name'].apply(name_prep_twitter)

In [830]:
def name_prep(name):
    interim = re.sub("[\(\[].*?[\)\]]", "", ' '.join(name.split(',')[::-1])).strip(' ') # placing first name before last name
    interim = re.sub(r'(^\w{1,6}\. ?)', r'', interim) # stripping titles
    interim = re.sub(r'(^\w{1,6}\. ?)', r'', interim)
    interim = re.sub(r'(^\w{1,6}\. ?)', r'', interim)
    interim = re.sub(r'(^\w{1,6}\. ?)', r'', interim)
    interim = re.sub(r'(^\w{1,6}\. ?)', r'', interim)
    interim = re.sub(r'(^\w{1,6}\. ?)', r'', interim)
    interim = re.sub(r'(^\w{1,6}\. ?)', r'', interim)
    interim = re.sub(r'(^\w{1,6}\. ?)', r'', interim)
    interim = unidecode.unidecode(interim).strip(' ')
    interim = re.sub(' +', ' ', interim)
    if len(interim.split()) > 2:
        if interim.split()[0].endswith(('.', 'med', 'forest')):
            first_name = interim.split()[1]
        else:
            first_name = interim.split()[0]   
        last_name = interim.split()[-1]
        return (first_name + ' ' + last_name)
    else:
        return interim

In [854]:
with open('abg_df.pickle', 'rb') as handle:
    df = pickle.load(handle)

In [855]:
df['Name_matching'] = df['Name'].apply(name_prep)

In [856]:
# merging dataframes (df and twitter_df)
df = pd.merge(df, twitter_df, how = 'left', left_on = 'Name_matching', right_on = 'Name_matching', suffixes = ('', '_right'))

In [857]:
df.head()

Unnamed: 0,Name,Partei,Wahlart,Bundesland,Wahlkreis,Ausschuesse,Soziale Medien,Biografie,Twitter,Name_matching,Partei_right,Name_right,Twitter_right
0,"Abercron, Dr. Michael von",CDU/CSU,Direkt gewählt,Schleswig-Holstein,Wahlkreis 007: Pinneberg,{'Ordentliches Mitglied': ['Ausschuss für Ernä...,{'von-abercron.de/': 'http://www.von-abercron....,Geboren am 17. November 1952 in Ehlers...,https://twitter.com/mvabercron,Michael Abercron,CDU/CSU,Dr. Michael von Abercron,https://twitter.com/mvabercron
1,"Achelwilm, Doris",Die Linke,Gewählt über Landesliste,Bremen,n.a.,{'Ordentliches Mitglied': ['Ausschuss für Fami...,{'doris-achelwilm.de': 'http://www.doris-achel...,Geboren am 30. November 1976 in Thuine...,https://twitter.com/doris_achelwilm,Doris Achelwilm,Die Linke,Doris Achelwilm,https://twitter.com/doris_achelwilm
2,"Aggelidis, Grigorios",FDP,Gewählt über Landesliste,Niedersachsen,Wahlkreis 043: Hannover-Land I,{'Ordentliches Mitglied': ['Kuratorium der Bun...,{'grigorios-aggelidis.de': 'http://www.grigori...,Geboren am 19. August 1965 in Hannover...,http://www.twitter.com/Aggelidis_FDP,Grigorios Aggelidis,FDP,Grigorios Aggelidis,http://www.twitter.com/Aggelidis_FDP
3,"Akbulut, Gökay",Die Linke,Gewählt über Landesliste,Baden-Württemberg,Wahlkreis 275: Mannheim,"{'Ordentliches Mitglied': ['Schriftführer/in',...",{'goekay-akbulut.de': 'https://goekay-akbulut....,Geboren 1982 in Pinarbasi/ Türkei; ledig.Juni ...,https://twitter.com/akbulutgokay,Gokay Akbulut,Die Linke,Gökay Akbulut,https://twitter.com/akbulutgokay
4,"Albani, Stephan",CDU/CSU,Gewählt über Landesliste,Niedersachsen,Wahlkreis 027: Oldenburg – Ammerland,{'Ordentliches Mitglied': ['Ausschuss für Bild...,{'stephan-albani.de': 'http://www.stephan-alba...,Geboren am 3. Juni 1968 in Göttingen; verheira...,,Stephan Albani,CDU/CSU,Stephan Albani,


In [858]:
# non_matches = []
# for i in range(len(df)):
#     if (df['Name_matching'][i] != df['Name_matching'][i]):
#         non_matches.append(i)

# print(non_matches)

In [859]:
mask1 = (df['Twitter'] == "")
mask2 = (df['Twitter'].isnull())
def get_twitter_from_dict(x):
    if 'Twitter' in x:
        return x['Twitter']

In [860]:
# filling up Twitter-column with values from Soziale Medien-dictionary
mask1 = (df['Twitter'] == "")
mask2 = (df['Twitter'].isnull())
df.loc[mask1 | mask2]['Twitter'] = df.loc[mask1 | mask2]['Soziale Medien'].apply(get_twitter_from_dict)

# dropping columns used for merging only
df = df.drop(['Name_matching', 'Partei_right', 'Name_right'], axis = 1)   

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [861]:
df.head()

Unnamed: 0,Name,Partei,Wahlart,Bundesland,Wahlkreis,Ausschuesse,Soziale Medien,Biografie,Twitter,Twitter_right
0,"Abercron, Dr. Michael von",CDU/CSU,Direkt gewählt,Schleswig-Holstein,Wahlkreis 007: Pinneberg,{'Ordentliches Mitglied': ['Ausschuss für Ernä...,{'von-abercron.de/': 'http://www.von-abercron....,Geboren am 17. November 1952 in Ehlers...,https://twitter.com/mvabercron,https://twitter.com/mvabercron
1,"Achelwilm, Doris",Die Linke,Gewählt über Landesliste,Bremen,n.a.,{'Ordentliches Mitglied': ['Ausschuss für Fami...,{'doris-achelwilm.de': 'http://www.doris-achel...,Geboren am 30. November 1976 in Thuine...,https://twitter.com/doris_achelwilm,https://twitter.com/doris_achelwilm
2,"Aggelidis, Grigorios",FDP,Gewählt über Landesliste,Niedersachsen,Wahlkreis 043: Hannover-Land I,{'Ordentliches Mitglied': ['Kuratorium der Bun...,{'grigorios-aggelidis.de': 'http://www.grigori...,Geboren am 19. August 1965 in Hannover...,http://www.twitter.com/Aggelidis_FDP,http://www.twitter.com/Aggelidis_FDP
3,"Akbulut, Gökay",Die Linke,Gewählt über Landesliste,Baden-Württemberg,Wahlkreis 275: Mannheim,"{'Ordentliches Mitglied': ['Schriftführer/in',...",{'goekay-akbulut.de': 'https://goekay-akbulut....,Geboren 1982 in Pinarbasi/ Türkei; ledig.Juni ...,https://twitter.com/akbulutgokay,https://twitter.com/akbulutgokay
4,"Albani, Stephan",CDU/CSU,Gewählt über Landesliste,Niedersachsen,Wahlkreis 027: Oldenburg – Ammerland,{'Ordentliches Mitglied': ['Ausschuss für Bild...,{'stephan-albani.de': 'http://www.stephan-alba...,Geboren am 3. Juni 1968 in Göttingen; verheira...,,


In [862]:
mask1 = df['Twitter'].isnull()

def get_username(url):
    if url.startswith('http'):
        return(url.split('/')[3].split('?')[0])
    else:
        return(url.split('?')[0])

In [863]:
df['Twitter'] = df['Twitter'][mask1 == False].apply(get_username)

In [864]:
with open('abg_df.pickle', 'wb') as handle:
    pickle.dump(df, handle, protocol=pickle.HIGHEST_PROTOCOL)