In [1]:
import xml.etree.ElementTree as ET
from datetime import datetime
from googlesearch import search
import time
import requests
import json
import re
from bs4 import BeautifulSoup
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter
import string
import os



In [2]:
def remove_suffixes(name):
    
    suffixes = []
    new_name = []

    for part in name.split(' '):
        if '.' not in part and part.isupper() or part == 'MSc':
            suffixes.append(part + ' ')
        else:
            new_name.append(part)
    
    return ' '.join(new_name), suffixes


def remove_prefixes(name):
    
    if 'De heer.' in name:
        name = name.replace('De heer.', 'De heer')
    
    pattern = r"(?!ij|th|ph|hr|ch|jr|sr)[a-z]{2,}\.\s?"
    prefixes = re.findall(pattern, name, flags=re.IGNORECASE)
    
    name = re.sub(pattern, '', name, flags=re.IGNORECASE).strip()
    
    if name.split(' ')[0].lower() == 'mevrouw':
        name = ' '.join(name.split(' ')[1:])
        prefixes.append('mew.')
        
    
    if ' '.join(name.split(' ')[0:2]).lower() == 'de heer':
        print
        name = ' '.join(name.split(' ')[2:])
        prefixes.append('dhr.')
    
    return name, prefixes


def remove_army_titles(name):
    
    titles = []
    new_name = []
    
    # these army titles are gathered from the ministery of defense
    army_titles = {'Generaal-majoor', 'Generaal', 'Commandeur-arts', 'Brigade-generaal', 'Commodore', 'Kapitein-ter-zee', 'Brigadegeneraal', 'Kolonel-vlieger', 'Commandeur', 'Schout-bij-nacht', 'Luitenant-kolonel', 'Cdre', 'Majoor', 'Vice-admiraal', 'Kolonel', 'Luitenant-generaal'}

    multi_word_army_titles = ['Generaal-majoor der mariniers', 'Generaal-majoor der Cavalerie', 'Luitenant-generaal der mariniers', 'Brigadegeneraal der mariniers']
    
    for Title in multi_word_army_titles:
        if Title in name:
            titles.append(Title + ' ')
            name = name.split(f'{Title} ')[1]
    
    for part in name.split(' '):
        if part in army_titles:
            titles.append(part + ' ')
        else:
            new_name.append(part)
    
    return ' '.join(new_name), titles
            


In [3]:
def strip_titles_from_name(name):
    """
        Tries to remove all prefixes, army titles and suffixes and from the name string
        and tries to leave the initials and the rest of the name in it's place. 
    """

    # prefixes are very common among names in the XML file
    name, prefixes = remove_prefixes(name)
    
    # Sometimes the name string may contain an army title (especially in the ministry of defense)
    name, army_title = remove_army_titles(name)
    
    # Suffixes in the XML file are always fully Capitalized except if it is MSc, these are separated from the name
    name, suffixes = remove_suffixes(name)
    
    if prefixes or suffixes or army_title:
        return prefixes + army_title + suffixes, name
    
    else:
        return '', name.strip()

print(strip_titles_from_name('De heer. A. Choho'))

(['dhr.'], 'A. Choho')


In [4]:
def xpath(element, path, namespaces):
    obj = element.find(path, namespaces)
    
    if obj is not None:
        return obj.text
    else:
        return ''

In [5]:
def xpath_adres(element, namespaces, Type):
    
    Incomplete = False
    
    if Type == 'Bezoek':
    
        obj = element.find('p:adressen/p:adres[p:type="Bezoekadres"]', namespaces)

        if obj is not None:
            
            straat = obj.find('p:straat', namespaces)
            if straat is not None:
                straat = straat.text
            else:
                Incomplete = True
            
            huisnummer = obj.find('p:huisnummer', namespaces)
            if huisnummer is not None:
                huisnummer = huisnummer.text
            else:
                Incomplete = True
            
            postcode = obj.find('p:postcode', namespaces)
            if postcode is not None:
                postcode = postcode.text
            else:
                Incomplete = True
                
            plaats = obj.find('p:plaats', namespaces)
            if plaats is not None:
                plaats = plaats.text.title()
                
            else:
                Incomplete = True
                
            if Incomplete == True:
                return ''

            return f"{straat} {huisnummer} {postcode} {plaats}"
         
        else:
            return ''
        
    if Type == 'Post':
        
        obj = element.find('p:adressen/p:adres[p:type="Postadres"]', namespaces)
        
        if obj is not None:
            postbus = obj.find('p:postbus', namespaces)
            if postbus is not None:
                postbus = postbus.text
            else:
                Incomplete = True
            
            postcode = obj.find('p:postcode', namespaces)
            if postcode is not None:
                postcode = postcode.text
            else:
                Incomplete = True
            
            plaats = obj.find('p:plaats', namespaces)
            if plaats is not None:
                plaats = plaats.text.title()
                
            else:
                Incomplete = True
                
            if Incomplete == True:
                return ''
            

            return f"{postbus} {postcode} {plaats}"
         
        else:
            return ''
        
    else:
        return ''

In [6]:
def xpath_TOOI(element, path, namespaces, afkorting):
    
    dc_publisher = element.find(path, namespaces)
    if dc_publisher is not None:

        # hier wordt de afkorting gebruikt bijv. ws = waterschappen
        if dc_publisher.text.rsplit('/')[-1][:len(afkorting)] == afkorting:
            
            return dc_publisher.text.rsplit('/')[-1]
        else:
            return ''
    else:
        return ''
    

In [7]:
def test_name(initial_name_txt):
    if initial_name_txt is not None:
        
        pattern = r'\((.*?)\)'
        matches = re.findall(pattern, initial_name_txt)
        
        initial_name_txt = re.sub(pattern, '', initial_name_txt)
        if " (" in initial_name_txt:
            initial_name_txt = initial_name_txt.split(' (')[0]
        
        if '  ' in initial_name_txt:
            initial_name_txt = initial_name_txt.replace('  ', ' ')

        # er staat vaak een prefix als dr. of mew. voor deze wordt verwijderd?
        prefixes, name_without_prefix = strip_titles_from_name(initial_name_txt)
        name_without_prefix = name_without_prefix.lstrip()
        
        if '.' in name_without_prefix and '. ' not in name_without_prefix:
            name_without_prefix = name_without_prefix.replace(".", ". ")
        
        if '. ' in name_without_prefix:
            c = name_without_prefix.count('. ')
            name_without_prefix = name_without_prefix.replace(". ", ".", c-1)

        initial_name_split = name_without_prefix.split(' ', 1)

        # bij gemeenten staat er een persoon in als mew. Schouten dus zonder initialen. 
        if len(initial_name_split) == 2:
            first_word = initial_name_split[0]
            if '.' not in first_word and len(first_word) != 1:

                foaf_firstName = initial_name_split[0]
                foaf_initials = foaf_firstName[0].upper() + '.'
                
            else:
                foaf_initials = initial_name_split[0]
                foaf_firstName = ''

            foaf_lastName = initial_name_split[1]
            x = foaf_lastName.split(' ')
            if x[0] != '':
                if x[0][-1] == '.':
                    foaf_lastName = ' '.join(x[1:])
                    foaf_initials = foaf_initials + x[0]
        
        else:
            foaf_initials = ''
            foaf_firstName = ''
            foaf_lastName = initial_name_split[0]
    
    # gebeurt in de praktijk niet maar maakt het wel failproof
    else:
        foaf_initials = ''
        foaf_firstName = ''
        foaf_lastName = ''
        
    prefixes = ''.join(prefixes)
    prefixes = prefixes.rstrip()
    
    if matches != []:
        if len(matches[0].split(' ')) == 1:
            foaf_firstName = matches[0]
            
            if '.' in foaf_firstName or foaf_firstName.isupper():
                foaf_firstName = ''
        
    if foaf_firstName != '':
        name_without_prefix = foaf_firstName + ' ' + foaf_lastName
    
    if foaf_firstName in ['van', 'de']:
        
        foaf_lastName = foaf_firstName + ' ' + foaf_lastName
        foaf_firstName = ''
        
    if foaf_firstName.isupper() == True:
        foaf_initials = '.'.join(foaf_firstName)
        foaf_firstName = ''
       
    return foaf_initials, foaf_firstName, foaf_lastName, name_without_prefix, prefixes



In [8]:
tree = ET.parse("exportOO_full.xml")
    
# Define the namespaces used in the XML document
namespaces = {
    'p': 'https://organisaties.overheid.nl/static/schema/oo/export/2.6.4'
}

afkortingen = {'Waterschap': 'ws',
              'Gemeente': 'gm',
               'Provincie': 'pv',
               'Ministerie': 'mnre'
              }


organisatie_elements = tree.findall(f'p:organisaties/p:organisatie', namespaces)

for organisatie in organisatie_elements:

    functie_element = organisatie.findall('p:functies/p:functie', namespaces)

    for functie in functie_element:

        medewerker_element = functie.findall('p:medewerkers/p:medewerker', namespaces)

        # loop over alle medewerkers om per medewerker gegevens te verzamelen
        for medewerker in medewerker_element:

            initial_name = medewerker.find('p:naam', namespaces)
            X = test_name(initial_name.text)
            if X[1] != '':
                print(X[1])
                

Annelore
Igor
Gerhard
Marlies
Alexander
Bert
Karin
Joke
Mirjam
Nanning
Gert
Alexander
Bert
Nanning
Geert
Sieb
Suzanne
Esther
Bas
Michiel
Ge
Piet
Renée
Gert
Margriet
Annelies
Léonie
Romeo
Carolien
Marien
Renée
Tessa
Tello
Meike
Rosalie
Jurriaan
Erik
Caroline
Arjan
Marjolein
Robin
Judith
Jane
Loes
Smit-
Smit-
Rien
Han
Suzanne
Paula
Afelonne
Tom
Iep
Marc
Annemieke
Erik
Mohammed
Marjolijn
Simone
Robert
Win
Gerben
Karin
Marian
Nico
Herman
Fione
Rosalien
Wim
Martien
Yannick
Axel
Gerard
Maarten
Edgar
Kristel
Ronald
Marijke
Janneke
Irene
Ninke
Ariën
Daphne
Marieke
José
Hennie
Harmen
Dirk
Richard
Elise
Ann
Pieter
Careen
Eva
Richard
Bauke
Rob
Liesbeth
Henk
Stan
André
Yvonne
Dick
Fione
Bart
Vincent
Thijs
Harry
Win
Timo
Pablo
Léon
Petro
Burgemeester
Stannie
Jet
Eppo
Patrick
Jan
Hartogh
Hartogh
Wolf
Monique
Tesseltje
Kristel
Jakob
Edith
Mirjam
Rob
Maria
Hilgenga-van
Hilgenga-van
Michael
Mariëtte
Michèle
Albert
Henkjan
Annette
Hans
Maurice
Maurice
Mark
Aarts-van
Noël
Noël
Gerard
Robert
Betty
Marko
K

In [9]:
def xpath_naam(element, path, namespaces):
    # verzamel de naam in de XML
    initial_name = element.find(path, namespaces)
    
    initial_name_txt = initial_name.text
    
    if initial_name_txt is not None:
        
        pattern = r'\((.*?)\)'
        matches = re.findall(pattern, initial_name_txt)
        
        initial_name_txt = re.sub(pattern, '', initial_name_txt)
        if " (" in initial_name_txt:
            initial_name_txt = initial_name_txt.split(' (')[0]
        
        if '  ' in initial_name_txt:
            initial_name_txt = initial_name_txt.replace('  ', ' ')

        # er staat vaak een prefix als dr. of mew. voor deze wordt verwijderd?
        prefixes, name_without_prefix = strip_titles_from_name(initial_name_txt)
        name_without_prefix = name_without_prefix.lstrip()
        
        if '.' in name_without_prefix and '. ' not in name_without_prefix:
            name_without_prefix = name_without_prefix.replace(".", ". ")
        
        if '. ' in name_without_prefix:
            c = name_without_prefix.count('. ')
            name_without_prefix = name_without_prefix.replace(". ", ".", c-1)

        initial_name_split = name_without_prefix.split(' ', 1)

        # bij gemeenten staat er een persoon in als mew. Schouten dus zonder initialen. 
        if len(initial_name_split) == 2:
            first_word = initial_name_split[0]
            if '.' not in first_word and len(first_word) != 1:

                foaf_firstName = initial_name_split[0]
                foaf_initials = foaf_firstName[0].upper() + '.'
                
            else:
                foaf_initials = initial_name_split[0]
                foaf_firstName = ''

            foaf_lastName = initial_name_split[1]
            x = foaf_lastName.split(' ')
            if x[0] != '':
                if x[0][-1] == '.':
                    foaf_lastName = ' '.join(x[1:])
                    foaf_initials = foaf_initials + x[0]
        
        else:
            foaf_initials = ''
            foaf_firstName = ''
            foaf_lastName = initial_name_split[0]
    
    # gebeurt in de praktijk niet maar maakt het wel failproof
    else:
        foaf_initials = ''
        foaf_firstName = ''
        foaf_lastName = ''
        
    prefixes = ''.join(prefixes)
    prefixes = prefixes.rstrip()
    
    if matches != []:
        if len(matches[0].split(' ')) == 1:
            foaf_firstName = matches[0]
            
            if '.' in foaf_firstName or foaf_firstName.isupper():
                foaf_firstName = ''
        
    if foaf_firstName != '':
        name_without_prefix = foaf_firstName + ' ' + foaf_lastName
    
    if foaf_firstName in ['van', 'de']:
        
        foaf_lastName = foaf_firstName + ' ' + foaf_lastName
        foaf_firstName = ''
        
    if foaf_firstName.isupper() == True:
        foaf_initials = '.'.join(foaf_firstName)
        foaf_firstName = ''
       
    return foaf_initials, foaf_firstName, foaf_lastName, name_without_prefix, prefixes

In [10]:
def json_create_single_layer(organisatie_type):
    
    list_of_dicts = []
    
    # afkorting zal later worden gebruikt
    afkortingen = {'Waterschap': 'ws',
                  'Gemeente': 'gm',
                   'Agentschap': 'oorg',
                    'Ministerie': 'mnre',
                   'Hoog College van Staat': 'oorg'
                  }
    afkorting = afkortingen[organisatie_type]
    
    tree = ET.parse("exportOO_full.xml")
    
    # Define the namespaces used in the XML document
    namespaces = {
        'p': 'https://organisaties.overheid.nl/static/schema/oo/export/2.6.4'
    }

    # Verzamel de huidige datum
    foi_retrievedDate = datetime.today().strftime('%Y-%m-%d')

    # En het jaar
    dc_date_year = foi_retrievedDate[:4]


    # Verkrijg alle organisatie elementen (gemeenten, waterschappen etc.)
    organisatie_elements = tree.findall(f'//p:organisaties/p:organisatie/[p:types = "{organisatie_type}"]', namespaces)
    
    # loop over deze elementen en verzamel gegevens
    for organisatie in organisatie_elements:

        # In de resourceIdentifierTOOI staat de organisatiecode en veramel deze code
        dc_publisher = xpath_TOOI(organisatie, './/p:identificatiecodes/p:resourceIdentifier[@p:naam="resourceIdentifierTOOI"]', namespaces, afkorting) 
        
        dc_publisher_name = xpath(organisatie, 'p:naam', namespaces)

        website_txt = xpath(organisatie, './/p:contact/p:internetadressen/p:internetadres/p:url', namespaces)

        Type = xpath(organisatie, 'p:types/p:type', namespaces)
        
        foi_endDate = xpath(organisatie, 'p:eindDatum', namespaces)
        
        # in het functie_element staan alle verschillende functies
        functie_element = organisatie.findall('p:functies/p:functie', namespaces)

        # loop over deze functies
        for functie in functie_element:

            foi_count = 0
            DICT = {}

            foaf_function_type = xpath(functie, 'p:naam', namespaces)

            # vindt alle medewerkers die bij deze functie horen
            medewerker_element = functie.findall('p:medewerkers/p:medewerker', namespaces)

            # loop over alle medewerkers om per medewerker gegevens te verzamelen
            for medewerker in medewerker_element:
                
                foi_party = xpath(medewerker, 'p:partijLidmaatschap', namespaces)

                foaf_initials, foaf_firstName, foaf_lastName, name_without_prefix, prefixes = xpath_naam(medewerker, 'p:naam', namespaces)
                
                # optioneel startdatum, telefoonnummer en mailadres
                foi_startDate = xpath(medewerker, 'p:startDatum', namespaces)

                foaf_phone = xpath(medewerker, 'p:contact/p:telefoonnummers/p:telefoonnummer/p:nummer', namespaces)

                foaf_mbox = xpath(medewerker, 'p:contact/p:emailadressen/p:emailadres/p:email', namespaces)
                
                foi_bezoekadres = xpath_adres(medewerker, namespaces, 'Bezoek')
                
                foi_postadres = xpath_adres(medewerker, namespaces, 'Post')
                
                # genereer beschikbaarheidsgegevens
                if organisatie_type == 'Waterschap':
                    bereikbaarheidsgegevens = f"Bereikbaarheidsgegevens van {name_without_prefix}, {foaf_function_type} voor {Type.lower()} {dc_publisher_name}"
                else:
                    if foi_party != '':
                        bereikbaarheidsgegevens = f"Bereikbaarheidsgegevens van {name_without_prefix}, {foaf_function_type} voor {foi_party} in de {dc_publisher_name}"
                    else:
                        bereikbaarheidsgegevens = f"bereikbaarheidsgegevens van {name_without_prefix}, {foaf_function_type} voor {dc_publisher_name}"
                
#                 query = f"{name_without_prefix} {foaf_function_type} '{dc_publisher_name}'"
                
#                 linkedin_hrefs = search_yahoo('site:linkedin.com ' + query)
#                 foi_linkedin = extract_urls(linkedin_hrefs, 'linkedin.com', foaf_lastName)
                
#                 twitter_hrefs = search_yahoo('site:twitter.com ' + query)
                
#                 foi_twitter = extract_urls(twitter_hrefs,'twitter.com', foaf_lastName)
                
                
#                 wikipedia_hrefs = search_yahoo('site:nl.wikipedia.org ' + query)

#                 foi_wikipedia = extract_urls(wikipedia_hrefs, 'nl.wikipedia.org',foaf_lastName)
                

#                 website_hrefs = search_yahoo(f'site:{website_txt} ' + query)
#                 foi_website = extract_urls(website_hrefs, website_txt, foaf_lastName)                        
                                
                    
                # vul de dictionary in
                Dict = {
                        'dc_identifier': f"nl.{dc_publisher}.{foaf_function_type}.{dc_date_year}.{foi_count + 1}",
                        'dc_title': f"{name_without_prefix} - {dc_publisher_name}",
                        'dc_type': Type,
                        'dc_description': bereikbaarheidsgegevens,
                        'dc_source': f"https://organisaties.overheid.nl/archive/exportOO_{organisatie_type.lower()}.xml",
                        'dc_publisher': dc_publisher,
                        'dc_creator': "R0m4ndu",
                        'foi_retrievedDate': foi_retrievedDate,
                        'dc_date_year': dc_date_year,
                        'dc_publisher_name': dc_publisher_name,
                        'foi_title': prefixes,
                        'foaf_initials': foaf_initials,
                        'foaf_firstName': foaf_firstName,
                        'foaf_lastName': foaf_lastName,
                        'foaf_name': name_without_prefix,
                        'foaf_mbox': foaf_mbox,
                        'foaf_phone': foaf_phone,
                        'foi_visitAddress': foi_bezoekadres,
                        'foi_mailAddress': foi_postadres,
                        'foaf_workplaceHomepage': website_txt,
                        'foi_startDate': foi_startDate,
                        'foi_party': foi_party,
                        'foi_function': foaf_function_type,
                        'foi_files': [] ,
                    }
                # Verwijder alle lege strings uit de dict
                filtered_dict = {key: value for key, value in Dict.items() if value != ""}
                
                if foi_endDate == "":
                
                    # en voeg de dictionary toe aan 
                    DICT[foi_count] = filtered_dict

                    foi_count+=1
            
            if dc_publisher != '':
            
                # uiteindelijk hoeft de dict van alle medewerkers alleen maar toegevoegd te worden aan de volledige dict.
                final_dict = {'resource': f"nl.{dc_publisher}.{foaf_function_type}.{dc_date_year}",
                            'infobox': {'foi_totalDossiers': len(DICT),
                                         'foi_dossiers': DICT}}


                list_of_dicts.append(final_dict)
                
            

    number_of_people = sum(i['infobox']['foi_totalDossiers'] for i in list_of_dicts)
        
    return number_of_people, list_of_dicts


In [11]:
def json_create_multi_layer(organisatie_type):
    
    list_of_dicts = []
    
    # afkorting zal later worden gebruikt
    afkortingen = {'Provincie': 'pv',
                  'Ministerie': 'mnre',
                  'Hoog College van Staat': 'oorg',
                  'Gemeente': 'gm',
                  "Waterschap":"ws"}
    afkorting = afkortingen[organisatie_type]
    
    tree = ET.parse(f"exportOO_full.xml")
    
    # Define the namespaces used in the XML document
    namespaces = {
        'p': 'https://organisaties.overheid.nl/static/schema/oo/export/2.6.4'
    }

    # Verzamel de huidige datum
    foi_retrievedDate = datetime.today().strftime('%Y-%m-%d')

    # En het jaar
    dc_date_year = foi_retrievedDate[:4]

    # Verkrijg alle organisatie elementen (gemeenten, waterschappen etc.)
    organisatie_elements = tree.findall(f'//p:organisaties/p:organisatie/[p:types = "{organisatie_type}"]', namespaces)

    # loop over deze elementen en verzamel gegevens
    for organisatie in organisatie_elements:

        # In de resourceIdentifierTOOI staat de organisatiecode en veramel deze code
        dc_publisher = xpath_TOOI(organisatie, './/p:identificatiecodes/p:resourceIdentifier[@p:naam="resourceIdentifierTOOI"]', namespaces, afkorting) 
        
        dc_publisher_name = xpath(organisatie, 'p:naam', namespaces)

        website_txt = xpath(organisatie, './/p:contact/p:internetadressen/p:internetadres/p:url''p:types/p:type', namespaces)

        Type = xpath(organisatie, 'p:types/p:type', namespaces)
        
        foi_endDate1 = xpath(organisatie, 'p:eindDatum', namespaces)
        
        organisatie_elements2 = organisatie.findall('.//p:organisaties/p:organisatie', namespaces)
        
        for organisatie2 in organisatie_elements2:
            
            organisatie_naam = xpath(organisatie2, 'p:naam', namespaces)
        
            # in het functie_element staan alle verschillende functies
            functie_element = organisatie2.findall('p:functies/p:functie', namespaces)


            # loop over deze functies
            for functie in functie_element:
                foi_count = 0
                DICT = {}

                foaf_function_type = xpath(functie, 'p:naam', namespaces)

                # vindt alle medewerkers die bij deze functie horen
                medewerker_element = functie.findall('p:medewerkers/p:medewerker', namespaces)

                # loop over alle medewerkers om per medewerker gegevens te verzamelen
                for medewerker in medewerker_element:

                    foi_party = xpath(medewerker, 'p:partijLidmaatschap', namespaces)

                    foaf_initials, foaf_firstName, foaf_lastName, name_without_prefix, prefixes = xpath_naam(medewerker, 'p:naam', namespaces)

                    foi_endDate2 = xpath(organisatie, 'p:eindDatum', namespaces)
                    
                    # optioneel startdatum, telefoonnummer en mailadres
                    foi_startDate = xpath(medewerker, 'p:startDatum', namespaces)

                    foaf_phone = xpath(medewerker, 'p:contact/p:telefoonnummers/p:telefoonnummer/p:nummer', namespaces)

                    foaf_mbox = xpath(medewerker, 'p:contact/p:emailadressen/p:emailadres/p:email', namespaces)

                    # genereer beschikbaarheidsgegevens

                    if organisatie_type != 'Provincies':
                        if foi_party != '':
                            bereikbaarheidsgegevens = f"Bereikbaarheidsgegevens van {name_without_prefix}, {foaf_function_type} voor {foi_party} in de {dc_publisher_name}"
                        else:
                            bereikbaarheidsgegevens = f"bereikbaarheidsgegevens van {name_without_prefix}, {foaf_function_type} voor {dc_publisher_name}"

                    # vul de dictionary in
                    Dict = {
                            'dc_identifier': f"nl.{dc_publisher}.{foaf_function_type}.{dc_date_year}.{foi_count + 1}",
                            'dc_title': f"{name_without_prefix} - {dc_publisher_name}",
                            'dc_type': Type,
                            'dc_description': bereikbaarheidsgegevens,
                            'dc_source': f"https://organisaties.overheid.nl/archive/exportOO_{organisatie_type.lower()}.xml",
                            'dc_publisher': dc_publisher,
                            'dc_creator': "R0m4ndu",
                            'foi_retrievedDate': foi_retrievedDate,
                            'dc_date_year': dc_date_year,
                            'dc_publisher_name': dc_publisher_name,
                            'foi_title': prefixes,
                            'foaf_initials': foaf_initials,
                            'foaf_firstName': foaf_firstName,
                            'foaf_lastName': foaf_lastName,
                            'foaf_name': name_without_prefix,
                            'foaf_mbox': foaf_mbox,
                            'foaf_phone': foaf_phone,
                            'foi_visitAddress': foi_bezoekadres,
                            'foi_mailAddress': foi_postadres,
                            'foaf_workplaceHomepage': website_txt,
                            'foi_startDate': foi_startDate,
                            'foi_party': foi_party,
                            'foi_function': foaf_function_type,
                            'foi_files': [] ,
                        }

                    # Verwijder alle lege strings uit de dict
                    filtered_dict = {key: value for key, value in Dict.items() if value != ""}
                    
                    if foi_endDate1 == '' and foi_endDate2 == '':
                        # en voeg de dictionary toe aan 
                        DICT[foi_count] = filtered_dict

                        foi_count+=1
                
                if dc_publisher != '':
                
                    # uiteindelijk hoeft de dict van alle medewerkers alleen maar toegevoegd te worden aan de volledige dict.
                    final_dict = {'resource': f"nl.{dc_publisher}.{foaf_function_type}.{dc_date_year}",
                                'infobox': {'foi_totalDossiers': len(DICT),
                                             'foi_dossiers': DICT}}


                    list_of_dicts.append(final_dict)

    number_of_people = sum(i['infobox']['foi_totalDossiers'] for i in list_of_dicts)
        
    return number_of_people, list_of_dicts

In [12]:
def search_yahoo(Query):

    time.sleep(1)
    
    url = "http://search.yahoo.com/search?p=%s"
    r = requests.get(url % Query) 
    soup = BeautifulSoup(r.text)
    hrefs = [a['href'] for a in soup.find_all('a', href=True)]
    hrefs = [href for href in hrefs if 'yahoo' not in hrefs]

    new_hrefs = [href for href in hrefs if "yahoo" not in href and '#' not in href]
    
    return new_hrefs

def extract_urls(hrefs, site, last_name):
    
    possible = False
    
    last_names = [last_name]
    if ' ' in last_name:
        last_names.append(last_name.replace(' ', ''))
        last_names.append(last_name.replace(' ', '-'))
        last_names.append(last_name.replace(' ', '+'))
                          
    for href in hrefs:
        href = href.lower()
        
        if site == 'linkedin.com':
            if '/in/' not in href:
                continue
                
        if site == 'nl.wikipedia.org':
            if '/wiki/' not in href:
                continue
                
        if site == 'twitter.com':
            if '/status' in href or '/hashtag' in href:
                continue
                
        parts = re.split(r'[\s+/\\-]', href)
        if len(parts) > 8:
            continue
                          
        
        for ln in last_names:
            if ln.lower() in href:
                if site in href:
                    print(href)
                    return href
                        
    return ''

In [13]:
tree = ET.parse(f"exportOO_full.xml")
namespaces = {
    'p': 'https://organisaties.overheid.nl/static/schema/oo/export/2.6.4'
}

In [28]:
def extract_all_medewerkers(Organisatie, multi_layer = True):
    
    ### This section is for creating the JSON file of the organisation
    people1, dict_list = json_create_single_layer(Organisatie)
    
    if multi_layer == True:
        people2, dict_list2 = json_create_multi_layer(Organisatie)
    
        dict_list = dict_list + dict_list2
        
    Json = json.dumps(dict_list, indent=4)

    # Write JSON string to a text file
    with open(f"{Organisatie}.json", "w") as file:
        file.write(Json)
    
    ### Because the gathering of (social) media might take a while create a separate dict just for this.
    if os.path.exists(f"Social_Media_{Organisatie}.json"):
    
        with open(f"Social_Media_{Organisatie}.json", "r") as file:
            social_media_dict = json.load(file)
            people_in_dict = list(social_media_dict.keys())
    
    else:
        social_media_dict = dict()
        people_in_dict = []
    
    for p in dict_list:
        persons_per_func = p['infobox']['foi_dossiers']
        for k, v in persons_per_func.items():
            Identifier = f"{v['foaf_name']} {v['dc_publisher_name']} {v['foi_function']}"
            
            if 'foaf_firstName' not in v:
                foaf_firstName = ''
            else:
                foaf_firstName = v['foaf_firstName']
                
            if 'foaf_initials' not in v:
                foaf_initials = ''
            else:
                foaf_initials = v['foaf_initials']
            
            if Identifier not in people_in_dict:
                person_dict = {
                                  'foaf_name': v['foaf_name'],
                                  'foaf_function_type': v['foi_function'],
                                  'dc_publisher_name': v['dc_publisher_name'],
                                  'foaf_firstName': foaf_firstName,
                                  'foaf_lastName': v['foaf_lastName'],
                                  'foaf_initials': foaf_initials,
                                  'website': v['foaf_workplaceHomepage'],
                                  'foi_linkedin': '',
                                  'foi_twitter': '',
                                  'foi_wikipedia': '',
                                  'foaf_workplaceHomepage': '',
                                  'up_to_date': False}
            
                social_media_dict[f"{v['foaf_name']} {v['dc_publisher_name']} {v['foi_function']}"] = person_dict
    
    
    Json = json.dumps(social_media_dict, indent=4)

    # Write JSON string to a text file
    with open(f"Social_Media_{Organisatie}.json", "w") as file:
        file.write(Json)
        
    ### Here the check for the amount of elements gathered compared to the amount of elements in the XML tree.
    ### if multi_layer == True this will always both be equal
    
    organisatie_elements = tree.findall(f'//p:organisaties/p:organisatie/[p:types = "{Organisatie}"]', namespaces)
    list_of_peoples = [organisatie_element.findall('.//p:medewerker', namespaces) for organisatie_element in organisatie_elements]
    people = sum(len(lst) for lst in list_of_peoples)
            
    print(f'aantal medewerkers in XML: {people}')
    
    if multi_layer == True:
        print(f'aantal medewerkers extracted in JSON: {people1 + people2}')
    else:
        print(f'aantal medewerkers extracted in JSON: {people1}')

extract_all_medewerkers("Gemeente", False)


  organisatie_elements = tree.findall(f'//p:organisaties/p:organisatie/[p:types = "{organisatie_type}"]', namespaces)


aantal medewerkers in XML: 14120
aantal medewerkers extracted in JSON: 14060


  organisatie_elements = tree.findall(f'//p:organisaties/p:organisatie/[p:types = "{Organisatie}"]', namespaces)


In [24]:
attributes = ['dc_identifier', 'dc_title', 'dc_type', 'dc_description', 'dc_source', 'dc_publisher', 'dc_creator', 'foi_retrievedDate', 'dc_date_year', 'foi_worksFor', 'dc_publisher_name', 'foi_title', 'foaf_initials', 'foaf_firstName', 'foaf_lastName', 'foaf_name', 'foaf_mbox', 'foaf_phone', 'foi_visitAddress', 'foi_mailAddress', 'foi_linkedin', 'foi_twitter', 'foi_wikipedia', 'foaf_workplaceHomepage', 'foi_startDate', 'foi_party', 'foi_function', 'foi_files']

with open('Gemeente.json', 'r') as f:
    dict_list = json.load(f)

id_dict = {key: 0 for key in attributes}

for dl in dict_list:
    dossiers = dl['infobox']['foi_dossiers']['0']
    for k,v in dossiers.items():
        for ID in attributes:
            if ID == k:
                id_dict[ID] += 1
                
print(id_dict)

{'dc_identifier': 42, 'dc_title': 42, 'dc_type': 42, 'dc_description': 42, 'dc_source': 42, 'dc_publisher': 42, 'dc_creator': 42, 'foi_retrievedDate': 42, 'dc_date_year': 42, 'foi_worksFor': 0, 'dc_publisher_name': 42, 'foi_title': 22, 'foaf_initials': 42, 'foaf_firstName': 3, 'foaf_lastName': 42, 'foaf_name': 42, 'foaf_mbox': 2, 'foaf_phone': 2, 'foi_visitAddress': 2, 'foi_mailAddress': 2, 'foi_linkedin': 0, 'foi_twitter': 0, 'foi_wikipedia': 0, 'foaf_workplaceHomepage': 42, 'foi_startDate': 25, 'foi_party': 0, 'foi_function': 42, 'foi_files': 42}
