In [1]:
import xml.etree.ElementTree as ET
import paramiko
import time
from lxml import etree
from concurrent.futures import ThreadPoolExecutor
from datetime import datetime, timedelta
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from urllib.parse import urlparse, unquote
import ujson
from collections import defaultdict
import gzip
import shutil

In [2]:
# Define the SFTP server information
host = "bestanden.officielebekendmakingen.nl"
port = 22  # Default SFTP port
username = "anonymous"
password = "anonymous@domain.com"

# Create an SSH client
ssh = paramiko.SSHClient()
ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
ssh.connect(host, port, username, password)

# Now you have established the SFTP connection using SSH, and you can create an SFTP client
sftp = ssh.open_sftp()

# You can perform SFTP operations with 'sftp' here.


In [9]:
from Scripts.Helper_Functions import TOOI_organisaties, get_coordinates_from_point, multicoords_to_coords

organisaties = TOOI_organisaties(orgs = ['provincies'])

In [10]:
def xpath(target_name, root):
    element = root.find(".//metadata[@name='" + target_name + "']")
    if element is not None:
        return element.get('content')
    else:
        return ''

In [11]:
def identify_addresses(address_list, root):
    
    if address_list == []:
        return '', '', ''

    all_c = []
    all_a = []
    all_g = []
    for address in address_list:
        geometry = xpath('OVERHEIDop.geometrie', address)
        Gemeente = xpath('OVERHEIDop.ligtInGemeente', address)
        Provincie = xpath('OVERHEIDop.provincienaam', address)
        Waterschap = xpath('OVERHEIDop.waterschapsnaam', address)
        Huisnummer = xpath('OVERHEIDop.huisnummer', address)
        Postcode = xpath('OVERHEIDop.postcode', address)
        Straatnaam = xpath('OVERHEIDop.straatnaam', address)
        Woonplaats = xpath('OVERHEIDop.woonplaats', address)
        GeoLabel = xpath('OVERHEIDop.geometrieLabel', address)

        straat = [Straatnaam, Huisnummer]
        straat_filtered = [item for item in straat if item != '']
        address = [' '.join(straat_filtered), Postcode, Woonplaats]
        address_filtered = [item for item in address if item != '']
        if address_filtered != []:
            address_final = ', '.join(address_filtered)
        else:
            address_final = ''
    

        if Gemeente != '':
            if address_final != '':
                address_final += f", Gemeente {Gemeente}"
            else:
                address_final = f"Gemeente {Gemeente}"

        if address_final == '' and Waterschap != '':
            address_final = Waterschap

        if GeoLabel != '' and 'Punt' not in GeoLabel and 'Vlak' not in GeoLabel and 'Gebiedsmarkering' not in GeoLabel and 'Lijn' not in GeoLabel and 'Handmatig' not in GeoLabel:
            if address_final == '':
                address_final = GeoLabel
            else:
                address_final = GeoLabel + ', ' + address_final

        if 'POINT' in geometry:
            coordinaten = get_coordinates_from_point(geometry)
            all_c.append(coordinaten)
            all_a.append(address_final)
            all_g.append(geometry)

        elif 'POLYGON' in geometry or 'LINESTRING' in geometry:
            all_a.append(address_final)
            all_g.append(geometry)
    
    if len(all_g) == len(all_c):
        return all_a, all_g, all_c

    else:
        return all_a, all_g, ''



In [13]:
def beschikking_scraper(sftp, start_date, item):
    end_date = datetime.today
    
    current_date = start_date

    outer_dict = {}
    
    while current_date <= end_date:
        date = current_date.strftime('%Y/%m/%d')
        print(date)
        current_date += timedelta(days=1)
    
        directory_path = f"/{date}/{item}"

        try:
            directory_contents = sftp.listdir(directory_path)
        except FileNotFoundError:
            continue
        

        for content in directory_contents:

            new_path = f"{directory_path}/{content}"
            new_contents = sftp.listdir(new_path)
            
            try:
                remote_path = new_path + '/metadata.xml'
                remote_file = sftp.open(remote_path, 'r') 
        
                tree = ET.parse(remote_file) 
                root = tree.getroot()
                
            except (FileNotFoundError, ET.ParseError) as e:
                continue

            
            creator = xpath("DC.creator", root)

            if creator == 'Friesland':
                creator = 'Fryslân'
    
            if creator in organisaties:
                doc_list = [{
                    'dc_title': xpath("DC.title", root),
                    'dc_source': f"https://zoek.officielebekendmakingen.nl/{content}.pdf",
                    'dc_type': xpath("OVERHEID.Informatietype", root),
                    'foi_url_on_web': True
                }]
                publisher = organisaties[creator]
                
                target_name = "OVERHEIDop.gebiedsmarkering"
                addresses = root.findall(".//metadata[@name='" + target_name + "']")
                adressen, geography, coordinates = identify_addresses(addresses, root)
                
                target_name = "OVERHEIDop.externeBijlage"
                bijlagen = root.findall(".//metadata[@name='" + target_name + "']")

                
                
                if bijlagen != []:
                    for bijlage in bijlagen:
                        bijlage_content = bijlage.get('content')

                        if '|' in bijlage_content:
                            title, url = bijlage_content.split('|')
                        else:
                            title = ''
                            url = bijlage_content
                        
                        bijlage_dict = {
                            'dc_title': title,
                            'dc_source': f"https://repository.officiele-overheidspublicaties.nl/externebijlagen/{url}/1/bijlage/{url}.pdf",
                            'dc_type': 'bijlage',
                            'foi_url_on_web': True
                        }
                        
                        doc_list.append(bijlage_dict)

                dict = {
                    "dc_title": xpath("DC.title", root),
                    "dc_type": "2k",
                    "dc_description": xpath("DCTERMS.abstract", root),
                    "dc_source": f"https://zoek.officielebekendmakingen.nl/{content}",
                    "dc_publisher": publisher,
                    "dc_creator": "WooScraper_v1",
                    "foi_publishedDate": xpath("DCTERMS.available", root),
                    "dc_date_year": xpath("OVERHEIDop.jaargang", root),
                    "foi_files": {'foi_documenten': doc_list},
                    "foi_subjectAddress": adressen,
                    "foi_subjectGeography": geography,
                    "foi_subjectCoordinates": coordinates,
                }

                final_dict = {
                    'resource': f"nl.{publisher}.2k.{xpath('OVERHEIDop.jaargang', root)}.",
                    'infobox': dict
                }

                final_resource = f"nl.{publisher}.2k.{xpath('OVERHEIDop.jaargang', root)}"
                
                if final_resource not in outer_dict:
                    outer_dict[final_resource] = [final_dict]

                else:
                    outer_dict[final_resource].append(final_dict)

                remote_file.close()

            else:
                remote_file.close()
                continue


    with open('SFTP_Waterschappen.json', "w") as file:
        ujson.dump(outer_dict, file, indent = 4)
    


In [15]:
%%time
date = datetime(2022, 12, 30)
# date = datetime.today()

beschikking_scraper(sftp, date, 'prb')

2022/12/30
CPU times: total: 609 ms
Wall time: 10.8 s


In [14]:
# Close the SFTP and SSH connections when done
# sftp.close()
# ssh.close()