# Define functions

In [1]:
import psycopg2
import zipfile
import io
import xml.etree.ElementTree as ET

class ParamsManager:
    """Central registry of the settings used to import BAG XML into PostgreSQL.

    Settings are grouped per data type (``'vbo'``, ``'pand'``, ``'num'``):

    * ``namespace_params`` - prefix -> namespace URI maps used in XPath lookups
    * ``table_params``     - target table name, column DDL and column list
    * ``xml_params``       - inner zip file name and the name of the
      ``DataExtractor`` method that parses it

    ``database_params`` holds the keyword arguments for ``psycopg2.connect``.
    Each value can be overridden through an environment variable
    (``BAG_DB_NAME``, ``BAG_DB_USER``, ``BAG_DB_HOST``, ``BAG_DB_PASSWORD``,
    ``BAG_DB_PORT``) so credentials do not have to live in the notebook; the
    previous hard-coded values remain the defaults.
    """

    def __init__(self):
        # Local import so the class does not depend on the notebook's import cell.
        import os

        self.namespace_params = {
            'vbo': {
                'ns2': 'www.kadaster.nl/schemas/lvbag/imbag/objecten/v20200601',
                'ns3': 'www.kadaster.nl/schemas/lvbag/imbag/objecten-ref/v20200601',
                'ns4': 'www.kadaster.nl/schemas/lvbag/imbag/historie/v20200601',
                'ns5': 'http://www.opengis.net/gml/3.2'
            },
            'pand': {
                'lvbag': 'www.kadaster.nl/schemas/lvbag/imbag/objecten/v20200601',
                'gml': 'http://www.opengis.net/gml/3.2',
                'hist': 'www.kadaster.nl/schemas/lvbag/imbag/historie/v20200601'
            },
            'num': {
                'lvbag': 'www.kadaster.nl/schemas/lvbag/imbag/objecten/v20200601',
                'hist': 'www.kadaster.nl/schemas/lvbag/imbag/historie/v20200601'
            }
        }
        # Environment variables win; the literals are the historical defaults.
        self.database_params = {
            'dbname': os.environ.get('BAG_DB_NAME', 'urbanmining'),
            'user': os.environ.get('BAG_DB_USER', 'postgres'),
            'host': os.environ.get('BAG_DB_HOST', 'localhost'),
            'password': os.environ.get('BAG_DB_PASSWORD', 'Tunacompany5694!'),
            'port': os.environ.get('BAG_DB_PORT', '5432')
        }
        # 'column_names' must list the same columns, in the same order, as
        # 'columns_sql': the importer counts its entries to build placeholders.
        self.table_params = {
            'vbo': {
                'table_name': 'bag_vbo_new',
                'columns_sql': '''
                    id_vbo VARCHAR, id_num VARCHAR, id_pand VARCHAR, 
                    geometry VARCHAR, function VARCHAR, sqm VARCHAR, status VARCHAR, 
                    document_date VARCHAR, document_number VARCHAR, 
                    registration_start VARCHAR, registration_end VARCHAR
                    ''',
                'column_names': '(id_vbo, id_num, id_pand, geometry, function, sqm, status, document_date, document_number, registration_start, registration_end)',
            },
            'pand': {
                'table_name': 'bag_pand_new',
                'columns_sql': '''
                    id_pand VARCHAR, geometry VARCHAR, build_year VARCHAR, status VARCHAR, 
                    document_date VARCHAR, document_number VARCHAR, 
                    registration_start VARCHAR, registration_end VARCHAR''',
                'column_names': '(id_pand, geometry, build_year, status, document_date, document_number, registration_start, registration_end)'
            },
            'num': {
                'table_name': 'bag_num',
                'columns_sql': '''
                    id_num VARCHAR, 
                    house_number VARCHAR, house_letter VARCHAR, post_code VARCHAR, 
                    document_date VARCHAR, document_number VARCHAR, 
                    registration_start VARCHAR, registration_end VARCHAR''',
                'column_names': '(id_num, house_number, house_letter, post_code, document_date, document_number, registration_start, registration_end)',
            }
        }
        self.xml_params = {
            # zip_file_name: archive inside the outer extract zip;
            # extract_function: name of the DataExtractor method to call.
            'vbo': {
                'zip_file_name': '9999VBO08082024.zip',
                'extract_function': 'extract_vbo_data'
            },
            'pand': {
                'zip_file_name': '9999PND08082024.zip',
                'extract_function': 'extract_pand_data'
            },
            'num': {
                'zip_file_name': '9999NUM08082024.zip',
                'extract_function': 'extract_num_data'
            }
        }

    def get_namespace_params(self, data_type):
        """Return the XPath namespace map for ``data_type``.

        Raises:
            KeyError: if ``data_type`` is not 'vbo', 'pand' or 'num'.
        """
        return self.namespace_params[data_type]

    def get_database_params(self):
        """Return the keyword arguments for ``psycopg2.connect``."""
        return self.database_params

    def get_table_params(self, data_type):
        """Return table name, column DDL and column list for ``data_type``.

        Raises:
            KeyError: if ``data_type`` is not 'vbo', 'pand' or 'num'.
        """
        return self.table_params[data_type]

    def get_xml_params(self, data_type):
        """Return inner zip name and extractor method name for ``data_type``.

        Raises:
            KeyError: if ``data_type`` is not 'vbo', 'pand' or 'num'.
        """
        return self.xml_params[data_type]

class DataExtractor:
    """Turn a parsed BAG XML tree into flat record tuples.

    Args:
        root: root element of a parsed XML document.
        ns: prefix -> namespace URI map used to resolve the XPath prefixes.
    """

    def __init__(self, root, ns):
        self.root = root
        self.ns = ns

    def get_text_or_none(self, element, xpath):
        """Return the text of the first match of ``xpath`` under ``element``, or None if nothing matches."""
        match = element.find(xpath, namespaces=self.ns)
        if match is None:
            return None
        return match.text

    def extract_vbo_data(self):
        """Return one 11-tuple per ``ns2:Verblijfsobject`` element."""
        vbo_fields = {
            'id_vbo': './/ns2:identificatie',
            'id_num': './/ns3:NummeraanduidingRef',
            'id_pand': './/ns3:PandRef',
            'geometry': './/ns5:pos',
            'function': './/ns2:gebruiksdoel',
            'sqm': './/ns2:oppervlakte',
            'status': './/ns2:status',
            'document_date': './/ns2:documentdatum',
            'document_number': './/ns2:documentnummer',
            'registration_start': './/ns4:Voorkomen/ns4:beginGeldigheid',
            'registration_end': './/ns4:Voorkomen/ns4:eindGeldigheid',
        }
        return self._extract_common_data('.//ns2:Verblijfsobject', vbo_fields)

    def extract_pand_data(self):
        """Return one 8-tuple per ``lvbag:Pand`` element."""
        pand_fields = {
            'id_pand': './/lvbag:identificatie',
            'geometry': './/gml:posList',
            'build_year': './/lvbag:oorspronkelijkBouwjaar',
            'status': './/lvbag:status',
            'document_date': './/lvbag:documentdatum',
            'document_number': './/lvbag:documentnummer',
            'registration_start': './/hist:beginGeldigheid',
            'registration_end': './/hist:eindGeldigheid',
        }
        return self._extract_common_data('.//lvbag:Pand', pand_fields)

    def extract_num_data(self):
        """Return one 8-tuple per ``lvbag:Nummeraanduiding`` element.

        The first six fields are looked up as direct children only; the two
        validity dates are searched at any depth.
        """
        num_fields = {
            'id_num': 'lvbag:identificatie',
            'house_number': 'lvbag:huisnummer',
            'house_letter': 'lvbag:huisletter',
            'post_code': 'lvbag:postcode',
            'document_date': 'lvbag:documentdatum',
            'document_number': 'lvbag:documentnummer',
            'registration_start': './/hist:beginGeldigheid',
            'registration_end': './/hist:eindGeldigheid',
        }
        return self._extract_common_data('.//lvbag:Nummeraanduiding', num_fields)

    def _extract_common_data(self, xpath, fields):
        """Collect a tuple of field texts for every element matching ``xpath``.

        Args:
            xpath: path selecting the record elements under the root.
            fields: mapping of column name -> XPath relative to a record; the
                tuple values follow the mapping's iteration order.

        Returns:
            A list of tuples; a field that is absent from a record is None.
        """
        records = self.root.findall(xpath, namespaces=self.ns)
        return [
            tuple(self.get_text_or_none(record, field_path) for field_path in fields.values())
            for record in records
        ]

class DatabaseManager:
    """Thin wrapper around psycopg2 for creating tables and bulk-inserting rows.

    Connection settings and table definitions come from ``ParamsManager``.
    """

    def __init__(self):
        self.params_manager = ParamsManager()

    def connect(self):
        """Open and return a new psycopg2 connection; the caller must close it."""
        db_params = self.params_manager.get_database_params()
        return psycopg2.connect(**db_params)

    def create_table(self, data_type):
        """Create the table for ``data_type`` if it does not exist yet.

        Args:
            data_type: key understood by ``ParamsManager.get_table_params``.

        Raises:
            KeyError: if ``data_type`` is unknown (raised before connecting).
        """
        # Resolve the definition first so a bad data_type never opens a connection.
        table_params = self.params_manager.get_table_params(data_type)
        table_name = table_params['table_name']
        columns_sql = table_params['columns_sql']

        conn = self.connect()
        try:
            # `with conn` commits on success and rolls back on error, but it
            # does not close the connection - hence the explicit finally.
            with conn, conn.cursor() as cursor:
                cursor.execute(f'''
            CREATE TABLE IF NOT EXISTS {table_name} ({columns_sql});
        ''')
        finally:
            conn.close()

    def insert_batch(self, cursor, insert_query_prefix, values_list, placeholders):
        """Insert all rows of ``values_list`` with one multi-row INSERT.

        Args:
            cursor: open cursor providing ``mogrify`` and ``execute``.
            insert_query_prefix: ``'INSERT INTO <table> (<cols>) VALUES '``.
            values_list: list of row tuples; emptied in place after the insert.
            placeholders: comma-separated ``%s`` markers, one per column.

        Does nothing when ``values_list`` is empty.
        """
        if not values_list:
            return
        # mogrify renders each row with the driver's escaping applied.
        rendered_rows = (
            cursor.mogrify(f'({placeholders})', row).decode('utf-8')
            for row in values_list
        )
        cursor.execute(insert_query_prefix + ','.join(rendered_rows))
        # Callers reuse the same list as their buffer, so clear it in place.
        values_list.clear()

class XMLImporter:
    """Stream BAG XML records from the nested extract archive into PostgreSQL.

    Args:
        batch_size: minimum number of buffered rows that triggers an insert.
        archive_path: path of the outer extract zip that contains the
            per-object-type inner zips.
    """

    def __init__(self, batch_size=1000, archive_path='data/bag/lvbag-extract-nl.zip'):
        self.db_manager = DatabaseManager()
        self.params_manager = ParamsManager()
        self.batch_size = batch_size
        self.archive_path = archive_path

    def create_params(self, data_type):
        """Load table, namespace and XML settings for ``data_type`` onto the instance."""
        self.table_params = self.params_manager.get_table_params(data_type)
        self.namespace_params = self.params_manager.get_namespace_params(data_type)
        self.xml_params = self.params_manager.get_xml_params(data_type)

    def _flush(self, insert_query_prefix, values_list, placeholders):
        """Insert the buffered rows in their own transaction and close the connection."""
        conn = self.db_manager.connect()
        try:
            # `with conn` only commits or rolls back; without the finally every
            # batch would leave an open connection behind.
            with conn, conn.cursor() as cursor:
                self.db_manager.insert_batch(cursor, insert_query_prefix, values_list, placeholders)
        finally:
            conn.close()

    def process_and_insert_xml(self, data_type):
        """Parse every XML file of ``data_type`` and insert its rows in batches.

        The buffer is checked after each XML file, so a batch contains at least
        ``batch_size`` rows but may contain more. Rows left over after the last
        file are inserted at the end.

        Args:
            data_type: key understood by ``ParamsManager`` ('vbo', 'pand', 'num').
        """
        self.create_params(data_type)

        zip_file_name = self.xml_params['zip_file_name']
        extract_function_name = self.xml_params['extract_function']
        table_name = self.table_params['table_name']
        column_names = self.table_params['column_names']

        insert_query_prefix = f'INSERT INTO {table_name} {column_names} VALUES '
        # One %s placeholder per column listed in column_names.
        columns_list = column_names.strip('()').replace(" ", "").split(',')
        placeholders = ', '.join(['%s'] * len(columns_list))

        values_list = []

        with zipfile.ZipFile(self.archive_path, 'r') as outer_zip:
            with outer_zip.open(zip_file_name) as inner_zip:
                # The inner zip is read fully into memory before it is opened.
                with zipfile.ZipFile(io.BytesIO(inner_zip.read())) as zfile:
                    xml_names = zfile.namelist()
                    print(f'Found {len(xml_names)} XML files in {zip_file_name} ...')

                    for xml_name in xml_names:
                        print(xml_name)
                        with zfile.open(xml_name) as xml_file:
                            root = ET.parse(xml_file).getroot()
                        data_extractor = DataExtractor(root, self.namespace_params)
                        extract_function = getattr(data_extractor, extract_function_name)
                        values_list.extend(extract_function())

                        if len(values_list) >= self.batch_size:
                            self._flush(insert_query_prefix, values_list, placeholders)

                    # Insert any remaining data
                    if values_list:
                        self._flush(insert_query_prefix, values_list, placeholders)



# Run functions

In [2]:
# Build the helpers; both construct their own ParamsManager internally.
database_manager = DatabaseManager()
XML_importer = XMLImporter()

# Data type to import; selects table, namespaces and inner zip in ParamsManager.
data_type = 'vbo'
# Create the target table if needed, then stream the XML rows into it.
database_manager.create_table(data_type)
XML_importer.process_and_insert_xml(data_type)

Found 2384 XML files in 9999VBO08082024.zip ...
9999VBO08082024-000001.xml
9999VBO08082024-000002.xml
9999VBO08082024-000003.xml
9999VBO08082024-000004.xml
9999VBO08082024-000005.xml
9999VBO08082024-000006.xml
9999VBO08082024-000007.xml
9999VBO08082024-000008.xml
9999VBO08082024-000009.xml
9999VBO08082024-000010.xml
9999VBO08082024-000011.xml
9999VBO08082024-000012.xml
9999VBO08082024-000013.xml
9999VBO08082024-000014.xml
9999VBO08082024-000015.xml
9999VBO08082024-000016.xml
9999VBO08082024-000017.xml
9999VBO08082024-000018.xml
9999VBO08082024-000019.xml
9999VBO08082024-000020.xml
9999VBO08082024-000021.xml
9999VBO08082024-000022.xml
9999VBO08082024-000023.xml
9999VBO08082024-000024.xml
9999VBO08082024-000025.xml
9999VBO08082024-000026.xml
9999VBO08082024-000027.xml
9999VBO08082024-000028.xml
9999VBO08082024-000029.xml
9999VBO08082024-000030.xml
9999VBO08082024-000031.xml
9999VBO08082024-000032.xml
9999VBO08082024-000033.xml
9999VBO08082024-000034.xml
9999VBO08082024-000035.xml
9999VBO

# Reading XML file 
The zip folder contains the following files: 
- '9999InOnderzoek08022024.zip': This folder contains XML files for objects that are under investigation or examination.
- '9999NUM08022024.zip': This folder contains XML files for nummeraanduidingen (NUM), which are address designations (house number, house letter, and postcode).
- '9999VBO08022024.zip': This folder contains XML files for verblijfsobjecten (VBO), which are dwelling units or accommodations within a building.
- '9999WPL08022024.zip': This folder contains XML files for woonplaatsen (WPL), which are residential places or localities.
- '9999LIG08022024.zip': This folder contains XML files for ligplaatsen (LIG), which are mooring places for boats or vessels.
- '9999PND08022024.zip': This folder contains XML files for panden (PND), which are buildings or structures.
- '9999NietBag08022024.zip': This folder contains XML files for objects that are not considered part of the BAG database.
- '9999STA08022024.zip': This folder contains XML files for standplaatsen (STA), which are designated pitches for non-permanent dwellings such as mobile homes.
- '9999OPR08022024.zip': This folder contains XML files for openbare ruimtes (OPR), which are named public spaces such as streets and squares.
- '9999Inactief08022024.zip': This folder contains XML files for inactive objects that are no longer part of the active lifecycle.

In [13]:
# Print the names of all entries in the outer BAG extract archive.
with zipfile.ZipFile('data/bag/lvbag-extract-nl.zip', 'r') as zip_ref:
    print(zip_ref.namelist())

['Leveringsdocument-BAG-Extract.xml', 'GEM-WPL-RELATIE-08022024.zip', '9999InOnderzoek08022024.zip', '9999NUM08022024.zip', '9999VBO08022024.zip', '9999WPL08022024.zip', '9999LIG08022024.zip', '9999PND08022024.zip', '9999NietBag08022024.zip', '9999STA08022024.zip', '9999OPR08022024.zip', '9999Inactief08022024.zip']


## VBO
Verblijfs object

In [101]:
def extract_vbo_data(root_vbo):
    """Flatten every ``Verblijfsobject`` under ``root_vbo`` into a tuple.

    Args:
        root_vbo: root element of a parsed VBO XML document.

    Returns:
        A list of 11-tuples ``(id_vbo, id_num, id_pand, geometry, function,
        sqm, status, document_date, document_number, registration_start,
        registration_end)``. Each value is the text of the first matching
        descendant, or None when the element is absent.
    """
    ns = {
        'ns2': 'www.kadaster.nl/schemas/lvbag/imbag/objecten/v20200601',
        'ns3': 'www.kadaster.nl/schemas/lvbag/imbag/objecten-ref/v20200601',
        'ns4': 'www.kadaster.nl/schemas/lvbag/imbag/historie/v20200601',
        'ns5': 'http://www.opengis.net/gml/3.2'
    }

    # XPaths in output-column order.
    field_paths = (
        './/ns2:identificatie',                      # id_vbo
        './/ns3:NummeraanduidingRef',                # id_num
        './/ns3:PandRef',                            # id_pand
        './/ns5:pos',                                # geometry
        './/ns2:gebruiksdoel',                       # function
        './/ns2:oppervlakte',                        # sqm
        './/ns2:status',                             # status
        './/ns2:documentdatum',                      # document_date
        './/ns2:documentnummer',                     # document_number
        './/ns4:Voorkomen/ns4:beginGeldigheid',      # registration_start
        './/ns4:Voorkomen/ns4:eindGeldigheid',       # registration_end
    )

    def text_at(node, path):
        hit = node.find(path, namespaces=ns)
        return None if hit is None else hit.text

    return [
        tuple(text_at(vbo, path) for path in field_paths)
        for vbo in root_vbo.findall('.//ns2:Verblijfsobject', namespaces=ns)
    ]


## Pand

In [103]:
def extract_pand_data(root_pand):
    """Flatten every ``Pand`` element under ``root_pand`` into a tuple.

    Args:
        root_pand: root element of a parsed PND XML document.

    Returns:
        A list of 8-tuples ``(id_pand, geometry, build_year, status,
        document_date, document_number, registration_start,
        registration_end)``. Each value is the text of the first matching
        descendant, or None when the element is absent.
    """
    # Prefix -> namespace URI map for the XPath lookups below.
    ns = {
        'lvbag': 'www.kadaster.nl/schemas/lvbag/imbag/objecten/v20200601',
        'gml': 'http://www.opengis.net/gml/3.2',
        'hist': 'www.kadaster.nl/schemas/lvbag/imbag/historie/v20200601'
    }

    # XPaths in output-column order.
    field_paths = (
        './/lvbag:identificatie',            # id_pand
        './/gml:posList',                    # geometry
        './/lvbag:oorspronkelijkBouwjaar',   # build_year
        './/lvbag:status',                   # status
        './/lvbag:documentdatum',            # document_date
        './/lvbag:documentnummer',           # document_number
        './/hist:beginGeldigheid',           # registration_start
        './/hist:eindGeldigheid',            # registration_end
    )

    def text_at(node, path):
        hit = node.find(path, namespaces=ns)
        return None if hit is None else hit.text

    return [
        tuple(text_at(pand, path) for path in field_paths)
        for pand in root_pand.findall('.//lvbag:Pand', namespaces=ns)
    ]


## NUM

In [105]:
def extract_num_data(root_num):
    """Flatten every ``Nummeraanduiding`` element under ``root_num`` into a tuple.

    Args:
        root_num: root element of a parsed NUM XML document.

    Returns:
        A list of 8-tuples ``(id_num, house_number, house_letter, post_code,
        document_date, document_number, registration_start,
        registration_end)``. The first six values are read from direct
        children only; the two validity dates are searched at any depth.
        A missing element yields None.
    """
    ns = {
        'lvbag': 'www.kadaster.nl/schemas/lvbag/imbag/objecten/v20200601',
        'hist': 'www.kadaster.nl/schemas/lvbag/imbag/historie/v20200601'
    }

    # XPaths in output-column order.
    field_paths = (
        'lvbag:identificatie',       # id_num
        'lvbag:huisnummer',          # house_number
        'lvbag:huisletter',          # house_letter
        'lvbag:postcode',            # post_code
        'lvbag:documentdatum',       # document_date
        'lvbag:documentnummer',      # document_number
        './/hist:beginGeldigheid',   # registration_start
        './/hist:eindGeldigheid',    # registration_end
    )

    def text_at(node, path):
        hit = node.find(path, namespaces=ns)
        return None if hit is None else hit.text

    return [
        tuple(text_at(num, path) for path in field_paths)
        for num in root_num.findall('.//lvbag:Nummeraanduiding', namespaces=ns)
    ]


# Extract all data from XML to dataframe

### Create tables in PostgreSQL database
Run only once, no need to run again

In [111]:
# Connect to your PostgreSQL database
conn_params = "dbname='urbanmine' user='postgres' host='localhost' password='Tunacompany5694!' port='5432'"
conn = psycopg2.connect(conn_params)
cursor = conn.cursor()

# create tables for VBO, PND, and NUM
# Each CREATE TABLE is committed separately; IF NOT EXISTS makes re-running harmless.
cursor.execute("""
CREATE TABLE IF NOT EXISTS bag_vbo (
    id_vbo VARCHAR,
    id_num VARCHAR,
    id_pand VARCHAR,
    geometry VARCHAR,
    function VARCHAR,
    sqm VARCHAR,
    status VARCHAR,
    document_date VARCHAR,
    document_number VARCHAR,
    registration_start VARCHAR,
    registration_end VARCHAR
);
""")
conn.commit()
cursor.execute("""
CREATE TABLE IF NOT EXISTS bag_pand (
    id_pand VARCHAR,
    geometry VARCHAR,
    build_year VARCHAR,
    status VARCHAR,
    document_date VARCHAR,
    document_number VARCHAR,
    registration_start VARCHAR,
    registration_end VARCHAR
);
""")
conn.commit()
# NOTE(review): document_date is DATE here but VARCHAR in bag_vbo and bag_pand
# above - confirm the difference is intended.
cursor.execute("""
CREATE TABLE IF NOT EXISTS bag_num (
    id_num VARCHAR,
    house_number VARCHAR,
    house_letter VARCHAR,
    post_code VARCHAR,
    document_date DATE,
    document_number VARCHAR,
    registration_start VARCHAR,
    registration_end VARCHAR
);
""")
conn.commit()
# NOTE(review): cursor and connection are only closed if every statement above succeeds.
cursor.close()
conn.close()

In [79]:
# Convert the DataFrame to a CSV format in memory
# NOTE(review): df_vbo and StringIO are not defined in this cell; presumably
# they come from earlier notebook cells - verify before running.
output = StringIO()
df_vbo.to_csv(output, sep='\t', header=False, index=False)
output.seek(0)  # We need to seek to the start of the StringIO object

# Connect to your PostgreSQL database
conn_params = "dbname='urbanmine' user='postgres' host='localhost' password='Tunacompany5694!' port='5432'"
conn = psycopg2.connect(conn_params)
cursor = conn.cursor()

# create table and copy data to it
table_name = "bag_vbo"  # Define your table name
# Every DataFrame column becomes a VARCHAR column of the same name.
column_definitions = ", ".join([f"{col} VARCHAR" for col in df_vbo.columns])
create_table_sql = f"CREATE TABLE IF NOT EXISTS {table_name} ({column_definitions});"
cursor.execute(create_table_sql)
conn.commit()
# Bulk-load the tab-separated buffer; null='' marks empty fields as NULL.
cursor.copy_from(output, 'bag_vbo', null='', sep='\t')

# commit and close the connection
conn.commit()
cursor.close()
conn.close()


### Stream data from XML to PostgreSQL database

In [124]:
def process_and_insert_xml(params_dict, batch_size=1000, archive_path='data/bag/lvbag-extract-nl.zip'):
    """Parse every XML file of one inner BAG zip and insert its rows in batches.

    Args:
        params_dict: mapping with the keys
            ``zip_file_name`` (inner zip inside the outer archive),
            ``extract_function`` (callable taking an XML root and returning a
            list of row tuples), ``db_params`` (psycopg2 connection string),
            ``table_name`` and ``column_names`` (parenthesised, comma-separated
            column list).
        batch_size: minimum number of buffered rows that triggers an insert.
            The buffer is checked after each XML file, so a batch may be larger.
        archive_path: path of the outer extract zip.

    No database connection is opened while there are no rows to insert.
    """
    # extract params
    zip_file_name = params_dict['zip_file_name']
    extract_function = params_dict['extract_function']
    db_params = params_dict['db_params']
    table_name = params_dict['table_name']
    column_names = params_dict['column_names']
    print(f'Processing {zip_file_name} ...')

    insert_query_prefix = f'INSERT INTO {table_name} {column_names} VALUES '
    values_list = []

    # get num columns for mogrify: one %s placeholder per listed column
    columns_list = column_names.strip('()').replace(" ", "").split(',')
    placeholders = ', '.join(['%s'] * len(columns_list))

    def insert_batch(cursor, values_list):
        # Render each row with the driver's escaping, send one multi-row
        # INSERT, then empty the shared buffer in place.
        if not values_list:
            return
        query = insert_query_prefix + ','.join(
            cursor.mogrify(f'({placeholders})', row).decode('utf-8') for row in values_list
        )
        cursor.execute(query)
        values_list.clear()

    def flush(values_list):
        # `with conn` commits or rolls back but does not close the connection;
        # close it explicitly so batches do not pile up open connections.
        conn = psycopg2.connect(db_params)
        try:
            with conn, conn.cursor() as cursor:
                insert_batch(cursor, values_list)
        finally:
            conn.close()

    with zipfile.ZipFile(archive_path, 'r') as outer_zip:
        with outer_zip.open(zip_file_name) as inner_zip:
            # The inner zip is read fully into memory before it is opened.
            with zipfile.ZipFile(io.BytesIO(inner_zip.read())) as zfile:
                xml_names = zfile.namelist()
                print(f'Found {len(xml_names)} XML files in {zip_file_name} ...')

                for xml_name in xml_names:
                    print(xml_name)
                    with zfile.open(xml_name) as xml_file:
                        root = ET.parse(xml_file).getroot()
                    values_list.extend(extract_function(root))

                    if len(values_list) >= batch_size:
                        flush(values_list)

                # Insert any remaining data
                if values_list:
                    flush(values_list)

In [125]:
# One entry per target table: which inner zip to read, which function parses
# its XML, and where/how to insert the resulting rows.
params_dict_all = {
    'bag_vbo': { 
        'zip_file_name': '9999VBO08022024.zip',
        'extract_function': extract_vbo_data,
        'table_name': "bag_vbo",
        'column_names': '(id_vbo, id_num, id_pand, geometry, function, sqm, status, document_date, document_number, registration_start, registration_end)',
        'db_params': "dbname='urbanmine' user='postgres' host='localhost' password='Tunacompany5694!' port='5432'"
    }, 
    'bag_pand': {
        'zip_file_name': '9999PND08022024.zip',
        'extract_function': extract_pand_data,
        'table_name': "bag_pand",
        'column_names': '(id_pand, geometry, build_year, status, document_date, document_number, registration_start, registration_end)',
        'db_params': "dbname='urbanmine' user='postgres' host='localhost' password='Tunacompany5694!' port='5432'"
    },
    'bag_num': {
        'zip_file_name': '9999NUM08022024.zip',
        'extract_function': extract_num_data,
        'table_name': "bag_num",
        'column_names': '(id_num, house_number, house_letter, post_code, document_date, document_number, registration_start, registration_end)',
        'db_params': "dbname='urbanmine' user='postgres' host='localhost' password='Tunacompany5694!' port='5432'"
    }
}

# Import the three data sets one after another, in dict order.
for params_dict in params_dict_all.values():
    process_and_insert_xml(params_dict)

Processing 9999VBO08022024.zip ...
Found 2326 XML files in 9999VBO08022024.zip ...
9999VBO08022024-000001.xml
9999VBO08022024-000002.xml
9999VBO08022024-000003.xml
9999VBO08022024-000004.xml
9999VBO08022024-000005.xml
9999VBO08022024-000006.xml
9999VBO08022024-000007.xml
9999VBO08022024-000008.xml
9999VBO08022024-000009.xml
9999VBO08022024-000010.xml
9999VBO08022024-000011.xml
9999VBO08022024-000012.xml
9999VBO08022024-000013.xml
9999VBO08022024-000014.xml
9999VBO08022024-000015.xml
9999VBO08022024-000016.xml
9999VBO08022024-000017.xml
9999VBO08022024-000018.xml
9999VBO08022024-000019.xml
9999VBO08022024-000020.xml
9999VBO08022024-000021.xml
9999VBO08022024-000022.xml
9999VBO08022024-000023.xml
9999VBO08022024-000024.xml
9999VBO08022024-000025.xml
9999VBO08022024-000026.xml
9999VBO08022024-000027.xml
9999VBO08022024-000028.xml
9999VBO08022024-000029.xml
9999VBO08022024-000030.xml
9999VBO08022024-000031.xml
9999VBO08022024-000032.xml
9999VBO08022024-000033.xml
9999VBO08022024-000034.xml

# Re-stream bag pand data to Postgresql database

In [None]:
import geopandas as gpd
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import folium 
import zipfile
import xml.etree.ElementTree as ET
import io 
import psycopg2
from io import StringIO

In [None]:
def connect_to_db():
    """Open a connection to the local ``urbanmine`` database.

    Returns:
        A ``(conn, cursor)`` pair; the caller is responsible for closing both.
    """
    connection = psycopg2.connect(
        "dbname='urbanmine' user='postgres' host='localhost' password='Tunacompany5694!' port='5432'"
    )
    return connection, connection.cursor()

def make_new_bag_pand_table(cursor, conn):
    """Create ``bag_pand_new`` if it does not exist, then close cursor and connection.

    Args:
        cursor: open cursor on ``conn``.
        conn: open database connection.

    Both ``cursor`` and ``conn`` are closed when the function returns, even if
    creating the table or committing fails, so neither may be reused afterwards.
    """
    try:
        cursor.execute("""
    CREATE TABLE IF NOT EXISTS bag_pand_new (
        id_pand VARCHAR,
        geometry VARCHAR,
        build_year VARCHAR,
        status VARCHAR,
        document_date VARCHAR,
        document_number VARCHAR,
        registration_start VARCHAR,
        registration_end VARCHAR
    );
    """)
        conn.commit()
    finally:
        # Release the resources on the failure path too; the error still propagates.
        cursor.close()
        conn.close()

def extract_pand_data(root_pand):
    """Flatten every ``Pand`` element under ``root_pand`` into a tuple.

    Args:
        root_pand: root element of a parsed PND XML document.

    Returns:
        A list of 8-tuples ``(id_pand, geometry, build_year, status,
        document_date, document_number, registration_start,
        registration_end)``. Each value is the text of the first matching
        descendant, or None when the element is absent.
    """
    # Prefix -> namespace URI map for the XPath lookups below.
    ns = {
        'lvbag': 'www.kadaster.nl/schemas/lvbag/imbag/objecten/v20200601',
        'gml': 'http://www.opengis.net/gml/3.2',
        'hist': 'www.kadaster.nl/schemas/lvbag/imbag/historie/v20200601'
    }

    # XPaths in output-column order.
    field_paths = (
        './/lvbag:identificatie',            # id_pand
        './/gml:posList',                    # geometry
        './/lvbag:oorspronkelijkBouwjaar',   # build_year
        './/lvbag:status',                   # status
        './/lvbag:documentdatum',            # document_date
        './/lvbag:documentnummer',           # document_number
        './/hist:beginGeldigheid',           # registration_start
        './/hist:eindGeldigheid',            # registration_end
    )

    def text_at(node, path):
        hit = node.find(path, namespaces=ns)
        return None if hit is None else hit.text

    return [
        tuple(text_at(pand, path) for path in field_paths)
        for pand in root_pand.findall('.//lvbag:Pand', namespaces=ns)
    ]

def process_and_insert_xml(params_dict, batch_size=1000, archive_path='data/bag/lvbag-extract-nl.zip'):
    """Parse every XML file of one inner BAG zip and insert its rows in batches.

    Args:
        params_dict: mapping with the keys
            ``zip_file_name`` (inner zip inside the outer archive),
            ``extract_function`` (callable taking an XML root and returning a
            list of row tuples), ``db_params`` (psycopg2 connection string),
            ``table_name`` and ``column_names`` (parenthesised, comma-separated
            column list).
        batch_size: minimum number of buffered rows that triggers an insert.
            The buffer is checked after each XML file, so a batch may be larger.
        archive_path: path of the outer extract zip.

    No database connection is opened while there are no rows to insert.
    """
    # extract params
    zip_file_name = params_dict['zip_file_name']
    extract_function = params_dict['extract_function']
    db_params = params_dict['db_params']
    table_name = params_dict['table_name']
    column_names = params_dict['column_names']
    print(f'Processing {zip_file_name} ...')

    insert_query_prefix = f'INSERT INTO {table_name} {column_names} VALUES '
    values_list = []

    # get num columns for mogrify: one %s placeholder per listed column
    columns_list = column_names.strip('()').replace(" ", "").split(',')
    placeholders = ', '.join(['%s'] * len(columns_list))

    def insert_batch(cursor, values_list):
        # Render each row with the driver's escaping, send one multi-row
        # INSERT, then empty the shared buffer in place.
        if not values_list:
            return
        query = insert_query_prefix + ','.join(
            cursor.mogrify(f'({placeholders})', row).decode('utf-8') for row in values_list
        )
        cursor.execute(query)
        values_list.clear()

    def flush(values_list):
        # `with conn` commits or rolls back but does not close the connection;
        # close it explicitly so batches do not pile up open connections.
        conn = psycopg2.connect(db_params)
        try:
            with conn, conn.cursor() as cursor:
                insert_batch(cursor, values_list)
        finally:
            conn.close()

    with zipfile.ZipFile(archive_path, 'r') as outer_zip:
        with outer_zip.open(zip_file_name) as inner_zip:
            # The inner zip is read fully into memory before it is opened.
            with zipfile.ZipFile(io.BytesIO(inner_zip.read())) as zfile:
                xml_names = zfile.namelist()
                print(f'Found {len(xml_names)} XML files in {zip_file_name} ...')

                for xml_name in xml_names:
                    print(xml_name)
                    with zfile.open(xml_name) as xml_file:
                        root = ET.parse(xml_file).getroot()
                    values_list.extend(extract_function(root))

                    if len(values_list) >= batch_size:
                        flush(values_list)

                # Insert any remaining data
                if values_list:
                    flush(values_list)

In [None]:
# connect to db, make new table
# make_new_bag_pand_table closes both cursor and conn, so they cannot be reused below.
conn, cursor = connect_to_db()
make_new_bag_pand_table(cursor, conn)

# stream XML data to new table
# Only the pand data set is re-imported, into bag_pand_new instead of bag_pand.
params_dict_all = {
    'bag_pand': {
        'zip_file_name': '9999PND08022024.zip',
        'extract_function': extract_pand_data,
        'table_name': "bag_pand_new",
        'column_names': '(id_pand, geometry, build_year, status, document_date, document_number, registration_start, registration_end)',
        'db_params': "dbname='urbanmine' user='postgres' host='localhost' password='Tunacompany5694!' port='5432'"
    }
}
for params_dict in params_dict_all.values():
    process_and_insert_xml(params_dict)

# Examining mismatch between pand and vbo
- There are currently ~10 million buildings in the Netherlands. 
- `bag_pand` has 12,100,336 buildings, and `bag_vbo` has 6,728,828 buildings. 
- All 6.7 million buildings in `bag_vbo` have a match with `bag_pand`, so there are ~6 million buildings in `bag_pand` that have no vbo (sqm, function) information

### Looking at sample in center of Amsterdam

In [2]:
# Load the exported query result and work on a copy so df_file stays untouched.
df_file = gpd.read_file('data/sql_outputs/final_result.shp')
df = df_file.copy()
# Flag each row by whether V_ID_PAND is set.
# NOTE(review): only None counts as 'no'; if missing values are loaded as NaN
# they would be labelled 'yes' - verify against the data.
df['match'] = df.V_ID_PAND.map(lambda x: 'no' if x is None else 'yes')
# Split into the two groups used by the plots below.
matches = df[df['match'] == 'yes']
non_matches = df[df['match'] == 'no']

In [7]:
# plotly pie chart of match column of df
# Shows the share of 'yes' vs 'no' rows in df['match'].
import plotly.express as px
fig = px.pie(df, names='match', title='Match between PAND and VBO')
fig.update_layout(width=500)
fig.show()

In [3]:
# plot df in folium map using match as color
# Map is centred on the fixed coordinates below at zoom level 12.
m = folium.Map(location=[52.379189, 4.899431], zoom_start=12)
# One toggleable layer per group, both visible by default.
match_group = folium.FeatureGroup(name='Matches', show=True)
non_match_group = folium.FeatureGroup(name='Non-Matches', show=True)
# Each geometry is added as its own GeoJson overlay: green for matches, red for non-matches.
for _, row in matches.iterrows():
    folium.GeoJson(row['geometry'], style_function=lambda x: {'color': 'green'}).add_to(match_group)
for _, row in non_matches.iterrows():
    folium.GeoJson(row['geometry'], style_function=lambda x: {'color': 'red'}).add_to(non_match_group)

# Attach both layers and a layer control to the map.
non_match_group.add_to(m)
match_group.add_to(m)
folium.LayerControl().add_to(m)
# Last expression of the cell: renders the map in the notebook.
m