In [1]:
import gzip
from bs4 import BeautifulSoup
import bs4
import pandas as pd
import numpy as np
import requests
import re

In [2]:
# Let pandas display up to 60 columns before it starts eliding wide frames.
pd.set_option("display.max_columns", 60)

In [29]:
# Open and read the gzipped xml file. The context manager closes the file
# handle as soon as the bytes have been read, even if reading raises.
with gzip.open('incidents.xml.gz') as infile:
    content = infile.read()

# Transform content into Beautiful Soup for further processing.
# With the "lxml" parser the tag names come out lower-cased and the document is
# wrapped in <html><body> (visible in the output below); the later cells refer
# to tags by these lower-cased names.
soup = BeautifulSoup(content, "lxml")
soup

<?xml version="1.0" encoding="UTF-8"?><html><body><soap:envelope xmlns:soap="http://schemas.xmlsoap.org/soap/envelope/"><soap:body><d2logicalmodel modelbaseversion="2" xmlns="http://datex2.eu/schema/2/2_0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"><exchange><supplieridentification><country>nl</country><nationalidentifier>NLNDW</nationalidentifier></supplieridentification><subscription><operatingmode>operatingMode3</operatingmode><subscriptionstarttime>2021-01-11T08:12:59.686Z</subscriptionstarttime><subscriptionstate>active</subscriptionstate><updatemethod>snapshot</updatemethod><target><address></address><protocol>HTTP</protocol></target></subscription></exchange><payloadpublication lang="nl" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:type="SituationPublication"><publicationtime>2021-01-18T13:14:05.916Z</publicationtime><publicationcreator><country>nl</country><nationalidentifier>NLNDW</nationalidentifier></publicationcreator><situation id="RWS03_966892" ver

In [None]:
# Names of all tags in the parsed file, in the order find_all() returns them

tags = [element.name for element in soup.find_all()]
tags

In [None]:
# Work-around to find content of 'situation' tag:
# record the position of every 'situation' entry in `tags`, then print the
# distance between each pair of consecutive positions.

indices = [position for position, name in enumerate(tags) if name == "situation"]
print(indices)
for current, following in zip(indices, indices[1:]):
    print(following - current)

In [None]:
# Distinct tag names that occur from the first 'situation' tag onwards.
# In the original file there are 48 situations with a different number of tags
# within each, and the first situation is tag 22. The start position is looked
# up rather than hard-coded so the cell keeps working when the header in front
# of the situations has a different length. Without any situation the set is empty.

first_situation = tags.index("situation") if "situation" in tags else len(tags)
situation_tags = set(tags[first_situation:])
situation_tags

In [None]:
# Value of the `id` attribute of every <situation> element that carries one

ids = [situation_tag['id'] for situation_tag in soup.select('situation[id]')]
ids

In [None]:
# For every situation id: look the situation up by that id and list the names
# of all tags nested inside the first match.
# NOTE(review): `childs` is overwritten on each pass, so only the tag names of
# the last situation remain once the loop has finished.
for i in ids:
    cdict = dict(id=i)
    situation = soup.find_all('situation', attrs=cdict)
    childs = [descendant.name for descendant in situation[0].find_all()]

In [None]:
# Present each situation as a separate dataframe and collect all dataframes in a list.
#
# Each dataframe has a single column, indexed by 'situation_id' followed by the
# names of the tags found inside the situation. Every situation is visited once
# and its descendants are scanned once; when a tag name occurs several times
# inside a situation, only its first occurrence is kept.

df_list = []
for situation in soup.select('situation[id]'):
    first_values = {}
    for child in situation.find_all():
        if child.name not in first_values:
            # next_element is whatever was parsed right after the opening tag:
            # the text or nested tag inside it, or - for an empty tag - the
            # element that follows it. Nested tags are unwrapped later on.
            first_values[child.name] = child.next_element

    situation_dict = {'situation_id': situation['id'], **first_values}

    df_list.append(pd.DataFrame.from_dict(situation_dict, orient='index'))

In [None]:
# Aggregate all situations in one dataframe: the per-situation columns are put
# side by side and transposed, giving one row per situation; the index is then
# renumbered and the helper 'index' column produced by reset_index() discarded.

df = (
    pd.concat(df_list, axis=1)
    .T
    .reset_index()
    .drop(columns='index')
)

In [None]:
# Drop duplicated columns.
# errors='ignore' skips names that are not present, so the cell can be re-run
# and also works for a file in which some of these tags never occur.

to_drop = ['headerinformation','situationrecord','source', 'sourcename', 'values', 'validity', 'validitytimespecification',
          'groupoflocations', 'locationfordisplay', 'alertclinear', 'alertcdirection', 'alertcmethod4primarypointlocation',
          'alertclocation', 'linearextension', 'linearbycoordinatesextension', 'situationrecordextension',
          'situationrecordextendedapproved', 'mobilityofobstruction', 'pointextension', 'roadsidereferencepoint',
          'roadsidereferencepointidentifier', 'administrativearea', 'roadname', 'obstructiontype',
          'situationrecordobservationtime', 'situationrecordversiontime','linearcoordinatesstartpoint', 'pointcoordinates',
           'linearcoordinatesendpoint']
df = df.drop(columns=to_drop, errors='ignore')

In [None]:
# Small function to extract content of tags nested within the values of dataframes

def extract_content(line):
    """Unwrap a Beautiful Soup tag into the first piece of content it holds.

    Parameters
    ----------
    line : object
        A single dataframe value. Anything that is not a ``bs4.element.Tag``
        is returned unchanged.

    Returns
    -------
    object
        For a tag: its first child, or - when that child is itself a tag -
        the first child of that nested tag. ``np.nan`` when the tag (or the
        nested tag) has no content at all, so empty tags end up as missing
        values instead of raising ``IndexError``.
    """
    if not isinstance(line, bs4.element.Tag):
        return line
    if not line.contents:
        return np.nan

    first = line.contents[0]
    if isinstance(first, bs4.element.Tag):
        # One level of nesting is unwrapped; deeper levels are left as they are.
        return first.contents[0] if first.contents else np.nan
    return first
    
# extract_content(df['offsetdistance'][0])    # test
# extract_content(df.iloc[38], 'offsetdistance')     # test

In [None]:
# Apply extract_content to every column that holds at least one Tag value,
# replacing the tags in that column by their extracted content

for col in df.columns:
    contains_tag = df[col].apply(lambda value: type(value) == bs4.element.Tag).any()
    if contains_tag:
        df[col] = df[col].apply(extract_content)

In [None]:
# List all columns containing only digits: a column qualifies when it has at
# least one non-missing value and none of its non-missing values contains a
# letter. (Testing "not all values contain letters" instead would also select
# columns that mix text and numbers, which cannot be converted to float.)

numeric_columns = []
for col in df.columns:
    values = df[col].dropna().map(str)
    if values.empty:
        continue
    if not values.str.contains(r'[a-zA-Z]').any():
        numeric_columns.append(col)

# Nothing to convert when no column qualified
if numeric_columns:
    df[numeric_columns] = df[numeric_columns].astype(float)

In [None]:
# Convert all values in dataframe into string instead of Navigable String for further processing, excluding columns processed
# earlier. The conversion is done column by column with Series.map, which is available in every pandas version
# (DataFrame.applymap is deprecated in recent releases). Missing values become the string 'nan'.

columns = [col for col in df.columns if col not in numeric_columns]
for col in columns:
    df[col] = df[col].map(str)

In [None]:
# Columns whose name contains 'time' are treated as date/time columns
time_columns = [str(name) for name in df.columns if 'time' in name]

# Stack these columns into one long series, parse it to datetime in a single
# call and unstack the result back into the original columns
stacked_times = df[time_columns].stack()
df[time_columns] = pd.to_datetime(stacked_times).unstack()

In [None]:
# Print a summary of the resulting frame (columns, non-null counts, dtypes)
# to check the conversions made in the cells above.
df.info()

In [None]:
# Write the processed incidents table to a CSV file in the working directory.
# NOTE(review): the date in the file name is hard-coded, while the sample output
# of the parsed file shows a publicationtime of 2021-01-18 - confirm the name
# matches the snapshot that was actually loaded.
df.to_csv('incidents_17.12.2020.csv')

1. Transform xml file into dataframe:
    - build visualization on map
2. Compare files from different dates - what is the update process?
3. Find meaningful connections with other data
4. Set up pipeline