In [171]:
import gzip
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import requests
import re

In [203]:
pd.options.display.max_columns = 60

In [106]:
# open and read gzipped xml file
infile = gzip.open('incidents.xml.gz')
content = infile.read()

# Transform content into Beautiful Soup for further processing
soup = BeautifulSoup(content, "lxml")
soup

<?xml version="1.0" encoding="UTF-8"?><html><body><soap:envelope xmlns:soap="http://schemas.xmlsoap.org/soap/envelope/"><soap:body><d2logicalmodel modelbaseversion="2" xmlns="http://datex2.eu/schema/2/2_0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"><exchange><supplieridentification><country>nl</country><nationalidentifier>NLNDW</nationalidentifier></supplieridentification><subscription><operatingmode>operatingMode3</operatingmode><subscriptionstarttime>2020-10-29T04:22:19.204Z</subscriptionstarttime><subscriptionstate>active</subscriptionstate><updatemethod>snapshot</updatemethod><target><address></address><protocol>HTTP</protocol></target></subscription></exchange><payloadpublication lang="nl" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:type="SituationPublication"><publicationtime>2020-12-17T16:44:50.449Z</publicationtime><publicationcreator><country>nl</country><nationalidentifier>NLNDW</nationalidentifier></publicationcreator><situation id="RWS03_956972" ver

In [107]:
# cdict = {'id':'RWS03_956972'}
# repos = soup.find_all('situation', cdict)
# repos

In [101]:
# List all tags of the file

tags = [tag.name for tag in soup.find_all()]
tags

['html',
 'body',
 'soap:envelope',
 'soap:body',
 'd2logicalmodel',
 'exchange',
 'supplieridentification',
 'country',
 'nationalidentifier',
 'subscription',
 'operatingmode',
 'subscriptionstarttime',
 'subscriptionstate',
 'updatemethod',
 'target',
 'address',
 'protocol',
 'payloadpublication',
 'publicationtime',
 'publicationcreator',
 'country',
 'nationalidentifier',
 'situation',
 'overallseverity',
 'situationversiontime',
 'headerinformation',
 'confidentiality',
 'informationstatus',
 'situationrecord',
 'situationrecordcreationreference',
 'situationrecordcreationtime',
 'situationrecordobservationtime',
 'situationrecordversiontime',
 'situationrecordfirstsupplierversiontime',
 'probabilityofoccurrence',
 'source',
 'sourcename',
 'values',
 'value',
 'validity',
 'validitystatus',
 'validitytimespecification',
 'overallstarttime',
 'groupoflocations',
 'locationfordisplay',
 'latitude',
 'longitude',
 'alertclinear',
 'alertclocationcountrycode',
 'alertclocationtable

In [131]:
# Work-around to find content of 'situation' tag

indices = [i for i, x in enumerate(tags) if x == "situation"]
print(indices)
for i in range(len(indices)-1):
    print(indices[i+1]-indices[i])

[22, 79, 136, 193, 250, 304, 361, 418, 473, 530, 587, 644, 701, 758, 815, 870, 927, 973, 1030, 1087, 1142, 1185, 1242, 1286, 1343, 1389, 1446, 1492, 1549, 1606, 1661, 1718, 1775, 1827, 1873, 1930, 1987, 2041, 2098, 2138, 2195, 2252, 2309, 2366, 2407, 2464, 2508, 2563]
57
57
57
57
54
57
57
55
57
57
57
57
57
57
55
57
46
57
57
55
43
57
44
57
46
57
46
57
57
55
57
57
52
46
57
57
54
57
40
57
57
57
57
41
57
44
55


In [141]:
# There are 48 situations with different number of tags within each. The first situation starts from tag 22.

situation_tags = set(tags[22:])
situation_tags

{'accidenttype',
 'administrativearea',
 'alertcdirection',
 'alertcdirectioncoded',
 'alertclinear',
 'alertclocation',
 'alertclocationcountrycode',
 'alertclocationtablenumber',
 'alertclocationtableversion',
 'alertcmethod4primarypointlocation',
 'alertcmethod4secondarypointlocation',
 'alertcpoint',
 'confidentiality',
 'groupoflocations',
 'headerinformation',
 'informationstatus',
 'latitude',
 'linearbycoordinatesextension',
 'linearcoordinatesendpoint',
 'linearcoordinatesstartpoint',
 'linearextension',
 'locationfordisplay',
 'longitude',
 'mobilityofobstruction',
 'mobilitytype',
 'obstructiontype',
 'offsetdistance',
 'overallseverity',
 'overallstarttime',
 'pointbycoordinates',
 'pointcoordinates',
 'pointextension',
 'probabilityofoccurrence',
 'roadname',
 'roadsidereferencepoint',
 'roadsidereferencepointidentifier',
 'safetyrelatedmessage',
 'situation',
 'situationrecord',
 'situationrecordcreationreference',
 'situationrecordcreationtime',
 'situationrecordextended

In [121]:
# Ids of each situation in the file

ids = [tag['id'] for tag in soup.select('situation[id]')]
ids

['RWS03_956972',
 'RWS03_956973',
 'RWS03_956974',
 'RWS03_956975',
 'RWS03_956976',
 'RWS03_956977',
 'RWS03_956978',
 'RWS03_956979',
 'RWS03_956970',
 'RWS03_956971',
 'RWS03_956983',
 'RWS03_956984',
 'RWS03_956985',
 'RWS03_956986',
 'RWS03_956987',
 'RWS03_956988',
 'RWS03_956989',
 'RWS03_956980',
 'RWS03_956981',
 'RWS03_956982',
 'RWS03_956554',
 'RWS03_957005',
 'RWS03_957006',
 'RWS03_957007',
 'RWS03_957008',
 'RWS03_957009',
 'RWS03_957000',
 'RWS03_957001',
 'RWS03_957002',
 'RWS03_957003',
 'RWS03_957004',
 'RWS03_957017',
 'RWS03_956967',
 'RWS03_957010',
 'RWS03_957011',
 'RWS03_957012',
 'RWS03_957013',
 'RWS03_957014',
 'RWS03_957015',
 'RWS03_956994',
 'RWS03_956995',
 'RWS03_956996',
 'RWS03_956997',
 'RWS03_956999',
 'RWS03_956990',
 'RWS03_956991',
 'RWS03_956992',
 'RWS03_956993']

In [151]:
cdict = {'id':'RWS03_956972'}
situation = soup.find_all('situation', cdict)
childs = [tag.name for tag in situation[0].find_all()]
childs

['overallseverity',
 'situationversiontime',
 'headerinformation',
 'confidentiality',
 'informationstatus',
 'situationrecord',
 'situationrecordcreationreference',
 'situationrecordcreationtime',
 'situationrecordobservationtime',
 'situationrecordversiontime',
 'situationrecordfirstsupplierversiontime',
 'probabilityofoccurrence',
 'source',
 'sourcename',
 'values',
 'value',
 'validity',
 'validitystatus',
 'validitytimespecification',
 'overallstarttime',
 'groupoflocations',
 'locationfordisplay',
 'latitude',
 'longitude',
 'alertclinear',
 'alertclocationcountrycode',
 'alertclocationtablenumber',
 'alertclocationtableversion',
 'alertcdirection',
 'alertcdirectioncoded',
 'alertcmethod4primarypointlocation',
 'alertclocation',
 'specificlocation',
 'offsetdistance',
 'offsetdistance',
 'alertcmethod4secondarypointlocation',
 'alertclocation',
 'specificlocation',
 'offsetdistance',
 'offsetdistance',
 'linearextension',
 'linearbycoordinatesextension',
 'linearcoordinatesstar

In [211]:
# Present each situation as a separate dataframe and collect all dataframes in a list

df_list = []
for id_code in ids:
    situation_dict = {'situation_id':id_code}

    for tag in childs:
        situation_dict[tag] = situation[0].find_all(tag)[0].next
    
    df_list.append(pd.DataFrame.from_dict(situation_dict, orient='index'))

In [212]:
# Aggreage all situations in one dataframe

df = pd.concat(df_list, axis=1).transpose().reset_index().drop('index', axis=1)

In [213]:
# Drop duplicated columns

to_drop = ['headerinformation','situationrecord','source', 'sourcename', 'values', 'validity', 'validitytimespecification',
          'groupoflocations', 'locationfordisplay', 'alertclinear', 'alertcdirection', 'alertcmethod4primarypointlocation',
          'alertclocation', 'linearextension', 'linearbycoordinatesextension', 'situationrecordextension',
          'situationrecordextendedapproved', 'mobilityofobstruction']
df.drop(to_drop, axis=1, inplace=True)

In [216]:
df

Unnamed: 0,situation_id,overallseverity,situationversiontime,confidentiality,informationstatus,situationrecordcreationreference,situationrecordcreationtime,situationrecordobservationtime,situationrecordversiontime,situationrecordfirstsupplierversiontime,probabilityofoccurrence,value,validitystatus,overallstarttime,latitude,longitude,alertclocationcountrycode,alertclocationtablenumber,alertclocationtableversion,alertcdirectioncoded,specificlocation,offsetdistance,alertcmethod4secondarypointlocation,linearcoordinatesstartpoint,pointcoordinates,linearcoordinatesendpoint,safetyrelatedmessage,mobilitytype,vehicleobstructiontype
0,RWS03_956972,unknown,2020-12-17T15:33:15Z,noRestriction,real,LCM-LCM20168480_IM_1,2020-12-17T15:33:15Z,2020-12-17T15:33:15Z,2020-12-17T15:33:15Z,2020-12-17T15:33:15Z,certain,VC Noordwest-Nederland,active,2020-12-17T15:32:36Z,52.42696,4.682106,8,5.21,A,negative,10589,[1100],[[10591]],"[[52.42696], [4.682106]]",[52.42696],"[[52.42696], [4.682106]]",True,stationary,brokenDownVehicle
1,RWS03_956973,unknown,2020-12-17T15:33:15Z,noRestriction,real,LCM-LCM20168480_IM_1,2020-12-17T15:33:15Z,2020-12-17T15:33:15Z,2020-12-17T15:33:15Z,2020-12-17T15:33:15Z,certain,VC Noordwest-Nederland,active,2020-12-17T15:32:36Z,52.42696,4.682106,8,5.21,A,negative,10589,[1100],[[10591]],"[[52.42696], [4.682106]]",[52.42696],"[[52.42696], [4.682106]]",True,stationary,brokenDownVehicle
2,RWS03_956974,unknown,2020-12-17T15:33:15Z,noRestriction,real,LCM-LCM20168480_IM_1,2020-12-17T15:33:15Z,2020-12-17T15:33:15Z,2020-12-17T15:33:15Z,2020-12-17T15:33:15Z,certain,VC Noordwest-Nederland,active,2020-12-17T15:32:36Z,52.42696,4.682106,8,5.21,A,negative,10589,[1100],[[10591]],"[[52.42696], [4.682106]]",[52.42696],"[[52.42696], [4.682106]]",True,stationary,brokenDownVehicle
3,RWS03_956975,unknown,2020-12-17T15:33:15Z,noRestriction,real,LCM-LCM20168480_IM_1,2020-12-17T15:33:15Z,2020-12-17T15:33:15Z,2020-12-17T15:33:15Z,2020-12-17T15:33:15Z,certain,VC Noordwest-Nederland,active,2020-12-17T15:32:36Z,52.42696,4.682106,8,5.21,A,negative,10589,[1100],[[10591]],"[[52.42696], [4.682106]]",[52.42696],"[[52.42696], [4.682106]]",True,stationary,brokenDownVehicle
4,RWS03_956976,unknown,2020-12-17T15:33:15Z,noRestriction,real,LCM-LCM20168480_IM_1,2020-12-17T15:33:15Z,2020-12-17T15:33:15Z,2020-12-17T15:33:15Z,2020-12-17T15:33:15Z,certain,VC Noordwest-Nederland,active,2020-12-17T15:32:36Z,52.42696,4.682106,8,5.21,A,negative,10589,[1100],[[10591]],"[[52.42696], [4.682106]]",[52.42696],"[[52.42696], [4.682106]]",True,stationary,brokenDownVehicle
5,RWS03_956977,unknown,2020-12-17T15:33:15Z,noRestriction,real,LCM-LCM20168480_IM_1,2020-12-17T15:33:15Z,2020-12-17T15:33:15Z,2020-12-17T15:33:15Z,2020-12-17T15:33:15Z,certain,VC Noordwest-Nederland,active,2020-12-17T15:32:36Z,52.42696,4.682106,8,5.21,A,negative,10589,[1100],[[10591]],"[[52.42696], [4.682106]]",[52.42696],"[[52.42696], [4.682106]]",True,stationary,brokenDownVehicle
6,RWS03_956978,unknown,2020-12-17T15:33:15Z,noRestriction,real,LCM-LCM20168480_IM_1,2020-12-17T15:33:15Z,2020-12-17T15:33:15Z,2020-12-17T15:33:15Z,2020-12-17T15:33:15Z,certain,VC Noordwest-Nederland,active,2020-12-17T15:32:36Z,52.42696,4.682106,8,5.21,A,negative,10589,[1100],[[10591]],"[[52.42696], [4.682106]]",[52.42696],"[[52.42696], [4.682106]]",True,stationary,brokenDownVehicle
7,RWS03_956979,unknown,2020-12-17T15:33:15Z,noRestriction,real,LCM-LCM20168480_IM_1,2020-12-17T15:33:15Z,2020-12-17T15:33:15Z,2020-12-17T15:33:15Z,2020-12-17T15:33:15Z,certain,VC Noordwest-Nederland,active,2020-12-17T15:32:36Z,52.42696,4.682106,8,5.21,A,negative,10589,[1100],[[10591]],"[[52.42696], [4.682106]]",[52.42696],"[[52.42696], [4.682106]]",True,stationary,brokenDownVehicle
8,RWS03_956970,unknown,2020-12-17T15:33:15Z,noRestriction,real,LCM-LCM20168480_IM_1,2020-12-17T15:33:15Z,2020-12-17T15:33:15Z,2020-12-17T15:33:15Z,2020-12-17T15:33:15Z,certain,VC Noordwest-Nederland,active,2020-12-17T15:32:36Z,52.42696,4.682106,8,5.21,A,negative,10589,[1100],[[10591]],"[[52.42696], [4.682106]]",[52.42696],"[[52.42696], [4.682106]]",True,stationary,brokenDownVehicle
9,RWS03_956971,unknown,2020-12-17T15:33:15Z,noRestriction,real,LCM-LCM20168480_IM_1,2020-12-17T15:33:15Z,2020-12-17T15:33:15Z,2020-12-17T15:33:15Z,2020-12-17T15:33:15Z,certain,VC Noordwest-Nederland,active,2020-12-17T15:32:36Z,52.42696,4.682106,8,5.21,A,negative,10589,[1100],[[10591]],"[[52.42696], [4.682106]]",[52.42696],"[[52.42696], [4.682106]]",True,stationary,brokenDownVehicle


In [77]:
soup.find_all('publicationtime')[0].contents

['2020-12-17T16:44:50.449Z']

In [99]:
soup.find('situation').findChildren()

[<overallseverity>unknown</overallseverity>,
 <situationversiontime>2020-12-17T15:33:15Z</situationversiontime>,
 <headerinformation><confidentiality>noRestriction</confidentiality><informationstatus>real</informationstatus></headerinformation>,
 <confidentiality>noRestriction</confidentiality>,
 <informationstatus>real</informationstatus>,
 <situationrecord id="RWS03_956972_1" version="2" xsi:type="VehicleObstruction"><situationrecordcreationreference>LCM-LCM20168480_IM_1</situationrecordcreationreference><situationrecordcreationtime>2020-12-17T15:33:15Z</situationrecordcreationtime><situationrecordobservationtime>2020-12-17T15:33:15Z</situationrecordobservationtime><situationrecordversiontime>2020-12-17T15:33:15Z</situationrecordversiontime><situationrecordfirstsupplierversiontime>2020-12-17T15:33:15Z</situationrecordfirstsupplierversiontime><probabilityofoccurrence>certain</probabilityofoccurrence><source><sourcename><values><value lang="nl">VC Noordwest-Nederland</value></values></

1. Transform xml file into dataframe
2. Compare files from different dates - what is the update process?
3. Find meaningful connections with other data
4. Set up pipeline