In [46]:
import pandas as pd
import requests
import xml.etree.ElementTree as ET
import xml.dom.minidom
import json
import csv
import datetime as dt

In [47]:
# Download the estimated-timetable feed for dataset FLT.
# `myToken` is sent as the token value of the Authorization header.
myToken = 'NLOD'
myUrl = 'https://api.entur.io/realtime/v1/rest/et?datasetId=FLT'
head = {'Authorization': 'token {}'.format(myToken)}

# NOTE(review): Entur's API guidelines ask clients to identify themselves with an
# ET-Client-Name header - confirm whether it should be added to `head`.
# The timeout keeps the notebook from hanging forever if the endpoint is unreachable.
response = requests.get(myUrl, headers=head, timeout=30)

# Stop here on an HTTP error (4xx/5xx) instead of passing an error page to the XML parser.
response.raise_for_status()

# Print the raw content of the response to check its format; this is how I found out that it's XML.
#print(response.content)

b'<?xml version="1.0" encoding="UTF-8" standalone="yes"?><Siri xmlns="http://www.siri.org.uk/siri" xmlns:ns2="http://www.ifopt.org.uk/acsb" xmlns:ns3="http://www.ifopt.org.uk/ifopt" xmlns:ns4="http://datex2.eu/schema/2_0RC1/2_0" version="2.0"><ServiceDelivery><ResponseTimestamp>2024-10-21T21:13:13.055691831+02:00</ResponseTimestamp><ProducerRef>ENT</ProducerRef><EstimatedTimetableDelivery version="2.0"><ResponseTimestamp>2024-10-21T21:13:13.055693468+02:00</ResponseTimestamp><EstimatedJourneyVersionFrame><RecordedAtTime>2024-10-21T21:13:13.05569244+02:00</RecordedAtTime><EstimatedVehicleJourney><RecordedAtTime>2024-10-21T18:11:13.890534162+02:00</RecordedAtTime><LineRef>FLT:Line:FLY1</LineRef><DirectionRef>ToDrammen</DirectionRef><FramedVehicleJourneyRef><DataFrameRef>2024-10-22</DataFrameRef><DatedVehicleJourneyRef>FLT:ServiceJourney:1-2614-3754-20241022</DatedVehicleJourneyRef></FramedVehicleJourneyRef><VehicleMode>rail</VehicleMode><OriginRef>GAR</OriginRef><DestinationRef>DRM</Dest

In [48]:
# Raw (unformatted) XML payload, as bytes, taken from the HTTP response
xml_string = response.content

# Build the element tree: parse the payload into the root element first,
# then wrap that element in an ElementTree so both handles are available.
root = ET.fromstring(xml_string)
tree = ET.ElementTree(root)

# Human-readable rendering of the same document, for inspection only
dom = xml.dom.minidom.parseString(xml_string)
pretty_xml_as_string = dom.toprettyxml()
#print(pretty_xml_as_string)  # Uncomment to look at the XML structure

<?xml version="1.0" ?>
<Siri xmlns="http://www.siri.org.uk/siri" xmlns:ns2="http://www.ifopt.org.uk/acsb" xmlns:ns3="http://www.ifopt.org.uk/ifopt" xmlns:ns4="http://datex2.eu/schema/2_0RC1/2_0" version="2.0">
	<ServiceDelivery>
		<ResponseTimestamp>2024-10-21T21:13:13.055691831+02:00</ResponseTimestamp>
		<ProducerRef>ENT</ProducerRef>
		<EstimatedTimetableDelivery version="2.0">
			<ResponseTimestamp>2024-10-21T21:13:13.055693468+02:00</ResponseTimestamp>
			<EstimatedJourneyVersionFrame>
				<RecordedAtTime>2024-10-21T21:13:13.05569244+02:00</RecordedAtTime>
				<EstimatedVehicleJourney>
					<RecordedAtTime>2024-10-21T18:11:13.890534162+02:00</RecordedAtTime>
					<LineRef>FLT:Line:FLY1</LineRef>
					<DirectionRef>ToDrammen</DirectionRef>
					<FramedVehicleJourneyRef>
						<DataFrameRef>2024-10-22</DataFrameRef>
						<DatedVehicleJourneyRef>FLT:ServiceJourney:1-2614-3754-20241022</DatedVehicleJourneyRef>
					</FramedVehicleJourneyRef>
					<VehicleMode>rail</VehicleMode>
					

In [49]:
# Namespace map used to resolve the 'siri:' prefix in the XPath expressions below
namespace = {'siri': 'http://www.siri.org.uk/siri'}

# Child elements read once per EstimatedVehicleJourney (repeated on every row of that journey)
JOURNEY_FIELDS = [
    'RecordedAtTime',
    'DirectionRef',
    'OriginRef',
    'DestinationRef',
    'VehicleRef',
    'OperatorRef',
]

# Child elements read once per EstimatedCall (one row per call)
CALL_FIELDS = [
    'StopPointName',
    'ArrivalStatus',
    'AimedDepartureTime',
    'ExpectedDepartureTime',
    'Order',
]


def _child_text(parent, tag):
    """Return the text of the direct child ``siri:<tag>`` of ``parent``.

    Returns None when the child element is absent, so a journey or call that
    lacks one of the fields yields a missing value instead of raising
    AttributeError on ``None.text``.
    """
    node = parent.find('siri:' + tag, namespace)
    return node.text if node is not None else None


# One dictionary per (journey, call) pair
rows = []

for journey in root.findall('.//siri:EstimatedVehicleJourney', namespace):
    # Fields shared by every call of this journey
    vehicle_data = {field: _child_text(journey, field) for field in JOURNEY_FIELDS}

    for call in journey.findall('siri:EstimatedCalls/siri:EstimatedCall', namespace):
        call_data = {field: _child_text(call, field) for field in CALL_FIELDS}

        # Journey-level fields first, then call-level fields
        rows.append({**vehicle_data, **call_data})

# Passing the columns explicitly keeps the schema stable even when the feed
# contains no calls (an empty `rows` would otherwise give a column-less frame).
df = pd.DataFrame(rows, columns=JOURNEY_FIELDS + CALL_FIELDS)

# Display the first rows
df.head()


Unnamed: 0,RecordedAtTime,DirectionRef,OriginRef,DestinationRef,VehicleRef,OperatorRef,StopPointName,ArrivalStatus,AimedDepartureTime,ExpectedDepartureTime,Order
0,2024-10-21T18:11:13.890534162+02:00,ToDrammen,GAR,DRM,3754,FLT:Operator:507,Gardermoen,arrived,2024-10-22T14:10:00+02:00,2024-10-22T14:10:00+02:00,1
1,2024-10-21T18:11:13.890534162+02:00,ToDrammen,GAR,DRM,3754,FLT:Operator:507,Lillestrøm,arrived,2024-10-22T14:22:00+02:00,2024-10-22T14:22:00+02:00,2
2,2024-10-21T18:11:13.890534162+02:00,ToDrammen,GAR,DRM,3754,FLT:Operator:507,Oslo S,arrived,2024-10-22T14:35:00+02:00,2024-10-22T14:35:00+02:00,3
3,2024-10-21T18:11:13.890534162+02:00,ToDrammen,GAR,DRM,3754,FLT:Operator:507,Nationaltheatret,arrived,2024-10-22T14:37:00+02:00,2024-10-22T14:37:00+02:00,4
4,2024-10-21T18:11:13.890534162+02:00,ToDrammen,GAR,DRM,3754,FLT:Operator:507,Skøyen,arrived,2024-10-22T14:41:00+02:00,2024-10-22T14:41:00+02:00,5


In [50]:
# Zone used for the wall-clock columns derived below.
# NOTE(review): assumes the feed's UTC offsets correspond to Norwegian local time - confirm.
LOCAL_TZ = 'Europe/Oslo'


def _to_local(timestamps):
    """Parse ISO-8601 strings into timezone-aware datetimes expressed in LOCAL_TZ.

    Parsing with ``utc=True`` always yields a datetime64 column, even when the
    strings carry different UTC offsets (e.g. on either side of a DST switch);
    converting back to LOCAL_TZ afterwards gives the wall-clock time in that zone.
    """
    return pd.to_datetime(timestamps, utc=True).dt.tz_convert(LOCAL_TZ)


# This cell reads the raw string columns of `df` and then replaces `df`,
# so the cell that builds `df` must be re-run before re-running this one.
recorded = _to_local(df['RecordedAtTime'])
aimed = _to_local(df['AimedDepartureTime'])
expected = _to_local(df['ExpectedDepartureTime'])

cols = ['recorded_at_time', 'DirectionRef', 'OriginRef', 'DestinationRef', 'VehicleRef',
        'OperatorRef', 'StopPointName', 'ArrivalStatus', 'Order', 'aimed_dep_time',
        'expected_dep_time', 'month', 'year', 'date'
        ]

df = (
    df.assign(
        recorded_at_time=recorded.dt.strftime('%Y-%m-%d %H:%M:%S'),
        aimed_dep_time=aimed.dt.time,
        expected_dep_time=expected.dt.time,
        # date / month / year all come from the expected departure time;
        # rows without one get missing values in these columns.
        date=expected.dt.date,
        month=expected.dt.month_name(),
        # Nullable integer dtype: keeps the year an integer (2024, not 2024.0)
        # even when some rows have no expected departure time.
        year=expected.dt.year.astype('Int64'),
    )[cols]
    .rename(columns={'Order': 'order_of_stops'})
)

df.head()


Unnamed: 0,recorded_at_time,DirectionRef,OriginRef,DestinationRef,VehicleRef,OperatorRef,StopPointName,ArrivalStatus,order_of_stops,aimed_dep_time,expected_dep_time,month,year,date
0,2024-10-21 18:11:13,ToDrammen,GAR,DRM,3754,FLT:Operator:507,Gardermoen,arrived,1,14:10:00,14:10:00,October,2024.0,2024-10-22
1,2024-10-21 18:11:13,ToDrammen,GAR,DRM,3754,FLT:Operator:507,Lillestrøm,arrived,2,14:22:00,14:22:00,October,2024.0,2024-10-22
2,2024-10-21 18:11:13,ToDrammen,GAR,DRM,3754,FLT:Operator:507,Oslo S,arrived,3,14:35:00,14:35:00,October,2024.0,2024-10-22
3,2024-10-21 18:11:13,ToDrammen,GAR,DRM,3754,FLT:Operator:507,Nationaltheatret,arrived,4,14:37:00,14:37:00,October,2024.0,2024-10-22
4,2024-10-21 18:11:13,ToDrammen,GAR,DRM,3754,FLT:Operator:507,Skøyen,arrived,5,14:41:00,14:41:00,October,2024.0,2024-10-22
