In [61]:
import pandas as pd
import requests
import xml.etree.ElementTree as ET
import xml.dom.minidom
import json
import csv
import datetime as dt

In [62]:
myToken = 'NLOD'
myUrl = 'https://api.entur.io/realtime/v1/rest/et?datasetId=FLT'
head = {'Authorization': 'token {}'.format(myToken)}
response = requests.get(myUrl, headers=head)
# Print raw text content of the response to check the format, this is how I found out that it´s xml
#print(response.content)

In [63]:
# Parse the raw XML content
xml_string = response.content

# Parse the original, unformatted XML string
tree = ET.ElementTree(ET.fromstring(xml_string))
root = tree.getroot()

# Pretty print for viewing:
dom = xml.dom.minidom.parseString(xml_string)
pretty_xml_as_string = dom.toprettyxml()
#print(pretty_xml_as_string)  # Looking at the xml format

In [64]:
namespace = {'siri': 'http://www.siri.org.uk/siri'}

# Prepare lists to store data for each EstimatedVehicleJourney and EstimatedCall
rows = []

# Iterate through EstimatedVehicleJourney elements
for journey in root.findall('.//siri:EstimatedVehicleJourney', namespace):
    # Extract common fields for each vehicle journey
    vehicle_data = {
        'RecordedAtTime': journey.find('siri:RecordedAtTime', namespace).text,
        'DirectionRef': journey.find('siri:DirectionRef', namespace).text,
        'OriginRef': journey.find('siri:OriginRef', namespace).text,
        'DestinationRef': journey.find('siri:DestinationRef', namespace).text,
        'VehicleRef': journey.find('siri:VehicleRef', namespace).text,
        'OperatorRef': journey.find('siri:OperatorRef', namespace).text
    }
    
    # Iterate through the EstimatedCall elements for each journey
    for call in journey.findall('siri:EstimatedCalls/siri:EstimatedCall', namespace):
        aimed_departure_time = call.find('siri:AimedDepartureTime', namespace)
        aimed_departure_time_value = aimed_departure_time.text if aimed_departure_time is not None else None
        
        # Prepare call data
        call_data = {
            'StopPointName': call.find('siri:StopPointName', namespace).text,
            'ArrivalStatus': call.find('siri:ArrivalStatus', namespace).text,
            'AimedDepartureTime': aimed_departure_time_value,
            'ExpectedDepartureTime': call.find('siri:ExpectedDepartureTime', namespace).text if call.find('siri:ExpectedDepartureTime', namespace) is not None else None,
            'Order': call.find('siri:Order', namespace).text if call.find('siri:Order', namespace) is not None else None,
        }

        
        combined_data = {**vehicle_data, **call_data}
        
        
        rows.append(combined_data)


df = pd.DataFrame(rows)

df.head()


Unnamed: 0,RecordedAtTime,DirectionRef,OriginRef,DestinationRef,VehicleRef,OperatorRef,StopPointName,ArrivalStatus,AimedDepartureTime,ExpectedDepartureTime,Order
0,2024-10-22T09:02:50.994910969+02:00,ToGardermoen,DRM,GAR,3707,FLT:Operator:507,Drammen,arrived,2024-10-23T05:02:00+02:00,2024-10-23T05:02:00+02:00,1
1,2024-10-22T09:02:50.994910969+02:00,ToGardermoen,DRM,GAR,3707,FLT:Operator:507,Asker,arrived,2024-10-23T05:15:00+02:00,2024-10-23T05:15:00+02:00,2
2,2024-10-22T09:02:50.994910969+02:00,ToGardermoen,DRM,GAR,3707,FLT:Operator:507,Sandvika,arrived,2024-10-23T05:21:00+02:00,2024-10-23T05:21:00+02:00,3
3,2024-10-22T09:02:50.994910969+02:00,ToGardermoen,DRM,GAR,3707,FLT:Operator:507,Lysaker,arrived,2024-10-23T05:27:00+02:00,2024-10-23T05:27:00+02:00,4
4,2024-10-22T09:02:50.994910969+02:00,ToGardermoen,DRM,GAR,3707,FLT:Operator:507,Skøyen,arrived,2024-10-23T05:30:00+02:00,2024-10-23T05:30:00+02:00,5


In [65]:
df['RecordedAtTime'] = pd.to_datetime(df['RecordedAtTime'])
df['AimedDepartureTime'] = pd.to_datetime(df['AimedDepartureTime'])
df['ExpectedDepartureTime'] = pd.to_datetime(df['ExpectedDepartureTime'])

df['aimed_dep_time'] = df['AimedDepartureTime'].dt.time
df['expected_dep_time'] = df['ExpectedDepartureTime'].dt.time
df['date'] = df['ExpectedDepartureTime'].dt.date
df['month'] = df['ExpectedDepartureTime'].dt.month_name()
df['year'] = df['ExpectedDepartureTime'].dt.year
df['recorded_at_time'] = df['RecordedAtTime'].dt.strftime('%Y-%m-%d %H:%M:%S')

cols = ['recorded_at_time', 'DirectionRef', 'OriginRef', 'DestinationRef', 'VehicleRef', 
         'OperatorRef', 'StopPointName', 'ArrivalStatus', 'Order', 'aimed_dep_time',
         'expected_dep_time', 'month', 'year', 'date'
       ]
df = df[cols] 

df.rename(columns={'Order': 'order_of_stops'}, inplace=True)


df.head()


Unnamed: 0,recorded_at_time,DirectionRef,OriginRef,DestinationRef,VehicleRef,OperatorRef,StopPointName,ArrivalStatus,order_of_stops,aimed_dep_time,expected_dep_time,month,year,date
0,2024-10-22 09:02:50,ToGardermoen,DRM,GAR,3707,FLT:Operator:507,Drammen,arrived,1,05:02:00,05:02:00,October,2024.0,2024-10-23
1,2024-10-22 09:02:50,ToGardermoen,DRM,GAR,3707,FLT:Operator:507,Asker,arrived,2,05:15:00,05:15:00,October,2024.0,2024-10-23
2,2024-10-22 09:02:50,ToGardermoen,DRM,GAR,3707,FLT:Operator:507,Sandvika,arrived,3,05:21:00,05:21:00,October,2024.0,2024-10-23
3,2024-10-22 09:02:50,ToGardermoen,DRM,GAR,3707,FLT:Operator:507,Lysaker,arrived,4,05:27:00,05:27:00,October,2024.0,2024-10-23
4,2024-10-22 09:02:50,ToGardermoen,DRM,GAR,3707,FLT:Operator:507,Skøyen,arrived,5,05:30:00,05:30:00,October,2024.0,2024-10-23
