In [1]:
import pandas as pd
import numpy as np
import os
import json

In [6]:
# Parse the raw feed dump into plain Python objects. Despite its name,
# df_json is the parsed JSON document (indexed by key further down), not a
# DataFrame.
# The encoding is passed explicitly so that parsing does not depend on the
# platform's default text encoding; JSON interchange files are UTF-8.
filename_json = 'gtfsr_connect_0207.json'
with open(filename_json, encoding='utf-8') as json_file:
    df_json = json.load(json_file)

In [11]:
# trips
# Flatten the list stored under 'Entity' into a table with one row per entity
# (nested keys become dotted column names).
trips = pd.json_normalize(df_json['Entity'])

# Keep only the entities whose Id does not begin with '#'.
keep_rows = [not entity_id.startswith('#') for entity_id in trips['Id']]
trips = trips.loc[keep_rows, :]
trips.head(3)

Unnamed: 0,Id,IsDeleted,TripUpdate.Trip.TripId,TripUpdate.Trip.RouteId,TripUpdate.Trip.StartTime,TripUpdate.Trip.StartDate,TripUpdate.Trip.ScheduleRelationship,TripUpdate.StopTimeUpdate
0,292595792,False,292595792,43235_2,21:07:00,20230702,Scheduled,"[{'StopId': '000009013927', 'Departure': {'Del..."
1,284276697,False,284276697,51652_3,13:10:00,20230702,Scheduled,"[{'StopId': '000009054797', 'Departure': {'Del..."
2,291600955,False,291600955,14205_3,13:38:00,20230702,Scheduled,"[{'StopId': '000000990042', 'Departure': {'Del..."


In [12]:
# stop times
# Columns of the resulting table, in display/export order. Fields that never
# occur in the data still get a (fully missing) column.
STOP_TIME_COLUMNS = ['StopId', 'StopSequence', 'ArrivalDelay', 'ArrivalTime',
                     'DepartureDelay', 'DepartureTime', 'ScheduleRelationship']


def _flatten_stop_time_update(trip_id, stop_time_update):
    """Turn one stop-time-update dict into a flat record for a single table row.

    The nested 'Arrival' / 'Departure' dicts are expanded into prefixed keys
    (the 'Delay' entry of 'Departure' becomes 'DepartureDelay', and so on);
    every other field is copied unchanged. The id of the owning trip is stored
    under 'TripId'.

    Parameters:
        trip_id: id of the trip the update belongs to.
        stop_time_update: dict describing one stop time update of that trip.

    Returns:
        dict mapping flat column names to values.
    """
    record = {}
    for field, value in stop_time_update.items():
        if field in ('Arrival', 'Departure') and isinstance(value, dict):
            for field2, value2 in value.items():
                record[f'{field}{field2}'] = value2
        else:
            record[field] = value
    # Set last so the trip id always wins over a same-named key in the update.
    record['TripId'] = trip_id
    return record


# Build ONE ROW PER STOP TIME UPDATE. Writing every update of a trip into a
# single row addressed by trip id would make later updates overwrite earlier
# ones, leaving only a blend of the updates per trip.
# Cells that do not hold a list (e.g. a missing value) are skipped, so trips
# without any stop time update contribute no rows.
stop_time_records = [
    _flatten_stop_time_update(trip_id, stop_time_update)
    for trip_id, stop_time_updates in zip(trips['TripUpdate.Trip.TripId'],
                                          trips['TripUpdate.StopTimeUpdate'])
    if isinstance(stop_time_updates, list)
    for stop_time_update in stop_time_updates
]

# Create the frame in one go; TripId stays the (now non-unique) index so the
# table keeps its previous layout.
stop_times = pd.DataFrame(
    stop_time_records, columns=['TripId'] + STOP_TIME_COLUMNS
).set_index('TripId')

# The nested update lists now live in stop_times, so remove them from trips.
trips = trips.drop(columns='TripUpdate.StopTimeUpdate')
stop_times.head(3)

Unnamed: 0_level_0,StopId,StopSequence,ArrivalDelay,ArrivalTime,DepartureDelay,DepartureTime,ScheduleRelationship
TripId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
292595792,9013927,,,,0.0,,Scheduled
284276697,9054797,,,,0.0,,Scheduled
291600955,990042,,,,0.0,,Scheduled


In [13]:
# choose better fitting column names
# Rename through an explicit old -> new mapping instead of assigning a list by
# position, so a different column order in the normalised feed cannot silently
# attach the wrong label to a column. 'IsDeleted' keeps its name.
trips = trips.rename(columns={
    'Id': 'EntityId',
    'TripUpdate.Trip.TripId': 'TripId',
    'TripUpdate.Trip.RouteId': 'RouteId',
    'TripUpdate.Trip.StartTime': 'StartTime',
    'TripUpdate.Trip.StartDate': 'StartDate',
    'TripUpdate.Trip.ScheduleRelationship': 'ScheduleRelationship',
})

In [14]:
# data cleaning 1

# RouteId values contain a '_' followed by a suffix; keep only the part in
# front of the first '_' (per the original note, the '_' format does not match
# the ids used in the target data)
trips['RouteId'] = trips['RouteId'].map(lambda route_id: route_id.split('_')[0])

# IsDeleted and ScheduleRelationship are noted as constant in this data
# (always False / always 'Scheduled'), so both columns are removed
trips = trips.drop(columns=['IsDeleted', 'ScheduleRelationship'])

trips.head(3)

Unnamed: 0,EntityId,TripId,RouteId,StartTime,StartDate
0,292595792,292595792,43235,21:07:00,20230702
1,284276697,284276697,51652,13:10:00,20230702
2,291600955,291600955,14205,13:38:00,20230702


In [15]:
# data cleaning 2

# remove preceding 0s in column StopId by converting the id strings to integers.
# The nullable 'Int64' dtype is used instead of plain int so that a row without
# a StopId stays missing (<NA>) instead of aborting the conversion, and so the
# integer width is 64 bit on every platform. Non-numeric ids still raise.
stop_times['StopId'] = pd.to_numeric(stop_times['StopId']).astype('Int64')
stop_times.head(3)

Unnamed: 0_level_0,StopId,StopSequence,ArrivalDelay,ArrivalTime,DepartureDelay,DepartureTime,ScheduleRelationship
TripId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
292595792,9013927,,,,0.0,,Scheduled
284276697,9054797,,,,0.0,,Scheduled
291600955,990042,,,,0.0,,Scheduled


In [16]:
# Write both tables to CSV. The index of trips is left out of its file, while
# the index of stop_times is written as a column of its file.
exports = (
    (trips, 'actualdata_trips.csv', False),
    (stop_times, 'actualdata_stop_times.csv', True),
)
for frame, csv_path, write_index in exports:
    frame.to_csv(csv_path, index=write_index)