In [135]:
import pandas as pd
import numpy as np
import os
import json
import woodwork as ww

In [136]:
filename_json = 'gtfsr_connect.json'
with open(filename_json) as json_file:
    df_json = json.load(json_file)

In [137]:
# trips
trips = pd.json_normalize(df_json['Entity'])
trips = trips.loc[[not x.startswith('#') for x in trips['Id']],:]
trips.head(3)

Unnamed: 0,Id,IsDeleted,TripUpdate.Trip.TripId,TripUpdate.Trip.RouteId,TripUpdate.Trip.StartTime,TripUpdate.Trip.StartDate,TripUpdate.Trip.ScheduleRelationship,TripUpdate.StopTimeUpdate
0,254163638,False,254163638,43975_3,19:44:00,20230508,Scheduled,"[{'StopId': '000009057862', 'Departure': {'Del..."
1,282351984,False,282351984,43245_2,17:12:00,20230509,Scheduled,"[{'StopId': '000009013478', 'Departure': {'Del..."
2,282385362,False,282385362,43195_2,20:03:00,20230508,Scheduled,"[{'StopId': '000009303000', 'Departure': {'Del..."


In [138]:
# stop times
stop_times = pd.DataFrame(index=trips['TripUpdate.Trip.TripId'].rename('TripId'))
for trip_id, stop_time_updates in zip(trips['TripUpdate.Trip.TripId'], trips['TripUpdate.StopTimeUpdate']):
    for stop_time_update in stop_time_updates:
        for field, value in stop_time_update.items():
            if field in ['Arrival', 'Departure']:
                for field2, value2 in value.items():
                    stop_times.loc[trip_id, f'{field}{field2}'] = value2
            else:
                stop_times.loc[trip_id, field] = value
stop_times = stop_times[['StopId', 'StopSequence', 'ArrivalDelay', 'ArrivalTime',
                         'DepartureDelay', 'DepartureTime', 'ScheduleRelationship']]
trips.drop('TripUpdate.StopTimeUpdate', axis=1, inplace=True)
stop_times.head(3)

Unnamed: 0_level_0,StopId,StopSequence,ArrivalDelay,ArrivalTime,DepartureDelay,DepartureTime,ScheduleRelationship
TripId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
254163638,9057862,,,,0.0,,Scheduled
282351984,9013478,,,,0.0,,Scheduled
282385362,9058008,5.0,-60.0,,0.0,,Scheduled


In [139]:
# choose better fitting column names
trips.columns = ['EntityId', 'IsDeleted', 'TripId', 'RouteId', 'StartTime', 'StartDate', 'ScheduleRelationship']

# RouteId is formatted using '_' in contradiction with target data stop ids, i.e. values need to be adapted
trips['RouteId'] = [x.split('_')[0] for x in trips['RouteId']]

# IsDeleted is always False and ScheduleRelationship is always 'Scheduled', i.e. columns are redundant
trips.drop(['IsDeleted', 'ScheduleRelationship'], axis=1, inplace=True)

trips.head(3)

Unnamed: 0,EntityId,TripId,RouteId,StartTime,StartDate
0,254163638,254163638,43975,19:44:00,20230508
1,282351984,282351984,43245,17:12:00,20230509
2,282385362,282385362,43195,20:03:00,20230508


In [140]:
# remove preceding 0s in column StopId
stop_times['StopId'] = stop_times['StopId'].astype(int)
stop_times.head(3)

Unnamed: 0_level_0,StopId,StopSequence,ArrivalDelay,ArrivalTime,DepartureDelay,DepartureTime,ScheduleRelationship
TripId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
254163638,9057862,,,,0.0,,Scheduled
282351984,9013478,,,,0.0,,Scheduled
282385362,9058008,5.0,-60.0,,0.0,,Scheduled


In [141]:
trips.to_csv('actualdata_trips.csv', index=False)
stop_times.to_csv('actualdata_stop_times.csv', index=True)

In [142]:
# analyze schema and data types (data profiling)
trips.ww.init()
trips.ww

Unnamed: 0_level_0,Physical Type,Logical Type,Semantic Tag(s)
Column,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
EntityId,int64,Integer,['numeric']
TripId,int64,Integer,['numeric']
RouteId,category,PostalCode,['category']
StartTime,datetime64[ns],Datetime,[]
StartDate,int64,Integer,['numeric']


In [143]:
stop_times.ww.init()
stop_times.ww

Unnamed: 0_level_0,Physical Type,Logical Type,Semantic Tag(s)
Column,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
StopId,int64,Integer,['numeric']
StopSequence,Int64,IntegerNullable,['numeric']
ArrivalDelay,Int64,IntegerNullable,['numeric']
ArrivalTime,Int64,IntegerNullable,['numeric']
DepartureDelay,Int64,IntegerNullable,['numeric']
DepartureTime,Int64,IntegerNullable,['numeric']
ScheduleRelationship,category,Categorical,['category']
