# Assignment 2
Based on Vienna's GTFS data (`data/GTFS`), find the average headway of U1 at Stephansplatz (to Wien Leopoldau) from 7:00 to 9:00 on a typical Monday in October, 2024.

Hint: the records meeting the above conditions have:
* agency_id == 4 (Wiener Linien GmbH & Co KG)
* route_short_name == 'U1'
* service_id == 'T5#9' (weekdays from 20240819 to 20241124)
* direction_id == 0 or trip_headsign == 'Wien Leopoldau'
* stop_id == 'at:49:1320:0:6' (Stephansplatz)

In [None]:
# !rm -rf data && mkdir data

In [None]:
# ### download the data.zip file from: https://tuwienacat-my.sharepoint.com/:f:/g/personal/bingyu_zhao_tuwien_ac_at/EkhK2y95nU9Pu7jBNSX6zsEBuMzb1rslbH7gBzFF0UsmvQ?e=Tf5iA8
# ### upload it to the data folder
# !unzip data/data.zip -d data

In [1]:
### import libraries data analysis
import numpy as np
import pandas as pd

### import libraries for plotting
import matplotlib.pyplot as plt

### other utilities
import glob

### Step 1. Familiarise yourself with the GTFS files

In [2]:
### enumerate every file that ships with the GTFS feed
gtfs_file_pattern = 'data/GTFS/*'
glob.glob(gtfs_file_pattern)

['data/GTFS/agency.txt',
 'data/GTFS/calendar_dates.txt',
 'data/GTFS/stop_times.txt',
 'data/GTFS/shapes.txt',
 'data/GTFS/trips.txt',
 'data/GTFS/stops.txt',
 'data/GTFS/calendar.txt',
 'data/GTFS/routes.txt']

In [5]:
### peek into one of the GTFS tables, here 'agency.txt'
### (the remaining files can be inspected in the same way)
agency_path = 'data/GTFS/agency.txt'
gtfs_agency = pd.read_csv(agency_path)
gtfs_agency.head()

Unnamed: 0,agency_id,agency_name,agency_url,agency_timezone,agency_lang,agency_fare_url
0,4,Wiener Linien GmbH & Co KG,https://www.wienerlinien.at,Europe/Vienna,DE,https://shop.wienmobil.at/products
1,3,Wiener Lokalbahnen GmbH,https://www.wlb.at,Europe/Vienna,DE,


In [12]:
### load the service calendar (operating weekdays and validity period per service_id)
calendar_path = 'data/GTFS/calendar.txt'
gtfs_calendar = pd.read_csv(calendar_path)
gtfs_calendar.head()

Unnamed: 0,service_id,monday,tuesday,wednesday,thursday,friday,saturday,sunday,start_date,end_date
0,T0,1,1,1,1,1,0,0,20231210,20240329
1,T0#1,1,1,1,1,1,0,0,20240330,20241214
2,T0#10,1,1,1,1,1,0,0,20240902,20241214
3,T0#100,1,1,1,1,1,0,0,20231216,20240324
4,T0#101,1,1,1,1,1,0,0,20240402,20241129


### Step 2. Find all records
Find all records of U1 trains towards Wien Leopoldau that run on weekdays and stop at Stephansplatz.

In [6]:
### list all routes named "U1" that are operated by Wiener Linien (agency_id == 4)
gtfs_routes = pd.read_csv('data/GTFS/routes.txt')

# Apply both route-level conditions from the assignment hint, so that a "U1"
# belonging to a different agency can never slip into the selection.
is_u1 = gtfs_routes['route_short_name'] == 'U1'
is_wiener_linien = gtfs_routes['agency_id'] == 4
routes_u1 = gtfs_routes[is_u1 & is_wiener_linien]
routes_u1.head()

### there seem to be multiple routes with the name "U1". Which row (route_id) should we use?

Unnamed: 0,route_id,agency_id,route_short_name,route_long_name,route_type,route_color,route_text_color
15,21-U1-j24-1,4,U1,Oberlaa - Leopoldau,1,E3000F,FFFFFF
16,21-U1-j24-10,4,U1,Oberlaa - Leopoldau,1,E3000F,FFFFFF
17,21-U1-j24-2,4,U1,Oberlaa - Leopoldau Baustellenfahrplan gültig ...,1,E3000F,FFFFFF
18,21-U1-j24-24,4,U1,Oberlaa - Leopoldau,1,E3000F,FFFFFF
19,21-U1-j24-3,4,U1,Oberlaa - Leopoldau,1,E3000F,FFFFFF


In [7]:
### load the complete trips table
trips = pd.read_csv('data/GTFS/trips.txt')

### keep only the trips that belong to one of the "U1" routes
u1_route_ids = routes_u1['route_id']
trips_u1 = trips[trips['route_id'].isin(u1_route_ids)]

### narrow down to trips that (1) run on weekdays (service 'T5#9'); and (2) head to Wien Leopoldau
on_weekday_service = trips_u1['service_id'].isin(['T5#9'])
to_leopoldau = trips_u1['trip_headsign'] == 'Wien Leopoldau'
trips_u1_mon = trips_u1.copy()
trips_u1_mon = trips_u1_mon[on_weekday_service & to_leopoldau]
display(trips_u1_mon.head())

Unnamed: 0,route_id,service_id,trip_id,shape_id,trip_headsign,direction_id,block_id
12308,21-U1-j24-10,T5#9,1.T5.21-U1-j24-10.1.H,21-U1-j24-10.1.H,Wien Leopoldau,0,
12312,21-U1-j24-10,T5#9,10.T5.21-U1-j24-10.2.H,21-U1-j24-10.2.H,Wien Leopoldau,0,
12316,21-U1-j24-10,T5#9,100.T5.21-U1-j24-10.3.H,21-U1-j24-10.3.H,Wien Leopoldau,0,
12320,21-U1-j24-10,T5#9,101.T5.21-U1-j24-10.1.H,21-U1-j24-10.1.H,Wien Leopoldau,0,
12324,21-U1-j24-10,T5#9,102.T5.21-U1-j24-10.3.H,21-U1-j24-10.3.H,Wien Leopoldau,0,


In [8]:
### load the full stop_times table
stop_times = pd.read_csv('data/GTFS/stop_times.txt')

### keep the stop times of the weekday U1 trips heading to Wien Leopoldau
selected_trip_ids = trips_u1_mon['trip_id']
stop_times_u1_mon = stop_times[stop_times['trip_id'].isin(selected_trip_ids)]

### of those, keep only the calls at Stephansplatz, ordered by arrival time
### (the sort is applied to the 'arrival_time' column as it was read from the file)
at_stephansplatz = stop_times_u1_mon['stop_id'] == 'at:49:1320:0:6'
stop_times_u1_mon_stephans = stop_times_u1_mon.copy()
stop_times_u1_mon_stephans = (
    stop_times_u1_mon_stephans[at_stephansplatz]
    .sort_values('arrival_time')
)
display(stop_times_u1_mon_stephans.head())


Unnamed: 0,trip_id,arrival_time,departure_time,stop_id,stop_sequence,pickup_type,drop_off_type,shape_dist_traveled
66318,1.T5.21-U1-j24-10.1.H,05:13:00,05:13:00,at:49:1320:0:6,11,0,0,8442.09
276546,2.T5.21-U1-j24-10.1.H,05:20:00,05:20:00,at:49:1320:0:6,11,0,0,8442.09
437308,3.T5.21-U1-j24-10.1.H,05:28:00,05:28:00,at:49:1320:0:6,11,0,0,8442.09
638426,4.T5.21-U1-j24-10.1.H,05:35:00,05:35:00,at:49:1320:0:6,11,0,0,8442.09
775425,5.T5.21-U1-j24-10.1.H,05:43:00,05:43:00,at:49:1320:0:6,11,0,0,8442.09


### Step 3. Calculate the average headway
Filter for trains arriving between 7:00 and 9:00

In [29]:
### restrict to trains whose arrival hour is 07 or 08, i.e. arrivals from 07:00:00 to 08:59:59
am_peak_hours = ['07', '08']

# assign() returns a new frame, so the Stephansplatz table itself is not modified
stop_times_u1_mon_stephans_am = stop_times_u1_mon_stephans.assign(
    arrival_hour=lambda df: df['arrival_time'].str.split(':').str[0]
)
in_am_peak = stop_times_u1_mon_stephans_am['arrival_hour'].isin(am_peak_hours)
stop_times_u1_mon_stephans_am = stop_times_u1_mon_stephans_am[in_am_peak]
stop_times_u1_mon_stephans_am.head()

Unnamed: 0,trip_id,arrival_time,departure_time,stop_id,stop_sequence,pickup_type,drop_off_type,shape_dist_traveled,arrival_hour
373708,26.T5.21-U1-j24-10.2.H,07:00:00,07:00:00,at:49:1320:0:6,10,0,0,7342.0,7
389411,27.T5.21-U1-j24-10.1.H,07:03:00,07:03:00,at:49:1320:0:6,11,0,0,8442.09,7
421275,29.T5.21-U1-j24-10.3.H,07:05:00,07:05:00,at:49:1320:0:6,9,0,0,6317.53,7
405362,28.T5.21-U1-j24-10.1.H,07:08:00,07:08:00,at:49:1320:0:6,11,0,0,8442.09,7
462461,31.T5.21-U1-j24-10.3.H,07:10:00,07:10:00,at:49:1320:0:6,9,0,0,6317.53,7


In [32]:
### what is the average headway in minutes?
# Parse the 'HH:MM:SS' arrival strings straight into timedeltas. The
# 'arrival_time' column itself is left untouched, so this cell produces the
# same result every time it is re-run. astype(str) additionally copes with
# datetime.time values left behind by an earlier version of this cell.
# assign() builds a new frame instead of writing into a filtered one.
stop_times_u1_mon_stephans_am = stop_times_u1_mon_stephans_am.assign(
    arrival_time_delta=lambda df: pd.to_timedelta(df['arrival_time'].astype(str))
).sort_values(by='arrival_time_delta')

# Headway = time gap to the previous train, in minutes (NaN for the first train)
stop_times_u1_mon_stephans_am = stop_times_u1_mon_stephans_am.assign(
    headway=lambda df: df['arrival_time_delta'].diff().dt.total_seconds() / 60
)

# Calculate the average headway (mean() skips the NaN of the first train)
average_headway = stop_times_u1_mon_stephans_am['headway'].mean()

print(f'The average headway is {average_headway:.2f} minutes')

The average headway is 2.57 minutes
