In [1]:
import pandas as pd
import numpy as np
import os

In [5]:
gtfs_folder = "gtfs_data"  # Folder with your GTFS zip or extracted files

routes = pd.read_csv(os.path.join(gtfs_folder, "routes.txt"), low_memory=False)
trips = pd.read_csv(os.path.join(gtfs_folder, "trips.txt"), low_memory=False)
stop_times = pd.read_csv(os.path.join(gtfs_folder, "stop_times.txt"), low_memory=False)
stops = pd.read_csv(os.path.join(gtfs_folder, "stops.txt"), low_memory=False)


In [7]:
gtfs_merged = stop_times.merge(trips, on='trip_id', how='left').merge(routes, on='route_id', how='left')

In [9]:
mode_mapping = {
    0: 'tram',       # Light rail
    1: 'subway',     # Metro
    2: 'rail',       # Train
    3: 'bus'         # Bus
}

# Replace numeric route types with mode names
gtfs_merged['mode'] = gtfs_merged['route_type'].map(mode_mapping)

# Drop rows with unknown modes
gtfs_merged = gtfs_merged.dropna(subset=['mode'])

# Count number of stops per mode
mode_counts = gtfs_merged.groupby(['stop_id', 'mode']).size().unstack(fill_value=0).reset_index()

In [11]:
stops_mode = pd.merge(stops[['stop_id', 'stop_lat', 'stop_lon']], mode_counts, on='stop_id', how='inner')

In [13]:
import geopandas as gpd
from shapely.geometry import Point

stops_mode['geometry'] = stops_mode.apply(lambda row: Point(row['stop_lon'], row['stop_lat']), axis=1)
stops_gdf = gpd.GeoDataFrame(stops_mode, geometry='geometry', crs='EPSG:4326')

In [15]:
# 1. Load SA2 shapefile (if not already loaded)
sa2_shapes = gpd.read_file("Assignment Resources/SA2_2021_AUST_SHP_GDA2020")  # adjust path if needed

# 2. Ensure both are in the same CRS
stops_gdf = stops_gdf.to_crs(sa2_shapes.crs)

# 3. Spatial join: assign each stop to its SA2 region
stops_with_sa2 = gpd.sjoin(stops_gdf, sa2_shapes[['SA2_NAME21', 'geometry']], how='inner', predicate='within')

In [21]:
routes['route_type'].value_counts()

route_type
712    8674
700    1143
204     155
205     107
714      59
106      25
4        24
2        17
900       6
401       1
Name: count, dtype: int64

In [23]:
routes[['route_id', 'route_type']].drop_duplicates().sort_values('route_type')

Unnamed: 0,route_id,route_type
2353,2-T1-W-sj2-1,2
2354,2-T2-sj2-1,2
2357,2-T5-sj2-1,2
2352,2-T1-N-sj2-1,2
2351,2-SCO-sj2-1,2
...,...,...
10128,78-NLR-sj2-1,900
10127,78-LX-sj2-1,900
10126,78-L4-sj2-1,900
10125,78-L3-sj2-1,900


In [25]:
route_type_map = {
    2: 'train',
    4: 'ferry',
    700: 'bus',
    712: 'bus',
    714: 'bus',
    204: 'metro',     # hypothetical
    205: 'light_rail',  # hypothetical
    106: 'tram',
    900: 'metro',
    # Add more mappings based on inspection
}

In [27]:
routes['mode'] = routes['route_type'].map(route_type_map)

In [29]:
routes[['route_id', 'route_type']].drop_duplicates().sort_values('route_type')

Unnamed: 0,route_id,route_type
2353,2-T1-W-sj2-1,2
2354,2-T2-sj2-1,2
2357,2-T5-sj2-1,2
2352,2-T1-N-sj2-1,2
2351,2-SCO-sj2-1,2
...,...,...
10128,78-NLR-sj2-1,900
10127,78-LX-sj2-1,900
10126,78-L4-sj2-1,900
10125,78-L3-sj2-1,900


In [31]:
route_type_mapping = {
    2: 'Train',
    700: 'Bus',
    701: 'Bus',
    702: 'Bus',
    703: 'Bus',
    704: 'Bus',
    705: 'Bus',
    706: 'Bus',
    707: 'Bus',
    708: 'Bus',
    709: 'Bus',
    710: 'Bus',
    711: 'Bus',
    712: 'Bus',
    713: 'Bus',
    714: 'Bus',
    900: 'Light Rail',  # Often used for Metro/Light Rail
    204: 'Ferry',
    205: 'Ferry',
    4: 'Ferry',  # Standard GTFS for Ferry
    106: 'Metro',
    401: 'Regional'  # Or custom category
}

In [33]:
routes['mode'] = routes['route_type'].map(route_type_mapping).fillna('Other')

In [37]:
import geopandas as gpd
from shapely.geometry import Point

# 1. Convert stop coordinates to geometry points
stops['geometry'] = stops.apply(lambda row: Point(row['stop_lon'], row['stop_lat']), axis=1)
stops_gdf = gpd.GeoDataFrame(stops, geometry='geometry', crs='EPSG:4326')

# 2. Load suburb boundaries (update path if needed)
abs_shapefile_path = "Assignment Resources/SA2_2021_AUST_SHP_GDA2020"
suburb_shapes = gpd.read_file(abs_shapefile_path)

# 3. Reproject suburb_shapes to WGS84 (EPSG:4326)
suburb_shapes_wgs84 = suburb_shapes.to_crs(epsg=4326)

# 4. Spatial join to assign SA2_NAME21 to each stop
stops_with_sa2 = gpd.sjoin(stops_gdf, suburb_shapes_wgs84[['SA2_NAME21', 'geometry']], how="left", predicate='within')

# 5. Check if merge was successful
stops_with_sa2[['stop_id', 'stop_name', 'SA2_NAME21']].head()

Unnamed: 0,stop_id,stop_name,SA2_NAME21
0,2000110,"Central Grand Concourse, Light Rail Trackwork",Sydney (South) - Haymarket
1,2000112,"Central Station, Forecourt, Coach Bay 5",Sydney (South) - Haymarket
2,2000115,"Central Station, Forecourt, Coach Bay 7",Sydney (South) - Haymarket
3,2000124,"Central Station, Forecourt, Coach Bay 9",Sydney (South) - Haymarket
4,2000132,"Central Station, Forecourt, Coach Bay 8",Sydney (South) - Haymarket


In [41]:
# Merge GTFS trips with mode info
trips_with_modes = trips.merge(routes[['route_id', 'mode']], on='route_id', how='left')
trip_stop_times = stop_times.merge(trips_with_modes[['trip_id', 'mode']], on='trip_id', how='left')

# Now merge with stops_with_sa2 instead of `stops`
stops_with_modes = trip_stop_times.merge(stops_with_sa2[['stop_id', 'SA2_NAME21']], on='stop_id', how='left')

# Drop NAs and continue
stops_with_modes_clean = stops_with_modes.dropna(subset=['SA2_NAME21', 'mode'])

pta_mode_counts = stops_with_modes_clean.groupby(['SA2_NAME21', 'mode']).size().reset_index(name='count')
pta_mode_wide = pta_mode_counts.pivot(index='SA2_NAME21', columns='mode', values='count').fillna(0).reset_index()

pta_mode_wide.head()

mode,SA2_NAME21,Bus,Ferry,Light Rail,Metro,Regional,Train
0,Acacia Gardens,1818.0,0.0,0.0,0.0,0.0,0.0
1,Acton,32.0,0.0,0.0,0.0,0.0,0.0
2,Adamstown - Kotara,10092.0,0.0,0.0,0.0,0.0,990.0
3,Adelaide,0.0,4.0,0.0,0.0,0.0,0.0
4,Ainslie,4.0,0.0,0.0,0.0,0.0,0.0


In [43]:
# Load the existing PTAI + IRSD data
pta_irsd_df = pd.read_csv("sydney_sa2_pta_geom.csv")

# Merge with the new mode-wise PTAI scores
pta_combined = pta_irsd_df.merge(pta_mode_wide, how='left', left_on='SA2_NAME_2011', right_on='SA2_NAME21')

# Check output
pta_combined.head()

Unnamed: 0,SA2_CODE21,SA2_NAME21_x,CHG_FLAG21,CHG_LBL21,SA3_CODE21,SA3_NAME21,SA4_CODE21,SA4_NAME21,GCC_CODE21,GCC_NAME21,...,IRSD_DECILE,pta_score,SA2_NAME21_y,SA2_NAME21,Bus,Ferry,Light Rail,Metro,Regional,Train
0,102011028,Avoca Beach - Copacabana,0,No change,10201,Gosford,102,Central Coast,1GSYD,Greater Sydney,...,9.0,6.5,Avoca Beach - Copacabana,Avoca Beach - Copacabana,3569.0,0.0,0.0,0.0,0.0,0.0
1,102011029,Box Head - MacMasters Beach,0,No change,10201,Gosford,102,Central Coast,1GSYD,Greater Sydney,...,8.0,0.866667,Box Head - MacMasters Beach,Box Head - MacMasters Beach,6312.0,23.0,0.0,0.0,0.0,0.0
2,102011030,Calga - Kulnura,0,No change,10201,Gosford,102,Central Coast,1GSYD,Greater Sydney,...,6.0,0.032012,Calga - Kulnura,Calga - Kulnura,1088.0,0.0,0.0,0.0,0.0,549.0
3,102011031,Erina - Green Point,0,No change,10201,Gosford,102,Central Coast,1GSYD,Greater Sydney,...,6.0,0.782609,Erina - Green Point,Erina - Green Point,18179.0,0.0,0.0,0.0,0.0,0.0
4,102011032,Gosford - Springfield,0,No change,10201,Gosford,102,Central Coast,1GSYD,Greater Sydney,...,4.0,3.625,Gosford - Springfield,Gosford - Springfield,20587.0,35.0,0.0,91.0,0.0,1394.0


In [45]:
pta_combined.to_csv("sydney_sa2_pta_geom_modes.csv", index=False)

In [47]:
pta_mode_by_sa2

Unnamed: 0,SA2_NAME21,pta_train
0,Adamstown - Kotara,990
1,Albion Park Rail,885
2,Arncliffe - Bardwell Valley,3717
3,Artarmon,5736
4,Ashfield - South,5461
...,...,...
198,Woonona - Bulli - Russell Vale,1328
199,Woy Woy - Blackwall,1230
200,Wyong,1312
201,Yagoona - Birrong,1022
