In [18]:
import numpy as np
import pandas as pd
import pyproj
import data.constants as c
import plotly.express as px
import geopandas as gpd
from shapely.geometry import Point

### Data Processing

In [19]:
import os
import platform

def get_data_folder_path():
    # Get the current operating system
    os_type = platform.system()
    user_name = os.getlogin()

    # Define data folder paths for different systems
    if os_type == 'Windows' and user_name == 'Alice':
        data_folder_path = f"C:\\Users\\{user_name}\\Documents\\Data"
    elif os_type == 'Linux' and user_name == 'salathem':
        data_folder_path = '/cluster/home/salathem/Programming/data/'
    elif os_type == 'Darwin':
        data_folder_path = '/Users/Marco/Library/CloudStorage/OneDrive-Persönlich/ETHZ/Agent Based Modeling/data/'
    else:
        raise Exception("Unsupported system configuration")

    return data_folder_path
data_path = get_data_folder_path()

In [20]:
def execute(path):
    data_path = path

    df_mz_trips = pd.read_csv("%s/microzensus/wege.csv" % data_path, encoding = "latin1")
    df_mz_stages = pd.read_csv("%s/microzensus/etappen.csv" % data_path, encoding = "latin1")

    df_mz_trips = df_mz_trips[[
        "HHNR", "WEGNR", "f51100", "f51400", "wzweck1", "wzweck2", "wmittel",
        "S_X_CH1903", "S_Y_CH1903", "Z_X_CH1903", "Z_Y_CH1903", "W_X_CH1903", "W_Y_CH1903",
        "w_rdist", 'dauer2'
    ]]

    df_mz_stages = df_mz_stages[[
        "HHNR", "WEGNR", "ETNR", "f51300"
    ]]

    # First, adjust the modes
    df_mz_trips.loc[df_mz_trips["wmittel"] == -99, "mode"] = "unknown" # Pseudo stage
    df_mz_trips.loc[df_mz_trips["wmittel"] == 1, "mode"] = "pt" # Plane
    df_mz_trips.loc[df_mz_trips["wmittel"] == 2, "mode"] = "pt" # Train
    df_mz_trips.loc[df_mz_trips["wmittel"] == 3, "mode"] = "pt" # Postauto
    df_mz_trips.loc[df_mz_trips["wmittel"] == 4, "mode"] = "pt" # Ship
    df_mz_trips.loc[df_mz_trips["wmittel"] == 5, "mode"] = "pt" # Tram
    df_mz_trips.loc[df_mz_trips["wmittel"] == 6, "mode"] = "pt" # Bus
    df_mz_trips.loc[df_mz_trips["wmittel"] == 7, "mode"] = "pt" # other PT
    df_mz_trips.loc[df_mz_trips["wmittel"] == 8, "mode"] = "pt" # Reisecar -> I think this is a coach in Swiss German?
    df_mz_trips.loc[df_mz_trips["wmittel"] == 9, "mode"] = "car" # Car
    df_mz_trips.loc[df_mz_trips["wmittel"] == 10, "mode"] = "car" # Truck
    df_mz_trips.loc[df_mz_trips["wmittel"] == 11, "mode"] = "pt" # Taxi
    df_mz_trips.loc[df_mz_trips["wmittel"] == 12, "mode"] = "car" # Motorbike
    df_mz_trips.loc[df_mz_trips["wmittel"] == 13, "mode"] = "car" # Mofa
    df_mz_trips.loc[df_mz_trips["wmittel"] == 14, "mode"] = "bike" # Biciycle / E-bike
    df_mz_trips.loc[df_mz_trips["wmittel"] == 15, "mode"] = "walk" # Walking
    df_mz_trips.loc[df_mz_trips["wmittel"] == 16, "mode"] = "car" # "Machines similar to a vehicle"
    df_mz_trips.loc[df_mz_trips["wmittel"] == 17, "mode"] = "unknown" # Other / don't know

    df_mz_trips["mode_detailed"] = df_mz_trips["mode"]
    df_mz_trips.loc[df_mz_trips["wmittel"] == 1, "mode_detailed"] = "plane"
    df_mz_trips.loc[df_mz_trips["wmittel"] == 11, "mode_detailed"] = "taxi"

    # Find passenger trips
    df_mz_stages["is_car_passenger"] = df_mz_stages["f51300"] == 8
    df_passengers = df_mz_stages[["HHNR", "WEGNR", "is_car_passenger"]].groupby(["HHNR", "WEGNR"]).sum().reset_index()
    df_mz_trips = pd.merge(df_mz_trips, df_passengers, on = ["HHNR", "WEGNR"], how = "left")
    df_mz_trips.loc[df_mz_trips["is_car_passenger"] > 0, "mode_detailed"] = "car_passenger"
    df_mz_trips.loc[df_mz_trips["is_car_passenger"] > 0, "mode"] = "car_passenger"
    del df_mz_trips["is_car_passenger"]

    # Second, adjust the purposes
    df_mz_trips.loc[df_mz_trips["wzweck1"] == -99, "purpose"] = "unknown" # Pseudo stage
    df_mz_trips.loc[df_mz_trips["wzweck1"] == -98, "purpose"] = "unknown" # No answer
    df_mz_trips.loc[df_mz_trips["wzweck1"] == -97, "purpose"] = "unknown" # Don't know
    df_mz_trips.loc[df_mz_trips["wzweck1"] == 1, "purpose"] = "interaction" # Transfer, change of mode, park car
    df_mz_trips.loc[df_mz_trips["wzweck1"] == 2, "purpose"] = "work" # Work
    df_mz_trips.loc[df_mz_trips["wzweck1"] == 3, "purpose"] = "education" # Education
    df_mz_trips.loc[df_mz_trips["wzweck1"] == 4, "purpose"] = "shop" # Shopping
    df_mz_trips.loc[df_mz_trips["wzweck1"] == 5, "purpose"] = "other" # Chores, use of public services
    df_mz_trips.loc[df_mz_trips["wzweck1"] == 6, "purpose"] = "work" # Business activity
    df_mz_trips.loc[df_mz_trips["wzweck1"] == 7, "purpose"] = "work" # Business trip
    df_mz_trips.loc[df_mz_trips["wzweck1"] == 8, "purpose"] = "leisure" # Leisure
    df_mz_trips.loc[df_mz_trips["wzweck1"] == 9, "purpose"] = "other" # Bring children
    df_mz_trips.loc[df_mz_trips["wzweck1"] == 10, "purpose"] = "other" # Bring others (disabled, ...)
    df_mz_trips.loc[df_mz_trips["wzweck1"] == 11, "purpose"] = "home" # Return home
    df_mz_trips.loc[df_mz_trips["wzweck1"] == 12, "purpose"] = "unknown" # Other
    df_mz_trips.loc[df_mz_trips["wzweck1"] == 13, "purpose"] = "border" # Going out of country

    # Adjust trips back home
    df_mz_trips.loc[df_mz_trips["wzweck2"] > 1, "purpose"] = "home"

    # Adjust times
    df_mz_trips.loc[:, "departure_time"] = df_mz_trips["f51100"] * 60
    df_mz_trips.loc[:, "arrival_time"] = df_mz_trips["f51400"] * 60

    # Adjust id
    df_mz_trips.loc[:, "person_id"] = df_mz_trips["HHNR"]
    df_mz_trips.loc[:, "trip_id"] = df_mz_trips["WEGNR"]

    # Adjust coordinates
    for mz_attribute, df_attribute in [("Z", "destination"), ("S", "origin"), ("W", "home")]:
        coords = df_mz_trips[["%s_X_CH1903" % mz_attribute, "%s_Y_CH1903" % mz_attribute]].values
        transformer = pyproj.Transformer.from_crs(c.CH1903, c.CH1903_PLUS)
        x, y = transformer.transform(coords[:, 0], coords[:, 1])
        df_mz_trips.loc[:, "%s_x" % df_attribute] = x
        df_mz_trips.loc[:, "%s_y" % df_attribute] = y

    # Add crowfly distance
    df_mz_trips.loc[:, "crowfly_distance"] = np.sqrt(
        (df_mz_trips["origin_x"] - df_mz_trips["destination_x"])**2 +
        (df_mz_trips["origin_y"] - df_mz_trips["destination_y"])**2)

    # Add activity durations by joining the trips with themselves
    df_mz_trips.loc[:, "previous_trip_id"] = df_mz_trips["trip_id"] -1

    df_durations = pd.merge(
        df_mz_trips[["person_id", "trip_id", "departure_time"]],
        df_mz_trips[["person_id", "previous_trip_id", "arrival_time"]],
        left_on = ["person_id", "trip_id"], right_on = ["person_id", "previous_trip_id"])

    df_durations.loc[:, "activity_duration"] = df_durations["arrival_time"] - df_durations["departure_time"]

    df_mz_trips = pd.merge(
        df_mz_trips, df_durations[["person_id", "trip_id", "activity_duration"]],
        on = ["person_id", "trip_id"], how = "left"
    )

    # Filter persons for which we do not have sufficient information
    unknown_ids = set(df_mz_trips[
        (df_mz_trips["mode"] == "unknown") | (df_mz_trips["purpose"] == "unknown")
    ]["person_id"])

    print("  Removed %d persons with trips with unknown mode or unknown purpose" % len(unknown_ids))
    df_mz_trips = df_mz_trips[~df_mz_trips["person_id"].isin(unknown_ids)]

    # Filter persons which do not start or end with "home"
    df_end = df_mz_trips[["person_id", "trip_id", "purpose"]].sort_values("trip_id", ascending = False).drop_duplicates("person_id")
    df_end = df_end[df_end["purpose"] != "home"]

    before_length = len(np.unique(df_mz_trips["person_id"]))
    df_mz_trips = df_mz_trips[~df_mz_trips["person_id"].isin(df_end["person_id"])]
    after_length = len(np.unique(df_mz_trips["person_id"]))
    print("  Removed %d persons with trips not ending with 'home'" % (before_length - after_length,))

    df_start = df_mz_trips[["person_id", "trip_id", "origin_x", "origin_y", "home_x", "home_y"]]
    df_start = df_start[
        (df_start["trip_id"] == 1) & ((df_start["origin_x"] != df_start["home_x"]) |
        (df_start["origin_y"] != df_start["home_y"]))
    ]

    before_length = len(np.unique(df_mz_trips["person_id"]))
    df_mz_trips = df_mz_trips[~df_mz_trips["person_id"].isin(df_start["person_id"])]
    after_length = len(np.unique(df_mz_trips["person_id"]))
    print("  Removed %d persons with trips not starting at home location" % (before_length - after_length,))

    # Parking cost
    df_mz_stages = pd.read_csv("%s/microzensus/etappen.csv" % data_path, encoding = "latin1")

    df_cost = pd.DataFrame(df_mz_stages[["HHNR", "WEGNR", "f51330"]], copy = True)
    df_cost.columns = ["person_id", "trip_id", "parking_cost"]
    df_cost["parking_cost"] = np.maximum(0, df_cost["parking_cost"])
    df_cost = df_cost.groupby(["person_id", "trip_id"]).sum().reset_index()

    df_mz_trips = pd.merge(df_mz_trips, df_cost, on = ["person_id", "trip_id"], how = "left")
    assert(not np.any(np.isnan(df_mz_trips["parking_cost"])))

    # Network distance
    df_mz_trips["network_distance"] = df_mz_trips["w_rdist"] * 1000.0

    return df_mz_trips

In [21]:
trips = execute(data_path)

  Removed 1217 persons with trips with unknown mode or unknown purpose
  Removed 1136 persons with trips not ending with 'home'
  Removed 4161 persons with trips not starting at home location


In [23]:
# Load geographic data from a shapefile
shapefile_path = data_path + '/scenarios/Zurich/ScenarioBoundary/zurich_city_5km.shp'  # please replace with your shapefile path
gdf = gpd.read_file(shapefile_path)

zurich_polygon = gdf[gdf['scenario'] == 'zurich_city'].iloc[0]['geometry']

# Create Point geometries for origin and destination
trips['origin_point'] = trips.apply(lambda row: Point(row['origin_x'], row['origin_y']), axis=1)
trips['destination_point'] = trips.apply(lambda row: Point(row['destination_x'], row['destination_y']), axis=1)

# Filter trips where both origin and destination are within the Zurich city polygon
filtered_trips = trips[
    trips['origin_point'].apply(lambda point: point.within(zurich_polygon)) &
    trips['destination_point'].apply(lambda point: point.within(zurich_polygon))
]


In [24]:
# Function to create activity chains
def create_activity_chain(group):
    chain = '-'.join(['h'] + [purpose[0] for purpose in group['purpose'].tolist()])  # Add 'H' at the start of each chain
    return pd.Series({'activity_chain': chain})

# Create activity chains
df_activity_chains =  filtered_trips.groupby(['person_id']).apply(create_activity_chain).reset_index()


In [25]:
filtered_trips.to_csv(data_path + '/microzensus/trips.csv')
df_mz_trips = filtered_trips

### Display Data

In [26]:
# Capitalize and remove underscores from mode names
df_mz_trips['mode'] = df_mz_trips['mode'].str.replace('_', ' ').str.upper()

# Calculate total counts for each mode
mode_counts = df_mz_trips['mode'].value_counts().reset_index()
mode_counts.columns = ['Mode', 'Count']

# Plot total counts
fig1 = px.bar(mode_counts, x='Mode', y='Count', title='Mode Share Distribution - Total Counts',
              labels={'Count': 'Total Count', 'Mode': 'Mode of Transportation'})
fig1.update_layout(width=600, height=600)
fig1.show()

# Calculate percentage distribution for each mode
mode_counts['Percentage'] = (mode_counts['Count'] / mode_counts['Count'].sum()) * 100

# Plot percentage distribution
fig2 = px.bar(mode_counts, x='Mode', y='Percentage', title='Mode Share Distribution - Percentage',
              labels={'Percentage': 'Percentage (%)', 'Mode': 'Mode of Transportation'})
fig2.update_layout(width=600, height=600)
fig2.show()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [27]:
# Convert seconds to datetime and resample times to 15-minute bins
df_mz_trips['departure_time'] = pd.to_datetime(df_mz_trips['departure_time'], unit='s').dt.floor('30T').dt.time
df_mz_trips['arrival_time'] = pd.to_datetime(df_mz_trips['arrival_time'], unit='s').dt.floor('30T').dt.time

# Count occurrences in each 15-minute bin
departure_counts = df_mz_trips.groupby('departure_time').size().reset_index(name='Count')
departure_counts['Type'] = 'Departures'
departure_counts = departure_counts.rename(columns={'departure_time': 'Time'})

arrival_counts = df_mz_trips.groupby('arrival_time').size().reset_index(name='Count')
arrival_counts['Type'] = 'Arrivals'
arrival_counts = arrival_counts.rename(columns={'arrival_time': 'Time'})

# Combine data
time_counts = pd.concat([departure_counts, arrival_counts], axis=0)

# Plot using Plotly Express
fig = px.bar(time_counts, x='Time', y='Count', color='Type',
             title='Departure and Arrival Times over a Day',
             labels={'Count': 'Count', 'Time': 'Time of Day'},
             barmode='group')

# Customize x-axis ticks and scale y-axis
fig.update_xaxes(type='category', tickangle=45, dtick=1)
fig.update_yaxes(range=[0, time_counts['Count'].max()])

# Show plot
fig.update_layout(width=1200, height=600)
fig.show()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [28]:
# Capitalize and remove underscores from purpose names
df_mz_trips['purpose'] = df_mz_trips['purpose'].str.replace('_', ' ').str.upper()

# Calculate total counts for each purpose
purpose_counts = df_mz_trips['purpose'].value_counts().reset_index()
purpose_counts.columns = ['Purpose', 'Count']

# Plot total counts
fig1 = px.bar(purpose_counts, x='Purpose', y='Count', title='Purpose Distribution - Total Counts',
              labels={'Count': 'Total Count', 'Purpose': 'Purpose'})
fig1.update_layout(width=600, height=600)
fig1.show()

# Calculate percentage distribution for each purpose
purpose_counts['Percentage'] = (purpose_counts['Count'] / purpose_counts['Count'].sum()) * 100

# Plot percentage distribution
fig2 = px.bar(purpose_counts, x='Purpose', y='Percentage', title='Purpose Distribution - Percentage',
              labels={'Percentage': 'Percentage (%)', 'Purpose': 'Purpose'})
fig2.update_layout(width=600, height=600)
fig2.show()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [29]:
print(df_activity_chains.activity_chain.nunique())
df_activity_chains

545


Unnamed: 0,person_id,activity_chain
0,100116,h-o-s-o-h-h
1,100391,h-s-h-h
2,100593,h-l-l
3,101196,h-l
4,101326,h-w-o-o-h
...,...,...
2383,499290,h-l
2384,499399,h-l-h-l-h-l-h
2385,499853,h-l-l-h
2386,499917,h-w-h


In [30]:
filtered_trips[['HHNR', 'WEGNR', 'purpose']]

Unnamed: 0,HHNR,WEGNR,purpose
50,100116,1,OTHER
51,100116,2,SHOP
52,100116,3,OTHER
53,100116,4,HOME
56,100116,7,HOME
...,...,...,...
170516,499853,3,HOME
170519,499917,1,WORK
170520,499917,2,HOME
170538,499996,1,LEISURE


In [31]:
filtered_trips[trips.person_id == 101196]


Boolean Series key will be reindexed to match DataFrame index.



Unnamed: 0,HHNR,WEGNR,f51100,f51400,wzweck1,wzweck2,wmittel,S_X_CH1903,S_Y_CH1903,Z_X_CH1903,...,origin_y,home_x,home_y,crowfly_distance,previous_trip_id,activity_duration,parking_cost,network_distance,origin_point,destination_point
520,101196,2,913,1004,8,1,5,682736,248230,681880,...,1248230.0,2634763.0,1171194.0,871.755547,1,19680.0,0.0,6130.0,POINT (2682736.899315401 1248229.838946302),POINT (2681880.900319892 1248064.84363314)


In [32]:
# Calculate total counts for each activity chain
chain_counts = df_activity_chains['activity_chain'].value_counts().reset_index()
chain_counts.columns = ['Activity Chain', 'Count']

# Plot total counts
fig = px.bar(chain_counts, x='Activity Chain', y='Count', title='Activity Chain Distribution - Total Counts',
             labels={'Count': 'Total Count', 'Activity Chain': 'Activity Chain'})
fig.update_layout(width=1600, height=800)
fig.show()