In [1]:
import pandas as pd
import geopandas as gpd
import acbm
from pam.read import load_travel_diary
from pam import write

from shapely import wkt, Point


This notebook shows how to save acbm output as matsim xml files using PAM. A number of preprocessing steps need to be done befehand
Ideally, some of these should be fixed upstream in the acbm code. Some may be the result of NTS data issues.

# Read the data

In [2]:
# path to the data
individuals = pd.read_csv(acbm.root_path / "data/processed/activities_pam/people.csv")
households = pd.read_csv(acbm.root_path / "data/processed/activities_pam/households.csv")
activities = pd.read_csv(acbm.root_path / "data/processed/activities_pam/activities.csv")
legs = pd.read_csv(acbm.root_path / "data/processed/activities_pam/legs.csv")
legs_geo = pd.read_parquet(acbm.root_path / "data/processed/activities_pam/legs_with_locations.parquet")

In [3]:
activities.head(5)

Unnamed: 0.1,Unnamed: 0,pid,hid,freq,activity,seq,start time,end time,duration,zone
0,0,199,89,,home,0.0,1900-01-01 00:00:00,1900-01-01 03:40:00,3:40:00,E00059031
1,1,199,89,,work,2.0,1900-01-01 03:55:00,1900-01-01 15:00:00,11:05:00,E00059010
2,2,199,89,,home,3.0,1900-01-01 15:25:00,1900-01-01 16:15:00,0:50:00,E00059031
3,3,199,89,,escort,4.0,1900-01-01 16:20:00,1900-01-01 16:30:00,0:10:00,E00187029
4,4,199,89,,home,5.0,1900-01-01 16:35:00,1900-01-01 17:25:00,0:50:00,E00059031


In [4]:
legs_geo.head(5)

Unnamed: 0,pid,hid,ozone,dzone,purp,origin activity,destination activity,mode,seq,tst,tet,duration,start_location_id,end_location_id,start_location_geometry_wkt,end_location_geometry_wkt
0,199,89,E00059031,E00059010,work,home,work,car,1.0,1900-01-01 03:40:00,1900-01-01 03:55:00,0:15:00,1735597748,396275870,POINT (-1.399263822377326 53.934588245650026),POINT (-1.3055139634314723 53.913818157845895)
1,199,89,E00059031,E00059031,home,work,home,car,2.0,1900-01-01 15:00:00,1900-01-01 15:25:00,0:25:00,396275870,1735597748,POINT (-1.3055139634314723 53.913818157845895),POINT (-1.399263822377326 53.934588245650026)
2,199,89,E00059031,E00187029,escort,home,escort,car,3.0,1900-01-01 16:15:00,1900-01-01 16:20:00,0:05:00,1735597748,406897034,POINT (-1.399263822377326 53.934588245650026),POINT (-1.549098144281633 53.79109058480764)
3,199,89,E00187029,E00059031,home,escort,home,car,4.0,1900-01-01 16:30:00,1900-01-01 16:35:00,0:05:00,406897034,1735597748,POINT (-1.549098144281633 53.79109058480764),POINT (-1.399263822377326 53.934588245650026)
4,199,89,E00059031,E00169811,escort,home,escort,car,5.0,1900-01-01 17:25:00,1900-01-01 17:30:00,0:05:00,1735597748,793245518,POINT (-1.399263822377326 53.934588245650026),POINT (-1.545446674787045 53.79345475501955)


# Clean the data

In [5]:
# We will be removing some rows in each planning operation. This function helps keep a 
# record of the number of rows in each table after each operation.

row_counts = []

# # Function to log row counts
def log_row_count(df, name, operation):
    row_counts.append((operation, name, len(df)))


### Record number of rows

In [6]:
log_row_count(individuals, "individuals", "0_initial")
log_row_count(households, "households", "0_initial")
log_row_count(activities, "activities", "0_initial")
log_row_count(legs, "legs", "0_initial")
log_row_count(legs_geo, "legs_geo", "0_initial")

In [7]:
row_counts

[('0_initial', 'individuals', 5560),
 ('0_initial', 'households', 3267),
 ('0_initial', 'activities', 21173),
 ('0_initial', 'legs', 15613),
 ('0_initial', 'legs_geo', 15613)]

## Remove people that don't exist across all datasets

When writing to matsim using pam, we get an error when a pid exists in one dataset but not in the other. We will remove these people from the datasets.

In [8]:
def filter_by_pid(individuals, activities, legs, legs_geo, households):
    """
    Filter the input DataFrames to include only include people (pids) that exist in all
    dfs 

    Parameters
    ----------
    individuals: pd.DataFrame
        Individuals DataFrame.
    activities: pd.DataFrame
        Activities DataFrame.
    legs: pd.DataFrame: 
        Legs DataFrame.
    legs_geo: pd.DataFrame
        Legs with geo DataFrame.
    households: pd.DataFrame
        Households DataFrame.

    Returns
    -------
    tuple
        A tuple containing the filtered DataFrames (individuals, activities, legs, legs_geo, households).
    """
    # Identify common pids
    common_pids = set(individuals["pid"]).intersection(activities["pid"]).intersection(legs["pid"]).intersection(legs_geo["pid"])

    # Filter Individual Level DataFrames
    individuals = individuals[individuals["pid"].isin(common_pids)]
    activities = activities[activities["pid"].isin(common_pids)]
    legs = legs[legs["pid"].isin(common_pids)]
    legs_geo = legs_geo[legs_geo["pid"].isin(common_pids)]

    # Filter Household Level DataFrame
    households = households[households["hid"].isin(individuals["hid"])]

    return individuals, activities, legs, legs_geo, households

# Apply
individuals, activities, legs, legs_geo, households = filter_by_pid(individuals, activities, legs, legs_geo, households)

In [9]:
log_row_count(individuals, "individuals", "1_filter_by_pid")
log_row_count(households, "households", "1_filter_by_pid")
log_row_count(activities, "activities", "1_filter_by_pid")
log_row_count(legs, "legs", "1_filter_by_pid")
log_row_count(legs_geo, "legs_geo", "1_filter_by_pid")

In [10]:
row_counts

[('0_initial', 'individuals', 5560),
 ('0_initial', 'households', 3267),
 ('0_initial', 'activities', 21173),
 ('0_initial', 'legs', 15613),
 ('0_initial', 'legs_geo', 15613),
 ('1_filter_by_pid', 'individuals', 5527),
 ('1_filter_by_pid', 'households', 3253),
 ('1_filter_by_pid', 'activities', 21140),
 ('1_filter_by_pid', 'legs', 15613),
 ('1_filter_by_pid', 'legs_geo', 15613)]

In [11]:
sorted(row_counts, key=lambda x: x[0])

[('0_initial', 'individuals', 5560),
 ('0_initial', 'households', 3267),
 ('0_initial', 'activities', 21173),
 ('0_initial', 'legs', 15613),
 ('0_initial', 'legs_geo', 15613),
 ('1_filter_by_pid', 'individuals', 5527),
 ('1_filter_by_pid', 'households', 3253),
 ('1_filter_by_pid', 'activities', 21140),
 ('1_filter_by_pid', 'legs', 15613),
 ('1_filter_by_pid', 'legs_geo', 15613)]

In [12]:
# all rows where start_location_geometry_wkt is null
legs_geo[legs_geo['start_location_geometry_wkt'].isnull()]

# all rows where end_location_geometry_wkt is null
#legs_geo[legs_geo['end_location_geometry_wkt'].isnull()]

Unnamed: 0,pid,hid,ozone,dzone,purp,origin activity,destination activity,mode,seq,tst,tet,duration,start_location_id,end_location_id,start_location_geometry_wkt,end_location_geometry_wkt
61,2599,1178,E00059033,E00059033,home,work,home,bike,2.0,1900-01-01 16:30:00,1900-01-01 16:42:00,0:12:00,,1686166180,,POINT (-1.404103278117469 53.932568458070904)
168,8604,3856,E00059018,E00169797,visit,home,visit,car,1.0,1900-01-01 09:00:00,1900-01-01 13:50:00,4:50:00,,5486065,,POINT (-1.5460523885771345 53.795387301592854)
245,14523,6256,E00058340,E00058340,home,work,home,bike,2.0,1900-01-01 16:15:00,1900-01-01 16:30:00,0:15:00,,434335536,,POINT (-1.6894687523869094 53.91648662464584)
300,16994,7443,,E00058342,home,education,home,car,2.0,1900-01-01 17:05:00,1900-01-01 17:10:00,0:05:00,,434272106,,POINT (-1.7160385871105066 53.91430614745741)
316,17680,7770,E00058339,E00169788,visit,home,visit,car,1.0,1900-01-01 17:30:00,1900-01-01 17:50:00,0:20:00,,310161982,,POINT (-1.5321341689042165 53.79709234774438)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15528,788745,332192,E00170268,E00058196,other,home,other,pt,1.0,1900-01-01 13:20:00,1900-01-01 14:45:00,1:25:00,,360280832,,POINT (-1.5994355884342584 53.742369840245125)
15545,790235,332883,E00170037,E00170457,other,home,other,car,2.0,1900-01-01 19:00:00,1900-01-01 19:10:00,0:10:00,,739069206,,POINT (-1.5320678237431327 53.74784766856606)
15553,790365,332955,E00057791,E00170457,visit,home,visit,pt,1.0,1900-01-01 13:00:00,1900-01-01 14:30:00,1:30:00,,559255900,,POINT (-1.5300967558033916 53.748946003757354)
15554,790367,332955,E00057791,E00058697,visit,home,visit,pt,1.0,1900-01-01 13:00:00,1900-01-01 14:30:00,1:30:00,,1351560664,,POINT (-1.5004604814652411 53.827260070501495)


## Rename columns for PAM

In [13]:
# TODO rename in 3.3_assign_facility_all script
# rename start_location_geometry_wkt and end_location_geometry_wkt to start_loc and end_loc
legs_geo.rename(columns={"start_location_geometry_wkt": "start_loc", "end_location_geometry_wkt": "end_loc"}, inplace=True)
legs_geo


Unnamed: 0,pid,hid,ozone,dzone,purp,origin activity,destination activity,mode,seq,tst,tet,duration,start_location_id,end_location_id,start_loc,end_loc
0,199,89,E00059031,E00059010,work,home,work,car,1.0,1900-01-01 03:40:00,1900-01-01 03:55:00,0:15:00,1735597748,396275870,POINT (-1.399263822377326 53.934588245650026),POINT (-1.3055139634314723 53.913818157845895)
1,199,89,E00059031,E00059031,home,work,home,car,2.0,1900-01-01 15:00:00,1900-01-01 15:25:00,0:25:00,396275870,1735597748,POINT (-1.3055139634314723 53.913818157845895),POINT (-1.399263822377326 53.934588245650026)
2,199,89,E00059031,E00187029,escort,home,escort,car,3.0,1900-01-01 16:15:00,1900-01-01 16:20:00,0:05:00,1735597748,406897034,POINT (-1.399263822377326 53.934588245650026),POINT (-1.549098144281633 53.79109058480764)
3,199,89,E00187029,E00059031,home,escort,home,car,4.0,1900-01-01 16:30:00,1900-01-01 16:35:00,0:05:00,406897034,1735597748,POINT (-1.549098144281633 53.79109058480764),POINT (-1.399263822377326 53.934588245650026)
4,199,89,E00059031,E00169811,escort,home,escort,car,5.0,1900-01-01 17:25:00,1900-01-01 17:30:00,0:05:00,1735597748,793245518,POINT (-1.399263822377326 53.934588245650026),POINT (-1.545446674787045 53.79345475501955)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15608,794273,334647,E00187064,E00170040,home,shop,home,car,2.0,1900-01-01 11:20:00,1900-01-01 11:30:00,0:10:00,161018776,156862474,POINT (-1.4766943999717894 53.80655601001103),POINT (-1.5323959656166048 53.7918310553218)
15609,794273,334647,E00170040,E00170433,shop,home,shop,car,3.0,1900-01-01 12:30:00,1900-01-01 12:45:00,0:15:00,156862474,330474538,POINT (-1.5323959656166048 53.7918310553218),POINT (-1.5314662035857667 53.801889553354314)
15610,794273,334647,E00170433,E00170040,home,shop,home,car,4.0,1900-01-01 17:10:00,1900-01-01 17:30:00,0:20:00,330474538,156862474,POINT (-1.5314662035857667 53.801889553354314),POINT (-1.5323959656166048 53.7918310553218)
15611,794273,334647,E00170040,E00057576,other,home,other,car,5.0,1900-01-01 19:00:00,1900-01-01 19:05:00,0:05:00,156862474,1310300528,POINT (-1.5323959656166048 53.7918310553218),POINT (-1.5200520397523276 53.81710209134754)


## Remove people with missing locations

In [14]:
def filter_no_location(individuals, households, activities, legs, legs_geo):
    """
    Cleans the provided DataFrames by removing rows without location data. Gets all pids
    that have at least one row with missing location data, and removes all rows with 
    these pids. pids are geneerated from two sources: 
       - legs_geo with missing start_loc or end_loc
       - individuals with missing hzone 

    Parameters
    ----------
    individuals : pd.DataFrame
        DataFrame containing individual data.
    households : pd.DataFrame
        DataFrame containing household data.
    activities : pd.DataFrame
        DataFrame containing activity data.
    legs : pd.DataFrame
        DataFrame containing legs data.
    legs_geo : pd.DataFrame
        DataFrame containing legs with geographic data.

    Returns
    -------
    tuple
        A tuple containing the cleaned DataFrames (individuals_cleaned, households_cleaned, activities_cleaned, legs_cleaned, legs_geo_cleaned).
    """
    # Identify rows in legs_geo where start_loc or end_loc are null
    invalid_rows_legs_geo = legs_geo[legs_geo["start_loc"].isnull() | legs_geo["end_loc"].isnull()]

    # Extract the pid values associated with these rows
    invalid_pids_legs_geo = invalid_rows_legs_geo["pid"].unique()

    # Identify rows in individuals where hzone is null
    invalid_rows_individuals = individuals[individuals["hzone"].isnull()]

    # Extract the pid values associated with these rows
    invalid_pids_individuals = invalid_rows_individuals["pid"].unique()

    # Combine the invalid pid values from both sources
    invalid_pids = set(invalid_pids_legs_geo).union(set(invalid_pids_individuals))

    # Remove rows with these pids from all DataFrames
    individuals_cleaned = individuals[~individuals["pid"].isin(invalid_pids)]
    activities_cleaned = activities[~activities["pid"].isin(invalid_pids)]
    legs_cleaned = legs[~legs["pid"].isin(invalid_pids)]
    legs_geo_cleaned = legs_geo[~legs_geo["pid"].isin(invalid_pids)]

    # Extract remaining hid values from individuals_cleaned
    remaining_hids = individuals_cleaned["hid"].unique()

    # Filter households_cleaned to only include rows with hid values in remaining_hids
    households_cleaned = households[households["hid"].isin(remaining_hids)]

    return individuals_cleaned, households_cleaned, activities_cleaned, legs_cleaned, legs_geo_cleaned

# Apply
individuals, households, activities, legs, legs_geo = filter_no_location(individuals, 
                                                                         households, 
                                                                         activities, 
                                                                         legs, 
                                                                         legs_geo)



In [15]:
log_row_count(individuals, "individuals", "2_filter_no_location")
log_row_count(households, "households", "2_filter_no_location")
log_row_count(activities, "activities", "2_filter_no_location")
log_row_count(legs, "legs", "2_filter_no_location")
log_row_count(legs_geo, "legs_geo", "2_filter_no_location")


In [16]:
row_counts

[('0_initial', 'individuals', 5560),
 ('0_initial', 'households', 3267),
 ('0_initial', 'activities', 21173),
 ('0_initial', 'legs', 15613),
 ('0_initial', 'legs_geo', 15613),
 ('1_filter_by_pid', 'individuals', 5527),
 ('1_filter_by_pid', 'households', 3253),
 ('1_filter_by_pid', 'activities', 21140),
 ('1_filter_by_pid', 'legs', 15613),
 ('1_filter_by_pid', 'legs_geo', 15613),
 ('2_filter_no_location', 'individuals', 5241),
 ('2_filter_no_location', 'households', 3100),
 ('2_filter_no_location', 'activities', 20325),
 ('2_filter_no_location', 'legs', 15084),
 ('2_filter_no_location', 'legs_geo', 15084)]

In [17]:
def calculate_percentage_remaining(row_counts):
    """
    Calculate the percentage of rows remaining for each DataFrame based on the initial counts.

    Parameters
    ----------
    row_counts : list of tuples
        List of tuples containing stage, DataFrame names, and their row counts.

    Returns
    -------
    list of tuples
        List of tuples containing stage, DataFrame names, and their percentage of rows remaining.
    """
    # Extract initial counts
    initial_counts = {df_name: count for stage, df_name, count in row_counts if stage == '0_initial'}

    # Calculate percentage remaining
    percentage_remaining = []
    for stage, df_name, count in row_counts:
        if df_name in initial_counts:
            initial_count = initial_counts[df_name]
            percentage = round((count / initial_count) * 100, 1)
            percentage_remaining.append((stage, df_name, count, percentage))

    # Sort by df_name
    percentage_remaining.sort(key=lambda x: x[1])

    return percentage_remaining


percentages = calculate_percentage_remaining(row_counts)

# Print the percentages
for stage, df_name, count, percentage in percentages:
    print(f"{stage} - {df_name} - {count} rows: {percentage:.1f}% rows remaining")

# # Log the percentages
# for stage, df_name, count, percentage in percentages:
#     logging.info(f"{stage} - {df_name} - {count} rows: {percentage:.1f}% rows remaining")

0_initial - activities - 21173 rows: 100.0% rows remaining
1_filter_by_pid - activities - 21140 rows: 99.8% rows remaining
2_filter_no_location - activities - 20325 rows: 96.0% rows remaining
0_initial - households - 3267 rows: 100.0% rows remaining
1_filter_by_pid - households - 3253 rows: 99.6% rows remaining
2_filter_no_location - households - 3100 rows: 94.9% rows remaining
0_initial - individuals - 5560 rows: 100.0% rows remaining
1_filter_by_pid - individuals - 5527 rows: 99.4% rows remaining
2_filter_no_location - individuals - 5241 rows: 94.3% rows remaining
0_initial - legs - 15613 rows: 100.0% rows remaining
1_filter_by_pid - legs - 15613 rows: 100.0% rows remaining
2_filter_no_location - legs - 15084 rows: 96.6% rows remaining
0_initial - legs_geo - 15613 rows: 100.0% rows remaining
1_filter_by_pid - legs_geo - 15613 rows: 100.0% rows remaining
2_filter_no_location - legs_geo - 15084 rows: 96.6% rows remaining


## Convert to point 

In [22]:
# Function to convert to Point if not already a Point
def convert_to_point(value):
    if isinstance(value, Point):
        return value
    return wkt.loads(value)

# Convert start_loc and end_loc to shapely point objects
legs_geo["start_loc"] = legs_geo["start_loc"].apply(convert_to_point)
legs_geo["end_loc"] = legs_geo["end_loc"].apply(convert_to_point)

# Verify the type of the first element in the 'start_loc' column
print(type(legs_geo['start_loc'].iloc[0]))  # Should be <class 'shapely.geometry.point.Point'>

# Convert to GeoDataFrame with start_loc as the active geometry
legs_geo = gpd.GeoDataFrame(legs_geo, geometry='start_loc')


<class 'shapely.geometry.point.Point'>


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  legs_geo["start_loc"] = legs_geo["start_loc"].apply(convert_to_point)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  legs_geo["end_loc"] = legs_geo["end_loc"].apply(convert_to_point)


## Add home location to individuals 

In [23]:
def add_home_location_to_individuals(legs_geo, individuals):
    """
    Adds home location to individuals dataframe. Location is obtained 
    from legs_geo (rows with orign activity = home) 

    Parameters
    ----------
    legs_geo : pd.DataFrame
        DataFrame containing legs with geographic data.
    individuals : pd.DataFrame
        DataFrame containing individual data.

    Returns
    -------
    pd.DataFrame
        The modified individuals DataFrame with location information.
    """
    # Filter by origin activity = home
    legs_geo_home = legs_geo[legs_geo["origin activity"] == "home"]
    
    # Get one row for each hid group
    legs_geo_home = legs_geo_home.groupby("hid").first().reset_index()
    
    # Keep only the columns we need: hid and start_location
    legs_geo_home = legs_geo_home[["hid", "start_loc"]]
    
    # Rename start_loc to loc
    legs_geo_home.rename(columns={"start_loc": "loc"}, inplace=True)
    
    # Merge legs_geo_home with individuals
    individuals_geo = individuals.copy()
    individuals_geo = individuals_geo.merge(legs_geo_home, on="hid")
    
    # Remove rows with missing loc
    individuals_geo = individuals_geo[individuals_geo["loc"].notnull()]
    
    return individuals_geo

# Apply
individuals_geo = add_home_location_to_individuals(legs_geo, individuals)


## Add Car Ownership

TODO: get num_cars per household from spc_with_nts

this can then be passed on using hhs_attributes in pam.load_travel_diary

# Read in population data

In [24]:
population = load_travel_diary(
        trips=legs_geo,
        persons_attributes=individuals,
        tour_based=False,
        include_loc=True,
        sort_by_seq=True,
        # hhs_attributes = None,
    )


Using simple trip based purpose parser, this assumes first activity is 'home'.
If you do not wish to assume this, try setting 'tour_based' = True (default).



In [46]:
population[89][200].print()

Person: 200
{'hzone': 'E00059031', 'age_years': 38}
0:	Activity(act:home, location:POINT (-1.399263822377326 53.934588245650026), time:00:00:00 --> 12:30:00, duration:12:30:00)
1:	Leg(mode:car, area:POINT (-1.399263822377326 53.934588245650026) --> POINT (-1.5223977194364153 53.80563634318617), time:12:30:00 --> 13:00:00, duration:0:30:00)
2:	Activity(act:work, location:POINT (-1.5223977194364153 53.80563634318617), time:13:00:00 --> 16:30:00, duration:3:30:00)
3:	Leg(mode:car, area:POINT (-1.5223977194364153 53.80563634318617) --> POINT (-1.399263822377326 53.934588245650026), time:16:30:00 --> 17:00:00, duration:0:30:00)
4:	Activity(act:home, location:POINT (-1.399263822377326 53.934588245650026), time:17:00:00 --> 00:00:00, duration:7:00:00)


# Jitter the plans

In [25]:
from datetime import timedelta
from pam.samplers.time import apply_jitter_to_plan


for hid, pid, person in population.people():
    apply_jitter_to_plan(
        person.plan,
        jitter=timedelta(minutes=30),
        min_duration=timedelta(minutes=10)
    )
    # crop to 24-hours
    person.plan.crop()

In [48]:
population[89][200].print()

Person: 200
{'hzone': 'E00059031', 'age_years': 38}
0:	Activity(act:home, location:POINT (-1.399263822377326 53.934588245650026), time:00:00:00 --> 12:40:36, duration:12:40:36)
1:	Leg(mode:car, area:POINT (-1.399263822377326 53.934588245650026) --> POINT (-1.5223977194364153 53.80563634318617), time:12:40:36 --> 13:10:36, duration:0:30:00)
2:	Activity(act:work, location:POINT (-1.5223977194364153 53.80563634318617), time:13:10:36 --> 16:34:17.600000, duration:3:23:41.600000)
3:	Leg(mode:car, area:POINT (-1.5223977194364153 53.80563634318617) --> POINT (-1.399263822377326 53.934588245650026), time:16:34:17.600000 --> 17:04:17.600000, duration:0:30:00)
4:	Activity(act:home, location:POINT (-1.399263822377326 53.934588245650026), time:17:04:17.600000 --> 00:00:00, duration:6:55:42.400000)


# Write the population to matsim xml

In [26]:
write.write_matsim_population_v6(
    population=population,
    path= acbm.root_path / "data/processed/activities_pam/plans.xml",
)