In [1]:
import pandas as pd
import geopandas as gpd
import acbm
from pam.read import load_travel_diary
from pam import write

from shapely import wkt, Point


This notebook shows how to save acbm output as matsim xml files using PAM. A number of preprocessing steps need to be done befehand
Ideally, some of these should be fixed upstream in the acbm code. Some may be the result of NTS data issues.

# Read the data

In [2]:
# path to the data
individuals = pd.read_csv(acbm.root_path / "data/processed/activities_pam/people.csv")
households = pd.read_csv(acbm.root_path / "data/processed/activities_pam/households.csv")
activities = pd.read_csv(acbm.root_path / "data/processed/activities_pam/activities.csv")
legs = pd.read_csv(acbm.root_path / "data/processed/activities_pam/legs.csv")
legs_geo = pd.read_parquet(acbm.root_path / "data/processed/activities_pam/legs_with_locations.parquet")

In [3]:
activities.head(5)

Unnamed: 0.1,Unnamed: 0,pid,hid,freq,activity,seq,start time,end time,duration,zone
0,0,200,89,,home,0.0,1900-01-01 00:00:00,1900-01-01 12:30:00,12:30:00,E00059031
1,1,200,89,,work,2.0,1900-01-01 13:00:00,1900-01-01 16:30:00,3:30:00,E00058877
2,2,200,89,,home,3.0,1900-01-01 17:00:00,1900-01-02 00:00:00,7:00:00,E00059031
3,3,312,139,,home,0.0,1900-01-01 00:00:00,1900-01-01 06:35:00,6:35:00,E00059045
4,4,312,139,,escort,2.0,1900-01-01 06:54:00,1900-01-01 06:55:00,0:01:00,E00058294


In [4]:
legs_geo.head(5)

Unnamed: 0,pid,hid,ozone,dzone,purp,origin activity,destination activity,mode,seq,tst,tet,duration,start_location_id,end_location_id,start_location_geometry_wkt,end_location_geometry_wkt
0,200,89,E00059031,E00058877,work,home,work,car,1.0,1900-01-01 12:30:00,1900-01-01 13:00:00,0:30:00,1735597748,284127794,POINT (-1.399263822377326 53.934588245650026),POINT (-1.5223977194364153 53.80563634318617)
1,200,89,E00059031,E00059031,home,work,home,car,2.0,1900-01-01 16:30:00,1900-01-01 17:00:00,0:30:00,284127794,1735597748,POINT (-1.5223977194364153 53.80563634318617),POINT (-1.399263822377326 53.934588245650026)
2,312,139,E00059045,E00058294,escort,home,escort,car,1.0,1900-01-01 06:35:00,1900-01-01 06:54:00,0:19:00,1750466246,1325685728,POINT (-1.4001307419061457 53.927941746853776),POINT (-1.5100280118435312 53.89948871840075)
3,312,139,E00058294,E00169797,work,escort,work,car,2.0,1900-01-01 06:55:00,1900-01-01 07:22:00,0:27:00,1325685728,443167284,POINT (-1.5100280118435312 53.89948871840075),POINT (-1.5484560286132372 53.797016858417656)
4,312,139,E00059045,E00059045,home,work,home,car,3.0,1900-01-01 18:00:00,1900-01-01 18:35:00,0:35:00,443167284,1750466246,POINT (-1.5484560286132372 53.797016858417656),POINT (-1.4001307419061457 53.927941746853776)


# Clean the data

In [5]:
# We will be removing some rows in each planning operation. This function helps keep a 
# record of the number of rows in each table after each operation.

row_counts = []
# 
# # Function to log row counts
def log_row_count(df, name, operation):
    row_counts.append((operation, name, len(df)))


### Record number of rows

In [6]:
log_row_count(individuals, "individuals", "0_initial")
log_row_count(households, "households", "0_initial")
log_row_count(activities, "activities", "0_initial")
log_row_count(legs, "legs", "0_initial")
log_row_count(legs_geo, "legs_geo", "0_initial")

In [7]:
row_counts

[('0_initial', 'individuals', 5348),
 ('0_initial', 'households', 3270),
 ('0_initial', 'activities', 20451),
 ('0_initial', 'legs', 15103),
 ('0_initial', 'legs_geo', 15103)]

## Remove people that don't exist across all datasets

When writing to matsim using pam, we get an error when a pid exists in one dataset but not in the other. We will remove these people from the datasets.

In [8]:
def filter_by_pid(individuals, activities, legs, legs_geo, households):
    """
    Filter the input DataFrames to include only include people (pids) that exist in all
    dfs 

    Parameters
    ----------
    individuals: pd.DataFrame
        Individuals DataFrame.
    activities: pd.DataFrame
        Activities DataFrame.
    legs: pd.DataFrame: 
        Legs DataFrame.
    legs_geo: pd.DataFrame
        Legs with geo DataFrame.
    households: pd.DataFrame
        Households DataFrame.

    Returns
    -------
    tuple
        A tuple containing the filtered DataFrames (individuals, activities, legs, legs_geo, households).
    """
    # Identify common pids
    common_pids = set(individuals["pid"]).intersection(activities["pid"]).intersection(legs["pid"]).intersection(legs_geo["pid"])

    # Filter Individual Level DataFrames
    individuals = individuals[individuals["pid"].isin(common_pids)]
    activities = activities[activities["pid"].isin(common_pids)]
    legs = legs[legs["pid"].isin(common_pids)]
    legs_geo = legs_geo[legs_geo["pid"].isin(common_pids)]

    # Filter Household Level DataFrame
    households = households[households["hid"].isin(individuals["hid"])]

    return individuals, activities, legs, legs_geo, households

# Apply
individuals, activities, legs, legs_geo, households = filter_by_pid(individuals, activities, legs, legs_geo, households)

In [9]:
log_row_count(individuals, "individuals", "1_filter_by_pid")
log_row_count(households, "households", "1_filter_by_pid")
log_row_count(activities, "activities", "1_filter_by_pid")
log_row_count(legs, "legs", "1_filter_by_pid")
log_row_count(legs_geo, "legs_geo", "1_filter_by_pid")

In [10]:
row_counts

[('0_initial', 'individuals', 5348),
 ('0_initial', 'households', 3270),
 ('0_initial', 'activities', 20451),
 ('0_initial', 'legs', 15103),
 ('0_initial', 'legs_geo', 15103),
 ('1_filter_by_pid', 'individuals', 5327),
 ('1_filter_by_pid', 'households', 3257),
 ('1_filter_by_pid', 'activities', 20430),
 ('1_filter_by_pid', 'legs', 15103),
 ('1_filter_by_pid', 'legs_geo', 15103)]

In [11]:
sorted(row_counts, key=lambda x: x[0])

[('0_initial', 'individuals', 5348),
 ('0_initial', 'households', 3270),
 ('0_initial', 'activities', 20451),
 ('0_initial', 'legs', 15103),
 ('0_initial', 'legs_geo', 15103),
 ('1_filter_by_pid', 'individuals', 5327),
 ('1_filter_by_pid', 'households', 3257),
 ('1_filter_by_pid', 'activities', 20430),
 ('1_filter_by_pid', 'legs', 15103),
 ('1_filter_by_pid', 'legs_geo', 15103)]

In [12]:
# all rows where start_location_geometry_wkt is null
legs_geo[legs_geo['start_location_geometry_wkt'].isnull()]

# all rows where end_location_geometry_wkt is null
#legs_geo[legs_geo['end_location_geometry_wkt'].isnull()]

Unnamed: 0,pid,hid,ozone,dzone,purp,origin activity,destination activity,mode,seq,tst,tet,duration,start_location_id,end_location_id,start_location_geometry_wkt,end_location_geometry_wkt
25,611,283,E00059012,E00187029,shop,home,shop,car,3.0,1900-01-01 09:00:00,1900-01-01 09:45:00,0:45:00,,58037722,,POINT (-1.5478279172683373 53.79159169768763)
31,612,283,E00059012,E00059012,home,home,home,car,2.0,1900-01-01 15:20:00,1900-01-01 15:35:00,0:15:00,,1786591706,,POINT (-1.3847011407877632 53.941207506875145)
34,613,283,E00059012,E00059012,home,home,home,car,2.0,1900-01-01 15:20:00,1900-01-01 15:35:00,0:15:00,,1786591706,,POINT (-1.3847011407877632 53.941207506875145)
36,743,342,E00059033,E00059033,home,work,home,walk,2.0,1900-01-01 15:30:00,1900-01-01 16:00:00,0:30:00,,1686166202,,POINT (-1.4045496974131315 53.93159752764455)
422,18391,8126,E00058341,E00058352,work,home,work,pt,1.0,1900-01-01 18:45:00,1900-01-01 19:30:00,0:45:00,,236657166,,POINT (-1.6927095171964548 53.90546199473189)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15025,788528,332068,E00187081,E00057829,work,home,work,walk,1.0,1900-01-01 16:00:00,1900-01-01 16:20:00,0:20:00,,346429998,,POINT (-1.5255803571326352 53.77661714873766)
15037,788861,332250,,E00170037,home,work,home,bike,2.0,1900-01-01 22:06:00,1900-01-01 22:29:00,0:23:00,,876787898,,POINT (-1.533519026201066 53.790367225435986)
15043,789966,332734,,E00170620,home,work,home,bike,2.0,1900-01-01 17:30:00,1900-01-01 17:45:00,0:15:00,,408724844,,POINT (-1.536943796191781 53.793365783627195)
15076,793240,334223,,E00170040,home,work,home,bike,2.0,1900-01-01 17:30:00,1900-01-01 17:40:00,0:10:00,,357650556,,POINT (-1.5305884339932605 53.790421209628086)


## Rename columns for PAM

In [13]:
# TODO rename in 3.3_assign_facility_all script
# rename start_location_geometry_wkt and end_location_geometry_wkt to start_loc and end_loc
legs_geo.rename(columns={"start_location_geometry_wkt": "start_loc", "end_location_geometry_wkt": "end_loc"}, inplace=True)
legs_geo


Unnamed: 0,pid,hid,ozone,dzone,purp,origin activity,destination activity,mode,seq,tst,tet,duration,start_location_id,end_location_id,start_loc,end_loc
0,200,89,E00059031,E00058877,work,home,work,car,1.0,1900-01-01 12:30:00,1900-01-01 13:00:00,0:30:00,1735597748,284127794,POINT (-1.399263822377326 53.934588245650026),POINT (-1.5223977194364153 53.80563634318617)
1,200,89,E00059031,E00059031,home,work,home,car,2.0,1900-01-01 16:30:00,1900-01-01 17:00:00,0:30:00,284127794,1735597748,POINT (-1.5223977194364153 53.80563634318617),POINT (-1.399263822377326 53.934588245650026)
2,312,139,E00059045,E00058294,escort,home,escort,car,1.0,1900-01-01 06:35:00,1900-01-01 06:54:00,0:19:00,1750466246,1325685728,POINT (-1.4001307419061457 53.927941746853776),POINT (-1.5100280118435312 53.89948871840075)
3,312,139,E00058294,E00169797,work,escort,work,car,2.0,1900-01-01 06:55:00,1900-01-01 07:22:00,0:27:00,1325685728,443167284,POINT (-1.5100280118435312 53.89948871840075),POINT (-1.5484560286132372 53.797016858417656)
4,312,139,E00059045,E00059045,home,work,home,car,3.0,1900-01-01 18:00:00,1900-01-01 18:35:00,0:35:00,443167284,1750466246,POINT (-1.5484560286132372 53.797016858417656),POINT (-1.4001307419061457 53.927941746853776)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15098,794027,334502,E00187122,E00057826,home,visit,home,car,4.0,1900-01-01 22:30:00,1900-01-01 22:50:00,0:20:00,1521321996,1238852478,POINT (-1.5538081142782385 53.79719139687514),POINT (-1.5332742693848147 53.78164817605564)
15099,794272,334647,E00170040,E00169786,shop,home,shop,car,1.0,1900-01-01 08:50:00,1900-01-01 09:00:00,0:10:00,156862474,406267498,POINT (-1.5323959656166048 53.7918310553218),POINT (-1.538353198422133 53.79629354711603)
15100,794272,334647,E00169786,E00170040,home,shop,home,car,2.0,1900-01-01 10:20:00,1900-01-01 10:30:00,0:10:00,406267498,156862474,POINT (-1.538353198422133 53.79629354711603),POINT (-1.5323959656166048 53.7918310553218)
15101,794273,334647,E00170040,E00058885,shop,home,shop,car,1.0,1900-01-01 08:50:00,1900-01-01 09:00:00,0:10:00,156862474,55534042,POINT (-1.5323959656166048 53.7918310553218),POINT (-1.5560064002486698 53.80004788418821)


## Remove people with missing locations

In [14]:
def filter_no_location(individuals, households, activities, legs, legs_geo):
    """
    Cleans the provided DataFrames by removing rows without location data. Gets all pids
    that have at least one row with missing location data, and removes all rows with 
    these pids. pids are geneerated from two sources: 
       - legs_geo with missing start_loc or end_loc
       - individuals with missing hzone 

    Parameters
    ----------
    individuals : pd.DataFrame
        DataFrame containing individual data.
    households : pd.DataFrame
        DataFrame containing household data.
    activities : pd.DataFrame
        DataFrame containing activity data.
    legs : pd.DataFrame
        DataFrame containing legs data.
    legs_geo : pd.DataFrame
        DataFrame containing legs with geographic data.

    Returns
    -------
    tuple
        A tuple containing the cleaned DataFrames (individuals_cleaned, households_cleaned, activities_cleaned, legs_cleaned, legs_geo_cleaned).
    """
    # Identify rows in legs_geo where start_loc or end_loc are null
    invalid_rows_legs_geo = legs_geo[legs_geo["start_loc"].isnull() | legs_geo["end_loc"].isnull()]

    # Extract the pid values associated with these rows
    invalid_pids_legs_geo = invalid_rows_legs_geo["pid"].unique()

    # Identify rows in individuals where hzone is null
    invalid_rows_individuals = individuals[individuals["hzone"].isnull()]

    # Extract the pid values associated with these rows
    invalid_pids_individuals = invalid_rows_individuals["pid"].unique()

    # Combine the invalid pid values from both sources
    invalid_pids = set(invalid_pids_legs_geo).union(set(invalid_pids_individuals))

    # Remove rows with these pids from all DataFrames
    individuals_cleaned = individuals[~individuals["pid"].isin(invalid_pids)]
    activities_cleaned = activities[~activities["pid"].isin(invalid_pids)]
    legs_cleaned = legs[~legs["pid"].isin(invalid_pids)]
    legs_geo_cleaned = legs_geo[~legs_geo["pid"].isin(invalid_pids)]

    # Extract remaining hid values from individuals_cleaned
    remaining_hids = individuals_cleaned["hid"].unique()

    # Filter households_cleaned to only include rows with hid values in remaining_hids
    households_cleaned = households[households["hid"].isin(remaining_hids)]

    return individuals_cleaned, households_cleaned, activities_cleaned, legs_cleaned, legs_geo_cleaned

# Apply
individuals, households, activities, legs, legs_geo = filter_no_location(individuals, 
                                                                         households, 
                                                                         activities, 
                                                                         legs, 
                                                                         legs_geo)



In [15]:
log_row_count(individuals, "individuals", "2_filter_no_location")
log_row_count(households, "households", "2_filter_no_location")
log_row_count(activities, "activities", "2_filter_no_location")
log_row_count(legs, "legs", "2_filter_no_location")
log_row_count(legs_geo, "legs_geo", "2_filter_no_location")


In [16]:
row_counts

[('0_initial', 'individuals', 5348),
 ('0_initial', 'households', 3270),
 ('0_initial', 'activities', 20451),
 ('0_initial', 'legs', 15103),
 ('0_initial', 'legs_geo', 15103),
 ('1_filter_by_pid', 'individuals', 5327),
 ('1_filter_by_pid', 'households', 3257),
 ('1_filter_by_pid', 'activities', 20430),
 ('1_filter_by_pid', 'legs', 15103),
 ('1_filter_by_pid', 'legs_geo', 15103),
 ('2_filter_no_location', 'individuals', 5107),
 ('2_filter_no_location', 'households', 3136),
 ('2_filter_no_location', 'activities', 19730),
 ('2_filter_no_location', 'legs', 14623),
 ('2_filter_no_location', 'legs_geo', 14623)]

In [17]:
def calculate_percentage_remaining(row_counts):
    """
    Calculate the percentage of rows remaining for each DataFrame based on the initial counts.

    Parameters
    ----------
    row_counts : list of tuples
        List of tuples containing stage, DataFrame names, and their row counts.

    Returns
    -------
    list of tuples
        List of tuples containing stage, DataFrame names, and their percentage of rows remaining.
    """
    # Extract initial counts
    initial_counts = {df_name: count for stage, df_name, count in row_counts if stage == '0_initial'}

    # Calculate percentage remaining
    percentage_remaining = []
    for stage, df_name, count in row_counts:
        if df_name in initial_counts:
            initial_count = initial_counts[df_name]
            percentage = round((count / initial_count) * 100, 1)
            percentage_remaining.append((stage, df_name, count, percentage))

    # Sort by df_name
    percentage_remaining.sort(key=lambda x: x[1])

    return percentage_remaining


percentages = calculate_percentage_remaining(row_counts)

# Print the percentages
for stage, df_name, count, percentage in percentages:
    print(f"{stage} - {df_name} - {count} rows: {percentage:.1f}% rows remaining")

# # Log the percentages
# for stage, df_name, count, percentage in percentages:
#     logging.info(f"{stage} - {df_name} - {count} rows: {percentage:.1f}% rows remaining")

0_initial - activities - 20451 rows: 100.0% rows remaining
1_filter_by_pid - activities - 20430 rows: 99.9% rows remaining
2_filter_no_location - activities - 19730 rows: 96.5% rows remaining
0_initial - households - 3270 rows: 100.0% rows remaining
1_filter_by_pid - households - 3257 rows: 99.6% rows remaining
2_filter_no_location - households - 3136 rows: 95.9% rows remaining
0_initial - individuals - 5348 rows: 100.0% rows remaining
1_filter_by_pid - individuals - 5327 rows: 99.6% rows remaining
2_filter_no_location - individuals - 5107 rows: 95.5% rows remaining
0_initial - legs - 15103 rows: 100.0% rows remaining
1_filter_by_pid - legs - 15103 rows: 100.0% rows remaining
2_filter_no_location - legs - 14623 rows: 96.8% rows remaining
0_initial - legs_geo - 15103 rows: 100.0% rows remaining
1_filter_by_pid - legs_geo - 15103 rows: 100.0% rows remaining
2_filter_no_location - legs_geo - 14623 rows: 96.8% rows remaining


## Convert to point 

In [18]:
# Function to convert to Point if not already a Point
def convert_to_point(value):
    if isinstance(value, Point):
        return value
    return wkt.loads(value)

# Convert start_loc and end_loc to shapely point objects
legs_geo["start_loc"] = legs_geo["start_loc"].apply(convert_to_point)
legs_geo["end_loc"] = legs_geo["end_loc"].apply(convert_to_point)

# Verify the type of the first element in the 'start_loc' column
print(type(legs_geo['start_loc'].iloc[0]))  # Should be <class 'shapely.geometry.point.Point'>

# Convert to GeoDataFrame with start_loc as the active geometry
legs_geo = gpd.GeoDataFrame(legs_geo, geometry='start_loc')


<class 'shapely.geometry.point.Point'>


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  legs_geo["start_loc"] = legs_geo["start_loc"].apply(convert_to_point)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  legs_geo["end_loc"] = legs_geo["end_loc"].apply(convert_to_point)


## Add home location to individuals 

In [19]:
def add_home_location_to_individuals(legs_geo, individuals):
    """
    Adds home location to individuals dataframe. Location is obtained 
    from legs_geo (rows with orign activity = home) 

    Parameters
    ----------
    legs_geo : pd.DataFrame
        DataFrame containing legs with geographic data.
    individuals : pd.DataFrame
        DataFrame containing individual data.

    Returns
    -------
    pd.DataFrame
        The modified individuals DataFrame with location information.
    """
    # Filter by origin activity = home
    legs_geo_home = legs_geo[legs_geo["origin activity"] == "home"]
    
    # Get one row for each hid group
    legs_geo_home = legs_geo_home.groupby("hid").first().reset_index()
    
    # Keep only the columns we need: hid and start_location
    legs_geo_home = legs_geo_home[["hid", "start_loc"]]
    
    # Rename start_loc to loc
    legs_geo_home.rename(columns={"start_loc": "loc"}, inplace=True)
    
    # Merge legs_geo_home with individuals
    individuals_geo = individuals.copy()
    individuals_geo = individuals_geo.merge(legs_geo_home, on="hid")
    
    # Remove rows with missing loc
    individuals_geo = individuals_geo[individuals_geo["loc"].notnull()]
    
    return individuals_geo

# Apply
individuals_geo = add_home_location_to_individuals(legs_geo, individuals)


## Add Car Ownership

TODO: get num_cars per household from spc_with_nts

this can then be passed on using hhs_attributes in pam.load_travel_diary

# Read in population data

In [20]:
population = load_travel_diary(
        trips=legs_geo,
        persons_attributes=individuals,
        tour_based=False,
        include_loc=True,
        sort_by_seq=True,
        # hhs_attributes = None,
    )


Using simple trip based purpose parser, this assumes first activity is 'home'.
If you do not wish to assume this, try setting 'tour_based' = True (default).

Using freq of 'None' for all trips.


# Write the population to matsim xml

In [21]:
write.write_matsim_population_v6(
    population=population,
    path= acbm.root_path / "data/processed/activities_pam/plans.xml",
)