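# Agents filtering
An agent is removed if any of its simulated (`src == 'output'`) plan records with `act_id != 0` violates one of the conditions below; all other agents are kept.
1. Travel time > 0
2. Walking speed < 10 km/h
3. Utility score by MATSim simulation > 0
4. Activity time > 0 (currently checked only for `pt interaction` activities)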

In [1]:
# Reload imported modules automatically before each cell is executed.
%load_ext autoreload
%autoreload 2
# NOTE(review): machine-specific absolute path. Every relative 'dbs/...' path
# used below is resolved against this working directory.
%cd D:\sysmo-data-pt
%matplotlib inline

D:\sysmo-data-pt


In [2]:
import pandas as pd
import os
os.environ['USE_PYGEOS'] = '0'
from tqdm import tqdm
import matsim

## 1. Process plans to get infeasible agents

In [36]:
def process_plans(df=None):
    """Return the output plan records with home coordinates taken from the input plans.

    Rows with ``act_id == 0`` and ``src == 'input'`` provide one
    ``(POINT_X, POINT_Y)`` pair per ``PId``. Only rows with ``src == 'output'``
    are returned; for those with ``act_id == 0`` the coordinates are replaced
    by the pair of the same ``PId`` from the input rows. All other rows keep
    their own coordinates.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``PId``, ``act_id``, ``src``, ``POINT_X`` and
        ``POINT_Y``. The frame is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the ``'output'`` rows (original index labels
        preserved) without the ``src`` column. An empty input yields an empty
        result.

    Raises
    ------
    KeyError
        If an output row with ``act_id == 0`` belongs to a ``PId`` that has no
        ``act_id == 0`` row in the input plans.
    """
    home_rows = df.loc[(df['act_id'] == 0) & (df['src'] == 'input'),
                       ['PId', 'POINT_X', 'POINT_Y']]
    # keep='last' reproduces dict(zip(...)) semantics: the last record per PId wins.
    home = home_rows.drop_duplicates(subset='PId', keep='last').set_index('PId')

    # Work on an explicit copy so the assignments below never write into a
    # slice of the caller's frame (avoids SettingWithCopyWarning).
    out = df.loc[df['src'] == 'output', :].copy()
    is_home = out['act_id'] == 0

    if is_home.any():
        home_pids = out.loc[is_home, 'PId']
        unknown = ~home_pids.isin(home.index)
        if unknown.any():
            raise KeyError(
                f"No input home record for PId(s): {home_pids[unknown].unique()[:5].tolist()}"
            )
        # Vectorised lookup instead of a Python-level apply over every row.
        for col in ('POINT_X', 'POINT_Y'):
            out.loc[is_home, col] = home_pids.map(home[col]).to_numpy()

    return out.drop(columns=['src'])

In [37]:
# Data location
# Filter parameters, named here so they are visible and tunable in one place.
N_BATCHES = 10        # plan files are numbered 0 .. N_BATCHES - 1
MAX_WALK_SPEED = 10   # km/h, per the filter list at the top of the notebook

agents2remove = []
total_agents_list = []
total_agents = 0
for batch in tqdm(range(0, N_BATCHES), desc='Process plans'):
    plans_file = f'dbs/output/plans_{batch}.csv.gz'
    # Read and decompress the batch once; the same frame feeds both the
    # feasibility checks and process_plans() below.
    df_raw = pd.read_csv(plans_file, compression='gzip')
    total_agents += df_raw.PId.nunique()
    total_agents_list += list(df_raw.PId.unique())

    # Feasibility is judged on simulated (output) records with act_id != 0 only.
    df_plans = df_raw.loc[(df_raw['act_id'] != 0) & (df_raw['src'] == 'output'), :]

    # An agent is infeasible if ANY of its records violates ANY rule.
    # Travel time
    bad_travel_time = df_plans['trav_time_min'] <= 0
    # Walking speed
    bad_walk_speed = (df_plans['speed'] >= MAX_WALK_SPEED) & (df_plans['mode'] == 'walk')
    # Utility score
    bad_score = df_plans['score'] <= 0
    # Activity time
    # NOTE(review): only 'pt interaction' activities are tested here, while the
    # notebook header states "Activity time > 0" without qualification — confirm
    # whether '==' is intended.
    bad_act_time = (df_plans['act_time'] <= 0) & (df_plans['act_purpose'] == 'pt interaction')

    infeasible = bad_travel_time | bad_walk_speed | bad_score | bad_act_time
    flagged = set(df_plans.loc[infeasible, 'PId'].unique())

    # Remove agents from the plans.
    # agents2remove accumulates across batches, so this batch's file is filtered
    # against agents flagged in this batch and in all earlier ones.
    agents2remove = list(set(agents2remove) | flagged)
    df_r = process_plans(df_raw)
    df_r = df_r.loc[~df_r.PId.isin(agents2remove), :]
    df_r.to_parquet(f'dbs/product/plans_{batch}.parquet', index=False)

Process plans: 100%|██████████| 10/10 [23:55<00:00, 143.50s/it]


In [38]:
# Agents that were never flagged as infeasible in any batch.
all_agent_ids = set(total_agents_list)
agents2keep = list(all_agent_ids - set(agents2remove))
# Use the de-duplicated ID universe as the denominator so that it matches the
# de-duplicated numerator even if a PId were to occur in more than one batch
# (the running `total_agents` sum would count such an agent once per batch).
print(f"Share of agents remained: {len(agents2keep) / len(all_agent_ids) * 100} %")

Share of agents remained: 72.76782567552567 %


In [39]:
# Absolute number of agents that were not flagged by any of the filters.
print(f'Number of agents left: {len(agents2keep)}.')

Number of agents left: 6751639.


## 2. Filter population

In [3]:
# Get agent list
# Rebuild the list of retained agent ids from the filtered plan files, so this
# section does not depend on variables left over from section 1.
agents2keep = set()
for batch in tqdm(range(0, 10), desc='Load agent ids'):
    plans_file = f'dbs/product/plans_{batch}.parquet'
    # Only the id column is needed; skip loading every other column.
    df_plans = pd.read_parquet(plans_file, columns=['PId'])
    agents2keep.update(df_plans.PId.unique())
    del df_plans
# Downstream cells expect a list.
agents2keep = list(agents2keep)

Load agent ids: 100%|██████████| 10/10 [00:23<00:00,  2.39s/it]


In [4]:
# Load the population table (presumably the full synthetic population, going by the file name).
# NOTE(review): unpickling can execute arbitrary code — only load trusted, locally produced files.
df_pop = pd.read_pickle('dbs/sysmo/syn_pop_all.pkl')

In [5]:
# Population members whose id is absent from the retained (feasible) agent list.
agents2remove = list(
    df_pop.loc[~df_pop['PId'].isin(agents2keep), 'PId'].unique()
)

In [10]:
tqdm.pandas()
# Evaluate the membership test once and derive the flag from it:
# 1 = agent retained by the plan filters, 0 = agent removed.
is_feasible = df_pop.PId.isin(agents2keep)
# Stored as float to stay compatible with the column produced by the previous
# two-step `.loc` assignment — TODO confirm whether consumers would prefer an integer flag.
df_pop['feasibility'] = is_feasible.astype('float64')
# Non in-place drop with errors='ignore' keeps the cell re-runnable: a second
# run no longer fails because the column has already been removed.
df_pop = df_pop.drop(columns=['pot_car_driver'], errors='ignore')
df_pop.to_parquet('dbs/product/syn_pop_all.parquet', index=False)

## 3. Events filtering

In [10]:
# String versions of the agent ids, used to match the `person` / `vehicle`
# columns of the event tables.
agents2keep_string = [str(agent_id) for agent_id in agents2keep]
agents2remove_string = [str(agent_id) for agent_id in agents2remove]

In [12]:
# read network
# Parse the simulation's output network and convert it to a geo table of links.
input_file = 'dbs/scenarios/sweden/output_1/output_network.xml.gz'
net = matsim.read_network(input_file)
geo = net.as_geo()
# Distinct link ids present in the network.
link_list = list(pd.unique(geo['link_id']))
geo.head()

Unnamed: 0,length,freespeed,capacity,permlanes,oneway,modes,link_id,from_node,to_node,geometry
0,481.239929,11.111111,600.0,1.0,1,car,10,441557672,442330933,"LINESTRING (374101.255 6258377.983, 374133.899..."
1,36.365222,4.166667,600.0,1.0,1,car,100,6159514954,1801777826,"LINESTRING (441341.551 6522798.063, 441346.658..."
2,45.748284,4.166667,600.0,1.0,1,car,1000,7098624155,5304929876,"LINESTRING (455508.389 6929779.879, 455553.788..."
3,45.066036,2.777778,300.0,1.0,1,car,10000,1431711158,1431711274,"LINESTRING (662344.523 6571560.838, 662335.462..."
4,10.098473,6.944444,600.0,1.0,1,car,100000,9682588781,9682001659,"LINESTRING (658965.901 6635300.797, 658969.266..."


In [15]:
# Write the network link table to 'dbs/product/network.shp', then release it.
# Because `geo` is deleted here, re-running this cell requires re-running the
# network-loading cell first.
geo.to_file('dbs/product/network.shp')
del geo

In [16]:
# Drop every event that references a removed agent, either as the person or as
# the vehicle; events with a missing person / vehicle value are kept.
for batch in tqdm(range(0, 10), desc='Process events'):
    events_file = f'dbs/output/events_{batch}.parquet'
    df_events = pd.read_parquet(events_file)
    person_ok = df_events['person'].isna() | ~df_events['person'].isin(agents2remove_string)
    vehicle_ok = df_events['vehicle'].isna() | ~df_events['vehicle'].isin(agents2remove_string)
    df_events = df_events.loc[person_ok & vehicle_ok]
    df_events.to_parquet(f'dbs/product/events_{batch}.parquet', index=False)

Process events: 100%|██████████| 10/10 [45:22<00:00, 272.23s/it]
