In [2]:
import sys
import os

# Adjust the path as needed to reach your project root from the notebook's location
project_root = os.path.abspath(os.path.join(os.getcwd(), '..', '..'))
if project_root not in sys.path:
    sys.path.insert(0, project_root)

project_root

'/Users/peterfalterbaum/Documents/Nova/thesis local/implementation/public_implementation'

In [3]:
from config import PROJECT_ROOT
from pathlib import Path

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import geopandas as gpd
from pathlib import Path
import logging

In [5]:
# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

In [6]:
# Load the trip data
# Update this path to your CSV file
TRIPS_CSV = str(PROJECT_ROOT) + \
    "/data/processed/final__purpose_person_odt_no_same_od_no_outliers_no_rare_od.csv"
# TRIPS_CSV = str(PROJECT_ROOT) + "/data/processed/odt_no_same_od_no_rare_od.csv"
df_trips = pd.read_csv(TRIPS_CSV)

In [7]:
# Load the GeoJSON file with postal codes
GEOJSON_PATH = "../../data/raw/location/working_zips.geojson"
gdf_netherlands = gpd.read_file(GEOJSON_PATH)

In [8]:
# Ensure pc4_code is string type for matching
gdf_netherlands['pc4_code'] = gdf_netherlands['pc4_code'].astype(
    str).str.zfill(4)

In [9]:
# Get unique city names
CITIES = gdf_netherlands['gem_name'].unique()
print(f"Found {len(CITIES)} unique cities in the dataset")

Found 352 unique cities in the dataset


In [10]:
# Ensure postal codes are strings and properly formatted
df_trips['pc4_arrival'] = df_trips['pc4_arrival'].astype(str).str.zfill(4)
df_trips['pc4_departure'] = df_trips['pc4_departure'].astype(str).str.zfill(4)

In [11]:
# Convert timestamps to datetime
df_trips['timestamp_departure'] = pd.to_datetime(
    df_trips['timestamp_departure'])
df_trips['timestamp_arrival'] = pd.to_datetime(df_trips['timestamp_arrival'])

In [12]:
print(f"Loaded {len(df_trips)} trips")

Loaded 59943 trips


In [13]:
# 1) Get PC4 codes for the cities
city_postcodes = (
    gdf_netherlands
    .loc[gdf_netherlands['gem_name'].isin(CITIES), 'pc4_code']
    .astype(str)
    .unique()
)
print(f"Found {len(city_postcodes)} unique postal codes in the target cities")

Found 4068 unique postal codes in the target cities


In [14]:
# 2) Filter trips that start OR end in those PC4s
trips_for_cities = df_trips[
    df_trips['pc4_departure'].isin(city_postcodes) |
    df_trips['pc4_arrival'].isin(city_postcodes)
].copy()

In [15]:
print(f"Found {len(trips_for_cities)} trips involving the target cities")

Found 59943 trips involving the target cities


In [16]:
# 3) Merge in origin/dest district names
trips_with_districts = (
    trips_for_cities
    .merge(
        gdf_netherlands[['gem_name', 'pc4_code']],
        left_on='pc4_departure',
        right_on='pc4_code',
        how='left'
    )
    .merge(
        gdf_netherlands[['gem_name', 'pc4_code']],
        left_on='pc4_arrival',
        right_on='pc4_code',
        how='left',
        suffixes=('_origin', '_dest')
    )
)

In [17]:
print(f"Merged district information for {len(trips_with_districts)} trips")

Merged district information for 59943 trips


In [18]:
# 4) Initialize stats indexed by CITIES
district_stats = pd.DataFrame(index=CITIES)

In [19]:
# Total departures & arrivals
district_stats['total_departure_trips'] = (
    trips_with_districts.groupby('gem_name_origin').size()
    .reindex(CITIES, fill_value=0).astype(int)
)
district_stats['total_arrival_trips'] = (
    trips_with_districts.groupby('gem_name_dest').size()
    .reindex(CITIES, fill_value=0).astype(int)
)

In [20]:
# Net flow
district_stats['net_flow'] = (
    district_stats['total_arrival_trips'] -
    district_stats['total_departure_trips']
)

In [21]:
# Intra-district trips
intra_mask = trips_with_districts['gem_name_origin'] == trips_with_districts['gem_name_dest']
intra = trips_with_districts[intra_mask]
district_stats['intra_district_trips'] = (
    intra.groupby('gem_name_origin').size()
    .reindex(CITIES, fill_value=0).astype(int)
)

In [22]:
# Same-PC trips
same_pc_mask = intra_mask & (
    trips_with_districts['pc4_departure'] ==
    trips_with_districts['pc4_arrival']
)
same_pc = trips_with_districts[same_pc_mask]
district_stats['same_postal_code_trips'] = (
    same_pc.groupby('gem_name_origin').size()
    .reindex(CITIES, fill_value=0).astype(int)
)

In [23]:
# Different-PC trips
district_stats['different_postal_code_trips'] = (
    district_stats['intra_district_trips'] -
    district_stats['same_postal_code_trips']
)

In [24]:
# Unique intra-PCs
unique_intra = (
    intra.groupby('gem_name_origin')
         .apply(lambda df: pd.unique(
             df[['pc4_departure', 'pc4_arrival']]
             .values.ravel()
         ).size)
)
district_stats['unique_intra_postal_codes'] = (
    unique_intra.reindex(CITIES, fill_value=0).astype(int)
)

  .apply(lambda df: pd.unique(


In [25]:
# Inter-district trips
inter_district_counts = {}
for city in CITIES:
    # Trips where city is either origin or destination
    city_trips = trips_with_districts[(trips_with_districts['gem_name_origin'] == city) |
                                      (trips_with_districts['gem_name_dest'] == city)]
    # Exclude intra-district trips
    inter_district_trips = city_trips[city_trips['gem_name_origin']
                                      != city_trips['gem_name_dest']]
    inter_district_counts[city] = len(inter_district_trips)
district_stats['inter_district_trips'] = pd.Series(inter_district_counts)

In [26]:
# Count outside postal codes for each city
outside_pc_counts = {}
for city in CITIES:
    # Get trips where city is origin
    city_origin_trips = trips_with_districts[trips_with_districts['gem_name_origin'] == city]
    # Get trips where city is destination
    city_dest_trips = trips_with_districts[trips_with_districts['gem_name_dest'] == city]

    # Get unique postal codes from destinations of origin trips
    dest_pcs = set(city_origin_trips['pc4_arrival'].unique())
    # Get unique postal codes from origins of destination trips
    origin_pcs = set(city_dest_trips['pc4_departure'].unique())

    # Get intra-city postal codes
    intra_city_pcs = set(
        gdf_netherlands[gdf_netherlands['gem_name'] == city]['pc4_code'].unique())

    # Combine using XOR (symmetric difference) and remove intra-city postal codes
    outside_pc_counts[city] = len((dest_pcs ^ origin_pcs) - intra_city_pcs)

district_stats['unique_outside_postal_codes'] = pd.Series(outside_pc_counts)

In [27]:
# 5) Calculate percentages and means
with np.errstate(divide='ignore', invalid='ignore'):
    # Calculate mean total trips
    district_stats['mean_total_trips'] = (
        (district_stats['total_departure_trips'] +
         district_stats['total_arrival_trips']) / 2
    ).round(1)

    # Calculate intra district percentage
    district_stats['intra_district_pct'] = (
        district_stats['intra_district_trips'] /
        district_stats['mean_total_trips'] * 100
    ).round(1).fillna(0)

    district_stats['same_postal_code_pct'] = (
        district_stats['same_postal_code_trips']
        / district_stats['intra_district_trips'] * 100
    ).round(1).fillna(0)

In [28]:
# 6) Calculate trips per PC and balanced score
with np.errstate(divide='ignore', invalid='ignore'):
    district_stats['trips_per_pc_intra'] = (
        district_stats['intra_district_trips']
        / district_stats['unique_intra_postal_codes']
    ).round(1).fillna(0)

    # Add balanced score calculation
    district_stats['balanced_score'] = (
        (1 - district_stats['same_postal_code_pct'] / 100) *
        district_stats['trips_per_pc_intra']
    ).round(1).fillna(0)

In [29]:
# 7) Final reorder & sort by balanced_score
column_order = [
    # 'total_departure_trips',
    # 'total_arrival_trips',
    # 'mean_total_trips',
    # 'net_flow',
    'intra_district_trips',
    # 'intra_district_pct',
    # 'same_postal_code_trips',
    # 'same_postal_code_pct',
    # 'different_postal_code_trips',
    'inter_district_trips',
    'unique_intra_postal_codes',
    'unique_outside_postal_codes',
    'trips_per_pc_intra',
    # 'balanced_score'
]

In [30]:
district_stats = (
    district_stats[column_order]
    .sort_values('trips_per_pc_intra', ascending=False)
)

In [31]:
district_stats[district_stats["intra_district_trips"].astype(
    int) > 1000].head(15)

Unnamed: 0,intra_district_trips,inter_district_trips,unique_intra_postal_codes,unique_outside_postal_codes,trips_per_pc_intra
Amersfoort,1056,655,17,27,62.1
Utrecht,2539,2614,45,137,56.4
Rotterdam,3397,2469,69,89,49.2
's-Gravenhage,2690,2420,61,50,44.1
Amsterdam,2391,3378,71,112,33.7
