## Setup

In [1]:
import os
import polars as pl

## Paths

In [2]:
# Selection of the region / management unit this notebook processes.
# region_name and mngmt_name are interpolated into the input paths and the
# output file names; mngmt_filter is compared against the master data's
# "management_name" column.
region_name, mngmt_name = "Ost", "Mecklenburg-Vorpommern"
mngmt_filter = "Mecklenburg-Vorpommern"

In [3]:
# Root directory of the PAXCOUNTER data. The original workstation path stays
# the default; set the PAXCOUNTER_DATA_DIR environment variable to run the
# notebook on another machine without editing this cell.
pth_data = (
    os.environ.get("PAXCOUNTER_DATA_DIR")
    or "C:/Users/z187070/Documents/Projects/PAXCOUNTER/data/"
)
# Sub-paths are appended to this root by plain string concatenation, so it
# must end with exactly one separator.
pth_data = pth_data.rstrip("/\\") + "/"

In [4]:
# Input locations below the data root:
#   - master data: one CSV per region
#   - RIS data:    one CSV per region and management unit
#   - PAX data:    a folder per region and management unit (note the trailing slash)
pth_master = f"{pth_data}master/master/{region_name}/master_data/master_data.csv"
pth_ris = f"{pth_data}ris/ris/{region_name}/{mngmt_name}/ris_data/ris_data.csv"
pth_pax = f"{pth_data}pax/pax/{region_name}/{mngmt_name}/"

## Load data

In [5]:
# Master data: semicolon-separated CSV with a header row.
df_master = pl.read_csv(
    pth_master,
    separator=";",
    has_header=True,
)

In [6]:
# Show the resolved path of the RIS file.
pth_ris

'C:/Users/z187070/Documents/Projects/PAXCOUNTER/data/ris/ris/Ost/Mecklenburg-Vorpommern/ris_data/ris_data.csv'

In [7]:
# RIS data: semicolon-separated CSV with a header row.
df_ris = pl.read_csv(
    pth_ris,
    separator=";",
    has_header=True,
)

In [8]:
# PAX counts: all CSV files in the PAX folder are read through a glob pattern.
df_pax = pl.read_csv(
    f"{pth_pax}*.csv",
    separator=";",
    has_header=True,
)

## Descriptives

In [9]:
# Preview the first rows of the PAX data.
df_pax.head()

pax_counter_id,time_iot,data_pax
str,str,i64
"""083af23fd0df""","""2024-12-01 00:…",0
"""083af23fd0df""","""2024-12-01 00:…",0
"""083af23fd0df""","""2024-12-01 00:…",0
"""083af23fd0df""","""2024-12-01 00:…",0
"""083af23fd0df""","""2024-12-01 00:…",0


In [10]:
# Total number of rows in the PAX data.
# NOTE(review): newer polars releases deprecate the no-argument pl.count() in
# favour of pl.len() - confirm the installed polars version before switching.
df_pax.select(pl.count())

count
u32
22961666


In [11]:
# Distinct sensor ids present in the PAX data.
(
    df_pax
    .select(pl.col("pax_counter_id").unique())
)

pax_counter_id
str
"""a0a3b32f9e87"""
"""e831cdc25807"""
"""244cab0700ef"""
"""a0a3b330e483"""
"""3494545a2a77"""
"""a0a3b32f53fb"""
"""a0a3b32f82e7"""
"""244cab06259f"""
"""c45bbe93286f"""
"""a0a3b38c536f"""


In [12]:
# All management units that occur in the master data, as a plain Python list.
(
    df_master
    .select(pl.col("management_name").unique())
    .to_series()
    .to_list()
)

['Mecklenburg-Vorpommern', 'Cottbus', 'Potsdam']

In [13]:
# Number of master-data rows that belong to the selected management unit.
# NOTE(review): newer polars releases deprecate the no-argument pl.count() in
# favour of pl.len() - confirm the installed polars version before switching.
(
    df_master
    .filter(pl.col("management_name") == mngmt_filter)
    .select(pl.count())
)

count
u32
105


In [14]:
# Show the management-unit value used in the filters of this notebook.
mngmt_filter

'Mecklenburg-Vorpommern'

In [15]:
# Distinct station names of the selected management unit.
(
    df_master
    .filter(pl.col("management_name") == mngmt_filter)
    .select(pl.col("station_name").unique())
)

station_name
str
"""Groß Lüsewitz"""
"""Priemerburg"""
"""Sukow (b Schwe…"
"""Rostock Kasseb…"
"""Schwerin Indus…"
"""Friedrichsruhe…"
"""Herrnburg"""
"""Bentwisch"""
"""Mistorf"""
"""Mölln (Meckl)"""


In [16]:
# Sensor ids that the master data lists for the selected management unit.
master_pcid = (
    df_master
    .filter(pl.col("management_name") == mngmt_filter)
    .select(pl.col("pax_counter_id").unique())
    .to_series()
    .to_list()
)

In [17]:
# Sensor ids that actually occur in the PAX measurements.
pax_pcid = (
    df_pax
    .select(pl.col("pax_counter_id").unique())
    .to_series()
    .to_list()
)

In [18]:
# Sensor ids found in the master data but missing from the PAX measurements.
list(set(master_pcid).difference(set(pax_pcid)))

[]

In [19]:
# Sensor ids found in the PAX measurements but missing from the master data.
list(set(pax_pcid).difference(set(master_pcid)))

[]

In [20]:
# Preview the first rows of the RIS data.
df_ris.head()

fahrtid,ereignis_station_id,zeit_echt,zeit_echt_verspaetung,gleis_echt,tpname_bahnsteig_echt,gattung,case
str,i64,str,i64,str,str,str,str
"""20241130-6c7c3…",2468,"""2024-12-01 00:…",4,"""02468-01-B02-G…","""Bahnsteig 02""","""RE""","""ankunft"""
"""20241130-6c7c3…",2468,"""2024-12-01 00:…",5,"""02468-01-B02-G…","""Bahnsteig 02""","""RE""","""abfahrt"""
"""20241130-2262f…",5127,"""2024-12-01 00:…",1,"""05127-01-B02-G…","""Bahnsteig 02""","""RB""","""ankunft"""
"""20241130-2262f…",5127,"""2024-12-01 00:…",1,"""05127-01-B02-G…","""Bahnsteig 02""","""RB""","""abfahrt"""
"""20241130-6f3f7…",6407,"""2024-12-01 00:…",18,,,"""RE""","""ankunft"""


## Joins

In [21]:
# Preview the master-data rows of the selected management unit.
(
    df_master
    .filter(pl.col("management_name") == mngmt_filter)
    .head()
)

pax_counter_id,equipmentname,station_id,tplnr,tpname,station_name,station_ril100,management_name,region_name,federal_state_name,station_longitude,station_latitude,produktlinie,untersegment
str,str,i64,str,str,str,str,str,str,str,f64,f64,str,str
"""244cab034443""","""DSA+_Typ3_GL12…",79,"""00079-01-B01""","""Bahnsteig 01""","""Altefähr""","""WAF""","""Mecklenburg-Vo…","""Ost""","""Mecklenburg-Vo…",13.14101,54.322927,"""Zubringerbahnh…","""ländlicher Zub…"
"""244cab067843""","""DSA+_Typ2_GL1_…",121,"""00121-01-B01""","""Bahnsteig 01""","""Althof""","""WAH""","""Mecklenburg-Vo…","""Ost""","""Mecklenburg-Vo…",11.922341,54.090442,"""Zubringerbahnh…","""ländlicher Zub…"
"""a0a3b3311b9f""","""DSA+_Typ2_GL1_…",490,"""00490-30""","""Zuwegung""","""Bentwisch""","""WBE""","""Mecklenburg-Vo…","""Ost""","""Mecklenburg-Vo…",12.210506,54.117193,"""Zubringerbahnh…","""ländlicher Zub…"
"""244cab03327b""","""DSA+_Typ2_GL1_…",719,"""00719-01-B01""","""Bahnsteig 01""","""Bobitz""","""WBO""","""Mecklenburg-Vo…","""Ost""","""Mecklenburg-Vo…",11.3628,53.804534,"""Zubringerbahnh…","""ländlicher Zub…"
"""083af23fd0df""","""DSA+_Typ2_GL2_…",719,"""00719-01-B02""","""Bahnsteig 02""","""Bobitz""","""WBO""","""Mecklenburg-Vo…","""Ost""","""Mecklenburg-Vo…",11.3628,53.804534,"""Zubringerbahnh…","""ländlicher Zub…"


In [34]:
# Master-data columns carried into the joined frame: the join key plus the
# station attributes.
cols_of_interest_master = [
    "pax_counter_id",
    "station_id",
    "station_name",
    "tpname",
    "station_longitude",
    "station_latitude",
]

In [35]:
# Enrich every PAX measurement with the selected station attributes from the
# master data. The left join keeps all PAX rows, including rows whose sensor
# id has no master entry.
df_jd = df_pax.join(
    df_master.select(cols_of_interest_master),
    on="pax_counter_id",
    how="left",
)
# A left join can only add rows when a key matches more than one row on the
# right-hand side. A sensor id that is duplicated in the master data would
# therefore silently duplicate measurements, so fail loudly instead.
assert df_jd.height == df_pax.height, (
    "join changed the row count - pax_counter_id is not unique in df_master"
)
df_jd.head()

pax_counter_id,time_iot,data_pax,station_id,station_name,tpname,station_longitude,station_latitude
str,str,i64,i64,str,str,f64,f64
"""083af23fd0df""","""2024-12-01 00:…",0,719,"""Bobitz""","""Bahnsteig 02""",11.3628,53.804534
"""083af23fd0df""","""2024-12-01 00:…",0,719,"""Bobitz""","""Bahnsteig 02""",11.3628,53.804534
"""083af23fd0df""","""2024-12-01 00:…",0,719,"""Bobitz""","""Bahnsteig 02""",11.3628,53.804534
"""083af23fd0df""","""2024-12-01 00:…",0,719,"""Bobitz""","""Bahnsteig 02""",11.3628,53.804534
"""083af23fd0df""","""2024-12-01 00:…",0,719,"""Bobitz""","""Bahnsteig 02""",11.3628,53.804534


In [25]:
# Smallest time_iot value in the joined data. The column is read as a string
# (see the dtype row of the output), so this is the lexicographic minimum.
(
    df_jd
    .select(pl.col("time_iot").min())
)

time_iot
str
"""2024-12-01 00:…"


### Checks

In [31]:
# Number of distinct sensor ids per station, largest first (top rows only).
(
    df_jd
    .group_by("station_name")
    .agg(
        pl.col("pax_counter_id").unique().count().alias("n_sensors")
    )
    .sort("n_sensors", descending=True)
    .head()
)

station_name,n_sensors
str,u32
"""Parchim""",3
"""Holthusen""",3
"""Schwerin Görri…",2
"""Teschenhagen""",2
"""Hagenow Land""",2


In [33]:
# Inspect the joined rows of a single station.
(
    df_jd
    .filter(pl.col("station_name") == "Holthusen")
)

pax_counter_id,time_iot,data_pax,station_id,station_name,station_longitude,station_latitude
str,str,i64,i64,str,f64,f64
"""3494545a2107""","""2025-03-05 17:…",6,2877,"""Holthusen""",11.364809,53.555831
"""3494545a2107""","""2025-03-05 17:…",6,2877,"""Holthusen""",11.364809,53.555831
"""3494545a2107""","""2025-03-05 17:…",1,2877,"""Holthusen""",11.364809,53.555831
"""3494545a2107""","""2025-03-05 17:…",5,2877,"""Holthusen""",11.364809,53.555831
"""3494545a2107""","""2025-03-05 17:…",4,2877,"""Holthusen""",11.364809,53.555831
"""3494545a2107""","""2025-03-05 17:…",8,2877,"""Holthusen""",11.364809,53.555831
"""3494545a2107""","""2025-03-05 17:…",7,2877,"""Holthusen""",11.364809,53.555831
"""3494545a2107""","""2025-03-05 17:…",6,2877,"""Holthusen""",11.364809,53.555831
"""3494545a2107""","""2025-03-05 17:…",3,2877,"""Holthusen""",11.364809,53.555831
"""3494545a2107""","""2025-03-05 17:…",5,2877,"""Holthusen""",11.364809,53.555831


In [36]:
# Persist the joined data as CSV; the file name carries region and management unit.
csv_name = f"pax_data_{region_name}_{mngmt_name}.csv"
df_jd.write_csv(csv_name)

In [37]:
# Persist the joined data as Parquet; the file name carries region and management unit.
parquet_name = f"pax_data_{region_name}_{mngmt_name}.parquet"
df_jd.write_parquet(parquet_name)