In [45]:
import polars as pl
import datetime as dt
import seaborn as sns

In [46]:
pth_data = f"C:/Users/z187070/Documents/Projects/PAXCOUNTER/data/"

In [47]:
region_name = "Ost"
mngmt_name = "Mecklenburg-Vorpommern"
mngmt_filter = "Mecklenburg-Vorpommern"

In [48]:
#pth_master = pth_data + f"master/master/{region_name}/master_data/master_data.csv"
pth_ris = pth_data + f"ris/ris/{region_name}/{mngmt_name}/ris_data/ris_data.csv"
pth_pax = pth_data + f"pax_data_{region_name}_{mngmt_name}.parquet"

## read data from ris

In [49]:
ris = pl.read_csv(pth_ris, has_header=True, separator=";")

In [50]:
ris.select(pl.col("case").unique())

case
str
"""ankunft"""
"""startbahnhof"""
"""endbahnhof"""
"""abfahrt"""


In [51]:
ris.head(2)

fahrtid,ereignis_station_id,zeit_echt,zeit_echt_verspaetung,gleis_echt,tpname_bahnsteig_echt,gattung,case
str,i64,str,i64,str,str,str,str
"""20241130-6c7c332f-4123-3241-8638-a4aa5eb9f676""",2468,"""2024-12-01 00:03:14""",4,"""02468-01-B02-G02""","""Bahnsteig 02""","""RE""","""ankunft"""
"""20241130-6c7c332f-4123-3241-8638-a4aa5eb9f676""",2468,"""2024-12-01 00:04:21""",5,"""02468-01-B02-G02""","""Bahnsteig 02""","""RE""","""abfahrt"""


In [52]:
ris.head(2).select("zeit_echt").to_series().to_list()

['2024-12-01 00:03:14', '2024-12-01 00:04:21']

In [53]:
data_filter_date = dt.datetime(2025, 4, 1)

In [54]:
base_data = ris.select(
    pl.col("fahrtid"), 
    pl.col("ereignis_station_id"),
    pl.col("zeit_echt").str.to_datetime(), 
    pl.col("case")
).filter(
    pl.col("zeit_echt") >= data_filter_date
)

In [55]:
base_data.select(pl.count())

count
u32
194488


In [56]:
base_data.head()

fahrtid,ereignis_station_id,zeit_echt,case
str,i64,datetime[μs],str
"""20250331-f653a240-d4ff-32f6-9759-0a13726f545d""",3830,2025-04-01 00:01:26,"""ankunft"""
"""20250331-086e5a85-3fa6-338f-affc-f39da060452a""",3559,2025-04-01 00:03:15,"""ankunft"""
"""20250331-086e5a85-3fa6-338f-affc-f39da060452a""",3559,2025-04-01 00:03:15,"""abfahrt"""
"""20250331-ff899981-92b9-3d82-9d06-6a70b5c37244""",2468,2025-04-01 00:03:38,"""ankunft"""
"""20250401-ebd72dea-f30f-3725-a9de-5e23c4cbf833""",6050,2025-04-01 00:04:12,"""ankunft"""


In [57]:
base_data.group_by(["fahrtid", "ereignis_station_id"]).agg(pl.count()).filter(pl.col("count") > 2).head()

fahrtid,ereignis_station_id,count
str,i64,u32


checking data quality - some arrival/departure combinations are doubles (4 rows instead of 2) - there is no 3 and no more than 4, the time seems to be the same in all cases, i will just take one of these values

this is not the case for data later than 01.05.2025??

In [58]:
base_data.group_by(["fahrtid", "ereignis_station_id"]).agg(pl.count()).filter(pl.col("count") > 4).head(2).select("fahrtid").to_series().to_list()

[]

In [59]:
base_data.filter(pl.col("fahrtid") == '20250325-353a3053-1386-374c-83d3-c200e93212cb')

fahrtid,ereignis_station_id,zeit_echt,case
str,i64,datetime[μs],str


In [60]:
time_info = base_data.filter(pl.col("case").is_in(["abfahrt", "ankunft"])).\
pivot(columns = "case", index = ["fahrtid", "ereignis_station_id"], values = "zeit_echt", aggregate_function="min").\
with_columns(
    (pl.col("abfahrt")-pl.col("ankunft")).alias("time_diff")
)

In [61]:
time_info.head()

fahrtid,ereignis_station_id,ankunft,abfahrt,time_diff
str,i64,datetime[μs],datetime[μs],duration[μs]
"""20250331-f653a240-d4ff-32f6-9759-0a13726f545d""",3830,2025-04-01 00:01:26,,
"""20250331-086e5a85-3fa6-338f-affc-f39da060452a""",3559,2025-04-01 00:03:15,2025-04-01 00:03:15,0µs
"""20250331-ff899981-92b9-3d82-9d06-6a70b5c37244""",2468,2025-04-01 00:03:38,2025-04-01 00:04:35,57s
"""20250401-ebd72dea-f30f-3725-a9de-5e23c4cbf833""",6050,2025-04-01 00:04:12,2025-04-01 00:05:10,58s
"""20250331-9b2fdb57-9909-3d79-92f2-7fd4cdcaa770""",5127,2025-04-01 00:04:31,2025-04-01 00:04:44,13s


## creating the table

In [62]:
minutes_before_ankunft = 10
minutes_after_ankunft = 5

In [63]:
df_timematch = time_info.\
    filter(pl.col("ankunft").is_not_null()).\
    select(
        pl.col("fahrtid"),
        pl.col("ereignis_station_id"),
        pl.col("ankunft"),
        (pl.col("ankunft") - dt.timedelta(minutes=minutes_before_ankunft)).alias("begin_tw"),
        (pl.col("ankunft") + dt.timedelta(minutes=minutes_after_ankunft)).alias("end_tw")
).with_columns(
    pl.col("ankunft").dt.date().alias("date")
)
df_timematch.head()

fahrtid,ereignis_station_id,ankunft,begin_tw,end_tw,date
str,i64,datetime[μs],datetime[μs],datetime[μs],date
"""20250331-f653a240-d4ff-32f6-9759-0a13726f545d""",3830,2025-04-01 00:01:26,2025-03-31 23:51:26,2025-04-01 00:06:26,2025-04-01
"""20250331-086e5a85-3fa6-338f-affc-f39da060452a""",3559,2025-04-01 00:03:15,2025-03-31 23:53:15,2025-04-01 00:08:15,2025-04-01
"""20250331-ff899981-92b9-3d82-9d06-6a70b5c37244""",2468,2025-04-01 00:03:38,2025-03-31 23:53:38,2025-04-01 00:08:38,2025-04-01
"""20250401-ebd72dea-f30f-3725-a9de-5e23c4cbf833""",6050,2025-04-01 00:04:12,2025-03-31 23:54:12,2025-04-01 00:09:12,2025-04-01
"""20250331-9b2fdb57-9909-3d79-92f2-7fd4cdcaa770""",5127,2025-04-01 00:04:31,2025-03-31 23:54:31,2025-04-01 00:09:31,2025-04-01


In [64]:
df_pax = pl.read_parquet(pth_pax).\
    select(
        pl.col("pax_counter_id"),
        pl.col("time_iot").str.to_datetime(), 
        pl.col("station_id"),
        pl.col("data_pax")).\
    filter(
        pl.col("time_iot") >= data_filter_date,
        
    ).with_columns(
        pl.col("time_iot").dt.date().alias("date")
    )

df_pax.head()

pax_counter_id,time_iot,station_id,data_pax,date
str,datetime[μs],i64,i64,date
"""083af23fd0df""",2025-04-01 00:00:20,719,0,2025-04-01
"""083af23fd0df""",2025-04-01 00:01:20,719,0,2025-04-01
"""083af23fd0df""",2025-04-01 00:02:20,719,0,2025-04-01
"""083af23fd0df""",2025-04-01 00:03:20,719,0,2025-04-01
"""083af23fd0df""",2025-04-01 00:04:20,719,0,2025-04-01


In [65]:
dates_to_process = df_timematch.select(pl.col("date").unique()).to_series().to_list()
len(dates_to_process)

61

In [66]:
for date in dates_to_process:
    df_joined_date = df_timematch.filter(
            pl.col("date") == date
        ).join(
            df_pax.filter(pl.col("date") == date),
            left_on = "ereignis_station_id",
            right_on = "station_id",
            how = "inner"
        ).filter(
            (pl.col("time_iot") >= pl.col("begin_tw")),
            (pl.col("time_iot") <= pl.col("end_tw"))
        ).with_columns(
            pl.when(pl.col("time_iot") <= pl.col("ankunft")).\
                then("before").\
                otherwise(pl.when(pl.col("time_iot") > pl.col("ankunft")).\
                    then("after").\
                    otherwise("undefined")).alias("pax_status")
        )
    
    df_before = df_joined_date.filter(pl.col("pax_status") == "before").\
        group_by(["fahrtid", "ereignis_station_id", "pax_counter_id"]).\
        agg(pl.col("data_pax").max().alias("data_pax_before"))

    df_after = df_joined_date.filter(pl.col("pax_status") == "after").\
        group_by(["fahrtid", "ereignis_station_id", "pax_counter_id"]).\
        agg(pl.col("data_pax").min().alias("data_pax_after"))
    
    # if len(df_after) != len(df_before):
    #     raise ValueError("number of entries in after/before not the same")#
    
    df_pax_agg = df_before.\
    join(df_after,
    on = ["fahrtid", "ereignis_station_id", "pax_counter_id"],
    how = "outer")

    # if len(df_pax_agg) != len(df_before):
    #     raise ValueError("number of entries changed by joining")
    
    id_table = base_data.with_columns(
            pl.col("zeit_echt").dt.date().alias("date")
        ).filter(
            pl.col("date") == date
        ).\
        group_by(["fahrtid", "ereignis_station_id"]).agg(pl.count()).select(["fahrtid", "ereignis_station_id"])

    df_pax_result_date = df_pax_agg.join(id_table, on = ["fahrtid", "ereignis_station_id"], how = "left")

    df_pax_result_date.write_parquet(f"../../data/congestion_data/congestion_data_{region_name}_{mngmt_name}_{date}.parquet")


    

  then("before").\
  then("after").\
  otherwise("undefined")).alias("pax_status")


### read all the data in again

In [67]:
dat = pl.read_parquet(f"../../data/congestion_data/*.parquet")
dat.head()

fahrtid,ereignis_station_id,pax_counter_id,data_pax_before,data_pax_after
str,i64,str,i64,i64
"""20250401-a76bae02-e295-3772-9940-4166d6a9fc2a""",719,"""083af23fd0df""",1,0
"""20250401-e0f62364-07ae-3c02-9f99-6a22ea6bdbd2""",719,"""083af23fd0df""",0,0
"""20250331-ff899981-92b9-3d82-9d06-6a70b5c37244""",2468,"""083af23ff6e7""",0,0
"""20250401-b214bf65-fd12-3014-a05f-8b117c79b311""",2468,"""083af23ff6e7""",21,0
"""20250401-231b1e7f-8706-3724-91ec-b029b643bac2""",2468,"""083af23ff6e7""",4,1


In [68]:
len(dat)

140807

In [69]:
len(base_data)

194488

## data quality checks - what is happening in singular steps

In [70]:
filter_date = date

In [71]:
first_try = df_timematch.filter(
    pl.col("date") == filter_date
).join(
    df_pax.filter(pl.col("date") == filter_date),
    left_on = "ereignis_station_id",
    right_on = "station_id",
    how = "inner"
).filter(
    (pl.col("time_iot") >= pl.col("begin_tw")),
    (pl.col("time_iot") <= pl.col("end_tw"))
).with_columns(
    pl.when(pl.col("time_iot") <= pl.col("ankunft")).\
        then("before").\
        otherwise(pl.when(pl.col("time_iot") > pl.col("ankunft")).\
            then("after").\
            otherwise("undefined")).alias("pax_status")
)

first_try.head()

  then("before").\
  then("after").\
  otherwise("undefined")).alias("pax_status")


fahrtid,ereignis_station_id,ankunft,begin_tw,end_tw,date,pax_counter_id,time_iot,data_pax,date_right,pax_status
str,i64,datetime[μs],datetime[μs],datetime[μs],date,str,datetime[μs],i64,date,str
"""20250530-ff899981-92b9-3d82-9d06-6a70b5c37244""",2468,2025-05-31 00:00:25,2025-05-30 23:50:25,2025-05-31 00:05:25,2025-05-31,"""083af23ff6e7""",2025-05-31 00:00:17,0,2025-05-31,"""before"""
"""20250530-ff899981-92b9-3d82-9d06-6a70b5c37244""",2468,2025-05-31 00:00:25,2025-05-30 23:50:25,2025-05-31 00:05:25,2025-05-31,"""083af23ff6e7""",2025-05-31 00:01:17,0,2025-05-31,"""after"""
"""20250530-ff899981-92b9-3d82-9d06-6a70b5c37244""",2468,2025-05-31 00:00:25,2025-05-30 23:50:25,2025-05-31 00:05:25,2025-05-31,"""083af23ff6e7""",2025-05-31 00:02:17,0,2025-05-31,"""after"""
"""20250530-ff899981-92b9-3d82-9d06-6a70b5c37244""",2468,2025-05-31 00:00:25,2025-05-30 23:50:25,2025-05-31 00:05:25,2025-05-31,"""083af23ff6e7""",2025-05-31 00:03:17,0,2025-05-31,"""after"""
"""20250530-ff899981-92b9-3d82-9d06-6a70b5c37244""",2468,2025-05-31 00:00:25,2025-05-30 23:50:25,2025-05-31 00:05:25,2025-05-31,"""083af23ff6e7""",2025-05-31 00:04:17,0,2025-05-31,"""after"""


In [72]:
first_try.select(pl.col("pax_status").unique())

pax_status
str
"""after"""
"""before"""


In [73]:
df_before = first_try.filter(pl.col("pax_status") == "before").\
    group_by(["fahrtid", "ereignis_station_id", "pax_counter_id"]).\
    agg(pl.col("data_pax").max().alias("data_pax_before"))

df_after = first_try.filter(pl.col("pax_status") == "after").\
    group_by(["fahrtid", "ereignis_station_id", "pax_counter_id"]).\
    agg(pl.col("data_pax").min().alias("data_pax_after"))

In [74]:
df_before.select(pl.count())

count
u32
2007


In [75]:
df_after.select(pl.count())

count
u32
2007


In [76]:
df_pax_agg = df_before.\
    join(df_after,
    on = ["fahrtid", "ereignis_station_id", "pax_counter_id"],
    how = "outer")

In [77]:
df_pax_agg.select(pl.count())

count
u32
2007


In [78]:
df_pax_agg.filter(pl.col("data_pax_after").is_null())

fahrtid,ereignis_station_id,pax_counter_id,data_pax_before,data_pax_after
str,i64,str,i64,i64


there are multiple sensors on some stations so more rows than in the cleaned timetable are expected


maximum should be 3 though

In [79]:
df_pax_agg.group_by(["fahrtid", "ereignis_station_id"]).agg(pl.count()).sort("count", descending=True).head()

fahrtid,ereignis_station_id,count
str,i64,u32
"""20250531-85bddbbb-93dc-3a50-957d-b6df1875bbee""",2877,3
"""20250531-63ffe076-e958-342c-b7de-ecd4bcaeb56a""",2877,3
"""20250531-a3b5a3e5-ba49-3729-884b-e9c60a104c6e""",2877,3
"""20250530-9b2fdb57-9909-3d79-92f2-7fd4cdcaa770""",2877,3
"""20250531-335a73d9-d4dc-3dcc-b1f8-a27d383cc1fb""",2877,3


In [80]:
id_table = base_data.with_columns(
        pl.col("zeit_echt").dt.date().alias("date")
    ).filter(
        pl.col("date") == filter_date
    ).\
    group_by(["fahrtid", "ereignis_station_id"]).agg(pl.count()).select(["fahrtid", "ereignis_station_id"])

In [81]:
id_table.head()

fahrtid,ereignis_station_id
str,i64
"""20250531-111e729c-ca37-3d2a-b644-96b92a053b1a""",5315
"""20250531-72a03faa-5797-3264-90e9-d7f08d7eac96""",5315
"""20250531-e364c3f3-3d19-3698-91eb-abbd8502077f""",5368
"""20250531-3d2fc978-da63-31e5-af96-d2c37947188e""",2877
"""20250531-869bd327-460e-35d0-89e9-eb9dd3a32c01""",5756


In [82]:
df_pax_result = df_pax_agg.join(id_table, on = ["fahrtid", "ereignis_station_id"], how = "left")

In [83]:
df_pax_result.head()

fahrtid,ereignis_station_id,pax_counter_id,data_pax_before,data_pax_after
str,i64,str,i64,i64
"""20250530-ff899981-92b9-3d82-9d06-6a70b5c37244""",2468,"""083af23ff6e7""",0,0
"""20250531-dfaca6cf-e249-3537-8608-07b14302a767""",2468,"""083af23ff6e7""",0,0
"""20250531-98f84d4a-4740-3f3d-8279-168e0e6ecfb7""",2468,"""083af23ff6e7""",9,2
"""20250531-79fc2284-da1f-363c-95a1-4b323356a0e8""",2468,"""083af23ff6e7""",8,3
"""20250531-79356123-042a-34ee-beb1-b24aff8863c1""",2468,"""083af23ff6e7""",6,0


### something isn't quite right with the data it seems - if i merge the other way around, null entries exist but those sensors should work

In [84]:
result = id_table.join(df_pax_agg, 
    on = ["fahrtid", "ereignis_station_id"],
    how = "outer")

result

fahrtid,ereignis_station_id,pax_counter_id,data_pax_before,data_pax_after
str,i64,str,i64,i64
"""20250530-ff899981-92b9-3d82-9d06-6a70b5c37244""",2468,"""083af23ff6e7""",0,0
"""20250531-dfaca6cf-e249-3537-8608-07b14302a767""",2468,"""083af23ff6e7""",0,0
"""20250531-98f84d4a-4740-3f3d-8279-168e0e6ecfb7""",2468,"""083af23ff6e7""",9,2
"""20250531-79fc2284-da1f-363c-95a1-4b323356a0e8""",2468,"""083af23ff6e7""",8,3
"""20250531-79356123-042a-34ee-beb1-b24aff8863c1""",2468,"""083af23ff6e7""",6,0
"""20250531-ae1946cc-8037-3f22-8cb8-2646523b59ca""",2468,"""083af23ff6e7""",4,4
"""20250531-03ae607e-607f-3d23-9d83-a8b9ec6bf7ba""",2468,"""083af23ff6e7""",13,3
"""20250531-e602e1b4-f150-3baa-b31b-4426e89b9bf8""",2468,"""083af23ff6e7""",9,1
"""20250531-c288f1e4-9ee5-38e9-b41e-5fd9898b42f3""",2468,"""083af23ff6e7""",5,0
"""20250531-a01badeb-dc57-3c58-ae7d-c96f06e586c8""",2468,"""083af23ff6e7""",1,0


In [85]:
result_nulls = result.filter(pl.col("pax_counter_id").is_null())
result_nulls.head()

fahrtid,ereignis_station_id,pax_counter_id,data_pax_before,data_pax_after
str,i64,str,i64,i64
"""20250531-9e7c7e60-81e4-3c79-8474-802ea12b78bb""",6961,,,
"""20250531-fce9dd6b-6f59-3939-9fd8-5a13a0b9e35c""",6173,,,
"""20250531-79fc2284-da1f-363c-95a1-4b323356a0e8""",4864,,,
"""20250531-def53227-64a8-3e07-8ab2-f7582fff01be""",4166,,,
"""20250531-83336008-1661-3dd4-93a2-f45fc1538246""",3830,,,


In [86]:
ris.filter(pl.col("ereignis_station_id") == 7983,
           pl.col("fahrtid") == '20250515-51adc984-0a84-3e60-a391-665c5d3c6136')

fahrtid,ereignis_station_id,zeit_echt,zeit_echt_verspaetung,gleis_echt,tpname_bahnsteig_echt,gattung,case
str,i64,str,i64,str,str,str,str
"""20250515-51adc984-0a84-3e60-a391-665c5d3c6136""",7983,"""2025-05-15 09:17:20""",1,"""07983-01-B01-G01""","""Bahnsteig 01""","""RB""","""abfahrt"""


In [87]:
pl.read_parquet(pth_pax).filter(pl.col("station_id") == 7983).\
filter(pl.col("time_iot") > "2025-05-15 09:06:20").\
filter(pl.col("time_iot") < "2025-05-15 09:20:20")

pax_counter_id,time_iot,data_pax,station_id,station_name,tpname,station_longitude,station_latitude
str,str,i64,i64,str,str,f64,f64
"""3494545a2a77""","""2025-05-15 09:07:14""",0,7983,"""Tessin West""","""Bahnsteig 01""",12.44257,54.034436
"""3494545a2a77""","""2025-05-15 09:08:14""",0,7983,"""Tessin West""","""Bahnsteig 01""",12.44257,54.034436
"""3494545a2a77""","""2025-05-15 09:09:14""",0,7983,"""Tessin West""","""Bahnsteig 01""",12.44257,54.034436
"""3494545a2a77""","""2025-05-15 09:10:14""",0,7983,"""Tessin West""","""Bahnsteig 01""",12.44257,54.034436
"""3494545a2a77""","""2025-05-15 09:11:14""",0,7983,"""Tessin West""","""Bahnsteig 01""",12.44257,54.034436
"""3494545a2a77""","""2025-05-15 09:12:14""",0,7983,"""Tessin West""","""Bahnsteig 01""",12.44257,54.034436
"""3494545a2a77""","""2025-05-15 09:13:15""",0,7983,"""Tessin West""","""Bahnsteig 01""",12.44257,54.034436
"""3494545a2a77""","""2025-05-15 09:14:14""",0,7983,"""Tessin West""","""Bahnsteig 01""",12.44257,54.034436
"""3494545a2a77""","""2025-05-15 09:15:14""",0,7983,"""Tessin West""","""Bahnsteig 01""",12.44257,54.034436
"""3494545a2a77""","""2025-05-15 09:16:14""",0,7983,"""Tessin West""","""Bahnsteig 01""",12.44257,54.034436


In [88]:
pl.Config(fmt_str_lengths=100)

<polars.config.Config at 0x1efde1364d0>

### rolling mean

In [98]:
df_pax.select(pl.col("pax_counter_id").unique()).to_series().to_list()

['244cab01f99f',
 '3494545a213b',
 '244cab00fa6b',
 'd48afc8ed833',
 '244cab06371f',
 '244cab06fe0f',
 '244cab068dcb',
 '244cab0228ef',
 '244cab06259f',
 'e831cdc2581f',
 '244cab006177',
 '244cab062a1b',
 '244cab02425b',
 '244cab02113b',
 'a0a3b38c53a3',
 'e831cdc2595b',
 '244cab0700ef',
 '083af23fd0df',
 '1097bdd7722b',
 '244cab06feab',
 '3494545a2a77',
 '244cab0712df',
 '244cab021587',
 '3494545a2107',
 '3494545a2a1f',
 'e831cdc25807',
 '244cab03544b',
 'e831cdc26497',
 'e831cdc25893',
 '244cab05d187',
 '244cab05572b',
 '244cab06b913',
 'a0a3b330e483',
 '244cab022f13',
 '244cab07105b',
 '244cab071093',
 '244cab039cf3',
 'd48afc8ec50f',
 'a0a3b32f794b',
 '34ab9540699b',
 '244cab065dbb',
 '244cab03d43f',
 'a0a3b38c5463',
 '244cab01f457',
 '244cab01f2b7',
 '244cab01a103',
 'c45bbe93286f',
 '083af23ff6e7',
 '244cab03a43f',
 '244cab02bddb',
 'e831cdc25847',
 'a0a3b38c5473',
 'e831cdc2597b',
 'e831cdc2589b',
 'd48afc8efb9f',
 'a0a3b3311b9f',
 '244cab03327b',
 '244cab044207',
 '244cab03b9ff

In [None]:
df_pax.group_by(["pax_counter_id"]).\
    agg(pl.col("data_pax").rolling_mean(window_size=10)).\
    explode(pl.col("data_pax"))

pax_counter_id,station_id,data_pax
str,i64,f64
"""244cab039c97""",4812,
"""244cab039c97""",4812,
"""244cab039c97""",4812,
"""244cab039c97""",4812,
"""244cab039c97""",4812,
"""244cab039c97""",4812,
"""244cab039c97""",4812,
"""244cab039c97""",4812,
"""244cab039c97""",4812,
"""244cab039c97""",4812,0.0
