In [138]:
import pandas as pd
import numpy as np
import os, itertools
import woodwork as ww

In [139]:
# actual data
actual_data = {rname: pd.read_csv(f'actualdata_{rname}.csv') for rname in ['trips', 'stop_times']}

# target data
dir_target = 'target_data_vbn_modified'
rnames = ['trips', 'stops', 'stop_times', 'agency', 'routes', 'transfers']
target_data = {rname: pd.read_csv(f'{dir_target}\\{rname}.csv', low_memory=False) for rname in rnames}

# UCC discovery

In [178]:
def is_valid(r, X):
    return r.groupby(list(X)).size().shape[0] == r.shape[0]

def size_k_candidates(columns, k, uccs):
    candidates = [set(col) for col in itertools.combinations(columns, k)]
    for X in candidates.copy():
        for Y in uccs:
            if Y.issubset(X):
                candidates.remove(X)
                break
    return candidates

def find_uccs(r):
    uccs = []
    columns = list(r.columns)
    for k in range(1, r.shape[1]+1):
        for X in size_k_candidates(columns, k, uccs):
            if is_valid(r, X):
                uccs.append(X)
                if k == 1:
                    columns.remove(*X)
    return uccs

# note: using position list indexes for validation was not as efficient as using the groupby operator

In [141]:
# UCCs
for rname, r in actual_data.items():
    print(rname, find_uccs(r))
for rname, r in target_data.items():
    print(rname, find_uccs(r))

trips [{'EntityId'}, {'TripId'}]
stop_times [{'TripId'}]
trips [{'trip_id'}]
stops [{'stop_id'}]
stop_times [{'trip_id', 'stop_sequence'}]
agency [{'agency_id'}]
routes [{'route_id'}]
transfers []


# data profiling

data types

In [144]:
# tables with actual data
for k, v in actual_data.items():
    print(k)
    print(v.dtypes)
    print()

trips
EntityId      int64
TripId        int64
RouteId       int64
StartTime    object
StartDate     int64
dtype: object

stop_times
TripId                    int64
StopId                    int64
StopSequence            float64
ArrivalDelay            float64
ArrivalTime             float64
DepartureDelay          float64
DepartureTime           float64
ScheduleRelationship     object
dtype: object



In [145]:
# tables with target data
for k, v in target_data.items():
    print(k)
    print(v.dtypes)
    print()

trips
route_id            int64
service_id          int64
trip_id             int64
trip_headsign      object
trip_short_name    object
direction_id        int64
dtype: object

stops
stop_id        int64
stop_name     object
stop_lat     float64
stop_lon     float64
dtype: object

stop_times
trip_id            int64
arrival_time      object
departure_time    object
stop_id            int64
stop_sequence      int64
pickup_type        int64
drop_off_type      int64
dtype: object

agency
agency_id       int64
agency_name    object
dtype: object

routes
route_id             int64
agency_id            int64
route_short_name    object
route_type           int64
dtype: object

transfers
from_stop_id           int64
to_stop_id             int64
transfer_type          int64
min_transfer_time    float64
dtype: object



check for inclusion dependecies

In [173]:
X1 = set(actual_data['trips']['RouteId'])
X2 = set(target_data['trips']['route_id'])
len(X1.intersection(X2)), len(X2), len(X1)
# no inclusion dependency even though this was expected

(191, 2859, 346)

In [174]:
# or rather this can be shown using a join
Y = pd.merge(actual_data['trips'], target_data['trips'], how='left', left_on=['RouteId'], right_on=['route_id'])
Y.loc[Y['route_id'].isna() & ~Y['RouteId'].isna()].empty

False

In [175]:
X1 = set(actual_data['trips']['TripId'])
X2 = set(target_data['trips']['trip_id'])
len(X1.intersection(X2)), len(X2), len(X1)
# no inclusion dependency even though this was expected

(1718, 180017, 2532)

In [176]:
# or rather this can be shown using a join
Y = pd.merge(actual_data['trips'], target_data['trips'], how='left', left_on=['TripId'], right_on=['trip_id'])
Y.loc[Y['trip_id'].isna() & ~Y['TripId'].isna()].empty

False

combine tables using joins

In [185]:
# step 1: join actual trips and stop_times
r1 = pd.merge(actual_data['stop_times'], actual_data['trips'], how='left', left_on=['TripId'], right_on=['TripId'])
r1

Unnamed: 0,TripId,StopId,StopSequence,ArrivalDelay,ArrivalTime,DepartureDelay,DepartureTime,ScheduleRelationship,EntityId,RouteId,StartTime,StartDate
0,254163638,9057862,,,,0.0,,Scheduled,254163638,43975,19:44:00,20230508
1,282351984,9013478,,,,0.0,,Scheduled,282351984,43245,17:12:00,20230509
2,282385362,9058008,5.0,-60.0,,0.0,,Scheduled,282385362,43195,20:03:00,20230508
3,264505093,9050640,11.0,30.0,,30.0,,Scheduled,264505093,59935,19:12:00,20230508
4,282384857,9049320,,,,0.0,,Scheduled,282384857,43216,04:37:00,20230509
...,...,...,...,...,...,...,...,...,...,...,...,...
2527,282384687,9044660,,,,0.0,,Scheduled,282384687,43212,08:36:00,20230509
2528,282181087,9046671,,,,0.0,,Scheduled,282181087,34928,10:54:00,20230509
2529,282351593,8000049,,,,0.0,,Scheduled,282351593,43240,09:51:00,20230509
2530,282181685,9066023,,,,0.0,,Scheduled,282181685,34514,13:37:00,20230509


In [192]:
# step 2a: join routes and agency (target data)
r2 = pd.merge(target_data['routes'], target_data['agency'], how='left', left_on=['agency_id'], right_on=['agency_id'])
r2.head(3)

Unnamed: 0,route_id,agency_id,route_short_name,route_type,agency_name
0,71026,1060,SEV24,1,S-Bahn Hamburg
1,71025,1060,SEV10,1,S-Bahn Hamburg
2,70978,1060,SEV21,2,S-Bahn Hamburg


In [193]:
# step 2b: join the two relations using the Route Id
r3 = pd.merge(r1, r2, how='left', left_on=['RouteId'], right_on=['route_id'])
r3.head(3)

# finally, it is possible to...
# 1. determine the delay according to different agencies
# 2. determine the delay according to means of transportation

Unnamed: 0,TripId,StopId,StopSequence,ArrivalDelay,ArrivalTime,DepartureDelay,DepartureTime,ScheduleRelationship,EntityId,RouteId,StartTime,StartDate,route_id,agency_id,route_short_name,route_type,agency_name
0,254163638,9057862,,,,0.0,,Scheduled,254163638,43975,19:44:00,20230508,,,,,
1,282351984,9013478,,,,0.0,,Scheduled,282351984,43245,17:12:00,20230509,,,,,
2,282385362,9058008,5.0,-60.0,,0.0,,Scheduled,282385362,43195,20:03:00,20230508,,,,,


In [194]:
# step 3: join TARGET trips and stop_times
r4 = pd.merge(target_data['stop_times'], target_data['trips'], how='left', left_on=['trip_id'], right_on=['trip_id'])
r4.head(3)

Unnamed: 0,trip_id,arrival_time,departure_time,stop_id,stop_sequence,pickup_type,drop_off_type,route_id,service_id,trip_headsign,trip_short_name,direction_id
0,282300445,23:20:00,23:20:00,8002557,0,0,0,71026,1,Buxtehude,1000,0
1,282300445,23:25:00,23:25:00,2804001,1,0,0,71026,1,Buxtehude,1000,0
2,282300445,23:30:00,23:30:00,2047308,2,0,0,71026,1,Buxtehude,1000,0


In [195]:
# step 4: join r1 and r4 on Trip Id and Stop Id
# their schemata match for the most part
# however, using a left join enables to get all delays even if there is no join match
# (possible because there is no inclusion dependency)
r5 = pd.merge(r1, r4, how='inner', left_on=['TripId', 'StopId'], right_on=['trip_id', 'stop_id'])
r5

Unnamed: 0,TripId,StopId,StopSequence,ArrivalDelay,ArrivalTime,DepartureDelay,DepartureTime,ScheduleRelationship,EntityId,RouteId,...,departure_time,stop_id,stop_sequence,pickup_type,drop_off_type,route_id,service_id,trip_headsign,trip_short_name,direction_id
0,264505093,9050640,11.0,30.0,,30.0,,Scheduled,264505093,59935,...,19:34:00,9050640,11,0,0,59935,3514,Cloppenburg Bahnhof,39,0
1,282384857,9049320,,,,0.0,,Scheduled,282384857,43216,...,4:37:00,9049320,0,0,0,43216,1150,Rendsburg,11798,1
2,282188083,9090291,,,,0.0,,Scheduled,282188083,26771,...,20:49:00,9090291,0,0,0,26771,1116,Bremen Hbf,83383,0
3,282351771,9023280,7.0,60.0,,0.0,,Scheduled,282351771,43243,...,20:46:00,9023280,7,0,0,43243,394,Göttingen,14132,1
4,282184222,9049079,,,,0.0,,Scheduled,282184222,68486,...,12:43:00,9049079,0,0,0,68486,77,Lübeck Hbf,21021,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1722,282347652,9066582,,,,0.0,,Scheduled,282347652,68246,...,1:15:00,9066582,0,0,0,68246,219,Lübeck Hbf,11645,0
1723,279028155,9014285,,,,0.0,,Scheduled,279028155,35759,...,20:36:00,9014285,0,0,0,35759,32,Gröpelingen,01883,1
1724,279027722,9014184,,,,0.0,,Scheduled,279027722,35760,...,21:01:00,9014184,0,0,0,35760,35,Gröpelingen,01407,1
1725,282384687,9044660,,,,0.0,,Scheduled,282384687,43212,...,8:36:00,9044660,0,0,0,43212,1203,Bad St Peter-Ording,11804,1
