In [1]:
import pandas as pd
import numpy as np
import os, itertools
import woodwork as ww

In [19]:
# actual data
# One DataFrame per table, read from 'actualdata_<name>.csv' in the working directory.
actual_data = {rname: pd.read_csv(f'actualdata_{rname}.csv') for rname in ['trips', 'stop_times']}

# target data
# One DataFrame per table, read from '<dir_target>/<name>.csv'.
# The path is built with os.path.join so the notebook also runs on systems
# where the backslash is not a path separator.
dir_target = 'target_data_vbn_modified'
rnames = ['trips', 'stops', 'stop_times', 'agency', 'routes', 'transfers']
target_data = {
    rname: pd.read_csv(os.path.join(dir_target, f'{rname}.csv'), low_memory=False)
    for rname in rnames
}

In [6]:
def is_valid(r, X):
    """Check whether the column combination ``X`` is unique in ``r``.

    Parameters
    ----------
    r : pandas.DataFrame
        Relation to inspect.
    X : iterable of column labels
        Column combination to test.

    Returns
    -------
    bool
        True when grouping ``r`` by the columns in ``X`` yields exactly one
        group per row, i.e. no two rows share the same values on ``X``.

    Notes
    -----
    The check relies on ``DataFrame.groupby`` with its default settings.
    With those defaults pandas leaves rows with missing key values out of the
    groups, so a combination containing missing values is reported as not
    unique.
    """
    key_columns = list(X)
    group_sizes = r.groupby(key_columns).size()
    # One group per row <=> every row has a distinct key.
    return len(group_sizes) == len(r)

def size_k_candidates(columns, k, uccs):
    """Enumerate the size-``k`` column combinations that are still worth testing.

    Parameters
    ----------
    columns : sequence
        Column labels to combine.
    k : int
        Number of columns per combination.
    uccs : iterable of set
        Unique column combinations that are already known.

    Returns
    -------
    list of set
        Every ``k``-element combination of ``columns`` (in
        ``itertools.combinations`` order) that does not contain one of the
        known ``uccs`` as a subset. A superset of a known unique combination
        cannot be minimal, so it is skipped.
    """
    def contains_known_ucc(combo):
        return any(known.issubset(combo) for known in uccs)

    return [
        combo
        for combo in map(set, itertools.combinations(columns, k))
        if not contains_known_ucc(combo)
    ]

def find_uccs(r):
    """Find the minimal unique column combinations (UCCs) of ``r``.

    The search proceeds level by level: all single columns first, then pairs,
    and so on up to the full column set. Combinations that contain an already
    found UCC are never tested, so only minimal combinations are reported.

    Parameters
    ----------
    r : pandas.DataFrame
        Relation to analyse.

    Returns
    -------
    list of set
        The discovered UCCs in discovery order (smaller combinations first).
    """
    found = []
    remaining = list(r.columns)
    n_cols = r.shape[1]
    k = 1
    while k <= n_cols:
        for combo in size_k_candidates(remaining, k, found):
            if not is_valid(r, combo):
                continue
            found.append(combo)
            if k == 1:
                # A column that is unique on its own can never be part of a
                # larger minimal combination, so drop it from later levels.
                (column,) = combo
                remaining.remove(column)
        k += 1
    return found

In [9]:
# UCCs
# Print the unique column combinations of every table: actual data first,
# then target data.
for tables in (actual_data, target_data):
    for rname in tables:
        print(rname, find_uccs(tables[rname]))

trips [{'EntityId'}, {'TripId'}]
stop_times [{'TripId'}]
trips [{'trip_id'}]
stops [{'stop_id'}]
stop_times [{'trip_id', 'stop_sequence'}]
agency [{'agency_id'}]
routes [{'route_id'}]
transfers []


In [28]:
# Left-join the actual trips onto the target trips by trip id; actual rows
# without a matching target trip keep missing values in the target columns.
actual_data['trips'].merge(
    target_data['trips'],
    how='left',
    left_on='TripId',
    right_on='trip_id',
)

Unnamed: 0,EntityId,TripId,RouteId,StartTime,StartDate,route_id,service_id,trip_id,trip_headsign,trip_short_name,direction_id
0,254163638,254163638,43975,19:44:00,20230508,,,,,,
1,282351984,282351984,43245,17:12:00,20230509,,,,,,
2,282385362,282385362,43195,20:03:00,20230508,,,,,,
3,264505093,264505093,59935,19:12:00,20230508,59935.0,3514.0,264505093.0,Cloppenburg Bahnhof,39,0.0
4,282384857,282384857,43216,04:37:00,20230509,43216.0,1150.0,282384857.0,Rendsburg,11798,1.0
...,...,...,...,...,...,...,...,...,...,...,...
2527,282384687,282384687,43212,08:36:00,20230509,43212.0,1203.0,282384687.0,Bad St Peter-Ording,11804,1.0
2528,282181087,282181087,34928,10:54:00,20230509,,,,,,
2529,282351593,282351593,43240,09:51:00,20230509,43240.0,71.0,282351593.0,Salzgitter-Lebenstedt,14270,1.0
2530,282181685,282181685,34514,13:37:00,20230509,,,,,,


In [31]:
# Left-join the actual trips onto the target routes by route id; actual rows
# without a matching target route keep missing values in the route columns.
actual_data['trips'].merge(
    target_data['routes'],
    how='left',
    left_on='RouteId',
    right_on='route_id',
)

Unnamed: 0,EntityId,TripId,RouteId,StartTime,StartDate,route_id,agency_id,route_short_name,route_type
0,254163638,254163638,43975,19:44:00,20230508,,,,
1,282351984,282351984,43245,17:12:00,20230509,,,,
2,282385362,282385362,43195,20:03:00,20230508,,,,
3,264505093,264505093,59935,19:12:00,20230508,59935.0,1180.0,930,3.0
4,282384857,282384857,43216,04:37:00,20230509,43216.0,1689.0,RB75,2.0
...,...,...,...,...,...,...,...,...,...
2527,282384687,282384687,43212,08:36:00,20230509,43212.0,1689.0,RB64,2.0
2528,282181087,282181087,34928,10:54:00,20230509,,,,
2529,282351593,282351593,43240,09:51:00,20230509,43240.0,1689.0,RB48,2.0
2530,282181685,282181685,34514,13:37:00,20230509,,,,
