In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('Data/FastCheckTUData.csv', encoding='utf-8')

In [3]:

# Constants for Denmark Bounding Box (Rough approximation)
DK_MIN_LAT, DK_MAX_LAT = 54.5, 57.8
DK_MIN_LON, DK_MAX_LON = 8.0, 15.2

def haversine_np(lat1, lon1, lat2, lon2):
    """
    Vectorized Haversine distance calculation (returns km).
    """
    R = 6371.0  # Earth radius in kilometers

    dlat = np.radians(lat2 - lat1)
    dlon = np.radians(lon2 - lon1)
    a = np.sin(dlat / 2)**2 + np.cos(np.radians(lat1)) * np.cos(np.radians(lat2)) * np.sin(dlon / 2)**2
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))
    return R * c

def check_bounds(df, lat_col, lon_col):
    """Returns a boolean Series: True if inside Denmark, False otherwise."""
    return (
        (df[lat_col] >= DK_MIN_LAT) & (df[lat_col] <= DK_MAX_LAT) &
        (df[lon_col] >= DK_MIN_LON) & (df[lon_col] <= DK_MAX_LON)
    )

def validate_speeds(df):
    """
    Check implied speed vs transport mode limits.
    Returns a boolean series: True if speed is IMPOSSIBLE/HARD WRONG.
    """
    # Calculate implied speed (km/h)
    # Avoid division by zero
    duration_hours = df['stagedurationmin_raw'] / 60.0
    implied_speed = df['stagelength_raw'] / duration_hours.replace(0, np.nan)
    
    # Define max realistic speeds (km/h) with a buffer
    # Mode 1: Walk, 2: Bike, 3-25: Car/Motorized
    
    # Initialize flags
    is_speed_impossible = pd.Series(False, index=df.index)
    
    # 1. Walking: Flag if > 15 km/h (world record pace buffer)
    walk_mask = (df['transportmiddel'] == 1)
    is_speed_impossible |= (walk_mask & (implied_speed > 15))
    
    # 2. Biking: Flag if > 60 km/h (fast e-bike downhill buffer)
    bike_mask = (df['transportmiddel'] == 2)
    is_speed_impossible |= (bike_mask & (implied_speed > 60))
    
    # 3. Car/Motorized: Flag if > 180 km/h
    car_mask = (df['transportmiddel'].between(3, 25))
    is_speed_impossible |= (car_mask & (implied_speed > 180))

    return is_speed_impossible

def perform_fast_validation(df):
    """
    Main driver for the fast pass. 
    Adds flag columns and a 'classification' column.
    """
    print("--- Starting Fast Validation Pass ---")
    
    # 1. Geo Check (Haversine) - Trip Distance
    df['calc_dist_geo'] = haversine_np(
        df['start_lat'], df['start_lon'], 
        df['til_lat'], df['til_lon']
    )

    # 2. Anchor Checks (Home/Work)
    # Handle NaNs automatically (Haversine returns NaN if input is NaN)
    df['dist_start_home'] = haversine_np(df['start_lat'], df['start_lon'], df['home_lat'], df['home_lon'])
    df['dist_end_home']   = haversine_np(df['til_lat'],   df['til_lon'],   df['home_lat'], df['home_lon'])
    
    # 3. Bounding Box Checks
    start_ok = check_bounds(df, 'start_lat', 'start_lon')
    end_ok   = check_bounds(df, 'til_lat', 'til_lon')
    
    # 4. Speed Checks
    flag_speed_impossible = validate_speeds(df)
    
    # --- FLAGGING LOGIC ---
    
    # Flag: Coordinates outside Denmark or (0,0)
    df['flag_coords_bad'] = (~start_ok) | (~end_ok)
    
    # Flag: Impossible speed
    df['flag_speed_bad'] = flag_speed_impossible
    
    # Flag: Zero distance trip but not marked as such? (Optional sanity check)
    df['flag_zero_dist'] = (df['calc_dist_geo'] < 0.005) & (df['stagelength_raw'] > 1.0)

    # --- CLASSIFICATION ---
    # Default to trusted
    df['validation_status'] = 'trusted'
    
    # Identify Hard Wrongs
    # Criteria: Bad coords OR Impossible Speed
    hard_wrong_mask = (df['flag_coords_bad']) | (df['flag_speed_bad'])
    df.loc[hard_wrong_mask, 'validation_status'] = 'hard_wrong'
    
    # (Optional) Mild Suspicion logic could go here
    # e.g., if dist_start_home > 200km implies suspicious context, etc.
    
    return df


In [4]:
# Run the fast validation on the dataframe
df = perform_fast_validation(df)

--- Starting Fast Validation Pass ---


In [5]:
# Save the validated dataframe as a checkpoint
df.to_csv('validated_df_checkpoint.csv', index=False)
print("Validated dataframe saved to 'validated_df_checkpoint.csv'")

Validated dataframe saved to 'validated_df_checkpoint.csv'


In [6]:
print("Validation Status Counts:")
print(df['validation_status'].value_counts())

print("\nExample Hard Wrong Rows:")
df[df['validation_status'] == 'hard_wrong'].head()

Validation Status Counts:
validation_status
trusted       228204
hard_wrong      7560
Name: count, dtype: int64

Example Hard Wrong Rows:


Unnamed: 0,SessionId,homeText_raw,workText_raw,startDayText_raw,startstedadrsogeord,startTripText_raw,daystartmuncode,turid,tiladrsogeord,tiladrtext_raw,...,start_lon,til_lat,til_lon,calc_dist_geo,dist_start_home,dist_end_home,flag_coords_bad,flag_speed_bad,flag_zero_dist,validation_status
107,347096,"Plantagevej 1, 4941 Bandholm","Kofoedsminde - Udviklingscentret, Højbovej 6, ...","Plantagevej 1, 4941 Bandholm",,"kofoedsminde - udviklingscentret, højbovej 6, ...",,2118225,"Region Sjællands Hovedkontor, Rødbyhavn","kofoedsmindes hovedkontor, rødbyhavn",...,11.351456,0.0,4.511256,6110.319369,20.454239,6130.283579,True,False,False,hard_wrong
108,347096,"Plantagevej 1, 4941 Bandholm","Kofoedsminde - Udviklingscentret, Højbovej 6, ...","Plantagevej 1, 4941 Bandholm","Region Sjællands Hovedkontor, Rødbyhavn","kofoedsmindes hovedkontor, rødbyhavn",,2118226,,"kofoedsminde - udviklingscentret, højbovej 6, ...",...,4.511256,54.662786,11.351456,6110.319369,6130.283579,20.454239,True,False,False,hard_wrong
150,347115,"Toftevangen 77, 4130 Viby Sjælland",,,,"toftevangen 77, 4130 viby sjælland",,2118279,Gelsted 4160,stenagergårdsvej\r\ngelsted\r\n4130 herlufmagle,...,12.032662,0.0,4.511256,6213.935173,0.0,6213.935173,True,False,False,hard_wrong
151,347115,"Toftevangen 77, 4130 Viby Sjælland",,,Gelsted 4160,stenagergårdsvej\r\ngelsted\r\n4130 herlufmagle,,2118280,,"toftevangen 77, 4130 viby sjælland",...,4.511256,55.545711,12.032662,6213.935173,6213.935173,0.0,True,False,False,hard_wrong
227,347147,"Kejlstrupvej 15, 8600 Silkeborg",,,,"kejlstrupvej 15, 8600 silkeborg",,2118373,"Nørrevænget 42, 8600","nørrevænget 42, 8600 silkeborg",...,9.554954,56.183815,9.545761,0.789938,0.0,0.789938,False,True,False,hard_wrong


In [7]:
import correctors

In [8]:
import importlib

In [9]:
hard_wrong_df = df[df['validation_status'] == 'hard_wrong']

print(f"Processing {len(hard_wrong_df)} hard_wrong rows")

corrections = correctors.repair_hard_wrongs(hard_wrong_df)

print("Validation status in corrections:")
print(corrections['validation_status'].value_counts())

print("Corrections completed.")

Processing 7560 hard_wrong rows
--- Starting Correction Pipeline on 7560 rows ---


Correcting rows:   0%|          | 0/7560 [00:00<?, ?it/s]

Loading Address Index...
Building Optimized Postcode Index...
Index Built. Retriever Ready.


Correcting rows:   1%|          | 88/7560 [01:58<4:16:45,  2.06s/it] 

Saved checkpoint at row 3700


Correcting rows:   1%|▏         | 98/7560 [02:13<2:25:52,  1.17s/it]

Saved checkpoint at row 3900


Correcting rows:   3%|▎         | 225/7560 [04:28<1:52:40,  1.09it/s]

Saved checkpoint at row 11100


Correcting rows:   4%|▍         | 288/7560 [05:39<3:21:42,  1.66s/it]

Saved checkpoint at row 14900


Correcting rows:   5%|▌         | 378/7560 [07:08<1:06:07,  1.81it/s]

Saved checkpoint at row 20800


Correcting rows:   7%|▋         | 527/7560 [10:58<1:27:23,  1.34it/s] 

Saved checkpoint at row 27900


Correcting rows:   9%|▉         | 711/7560 [14:38<2:55:08,  1.53s/it]

Saved checkpoint at row 33800


Correcting rows:  10%|▉         | 751/7560 [15:34<4:44:05,  2.50s/it]

Saved checkpoint at row 35000


Correcting rows:  10%|█         | 763/7560 [15:54<3:04:30,  1.63s/it]

Saved checkpoint at row 35400


Correcting rows:  11%|█         | 801/7560 [17:07<4:14:19,  2.26s/it]

Saved checkpoint at row 36400


Correcting rows:  11%|█         | 809/7560 [17:24<6:17:07,  3.35s/it]

Saved checkpoint at row 36500


Correcting rows:  14%|█▎        | 1022/7560 [21:45<1:55:14,  1.06s/it]

Saved checkpoint at row 43500


Correcting rows:  14%|█▍        | 1092/7560 [22:43<1:38:53,  1.09it/s]

Saved checkpoint at row 47000


Correcting rows:  15%|█▌        | 1137/7560 [23:57<1:01:37,  1.74it/s] 

Saved checkpoint at row 49000


Correcting rows:  15%|█▌        | 1148/7560 [24:02<38:12,  2.80it/s]  

Saved checkpoint at row 49500


Correcting rows:  16%|█▌        | 1202/7560 [24:57<1:30:25,  1.17it/s]

Saved checkpoint at row 51300


Correcting rows:  19%|█▉        | 1471/7560 [29:20<2:14:02,  1.32s/it] 

Saved checkpoint at row 58800


Correcting rows:  20%|█▉        | 1494/7560 [29:38<1:22:45,  1.22it/s]

Saved checkpoint at row 59100


Correcting rows:  22%|██▏       | 1655/7560 [32:01<2:16:49,  1.39s/it]

Saved checkpoint at row 63800


Correcting rows:  26%|██▋       | 1990/7560 [37:37<1:22:35,  1.12it/s]

Saved checkpoint at row 73000


Correcting rows:  26%|██▋       | 2003/7560 [37:49<1:19:00,  1.17it/s]

Saved checkpoint at row 73400


Correcting rows:  27%|██▋       | 2016/7560 [38:03<2:13:39,  1.45s/it]

Saved checkpoint at row 73600


Correcting rows:  28%|██▊       | 2099/7560 [39:09<3:07:39,  2.06s/it]

Saved checkpoint at row 75000


Correcting rows:  30%|███       | 2278/7560 [41:57<35:12,  2.50it/s]  

Saved checkpoint at row 79300


Correcting rows:  30%|███       | 2295/7560 [42:15<1:17:38,  1.13it/s]

Saved checkpoint at row 79900


Correcting rows:  31%|███       | 2306/7560 [42:26<1:18:07,  1.12it/s]

Saved checkpoint at row 80200


Correcting rows:  31%|███▏      | 2365/7560 [43:51<1:01:35,  1.41it/s] 

Saved checkpoint at row 82400


Correcting rows:  32%|███▏      | 2440/7560 [45:01<51:05,  1.67it/s]  

Saved checkpoint at row 83900


Correcting rows:  34%|███▍      | 2608/7560 [47:41<49:39,  1.66it/s]  

Saved checkpoint at row 86400


Correcting rows:  35%|███▌      | 2659/7560 [48:19<42:40,  1.91it/s]  

Saved checkpoint at row 87400


Correcting rows:  37%|███▋      | 2812/7560 [51:13<1:37:41,  1.23s/it]

Saved checkpoint at row 90000


Correcting rows:  37%|███▋      | 2816/7560 [51:22<2:42:48,  2.06s/it]

Saved checkpoint at row 90100


Correcting rows:  38%|███▊      | 2838/7560 [51:48<2:23:46,  1.83s/it]

Saved checkpoint at row 91000


Correcting rows:  39%|███▊      | 2925/7560 [53:34<1:00:35,  1.27it/s]

Saved checkpoint at row 92600


Correcting rows:  40%|████      | 3055/7560 [55:27<1:08:46,  1.09it/s]

Saved checkpoint at row 95100


Correcting rows:  41%|████      | 3097/7560 [55:59<51:29,  1.44it/s]  

Saved checkpoint at row 96300


Correcting rows:  42%|████▏     | 3203/7560 [57:49<1:00:57,  1.19it/s]

Saved checkpoint at row 99400


Correcting rows:  43%|████▎     | 3285/7560 [1:00:07<1:08:11,  1.04it/s]

Saved checkpoint at row 101500


Correcting rows:  45%|████▌     | 3435/7560 [1:02:43<1:53:08,  1.65s/it]

Saved checkpoint at row 104700


Correcting rows:  46%|████▌     | 3450/7560 [1:03:01<1:00:12,  1.14it/s]

Saved checkpoint at row 105100


Correcting rows:  47%|████▋     | 3575/7560 [1:04:53<52:30,  1.26it/s]  

Saved checkpoint at row 107000


Correcting rows:  48%|████▊     | 3602/7560 [1:05:20<1:12:53,  1.11s/it]

Saved checkpoint at row 107200


Correcting rows:  48%|████▊     | 3629/7560 [1:06:00<1:47:09,  1.64s/it]

Saved checkpoint at row 107800


Correcting rows:  50%|█████     | 3791/7560 [1:08:40<20:43,  3.03it/s]  

Saved checkpoint at row 110700


Correcting rows:  50%|█████     | 3808/7560 [1:08:55<58:02,  1.08it/s]  

Saved checkpoint at row 110900


Correcting rows:  52%|█████▏    | 3964/7560 [1:11:31<59:09,  1.01it/s]  

Saved checkpoint at row 116100


Correcting rows:  53%|█████▎    | 3978/7560 [1:11:43<42:45,  1.40it/s]  

Saved checkpoint at row 116800


Correcting rows:  53%|█████▎    | 4002/7560 [1:12:06<45:55,  1.29it/s]  

Saved checkpoint at row 117900


Correcting rows:  54%|█████▍    | 4098/7560 [1:13:51<39:03,  1.48it/s]  

Saved checkpoint at row 121300


Correcting rows:  55%|█████▍    | 4146/7560 [1:14:56<1:04:37,  1.14s/it]

Saved checkpoint at row 123000


Correcting rows:  57%|█████▋    | 4284/7560 [1:16:47<40:29,  1.35it/s]  

Saved checkpoint at row 127000


Correcting rows:  58%|█████▊    | 4356/7560 [1:17:59<1:39:02,  1.85s/it]

Saved checkpoint at row 129700


Correcting rows:  59%|█████▉    | 4465/7560 [1:19:39<48:48,  1.06it/s]  

Saved checkpoint at row 132000


Correcting rows:  62%|██████▏   | 4693/7560 [1:23:04<2:34:57,  3.24s/it]

Saved checkpoint at row 137000


Correcting rows:  62%|██████▏   | 4699/7560 [1:23:11<1:33:21,  1.96s/it]

Saved checkpoint at row 137200


Correcting rows:  64%|██████▍   | 4831/7560 [1:24:39<30:01,  1.51it/s]  

Saved checkpoint at row 142300


Correcting rows:  64%|██████▍   | 4838/7560 [1:24:49<56:13,  1.24s/it]  

Saved checkpoint at row 142500


Correcting rows:  66%|██████▋   | 5011/7560 [1:28:23<45:17,  1.07s/it]  

Saved checkpoint at row 147900


Correcting rows:  71%|███████▏  | 5388/7560 [1:34:31<1:04:13,  1.77s/it]

Saved checkpoint at row 159800


Correcting rows:  74%|███████▍  | 5587/7560 [1:37:53<33:44,  1.03s/it]  

Saved checkpoint at row 170300


Correcting rows:  75%|███████▌  | 5690/7560 [1:40:13<24:59,  1.25it/s]  

Saved checkpoint at row 174400


Correcting rows:  76%|███████▌  | 5743/7560 [1:41:02<29:13,  1.04it/s]

Saved checkpoint at row 176400


Correcting rows:  76%|███████▋  | 5777/7560 [1:41:43<19:32,  1.52it/s]  

Saved checkpoint at row 177500


Correcting rows:  80%|████████  | 6068/7560 [1:46:18<19:59,  1.24it/s]  

Saved checkpoint at row 187500


Correcting rows:  81%|████████  | 6092/7560 [1:46:31<09:38,  2.54it/s]

Saved checkpoint at row 188700


Correcting rows:  81%|████████▏ | 6154/7560 [1:47:20<10:05,  2.32it/s]

Saved checkpoint at row 193200


Correcting rows:  82%|████████▏ | 6191/7560 [1:47:54<11:05,  2.06it/s]

Saved checkpoint at row 195100


Correcting rows:  82%|████████▏ | 6197/7560 [1:47:58<16:44,  1.36it/s]

Saved checkpoint at row 195300


Correcting rows:  86%|████████▌ | 6465/7560 [1:51:52<23:54,  1.31s/it]  

Saved checkpoint at row 205000


Correcting rows:  86%|████████▋ | 6532/7560 [1:53:04<09:49,  1.74it/s]

Saved checkpoint at row 206500


Correcting rows:  88%|████████▊ | 6634/7560 [1:54:46<13:29,  1.14it/s]

Saved checkpoint at row 209600


Correcting rows:  88%|████████▊ | 6642/7560 [1:55:16<41:55,  2.74s/it]  

Saved checkpoint at row 210100


Correcting rows:  90%|█████████ | 6818/7560 [1:57:51<05:46,  2.14it/s]

Saved checkpoint at row 215800


Correcting rows:  93%|█████████▎| 6995/7560 [2:00:14<04:41,  2.01it/s]

Saved checkpoint at row 220400


Correcting rows:  95%|█████████▍| 7178/7560 [2:03:05<02:30,  2.55it/s]

Saved checkpoint at row 225700


Correcting rows:  96%|█████████▋| 7289/7560 [2:04:25<01:42,  2.64it/s]

Saved checkpoint at row 229300


Correcting rows:  97%|█████████▋| 7311/7560 [2:04:46<03:52,  1.07it/s]

Saved checkpoint at row 230000


Correcting rows:  98%|█████████▊| 7440/7560 [2:05:50<00:46,  2.56it/s]

Saved checkpoint at row 232500


Correcting rows: 100%|██████████| 7560/7560 [2:07:36<00:00,  1.01s/it]


--- Finished. Corrected 7341 rows. Coords fixed: 6931, OSRM attempted: 7529 ---
Validation status in corrections:
validation_status
corrected     7341
hard_wrong     219
Name: count, dtype: int64
Corrections completed.


In [10]:
df['corrected_dist_km'] = np.nan
df['corrected_time_min'] = np.nan
df['note_correction'] = np.nan

df.update(corrections)

print("DataFrame updated with corrections.")

DataFrame updated with corrections.


  df.update(corrections)


In [11]:
df.columns

Index(['SessionId', 'homeText_raw', 'workText_raw', 'startDayText_raw',
       'startstedadrsogeord', 'startTripText_raw', 'daystartmuncode', 'turid',
       'tiladrsogeord', 'tiladrtext_raw', 'transportmiddel', 'stagelength_raw',
       'stagedurationmin_raw', 'home_lat', 'home_lon', 'work_lat', 'work_lon',
       'start_lat', 'start_lon', 'til_lat', 'til_lon', 'calc_dist_geo',
       'dist_start_home', 'dist_end_home', 'flag_coords_bad', 'flag_speed_bad',
       'flag_zero_dist', 'validation_status', 'corrected_dist_km',
       'corrected_time_min', 'note_correction'],
      dtype='object')

In [12]:
# Save the final updated dataframe
df.to_csv('DET_final_corrected_df.csv', index=False)
print("Final corrected dataframe saved to 'DET_final_corrected_df.csv'")

Final corrected dataframe saved to 'DET_final_corrected_df.csv'


In [13]:
print('Total rows in df:', len(df))
print('NaN in validation_status:', df['validation_status'].isna().sum())
print('Sum of counts:', df['validation_status'].value_counts().sum())

Total rows in df: 235764
NaN in validation_status: 0
Sum of counts: 235764


In [None]:
# Compare validated_df_checkpoint and final_corrected_df
#validated_df = pd.read_csv('validated_df_checkpoint.csv')
final_df = pd.read_csv('DET_final_corrected_df.csv')

#print("Unique SessionId in validated_df_checkpoint:", validated_df['SessionId'].nunique())
print("Unique SessionId in final_corrected_df:", final_df['SessionId'].nunique())

print("\nValidation status in validated_df_checkpoint:")
#print(validated_df['validation_status'].value_counts())
print("\nValidation status in final_corrected_df:")
print(final_df['validation_status'].value_counts())

# Check if there are differences in the data
#differences = (validated_df != final_df).any(axis=1).sum()
#print(f"\nNumber of rows with differences: {differences}")

# Specifically for hard_wrong rows
#hard_wrong_original = validated_df[validated_df['validation_status'] == 'hard_wrong']
hard_wrong_final = final_df[final_df['validation_status'] == 'hard_wrong']
#print(f"Hard wrong rows in original: {len(hard_wrong_original)}")
print(f"Hard wrong rows in final: {len(hard_wrong_final)}")

Unique SessionId in final_corrected_df: 74596

Validation status in validated_df_checkpoint:

Validation status in final_corrected_df:
validation_status
trusted       228204
corrected       7371
hard_wrong       189
Name: count, dtype: int64
Hard wrong rows in final: 189
