In [1]:
from RAG_correctors import repair_hard_wrongs
import pandas as pd
import numpy as np
import importlib

In [None]:
# Load the RAG result table that every later cell works on.
df = pd.read_csv(filepath_or_buffer="RAG_final_thesis_results.csv")

In [3]:
# Structural overview of the loaded frame: dimensions, column labels, dtypes.
for overview in (df.shape, df.columns, df.dtypes):
    print(overview)

# Columns this check expects to find in the frame.
required_cols = [
    'llm_verdict',
    'rag_start_lat',
    'rag_start_lon',
    'rag_end_lat',
    'rag_end_lon',
    'transportmiddel',
]
# Keep the order of required_cols so the report lists missing names predictably.
missing_cols = list(filter(lambda name: name not in df.columns, required_cols))
if not missing_cols:
    print("All required columns present.")
else:
    print(f"Missing required columns: {missing_cols}")

(234531, 39)
Index(['SessionId', 'homeText_raw', 'workText_raw', 'startDayText_raw',
       'startstedadrsogeord', 'startTripText_raw', 'turid', 'tiladrsogeord',
       'tiladrtext_raw', 'transportmiddel', 'stagelength_raw',
       'stagedurationmin_raw', 'home_lat', 'home_lon', 'work_lat', 'work_lon',
       'start_lat', 'start_lon', 'til_lat', 'til_lon', 'calc_dist_geo',
       'dist_start_home', 'dist_end_home', 'flag_coords_bad', 'flag_speed_bad',
       'flag_zero_dist', 'validation_status', 'prompt_text',
       'rag_start_address', 'rag_start_lat', 'rag_start_lon', 'rag_start_conf',
       'rag_end_address', 'rag_end_lat', 'rag_end_lon', 'rag_end_conf',
       'rag_prompt_text', 'llm_full_response', 'llm_verdict'],
      dtype='object')
SessionId                 int64
homeText_raw             object
workText_raw             object
startDayText_raw         object
startstedadrsogeord      object
startTripText_raw        object
turid                     int64
tiladrsogeord         

In [4]:
# Debug: push a single INVALID row through the OSRM bridge and show its inputs
from RAG_correctors import get_osrm_route_bridge

coord_cols = ['rag_start_lat', 'rag_start_lon', 'rag_end_lat', 'rag_end_lon']

# Select the rows whose verdict is INVALID
invalid_rows = df.loc[df['llm_verdict'].eq('INVALID')]
if invalid_rows.empty:
    print("No invalid rows found.")
else:
    # First INVALID row serves as the sample
    sample_row = invalid_rows.iloc[0]
    print("Sample row values:")
    print(f"Start: {sample_row['rag_start_lat']}, {sample_row['rag_start_lon']}")
    print(f"End: {sample_row['rag_end_lat']}, {sample_row['rag_end_lon']}")
    print(f"Transport: {sample_row['transportmiddel']} (type: {type(sample_row['transportmiddel'])})")
    print(f"NaN in coords: {pd.isna(sample_row[['rag_start_lat', 'rag_start_lon', 'rag_end_lat', 'rag_end_lon']]).any()}")
    print(f"Transport NaN: {pd.isna(sample_row['transportmiddel'])}")

    # Test OSRM call: start lat/lon, end lat/lon, then the transport code,
    # passed positionally in that order.
    route_args = [sample_row[col] for col in (*coord_cols, 'transportmiddel')]
    result = get_osrm_route_bridge(*route_args)
    print(f"OSRM result: {result}")

Sample row values:
Start: 54.66261819, 11.35739183
End: 55.8001054, 10.55570583
Transport: 11.0 (type: <class 'numpy.float64'>)
NaN in coords: False
Transport NaN: False
OSRM result: (211.6, np.float64(230.0))


In [5]:
# Run the correction pipeline imported from RAG_correctors on the full frame.
corrected_df = repair_hard_wrongs(df)

# Rows flagged INVALID in the input versus rows labelled 'corrected' in the output.
n_invalid = df['llm_verdict'].eq('INVALID').sum()
n_corrected = corrected_df['llm_verdict'].eq('corrected').sum()
print("Original invalid count:", n_invalid)
print("Corrected count:", n_corrected)

--- Starting Correction Pipeline on 8067 rows ---


Correcting rows:   3%|▎         | 204/8067 [00:36<11:41, 11.21it/s] 

Processed 200 rows, fixed 196 so far.


Correcting rows:   5%|▍         | 398/8067 [00:56<16:23,  7.80it/s]

Processed 400 rows, fixed 396 so far.


Correcting rows:   7%|▋         | 602/8067 [01:07<05:23, 23.07it/s]

Processed 600 rows, fixed 596 so far.


Correcting rows:  10%|▉         | 803/8067 [01:18<06:15, 19.33it/s]

Processed 800 rows, fixed 796 so far.


Correcting rows:  12%|█▏        | 1006/8067 [01:27<02:51, 41.07it/s]

Processed 1000 rows, fixed 996 so far.


Correcting rows:  15%|█▍        | 1198/8067 [01:33<02:14, 51.09it/s]

Processed 1200 rows, fixed 1196 so far.


Correcting rows:  17%|█▋        | 1404/8067 [01:38<03:00, 36.84it/s]

Processed 1400 rows, fixed 1396 so far.


Correcting rows:  20%|█▉        | 1601/8067 [01:44<02:56, 36.66it/s]

Processed 1600 rows, fixed 1596 so far.


Correcting rows:  22%|██▏       | 1800/8067 [01:49<02:11, 47.68it/s]

Processed 1800 rows, fixed 1796 so far.


Correcting rows:  25%|██▍       | 2016/8067 [01:53<01:53, 53.46it/s]

Processed 2000 rows, fixed 1996 so far.


Correcting rows:  27%|██▋       | 2210/8067 [01:58<01:43, 56.56it/s]

Processed 2200 rows, fixed 2196 so far.


Correcting rows:  30%|██▉       | 2410/8067 [02:01<01:29, 62.96it/s]

Processed 2400 rows, fixed 2396 so far.


Correcting rows:  32%|███▏      | 2600/8067 [02:06<02:45, 33.09it/s]

Processed 2600 rows, fixed 2596 so far.


Correcting rows:  35%|███▍      | 2802/8067 [02:20<12:25,  7.06it/s]

Processed 2800 rows, fixed 2795 so far.


Correcting rows:  37%|███▋      | 2999/8067 [02:37<04:25, 19.12it/s]

Processed 3000 rows, fixed 2995 so far.


Correcting rows:  40%|███▉      | 3201/8067 [02:50<04:26, 18.28it/s]

Processed 3200 rows, fixed 3194 so far.


Correcting rows:  42%|████▏     | 3406/8067 [02:57<03:10, 24.43it/s]

Processed 3400 rows, fixed 3394 so far.


Correcting rows:  45%|████▍     | 3604/8067 [03:03<01:58, 37.65it/s]

Processed 3600 rows, fixed 3594 so far.


Correcting rows:  47%|████▋     | 3801/8067 [03:08<02:01, 35.16it/s]

Processed 3800 rows, fixed 3794 so far.


Correcting rows:  50%|████▉     | 4005/8067 [03:14<03:12, 21.05it/s]

Processed 4000 rows, fixed 3991 so far.


Correcting rows:  52%|█████▏    | 4208/8067 [03:19<01:17, 49.57it/s]

Processed 4200 rows, fixed 4188 so far.


Correcting rows:  55%|█████▍    | 4400/8067 [03:23<01:13, 49.94it/s]

Processed 4400 rows, fixed 4388 so far.


Correcting rows:  57%|█████▋    | 4609/8067 [03:28<01:02, 54.95it/s]

Processed 4600 rows, fixed 4588 so far.


Correcting rows:  60%|█████▉    | 4805/8067 [03:32<01:08, 47.83it/s]

Processed 4800 rows, fixed 4787 so far.


Correcting rows:  62%|██████▏   | 5002/8067 [03:35<00:40, 75.67it/s]

Processed 5000 rows, fixed 4987 so far.


Correcting rows:  65%|██████▍   | 5205/8067 [03:39<00:51, 56.00it/s]

Processed 5200 rows, fixed 5187 so far.


Correcting rows:  67%|██████▋   | 5410/8067 [03:42<00:42, 62.16it/s]

Processed 5400 rows, fixed 5387 so far.


Correcting rows:  70%|██████▉   | 5609/8067 [03:46<00:42, 57.34it/s]

Processed 5600 rows, fixed 5587 so far.


Correcting rows:  72%|███████▏  | 5809/8067 [03:49<00:39, 57.28it/s]

Processed 5800 rows, fixed 5785 so far.


Correcting rows:  74%|███████▍  | 6002/8067 [03:52<00:31, 65.35it/s]

Processed 6000 rows, fixed 5985 so far.


Correcting rows:  77%|███████▋  | 6205/8067 [03:56<00:32, 57.20it/s]

Processed 6200 rows, fixed 6185 so far.


Correcting rows:  79%|███████▉  | 6409/8067 [03:59<00:26, 62.23it/s]

Processed 6400 rows, fixed 6385 so far.


Correcting rows:  82%|████████▏ | 6612/8067 [04:03<00:20, 70.97it/s]

Processed 6600 rows, fixed 6585 so far.


Correcting rows:  84%|████████▍ | 6802/8067 [04:06<00:19, 65.18it/s]

Processed 6800 rows, fixed 6785 so far.


Correcting rows:  87%|████████▋ | 6998/8067 [04:09<00:17, 62.13it/s]

Processed 7000 rows, fixed 6985 so far.


Correcting rows:  89%|████████▉ | 7211/8067 [04:13<00:11, 73.85it/s]

Processed 7200 rows, fixed 7184 so far.


Correcting rows:  92%|█████████▏| 7410/8067 [04:17<00:12, 53.64it/s]

Processed 7400 rows, fixed 7384 so far.


Correcting rows:  94%|█████████▍| 7609/8067 [04:20<00:06, 65.76it/s]

Processed 7600 rows, fixed 7584 so far.


Correcting rows:  97%|█████████▋| 7811/8067 [04:23<00:03, 76.27it/s]

Processed 7800 rows, fixed 7784 so far.


Correcting rows:  99%|█████████▉| 8006/8067 [04:27<00:02, 29.12it/s]

Processed 8000 rows, fixed 7984 so far.


Correcting rows: 100%|██████████| 8067/8067 [04:28<00:00, 30.00it/s]


--- Finished. Corrected 8051 rows. OSRM attempted: 8067 ---
Original invalid count: 8067
Corrected count: 8051


In [6]:
# Show the column labels of the frame returned by repair_hard_wrongs.
corrected_df.columns

Index(['SessionId', 'homeText_raw', 'workText_raw', 'startDayText_raw',
       'startstedadrsogeord', 'startTripText_raw', 'turid', 'tiladrsogeord',
       'tiladrtext_raw', 'transportmiddel', 'stagelength_raw',
       'stagedurationmin_raw', 'home_lat', 'home_lon', 'work_lat', 'work_lon',
       'start_lat', 'start_lon', 'til_lat', 'til_lon', 'calc_dist_geo',
       'dist_start_home', 'dist_end_home', 'flag_coords_bad', 'flag_speed_bad',
       'flag_zero_dist', 'validation_status', 'prompt_text',
       'rag_start_address', 'rag_start_lat', 'rag_start_lon', 'rag_start_conf',
       'rag_end_address', 'rag_end_lat', 'rag_end_lon', 'rag_end_conf',
       'rag_prompt_text', 'llm_full_response', 'llm_verdict',
       'corrected_dist_km', 'corrected_time_min', 'note_correction'],
      dtype='object')

In [7]:
# Pre-create the correction columns on df: DataFrame.update() only writes into
# columns that already exist on the frame being updated.
df['corrected_dist_km'] = np.nan
df['corrected_time_min'] = np.nan
# Object dtype instead of float NaN: filling a float64 column with non-float
# values through update() triggers pandas' incompatible-dtype FutureWarning
# (an error in later pandas versions).
# NOTE(review): presumably note_correction holds text notes — confirm in RAG_correctors.
df['note_correction'] = pd.Series(np.nan, index=df.index, dtype=object)

# In-place, index-aligned overwrite of df with the non-NA values of corrected_df.
df.update(corrected_df)

  df.update(corrected_df)


In [None]:
# Single source of truth for the output filename, so the confirmation message
# cannot disagree with the file that was actually written.
output_csv = 'RAG_corrected_df.csv'
df.to_csv(output_csv, index=False)
print(f"Corrected data saved to '{output_csv}'")

Corrected data saved to 'RAG_corrected_df_v1.csv'


In [9]:
# Show the column labels of df after the update step.
df.columns

Index(['SessionId', 'homeText_raw', 'workText_raw', 'startDayText_raw',
       'startstedadrsogeord', 'startTripText_raw', 'turid', 'tiladrsogeord',
       'tiladrtext_raw', 'transportmiddel', 'stagelength_raw',
       'stagedurationmin_raw', 'home_lat', 'home_lon', 'work_lat', 'work_lon',
       'start_lat', 'start_lon', 'til_lat', 'til_lon', 'calc_dist_geo',
       'dist_start_home', 'dist_end_home', 'flag_coords_bad', 'flag_speed_bad',
       'flag_zero_dist', 'validation_status', 'prompt_text',
       'rag_start_address', 'rag_start_lat', 'rag_start_lon', 'rag_start_conf',
       'rag_end_address', 'rag_end_lat', 'rag_end_lon', 'rag_end_conf',
       'rag_prompt_text', 'llm_full_response', 'llm_verdict',
       'corrected_dist_km', 'corrected_time_min', 'note_correction'],
      dtype='object')