In [1]:
import pandas as pd
import numpy as np
import json

In [2]:
# Load the survey export into the notebook-wide `df`, decoding the file as UTF-8.
# NOTE(review): later cells expect coordinate columns (start_/til_/home_ lat/lon),
# 'transportmiddel', 'stagelength_raw' and 'stagedurationmin_raw' to be present.
df = pd.read_csv('FastCheckTUData.csv', encoding='utf-8')

In [3]:

# Rough latitude/longitude bounding box around Denmark, in decimal degrees.
# It is an approximation: a rectangle, not the real border.
DK_MIN_LAT = 54.5
DK_MAX_LAT = 57.8
DK_MIN_LON = 8.0
DK_MAX_LON = 15.2

def haversine_np(lat1, lon1, lat2, lon2):
    """
    Vectorized Haversine (great-circle) distance calculation (returns km).

    Parameters
    ----------
    lat1, lon1 : scalar, numpy array or pandas Series
        Latitude / longitude of the first point(s), in decimal degrees.
    lat2, lon2 : scalar, numpy array or pandas Series
        Latitude / longitude of the second point(s), in decimal degrees.

    Returns
    -------
    Distance in kilometres, element-wise over the inputs. Any NaN input
    yields NaN for that element.
    """
    R = 6371.0  # Earth radius in kilometers

    phi1 = np.radians(lat1)
    phi2 = np.radians(lat2)
    dlat = np.radians(lat2 - lat1)
    dlon = np.radians(lon2 - lon1)

    a = np.sin(dlat / 2)**2 + np.cos(phi1) * np.cos(phi2) * np.sin(dlon / 2)**2
    # Floating-point rounding can push `a` marginally outside [0, 1] for
    # (near-)antipodal points, which would make sqrt(1 - a) return NaN.
    # np.maximum/np.minimum propagate NaN, so missing inputs stay missing.
    a = np.minimum(np.maximum(a, 0.0), 1.0)
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))
    return R * c

def check_bounds(df, lat_col, lon_col):
    """Returns a boolean Series: True if inside Denmark, False otherwise.

    "Inside" means both coordinates fall within the DK_* bounding box,
    edges included. Rows with a missing latitude or longitude compare as
    False, i.e. they count as outside.
    """
    lat_inside = df[lat_col].between(DK_MIN_LAT, DK_MAX_LAT)
    lon_inside = df[lon_col].between(DK_MIN_LON, DK_MAX_LON)
    return lat_inside & lon_inside

def validate_speeds(df, walk_max_kmh=15, bike_max_kmh=60, motor_max_kmh=180):
    """
    Check implied speed vs transport mode limits.

    The implied speed is 'stagelength_raw' divided by 'stagedurationmin_raw'
    converted to hours. Rows with a zero or missing duration (or a missing
    length) have no defined speed and are never flagged.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'stagelength_raw', 'stagedurationmin_raw' and
        'transportmiddel'.
    walk_max_kmh : float, default 15
        Limit for mode code 1 (walk).
    bike_max_kmh : float, default 60
        Limit for mode code 2 (bike).
    motor_max_kmh : float, default 180
        Limit for mode codes 3-25 (car/motorized).

    Returns
    -------
    pandas.Series of bool, indexed like `df`: True if speed is
    IMPOSSIBLE/HARD WRONG for the row's mode. Modes outside 1-25 are
    never flagged.
    """
    # Zero durations become NaN so the division yields NaN rather than inf.
    duration_hours = df['stagedurationmin_raw'] / 60.0
    implied_speed = df['stagelength_raw'] / duration_hours.replace(0, np.nan)

    mode = df['transportmiddel']
    mode_limits = (
        (mode == 1, walk_max_kmh),             # Walking
        (mode == 2, bike_max_kmh),             # Biking
        (mode.between(3, 25), motor_max_kmh),  # Car / motorized
    )

    is_speed_impossible = pd.Series(False, index=df.index)
    for mode_mask, limit in mode_limits:
        # NaN speeds compare as False, so undefined speeds are not flagged.
        is_speed_impossible |= mode_mask & (implied_speed > limit)

    return is_speed_impossible

def perform_fast_validation(df):
    """
    Main driver for the fast pass.

    Adds these columns to `df` IN PLACE and returns the same object:
      - calc_dist_geo    : haversine distance start -> destination
      - dist_start_home  : haversine distance start -> home
      - dist_end_home    : haversine distance destination -> home
      - flag_coords_bad  : start or destination outside the bounding box
      - flag_speed_bad   : implied speed impossible for the mode
      - flag_zero_dist   : near-zero geo distance but reported length > 1.0
      - validation_status: 'hard_wrong' if coords or speed are bad,
                           otherwise 'trusted'
    Note that flag_zero_dist does not influence validation_status.
    """
    print("--- Starting Fast Validation Pass ---")

    # --- Distances (missing coordinates propagate as NaN) ---
    trip_start_lat, trip_start_lon = df['start_lat'], df['start_lon']
    trip_end_lat, trip_end_lon = df['til_lat'], df['til_lon']
    df['calc_dist_geo'] = haversine_np(
        trip_start_lat, trip_start_lon, trip_end_lat, trip_end_lon
    )

    home_lat, home_lon = df['home_lat'], df['home_lon']
    df['dist_start_home'] = haversine_np(trip_start_lat, trip_start_lon, home_lat, home_lon)
    df['dist_end_home'] = haversine_np(trip_end_lat, trip_end_lon, home_lat, home_lon)

    # --- Raw checks ---
    start_inside = check_bounds(df, 'start_lat', 'start_lon')
    end_inside = check_bounds(df, 'til_lat', 'til_lon')
    speed_impossible = validate_speeds(df)

    # --- Flags ---
    # Bad when either endpoint is outside the box.
    df['flag_coords_bad'] = ~(start_inside & end_inside)
    df['flag_speed_bad'] = speed_impossible

    # Endpoints practically coincide, yet a length above 1.0 was reported.
    endpoints_coincide = df['calc_dist_geo'] < 0.005
    reported_nontrivial = df['stagelength_raw'] > 1.0
    df['flag_zero_dist'] = endpoints_coincide & reported_nontrivial

    # --- Classification: trusted unless coords or speed are bad ---
    df['validation_status'] = 'trusted'
    is_hard_wrong = df['flag_coords_bad'] | df['flag_speed_bad']
    df.loc[is_hard_wrong, 'validation_status'] = 'hard_wrong'

    return df


In [4]:
# Run the fast validation on the dataframe.
# perform_fast_validation adds its distance/flag/status columns to `df` in place
# and returns that same object, so the re-assignment just keeps the name bound.
df = perform_fast_validation(df)

--- Starting Fast Validation Pass ---


In [5]:
# How many rows ended up in each validation class.
print("Validation Status Counts:")
print(df['validation_status'].value_counts())

# First few rejected rows; the trailing expression is rendered as the cell output.
print("\nExample Hard Wrong Rows:")
df.loc[df['validation_status'].eq('hard_wrong')].head()

Validation Status Counts:
validation_status
trusted       228204
hard_wrong      7560
Name: count, dtype: int64

Example Hard Wrong Rows:


Unnamed: 0,SessionId,homeText_raw,workText_raw,startDayText_raw,startstedadrsogeord,startTripText_raw,daystartmuncode,turid,tiladrsogeord,tiladrtext_raw,...,start_lon,til_lat,til_lon,calc_dist_geo,dist_start_home,dist_end_home,flag_coords_bad,flag_speed_bad,flag_zero_dist,validation_status
107,347096,"Plantagevej 1, 4941 Bandholm","Kofoedsminde - Udviklingscentret, Højbovej 6, ...","Plantagevej 1, 4941 Bandholm",,"kofoedsminde - udviklingscentret, højbovej 6, ...",,2118225,"Region Sjællands Hovedkontor, Rødbyhavn","kofoedsmindes hovedkontor, rødbyhavn",...,11.351456,0.0,4.511256,6110.319369,20.454239,6130.283579,True,False,False,hard_wrong
108,347096,"Plantagevej 1, 4941 Bandholm","Kofoedsminde - Udviklingscentret, Højbovej 6, ...","Plantagevej 1, 4941 Bandholm","Region Sjællands Hovedkontor, Rødbyhavn","kofoedsmindes hovedkontor, rødbyhavn",,2118226,,"kofoedsminde - udviklingscentret, højbovej 6, ...",...,4.511256,54.662786,11.351456,6110.319369,6130.283579,20.454239,True,False,False,hard_wrong
150,347115,"Toftevangen 77, 4130 Viby Sjælland",,,,"toftevangen 77, 4130 viby sjælland",,2118279,Gelsted 4160,stenagergårdsvej\r\ngelsted\r\n4130 herlufmagle,...,12.032662,0.0,4.511256,6213.935173,0.0,6213.935173,True,False,False,hard_wrong
151,347115,"Toftevangen 77, 4130 Viby Sjælland",,,Gelsted 4160,stenagergårdsvej\r\ngelsted\r\n4130 herlufmagle,,2118280,,"toftevangen 77, 4130 viby sjælland",...,4.511256,55.545711,12.032662,6213.935173,6213.935173,0.0,True,False,False,hard_wrong
227,347147,"Kejlstrupvej 15, 8600 Silkeborg",,,,"kejlstrupvej 15, 8600 silkeborg",,2118373,"Nørrevænget 42, 8600","nørrevænget 42, 8600 silkeborg",...,9.554954,56.183815,9.545761,0.789938,0.0,0.789938,False,True,False,hard_wrong


In [6]:
# 1. FILTERING (Crucial First Step)
# Keep only rows whose mode code is below 26 (public transport is >= 26).
# Rows with a missing mode code fail the comparison and are dropped as well.
print(f"Original Row Count: {len(df)}")
df = df.loc[df['transportmiddel'].lt(26)].copy()
print(f"Filtered Row Count (No Public Transport): {len(df)}")

# 2. MAPPING FUNCTION
def get_mode_name(code):
    """Map a transport-mode code to a coarse text label.

    The code is coerced with int(), so 1.0 and "1" both count as 1 and
    fractional values are truncated.

    Returns
    -------
    "Walk" for 1, "Bike" for 2, "Car" for 3-25, "Other" for any other
    integer, and "Unknown" when the value cannot be converted to an
    integer (None, NaN, infinity, non-numeric text).
    """
    try:
        code = int(code)
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures mean "unknown"; anything else (e.g. a
        # KeyboardInterrupt during a long apply) must still propagate.
        return "Unknown"

    if code == 1:
        return "Walk"
    if code == 2:
        return "Bike"
    if 3 <= code <= 25:
        return "Car"
    return "Other"

# 3. UPDATED LOGIC FUNCTION
def create_training_entry(row):
    """Build one instruction-tuning record (instruction / input / output) for a row.

    Parameters
    ----------
    row : pandas.Series
        One validated trip row. Reads 'transportmiddel', 'stagelength_raw',
        'stagedurationmin_raw', 'calc_dist_geo', 'startTripText_raw',
        'tiladrtext_raw', the three 'flag_*' columns and 'validation_status'.

    Returns
    -------
    dict with keys "instruction", "input" and "output".

    Notes
    -----
    Missing address text is rendered as "Unknown Start" / "Unknown Dest",
    the same placeholders the inference cell fills in, so training and
    inference prompts use identical wording. (`row.get(key, default)` only
    falls back to the default when the key is absent, not when the value
    is NaN, so without this the prompt would contain the literal "nan".)
    """
    # --- A. Translate Mode Code to Text ---
    mode_code = row['transportmiddel']
    mode_name = get_mode_name(mode_code)

    # --- B. Standard Math Extraction (missing numbers count as 0.0) ---
    user_dist = float(row['stagelength_raw']) if pd.notnull(row['stagelength_raw']) else 0.0
    user_time = float(row['stagedurationmin_raw']) if pd.notnull(row['stagedurationmin_raw']) else 0.0
    system_dist = float(row['calc_dist_geo']) if pd.notnull(row['calc_dist_geo']) else 0.0

    speed_kph = (user_dist / (user_time / 60)) if user_time > 0 else 0.0
    dist_diff = abs(user_dist - system_dist)

    # --- C. Address text, with the inference-time placeholders for NaN ---
    start_text = row.get('startTripText_raw', 'Unknown')
    if pd.isnull(start_text):
        start_text = "Unknown Start"
    dest_text = row.get('tiladrtext_raw', 'Unknown')
    if pd.isnull(dest_text):
        dest_text = "Unknown Dest"

    # --- D. The Input Prompt (Using Mapped Name) ---
    input_text = (
        f"Trip Mode: {mode_name} (Code {mode_code})\n" # We include both for clarity
        f"From: {start_text}\n"
        f"To: {dest_text}\n"
        f"User Reported: {user_dist} km in {user_time} min "
        f"(Calculated Speed: {speed_kph:.1f} km/h)\n"
        f"System Calculated Distance: {system_dist:.1f} km\n"
        f"Discrepancy: {dist_diff:.1f} km difference."
    )

    # --- E. The Output Logic ---
    verdict = "VALID"
    reasons = []

    if row['flag_speed_bad']:
        verdict = "INVALID"
        reasons.append(f"Speed is unrealistic. {speed_kph:.0f} km/h is too fast for mode '{mode_name}'.")

    if row['flag_coords_bad']:
        verdict = "INVALID"
        reasons.append("Coordinate mismatch. The geocoded location is inconsistent with the address.")

    if row['flag_zero_dist']:
        verdict = "INVALID"
        reasons.append("Zero distance error.")

    # validation_status has the final say: 'hard_wrong' forces INVALID and
    # 'trusted' forces VALID, overriding whatever the flags set above. The
    # flag-derived verdict only survives for any other status value.
    if row['validation_status'] == 'hard_wrong':
        verdict = "INVALID"
        if len(reasons) == 0:
            reasons.append("General data inconsistency detected.")
    elif row['validation_status'] == 'trusted':
        verdict = "VALID"

    if verdict == "VALID":
        output_text = "Verdict: VALID\nAnalysis: The reported metrics are consistent with the transport mode and system calculations."
    else:
        output_text = f"Verdict: INVALID\nAnalysis: {' '.join(reasons)}"

    return {
        "instruction": "Analyze this Danish National Travel Survey entry for logical errors.",
        "input": input_text,
        "output": output_text
    }

# --- EXECUTION ---
# Build a class-balanced training set FROM THE FILTERED DATAFRAME:
# equal numbers of 'hard_wrong' and 'trusted' rows.
df_errors = df[df['validation_status'] == 'hard_wrong'].copy()
df_trusted = df[df['validation_status'] == 'trusted'].copy()

# Use the size of the smaller class so neither .sample() call asks for more
# rows than exist.
n_samples = min(len(df_errors), len(df_trusted))
# All three sampling steps are seeded, including the final shuffle, so the
# row order written to the JSONL file is reproducible across runs.
df_train = pd.concat([
    df_errors.sample(n=n_samples, random_state=42), 
    df_trusted.sample(n=n_samples, random_state=42)
]).sample(frac=1, random_state=42).reset_index(drop=True)

# Generate one instruction/input/output dict per row
training_data = df_train.apply(create_training_entry, axis=1).tolist()

# Save as JSON Lines: one JSON object per line
with open('train_challenger.jsonl', 'w', encoding='utf-8') as f:
    for entry in training_data:
        json.dump(entry, f)
        f.write('\n')

Original Row Count: 235764
Filtered Row Count (No Public Transport): 234531


In [7]:
# Sanity check: the sampled training set should hold the same number of
# 'hard_wrong' and 'trusted' rows.
print(df_train['validation_status'].value_counts())

validation_status
hard_wrong    7538
trusted       7538
Name: count, dtype: int64


In [None]:
import pandas as pd
import numpy as np

# --- 1. DEFINE HELPER FUNCTIONS ---

def get_mode_name(code):
    """Maps transport codes to text names.

    The code is coerced with int(), so 1.0 and "1" both count as 1 and
    fractional values are truncated.

    Returns
    -------
    "Walk" for 1, "Bike" for 2, "Car" for 3-25, "Other" for any other
    integer, and "Unknown" when the value cannot be converted to an
    integer (None, NaN, infinity, non-numeric text).
    """
    try:
        code = int(code)
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures mean "unknown"; anything else (e.g. a
        # KeyboardInterrupt during a long apply) must still propagate.
        return "Unknown"

    if code == 1:
        return "Walk"
    if code == 2:
        return "Bike"
    if 3 <= code <= 25:
        return "Car"
    return "Other"

def create_inference_prompt(row):
    """Generates the prompt text matching the training format.

    Parameters
    ----------
    row : pandas.Series
        One trip row. Reads 'transportmiddel', 'stagelength_raw',
        'stagedurationmin_raw', 'calc_dist_geo', 'startTripText_raw' and
        'tiladrtext_raw'.

    Returns
    -------
    str
        Alpaca-style prompt ending with an empty "### Response:" section
        for the model to complete.

    Notes
    -----
    Missing address text is rendered as "Unknown Start" / "Unknown Dest"
    inside this function. `row.get(key, default)` only uses its default
    when the key is absent, not when the value is NaN, so without this
    the prompt would read "nan" unless the caller had pre-filled the
    columns.
    """
    # A. Map Mode
    mode_code = row['transportmiddel']
    mode_name = get_mode_name(mode_code)

    # B. Safe Math Extraction (missing numbers count as 0.0)
    user_dist = float(row['stagelength_raw']) if pd.notnull(row['stagelength_raw']) else 0.0
    user_time = float(row['stagedurationmin_raw']) if pd.notnull(row['stagedurationmin_raw']) else 0.0
    system_dist = float(row['calc_dist_geo']) if pd.notnull(row['calc_dist_geo']) else 0.0

    # Calculate Speed (km/h); zero or missing duration gives 0.0 rather than an error
    speed_kph = (user_dist / (user_time / 60)) if user_time > 0 else 0.0

    # Calculate Discrepancy
    dist_diff = abs(user_dist - system_dist)

    # C. Address text, with placeholders for missing values
    start_text = row.get('startTripText_raw', 'Unknown')
    if pd.isnull(start_text):
        start_text = "Unknown Start"
    dest_text = row.get('tiladrtext_raw', 'Unknown')
    if pd.isnull(dest_text):
        dest_text = "Unknown Dest"

    # D. Build the Input Context (same layout as the training records)
    input_context = (
        f"Trip Mode: {mode_name} (Code {mode_code})\n"
        f"From: {start_text}\n"
        f"To: {dest_text}\n"
        f"User Reported: {user_dist} km in {user_time} min "
        f"(Calculated Speed: {speed_kph:.1f} km/h)\n"
        f"System Calculated Distance: {system_dist:.1f} km\n"
        f"Discrepancy: {dist_diff:.1f} km difference."
    )

    # E. Wrap in Alpaca Template (Response left empty for LLM to fill)
    prompt = f"""Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Analyze this Danish National Travel Survey entry for logical errors.

### Input:
{input_context}

### Response:
"""
    return prompt

# --- 2. EXECUTION ---

print(f"Total rows before filtering: {len(df)}")

# A. Keep only mode codes below 26 (public transport is >= 26), so the rows
#    match the Walk / Bike / Car logic used for the training data.
df_inference = df.loc[df['transportmiddel'].lt(26)].copy()
print(f"Rows after filtering Public Transport: {len(df_inference)}")

# B. Replace missing address text with placeholders so the prompt never
#    contains a raw NaN.
for text_col, placeholder in (
    ('startTripText_raw', "Unknown Start"),
    ('tiladrtext_raw', "Unknown Dest"),
):
    df_inference[text_col] = df_inference[text_col].fillna(placeholder)

# C. Build one prompt per row
print("Generating prompts... this might take a moment.")
df_inference['prompt_text'] = df_inference.apply(create_inference_prompt, axis=1)

# D. Write the frame, prompts included, to CSV (index omitted)
output_filename = "full_200k_dataset_with_prompts.csv"
df_inference.to_csv(output_filename, index=False)

print(f"Success! Saved '{output_filename}' with {len(df_inference)} rows.")
print("Example Prompt:")
print("-" * 50)
print(df_inference['prompt_text'].iloc[0])
print("-" * 50)

Total rows before filtering: 234531
Rows after filtering Public Transport: 234531
Generating prompts... this might take a moment.
Success! Saved 'full_200k_dataset_with_prompts.csv' with 234531 rows.
Example Prompt:
--------------------------------------------------
Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Analyze this Danish National Travel Survey entry for logical errors.

### Input:
Trip Mode: Walk (Code 1.0)
From: Bellisvej 38, 2970 Hørsholm
To: ahornvej 30, 2970 hørsholm
User Reported: 1.0 km in 10.0 min (Calculated Speed: 6.0 km/h)
System Calculated Distance: 0.6 km
Discrepancy: 0.4 km difference.

### Response:

--------------------------------------------------


: 