In [23]:
# In notebooks/02_data_preparation.ipynb

import pandas as pd
import json
import numpy as np

# --- 1. Load the raw data ---
# We'll only load the first 5000 rows for this example to keep it fast.
# For your final training, you might want to use more.
print("Loading raw data...")
df_raw = pd.read_csv('../data/raw/train.csv', nrows=5000)
print("Data loaded.")

# --- 2. Create a function to parse the POLYLINE string ---
def parse_polyline(polyline_str):
    """
    Converts the string representation of a polyline into a list of tuples.
    Returns an empty list if the string is invalid or empty.
    """
    # A trajectory needs at least two points to be useful
    if not isinstance(polyline_str, str) or len(polyline_str) < 10:
        return []
    try:
        # The json library is perfect for parsing this string format
        return json.loads(polyline_str)
    except json.JSONDecodeError:
        return []

# --- 3. Apply the function and clean up the data ---
print("Parsing polylines...")
df_raw['trajectory'] = df_raw['POLYLINE'].apply(parse_polyline)

# Filter out any trips that had empty or invalid polylines
df_clean = df_raw[df_raw['trajectory'].apply(len) > 1].copy()

print(f"Original rows: {len(df_raw)}, Cleaned rows: {len(df_clean)}")

# --- 4. Convert trajectories into a more useful format ---
# We want a DataFrame where each row is a single GPS point.
# This will make it easier to simulate anomalies.

all_trajectories = []
for index, row in df_clean.iterrows():
    trip_id = row['TRIP_ID']
    points = row['trajectory']
    
    # Create a DataFrame for this single trip
    trip_df = pd.DataFrame(points, columns=['longitude', 'latitude'])
    trip_df['trip_id'] = trip_id
    
    # Add a timestamp (taxis report every 15 seconds)
    trip_df['timestamp'] = pd.to_datetime(row['TIMESTAMP'], unit='s') + pd.to_timedelta(np.arange(len(points)) * 15, unit='s')
    
    all_trajectories.append(trip_df)

# Combine all the individual trip DataFrames into one big one
df_full = pd.concat(all_trajectories, ignore_index=True)

print("\nSample of the final pre-processed data:")
df_full.head()

Loading raw data...
Data loaded.
Parsing polylines...
Original rows: 5000, Cleaned rows: 4921

Sample of the final pre-processed data:


Unnamed: 0,longitude,latitude,trip_id,timestamp
0,-8.618643,41.141412,1372636858620000589,2013-07-01 00:00:58
1,-8.618499,41.141376,1372636858620000589,2013-07-01 00:01:13
2,-8.620326,41.14251,1372636858620000589,2013-07-01 00:01:28
3,-8.622153,41.143815,1372636858620000589,2013-07-01 00:01:43
4,-8.623953,41.144373,1372636858620000589,2013-07-01 00:01:58


In [24]:
# In notebooks/02_data_preparation.ipynb

def simulate_inactivity(traj_df: pd.DataFrame):
    """Simulates prolonged inactivity by duplicating a point."""
    if len(traj_df) < 20: # Need a long enough trip
        return None
    
    # Pick a random point in the middle to stop at
    stop_index = len(traj_df) // 2
    
    # 'Stop' for 10 timestamps (10 * 15s = 2.5 minutes)
    for i in range(10):
        traj_df.iloc[stop_index + i] = traj_df.iloc[stop_index]
        
    return traj_df

def simulate_teleport(traj_df: pd.DataFrame):
    """Simulates a teleport by moving a single point far away."""
    if len(traj_df) < 10:
        return None
    
    # --- FIX: Reset the index to work with .loc ---
    traj_df = traj_df.reset_index(drop=True)
    
    point_to_change = len(traj_df) // 2
    
    # Add a large offset to create a huge jump
    traj_df.loc[point_to_change, 'latitude'] += 0.5
    traj_df.loc[point_to_change, 'longitude'] += 0.5
    
    return traj_df

def simulate_deviation(traj_df: pd.DataFrame):
    """Simulates a route deviation by adding an offset to a segment."""
    if len(traj_df) < 20:
        return None
        
    # --- FIX: Reset the index to work with .loc ---
    traj_df = traj_df.reset_index(drop=True)
        
    start_index = len(traj_df) // 2
    
    # Deviate for 5 points
    for i in range(5):
        traj_df.loc[start_index + i, 'latitude'] += 0.005
        traj_df.loc[start_index + i, 'longitude'] += 0.005
        
    return traj_df

print("Anomaly simulation functions are ready.")

Anomaly simulation functions are ready.


In [25]:
# In notebooks/02_data_preparation.ipynb

import sys
import os
import numpy as np

# Adds the main project folder to the path to find the 'app' module.
project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))
if project_root not in sys.path:
    sys.path.append(project_root)

# Now, the import will work correctly
from app.preprocessing import prepare_training_data

# Get a list of unique trip IDs to iterate over
unique_trip_ids = df_clean['TRIP_ID'].unique()

all_X = []
all_y = []

print(f"Processing {len(unique_trip_ids)} trips to generate training data...")

for i, trip_id in enumerate(unique_trip_ids):
    # Get the original, normal trajectory
    normal_traj = df_full[df_full['trip_id'] == trip_id].copy()
    
    # --- Process 4 versions for each trip ---
    
    # 1. Normal (Label 0)
    X, y = prepare_training_data(normal_traj)
    # --- FIX: Only append if the array is not empty ---
    if X.shape[0] > 0:
        all_X.append(X)
        all_y.append(y)

    # 2. Inactivity (Label 1)
    inactive_traj = simulate_inactivity(normal_traj.copy())
    if inactive_traj is not None:
        X, y = prepare_training_data(inactive_traj)
        # --- FIX: Only append if the array is not empty ---
        if X.shape[0] > 0:
            all_X.append(X)
            all_y.append(y)

    # 3. Teleport (Label 2)
    teleport_traj = simulate_teleport(normal_traj.copy())
    if teleport_traj is not None:
        X, y = prepare_training_data(teleport_traj)
        # --- FIX: Only append if the array is not empty ---
        if X.shape[0] > 0:
            all_X.append(X)
            all_y.append(y)
        
    # 4. Deviation (Label 3)
    deviation_traj = simulate_deviation(normal_traj.copy())
    if deviation_traj is not None:
        X, y = prepare_training_data(deviation_traj)
        # --- FIX: Only append if the array is not empty ---
        if X.shape[0] > 0:
            all_X.append(X)
            all_y.append(y)

    if (i + 1) % 100 == 0:
        print(f"  Processed {i+1}/{len(unique_trip_ids)} trips...")


# --- Combine and Save ---
final_X = np.concatenate(all_X, axis=0)
final_y = np.concatenate(all_y, axis=0)

# Make sure the processed data folder exists
os.makedirs('../data/processed', exist_ok=True)

# Save the final arrays
np.save('../data/processed/X.npy', final_X)
np.save('../data/processed/y.npy', final_y)

print("\n--- All Done! ---")
print(f"Shape of final X (features): {final_X.shape}")
print(f"Shape of final y (labels): {final_y.shape}")
print("You are now ready to run the training script!")

Processing 4920 trips to generate training data...
  Processed 100/4920 trips...
  Processed 200/4920 trips...
  Processed 300/4920 trips...
  Processed 400/4920 trips...
  Processed 500/4920 trips...
  Processed 600/4920 trips...
  Processed 700/4920 trips...
  Processed 800/4920 trips...
  Processed 900/4920 trips...
  Processed 1000/4920 trips...
  Processed 1100/4920 trips...
  Processed 1200/4920 trips...
  Processed 1300/4920 trips...
  Processed 1400/4920 trips...
  Processed 1500/4920 trips...
  Processed 1600/4920 trips...
  Processed 1700/4920 trips...
  Processed 1800/4920 trips...
  Processed 1900/4920 trips...
  Processed 2000/4920 trips...
  Processed 2100/4920 trips...
  Processed 2200/4920 trips...
  Processed 2300/4920 trips...
  Processed 2400/4920 trips...
  Processed 2500/4920 trips...
  Processed 2600/4920 trips...
  Processed 2700/4920 trips...
  Processed 2800/4920 trips...
  Processed 2900/4920 trips...
  Processed 3000/4920 trips...
  Processed 3100/4920 trips.