In [2]:
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import haversine as hs   
from haversine import Unit
import folium

## Clean Data

In [2]:
# Default thresholds used by remove_outliers to discard implausible trips.
MIN_TRIP_DURATION = 2 * 60  # seconds (2 minutes)
MAX_TRIP_DURATION = 2 * 60 * 60  # seconds (2 hours)
MIN_TRIP_DISTANCE = 1000  # meters, total path length
MAX_DISTANCE_BETWEEN_POINTS = 1000  # meters, between consecutive points

In [3]:
def convert_coordinates(string):
    """
    Decode a JSON-encoded list of coordinate pairs into (latitude, longitude) tuples.

    The competition datasets store every pair longitude-first; the two values
    are swapped here so the result follows the usual latitude-first order.

    Returns a list of 2-tuples, one per pair, in the original order.
    """
    swapped = []
    for longitude, latitude in json.loads(string):
        swapped.append((latitude, longitude))
    return swapped

In [4]:
def calculate_trip_distance(coords):
    """Return the total length of a trip, in meters.

    The length is the sum of the haversine distances between every pair of
    consecutive points in ``coords``. With fewer than two points there is
    nothing to measure and the result is 0.
    """
    consecutive_pairs = zip(coords, coords[1:])
    return sum(
        hs.haversine(start, end, unit=Unit.METERS)
        for start, end in consecutive_pairs
    )

In [5]:
def calculate_max_distance(coords):
    """Return the largest gap, in meters, between two consecutive trip points.

    Each pair of neighbouring points in ``coords`` is measured with the
    haversine formula and the largest of those distances is returned.

    A trip with fewer than two points has no consecutive pair; in that case
    0.0 is returned instead of letting ``max()`` raise ``ValueError`` on an
    empty sequence.
    """
    return max(
        (
            hs.haversine(start, end, unit=Unit.METERS)
            for start, end in zip(coords, coords[1:])
        ),
        default=0.0,
    )

In [6]:
def remove_outliers(
    df,
    min_duration=MIN_TRIP_DURATION,
    max_duration=MAX_TRIP_DURATION,
    min_distance=MIN_TRIP_DISTANCE,
    max_distance_between_points=MAX_DISTANCE_BETWEEN_POINTS,
    bounds=((41.052431, -8.727951), (41.257678, -8.456039)),
):
    """
    Remove some outliers that could otherwise undermine the training's results.

    A trip is kept only when it passes every one of these checks:

    * ``min_duration < TRIP_DURATION <= max_duration``
    * its bounding box (MIN_LAT/MAX_LAT/MIN_LONG/MAX_LONG) lies strictly
      inside ``bounds``
    * ``TRIP_DISTANCE > min_distance``
    * ``MAX_DISTANCE_BETWEEN_POINTS < max_distance_between_points``

    Parameters
    ----------
    df : DataFrame with the columns named above.
    min_duration, max_duration : duration limits, same unit as TRIP_DURATION.
    min_distance : minimum total trip distance, same unit as TRIP_DISTANCE.
    max_distance_between_points : upper limit for the largest gap between two
        consecutive points.
    bounds : ``((min_lat, min_long), (max_lat, max_long))``. The default is the
        area around Porto, retrieved using http://boundingbox.klokantech.com.

    Returns
    -------
    A new DataFrame holding the surviving rows in their original order. It is
    an independent copy, so columns can be added to it without touching ``df``.
    """
    (min_lat, min_long), (max_lat, max_long) = bounds

    # Trips that are extremely long or short (potentially due to GPS recording issue)
    duration_ok = (df["TRIP_DURATION"] > min_duration) & (df["TRIP_DURATION"] <= max_duration)

    # Trips that stray outside the bounding box (also likely due to GPS issues)
    inside_bounds = (
        (df["MIN_LAT"] > min_lat) & (df["MAX_LAT"] < max_lat)
        & (df["MIN_LONG"] > min_long) & (df["MAX_LONG"] < max_long)
    )

    # Trips that cover too little ground (likely due to GPS issues)
    distance_ok = df["TRIP_DISTANCE"] > min_distance

    # Trips that have two consecutive points that are too far away from each other
    no_jumps = df["MAX_DISTANCE_BETWEEN_POINTS"] < max_distance_between_points

    # Every check is row-wise, so one combined mask selects the same rows as
    # filtering step by step. Converting it to a plain array keeps the
    # selection positional, independent of the index labels.
    keep = (duration_ok & inside_bounds & distance_ok & no_jumps).to_numpy()
    return df.loc[keep].copy()

In [7]:
# Load the raw training data.
# POLYLINE is left as the raw string stored in the CSV; a later cell decodes it
# with convert_coordinates.
df = pd.read_csv(
    "../data/train.csv",
    # Disabled alternative: decode POLYLINE while reading the file.
    # converters={"POLYLINE": lambda x: json.loads(x)},
)
# Disabled: uncomment to work on a reproducible 10% subsample instead of the full file.
# df = df.sample(frac=0.1, random_state=42)

In [8]:
# Remove trips with fewer than 4 coordinates.
# POLYLINE is still the raw JSON string at this point, so it has to be decoded
# before counting: len() of the string itself counts characters, not points.
# .copy() makes the filtered frame independent, so assigning columns to it
# later does not trigger pandas' SettingWithCopyWarning.
df = df[df["POLYLINE"].apply(lambda raw: len(json.loads(raw)) > 3)].copy()

In [9]:
# Decode every POLYLINE string into a list of (lat, long) tuples.
# Not idempotent: convert_coordinates hands each value to json.loads, so
# re-running this cell on already-decoded lists fails.
df["POLYLINE"] = df["POLYLINE"].apply(convert_coordinates)

In [10]:
# Number of recorded points in each trip.
df["TRIP_LENGTH"] = df["POLYLINE"].apply(len)
# Duration estimate: 15 per recorded point (compared later against the
# thresholds that are declared in seconds).
# NOTE(review): presumably points are sampled every 15 s — if so, n points span
# (n - 1) * 15 s rather than n * 15 s; confirm which definition is intended.
df["TRIP_DURATION"] = df["TRIP_LENGTH"] * 15

In [11]:
# Total path length of each trip, in meters.
df["TRIP_DISTANCE"] = df["POLYLINE"].apply(calculate_trip_distance)

# Remove trips with 0 distance. This has to happen before the max-gap column is
# computed: a trip with a positive distance is guaranteed to have at least two
# points, which is what the consecutive-point computation needs.
# .copy() makes the filtered frame independent so the column assignment below
# does not trigger pandas' SettingWithCopyWarning.
df = df[df["TRIP_DISTANCE"] > 0].copy()

# Largest distance, in meters, between two consecutive points of each trip.
df["MAX_DISTANCE_BETWEEN_POINTS"] = df["POLYLINE"].apply(calculate_max_distance)

In [12]:
# Per-trip bounding box: smallest and largest latitude / longitude visited.
# remove_outliers reads these four columns for its geographic check.
df["MIN_LAT"] = df["POLYLINE"].apply(lambda points: min(lat for lat, _ in points))
df["MAX_LAT"] = df["POLYLINE"].apply(lambda points: max(lat for lat, _ in points))
df["MIN_LONG"] = df["POLYLINE"].apply(lambda points: min(lon for _, lon in points))
df["MAX_LONG"] = df["POLYLINE"].apply(lambda points: max(lon for _, lon in points))

# Apply all outlier filters with their default thresholds.
df = remove_outliers(df)

In [13]:
# First and last (lat, long) point of each trip.
df["START_POSITION"] = df["POLYLINE"].apply(lambda points: points[0])
df["END_POSITION"] = df["POLYLINE"].apply(lambda points: points[-1])

In [14]:
# Keep only the columns listed here, in this order; every other column
# (including the helper columns used for outlier filtering) is dropped.
df = df[
    [
        'TRIP_ID',
        'TIMESTAMP',
        'TRIP_DURATION',
        'TRIP_LENGTH',
        'TRIP_DISTANCE',
        'START_POSITION',
        'END_POSITION',
        'POLYLINE',
    ]
]

In [15]:
# (rows, columns) remaining after cleaning and column selection.
df.shape

(157542, 8)

In [16]:
# Preview five random rows (no random_state is set, so the rows shown change on every run).
df.sample(5)

Unnamed: 0,TRIP_ID,TIMESTAMP,TRIP_DURATION,TRIP_LENGTH,TRIP_DISTANCE,START_POSITION,END_POSITION,POLYLINE
341109,1379275625620000153,1379275625,480,32,2465.178624,"(41.150088, -8.62092)","(41.138469, -8.610651)","[(41.150088, -8.62092), (41.150214, -8.620902)..."
1295932,1397030593620000174,1397030593,540,36,2904.774752,"(41.149143, -8.599257)","(41.150484, -8.627877)","[(41.149143, -8.599257), (41.148927, -8.598906..."
611286,1383931847620000074,1383931847,1110,74,3519.244823,"(41.148036, -8.619795)","(41.14944, -8.599869)","[(41.148036, -8.619795), (41.148027, -8.619804..."
1038247,1392185293620000454,1392185293,165,11,1349.137435,"(41.149341, -8.6112)","(41.160267, -8.608779)","[(41.149341, -8.6112), (41.149332, -8.611245),..."
426258,1380713223620000421,1380713223,600,40,4503.157047,"(41.16213, -8.649855)","(41.145777, -8.622594)","[(41.16213, -8.649855), (41.161833, -8.649954)..."


In [17]:
# Plot trips on map

def plot_trips(df, n=100, lat_range=(41.052431, 41.257678), lon_range=(-8.727951, -8.456039), random_state=None):
    """Draw a random sample of trips as polylines on a folium map.

    Parameters
    ----------
    df : DataFrame with a POLYLINE column holding sequences of (lat, long) points.
    n : maximum number of trips to draw. When ``df`` has fewer than ``n`` rows,
        all of them are drawn instead of ``DataFrame.sample`` raising.
    lat_range, lon_range : (min, max) pairs; the map is centred on the middle
        of each range.
    random_state : forwarded to ``DataFrame.sample``. Pass a fixed value to get
        the same trips on every call; the default ``None`` keeps the sample random.

    Returns
    -------
    The ``folium.Map`` with one semi-transparent blue line per sampled trip.
    """
    m = folium.Map(
        location=[np.mean(lat_range), np.mean(lon_range)],
        zoom_start=12,
        tiles="cartodbpositron",
    )

    # Never ask for more rows than exist: sampling without replacement fails otherwise.
    sample_size = min(n, len(df))
    sampled = df.sample(sample_size, random_state=random_state)

    for polyline in sampled["POLYLINE"]:
        folium.PolyLine(
            locations=[[p[0], p[1]] for p in polyline],
            color="blue",
            weight=3,
            opacity=0.3,
        ).add_to(m)

    return m

In [18]:
# Render a map with a random sample of 1000 cleaned trips.
plot_trips(df, n=1000)

In [19]:
# Export the cleaned trips, without the DataFrame index.
# NOTE(review): POLYLINE, START_POSITION and END_POSITION hold Python
# lists/tuples at this point, so they are presumably written as their string
# representation — confirm how downstream readers parse them back.
df.to_csv("../data/train_cleaned.csv", index=False)