In [1]:
import os
import pandas as pd
import folium
import math
import matplotlib.pyplot as plt
import numpy as np
import matplotlib.cm as cm
from shapely.geometry import LineString

# Set up file paths and results directory
train_file = 'data/train-1500.csv'
map_file = 'data/map_matching_1500.csv'
output_dir = 'data/results_task6'
os.makedirs(output_dir, exist_ok=True)

# Utility function for distance calculation
def euclidean_distance(point_a, point_b):
    return math.hypot(point_a[0] - point_b[0], point_a[1] - point_b[1])

# Function to load CSV data
def read_csv_data(filepath):
    return pd.read_csv(filepath)

# Parse LINESTRING format into coordinates
def parse_linestring(geometry_str):
    if geometry_str.startswith("LINESTRING"):
        coordinates = geometry_str.replace("LINESTRING(", "").replace(")", "").split(",")
        return LineString([(float(coord.split()[0]), float(coord.split()[1])) for coord in coordinates])
    return None

# Calculate plot boundaries for folium maps
def determine_boundaries(boundary_data, lon, lat):
    boundary_data["min_lon"] = min(boundary_data["min_lon"], lon)
    boundary_data["max_lon"] = max(boundary_data["max_lon"], lon)
    boundary_data["min_lat"] = min(boundary_data["min_lat"], lat)
    boundary_data["max_lat"] = max(boundary_data["max_lat"], lat)

# Create a histogram of consecutive distances
def plot_distance_histogram(data, filepath):
    plt.hist(data, bins=20, color='steelblue', edgecolor='black', range=(0, 0.02))
    plt.title("Distance Histogram")
    plt.xlabel("Distance")
    plt.ylabel("Frequency")
    plt.savefig(filepath, dpi=240)
    plt.close()

# Process data for consecutive distance calculations
def calculate_consecutive_distances(data, sample_size=1500):
    distances = []
    for i in range(min(sample_size, len(data))):
        points = eval(data['POLYLINE'][i])
        distances.extend([euclidean_distance(points[j], points[j+1]) for j in range(len(points)-1)])
    return distances

# Filter closely spaced points within trajectories
def clean_trajectories(data, min_threshold=0.01, max_threshold=0.02, step=0.002):
    for idx, row in data.iterrows():
        points = eval(row['POLYLINE'])
        if len(points) < 2:
            continue

        filtered_points = [points[0]]
        last_point = points[0]
        threshold = min_threshold

        for point in points[1:]:
            if euclidean_distance(last_point, point) >= threshold:
                filtered_points.append(point)
                last_point = point
                threshold = min_threshold
            else:
                threshold = min(threshold + step, max_threshold)
        
        data.at[idx, 'POLYLINE'] = str(filtered_points)

# Save cleaned data to CSV
def save_data(data, filepath):
    data.to_csv(filepath, index=False)

# Generate folium maps for individual trajectories
def plot_folium_trajectories(data, num_plots=15):
    for i in range(num_plots):
        points = eval(data['POLYLINE'][i])
        fmap = folium.Map(location=points[0], zoom_start=13.5)
        for point in points:
            folium.CircleMarker(location=point, radius=1, color='blue', fill=True).add_to(fmap)
        fmap.save(os.path.join(output_dir, f'task6_plot_trajectory_{i}.html'))

# Visualize the first 15 routes on a static map
def plot_first_15_routes(filepath):
    df = read_csv_data(filepath)
    df['geometry'] = df['mgeom'].apply(parse_linestring)
    df.dropna(subset=['geometry'], inplace=True)

    if df.empty:
        print("No valid routes found.")
        return

    # Initialize boundaries
    boundary_data = {"min_lon": float('inf'), "max_lon": float('-inf'),
                     "min_lat": float('inf'), "max_lat": float('-inf')}
    colors = cm.jet(np.linspace(0, 1, min(15, len(df))))

    # Set up plot
    fig, ax = plt.subplots(figsize=(10, 10))
    ax.set_facecolor('black')

    for i, route in enumerate(df['geometry'].head(15)):
        x, y = route.xy
        ax.plot(x, y, color=colors[i], linewidth=2)
        ax.scatter(x[0], y[0], color='green', marker='o', s=50)
        ax.scatter(x[-1], y[-1], color='red', marker='X', s=50)
        determine_boundaries(boundary_data, x[0], y[0])
        determine_boundaries(boundary_data, x[-1], y[-1])

    ax.set_xlim(boundary_data["min_lon"] - 0.01, boundary_data["max_lon"] + 0.01)
    ax.set_ylim(boundary_data["min_lat"] - 0.01, boundary_data["max_lat"] + 0.01)
    ax.axis('off')
    ax.set_title("Top 15 Routes with Start and End Points", color='white', fontsize=14)
    plt.savefig(os.path.join(output_dir, 'task6_plot_Mapped_Trajectories_of_first_15_trips.png'), format='png', dpi=300)
    plt.show()

# Run the processing pipeline
train_data = read_csv_data(train_file)
consecutive_distances = calculate_consecutive_distances(train_data)
plot_distance_histogram(consecutive_distances, os.path.join(output_dir, 'task6_plot_distance_histogram.png'))
clean_trajectories(train_data)
save_data(train_data, os.path.join(output_dir, 'modified_train-1500.csv'))
plot_folium_trajectories(train_data)
plot_first_15_routes(map_file)


IndexError: list index out of range