# In [None]:
# Folium and Data Processing Imports
import os
import numpy as np
import pandas as pd
import osmnx as ox
from fmm import (
    Network,
    NetworkGraph,
    UBODTGenAlgorithm,
    UBODT,
    FastMapMatch,
    FastMapMatchConfig,
    STMATCH,
    STMATCHConfig,
)
import folium
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors  # Import for rgb2hex
from shapely.geometry import Polygon

# Additional Import for JSON handling
import json

# Suppress all Python warnings (this blanket filter is not limited to osmnx)
import warnings
warnings.filterwarnings('ignore')

# Module-level locations of the inputs used by the map-matching step.
# `folder` is the base data directory; the road-network shapefile is expected
# at <folder>/porto/edges.shp and the UBODT table at <folder>/ubodt.txt.
folder = '/content/data'  # Update this to your folder path
network_file_path = os.path.join(folder, "porto", "edges.shp")
ubodt_file_path = os.path.join(folder, "ubodt.txt")


def load_graph(bounds):
    """
    Download the drivable street network enclosed by a rectangular box.

    Args:
        bounds (tuple): Bounding box given as (x1, x2, y1, y2), i.e. the two
            x values first, then the two y values.

    Returns:
        networkx.MultiDiGraph: Graph returned by ``ox.graph_from_polygon``
        for the rectangle, restricted to the 'drive' network type.
    """
    x_a, x_b, y_a, y_b = bounds

    # Walk the rectangle's corners in order so the ring is not self-crossing.
    corners = [
        (x_a, y_a),
        (x_b, y_a),
        (x_b, y_b),
        (x_a, y_b),
    ]
    rectangle = Polygon(corners)
    return ox.graph_from_polygon(rectangle, network_type='drive')


def load_data(file_path, nrows=None):
    """
    Read trajectory records from a CSV file into a DataFrame.

    Args:
        file_path (str): Location of the CSV file.
        nrows (int, optional): Maximum number of rows to read. ``None``
            (the default) reads the whole file.

    Returns:
        pd.DataFrame: The parsed CSV contents.
    """
    frame = pd.read_csv(file_path, nrows=nrows)
    return frame


def clean_trajectory_data(df):
    """
    Convert trajectory data from string to NumPy arrays.

    Column 8 (0-based) of ``df`` is read as a polyline string such as
    ``"[[x1,y1],[x2,y2],...]"`` and replaced, in the returned array, by an
    ``(n, 2)`` float array. Cells that cannot be parsed (empty polyline,
    odd number of values, non-numeric tokens, or a non-string cell such as
    NaN) become the placeholder ``[[0.0, 0.0]]``.

    Args:
        df (pd.DataFrame): DataFrame containing trajectory data. It is not
            modified.

    Returns:
        np.ndarray: Object array with one row per DataFrame row; column 8
        holds the parsed trajectory arrays.
    """

    def parse_polyline(cell):
        """Return the (n, 2) float array for one cell, or the placeholder."""
        # A missing CSV field is read by pandas as a float NaN, not a string.
        if not isinstance(cell, str):
            return np.asarray([[0.0, 0.0]])

        # Drop every bracket instead of slicing fixed offsets, so surrounding
        # whitespace or a different nesting depth cannot cut into the digits.
        tokens = [
            token.strip()
            for token in cell.replace('[', '').replace(']', '').split(',')
        ]
        if len(tokens) < 2:
            return np.asarray([[0.0, 0.0]])

        try:
            return np.asarray(tokens, dtype=float).reshape(-1, 2)
        except ValueError:
            # Non-numeric token or an odd number of values.
            return np.asarray([[0.0, 0.0]])

    # Force an object-dtype copy: cells must be able to hold arrays, and the
    # caller's DataFrame must not be written through a shared buffer.
    train_data = df.to_numpy(dtype=object, copy=True)

    for row in train_data:
        row[8] = parse_polyline(row[8])

    return train_data


def remove_outliers(train_data, threshold_multiplier=5):
    """
    Remove outlying GPS coordinates based on a distance threshold.

    For every trajectory in column 8 the mean distance between consecutive
    points is computed. The trajectory is then scanned once from its first
    point: a point is dropped when its distance to the most recently kept
    point exceeds ``threshold_multiplier`` times that mean. The first point
    is always kept. Trajectories with fewer than two points are untouched.

    Args:
        train_data (np.ndarray): Array containing trajectory data; column 8
            holds one (n, 2) coordinate array per row. Updated in place.
        threshold_multiplier (float, optional): Multiplier for the average distance to determine outliers. Defaults to 5.

    Returns:
        np.ndarray: The same ``train_data`` array, with outliers removed.
    """
    for row in train_data:
        trajectory = row[8]
        num_points = len(trajectory)
        if num_points < 2:
            continue

        points = np.asarray(trajectory)

        # Mean length of the original consecutive segments.
        segment_lengths = np.linalg.norm(np.diff(points[:, :2], axis=0), axis=1)
        limit = threshold_multiplier * segment_lengths.mean()

        # Single greedy pass: collecting kept indices avoids re-allocating the
        # whole array with np.delete for every removed point.
        kept = [0]
        for k in range(1, num_points):
            jump = np.linalg.norm(points[k, :2] - points[kept[-1], :2])
            if jump > limit:
                continue  # too far from the last accepted point: outlier
            kept.append(k)

        row[8] = points[kept]

    return train_data


def save_cleaned_data(train_data, original_df, output_path):
    """
    Write the cleaned trajectories to a CSV that keeps the original columns.

    Args:
        train_data (np.ndarray): Cleaned trajectory data; column 8 holds one
            NumPy array per row.
        original_df (pd.DataFrame): DataFrame the data was loaded from. A copy
            is written; the frame passed in is left as it was.
        output_path (str): Destination path of the cleaned CSV.
    """
    # Serialise each trajectory array as the text of its nested-list form.
    serialized = []
    for trajectory in train_data[:, 8]:
        serialized.append(str(trajectory.tolist()))

    # Work on a copy so every other column is carried over unchanged.
    cleaned_df = original_df.copy()
    cleaned_df.iloc[:, 8] = serialized

    cleaned_df.to_csv(output_path, index=False)
    print(f"Cleaned data saved to {output_path}")


def load_cleaned_data(cleaned_file_path):
    """
    Load the cleaned CSV file and convert trajectory strings back to NumPy arrays.

    The string-to-array conversion of column 8 is delegated to
    ``clean_trajectory_data`` so that the raw and the cleaned CSV files are
    parsed by a single implementation.

    Args:
        cleaned_file_path (str): Path to the cleaned CSV file.

    Returns:
        tuple: (train_data_cleaned as np.ndarray, cleaned_df as pd.DataFrame)
    """
    cleaned_df = pd.read_csv(cleaned_file_path)
    train_data_cleaned = clean_trajectory_data(cleaned_df)
    return train_data_cleaned, cleaned_df


def plot_trajectories_folium(G, train_data, traj_indices, output_path, title='Trajectories'):
    """
    Plot the trajectories using Folium.

    Only trajectories stored as NumPy arrays with at least two points are
    drawn. The map extent and the legend are derived from those drawn
    trajectories alone, so single-point entries (for example a
    ``[[0.0, 0.0]]`` placeholder) cannot stretch the view or add a legend row
    for a line that is not on the map.

    Args:
        G (networkx.MultiDiGraph): The street network graph. Currently not
            used by this function; kept for interface compatibility.
        train_data (np.ndarray): Array containing trajectory data (column 8).
        traj_indices (list): List of indices of trajectories to plot.
        output_path (str): Path to save the Folium HTML map.
        title (str, optional): Title of the plot. Defaults to 'Trajectories'.
            Currently not rendered on the map.

    Raises:
        ValueError: If none of the selected trajectories can be drawn.
    """
    # (position in traj_indices, trajectory index, coordinate array)
    drawable = []
    for position, traj_idx in enumerate(traj_indices):
        traj = train_data[traj_idx, 8]
        if isinstance(traj, np.ndarray) and len(traj) > 1:
            drawable.append((position, traj_idx, traj))

    if not drawable:
        raise ValueError("No valid trajectories to plot.")

    # Map extent from the drawn points only; column 0 is x, column 1 is y.
    all_points = np.vstack([traj[:, :2] for _, _, traj in drawable])
    x_min, y_min = (float(value) for value in all_points.min(axis=0))
    x_max, y_max = (float(value) for value in all_points.max(axis=0))
    map_center = [(y_min + y_max) / 2, (x_min + x_max) / 2]

    # Initialize Folium map and fit it to the data
    folium_map = folium.Map(location=map_center, zoom_start=14, control_scale=True)
    folium_map.fit_bounds([[y_min, x_min], [y_max, x_max]])

    # Colours are indexed by position in traj_indices, so a trip keeps the
    # same colour regardless of which other trips turn out to be drawable.
    color_map = plt.get_cmap('tab10')(np.linspace(0, 1, len(traj_indices)))

    legend_items = []
    for position, traj_idx, traj in drawable:
        color_hex = mcolors.rgb2hex(color_map[position])

        # Folium expects [lat, lon], i.e. the two columns swapped.
        folium.PolyLine(
            traj[:, [1, 0]].tolist(),
            color=color_hex,
            weight=5,
            opacity=0.8,
            tooltip=f'Trip {traj_idx + 1}'
        ).add_to(folium_map)

        # Green marker at the first point, red marker at the last point
        start_point = traj[0]
        folium.Marker(
            location=[start_point[1], start_point[0]],
            icon=folium.Icon(color='green', icon='circle'),
            popup=f"Start of Trip {traj_idx + 1}"
        ).add_to(folium_map)

        end_point = traj[-1]
        folium.Marker(
            location=[end_point[1], end_point[0]],
            icon=folium.Icon(color='red', icon='circle'),
            popup=f"End of Trip {traj_idx + 1}"
        ).add_to(folium_map)

        legend_items.append(
            f'<li><span style="background-color:{color_hex};width:20px;height:5px;display:inline-block;margin-right:5px;"></span> Trip {traj_idx + 1}</li>'
        )

    # Add a legend
    legend_header = '''
     <div style="
     position: fixed;
     bottom: 50px; left: 50px; width: 150px; height: auto;
     background-color: white; z-index:9999; font-size:14px;
     border:2px solid grey;
     padding: 10px;
     ">
         <p style="margin: 0;"><b>Legend</b></p>
         <ul style="list-style: none; padding-left: 0;">
    '''
    legend_html = legend_header + ''.join(legend_items) + '</ul></div>'
    folium_map.get_root().html.add_child(folium.Element(legend_html))

    # Add Layer Control
    folium.LayerControl().add_to(folium_map)

    # Save the map to an HTML file
    folium_map.save(output_path)
    print(f"Interactive Folium map saved to {output_path}")


def prepare_fmm_data(train_data):
    """
    Build the id/geometry table consumed by the map-matching step.

    Each row of ``train_data`` yields one record: ``id`` is taken from
    column 0 and ``geom`` is the trajectory in column 8 written as a WKT
    LINESTRING.

    Args:
        train_data (np.ndarray): Array containing trajectory data.

    Returns:
        pd.DataFrame: One row per input row with columns 'id' and 'geom'.
    """

    def to_wkt(trajectory):
        # Anything that is not a NumPy array gets the fixed default geometry.
        if not isinstance(trajectory, np.ndarray):
            return "LINESTRING(0 0, 0 0)"
        vertices = (f"{point[0]} {point[1]}" for point in trajectory.tolist())
        return "LINESTRING(" + ','.join(vertices) + ")"

    records = [{'id': row[0], 'geom': to_wkt(row[8])} for row in train_data]
    return pd.DataFrame(records)


def map_matching(input_data, network, graph, ubodt, fmm_config):
    """
    Perform Map Matching using FastMapMatch.

    Every geometry in ``input_data['geom']`` is passed to
    ``FastMapMatch.match_wkt``. The returned object is flattened into a dict
    of plain values (lists are JSON-encoded) so the results can be stored in
    a DataFrame or CSV.

    Args:
        input_data (pd.DataFrame): DataFrame containing 'id' and 'geom' columns.
        network (Network): The network data.
        graph (NetworkGraph): The network graph.
        ubodt (UBODT): The UBODT data.
        fmm_config (FastMapMatchConfig): Configuration for FMM.

    Returns:
        list: One entry per input geometry, in input order: a flattened
        result dict, or ``None`` when matching raised or the result object
        had no recognised layout.
    """
    model = FastMapMatch(network, graph, ubodt)
    results = []

    # Fall back to the positional index when no 'id' column is present.
    ids = input_data['id'].values if 'id' in input_data else None

    for idx, geom in enumerate(input_data['geom'].values):
        traj_id = ids[idx] if ids is not None else idx
        try:
            result = model.match_wkt(geom, fmm_config)
            row = _flatten_match_result(result, traj_id, geom)
        except Exception as e:
            # Best effort: one failing trajectory must not abort the batch.
            print(f"Map matching failed for index {idx}: {e}")
            row = None
        else:
            if row is None:
                print(f"No MATCHED_RESULTS in trajectory for index {idx}")

        # Always append so results[i] lines up with input row i.
        results.append(row)

    return results


def _flatten_match_result(result, traj_id, geom):
    """Flatten one match result into a dict, or return None if unrecognised."""
    # Layout originally assumed by this script: a dict-like MATCHED_RESULTS.
    if hasattr(result, "MATCHED_RESULTS"):
        matched_results = result.MATCHED_RESULTS
        return {
            "id": matched_results["id"],
            "ogeom": matched_results["ogeom"],
            "opath": json.dumps(matched_results["opath"]),
            "error": json.dumps(matched_results["error"]),
            "offset": json.dumps(matched_results["offset"]),
            "length": json.dumps(matched_results["length"]),
            "spdist": json.dumps(matched_results["spdist"]),
            "duration": json.dumps(matched_results.get("duration", [])),
            "speed": json.dumps(matched_results.get("speed", [])),
            "pgeom": matched_results["pgeom"],
            "cpath": json.dumps(matched_results["cpath"]),
            "tpath": json.dumps(matched_results["tpath"]),
            "mgeom": matched_results["mgeom"],
            "ep": json.dumps(matched_results["ep"]),
            "tp": json.dumps(matched_results["tp"]),
            "MATCHING_ALGORITHM": matched_results["matching_algorithm"],
            "eid": json.dumps(matched_results["eid"]),
            "source": json.dumps(matched_results["source"]),
            "target": json.dumps(matched_results["target"])
        }

    # NOTE(review): the attribute names below (opath, cpath, mgeom, pgeom,
    # candidates with edge_id/source/target/error/offset/length/spdist/ep/tp)
    # follow the fmm Python example; confirm against the installed version.
    if not hasattr(result, "cpath"):
        return None

    def as_wkt(geometry):
        # Geometry objects are exported to WKT; strings pass through as-is.
        if geometry is None:
            return ""
        return geometry if isinstance(geometry, str) else geometry.export_wkt()

    candidates = list(getattr(result, "candidates", []))

    def per_candidate(attribute):
        return json.dumps([getattr(c, attribute, None) for c in candidates])

    return {
        "id": traj_id,
        "ogeom": geom,
        "opath": json.dumps(list(getattr(result, "opath", []))),
        "error": per_candidate("error"),
        "offset": per_candidate("offset"),
        "length": per_candidate("length"),
        "spdist": per_candidate("spdist"),
        # Not available from this result layout; kept so columns stay stable.
        "duration": json.dumps([]),
        "speed": json.dumps([]),
        "pgeom": as_wkt(getattr(result, "pgeom", None)),
        "cpath": json.dumps(list(result.cpath)),
        "tpath": json.dumps([]),
        "mgeom": as_wkt(getattr(result, "mgeom", None)),
        "ep": per_candidate("ep"),
        "tp": per_candidate("tp"),
        "MATCHING_ALGORITHM": "FMM",
        "eid": per_candidate("edge_id"),
        "source": per_candidate("source"),
        "target": per_candidate("target")
    }


def plot_map_matching_folium(G, results, traj_indices, output_path, title='Map Matching Results'):
    """
    Plot the map-matched trajectories using Folium.

    For each selected index the ``"mgeom"`` entry of the corresponding result
    is parsed as a WKT LINESTRING and drawn as a polyline. Indices that are
    out of range, have no result, or whose geometry yields no coordinates are
    skipped; the legend lists only the trips that were drawn.

    Args:
        G (networkx.MultiDiGraph): The street network graph. Currently not
            used by this function; kept for interface compatibility.
        results (list): List of map matching results (dicts or None).
        traj_indices (list): List of trajectory indices to plot.
        output_path (str): Path to save the Folium HTML map.
        title (str, optional): Title of the plot. Defaults to 'Map Matching Results'.
            Currently not rendered on the map.

    Raises:
        ValueError: If no selected result yields any coordinate.
    """

    def parse_linestring(wkt):
        """Return [[lat, lon], ...] for a LINESTRING WKT string."""
        # Take the text between the outer parentheses rather than stripping
        # an exact 'LINESTRING(' prefix, so 'LINESTRING (' parses as well.
        opening = wkt.find('(')
        closing = wkt.rfind(')')
        if opening == -1 or closing <= opening:
            return []

        points = []
        for token in wkt[opening + 1:closing].split(','):
            try:
                lon, lat = map(float, token.split())
            except ValueError:
                print(f"Skipping invalid coordinate: {token.strip()}")
                continue
            points.append([lat, lon])
        return points

    # (position in traj_indices, trajectory index, [[lat, lon], ...])
    matched = []
    for position, traj_idx in enumerate(traj_indices):
        if traj_idx >= len(results) or not results[traj_idx]:
            continue
        mgeom = results[traj_idx].get("mgeom")
        if not isinstance(mgeom, str) or not mgeom.strip().upper().startswith('LINESTRING'):
            continue
        traj_coords = parse_linestring(mgeom)
        if traj_coords:
            matched.append((position, traj_idx, traj_coords))

    if not matched:
        raise ValueError("No valid map-matched trajectories to plot.")

    all_lats = [lat for _, _, coords in matched for lat, _ in coords]
    all_lons = [lon for _, _, coords in matched for _, lon in coords]
    x_min, x_max = min(all_lons), max(all_lons)
    y_min, y_max = min(all_lats), max(all_lats)
    map_center = [(y_min + y_max) / 2, (x_min + x_max) / 2]

    # Initialize Folium map and fit it to the data
    folium_map = folium.Map(location=map_center, zoom_start=14, control_scale=True)
    folium_map.fit_bounds([[y_min, x_min], [y_max, x_max]])

    # Colours are indexed by position in traj_indices, so a trip keeps the
    # same colour regardless of which other trips were matched.
    color_map = plt.get_cmap('tab10')(np.linspace(0, 1, len(traj_indices)))

    legend_items = []
    for position, traj_idx, traj_coords in matched:
        color_hex = mcolors.rgb2hex(color_map[position])
        folium.PolyLine(
            traj_coords,
            color=color_hex,
            weight=5,
            opacity=0.8,
            tooltip=f'Match Trip {traj_idx + 1}'
        ).add_to(folium_map)
        legend_items.append(
            f'<li><span style="background-color:{color_hex};width:20px;height:5px;display:inline-block;margin-right:5px;"></span> Match Trip {traj_idx + 1}</li>'
        )

    # Add a legend
    legend_header = '''
     <div style="
     position: fixed;
     bottom: 50px; left: 50px; width: 150px; height: auto;
     background-color: white; z-index:9999; font-size:14px;
     border:2px solid grey;
     padding: 10px;
     ">
         <p style="margin: 0;"><b>Legend</b></p>
         <ul style="list-style: none; padding-left: 0;">
    '''
    legend_html = legend_header + ''.join(legend_items) + '</ul></div>'
    folium_map.get_root().html.add_child(folium.Element(legend_html))

    # Add Layer Control
    folium.LayerControl().add_to(folium_map)

    # Save the map to an HTML file
    folium_map.save(output_path)
    print(f"Interactive Folium map with map matching saved to {output_path}")


def generate_ubodt(network_file_path, ubodt_file_path):
    """
    Load the road network and its UBODT table, building the table if missing.

    Args:
        network_file_path (str): Path to the network shapefile.
        ubodt_file_path (str): Path of the UBODT file to read, or to create
            when it does not exist yet.

    Returns:
        tuple: (ubodt, network, graph)

    Raises:
        FileNotFoundError: If ``network_file_path`` does not exist.
    """
    # Fail early, before any fmm object is constructed.
    if not os.path.exists(network_file_path):
        raise FileNotFoundError(f"Network file not found: {network_file_path}")

    # The shapefile is read with "fid" as edge id and "u"/"v" as end nodes.
    network = Network(network_file_path, "fid", "u", "v")
    graph = NetworkGraph(network)

    already_built = os.path.isfile(ubodt_file_path)

    if not already_built:
        print("Generating UBODT file...")
        generator = UBODTGenAlgorithm(network, graph)
        status = generator.generate_ubodt(ubodt_file_path, 0.03, binary=False, use_omp=True)
        print(f"UBODT Generation Status: {status}")

    # Whether pre-existing or freshly generated, the table is read from disk.
    ubodt = UBODT.read_ubodt_csv(ubodt_file_path)

    if already_built:
        print("Read the UBODT file")
    else:
        print("UBODT file generated and loaded.")

    return ubodt, network, graph


def main():
    """
    Main function to execute the data cleaning and FastMapMatch process.

    Steps, in order: load the network/UBODT, download the street graph, load
    and parse the trajectories, plot them before and after outlier removal,
    write the cleaned CSV and read it back, map-match every trajectory, save
    the matched results and plot them.
    """
    # Define bounding box: (x1, x2, y1, y2), each pair in increasing order
    bounds = (-8.70, -8.57, 41.13, 41.19)

    # Define file paths
    train_file = os.path.join(folder, "train-1500.csv")
    cleaned_train_file = os.path.join(folder, "train-1500-cleaned.csv")  # Path for cleaned data

    # Write outputs next to the inputs so that changing `folder` is enough to
    # relocate the whole run; exist_ok avoids a check-then-create race.
    output_dir = folder
    os.makedirs(output_dir, exist_ok=True)

    output_file = os.path.join(output_dir, 'Trajectories_Before_Outlier_Removal_Folium.html')
    outlier_file = os.path.join(output_dir, 'Trajectories_After_Outlier_Removal_Folium.html')
    fmm_output_file = os.path.join(output_dir, 'Trajectories_After_FMM_Folium.html')
    trips_csv = os.path.join(output_dir, 'trips2.csv')

    # Generate UBODT and get network and graph
    ubodt, network, graph = generate_ubodt(network_file_path, ubodt_file_path)

    # Load graph and data
    G = load_graph(bounds)
    df = load_data(train_file, nrows=1500)  # Load first 1500 rows of the dataset

    # Clean trajectory data
    train_data = clean_trajectory_data(df)

    # Plot the first 15 trajectories, or fewer when the file is shorter, so
    # the plotting functions never index past the end of the data.
    traj_indices = list(range(min(15, len(train_data))))

    # Plot initial trips before outlier removal
    plot_trajectories_folium(
        G,
        train_data,
        traj_indices,
        output_file,
        title='Top 15 Trajectories Before Outlier Removal'
    )

    # Remove outliers
    train_data = remove_outliers(train_data)

    # Plot trips after removing outliers
    plot_trajectories_folium(
        G,
        train_data,
        traj_indices,
        outlier_file,
        title='Top 15 Trajectories After Outlier Removal'
    )

    # Save the cleaned data to a new CSV file
    save_cleaned_data(train_data, df, cleaned_train_file)

    # Load the cleaned data for FMM
    train_data_cleaned, clean_df = load_cleaned_data(cleaned_train_file)

    # Prepare data for FMM
    input_data = prepare_fmm_data(train_data_cleaned)
    input_data.to_csv(trips_csv, index=False, sep=';')
    print(f"Prepared FMM data saved to {trips_csv}")

    # Map Matching with FMM
    fmm_config = FastMapMatchConfig(16, 0.005, 0.0005)  # Adjust parameters as needed
    results = map_matching(input_data, network, graph, ubodt, fmm_config)
    print("Map matching completed.")

    # Convert results to DataFrame, excluding None entries
    matched_df = pd.DataFrame([res for res in results if res is not None])

    # Save matched results to a CSV file
    matched_results_file = os.path.join(output_dir, 'matched_results.csv')
    matched_df.to_csv(matched_results_file, index=False)
    print(f"Matched results saved to {matched_results_file}")

    # Plot Map Matching results
    plot_map_matching_folium(
        G,
        results,
        traj_indices,
        fmm_output_file,
        title='Top 15 Map Matching Results'
    )


if __name__ == "__main__":
    main()
