# In [155]:
import numpy as np
import pandas as pd
import osmnx as ox
import geopandas as gpd
from shapely.geometry import Point, LineString, box
import networkx as nx
from dataclasses import dataclass
import ast
import rtree
import logging
from typing import List, Dict, Tuple, Optional
from math import pi
import folium
from folium.plugins import MarkerCluster
from folium.features import DivIcon
from branca.colormap import LinearColormap
import os
from tqdm import tqdm

# Module-level tuning constants.
# NOTE(review): none of these four names is referenced by the code visible in this
# part of the file -- EnhancedViterbiMatcher reads its own `config` dict, whose
# defaults differ (sigma_z=50.0, max_distance=1000.0). Confirm they are still needed.
SIGMA_Z = 15.0  # presumably the GPS-noise sigma of an emission model -- TODO confirm units/usage
MAX_DISTANCE = 50.0  # presumably a candidate search radius -- TODO confirm units/usage
TURN_ANGLE_THRESHOLD = pi / 4  # 45 degrees, expressed in radians
MIN_TRANSITION_PROB = 1e-5  # small non-zero value; presumably a floor for transition probabilities

class EnhancedViterbiMatcher:
    """Viterbi (hidden-Markov-style) map matcher over a road-edge GeoDataFrame.

    Each observation point gets a list of candidate edges from an R-tree
    lookup; emission scores depend on the point-to-edge distance, transition
    scores on edge connectivity and the turn angle between consecutive edges.
    The highest-scoring edge sequence is recovered by backtracking.

    Edge IDs used throughout (candidates, returned paths) are the positional
    row numbers of ``edges_gdf`` after the index reset done in ``__init__``.
    Point coordinates must be in the same CRS as ``edges_gdf``; distances in
    the configuration are expressed in that CRS's units.
    """

    def __init__(self, graph, edges_gdf, config=None):
        """Store the network, merge ``config`` into the defaults and build lookups.

        Args:
            graph: Road network graph; stored on ``self.graph`` and not read
                by any method of this class.
            edges_gdf: GeoDataFrame with a ``geometry`` column of line
                geometries. It is copied and re-indexed to ``0..n-1``.
            config: Optional dict overriding any key of the default
                configuration below.
        """
        # The logger must exist before the helpers below run: they log in
        # their error handlers.
        self.logger = logging.getLogger(__name__)

        self.graph = graph
        self.edges_gdf = edges_gdf.copy()

        if isinstance(self.edges_gdf.index, pd.MultiIndex):
            self.edges_gdf = self.edges_gdf.reset_index(drop=True)
        self.edges_gdf.index = range(len(self.edges_gdf))

        # Enhanced default configuration
        default_config = {
            'max_candidates': 20,          # Increased from 10
            'max_distance': 1000.0,         # Increased from 50.0
            'sigma_z': 50.0,              # Adjusted for better GPS noise handling
            'beta': 2.0,                  # Increased for better transition scoring
            'min_prob_norm': 1e-7,        # Lowered for more flexibility
            'max_speed': 50.0,            # Maximum expected speed (m/s)
            'min_speed': 0.1,             # Minimum expected speed (m/s)
            'angle_tolerance': np.pi/2,    # 90 degrees angle tolerance
            'max_angle_penalty': 0.5,      # Maximum penalty for sharp turns
            'distance_decay': 0.85,        # Distance decay factor
            'sequential_matching': True    # Enable sequential matching for long trajectories
        }

        if config:
            default_config.update(config)
        self.config = default_config

        self._init_spatial_index()
        self.edge_to_nodes = self._build_edge_to_nodes()
        self.node_to_edges = self._build_node_to_edges()
        self.edge_directions = self._build_edge_directions()

    def _init_spatial_index(self):
        """Build the R-tree over the bounding boxes of all non-empty edges.

        Raises:
            Exception: any failure is logged and re-raised unchanged.
        """
        try:
            self.spatial_index = rtree.index.Index()
            for idx, geom in self.edges_gdf['geometry'].items():
                if geom is not None and not geom.is_empty:
                    self.spatial_index.insert(idx, geom.bounds)
        except Exception as e:
            self.logger.error(f"Error initializing spatial index: {str(e)}")
            raise

    def _build_edge_to_nodes(self) -> Dict[int, set]:
        """Map each edge ID to the set of its two (rounded) endpoint coordinates.

        Edges with a missing/empty geometry or fewer than two vertices are
        left out of the mapping.
        """
        edge_to_nodes = {}
        for idx, geom in self.edges_gdf['geometry'].items():
            if geom is None or geom.is_empty:
                continue
            coords = list(geom.coords)
            if len(coords) >= 2:  # Ensure valid linestring
                edge_to_nodes[idx] = {
                    self._get_node_id(coords[0]),
                    self._get_node_id(coords[-1])
                }
        return edge_to_nodes

    def _build_node_to_edges(self) -> Dict[tuple, set]:
        """Invert ``self.edge_to_nodes``: endpoint coordinate -> set of edge IDs."""
        node_to_edges = {}
        for edge_id, nodes in self.edge_to_nodes.items():
            for node in nodes:
                node_to_edges.setdefault(node, set()).add(edge_id)
        return node_to_edges

    def _build_edge_directions(self) -> Dict[int, np.ndarray]:
        """Map each edge ID to its first-vertex -> last-vertex (x, y) vector.

        The same validity rules as ``_build_edge_to_nodes`` apply. Vectors are
        not normalised; a closed edge yields a zero vector.
        """
        directions = {}
        for idx, geom in self.edges_gdf['geometry'].items():
            if geom is None or geom.is_empty:
                continue
            coords = list(geom.coords)
            if len(coords) >= 2:
                start = np.asarray(coords[0][:2], dtype=float)
                end = np.asarray(coords[-1][:2], dtype=float)
                directions[idx] = end - start
        return directions

    def _get_node_id(self, coord: tuple) -> tuple:
        """Return ``coord`` with every component rounded to 6 decimals."""
        return tuple(round(x, 6) for x in coord)

    def _find_candidates(self, point: Point) -> List[dict]:
        """Return the nearest candidate edges for ``point``.

        The search starts with a radius of 30 and grows by a factor of 1.5 for
        at most three attempts, stopping at the first radius that yields any
        candidate. The radius never exceeds ``config['max_distance']``.

        Returns:
            Up to ``config['max_candidates']`` dicts sorted by distance, each
            with the keys ``edge_id``, ``distance``, ``proj_point`` and
            ``edge`` (the edge row). Empty when nothing is within reach.
        """
        candidates = []
        max_radius = self.config['max_distance']
        current_distance = min(30.0, max_radius)  # Start with a reduced search radius
        max_attempts = 3

        for _ in range(max_attempts):
            bounds = (
                point.x - current_distance,
                point.y - current_distance,
                point.x + current_distance,
                point.y + current_distance
            )

            for idx in self.spatial_index.intersection(bounds):
                edge = self.edges_gdf.loc[idx]
                if edge.geometry is None:
                    continue
                dist = point.distance(edge.geometry)
                if dist <= current_distance:
                    proj_point = edge.geometry.interpolate(
                        edge.geometry.project(point)
                    )
                    candidates.append({
                        'edge_id': idx,
                        'distance': dist,
                        'proj_point': proj_point,
                        'edge': edge
                    })

            # Stop once something was found or the radius cannot grow further.
            if candidates or current_distance >= max_radius:
                break

            current_distance = min(current_distance * 1.5, max_radius)

        candidates.sort(key=lambda x: x['distance'])
        return candidates[:self.config['max_candidates']]

    def _calculate_emission_prob(self, point: Point, candidate: dict) -> float:
        """Score how well ``candidate`` explains an observation.

        The score is a Gaussian in the candidate distance (``sigma_z``)
        multiplied by ``exp(-distance * distance_decay)``, floored at
        ``min_prob_norm``. ``point`` itself is not read; only
        ``candidate['distance']`` is used.
        """
        distance = candidate['distance']
        sigma_z = self.config['sigma_z']

        # Distance-based probability with decay
        distance_factor = np.exp(-distance * self.config['distance_decay'])

        # Gaussian probability
        gaussian_prob = np.exp(-0.5 * (distance / sigma_z) ** 2)

        prob = gaussian_prob * distance_factor

        return max(prob, self.config['min_prob_norm'])

    def _calculate_transition_prob(self, prev_edge: int, curr_edge: int,
                                 prev_point: Point, curr_point: Point) -> float:
        """Score the move from ``prev_edge`` to ``curr_edge``.

        The score is ``connectivity * angle`` floored at ``min_prob_norm``:

        * connectivity is 1.0 when the edges share an endpoint, else 0.3;
        * angle starts at 1.0 and drops linearly with the angle between the
          two edges' direction vectors, never below
          ``1 - max_angle_penalty``.

        The directions come from the edge geometries. The previous code
        derived them from the observation points, which have a single
        coordinate, so the angle term was always 1.0. ``prev_point`` and
        ``curr_point`` are kept for interface compatibility and not read.

        Raises:
            KeyError: if either edge has no entry in ``self.edge_to_nodes``.
        """
        prev_nodes = self.edge_to_nodes[prev_edge]
        curr_nodes = self.edge_to_nodes[curr_edge]

        connected = not prev_nodes.isdisjoint(curr_nodes)
        connectivity_score = 1.0 if connected else 0.3

        dir1 = self.edge_directions.get(prev_edge)
        dir2 = self.edge_directions.get(curr_edge)

        norm1 = 0.0 if dir1 is None else np.linalg.norm(dir1)
        norm2 = 0.0 if dir2 is None else np.linalg.norm(dir2)

        if norm1 == 0 or norm2 == 0:
            # No usable direction (e.g. closed edge): apply no angle penalty.
            angle_score = 1.0
        else:
            cos_angle = np.dot(dir1, dir2) / (norm1 * norm2)
            # Clip guards arccos against rounding just outside [-1, 1].
            angle = np.arccos(np.clip(cos_angle, -1.0, 1.0))

            angle_score = 1.0 - (angle / self.config['angle_tolerance']) * self.config['max_angle_penalty']
            angle_score = max(angle_score, 1.0 - self.config['max_angle_penalty'])

        prob = connectivity_score * angle_score
        return max(prob, self.config['min_prob_norm'])

    def _viterbi_matching(self, points: List[Point], candidates_by_point: List[List[dict]]) -> List[Dict]:
        """Run the Viterbi forward pass in log space.

        Args:
            points: Observation points, one per time step.
            candidates_by_point: Candidate list per point, as returned by
                ``_find_candidates``.

        Returns:
            One dict per time step mapping edge ID to a state with the keys
            ``log_prob``, ``prev`` (best predecessor edge or ``None``),
            ``emission`` and ``transition``. States of the last step also get
            a ``confidence`` value: their probabilities normalised to sum to 1.
        """
        n_points = len(points)
        states = [{} for _ in range(n_points)]

        # Initialize first state
        for candidate in candidates_by_point[0]:
            edge_id = candidate['edge_id']
            log_emission = np.log(self._calculate_emission_prob(points[0], candidate))
            states[0][edge_id] = {
                'log_prob': log_emission,
                'prev': None,
                'emission': log_emission,
                'transition': 0.0
            }

        # Forward pass with log probabilities
        for t in range(1, n_points):
            for candidate in candidates_by_point[t]:
                curr_edge = candidate['edge_id']
                log_emission = np.log(self._calculate_emission_prob(points[t], candidate))

                best_log_prob = float('-inf')
                best_prev = None
                best_transition = None

                for prev_edge, prev_state in states[t-1].items():
                    trans_prob = self._calculate_transition_prob(
                        prev_edge, curr_edge, points[t-1], points[t]
                    )
                    log_transition = np.log(trans_prob)

                    log_prob = prev_state['log_prob'] + log_transition + log_emission

                    if log_prob > best_log_prob:
                        best_log_prob = log_prob
                        best_prev = prev_edge
                        best_transition = log_transition

                # A state is only recorded when a predecessor exists.
                if best_prev is not None:
                    states[t][curr_edge] = {
                        'log_prob': best_log_prob,
                        'prev': best_prev,
                        'emission': log_emission,
                        'transition': best_transition
                    }

        # Convert log probabilities to normalized confidence scores
        if states[-1]:
            log_probs = np.array([state['log_prob'] for state in states[-1].values()])
            # Subtracting the maximum keeps exp() from underflowing to all zeros.
            normalized_probs = np.exp(log_probs - np.max(log_probs))
            normalized_probs /= np.sum(normalized_probs)

            for edge_id, norm_prob in zip(states[-1].keys(), normalized_probs):
                states[-1][edge_id]['confidence'] = norm_prob

        return states

    def _backtrack(self, states: List[Dict]) -> List[int]:
        """Follow ``prev`` links from the best final state back to step 0.

        Returns:
            One edge ID per time step in chronological order, or ``[]`` when
            ``states`` is empty or the last step has no state.
        """
        if not states or not states[-1]:
            return []

        path = []
        current_edge = max(states[-1].items(), key=lambda x: x[1]['log_prob'])[0]

        for t in range(len(states) - 1, -1, -1):
            path.append(current_edge)
            if t > 0 and states[t][current_edge]['prev'] is not None:
                current_edge = states[t][current_edge]['prev']

        path.reverse()
        return path

    @staticmethod
    def _merge_segment_path(all_edges: List[int], covered_until: int,
                            start: int, path: List[int]) -> int:
        """Splice a segment's path into ``all_edges`` in place.

        Args:
            all_edges: Edges accumulated so far; its tail corresponds to the
                points just before index ``covered_until``.
            covered_until: Exclusive point index reached by the last merged
                segment.
            start: Index of the first point of the new segment.
            path: Matched edges of the new segment, one per point.

        Returns:
            The new exclusive point index covered by ``all_edges``.

        Only the points the new segment really re-covers are overwritten, so
        a short tail segment or a gap left by a skipped segment no longer
        removes valid edges.
        """
        duplicated = min(max(0, covered_until - start), len(all_edges))
        offset = len(all_edges) - duplicated
        # Slice assignment overwrites the re-covered part and appends the rest.
        all_edges[offset:offset + len(path)] = path
        return max(covered_until, start + len(path))

    def _sequential_matching(self, points: List[Point]) -> Dict:
        """Match a long trajectory in windows of 30 points overlapping by 10.

        Windows with fewer than two points, or containing a point without
        candidates, are skipped. The overall confidence is the mean of the
        per-window confidences.

        Returns:
            Dict with ``success``, ``edges`` and ``confidence``.
        """
        segment_size = 30
        overlap = 10
        all_edges = []
        segment_confidences = []
        covered_until = 0

        for start in range(0, len(points), segment_size - overlap):
            segment = points[start:start + segment_size]
            if len(segment) < 2:
                continue

            candidates = [self._find_candidates(p) for p in segment]
            if not all(candidates):
                continue

            states = self._viterbi_matching(segment, candidates)
            path = self._backtrack(states)
            if not path:
                continue

            if states[-1] and path[-1] in states[-1]:
                segment_confidences.append(states[-1][path[-1]].get('confidence', 0.0))

            covered_until = self._merge_segment_path(all_edges, covered_until, start, path)

        if not all_edges:
            return {'success': False, 'edges': [], 'confidence': 0.0}

        overall_confidence = np.mean(segment_confidences) if segment_confidences else 0.0

        return {
            'success': True,
            'edges': all_edges,
            'confidence': overall_confidence
        }

    def match_trajectory(self, points: List[Tuple[float, float]]) -> Dict:
        """Match a trajectory of coordinate pairs to network edges.

        Trajectories longer than 50 points are handled by
        ``_sequential_matching`` when ``config['sequential_matching']`` is set.

        Args:
            points: Coordinate pairs in the CRS of ``edges_gdf``.

        Returns:
            Dict with ``success`` (bool), ``edges`` (edge ID per point) and
            ``confidence``. Any exception is logged and reported as an
            unsuccessful result instead of being raised.
        """
        failure = {'success': False, 'edges': [], 'confidence': 0.0}
        try:
            if len(points) < 2:
                return failure

            point_objects = [Point(p) for p in points]

            if self.config['sequential_matching'] and len(points) > 50:
                return self._sequential_matching(point_objects)

            candidates_by_point = [self._find_candidates(p) for p in point_objects]

            # Every point needs at least one candidate for a full path.
            if not all(candidates_by_point):
                return failure

            states = self._viterbi_matching(point_objects, candidates_by_point)
            path = self._backtrack(states)

            if not path:
                return failure

            confidence = states[-1][path[-1]].get('confidence', 0.0)

            return {
                'success': True,
                'edges': path,
                'confidence': confidence
            }

        except Exception as e:
            self.logger.error(f"Error in match_trajectory: {str(e)}")
            return {'success': False, 'edges': [], 'confidence': 0.0}

# In [156]:
def _edge_field(edge_data, key):
    """Return ``edge_data[key]`` if present and not None/NaN, else ``None``."""
    if key not in edge_data:
        return None
    value = edge_data[key]
    # NaN is the only float that is not equal to itself.
    if value is None or (isinstance(value, float) and value != value):
        return None
    return value


def get_edge_identifier(edge_data) -> str:
    """Build a string identifier for an edge record.

    The first usable source wins, in this order: ``OSMID``, ``osmid``,
    ``name`` (prefixed with ``road_``), and finally the start/end coordinates
    of ``edge_data.geometry`` formatted to four decimals.

    A key that is present but holds ``None`` or NaN counts as unavailable, so
    unnamed edges fall through to the geometry-based identifier instead of
    all becoming ``road_nan``.

    Args:
        edge_data: Mapping-like edge record (a dict or a pandas row). The
            geometry fallback additionally requires a ``geometry`` attribute
            exposing ``coords``.

    Returns:
        The identifier string.
    """
    for key in ('OSMID', 'osmid'):
        value = _edge_field(edge_data, key)
        if value is not None:
            return str(value)

    name = _edge_field(edge_data, 'name')
    if name is not None:
        return f"road_{name}"

    # Create a unique identifier from the edge geometry
    coords = list(edge_data.geometry.coords)
    start = coords[0]
    end = coords[-1]
    return f"edge_{start[0]:.4f}_{start[1]:.4f}_{end[0]:.4f}_{end[1]:.4f}"






import folium
from folium.plugins import MarkerCluster
from folium.features import DivIcon
from branca.colormap import LinearColormap






def process_trajectory_with_time(df):
    """Turn raw trajectory rows into dicts with edge IDs, times and geometry.

    For every row of ``df`` the ``eid`` and ``tpath`` columns are parsed as
    Python literals; ``tpath`` units are scaled by 15 to seconds. Rows whose
    ``mgeom`` is not a string starting with ``LINESTRING`` are dropped
    silently; rows that fail to parse are reported on stdout and dropped.

    Returns:
        A list of dicts with the keys ``edge_ids``, ``timestamps``,
        ``duration`` (last minus first timestamp) and ``mgeom``. An empty
        list is returned if iterating ``df`` itself fails.
    """

    def _convert(row):
        # Parsing order matters: eid, then tpath, then the geometry check.
        edge_ids = [int(raw_id) for raw_id in ast.literal_eval(row['eid'])]

        # Each unit is 15 seconds
        timestamps = [tick * 15 for tick in ast.literal_eval(row['tpath'])]

        matched_geom = row['mgeom']
        is_linestring = isinstance(matched_geom, str) and matched_geom.startswith('LINESTRING')
        if not is_linestring:
            return None

        return {
            'edge_ids': edge_ids,
            'timestamps': timestamps,
            'duration': timestamps[-1] - timestamps[0],
            'mgeom': matched_geom
        }

    try:
        records = []
        for idx, row in df.iterrows():
            try:
                record = _convert(row)
            except Exception as e:
                print(f"Error processing trajectory {idx}: {str(e)}")
                continue
            if record is not None:
                records.append(record)
        return records

    except Exception as e:
        print(f"Error in process_trajectory_with_time: {str(e)}")
        return []



# In [157]:
class RouteAnalyzer:
    """Per-edge traversal statistics and folium maps for matched trajectories.

    Edge IDs found in the trajectory data are looked up in the index of the
    matcher's ``edges_gdf``. Lengths are measured after projecting the edges
    to EPSG:32629.
    """

    def __init__(self, matcher, df: pd.DataFrame):
        """Parse the trajectories, compute edge statistics, create output dirs.

        Args:
            matcher: Object exposing ``graph`` and ``edges_gdf`` attributes.
            df: Trajectory table with the string columns ``eid`` and
                ``tpath`` (list literals).
        """
        self.matcher = matcher
        self.graph = matcher.graph
        self.edges_gdf = matcher.edges_gdf.copy()
        self.edges_utm = self.edges_gdf.to_crs('EPSG:32629')  # UTM zone 29N for Porto
        self.processed_df = self._process_trajectory_data(df)
        self.segment_stats = self._initialize_segment_stats()

        # Create output directory
        self.output_dir = 'map_matching_results'
        self.analysis_dir = os.path.join(self.output_dir, 'route_analysis')
        os.makedirs(self.analysis_dir, exist_ok=True)

    def _process_trajectory_data(self, df: pd.DataFrame) -> pd.DataFrame:
        """Return a frame with ``edge_ids``, ``timestamps`` and ``duration``.

        Rows whose edge or timestamp list is empty or unparsable are dropped.
        """
        processed_data = []
        for _, row in df.iterrows():
            try:
                edge_ids = self._parse_edge_list(row['eid'])
                timestamps = self._parse_timestamp_list(row['tpath'])
            except Exception:
                # Best effort: a malformed row (e.g. missing column) is skipped.
                continue

            if edge_ids and timestamps:
                processed_data.append({
                    'edge_ids': edge_ids,
                    'timestamps': timestamps,
                    'duration': max(timestamps) - min(timestamps)
                })
        return pd.DataFrame(processed_data)

    @staticmethod
    def _parse_int_list(text) -> List[int]:
        """Parse a ``"[1, 2, 3]"``-style string into a list of ints.

        Returns ``[]`` for non-strings, strings not wrapped in brackets, and
        anything that fails to parse or convert.
        """
        if not isinstance(text, str):
            return []
        clean_str = text.strip().replace(' ', '')
        if not (clean_str.startswith('[') and clean_str.endswith(']')):
            return []
        try:
            values = ast.literal_eval(clean_str)
            return [int(value) for value in values if str(value).strip()]
        except (ValueError, SyntaxError, TypeError, OverflowError,
                MemoryError, RecursionError):
            # Errors ast.literal_eval / int() raise on malformed input.
            return []

    def _parse_edge_list(self, edge_str: str) -> List[int]:
        """Parse the ``eid`` column value into a list of integer edge IDs."""
        return self._parse_int_list(edge_str)

    def _parse_timestamp_list(self, time_str: str) -> List[int]:
        """Parse the ``tpath`` column value; each unit is scaled by 15."""
        return [tick * 15 for tick in self._parse_int_list(time_str)]

    def _initialize_segment_stats(self) -> Dict:
        """Aggregate traversal counts and travel times per edge.

        A trajectory's duration (last minus first timestamp) is distributed
        over its known edges in proportion to their length. For each edge the
        result holds ``count``, ``total_time``, ``length``, ``avg_time`` and
        ``time_per_100m``. ``speed_ms`` is only set when the average time is
        positive, which avoids a division by zero for zero-duration
        trajectories.
        """
        stats = {}
        for _, row in self.processed_df.iterrows():
            try:
                edge_ids = row['edge_ids']
                timestamps = row['timestamps']

                if not edge_ids or len(timestamps) < 2:
                    continue

                # (edge_id, length) for every edge known to the network.
                valid_edges = []
                for eid in edge_ids:
                    if eid in self.edges_utm.index:
                        length = self.edges_utm.loc[eid].geometry.length
                        if length > 0:
                            valid_edges.append((eid, length))

                total_length = sum(length for _, length in valid_edges)
                if total_length <= 0:
                    continue

                duration = timestamps[-1] - timestamps[0]

                for edge_id, length in valid_edges:
                    edge_stats = stats.setdefault(edge_id, {
                        'count': 0,
                        'total_time': 0,
                        'length': length
                    })
                    edge_stats['count'] += 1
                    edge_stats['total_time'] += duration * (length / total_length)

            except Exception:
                # Best effort: one bad trajectory must not abort the analysis.
                continue

        for edge_stats in stats.values():
            if edge_stats['count'] <= 0:
                continue
            avg_time = edge_stats['total_time'] / edge_stats['count']
            edge_stats['avg_time'] = avg_time
            length = edge_stats['length']
            if length > 0:
                edge_stats['time_per_100m'] = (avg_time / length) * 100
                if avg_time > 0:
                    edge_stats['speed_ms'] = length / avg_time

        return stats

    def get_most_traversed_segments(self, n: int = 10) -> List[Dict]:
        """Return the ``n`` edges with the highest traversal count.

        Missing ``speed_ms`` / ``time_per_100m`` values are reported as 0.
        """
        segments = []
        for edge_id, stats in self.segment_stats.items():
            if stats['count'] > 0 and stats['length'] > 0:
                segments.append({
                    'edge_id': edge_id,
                    'count': stats['count'],
                    'length': stats['length'],
                    'avg_time': stats['avg_time'],
                    'speed_ms': stats.get('speed_ms', 0),
                    'time_per_100m': stats.get('time_per_100m', 0),
                    'geometry': self.edges_utm.loc[edge_id].geometry
                })

        segments.sort(key=lambda x: x['count'], reverse=True)
        return segments[:n]

    def get_slowest_segments(self, n: int = 10) -> List[Dict]:
        """Return the ``n`` edges with the highest time per 100 m.

        Only edges at least 50 long with a speed between 0.1 and 33.3 are
        considered; edges without a computed speed are excluded.
        """
        segments = []
        min_length = 50
        min_speed = 0.1
        max_speed = 33.3

        for edge_id, stats in self.segment_stats.items():
            if (stats['count'] > 0 and
                stats['length'] >= min_length and
                'speed_ms' in stats and
                min_speed <= stats['speed_ms'] <= max_speed):

                segments.append({
                    'edge_id': edge_id,
                    'count': stats['count'],
                    'length': stats['length'],
                    'avg_time': stats['avg_time'],
                    'speed_ms': stats['speed_ms'],
                    'time_per_100m': stats['time_per_100m'],
                    'geometry': self.edges_utm.loc[edge_id].geometry
                })

        segments.sort(key=lambda x: x['time_per_100m'], reverse=True)
        return segments[:n]

    def visualize_segments_enhanced(self, segments: List[Dict], title: str, 
                              filename: str, metric_name: str, 
                              color_scheme: List[str]) -> Optional[str]:
        """Render ``segments`` on a folium map and save it as HTML.

        Args:
            segments: Segment dicts as produced by the ``get_*_segments``
                methods; their geometry is expected in EPSG:32629.
            title: Heading shown in the map's title box.
            filename: File name inside ``self.analysis_dir``.
            metric_name: If it contains "time" (case-insensitive) segments are
                coloured by ``time_per_100m``, otherwise by ``count``.
            color_scheme: Colours passed to the linear colormap.

        Returns:
            The path of the written HTML file, or ``None`` when ``segments``
            is empty.
        """
        if not segments:
            print(f"No segments to visualize for {title}")
            return None
        
        # Convert edges to WGS84 for visualization
        edges_wgs84 = self.edges_gdf.to_crs('EPSG:4326')
        
        # Create base map
        center_lat, center_lon = 41.1579, -8.6291
        m = folium.Map(
            location=[center_lat, center_lon],
            zoom_start=13,
            tiles='cartodbpositron'
        )
        
        # Add background network
        for _, edge in edges_wgs84.iterrows():
            if edge.geometry is not None:
                # folium expects (lat, lon); the geometry stores (x=lon, y=lat).
                coords = [(y, x) for x, y in edge.geometry.coords]
                folium.PolyLine(
                    coords,
                    weight=1,
                    color='lightgray',
                    opacity=0.1
                ).add_to(m)
        
        # Calculate colormap values based on metric
        use_time_metric = 'time' in metric_name.lower()
        if use_time_metric:
            metric_values = [seg['time_per_100m'] for seg in segments]
            caption = 'Time per 100m (seconds)'
        else:
            metric_values = [seg['count'] for seg in segments]
            caption = 'Traverse Count'
        
        # Create colormap
        colormap = LinearColormap(
            colors=color_scheme,
            vmin=min(metric_values),
            vmax=max(metric_values),
            caption=caption
        )
        
        # Add the legend
        legend_html = """
        <div style="
            position: fixed;
            bottom: 50px;
            right: 50px;
            width: 300px;
            z-index: 1000;
            background-color: white;
            padding: 15px;
            border-radius: 6px;
            box-shadow: 0 2px 5px rgba(0,0,0,0.2);
            font-family: Arial;
            font-size: 12px;
            max-height: 400px;
            overflow-y: auto;
        ">
            <h4 style="margin: 0 0 10px 0;"><strong>Road Segments</strong></h4>
        """
        
        for rank, segment in enumerate(segments, 1):
            try:
                geom = segment['geometry']
                if geom is not None:
                    geom_wgs84 = gpd.GeoSeries([geom], crs='EPSG:32629').to_crs('EPSG:4326')[0]
                    coords = [(y, x) for x, y in geom_wgs84.coords]
                    
                    # Get the metric value and color for this segment
                    value = (segment['time_per_100m'] if use_time_metric
                            else segment['count'])
                    color = colormap(value)
                    
                    # Calculate time in minutes
                    time_minutes = segment['avg_time'] / 60
                    
                    # Add segment to legend
                    legend_html += f"""
                    <div style="margin-bottom: 12px;">
                        <div style="margin-bottom: 4px;">
                            <span style="
                                display: inline-block;
                                width: 12px;
                                height: 12px;
                                background-color: {color};
                                margin-right: 5px;
                                border: 1px solid #666;
                            "></span>
                            <strong>#{rank}</strong>
                        </div>
                        <div style="margin-left: 17px;">
                            <strong>OSMID:</strong> {segment['edge_id']}<br>
                            <strong>Speed:</strong> {segment['speed_ms']:.1f} m/s<br>
                            <strong>Length:</strong> {segment['length']:.0f}m<br>
                            <strong>Time:</strong> {time_minutes:.1f} min<br>
                            <strong>Time/100m:</strong> {segment['time_per_100m']:.1f} sec
                        </div>
                    </div>
                    """
                    
                    # Add segment to map
                    popup_text = (
                        f"<div style='font-family: Arial; font-size: 12px;'>"
                        f"<strong>Rank: {rank}</strong><br>"
                        f"Edge ID: {segment['edge_id']}<br>"
                        f"Count: {segment['count']}<br>"
                        f"Length: {segment['length']:.1f}m<br>"
                        f"Speed: {segment['speed_ms']:.1f} m/s<br>"
                        f"Time/100m: {segment['time_per_100m']:.1f}s"
                        f"</div>"
                    )
                    
                    folium.PolyLine(
                        coords,
                        weight=6,
                        color=color,
                        opacity=0.9,
                        popup=popup_text
                    ).add_to(m)
                    
                    # Add rank label
                    # NOTE(review): the marker gets an empty DivIcon and a second
                    # DivIcon carrying the HTML is attached to it; a single icon
                    # would probably do -- confirm the rendering before changing.
                    if len(coords) > 1:
                        mid_point = coords[len(coords)//2]
                        folium.DivIcon(
                            html=f"""
                            <div style="
                                background-color: rgba(255, 255, 255, 0);
                                border: 2px solid {color};
                                border-radius: 4px;
                                padding: 3px 6px;
                                font-size: 10px;
                                font-weight: bold;
                            ">
                                #{rank}
                            </div>
                            """
                        ).add_to(folium.Marker(
                            mid_point,
                            icon=DivIcon(
                                icon_size=(30, 20),
                                icon_anchor=(15, 10)
                            )
                        ).add_to(m))
                        
            except Exception as e:
                print(f"Error visualizing segment: {str(e)}")
                continue
        
        # Close the legend div
        legend_html += "</div>"
        m.get_root().html.add_child(folium.Element(legend_html))
        
        # Add title
        title_html = f"""
        <div style="
            position: fixed; 
            top: 20px; 
            left: 60px; 
            width: 320px;
            z-index: 1000;
            padding: 15px;
            background-color: white;
            border-radius: 6px;
            box-shadow: 0 2px 5px rgba(0,0,0,0.2);
        ">
            <h3 style="margin: 0;">{title}</h3>
            <p style="margin: 8px 0 0 0;">
                Showing top {len(segments)} segments
            </p>
        </div>
        """
        m.get_root().html.add_child(folium.Element(title_html))
        
        # Add colormap to map
        colormap.add_to(m)
        
        # Save and return
        output_path = os.path.join(self.analysis_dir, filename)
        m.save(output_path)
        return output_path

# Now define the debug analysis function
def debug_analysis(file_path: str = "Perry_Github/Urban_Assignment2/data/map_matching_1500.csv",
                   place: str = 'Porto, Portugal'):
    """Run the route analysis end to end and print a short summary.

    Loads the trajectory CSV and the drivable road network for ``place``,
    builds an ``EnhancedViterbiMatcher`` and a ``RouteAnalyzer``, renders the
    "most traversed" and "slowest" maps and prints the top three of each.

    Args:
        file_path: CSV with the trajectory data. Defaults to the path that
            used to be hard-coded.
        place: Place name handed to ``ox.graph_from_place``.

    Returns:
        A dict with the keys ``most_traversed`` and ``slowest_segments``, each
        holding ``segments`` and ``map_path``; ``None`` if anything fails (the
        traceback is printed).
    """
    try:
        print("Starting debug analysis...")

        print("\nLoading trajectory data...")
        df = pd.read_csv(file_path)
        print(f"Loaded {len(df)} trajectories")

        print("\nLoading road network...")
        G = ox.graph_from_place(place, network_type='drive')
        # Only the edge table is needed here.
        _nodes, edges = ox.graph_to_gdfs(G)
        print(f"Loaded network with {len(edges)} edges")

        print("\nInitializing matcher and analyzer...")
        matcher = EnhancedViterbiMatcher(G, edges)
        analyzer = RouteAnalyzer(matcher, df)

        print("\nAnalysis Results:")
        print(f"Total segments analyzed: {len(analyzer.segment_stats)}")

        most_traversed = analyzer.get_most_traversed_segments()
        slowest_segments = analyzer.get_slowest_segments()

        print(f"Most traversed segments: {len(most_traversed)}")
        print(f"Slowest segments: {len(slowest_segments)}")

        # "traverse frequency" contains no "time", so colouring is by count.
        most_traversed_map = analyzer.visualize_segments_enhanced(
            most_traversed,
            "Most Frequently Traversed Road Segments",
            "most_traversed_segments.html",
            "traverse frequency",
            ['#fff7ec', '#fee8c8', '#fdd49e', '#fdbb84', '#fc8d59', '#ef6548', '#d7301f', '#990000']
        )

        # "average travel time" contains "time", so colouring is by time per 100 m.
        slowest_segments_map = analyzer.visualize_segments_enhanced(
            slowest_segments,
            "Slowest Road Segments",
            "slowest_segments.html",
            "average travel time",
            ['#f7fcfd', '#e0ecf4', '#bfd3e6', '#9ebcda', '#8c96c6', '#8c6bb1', '#88419d', '#6e016b']
        )

        results = {
            'most_traversed': {
                'segments': most_traversed,
                'map_path': most_traversed_map
            },
            'slowest_segments': {
                'segments': slowest_segments,
                'map_path': slowest_segments_map
            }
        }

        if most_traversed:
            print("\nTop 3 most traversed segments:")
            for i, seg in enumerate(most_traversed[:3], 1):
                print(f"{i}. Count: {seg['count']}, "
                      f"Length: {seg['length']:.2f}m, "
                      f"Speed: {seg['speed_ms']:.2f} m/s")

        if slowest_segments:
            print("\nTop 3 slowest segments:")
            for i, seg in enumerate(slowest_segments[:3], 1):
                print(f"{i}. Time per 100m: {seg['time_per_100m']:.2f}s, "
                      f"Speed: {seg['speed_ms']:.2f} m/s, "
                      f"Length: {seg['length']:.2f}m")

        return results

    except Exception as e:
        print(f"Error in debug analysis: {str(e)}")
        import traceback
        traceback.print_exc()
        return None

# In [158]:


def run_map_matching_pipeline(input_file: str, place: str, nrows: Optional[int] = None,
                              utm_crs: str = 'EPSG:32629') -> Optional[Dict]:
    """
    Run the complete map matching pipeline with the given parameters.

    Downloads the drivable road network for ``place``, matches every
    trajectory row of ``input_file`` against it with ``EnhancedViterbiMatcher``
    and passes the successful matches to ``RouteAnalyzer``.

    Args:
        input_file: Path of the CSV file holding the raw trajectories.
        place: Place name handed to ``ox.graph_from_place``.
        nrows: Optional limit on the number of CSV rows to read.
        utm_crs: Projected CRS applied to both the edges and the trajectory
            points. Defaults to the previously hard-coded UTM zone 29N
            (Porto); pass a different CRS for a ``place`` elsewhere.

    Returns:
        A dict with the keys ``matched_results``, ``analysis_results`` and
        ``output_dir``, or ``None`` when no trajectory matched or an error
        was raised along the way.
    """
    # Resolve the logger before entering the try block so the except handler
    # below can never hit an unbound name.
    logger = logging.getLogger(__name__)

    try:
        # Set up logging
        logging.basicConfig(level=logging.INFO)

        # Load road network
        logger.info(f"Loading road network for {place}...")
        G = ox.graph_from_place(place, network_type='drive')
        _, edges = ox.graph_to_gdfs(G)

        # Add edge IDs if they don't exist
        if 'OSMID' not in edges.columns and 'osmid' not in edges.columns:
            logger.info("Creating unique identifiers for edges...")
            edges['OSMID'] = edges.apply(get_edge_identifier, axis=1)

        # Project the edges into the CRS the matcher works in
        edges = edges.to_crs(utm_crs)

        # Load trajectory data
        logger.info(f"Loading trajectory data from {input_file}...")
        df = pd.read_csv(input_file, nrows=nrows)

        # Initialize matcher with default configuration
        matcher = EnhancedViterbiMatcher(G, edges)

        # Process trajectories
        matched_results = []
        for _, row in tqdm(df.iterrows(), total=len(df)):
            trajectory_data = process_trajectory_with_time(row)
            if not trajectory_data:
                continue

            # Raw coordinates are declared as EPSG:4326 and reprojected so
            # they share the CRS of the edges.
            point_gdf = gpd.GeoDataFrame(
                geometry=[Point(x, y) for x, y in trajectory_data['coords']],
                crs='EPSG:4326'
            ).to_crs(utm_crs)

            utm_coords = [(p.x, p.y) for p in point_gdf.geometry]
            result = matcher.match_trajectory(utm_coords)

            if result['success']:
                matched_results.append({
                    'match_result': result,
                    'original_coords': trajectory_data['coords'],
                    'timestamps': trajectory_data['timestamps'],
                    'start_timestamp': trajectory_data['start_timestamp']
                })

        if not matched_results:
            logger.warning("No trajectories were successfully matched")
            return None

        # Perform analysis
        output_dir = 'map_matching_results'
        route_analyzer = RouteAnalyzer(matcher, matched_results, output_dir)
        analysis_results = route_analyzer.analyze_and_visualize()

        return {
            'matched_results': matched_results,
            'analysis_results': analysis_results,
            'output_dir': output_dir
        }

    except Exception as e:
        # logger.exception logs at ERROR level like before, but also records
        # the traceback so the failing step can be located.
        logger.exception(f"Error in map matching pipeline: {str(e)}")
        return None


def process_map_matching_file(file_path: str) -> pd.DataFrame:
    """Process the map matching file with improved data parsing.

    Reads the CSV at ``file_path`` and keeps the rows that have a positive
    duration derived from ``tpath``, at least one edge ID in ``eid`` and an
    ``mgeom`` string starting with ``LINESTRING``.

    Args:
        file_path: Path of the map matching CSV file.

    Returns:
        A DataFrame with the columns ``trajectory_id`` (index of the row in
        the CSV), ``duration``, ``edge_ids`` and ``mgeom``. An empty DataFrame
        is returned when no row qualifies or the file cannot be processed.
    """
    # Failures a malformed or missing cell can produce while being parsed.
    # Anything else (e.g. KeyboardInterrupt) is deliberately not swallowed.
    cell_errors = (KeyError, ValueError, SyntaxError, TypeError,
                   OverflowError, MemoryError, RecursionError)

    def extract_duration(row):
        """Calculate total duration from matched segments"""
        try:
            # Get timestamps from tpath
            tpath = ast.literal_eval(row['tpath'])
            if not tpath:
                return 0
            # Each increment in tpath represents 15 seconds
            return max(tpath) * 15  # Convert to seconds
        except cell_errors:
            return 0

    def extract_edge_ids(row):
        """Extract valid edge IDs"""
        try:
            return [int(eid) for eid in ast.literal_eval(row['eid'])]
        except cell_errors:
            return []

    try:
        print(f"Reading file: {file_path}")
        df = pd.read_csv(file_path)
        print(f"Initial shape: {df.shape}")

        print("Processing trajectories...")
        processed_data = []

        for idx, row in df.iterrows():
            try:
                duration = extract_duration(row)
                edge_ids = extract_edge_ids(row)

                if duration > 0 and edge_ids:
                    # Extract matched geometry
                    mgeom = row['mgeom']
                    if isinstance(mgeom, str) and mgeom.startswith('LINESTRING'):
                        processed_data.append({
                            'trajectory_id': idx,
                            'duration': duration,
                            'edge_ids': edge_ids,
                            'mgeom': mgeom
                        })
            except Exception as e:
                print(f"Error processing row {idx}: {str(e)}")
                continue

        # Create processed DataFrame
        processed_df = pd.DataFrame(processed_data)

        print("\nProcessing results:")
        print(f"Initial rows: {len(df)}")
        print(f"Valid trajectories: {len(processed_df)}")

        # Show a sample row to help diagnose why nothing qualified. Skipped
        # for a file without rows, where df.iloc[0] would raise IndexError.
        if len(processed_df) == 0 and len(df) > 0:
            print("\nDetailed error analysis:")
            sample_row = df.iloc[0]
            print("\nSample row contents:")
            print(f"tpath: {sample_row['tpath']}")
            print(f"eid: {sample_row['eid']}")
            print(f"mgeom: {sample_row['mgeom']}")

        return processed_df

    except Exception as e:
        print(f"Error processing file: {str(e)}")
        import traceback
        traceback.print_exc()
        return pd.DataFrame()


def parse_edge_list(edge_str: str) -> List[int]:
    """
    Safely parse edge ID list from string representation.

    Args:
        edge_str: Text such as ``"[12, 34, 56]"``. Anything that is not a
            string, or is not wrapped in square brackets, yields ``[]``.

    Returns:
        The edge IDs converted to ``int``, or an empty list when the text
        cannot be parsed or an element cannot be converted.
    """
    if not isinstance(edge_str, str):
        return []

    # Remove any whitespace and handle potential formatting issues
    clean_str = edge_str.strip().replace(' ', '')
    if not (clean_str.startswith('[') and clean_str.endswith(']')):
        return []

    try:
        # Parse string as list
        edges = ast.literal_eval(clean_str)
        # Ensure all elements are integers
        return [int(edge) for edge in edges if str(edge).strip()]
    except (ValueError, SyntaxError, TypeError, OverflowError,
            MemoryError, RecursionError):
        # Only parse/conversion failures are treated as "no edges";
        # KeyboardInterrupt and SystemExit are no longer swallowed.
        return []

def parse_timestamp_list(time_str: str) -> List[int]:
    """
    Safely parse timestamp list from string representation.

    Args:
        time_str: Text such as ``"[0, 1, 2]"``. Anything that is not a string,
            or is not wrapped in square brackets, yields ``[]``.

    Returns:
        Each parsed element converted to ``int`` and multiplied by 15, or an
        empty list when the text cannot be parsed or converted.
    """
    if not isinstance(time_str, str):
        return []

    clean_str = time_str.strip().replace(' ', '')
    if not (clean_str.startswith('[') and clean_str.endswith(']')):
        return []

    try:
        times = ast.literal_eval(clean_str)
        # Convert to seconds (each unit represents 15 seconds)
        return [int(t) * 15 for t in times if str(t).strip()]
    except (ValueError, SyntaxError, TypeError, OverflowError,
            MemoryError, RecursionError):
        # Only parse/conversion failures are treated as "no timestamps";
        # KeyboardInterrupt and SystemExit are no longer swallowed.
        return []

def process_trajectory_data(df: pd.DataFrame) -> pd.DataFrame:
    """
    Turn raw map matching rows into validated trajectory records.

    For every row the ``eid`` and ``tpath`` cells are parsed with
    ``parse_edge_list`` and ``parse_timestamp_list``. Rows for which either
    parser returns nothing are dropped, as are rows that raise while being
    processed (the error is logged at debug level).

    Returns:
        A DataFrame with the columns ``edge_ids``, ``timestamps``,
        ``duration`` (largest minus smallest timestamp) and ``mgeom``
        (``None`` when the input has no such column).
    """
    records = []

    for _, record in df.iterrows():
        try:
            edges = parse_edge_list(record['eid'])
            times = parse_timestamp_list(record['tpath'])

            # Both lists must be non-empty for the row to be usable
            if not (edges and times):
                continue

            records.append({
                'edge_ids': edges,
                'timestamps': times,
                'duration': max(times) - min(times),
                'mgeom': record['mgeom'] if 'mgeom' in record else None
            })
        except Exception as e:
            logging.debug(f"Error processing row: {str(e)}")
            continue

    return pd.DataFrame(records)





def debug_analysis():
    """Run debug analysis with proper initialization.

    NOTE: this file defines ``debug_analysis`` a second time further down;
    once both definitions have been executed the later one is the one bound
    to the name.

    Loads the trajectory CSV and the Porto road network, builds a matcher and
    a ``RouteAnalyzer`` and prints the most traversed and slowest segments.

    Returns:
        A dict with the keys ``most_traversed`` and ``slowest_segments``
        (each holding a ``segments`` list), or ``None`` if anything fails.
    """
    csv_path = "Perry_Github/Urban_Assignment2/data/map_matching_1500.csv"

    try:
        print("Starting debug analysis...")

        # Trajectories first, then the network they are analysed against
        print("\nLoading trajectory data...")
        trajectories = pd.read_csv(csv_path)
        print(f"Loaded {len(trajectories)} trajectories")

        print("\nLoading road network...")
        graph = ox.graph_from_place('Porto, Portugal', network_type='drive')
        _, edge_gdf = ox.graph_to_gdfs(graph)
        print(f"Loaded network with {len(edge_gdf)} edges")

        print("\nInitializing matcher and analyzer...")
        analyzer = RouteAnalyzer(EnhancedViterbiMatcher(graph, edge_gdf), trajectories)

        busiest = analyzer.get_most_traversed_segments()
        slowest = analyzer.get_slowest_segments()

        print("\nAnalysis Results:")
        print(f"Total segments analyzed: {len(analyzer.segment_stats)}")
        print(f"Most traversed segments: {len(busiest)}")
        print(f"Slowest segments: {len(slowest)}")

        # Detailed listing of the three leading entries of each ranking
        if busiest:
            print("\nTop 3 most traversed segments:")
            for rank, segment in enumerate(busiest[:3], start=1):
                print(f"{rank}. Count: {segment['count']}, "
                      f"Length: {segment['length']:.2f}m, "
                      f"Speed: {segment['speed_ms']:.2f} m/s")

        if slowest:
            print("\nTop 3 slowest segments:")
            for rank, segment in enumerate(slowest[:3], start=1):
                print(f"{rank}. Time per 100m: {segment['time_per_100m']:.2f}s, "
                      f"Speed: {segment['speed_ms']:.2f} m/s, "
                      f"Length: {segment['length']:.2f}m")

        return {
            'most_traversed': {'segments': busiest},
            'slowest_segments': {'segments': slowest}
        }

    except Exception as err:
        print(f"Error in debug analysis: {str(err)}")
        import traceback
        traceback.print_exc()
        return None

# Function to run the complete analysis
def run_analysis(file_path: str, place: str):
    """Run the complete analysis with proper initialization.

    NOTE: this file defines ``run_analysis`` a second time further down; once
    both definitions have been executed the later one is the one bound to the
    name.

    Args:
        file_path: CSV file read with ``pd.read_csv`` and handed to the analyzer.
        place: Place name passed to ``ox.graph_from_place``.

    Returns:
        A dict with the keys ``most_traversed`` and ``slowest_segments``
        (each holding a ``segments`` list), or ``None`` if anything fails.
    """
    try:
        print("Loading road network...")
        graph = ox.graph_from_place(place, network_type='drive')
        _, edge_gdf = ox.graph_to_gdfs(graph)
        print(f"Loaded network with {len(edge_gdf)} edges")

        print("\nProcessing map matching data...")
        trajectories = pd.read_csv(file_path)

        print("\nInitializing matcher and analyzer...")
        analyzer = RouteAnalyzer(EnhancedViterbiMatcher(graph, edge_gdf), trajectories)

        busiest = analyzer.get_most_traversed_segments()
        slowest = analyzer.get_slowest_segments()

    except Exception as err:
        print(f"Error in analysis pipeline: {str(err)}")
        import traceback
        traceback.print_exc()
        return None

    return {
        'most_traversed': {'segments': busiest},
        'slowest_segments': {'segments': slowest}
    }

def analyze_trajectories(file_path: str, place: str) -> Optional[Dict]:
    """
    Main analysis function with improved error handling.

    Loads the drivable road network for ``place``, parses the trajectories in
    ``file_path`` with ``process_trajectory_data`` and ranks road segments by
    traversal count and by slowness.

    Args:
        file_path: Path of the map matching CSV file.
        place: Place name passed to ``ox.graph_from_place``.

    Returns:
        A dict with the keys ``most_traversed``, ``slowest_segments``,
        ``total_trajectories`` and ``total_segments``, or ``None`` when no
        valid trajectory was found or an error was raised.
    """
    # Resolve the logger before entering the try block so the except handler
    # below can never hit an unbound name.
    logger = logging.getLogger(__name__)

    try:
        # Set up logging
        logging.basicConfig(level=logging.INFO)

        # Load road network
        logger.info("Loading road network...")
        G = ox.graph_from_place(place, network_type='drive')
        _, edges = ox.graph_to_gdfs(G)

        # Load and process trajectory data
        logger.info("Processing trajectory data...")
        raw_df = pd.read_csv(file_path)
        processed_df = process_trajectory_data(raw_df)

        if processed_df.empty:
            logger.error("No valid trajectories found in the data")
            return None

        # Initialize analyzer and run analysis.
        # Every other call site in this file builds the analyzer from a
        # matcher followed by the trajectory data, so do the same here instead
        # of passing the graph and edges directly.
        # NOTE(review): RouteAnalyzer.__init__ is not visible from this
        # function - confirm it accepts the processed DataFrame.
        logger.info("Running analysis...")
        matcher = EnhancedViterbiMatcher(G, edges)
        analyzer = RouteAnalyzer(matcher, processed_df)

        most_traversed = analyzer.get_most_traversed_segments()
        slowest_segments = analyzer.get_slowest_segments()

        return {
            'most_traversed': most_traversed,
            'slowest_segments': slowest_segments,
            'total_trajectories': len(processed_df),
            'total_segments': len(analyzer.segment_stats)
        }

    except Exception as e:
        # logger.exception logs at ERROR level like before, but also records
        # the traceback so the failing step can be located.
        logger.exception(f"Error in analysis: {str(e)}")
        return None



def verify_file_structure():
    """
    Locate the map matching CSV and print a short summary of its contents.

    The repository-relative path is tried first and ``data/map_matching_1500.csv``
    second. The file that exists is read and its row count, column names and
    first rows are printed.

    Returns:
        ``(file_path, df)`` for the file that was read, or ``(None, None)``
        when neither path exists or reading fails.
    """
    primary_path = "Perry_Github/Urban_Assignment2/data/map_matching_1500.csv"

    try:
        chosen_path = primary_path
        if not os.path.exists(primary_path):
            fallback_path = "data/map_matching_1500.csv"
            if not os.path.exists(fallback_path):
                raise FileNotFoundError(f"File not found in either path: {primary_path} or {fallback_path}")
            chosen_path = fallback_path

        print(f"Reading file from: {chosen_path}")
        frame = pd.read_csv(chosen_path)

        # Summary of what was loaded
        print("\nFile Information:")
        print(f"Number of rows: {len(frame)}")
        print(f"Columns: {frame.columns.tolist()}")
        print("\nFirst few rows:")
        print(frame.head())

    except Exception as err:
        print(f"Error verifying file: {str(err)}")
        return None, None

    return chosen_path, frame


# 
def debug_analysis():
    """Run debug analysis with visualization.

    Loads the trajectory CSV and the Porto road network, builds a matcher and
    a ``RouteAnalyzer``, renders one map per ranking through
    ``visualize_segments_enhanced`` and prints a summary.

    Returns:
        A dict with the keys ``most_traversed`` and ``slowest_segments``; each
        holds the ``segments`` list and, under ``map_path``, whatever
        ``visualize_segments_enhanced`` returned. ``None`` if anything fails.
    """
    data_file = "Perry_Github/Urban_Assignment2/data/map_matching_1500.csv"
    # Colour ramps handed to the two map renderings
    frequency_palette = ['#fff7ec', '#fee8c8', '#fdd49e', '#fdbb84', '#fc8d59', '#ef6548', '#d7301f', '#990000']
    slowness_palette = ['#f7fcfd', '#e0ecf4', '#bfd3e6', '#9ebcda', '#8c96c6', '#8c6bb1', '#88419d', '#6e016b']

    try:
        print("Starting debug analysis...")

        print("\nLoading trajectory data...")
        trajectories = pd.read_csv(data_file)
        print(f"Loaded {len(trajectories)} trajectories")

        print("\nLoading road network...")
        graph = ox.graph_from_place('Porto, Portugal', network_type='drive')
        _, edge_gdf = ox.graph_to_gdfs(graph)
        print(f"Loaded network with {len(edge_gdf)} edges")

        print("\nInitializing matcher and analyzer...")
        analyzer = RouteAnalyzer(EnhancedViterbiMatcher(graph, edge_gdf), trajectories)

        print("\nAnalysis Results:")
        print(f"Total segments analyzed: {len(analyzer.segment_stats)}")

        busiest = analyzer.get_most_traversed_segments()
        slowest = analyzer.get_slowest_segments()

        print(f"Most traversed segments: {len(busiest)}")
        print(f"Slowest segments: {len(slowest)}")

        # One map per ranking
        busiest_map = analyzer.visualize_segments_enhanced(
            busiest,
            "Most Frequently Traversed Road Segments",
            "most_traversed_segments.html",
            "traverse frequency",
            frequency_palette
        )
        slowest_map = analyzer.visualize_segments_enhanced(
            slowest,
            "Slowest Road Segments",
            "slowest_segments.html",
            "average travel time",
            slowness_palette
        )

        # Detailed listing of the three leading entries of each ranking
        if busiest:
            print("\nTop 3 most traversed segments:")
            for rank, segment in enumerate(busiest[:3], start=1):
                print(f"{rank}. Count: {segment['count']}, "
                      f"Length: {segment['length']:.2f}m, "
                      f"Speed: {segment['speed_ms']:.2f} m/s")

        if slowest:
            print("\nTop 3 slowest segments:")
            for rank, segment in enumerate(slowest[:3], start=1):
                print(f"{rank}. Time per 100m: {segment['time_per_100m']:.2f}s, "
                      f"Speed: {segment['speed_ms']:.2f} m/s, "
                      f"Length: {segment['length']:.2f}m")

        print("\nVisualization files created:")
        if busiest_map:
            print(f"Most traversed segments map: {busiest_map}")
        if slowest_map:
            print(f"Slowest segments map: {slowest_map}")

        return {
            'most_traversed': {
                'segments': busiest,
                'map_path': busiest_map
            },
            'slowest_segments': {
                'segments': slowest,
                'map_path': slowest_map
            }
        }

    except Exception as err:
        print(f"Error in debug analysis: {str(err)}")
        import traceback
        traceback.print_exc()
        return None


def analyze_and_visualize_map_matching(file_path: str, place: str):
    """Analyse a map matching CSV against a road network and render maps.

    Loads the drivable network for ``place``, reads ``file_path``, adds the
    parsed ``edge_ids`` and ``timestamps`` columns, ranks the segments through
    ``RouteAnalyzer`` and renders one map per ranking.

    Args:
        file_path: Path of the map matching CSV file (needs ``eid`` and ``tpath``).
        place: Place name passed to ``ox.graph_from_place``.

    Returns:
        A dict with the keys ``most_traversed`` and ``slowest_segments``; each
        holds the ``segments`` list and, under ``map_path``, whatever
        ``visualize_segments_enhanced`` returned. ``None`` if anything fails.
    """
    def to_seconds(cell):
        # Each tpath entry is scaled by 15 to obtain the timestamp value
        return [step * 15 for step in ast.literal_eval(cell)]

    try:
        print("Loading road network...")
        graph = ox.graph_from_place(place, network_type='drive')
        _, edge_gdf = ox.graph_to_gdfs(graph)
        print(f"Loaded network with {len(edge_gdf)} edges")

        print("\nProcessing map matching data...")
        trajectories = pd.read_csv(file_path)

        # String cells become Python lists
        trajectories['edge_ids'] = trajectories['eid'].apply(ast.literal_eval)
        trajectories['timestamps'] = trajectories['tpath'].apply(to_seconds)

        print(f"Initial shape: {trajectories.shape}")

        print("\nInitializing matcher and analyzer...")
        analyzer = RouteAnalyzer(EnhancedViterbiMatcher(graph, edge_gdf), trajectories)

        print("\nRunning analysis...")
        busiest = analyzer.get_most_traversed_segments()
        slowest = analyzer.get_slowest_segments()

        # One map per ranking
        busiest_map = analyzer.visualize_segments_enhanced(
            busiest,
            "Most Frequently Traversed Road Segments",
            "most_traversed_segments.html",
            "traverse frequency",
            ['#fff7ec', '#fee8c8', '#fdd49e', '#fdbb84', '#fc8d59', '#ef6548', '#d7301f', '#990000']
        )
        slowest_map = analyzer.visualize_segments_enhanced(
            slowest,
            "Slowest Road Segments",
            "slowest_segments.html",
            "average travel time",
            ['#f7fcfd', '#e0ecf4', '#bfd3e6', '#9ebcda', '#8c96c6', '#8c6bb1', '#88419d', '#6e016b']
        )

        print("\nAnalysis Results:")
        print(f"Most traversed segments: {len(busiest)}")
        print("Top 3 most traversed:")
        for rank, segment in enumerate(busiest[:3], start=1):
            print(f"{rank}. Count: {segment['count']}, Length: {segment['length']:.2f}m")

        print(f"\nSlowest segments: {len(slowest)}")
        print("Top 3 slowest:")
        for rank, segment in enumerate(slowest[:3], start=1):
            print(f"{rank}. Time per 100m: {segment['time_per_100m']:.2f}s, Speed: {segment['speed_ms']:.2f}m/s")

        return {
            'most_traversed': {'segments': busiest, 'map_path': busiest_map},
            'slowest_segments': {'segments': slowest, 'map_path': slowest_map}
        }

    except Exception as err:
        print(f"Error in analysis: {str(err)}")
        import traceback
        traceback.print_exc()
        return None


# Add this function to test the data processing
def test_data_processing():
    """
    Test the data processing to verify correct handling.

    Prints the first raw ``duration``, ``tpath`` and ``eid`` values of the
    map matching CSV, runs ``process_map_matching_file`` on the same file and
    prints a sample of the processed output.

    Returns:
        The DataFrame returned by ``process_map_matching_file``.
    """
    file_path = "Perry_Github/Urban_Assignment2/data/map_matching_1500.csv"
    df = pd.read_csv(file_path)

    print("\nTesting data processing...")
    for column in ('duration', 'tpath', 'eid'):
        print(f"\nSample raw {column}:")
        # Guard against a missing column or a file without rows instead of
        # aborting the whole check.
        if column in df.columns and not df.empty:
            print(df[column].iloc[0])
        else:
            print("<not available>")

    processed_df = process_map_matching_file(file_path)

    if not processed_df.empty:
        print("\nProcessed data sample:")
        sample = processed_df.iloc[0]
        # The processed frame has the columns trajectory_id, duration,
        # edge_ids and mgeom; it has no 'tpath' or 'eid' column, so the
        # sample is read from the columns that exist.
        print(f"Duration: {sample['duration']}")
        print(f"Edge IDs: {sample['edge_ids'][:5]}...")
        print(f"Geometry: {sample['mgeom'][:60]}...")

    return processed_df


def parse_mgeom(value: str) -> Optional[LineString]:
    """
    Convert a ``LINESTRING(x y,x y,...)`` string into a LineString object.

    The text between the first ``(`` and the first ``)`` is split on commas
    and every piece that holds exactly two floats becomes a vertex; pieces
    that do not are skipped.

    Returns:
        The LineString, or ``None`` when ``value`` is not a string, lacks the
        ``LINESTRING(`` marker, or yields fewer than two vertices.
    """
    try:
        # Reject anything that cannot hold coordinates
        if not isinstance(value, str) or value == 'LINESTRING()':
            return None
        if 'LINESTRING(' not in value or ')' not in value:
            return None

        inner = value[value.find('(') + 1:value.find(')')]
        if not inner:
            return None

        vertices = []
        for piece in inner.split(','):
            piece = piece.strip()
            if not piece:
                continue
            try:
                x, y = map(float, piece.split())
            except ValueError:
                # Not exactly two numeric values - ignore this piece
                continue
            vertices.append((x, y))

        # Need at least 2 points for a valid LineString
        if len(vertices) < 2:
            logging.debug(f"Not enough points in LINESTRING: {value}")
            return None
        return LineString(vertices)
    except Exception as e:
        logging.debug(f"Error parsing geometry: {str(e)} for value: {value}")
        return None






    



In [159]:
def validate_segment_statistics(segment_stats: Dict) -> Dict:
    """
    Collect per-segment figures for validation and debugging.

    Only segments with a positive ``traverse_count`` are reported. For each
    of them the count and length are recorded, the time per 100 m is derived
    when ``avg_time`` is present and the length is positive, and the raw
    stats dict is kept for inspection.

    Returns:
        A dict with the lists ``traverse_counts``, ``times_per_100m`` and
        ``raw_data``.
    """
    count_rows = []
    timing_rows = []
    raw_rows = []

    for edge_id, stats in segment_stats.items():
        if not stats['traverse_count'] > 0:
            continue

        segment_length = stats['length']
        count_rows.append({
            'edge_id': edge_id,
            'count': stats['traverse_count'],
            'length': segment_length
        })

        # Timing is only meaningful with a measured time and a real length
        if 'avg_time' in stats and segment_length > 0:
            mean_time = stats['avg_time']
            timing_rows.append({
                'edge_id': edge_id,
                'time_per_100m': (mean_time / segment_length) * 100,
                'avg_time': mean_time,
                'length': segment_length
            })

        raw_rows.append({'edge_id': edge_id, 'stats': stats})

    return {
        'traverse_counts': count_rows,
        'times_per_100m': timing_rows,
        'raw_data': raw_rows
    }

def analyze_segment_metrics(results: Dict) -> Dict:
    """
    Summarise the traverse counts and timings found in an analysis result.

    Args:
        results: Dict that may hold ``most_traversed`` and/or
            ``slowest_segments`` entries, each with a ``segments`` list.

    Returns:
        A dict with the sections ``traverse_counts`` and ``timing``. A section
        whose input is missing keeps its initial values (maximum 0, minimum
        infinity, zero segments); otherwise it holds the maximum, minimum,
        number of segments and the full list of values.
    """
    count_summary = {
        'max': 0,
        'min': float('inf'),
        'total_segments': 0
    }
    timing_summary = {
        'max_time_per_100m': 0,
        'min_time_per_100m': float('inf'),
        'total_segments': 0
    }

    def collect(section, field):
        # None signals "section not available", as opposed to an empty list
        if section not in results or 'segments' not in results[section]:
            return None
        return [segment[field] for segment in results[section]['segments']]

    counts = collect('most_traversed', 'count')
    if counts is not None:
        count_summary['max'] = max(counts) if counts else 0
        count_summary['min'] = min(counts) if counts else 0
        count_summary['total_segments'] = len(counts)
        count_summary['all_counts'] = counts

    times = collect('slowest_segments', 'time_per_100m')
    if times is not None:
        timing_summary['max_time_per_100m'] = max(times) if times else 0
        timing_summary['min_time_per_100m'] = min(times) if times else 0
        timing_summary['total_segments'] = len(times)
        timing_summary['all_times'] = times

    return {'traverse_counts': count_summary, 'timing': timing_summary}

def debug_trajectory_timing(matched_results: List[Dict]) -> Dict:
    """
    Derive timing figures from matched trajectories for debugging.

    Trajectories that were not matched successfully or have fewer than two
    timestamps are ignored. For the rest the elapsed time (last minus first
    timestamp) is recorded, and - when edges were matched - the elapsed time
    divided by the number of edges.

    Returns:
        A dict with the lists ``trajectory_times``, ``segment_times`` and
        ``anomalies``. An anomaly is a per-edge time below 1.0 or above
        300.0; its ``trajectory_id`` is the position of the trajectory within
        ``trajectory_times``, not within ``matched_results``.
    """
    elapsed_times = []
    per_edge_times = []
    flagged = []

    for entry in matched_results:
        match = entry['match_result']
        if not match['success']:
            continue

        stamps = entry['timestamps']
        if not stamps or len(stamps) < 2:
            continue

        elapsed = stamps[-1] - stamps[0]
        elapsed_times.append(elapsed)

        matched_edges = match['edges']
        if not matched_edges:
            continue

        per_edge = elapsed / len(matched_edges)
        per_edge_times.append(per_edge)

        # Less than 1 second or more than 5 minutes per edge is suspicious
        if per_edge < 1.0 or per_edge > 300.0:
            flagged.append({
                'trajectory_id': len(elapsed_times) - 1,
                'avg_time': per_edge,
                'total_time': elapsed,
                'num_segments': len(matched_edges)
            })

    return {
        'trajectory_times': elapsed_times,
        'segment_times': per_edge_times,
        'anomalies': flagged
    }


def run_analysis_with_debugging():
    """
    Run the map matching analysis and log the leading segments of each ranking.

    Configures logging, checks that the hard-coded CSV exists, delegates to
    ``analyze_and_visualize_map_matching`` for Porto and logs the first three
    segments of every entry in its result.

    Returns:
        The result of ``analyze_and_visualize_map_matching``, or ``None`` when
        the file is missing, the analysis produced nothing or an error occurred.
    """
    try:
        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(levelname)s - %(message)s'
        )
        logging.info("Starting analysis...")

        # Bail out early when the input file is absent
        data_file = "Perry_Github/Urban_Assignment2/data/map_matching_1500.csv"
        if not os.path.exists(data_file):
            logging.error(f"File not found: {data_file}")
            return None
        logging.info(f"File found: {data_file}")

        outcome = analyze_and_visualize_map_matching(data_file, 'Porto, Portugal')
        if not outcome:
            logging.error("Analysis failed to produce results")
            return None

        logging.info("\nAnalysis completed successfully")

        for segment_type, payload in outcome.items():
            ranked = payload['segments']
            if not ranked:
                continue

            heading = segment_type.replace('_', ' ').title()
            logging.info(f"\n{heading} Segments:")
            logging.info("Top 3 segments:")
            for position, segment in enumerate(ranked[:3], start=1):
                logging.info(f"\nSegment {position}:")
                logging.info(f"- Length: {segment['length']:.2f} meters")
                logging.info(f"- Average time: {segment['avg_time']:.2f} seconds")
                logging.info(f"- Time per 100m: {segment['time_per_100m']:.2f} seconds")
                logging.info(f"- Speed: {segment['speed_ms']:.2f} m/s")
                logging.info(f"- Traverse count: {segment['count']}")

        return outcome

    except Exception as err:
        logging.error(f"Error running analysis: {str(err)}")
        logging.error("Stack trace:", exc_info=True)
        return None


# Function to run the complete analysis
def run_analysis(file_path: str, place: str):
    """Run the complete analysis pipeline with detailed logging.

    Args:
        file_path: Map matching CSV handed to ``process_map_matching_file``.
        place: Place name passed to ``ox.graph_from_place``.

    Returns:
        Whatever ``RouteAnalyzer.analyze_and_visualize`` returns, or ``None``
        when no valid trajectory was found or an error occurred.
    """
    try:
        print("Loading road network...")
        graph = ox.graph_from_place(place, network_type='drive')
        _, edge_gdf = ox.graph_to_gdfs(graph)
        print(f"Loaded network with {len(edge_gdf)} edges")

        print("\nProcessing map matching data...")
        trajectories = process_map_matching_file(file_path)

        # Nothing to analyse without at least one valid trajectory
        if trajectories.empty:
            print("No valid trajectories found in the data")
            return None

        print("\nInitializing matcher and analyzer...")
        analyzer = RouteAnalyzer(EnhancedViterbiMatcher(graph, edge_gdf), trajectories)

        print("\nRunning analysis...")
        return analyzer.analyze_and_visualize()

    except Exception as err:
        print(f"Error in analysis pipeline: {str(err)}")
        import traceback
        traceback.print_exc()
        return None


def run_analysis_with_validation():
    """
    Run the Porto analysis and print validation figures for its result.

    Delegates to ``analyze_and_visualize_map_matching`` and, when it returns a
    result, prints the first three segments of every ranking followed by the
    maximum traverse count and the maximum time per 100 m.

    Returns:
        The analysis result, or ``None`` when the analysis produced nothing or
        an error occurred.
    """
    try:
        outcome = analyze_and_visualize_map_matching(
            file_path="Perry_Github/Urban_Assignment2/data/map_matching_1500.csv",
            place='Porto, Portugal'
        )
        if not outcome:
            return None

        print("\nRunning validation checks...")

        # Per-ranking sample of the segment figures
        for segment_type, payload in outcome.items():
            ranked = payload['segments']
            if not ranked:
                continue

            heading = segment_type.replace('_', ' ').title()
            print(f"\n{heading} Segments:")
            print("Sample of segment statistics:")
            for position, segment in enumerate(ranked[:3], start=1):
                print(f"\nSegment {position}:")
                print(f"- Length: {segment['length']:.2f} meters")
                print(f"- Average time: {segment['avg_time']:.2f} seconds")
                print(f"- Time per 100m: {segment['time_per_100m']:.2f} seconds")
                print(f"- Speed: {segment['speed_ms']:.2f} m/s")
                print(f"- Traverse count: {segment['count']}")

        # Figures across the whole result
        busiest = outcome['most_traversed']['segments']
        slowest = outcome['slowest_segments']['segments']

        print("\nOverall Statistics:")
        if busiest:
            print(f"Maximum traverse count: {max(segment['count'] for segment in busiest)}")
        if slowest:
            worst_time = max(segment['time_per_100m'] for segment in slowest)
            print(f"Maximum time per 100m: {worst_time:.2f} seconds")
            print(f"Corresponding to speed: {(100 / worst_time):.2f} m/s")

        return outcome

    except Exception as err:
        print(f"Error running analysis with validation: {str(err)}")
        return None
    
    
def get_slowest_segments(self, n: int = 10, *, min_length: float = 50,
                         min_speed: float = 0.1,
                         max_speed: float = 33.3) -> List[Dict]:
    """Return the n slowest road segments, ranked by travel time per 100 m.

    NOTE(review): this is defined at module level although it takes ``self``;
    presumably it is attached to the analyzer class elsewhere -- confirm.

    Reads ``self.segment_stats`` (edge id -> dict with ``'traverse_count'``,
    ``'length'`` and optionally ``'avg_time'``) and looks up each retained
    edge's geometry through ``self.edges_gdf.loc[edge_id].geometry``.

    Args:
        n: Maximum number of segments to return. Values <= 0 yield ``[]``.
        min_length: Segments shorter than this are ignored (default 50,
            the original hard-coded 50 m threshold).
        min_speed: Lower bound on the derived speed (default 0.1 m/s).
        max_speed: Upper bound on the derived speed (default 33.3 m/s,
            i.e. 120 km/h).

    Returns:
        Up to ``n`` dicts with keys ``edge_id``, ``OSMID``, ``avg_time``,
        ``length``, ``geometry``, ``count``, ``speed_ms`` and
        ``time_per_100m``, ordered from slowest to fastest. Ties keep the
        iteration order of ``self.segment_stats``.
    """
    # A non-positive n used to fall through to segments[:n], which for a
    # negative n silently returned "all but the last |n|" entries.
    if n <= 0:
        return []

    segments = []
    for edge_id, stats in self.segment_stats.items():
        if not stats['traverse_count'] > 0:
            continue

        # A missing or None average time means no usable timing data
        avg_time = float(stats.get('avg_time') or 0)
        if not avg_time > 0:
            continue

        length = float(stats['length'])
        # length > 0 guards the division below even when min_length is 0
        if not (length > 0 and length >= min_length):
            continue

        time_per_100m = (avg_time / length) * 100
        # Defensive: a ratio that underflows to 0 must not divide by zero
        speed = 100 / time_per_100m if time_per_100m > 0 else float('inf')

        # Discard implausible speeds instead of ranking them
        if not min_speed <= speed <= max_speed:
            continue

        segments.append({
            'edge_id': edge_id,
            'OSMID': str(edge_id),
            'avg_time': avg_time,
            'length': length,
            'geometry': self.edges_gdf.loc[edge_id].geometry,
            'count': stats['traverse_count'],
            'speed_ms': speed,
            'time_per_100m': time_per_100m,
        })

    # Higher time per 100 m means a slower segment, so sort descending
    segments.sort(key=lambda seg: seg['time_per_100m'], reverse=True)
    return segments[:n]

In [160]:
# Script entry point: run the debug analysis, then summarise what it returned
if __name__ == "__main__":
    results = debug_analysis()

    if results:
        print("\nFinal Results Summary:")
        for key, data in results.items():
            print(f"\n{key}:")
            segments = data['segments']
            print(f"Number of segments: {len(segments)}")

            # Categories without any segment have nothing more to report
            if not segments:
                continue

            # Only the first entry of each category is summarised
            print("Top segment statistics:")
            segment = segments[0]
            print(f"- Count: {segment['count']}")
            print(f"- Time per 100m: {segment['time_per_100m']:.2f}s")
            print(f"- Speed: {segment['speed_ms']:.2f}m/s")

        # Re-run the full analysis and visualisation on the local CSV copy
        analyze_and_visualize_map_matching("map_matching_1500.csv","Porto, Portugal")


Starting debug analysis...

Loading trajectory data...
Loaded 1497 trajectories

Loading road network...
Loaded network with 10533 edges

Initializing matcher and analyzer...

Analysis Results:
Total segments analyzed: 2651
Most traversed segments: 10
Slowest segments: 10

Top 3 most traversed segments:
1. Count: 732, Length: 64.00m, Speed: 3.39 m/s
2. Count: 401, Length: 29.52m, Speed: 1.94 m/s
3. Count: 376, Length: 13.23m, Speed: 2.52 m/s

Top 3 slowest segments:
1. Time per 100m: 778.00s, Speed: 0.13 m/s, Length: 162.25m
2. Time per 100m: 679.73s, Speed: 0.15 m/s, Length: 79.54m
3. Time per 100m: 462.18s, Speed: 0.22 m/s, Length: 84.88m

Visualization files created:
Most traversed segments map: map_matching_results/route_analysis/most_traversed_segments.html
Slowest segments map: map_matching_results/route_analysis/slowest_segments.html

Final Results Summary:

most_traversed:
Number of segments: 10
Top segment statistics:
- Count: 732
- Time per 100m: 29.54s
- Speed: 3.39m/s

slow