# In [7]:
""" 
Railway Vibration Analysis - Implementation with Tkinter GUI
====================================================================

This script implements requirements for railway vibration analysis:
1. Load GPS and vibration data from valid folders (identified by Code 1)
2. GUI folder selection using Tkinter
3. Label GPS points based on proximity to infrastructure 
4. Create vibration segments with categorical labels
5. Provide interactive visualization with clickable GPS points
6. Save interactive HTML documentation of vibration plots

The HTML export provides superior documentation with:
- Interactive zoom, pan, and hover capabilities
- Vector graphics quality at any resolution
- Cross-platform compatibility (any browser)
- Self-contained files with embedded functionality
- Professional presentation for technical reports

Author: Studenka Lundahl
"""

import os
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import dash
from dash import dcc, html, Input, Output
from scipy.spatial import cKDTree
import tkinter as tk
from tkinter import ttk, messagebox
from datetime import datetime

#############################################################
# MAIN EXECUTION: CONFIGURATION AND SETUP
#############################################################
# Announce the run and record when it started
print("🚀 RAILWAY VIBRATION ANALYSIS - WITH GUI")
print("=" * 60)
print(f"Execution time: {datetime.now():%Y-%m-%d %H:%M:%S}\n")

# Keep pandas and NumPy console output within 73 columns; never hide columns
pd.options.display.width = 73
pd.options.display.max_columns = None
np.set_printoptions(linewidth=73)

# Sampling rates of the two data streams and the matching sample periods
GPS_SAMPLING_RATE = 20  # Hz
VIBRATION_SAMPLING_RATE = 500  # Hz
dt_gps = 1 / GPS_SAMPLING_RATE  # seconds between GPS samples (0.05)
dt_vibration = 1 / VIBRATION_SAMPLING_RATE  # seconds between vibration samples (0.002)

# Define type-specific thresholds based on infrastructure characteristics
""" REMOVED TRIED DIFFERENT THRESHOLDS
# INITIAL
TYPE_SPECIFIC_THRESHOLDS = {
        'Bridge': 350,      # Larger structures, detectable from further away
        'Turnout': 300,     # Medium-sized mechanical structures  
        'RailJoint': 150    # Small precise connection points
    }

# LOW
TYPE_SPECIFIC_THRESHOLDS = {
        'Bridge': 300,      # Larger structures, detectable from further away
        'Turnout': 250,     # Medium-sized mechanical structures  
        'RailJoint': 120    # Small precise connection points
    }

# MID
TYPE_SPECIFIC_THRESHOLDS = {
        'Bridge': 250,      # Larger structures, detectable from further away
        'Turnout': 200,     # Medium-sized mechanical structures  
        'RailJoint': 100    # Small precise connection points
    }


# FAST
TYPE_SPECIFIC_THRESHOLDS = {
        'Bridge': 200,      # Larger structures, detectable from further away
        'Turnout': 150,     # Medium-sized mechanical structures  
        'RailJoint': 80    # Small precise connection points
    }

# VERY FAST
TYPE_SPECIFIC_THRESHOLDS = {
        'Bridge': 100,      # Larger structures, detectable from further away
        'Turnout': 60,     # Medium-sized mechanical structures  
        'RailJoint': 40    # Small precise connection points
    }
"""
# Final threshold selection
# Per-category labeling distance; compared against distances in meters
TYPE_SPECIFIC_THRESHOLDS = dict(
    Bridge=150,     # Larger structures, detectable from further away
    Turnout=90,     # Medium-sized mechanical structures
    RailJoint=60,   # Small precise connection points
)

# Vibration analysis parameters
# Length, in seconds, of each vibration segment used for analysis
SEGMENT_DURATION_SECONDS = 10
# Upper bound on vibration rows read per channel (memory safety limit)
MAX_VIBRATION_SAMPLES = 1_000_000

# ====================
# DATA LOADING AND VALIDATION
# ====================

# Data folders are expected under "<current working directory>/Data 2"
script_dir = os.getcwd()
data2_root = os.path.join(script_dir, "Data 2")

# Both Code 1 outputs are looked up relative to the current working directory
required_files = ["valid_folders.txt", "infrastructure_points.csv"]
missing_files = [f for f in required_files if not os.path.exists(f)]

if missing_files:
    print(f"❌ Missing Code 1 output files: {missing_files}")
    print("Please run Code 1 first to generate these files!")
    # Raise SystemExit instead of calling exit(): exit() is an interactive
    # helper that does not reliably halt the rest of a notebook cell.
    raise SystemExit("Missing Code 1 output files")

# Load valid folders identified by Code 1 (one name per line, blanks ignored)
with open("valid_folders.txt", "r") as f:
    valid_folders = [line.strip() for line in f if line.strip()]

if not valid_folders:
    print("❌ No valid folders found in valid_folders.txt")
    raise SystemExit("valid_folders.txt contains no folder names")

# ====================
# UTILITY FUNCTIONS
# ====================

def get_route_name_from_coordinates(lat_min, lat_max, lon_min, lon_max, lat_start, lon_start):
    """
    Derive a human-readable route name from a GPS track's extent and start.

    Only the minimum latitude/longitude are used to pick the region; the
    maximum values are accepted but not consulted. Inside the Borlänge-Mora
    box the first GPS point decides the travel direction.

    Args:
        lat_min, lat_max: Latitude range of the GPS track
        lon_min, lon_max: Longitude range of the GPS track
        lat_start, lon_start: Coordinates of the first GPS point (index 0)

    Returns:
        str: Human-readable route name
    """
    # (route name when starting here, approximate station latitude, longitude)
    end_stations = (
        ("Borlänge to Mora", 60.485, 15.437),
        ("Mora to Borlänge", 61.004, 14.537),
    )
    tolerance_deg = 0.05

    if 60.4 <= lat_min <= 61.0 and 14.5 <= lon_min <= 15.2:
        for route, station_lat, station_lon in end_stations:
            lat_close = abs(lat_start - station_lat) < tolerance_deg
            lon_close = abs(lon_start - station_lon) < tolerance_deg
            if lat_close and lon_close:
                return route
        # Start point matches neither end station
        return f"Borlänge-Mora Route ({lat_min:.2f}°N {lon_min:.2f}°E)"

    # Remaining regions: (name, latitude bounds, longitude bounds), checked in order
    other_regions = (
        ("Stockholm Area", (59.0, 60.0), (17.0, 18.5)),
        ("Gothenburg Area", (57.5, 58.5), (11.5, 13.0)),
        ("Malmö Area", (55.5, 56.0), (12.5, 13.5)),
    )
    for region, (lat_lo, lat_hi), (lon_lo, lon_hi) in other_regions:
        if lat_lo <= lat_min <= lat_hi and lon_lo <= lon_min <= lon_hi:
            return region

    return f"Swedish Railway Route ({lat_min:.2f}°N {lon_min:.2f}°E)"
    
def resolve_subfolder(base_root, folder):
    """
    Locate the directory inside a folder that holds the GPS data files.

    Two layouts are tried in order: ``base_root/folder`` (flat) and
    ``base_root/folder/folder`` (nested). A layout matches when it contains
    a file named ``GPS.latitude.csv``.

    Args:
        base_root: Root directory path
        folder: Folder name to search in

    Returns:
        str: Path to subfolder containing GPS files, or None if not found
    """
    candidates = (
        os.path.join(base_root, folder),
        os.path.join(base_root, folder, folder),
    )
    return next(
        (
            candidate
            for candidate in candidates
            if os.path.isfile(os.path.join(candidate, "GPS.latitude.csv"))
        ),
        None,
    )

# ====================
# TKINTER FOLDER SELECTION GUI
# ====================

def create_folder_selection_gui(valid_folders, data2_root):
    """
    Create a Tkinter GUI for selecting which folder to analyze.

    The window lists every entry of ``valid_folders`` together with the date
    and time parsed from its name and the sizes of its GPS and vibration
    files. The call blocks in the Tk main loop until the user confirms a
    folder, cancels, or closes the window.

    Args:
        valid_folders: List of valid folder names from Code 1
        data2_root: Path to Data 2 directory

    Returns:
        str: Selected folder name, or None if no folder was confirmed
    """
    # Initial window size, shared by the geometry setup and the centering code
    window_width, window_height = 900, 650

    class FolderSelector:
        """Window that lists the folders and records the confirmed choice."""

        def __init__(self):
            # Stays None until the user confirms a folder in select_folder()
            self.selected_folder = None
            self.root = tk.Tk()
            self.root.title("Railway Analysis - Folder Selection")
            self.root.geometry(f"{window_width}x{window_height}")
            self.root.resizable(True, True)

            # Raise the window, then drop the always-on-top flag once Tk is idle
            self.root.lift()
            self.root.attributes('-topmost', True)
            self.root.after_idle(lambda: self.root.attributes('-topmost', False))

            # Center the window on screen
            self.center_window()

            self.setup_gui()

        def center_window(self):
            """Center the window on the screen"""
            self.root.update_idletasks()
            x = (self.root.winfo_screenwidth() // 2) - (window_width // 2)
            y = (self.root.winfo_screenheight() // 2) - (window_height // 2)
            self.root.geometry(f"{window_width}x{window_height}+{x}+{y}")

        def setup_gui(self):
            """Build the title, folder table, instructions and buttons."""

            # Main frame with padding
            main_frame = ttk.Frame(self.root, padding="15")
            main_frame.grid(row=0, column=0, sticky=(tk.W, tk.E, tk.N, tk.S))

            # Configure grid weights so the folder table (row 2) absorbs resizing
            self.root.columnconfigure(0, weight=1)
            self.root.rowconfigure(0, weight=1)
            main_frame.columnconfigure(0, weight=1)
            main_frame.rowconfigure(2, weight=1)

            # Title
            title_label = ttk.Label(main_frame,
                                    text="🚂 Railway Vibration Analysis - Folder Selection",
                                    font=('Arial', 16, 'bold'))
            title_label.grid(row=0, column=0, pady=(0, 15), sticky=tk.W)

            # Subtitle with folder count
            subtitle_label = ttk.Label(main_frame,
                                       text=f"Found {len(valid_folders)} valid folders with GPS and vibration data.\n"
                                            f"Select a folder to analyze train vibration patterns:",
                                       font=('Arial', 10))
            subtitle_label.grid(row=1, column=0, pady=(0, 15), sticky=tk.W)

            # Frame holding the folder table and its scrollbar
            tree_frame = ttk.Frame(main_frame)
            tree_frame.grid(row=2, column=0, sticky=(tk.W, tk.E, tk.N, tk.S), pady=(0, 15))
            tree_frame.columnconfigure(0, weight=1)
            tree_frame.rowconfigure(0, weight=1)

            # Folder table: one row per folder, in the same order as valid_folders
            self.tree = ttk.Treeview(tree_frame,
                                     columns=('Index', 'Date', 'Time', 'GPS_Size', 'Vib_Size'),
                                     show='headings',
                                     height=18)

            # Define column headings
            self.tree.heading('Index', text='#')
            self.tree.heading('Date', text='Date')
            self.tree.heading('Time', text='Time')
            self.tree.heading('GPS_Size', text='GPS Data')
            self.tree.heading('Vib_Size', text='Vibration Data')

            # Configure column widths
            self.tree.column('Index', width=50, anchor='center')
            self.tree.column('Date', width=120, anchor='center')
            self.tree.column('Time', width=100, anchor='center')
            self.tree.column('GPS_Size', width=100, anchor='center')
            self.tree.column('Vib_Size', width=120, anchor='center')

            # Add scrollbar
            scrollbar = ttk.Scrollbar(tree_frame, orient=tk.VERTICAL, command=self.tree.yview)
            self.tree.configure(yscrollcommand=scrollbar.set)

            # Grid treeview and scrollbar
            self.tree.grid(row=0, column=0, sticky=(tk.W, tk.E, tk.N, tk.S))
            scrollbar.grid(row=0, column=1, sticky=(tk.N, tk.S))

            # Populate treeview with folder information
            self.populate_folder_list()

            # Instructions
            instructions = ttk.Label(main_frame,
                                     text="💡 Double-click a folder or select and click 'Analyze Selected Folder' to continue",
                                     foreground='blue',
                                     font=('Arial', 9))
            instructions.grid(row=3, column=0, pady=(0, 15), sticky=tk.W)

            # Buttons frame
            button_frame = ttk.Frame(main_frame)
            button_frame.grid(row=4, column=0, sticky=tk.E)

            cancel_btn = ttk.Button(button_frame, text="❌ Cancel", command=self.cancel)
            cancel_btn.grid(row=0, column=0, padx=(0, 15))

            select_btn = ttk.Button(button_frame, text="✅ Analyze Selected Folder", command=self.select_folder)
            select_btn.grid(row=0, column=1)

            # Double-click behaves like pressing the analyze button
            self.tree.bind('<Double-1>', lambda e: self.select_folder())

        def populate_folder_list(self):
            """Insert exactly one table row per folder, in valid_folders order."""

            for i, folder in enumerate(valid_folders):
                try:
                    # Parse folder name for date and time
                    # Expected format: "2024-12-10 10-00-00 (1)"
                    parts = folder.split()
                    if len(parts) >= 2:
                        date_part = parts[0]  # "2024-12-10"
                        # Convert time format: "10-00-00" -> "10:00:00"
                        time_formatted = parts[1].replace('-', ':')
                    else:
                        date_part = folder[:10]
                        time_formatted = "Unknown"

                    # Get file sizes (GPS in MB, vibration in GB)
                    subfolder = resolve_subfolder(data2_root, folder)
                    if subfolder:
                        try:
                            lat_file = os.path.join(subfolder, "GPS.latitude.csv")
                            gps_size = os.path.getsize(lat_file) / (1024*1024) if os.path.exists(lat_file) else 0
                            gps_size_str = f"{gps_size:.1f} MB"

                            vib_file = os.path.join(subfolder, "CH1_ACCEL1Z1.csv")
                            vib_size = os.path.getsize(vib_file) / (1024*1024*1024) if os.path.exists(vib_file) else 0
                            vib_size_str = f"{vib_size:.2f} GB"

                        except OSError:
                            # File vanished or is unreadable: still list the folder
                            gps_size_str = "Unknown"
                            vib_size_str = "Unknown"
                    else:
                        gps_size_str = "Error"
                        vib_size_str = "Error"

                    self.tree.insert('', 'end', values=(f"{i+1}", date_part, time_formatted, gps_size_str, vib_size_str))

                except Exception:
                    # Best effort: keep the row so table positions still map
                    # one-to-one onto valid_folders
                    self.tree.insert('', 'end', values=(f"{i+1}", folder, "Parse Error", "Unknown", "Unknown"))

        def select_folder(self):
            """Ask for confirmation of the highlighted row and close on 'Yes'."""
            selection = self.tree.selection()
            if not selection:
                messagebox.showwarning("No Selection", "Please select a folder from the list.")
                return

            # Row position in the table equals the index into valid_folders
            item = selection[0]
            item_index = self.tree.index(item)

            if not 0 <= item_index < len(valid_folders):
                messagebox.showerror("Error", "Invalid folder selection.")
                return

            folder_name = valid_folders[item_index]
            values = self.tree.item(item)['values']
            date_time = f"{values[1]} {values[2]}"

            result = messagebox.askyesno("Confirm Selection",
                                         f"🚂 Analyze this train run?\n\n"
                                         f"📁 Folder: {folder_name}\n"
                                         f"📅 Date/Time: {date_time}\n"
                                         f"📊 GPS Data: {values[3]}\n"
                                         f"⚡ Vibration Data: {values[4]}\n\n"
                                         f"This will start the vibration analysis...")

            if result:
                # Record the choice only after confirmation, so a declined
                # folder is never returned if the window is closed afterwards
                self.selected_folder = folder_name
                self.root.quit()
                self.root.destroy()

        def cancel(self):
            """Handle cancellation"""
            result = messagebox.askyesno("Cancel Analysis",
                                         "❌ Are you sure you want to cancel?\n\n"
                                         "The railway analysis program will exit.")
            if result:
                self.selected_folder = None
                self.root.quit()    # Stop the Tkinter main loop
                self.root.destroy()

        def run(self):
            """Run the GUI and return the confirmed folder (None on error)."""
            try:
                # Select first item by default for convenience
                if self.tree.get_children():
                    first_item = self.tree.get_children()[0]
                    self.tree.selection_set(first_item)
                    self.tree.focus_set()
                    self.tree.focus(first_item)

                self.root.mainloop()
                return self.selected_folder
            except Exception as e:
                print(f"GUI Error: {e}")
                return None

    # Create and run the GUI
    selector = FolderSelector()
    return selector.run()

# ====================
# FOLDER SELECTION WITH GUI
# ====================

print(f"📂 Loaded {len(valid_folders)} valid folders from Code 1")
print("📂 First 5 folders:", valid_folders[:5])

# Hand control to the Tkinter window; this blocks until it is closed
print("\n🖥️ Opening folder selection window...")
print("   💡 A GUI window will appear - please select a folder to analyze")

try:
    selected_folder_name = create_folder_selection_gui(valid_folders, data2_root)

    if selected_folder_name is not None:
        print(f"✅ User selected folder: {selected_folder_name}")
    else:
        # SystemExit is not an Exception subclass, so the handler below
        # does not intercept it and the run stops here.
        print("❌ No folder selected. Exiting...")
        raise SystemExit("User cancelled folder selection")

except Exception as gui_error:
    # Any GUI failure: continue with the first listed folder
    print(f"❌ GUI Error: {gui_error}")
    print("📝 Falling back to first folder...")
    selected_folder_name = valid_folders[0]

# ====================
# FOLDER VALIDATION AND FILE LOADING  
# ====================

print(f"\n🔍 Validating selected folder: {selected_folder_name}")

# Filled in below once the folder is known to contain the required files
files = {}
selected_folder = None

# Locate the directory that actually holds the CSV files
subfolder = resolve_subfolder(data2_root, selected_folder_name)
if not subfolder:
    print(f"❌ Could not find subfolder structure for: {selected_folder_name}")
    # Raise SystemExit instead of calling exit(): exit() is an interactive
    # helper that does not reliably halt the rest of a notebook cell.
    raise SystemExit("Selected folder could not be resolved")

# All files used by the analysis, keyed by role
potential_files = {
    "latitude": os.path.join(subfolder, "GPS.latitude.csv"),
    "longitude": os.path.join(subfolder, "GPS.longitude.csv"),
    "vibration1": os.path.join(subfolder, "CH1_ACCEL1Z1.csv"),
    "vibration2": os.path.join(subfolder, "CH2_ACCEL1Z2.csv"),
    "speed": os.path.join(subfolder, "GPS.speed.csv"),
    "satellites": os.path.join(subfolder, "GPS.satellites.csv")  # For quality assessment
}
# The satellites file is optional; every other entry is mandatory
optional_keys = {"satellites"}

missing = [
    key for key, path in potential_files.items()
    if key not in optional_keys and not os.path.exists(path)
]
if missing:
    print("❌ Selected folder missing required files!")
    print(f"   Missing files: {missing}")
    raise SystemExit("Selected folder is missing required files")

# Keep every file that exists: all mandatory ones plus satellites if present
files = {key: path for key, path in potential_files.items() if os.path.exists(path)}
selected_folder = selected_folder_name

satellites_note = "" if "satellites" in files else " (no satellites file)"
print(f"\n✅ Selected folder validated: {selected_folder_name}{satellites_note}")
for key, path in files.items():
    file_size = os.path.getsize(path) / (1024*1024)  # Convert to MB
    print(f"  • {key}: {os.path.basename(path)} ({file_size:.1f} MB)")

# ====================
# GPS DATA LOADING AND PREPROCESSING
# ====================

print(f"\n🌍 Loading GPS data from {selected_folder}...")

try:
    # Load individual GPS data files (single headerless column each)
    df_lat = pd.read_csv(files["latitude"], header=None, names=["latitude"])
    df_lon = pd.read_csv(files["longitude"], header=None, names=["longitude"])
    df_speed = pd.read_csv(files["speed"], header=None, names=["speed"])

    # Load GPS satellites data for quality assessment (if available)
    if "satellites" in files:
        df_satellites = pd.read_csv(files["satellites"], header=None, names=["satellites"])
        print("📡 GPS satellites data loaded for quality assessment")
    else:
        df_satellites = None
        print("⚠️ GPS satellites data not available - quality assessment limited")

    # Truncate every series to the shortest file so rows line up
    min_len = min(len(df_lat), len(df_lon), len(df_speed))
    if df_satellites is not None:
        min_len = min(min_len, len(df_satellites))

    # Timestamps are synthesized from the row number and the GPS sampling
    # period; they are assigned before filtering, so surviving rows keep
    # their original time.
    timestamps = np.arange(min_len) * dt_gps

    # Combine all GPS data into a single DataFrame; non-numeric cells become NaN
    gps_data = {
        "Latitude": pd.to_numeric(df_lat["latitude"][:min_len], errors="coerce"),
        "Longitude": pd.to_numeric(df_lon["longitude"][:min_len], errors="coerce"),
        "Speed": pd.to_numeric(df_speed["speed"][:min_len], errors="coerce"),
        "timestamp": timestamps
    }

    # Add satellite count if available (for GPS quality assessment)
    if df_satellites is not None:
        gps_data["Satellites"] = pd.to_numeric(df_satellites["satellites"][:min_len], errors="coerce")

    df_gps = pd.DataFrame(gps_data)

    # Clean GPS data - remove rows without a usable coordinate
    df_gps = df_gps.dropna(subset=["Latitude", "Longitude"])

    # Keep only points inside the bounding box 55-70°N, 10-25°E
    # (reasonable Swedish coordinates; removes GPS errors)
    df_gps = df_gps[
        (df_gps["Latitude"] > 55) & (df_gps["Latitude"] < 70) &
        (df_gps["Longitude"] > 10) & (df_gps["Longitude"] < 25)
    ].reset_index(drop=True)

    # OriginalIndex mirrors the row position after the most recent filter
    df_gps["OriginalIndex"] = df_gps.index
    df_gps['Speed'] = df_gps['Speed'].fillna(0.0)  # Replace NaN speeds with 0

    # GPS Quality Assessment (if satellite data is available)
    if "Satellites" in df_gps.columns:
        df_gps['Satellites'] = df_gps['Satellites'].fillna(0)

        def gps_quality(sat_count):
            """Categorize GPS quality based on satellite count"""
            if sat_count >= 10: return "Excellent"
            elif sat_count >= 7: return "Good"
            elif sat_count >= 4: return "Acceptable"
            else: return "Poor"

        df_gps['GPS_Quality'] = df_gps['Satellites'].apply(gps_quality)

        # Print quality statistics
        quality_stats = df_gps['GPS_Quality'].value_counts()
        print(f"📡 GPS Quality Assessment:")
        print(f"   • Satellite count range: {df_gps['Satellites'].min():.0f} to {df_gps['Satellites'].max():.0f}")
        print(f"   • Average satellites: {df_gps['Satellites'].mean():.1f}")
        print(f"   • Quality distribution:")
        for quality, count in quality_stats.items():
            print(f"     - {quality}: {count} points ({count/len(df_gps)*100:.1f}%)")

        # Filter out poor quality GPS points (less than 4 satellites)
        high_quality_gps = df_gps[df_gps['Satellites'] >= 4]
        low_quality_count = len(df_gps) - len(high_quality_gps)

        if low_quality_count > 0:
            print(f"   • Filtered out {low_quality_count} poor quality GPS points (<4 satellites)")
            df_gps = high_quality_gps.reset_index(drop=True)
            df_gps["OriginalIndex"] = df_gps.index

    # Nothing left to analyze: stop with a clear message instead of failing
    # later on df_gps.iloc[0]
    if df_gps.empty:
        print("❌ No valid GPS points remain after coordinate/quality filtering")
        raise SystemExit("No valid GPS points available")

    # Calculate data statistics
    gps_duration = df_gps['timestamp'].max()
    print(f"✅ GPS DataFrame created: {len(df_gps)} valid points")
    print(f"📏 GPS temporal range: 0 to {gps_duration:.1f} seconds ({gps_duration/60:.1f} minutes)")
    print(f"📍 GPS range: Lat {df_gps['Latitude'].min():.6f} to {df_gps['Latitude'].max():.6f}")
    print(f"📍 GPS range: Lon {df_gps['Longitude'].min():.6f} to {df_gps['Longitude'].max():.6f}")

    # Generate route name based on coordinates
    route_name = get_route_name_from_coordinates(
        df_gps['Latitude'].min(), df_gps['Latitude'].max(),
        df_gps['Longitude'].min(), df_gps['Longitude'].max(),
        df_gps.iloc[0]['Latitude'],
        df_gps.iloc[0]['Longitude']
    )

    print(f"🚂 Detected route: {route_name}")

except Exception as e:
    print(f"❌ Error loading GPS data: {e}")
    # Raise SystemExit instead of calling exit(): exit() is an interactive
    # helper that does not reliably halt the rest of a notebook cell.
    raise SystemExit("GPS data could not be loaded")

# ====================
# VIBRATION DATA LOADING
# ====================

print(f"\n⚡ Loading vibration data...")
try:
    # Check vibration file sizes for memory planning
    vib1_size = os.path.getsize(files["vibration1"]) / (1024*1024*1024)  # Convert to GB
    vib2_size = os.path.getsize(files["vibration2"]) / (1024*1024*1024)  # Convert to GB
    print(f"📏 Vibration file sizes: Ch1={vib1_size:.2f}GB, Ch2={vib2_size:.2f}GB")

    # Number of vibration samples spanning the GPS duration
    required_vib_samples = int(gps_duration / dt_vibration)

    # Never read more rows than the memory safety limit allows
    max_reasonable_samples = min(required_vib_samples, MAX_VIBRATION_SAMPLES)

    print(f"📏 Required vibration samples: {required_vib_samples:,} ({gps_duration:.1f}s)")
    print(f"📏 Loading {max_reasonable_samples:,} samples for safe memory usage")
    # Estimate uses 16 bytes per row (presumably two float64 channels)
    print(f"💾 Estimated RAM usage: ~{max_reasonable_samples * 16 / 1024 / 1024:.0f}MB")

    # Load only the first max_reasonable_samples rows of each channel
    df_vib1 = pd.read_csv(files["vibration1"], header=None, names=["vibration1"], nrows=max_reasonable_samples)
    df_vib2 = pd.read_csv(files["vibration2"], header=None, names=["vibration2"], nrows=max_reasonable_samples)

    # Ensure both channels have the same length
    min_vib_len = min(len(df_vib1), len(df_vib2))

    # Combined vibration DataFrame; timestamps are synthesized from the row
    # number and the vibration sampling period
    df_vibration = pd.DataFrame({
        "vibration1": pd.to_numeric(df_vib1["vibration1"][:min_vib_len], errors="coerce"),
        "vibration2": pd.to_numeric(df_vib2["vibration2"][:min_vib_len], errors="coerce"),
        "timestamp": np.arange(min_vib_len) * dt_vibration
    })

    # Calculate overlap between GPS and vibration data
    vib_duration = df_vibration['timestamp'].max()
    overlap_duration = min(gps_duration, vib_duration)

    # Trim both datasets to the overlap period; GPS rows are renumbered
    df_gps = df_gps[df_gps['timestamp'] <= overlap_duration].reset_index(drop=True)
    df_gps["OriginalIndex"] = df_gps.index
    df_vibration = df_vibration[df_vibration['timestamp'] <= overlap_duration]

    print(f"✅ Vibration DataFrame created: {len(df_vibration):,} samples")
    print(f"📏 Vibration temporal range: 0 to {vib_duration:.1f} seconds ({vib_duration/60:.1f} minutes)")
    print(f"⚖️ Data overlap: {overlap_duration:.1f} seconds ({overlap_duration/60:.1f} minutes)")
    print(f"📊 Synchronized data: GPS {len(df_gps)} points, Vibration {len(df_vibration):,} samples")

except Exception as e:
    print(f"❌ Error loading vibration data: {e}")
    # Raise SystemExit instead of calling exit(): exit() is an interactive
    # helper that does not reliably halt the rest of a notebook cell.
    raise SystemExit("Vibration data could not be loaded")

# ====================
# INFRASTRUCTURE DATA LOADING
# ====================

print("🏗️ Loading infrastructure points from Code 1...")
try:
    # Read the infrastructure table written by Code 1
    df_infra = pd.read_csv("infrastructure_points.csv")
    total_points = len(df_infra)
    print(f"🏗️ Loaded {total_points} infrastructure points from Code 1")

    # Number of points per category, as a plain dict
    per_category = df_infra['Category'].value_counts()
    category_counts = per_category.to_dict()
    print(f"   • Categories: {category_counts}")

    # Share of each category among all infrastructure points
    for category, count in category_counts.items():
        print(f"   • {category}: {count} points ({count / total_points * 100:.1f}%)")

    # Extra note when the RailJoint count exceeds 100
    railjoints = category_counts.get('RailJoint', 0)
    if railjoints > 100:
        print(f"   🎉 MAJOR IMPROVEMENT: {railjoints} RailJoint points (vs ~20 previously when not using excel_comprehensive)")
        print(f"   📈 Expect significantly more RailJoint segments!")

except Exception as load_error:
    # Any failure above: continue with an empty infrastructure table
    print(f"❌ Error loading infrastructure points: {load_error}")
    print("Creating empty infrastructure DataFrame")
    df_infra = pd.DataFrame(columns=["Latitude", "Longitude", "Category"])

# ====================
# GPS POINT LABELING WITH INFRASTRUCTURE PROXIMITY
# ====================

print("\n🏷️ Labeling GPS points based on infrastructure proximity...")

def find_infrastructure_with_adaptive_thresholds(lat, lon, df_infra, type_thresholds=TYPE_SPECIFIC_THRESHOLDS):
    """
    Label a GPS coordinate using its single nearest infrastructure point.

    The nearest point (by approximate planar distance) is looked up, and its
    category is returned only if the distance is strictly below that
    category's own threshold. Farther points are never considered, even if
    they belong to a category with a larger threshold. The distance to the
    nearest point is always returned, regardless of the label.

    Args:
        lat, lon: GPS coordinates to check (decimal degrees)
        df_infra: DataFrame with 'Latitude', 'Longitude' and 'Category' columns
        type_thresholds: Dictionary mapping category name -> threshold in meters

    Returns:
        tuple: (infrastructure_category, distance_in_meters)
               ("Normal Track", distance) when the nearest point is outside its
               threshold or its category has no threshold entry;
               ("Normal Track", inf) when df_infra is empty
    """
    if len(df_infra) == 0:
        return "Normal Track", float('inf')

    # Flat-earth approximation: ~111 km per degree of latitude, with longitude
    # scaled by cos(latitude) of the query point
    lat_diff = (df_infra['Latitude'] - lat) * 111000
    lon_diff = (df_infra['Longitude'] - lon) * 111000 * np.cos(np.radians(lat))
    distances_meters = np.sqrt(lat_diff**2 + lon_diff**2)

    # argmin() is positional and therefore matches .iloc below; the previous
    # idxmin() returned an index *label*, which only coincides with the
    # position when df_infra carries a default 0..n-1 index
    min_pos = int(distances_meters.argmin())
    min_distance_meters = distances_meters.iloc[min_pos]
    closest_category = df_infra['Category'].iloc[min_pos]

    # Accept the label only when the nearest point is inside its own threshold
    if closest_category in type_thresholds:
        threshold = type_thresholds[closest_category]
        if min_distance_meters < threshold:
            return closest_category, min_distance_meters

    # Outside the threshold (or unknown category): normal track, real distance kept
    return "Normal Track", min_distance_meters

# Label every GPS point with its nearest infrastructure category and distance
print(f"   ✅ Adaptive infrastructure labeling with thresholds: {TYPE_SPECIFIC_THRESHOLDS}")

gps_labels, gps_distances = [], []

for idx, row in df_gps.iterrows():
    label, distance = find_infrastructure_with_adaptive_thresholds(
        row['Latitude'], row['Longitude'], df_infra
    )
    gps_labels.append(label)
    gps_distances.append(distance)

    # Heartbeat for long runs: one line every 2000 index values
    if not (idx + 1) % 2000:
        print(f"      Processed {idx + 1}/{len(df_gps)} GPS points...")

# Attach the results as new GPS columns
df_gps['InfraLabel'] = gps_labels
df_gps['InfraDistance_m'] = gps_distances

# An infinite distance means "no infrastructure table at all"; store it as None
df_gps['InfraDistance_m'] = df_gps['InfraDistance_m'].replace([float('inf')], None)

# Summary per label: share of points plus distance statistics
label_counts = pd.Series(gps_labels).value_counts()
print("\n📊 GPS Point Labeling Results:")
for label, count in label_counts.items():
    print(f"   • {label}: {count} points ({count/len(df_gps)*100:.1f}%)")

    # Finite distances of the points carrying this label (Normal Track included)
    label_distances = [
        dist
        for lbl, dist in zip(gps_labels, gps_distances)
        if lbl == label and dist != float('inf')
    ]
    if not label_distances:
        continue

    print(f"     - Distance range: {min(label_distances):.1f}m to {max(label_distances):.1f}m")
    if label == 'Normal Track':
        # Normal track: report the mean distance to the nearest infrastructure
        avg_distance = sum(label_distances) / len(label_distances)
        print(f"     - Average distance to nearest infrastructure: {avg_distance:.1f}m")
    else:
        # Infrastructure label: report the threshold that admitted these points
        threshold = TYPE_SPECIFIC_THRESHOLDS.get(label, 'N/A')
        print(f"     - Threshold used: {threshold}m")

# ====================
# VIBRATION SEGMENTATION WITH PROPER LABELING
# ====================

print("\n🔧 Creating vibration segments with categorical labels...")

# Number of vibration samples per GPS point. The quotient is rounded before the
# int conversion because float division can land just below the intended
# integer (classic case: int(0.7 / 0.1) == 6), and plain int() would truncate.
samples_per_gps = int(round(dt_gps / dt_vibration))

print(f"📡 Sampling Rate Synchronization:")
print(f"   • GPS: {GPS_SAMPLING_RATE} Hz ({dt_gps}s intervals)")
print(f"   • Vibration: {VIBRATION_SAMPLING_RATE} Hz ({dt_vibration}s intervals)")
print(f"   • Ratio: 1 GPS point = {samples_per_gps} vibration samples")

# Number of vibration samples per segment (rounded for the same reason as above)
segment_length = int(round(SEGMENT_DURATION_SECONDS / dt_vibration))

# Storage for segments and their metadata (parallel lists, one entry per kept segment)
segments = []
segment_start_times = []
segment_labels = []
segment_gps_indices = []  # Row position in df_gps for each segment, or -1 if unmapped

print(f"\n📏 Segment parameters:")
print(f"   • Segment duration: {SEGMENT_DURATION_SECONDS} seconds")
print(f"   • Samples per segment: {segment_length} samples")

# Create segments if vibration data is available
if not df_vibration.empty and len(df_gps) > 0:
    num_segments = len(df_vibration) // segment_length

    print_limit = 15    # Only the first 15 segments are logged individually

    for i in range(num_segments):
        # Segment boundaries as row positions in df_vibration
        start_idx = i * segment_length
        end_idx = (i + 1) * segment_length

        # Two-channel vibration data for this segment
        seg_data = df_vibration.iloc[start_idx:end_idx][["vibration1", "vibration2"]].values

        # Only complete segments without NaN samples are kept
        if len(seg_data) == segment_length and not np.isnan(seg_data).any():
            segment_start_time = df_vibration.iloc[start_idx]["timestamp"]

            # Match the segment to the GPS point whose timestamp is closest
            # to the segment start
            time_differences = np.abs(df_gps['timestamp'] - segment_start_time)

            # argmin() yields a row *position*, which is what every .iloc
            # lookup below (and later consumers of segment_gps_indices) expects.
            # The previous idxmin() yielded an index label, which is only the
            # same thing when df_gps has a default 0..n-1 index.
            min_time_diff_idx = int(time_differences.argmin())
            min_time_diff = time_differences.iloc[min_time_diff_idx]

            # Only associate with the GPS point if timing is reasonable (within 1 second)
            if min_time_diff < 1.0:
                segment_label = df_gps['InfraLabel'].iloc[min_time_diff_idx]
                gps_point_index = min_time_diff_idx
                if i < print_limit:
                    print(f"   Segment {i}: GPS point {gps_point_index}, Label='{segment_label}', Time diff={min_time_diff:.2f}s")
            else:
                # No valid GPS mapping - label as normal track, -1 marks "no GPS point"
                segment_label = "Normal Track"
                gps_point_index = -1
                if i < print_limit:
                    print(f"   Segment {i}: No GPS mapping (time diff={min_time_diff:.2f}s), using 'Normal Track'")
            if i == print_limit and num_segments > print_limit:
                print(f"   ... and {num_segments - print_limit} more")

            # Store segment and metadata
            segments.append(seg_data)
            segment_start_times.append(segment_start_time)
            segment_labels.append(segment_label)
            segment_gps_indices.append(gps_point_index)

    segments = np.array(segments)
    print(f"   • Created segments: {len(segments)}")
else:
    segments = np.array([])
    print("   • No vibration data available for segmentation")

print(f"✅ Created {len(segments)} vibration segments with categorical labels")

# ====================
# LABELING QUALITY ASSESSMENT AND VALIDATION
# ====================

def analyze_labeling_performance(segment_labels, type_thresholds=None):
    """
    Analyze the effectiveness of adaptive threshold-based infrastructure labeling.

    This function prints:
    - Distribution of detected infrastructure types
    - RailJoint detection count with a coarse rating
    - Share of segments labeled as infrastructure, with a plausibility rating
    - Per-type threshold and detected segment count
    - Follow-up recommendations

    Args:
        segment_labels: Sequence of segment labels from infrastructure detection
        type_thresholds: Optional dictionary of thresholds (meters) used for each
            infrastructure type; only used for reporting

    Returns:
        None (prints analysis results). With no segments, a short notice is
        printed and the percentage-based analysis is skipped.
    """

    print(f"\n📊 INFRASTRUCTURE LABELING PERFORMANCE ANALYSIS:")
    print("="*60)

    total_segments = len(segment_labels)
    if total_segments == 0:
        # Every ratio below divides by the segment count; without segments
        # there is nothing to rate, so report that instead of crashing
        print("   ⚠️ No segments available - labeling performance analysis skipped")
        return

    label_counts = pd.Series(segment_labels).value_counts()
    normal_track_segments = label_counts.get('Normal Track', 0)
    infrastructure_segments = total_segments - normal_track_segments

    # 1. Overall detection results
    print(f"🎯 Detection Results Summary:")
    for label, count in label_counts.items():
        percentage = count/total_segments*100
        print(f"   • {label}: {count} segments ({percentage:.1f}%)")

    # 2. RailJoint detection analysis (critical for vibration analysis)
    railjoints = label_counts.get('RailJoint', 0)
    print(f"\n🔍 RailJoint Detection Assessment:")
    print(f"   • RailJoint segments detected: {railjoints}")
    print(f"   • RailJoint coverage: {railjoints/total_segments*100:.1f}% of total journey")

    # Performance rating for RailJoint detection (absolute segment counts)
    if railjoints > 30:
        print(f"   ✅ EXCELLENT: High RailJoint detection rate achieved!")
    elif railjoints > 15:
        print(f"   📈 GOOD: Significant RailJoint detection improvement")
    elif railjoints > 5:
        print(f"   📊 MODERATE: Reasonable RailJoint detection")
    else:
        threshold_info = f" (current threshold: {type_thresholds.get('RailJoint', 'N/A')}m)" if type_thresholds else ""
        print(f"   ⚠️ LIMITED: Low detection rate{threshold_info}")
        print(f"      → Consider reducing RailJoint threshold or verifying infrastructure data")

    # 3. Infrastructure density validation
    infrastructure_percentage = infrastructure_segments / total_segments * 100
    print(f"\n⚖️ Infrastructure Density Validation:")
    print(f"   • Total infrastructure coverage: {infrastructure_segments}/{total_segments} ({infrastructure_percentage:.1f}%)")

    # Density reasonableness check
    if infrastructure_percentage > 80:
        print(f"   ⚠️ VERY HIGH: Possible over-labeling - review thresholds")
        print(f"      → Consider increasing threshold distances")
    elif infrastructure_percentage > 60:
        print(f"   ⚠️ HIGH: Dense infrastructure labeling detected")
        print(f"      → Verify if route actually has high infrastructure density")
    elif infrastructure_percentage > 20:
        print(f"   ✅ OPTIMAL: Realistic infrastructure density for railway analysis")
    elif infrastructure_percentage > 5:
        print(f"   📊 CONSERVATIVE: Low but acceptable infrastructure coverage")
        print(f"      → Consider reducing thresholds if more detection needed")
    else:
        print("   ⚠️ VERY LOW: Likely under-detection or data issues")
        print("      → Check GPS coverage and infrastructure database completeness")

    # 4. Threshold effectiveness summary
    print(f"\n🎛️ Adaptive Threshold Configuration:")
    if type_thresholds:
        for infra_type, threshold in type_thresholds.items():
            detected_count = label_counts.get(infra_type, 0)
            print(f"   • {infra_type}: {threshold}m threshold → {detected_count} segments detected")

    # 5. Recommendations for next steps
    print(f"\n💡 ANALYSIS RECOMMENDATIONS:")
    if infrastructure_percentage < 10:
        print("   • Consider reducing threshold distances for better coverage")
        print("   • Verify infrastructure database matches the analyzed route")
    elif infrastructure_percentage > 60:
        print("   • Consider increasing threshold distances to reduce over-labeling")
        print("   • Review if multiple close infrastructure points cause duplicate labeling")
    else:
        print("   • Labeling performance appears suitable for vibration analysis")
        print("   • Data ready for machine learning classification tasks")


def analyze_temporal_distribution(segment_labels, segment_start_times, segment_gps_indices, df_gps):
    """
    Print where and when labeled infrastructure occurs along the journey.

    Four report sections are produced:
    - Journey duration (largest GPS timestamp)
    - Timeline of segments whose label is not 'Normal Track'
    - Distance statistics for Bridge, Turnout and RailJoint segments
    - Time gaps between consecutive infrastructure segments

    Args:
        segment_labels: Infrastructure label for each segment
        segment_start_times: Start time (seconds) for each segment
        segment_gps_indices: Row position in df_gps for each segment
            (positions outside 0..len(df_gps)-1 are ignored)
        df_gps: GPS DataFrame with 'timestamp' and 'InfraDistance_m' columns

    Returns:
        None (prints analysis results)
    """
    print("\n🔍 TEMPORAL AND SPATIAL DISTRIBUTION ANALYSIS:")
    print("=" * 60)

    # 1. Journey temporal coverage
    journey_seconds = df_gps['timestamp'].max()
    print("⏱️ Journey Coverage:")
    print(f"   • Total journey duration: {journey_seconds:.1f} seconds ({journey_seconds/60:.1f} minutes)")

    # 2. Infrastructure timing: one event per non-normal segment, in segment order
    events = [
        {
            'segment_idx': seg_no,
            'type': label,
            'start_time': start_time,
            'end_time': start_time + SEGMENT_DURATION_SECONDS,
        }
        for seg_no, (label, start_time) in enumerate(zip(segment_labels, segment_start_times))
        if label != 'Normal Track'
    ]

    print("\n🏗️ Infrastructure Event Timeline:")
    print(f"   • Infrastructure events detected: {len(events)}")

    if events:
        first_event, last_event = events[0], events[-1]
        print(f"   • First infrastructure: {first_event['type']} at {first_event['start_time']:.1f}s")
        print(f"   • Last infrastructure: {last_event['type']} at {last_event['start_time']:.1f}s")

        # Only the first eight events are listed; the rest are summarized
        print("\n📋 Infrastructure Event Sample (first 8):")
        for event in events[:8]:
            print(f"      {event['start_time']:6.1f}s: {event['type']} (Segment {event['segment_idx']})")
        if len(events) > 8:
            print(f"      ... and {len(events) - 8} more events")

    # 3. Distance analysis by infrastructure type
    print("\n📏 Distance Analysis by Infrastructure Type:")
    for infrastructure_type in ('Bridge', 'Turnout', 'RailJoint'):
        type_distances = []
        for seg_no, label in enumerate(segment_labels):
            if not label == infrastructure_type:
                continue
            gps_idx = segment_gps_indices[seg_no]
            if not (0 <= gps_idx < len(df_gps)):
                continue  # segment has no usable GPS row
            distance = df_gps.iloc[gps_idx]['InfraDistance_m']
            if distance is None or np.isinf(distance):
                continue  # no measurable distance stored for this row
            type_distances.append(distance)

        if type_distances:
            print(f"   • {infrastructure_type}: {len(type_distances)} segments")
            print(f"     - Distance range: {min(type_distances):.1f}m to {max(type_distances):.1f}m")
            print(f"     - Average distance to reference point: {np.mean(type_distances):.1f}m")
            print(f"     - Threshold used: {TYPE_SPECIFIC_THRESHOLDS.get(infrastructure_type, 'N/A')}m")

    # 4. Infrastructure spacing: gap from one event's end to the next event's start
    print("\n⏰ Infrastructure Spacing Analysis:")
    if len(events) <= 1:
        print("   • Insufficient infrastructure events for spacing analysis")
        return

    time_gaps = [
        later['start_time'] - earlier['end_time']
        for earlier, later in zip(events, events[1:])
    ]
    print(f"   • Average spacing between infrastructure: {np.mean(time_gaps):.1f} seconds")
    print(f"   • Spacing range: {min(time_gaps):.1f}s to {max(time_gaps):.1f}s")

    # Gaps under five seconds are flagged as possible over-labeling
    short_gaps = [gap for gap in time_gaps if gap < 5]
    if short_gaps:
        print(f"   ⚠️ {len(short_gaps)} very close infrastructure pairs (<5s apart) detected")
        print("      → Review if this represents genuine infrastructure clustering")
    else:
        print("   ✅ No concerning infrastructure clustering detected")


# ====================
# EXECUTE QUALITY ASSESSMENT
# ====================

# Banner for the quality-assessment part of the console report
print(f"\n{'=' * 73}")
print("🔬 INFRASTRUCTURE LABELING QUALITY ASSESSMENT")
print("=" * 73)

# Label distribution, RailJoint rating and density check
analyze_labeling_performance(
    segment_labels,
    type_thresholds=TYPE_SPECIFIC_THRESHOLDS,
)

# Event timeline, per-type distances and spacing between events
analyze_temporal_distribution(
    segment_labels,
    segment_start_times,
    segment_gps_indices,
    df_gps,
)

print("\n✅ Quality assessment complete - data ready for vibration analysis")
print("=" * 73)

# ====================
# INTERACTIVE MAP CREATION WITH MULTI-LAYER VISUALIZATION
# ====================
"""
Create a comprehensive interactive map showing:
1. GPS track colored by train speed (continuous route visualization)
2. Infrastructure reference points (bridges, rail joints, turnouts)
3. Route start/end markers with detailed metadata
4. Interactive hover information and click functionality

The map uses OpenStreetMap for detailed railway infrastructure context
and provides clickable elements that trigger vibration data display.
"""

print("\n🗺️ Creating interactive map with speed visualization and infrastructure...")

map_fig = go.Figure()

# ====================
# GPS TRACK LAYER: SPEED-COLORED ROUTE VISUALIZATION
# ====================
# Display the complete GPS track as individual markers colored by speed.
# Each point carries customdata (OriginalIndex, InfraLabel) for click handling.

if not df_gps.empty:
    # Build one hover string per GPS point.
    # Missing values are detected with pd.notna() because pandas stores them as
    # NaN in numeric columns: an "is not None" test lets NaN through, which
    # rendered "nan" in the hover text and made int(sat) raise ValueError.
    gps_hover_text = []
    for i, (lat, lon, speed, timestamp, label, distance, sat, qual) in enumerate(zip(
        df_gps["Latitude"],
        df_gps["Longitude"],
        df_gps["Speed"],
        df_gps["timestamp"],
        df_gps["InfraLabel"],
        df_gps["InfraDistance_m"],
        df_gps.get("Satellites", [None] * len(df_gps)),   # optional column
        df_gps.get("GPS_Quality", [None] * len(df_gps))   # optional column
    )):
        hover_info = f"GPS Point {i}<br>Infrastructure: {label}"
        hover_info += f"<br>Speed: {speed:.1f} km/h" if pd.notna(speed) else "<br>Speed: N/A"
        hover_info += f"<br>Time: {timestamp:.1f}s" if pd.notna(timestamp) else "<br>Time: N/A"
        hover_info += f"<br>Coordinates: {lat:.6f}, {lon:.6f}"
        hover_info += f"<br>Distance to infra: {distance:.1f}m" if pd.notna(distance) else "<br>Distance: N/A"
        if pd.notna(sat):
            hover_info += f"<br>Satellites: {int(sat)}"
        # Falsy quality values (0, empty string) stay hidden, as before
        if pd.notna(qual) and qual:
            hover_info += f"<br>GPS Quality: {qual}"
        gps_hover_text.append(hover_info)

    map_fig.add_trace(go.Scattermap(
        lat=df_gps["Latitude"],
        lon=df_gps["Longitude"],
        mode="markers",
        marker=dict(
            size=4,  # Small markers for dense GPS data
            color=df_gps["Speed"],  # Color mapping based on train speed
            colorscale="Viridis",
            showscale=True,
            colorbar=dict(
                title="Train Speed (km/h)",
                x=1.02,  # Position to avoid overlap with legend
                len=0.7,
                y=0.7,
                tickmode='linear',
                tick0=df_gps["Speed"].min(),
                dtick=(df_gps["Speed"].max() - df_gps["Speed"].min()) / 5  # five tick intervals
            ),
            cmin=df_gps["Speed"].min(),
            cmax=df_gps["Speed"].max(),
            opacity=0.8  # Slight transparency for overlapping points
        ),
        text=gps_hover_text,
        hovertemplate="%{text}<extra></extra>",
        customdata=df_gps[["OriginalIndex", "InfraLabel"]].values,  # For callback click handling
        name=f"GPS Track ({len(df_gps)} points)",
        showlegend=True
    ))

# ====================
# ROUTE START POINT: JOURNEY BEGINNING MARKER
# ====================
"""
Mark the beginning of the analyzed route with enhanced metadata display.
Provides context about journey initiation and initial conditions.
"""

# First GPS row marks the journey start
start_point = df_gps.iloc[0]

# The distance line is built separately. Adjacent f-string literals are joined
# before the conditional expression is evaluated, so writing "... if ... else"
# after the last literal made the condition govern the WHOLE hover text: a
# missing distance replaced everything with just "Distance: N/A".
# pd.notna() also covers NaN, which "is not None" let through.
start_distance_text = (
    f"Distance to infra: {start_point['InfraDistance_m']:.1f}m"
    if pd.notna(start_point['InfraDistance_m'])
    else "Distance: N/A"
)
start_hover = (f"🚩 JOURNEY START<br>"
               f"Infrastructure: {start_point['InfraLabel']}<br>"
               f"Speed: {start_point['Speed']:.1f} km/h<br>"
               f"Time: {start_point['timestamp']:.1f}s<br>"
               f"Coordinates: {start_point['Latitude']:.6f}, {start_point['Longitude']:.6f}<br>"
               f"{start_distance_text}")

map_fig.add_trace(go.Scattermap(
    lat=[start_point["Latitude"]],
    lon=[start_point["Longitude"]],
    mode="markers",
    marker=dict(
        size=15,
        symbol="circle",
        color="purple"
    ),
    name="Start Point",
    # Row position 0 plus label, mirroring the (index, label) layout of the GPS track customdata
    customdata=[[0, start_point["InfraLabel"]]],
    text=[start_hover],
    hovertemplate="%{text}<extra></extra>",
    showlegend=True
))

# ====================
# ROUTE END POINT: JOURNEY CONCLUSION MARKER  
# ====================
"""
Mark the end of the analyzed route with comprehensive journey summary.
Shows final conditions and total route coverage information.
"""

# Last GPS row marks the journey end; duration is measured from the start marker's timestamp
end_point = df_gps.iloc[-1]
journey_duration = end_point['timestamp'] - start_point['timestamp']

# The distance line is built separately. Adjacent f-string literals are joined
# before the conditional expression is evaluated, so writing "... if ... else"
# after the last literal made the condition govern the WHOLE hover text: a
# missing distance replaced everything with just "Distance: N/A".
# pd.notna() also covers NaN, which "is not None" let through.
end_distance_text = (
    f"Distance to infra: {end_point['InfraDistance_m']:.1f}m"
    if pd.notna(end_point['InfraDistance_m'])
    else "Distance: N/A"
)
end_hover = (f"🏁 JOURNEY END<br>"
             f"Infrastructure: {end_point['InfraLabel']}<br>"
             f"Speed: {end_point['Speed']:.1f} km/h<br>"
             f"Time: {end_point['timestamp']:.1f}s<br>"
             f"Duration: {journey_duration:.1f}s ({journey_duration/60:.1f} min)<br>"
             f"Coordinates: {end_point['Latitude']:.6f}, {end_point['Longitude']:.6f}<br>"
             f"{end_distance_text}")

map_fig.add_trace(go.Scattermap(
    lat=[end_point["Latitude"]],
    lon=[end_point["Longitude"]],
    mode="markers",
    marker=dict(
        size=15,
        symbol="circle",
        color="orange"
    ),
    name="End Point",
    # Last row position plus label, mirroring the (index, label) layout of the GPS track customdata
    customdata=[[len(df_gps) - 1, end_point["InfraLabel"]]],
    text=[end_hover],
    hovertemplate="%{text}<extra></extra>",
    showlegend=True
))

# ====================
# INFRASTRUCTURE REFERENCE POINTS: STATIC MARKERS FROM DATABASE
# ====================
"""
Add infrastructure reference points from the database as distinct markers.
These represent the 'ground truth' infrastructure locations used for GPS labeling.
Each type uses different colors and sizes for easy identification.
"""

# Marker appearance per infrastructure category; categories without an entry are not drawn
infra_styling = {
    "Bridge": dict(color="red", size=10, symbol="circle"),
    "RailJoint": dict(color="blue", size=8, symbol="circle"),
    "Turnout": dict(color="green", size=12, symbol="circle"),
}

# One map trace per styled category, in order of first appearance in the table
for category in df_infra['Category'].unique():
    if category not in infra_styling:
        continue

    cat_data = df_infra[df_infra['Category'] == category]
    if cat_data.empty:
        continue

    # Hover text: category, coordinates and the labeling threshold for this type
    infra_hover_text = []
    for lat, lon in zip(cat_data["Latitude"], cat_data["Longitude"]):
        infra_hover_text.append(
            f"{category} Infrastructure<br>"
            f"Reference Point<br>"
            f"Coordinates: {lat:.6f}, {lon:.6f}<br>"
            f"Used for GPS labeling within {TYPE_SPECIFIC_THRESHOLDS.get(category, 'N/A')}m"
        )

    style = infra_styling[category]
    category_marker = dict(
        size=style["size"],
        color=style["color"],
        symbol=style["symbol"],
    )
    map_fig.add_trace(
        go.Scattermap(
            lat=cat_data["Latitude"],
            lon=cat_data["Longitude"],
            mode="markers",
            marker=category_marker,
            name=f"{category} ({len(cat_data)})",
            customdata=cat_data[["Latitude", "Longitude", "Category"]].values,
            text=infra_hover_text,
            hovertemplate="%{text}<extra></extra>",
            showlegend=True,
        )
    )

# ====================
# MAP LAYOUT CONFIGURATION
# ====================
"""
Configure the map display settings for optimal railway route visualization.
Centers the view on the route and provides appropriate zoom for infrastructure detail.
"""

# Center the view on the mean GPS position
center_lat = df_gps["Latitude"].mean()
center_lon = df_gps["Longitude"].mean()

# Route extent in degrees; the larger of the two spans drives the zoom level
lat_range = df_gps["Latitude"].max() - df_gps["Latitude"].min()
lon_range = df_gps["Longitude"].max() - df_gps["Longitude"].min()
max_range = max(lat_range, lon_range)

# Wider routes get a lower zoom: >0.1° -> 9, >0.05° -> 10, otherwise 11
zoom_level = 9 if max_range > 0.1 else (10 if max_range > 0.05 else 11)

map_fig.update_layout(
    title={
        "text": (
            f"GPS Track: {route_name}<br>"
            f"<sub>Source: {selected_folder} | "
            f"GPS Points: {len(df_gps)} | "
            f"Infra Points: {len(df_infra)} | "
            f"Duration: {journey_duration:.1f}s</sub>"
        ),
        "x": 0.5,
        "xanchor": 'center',
    },
    map={
        "style": "open-street-map",
        "zoom": zoom_level,
        "center": {"lat": center_lat, "lon": center_lon},
    },
    height=600,
    legend={
        "x": 0.02,  # legend sits at the top-left corner of the map
        "y": 0.98,
        "bgcolor": "rgba(255,255,255,0.95)",
        "bordercolor": "rgba(0,0,0,0.3)",
        "borderwidth": 1,
        "font": {"size": 10},
    },
    # Wider right margin leaves room for the speed colorbar
    margin={"r": 140, "l": 10, "t": 80, "b": 10},
)

# ====================
# INITIALIZE EMPTY VIBRATION PLOT
# ====================
"""
Create placeholder vibration plot that will be populated when users click GPS points.
Provides clear instructions and maintains consistent layout.
"""

# Placeholder figure shown until a GPS point is clicked
vib_empty_fig = go.Figure()
vib_empty_fig.update_layout(
    title="🎯 Interactive Vibration Analysis<br><sub>Click any GPS point on the map to display corresponding vibration data</sub>",
    xaxis_title="Time (seconds)",
    yaxis_title="Acceleration (m/s²)",
    height=600,
    xaxis=dict(showgrid=True, gridcolor='lightgray'),
    yaxis=dict(showgrid=True, gridcolor='lightgray'),
    plot_bgcolor='white'
)

print(f"✅ Interactive map created with {len(df_gps)} GPS points and {len(df_infra)} infrastructure references")
print(f"   🎨 Speed visualization: {df_gps['Speed'].min():.1f} - {df_gps['Speed'].max():.1f} km/h range")
# str.join() raises TypeError on non-string items, so missing categories
# (NaN from blank CSV cells) are dropped and the rest converted with str()
infra_type_names = ', '.join(map(str, df_infra['Category'].dropna().unique()))
print(f"   🏗️ Infrastructure types: {infra_type_names}")
print(f"   📍 Route coverage: {lat_range:.4f}° lat × {lon_range:.4f}° lon")

# ====================
# SAVE LABELED SEGMENTS DATA TO CSV
# ====================
"""
This section creates a comprehensive CSV file containing all labeled vibration segments
with their corresponding infrastructure labels, GPS coordinates, and metadata.

The output CSV serves as the primary dataset for machine learning classification tasks
and provides a complete record of the infrastructure labeling process.
"""

print("\n💾 Saving labeled segments to CSV...")

# ====================
# EXTRACT INFRASTRUCTURE DISTANCES FROM GPS DATA
# ====================
# For each vibration segment, look up the GPS row it was matched to and take
# that row's infrastructure distance. Both lists stay aligned with the
# segments: one entry per segment, None where no GPS row applies.

infrastructure_distances = []  # Distance from each segment to nearest infrastructure
corresponding_gps_data = []    # Full GPS row for each segment

for i, gps_idx in enumerate(segment_gps_indices):
    # segment_gps_indices holds the df_gps row position chosen for each segment
    # during segmentation; positions outside the table mean "no GPS match"
    if not (0 <= gps_idx < len(df_gps)):
        infrastructure_distances.append(None)
        corresponding_gps_data.append(None)
        continue

    gps_point = df_gps.iloc[gps_idx]
    infra_distance = gps_point.get('InfraDistance_m', None)

    # Infinite distances stand for "no infrastructure points in the database";
    # store them as None for consistency
    if infra_distance == float('inf') or infra_distance == -float('inf'):
        infra_distance = None

    infrastructure_distances.append(infra_distance)
    corresponding_gps_data.append(gps_point)

# ====================
# UTILITY FUNCTION FOR CLEAN METADATA FORMATTING
# ====================

def format_thresholds_for_filename(thresholds_dict):
    """
    Render a thresholds mapping as one underscore-joined string.

    Every key/value pair becomes "<key>_<value>m"; the pairs are joined with
    "_" in the mapping's iteration order.

    Example: {'Bridge': 50, 'Turnout': 30} → "Bridge_50m_Turnout_30m"

    Args:
        thresholds_dict: Mapping of infrastructure type name to threshold value

    Returns:
        str: The joined string (empty string for an empty mapping)
    """
    pieces = []
    for infra_name, limit in thresholds_dict.items():
        pieces.append(f"{infra_name}_{limit}m")
    return "_".join(pieces)

# ====================
# CREATE COMPREHENSIVE SEGMENTS DATAFRAME
# ====================
"""
This DataFrame contains all the information needed for machine learning classification:
- Vibration segment metadata (timing, duration)
- Infrastructure labels (multiple formats for different use cases)
- GPS coordinates and speed information
- Distance measurements to nearest infrastructure
- Processing metadata for reproducibility
"""

# Values that feed more than one column are computed once up front.
segment_count = len(segments)
infrastructure_flags = [label != 'Normal Track' for label in segment_labels]
labeling_method_tag = f"adaptive_thresholds_{format_thresholds_for_filename(TYPE_SPECIFIC_THRESHOLDS)}"


def _gps_column(field_name):
    """Return field_name from each matched GPS row; None where a segment has no GPS row."""
    return [None if gps_row is None else gps_row[field_name]
            for gps_row in corresponding_gps_data]


segments_df = pd.DataFrame({
    # --- Segment identification ---
    'segment_index': range(segment_count),            # 0, 1, 2, ...
    'timestamp': segment_start_times,                 # Segment start time

    # --- Labels, in several encodings ---
    'primary_label': segment_labels,                  # Label as assigned to the segment
    'infrastructure_type': segment_labels,            # Same values as primary_label
    'infrastructure_category': ['Infrastructure' if flag else 'Normal Track'
                                for flag in infrastructure_flags],  # Two-class text label
    'is_infrastructure_boolean': infrastructure_flags,              # True unless 'Normal Track'

    # --- GPS correspondence and timing ---
    'corresponding_gps_index': segment_gps_indices,   # Row position in df_gps
    'start_time': segment_start_times,                # Same values as timestamp
    'end_time': [start + SEGMENT_DURATION_SECONDS for start in segment_start_times],
    'segment_length': [segment_length] * segment_count,                 # Samples per segment
    'segment_duration_sec': [SEGMENT_DURATION_SECONDS] * segment_count, # Duration in seconds

    # --- Spatial context taken from the matched GPS row ---
    'distance_to_infrastructure_m': infrastructure_distances,
    'gps_latitude': _gps_column('Latitude'),
    'gps_longitude': _gps_column('Longitude'),
    'gps_speed_kmh': _gps_column('Speed'),

    # --- Processing metadata ---
    'folder_source': [selected_folder] * segment_count,        # Source data folder
    'labeling_method': [labeling_method_tag] * segment_count,  # Thresholds used for this run
})

# ====================
# DISPLAY DATA STRUCTURE INFORMATION
# ====================

# Short description of the label-related columns, printed for the reader of the log
column_notes = (
    "   • primary_label: Specific infrastructure type (Bridge/Turnout/RailJoint/Normal Track)",
    "   • infrastructure_type: Identical to primary_label (provided for ML training clarity)",
    "   • infrastructure_category: Binary classification (Infrastructure/Normal Track)",
    "   • is_infrastructure_boolean: Boolean format (True=Infrastructure, False=Normal Track)",
    "   • distance_to_infrastructure_m: Distance in meters to nearest infrastructure reference point",
    "   • GPS coordinates: Spatial location where this vibration segment was recorded",
)

print("📋 Column Definitions and Purposes:")
for note in column_notes:
    print(note)

# Print the first rows of the label columns so the structure can be checked by eye
print("\n📊 Sample Data Structure (first 10 rows):")
sample_columns = ['primary_label', 'infrastructure_type', 'infrastructure_category', 'is_infrastructure_boolean']
sample_df = segments_df.head(10)[sample_columns]
print(sample_df.to_string())

# ====================
# STATISTICAL SUMMARY OF LABELING RESULTS
# ====================

def _print_share_lines(counts, total):
    """Print one bullet per entry of counts together with its percentage of total."""
    for name, count in counts.items():
        print(f"   • {name}: {count} segments ({count / total * 100:.1f}%)")


total_segments = len(segments_df)

# Per-label counts
print("\n📊 Infrastructure Label Distribution:")
label_distribution = segments_df['primary_label'].value_counts()
_print_share_lines(label_distribution, total_segments)

# Two-class counts
print("\n📊 Binary Category Distribution:")
category_distribution = segments_df['infrastructure_category'].value_counts()
_print_share_lines(category_distribution, total_segments)

# ====================
# DISTANCE MEASUREMENT ANALYSIS
# ====================

print("\n📏 Distance Statistics for All Segments:")
segment_distances = segments_df['distance_to_infrastructure_m']
valid_distance_count = segment_distances.notna().sum()
total_segment_count = len(segments_df)

print(f"   • Segments with valid distance measurements: {valid_distance_count}/{total_segment_count}")

if valid_distance_count > 0:
    valid_distances = segment_distances.dropna()
    # Each statistic is evaluated right before its own line is printed
    for caption, statistic in (("Minimum", valid_distances.min),
                               ("Average", valid_distances.mean),
                               ("Maximum", valid_distances.max)):
        print(f"   • {caption} distance to infrastructure: {statistic():.1f}m")
else:
    print("   ⚠️ No valid distance measurements found - check infrastructure database")

# ====================
# SAVE TO CSV FILE
# ====================

# Build a descriptive filename from the route and folder names.
# Spaces become underscores and parentheses are dropped from the folder name.
# '/' is replaced with '-' because it would otherwise be read as a path
# separator and point to_csv at a directory that does not exist.
safe_route_for_csv = route_name.replace(' ', '_').replace('/', '-')
safe_folder_for_csv = (selected_folder.replace(' ', '_')
                                      .replace('/', '-')
                                      .replace('(', '')
                                      .replace(')', ''))
output_filename = f"SL_labeled_segments_{safe_route_for_csv}_{safe_folder_for_csv}.csv"

# Write every column of segments_df; the DataFrame index is not written
segments_df.to_csv(output_filename, index=False)
print(f"\n💾 Saved {len(segments_df)} labeled segments to: {output_filename}")

# ====================
# DATA QUALITY VERIFICATION
# ====================

print("\n✅ DATA QUALITY VERIFICATION:")

# 1. Distinct values present in the label columns
primary_labels_seen = sorted(segments_df['primary_label'].unique())
categories_seen = sorted(segments_df['infrastructure_category'].unique())
booleans_seen = sorted(bool(flag) for flag in segments_df['is_infrastructure_boolean'].unique())
print(f"   • Unique primary labels: {primary_labels_seen}")
print(f"   • Unique infrastructure categories: {categories_seen}")
print(f"   • Boolean values present: {booleans_seen}")

# 2. Segments without a distance value
lacks_distance = segments_df['distance_to_infrastructure_m'].isna()
null_distances = lacks_distance.sum()
if null_distances > 0:
    percentage_missing = (null_distances / len(segments_df)) * 100
    print(f"   ⚠️ Missing distances: {null_distances} segments ({percentage_missing:.1f}%) have no distance data")
    print("      → This is normal for segments without corresponding GPS data")
else:
    print("   ✅ Distance completeness: All segments have valid distance measurements")

# 3. Segments that carry a non-negative GPS index but still have no distance
has_gps_index = segments_df['corresponding_gps_index'] >= 0
missing_distance_with_gps = segments_df[has_gps_index & lacks_distance]
inconsistent_count = len(missing_distance_with_gps)

if inconsistent_count > 0:
    print(f"   ⚠️ DATA INCONSISTENCY: {inconsistent_count} segments have GPS data but missing distances")
    print("      → This indicates a potential issue in the GPS-distance calculation pipeline")
    print("      → Recommend reviewing the infrastructure distance calculation code")
else:
    print("   ✅ Data consistency: All segments with GPS data have corresponding distance measurements")

# 4. Overall counts
segment_total = len(segments_df)
infrastructure_segment_count = segments_df['is_infrastructure_boolean'].sum()
normal_track_count = segment_total - infrastructure_segment_count
segments_with_gps = segments_df['gps_latitude'].notna().sum()

print("\n📋 FINAL DATASET SUMMARY:")
print(f"   • Total segments processed: {segment_total}")
print(f"   • Infrastructure segments: {infrastructure_segment_count}")
print(f"   • Normal track segments: {normal_track_count}")
print(f"   • GPS coverage: {segments_with_gps}/{segment_total} segments")
print("   • Ready for machine learning classification: ✅")

# ====================
# PREPARE STATISTICS FOR DASH DASHBOARD DISPLAY
# ====================
"""
This section calculates summary statistics that will be displayed in the Dash web interface.
These statistics provide users with an overview of the analysis results and data quality.

Key metrics calculated:
- Segment labeling distribution (how many of each infrastructure type)
- Infrastructure detection coverage and density
- GPS-vibration data synchronization quality
- Distance measurement completeness
"""

print("\n📊 Preparing dashboard display variables...")

# Summary values shown later in the dashboard's information panel.
if len(segments) > 0:
    # How many segments carry each label
    segment_label_counts = pd.Series(segment_labels).value_counts()

    # Segments whose label is anything other than "Normal Track"
    infrastructure_segments_count = sum(1 for label in segment_labels if label != 'Normal Track')

    # DISTANCE STATISTICS: computed over every row of df_gps, not only the rows
    # labeled as infrastructure.
    print(f"   🔍 Calculating distance statistics from all {len(df_gps)} GPS points...")

    # Drop infinite distances before computing statistics. They are mapped to
    # NaN (not None): depending on the pandas version, passing None as the
    # replacement is either treated as a pad-fill request or produces an
    # object-dtype column, and neither removes the infinities cleanly.
    all_distances = df_gps['InfraDistance_m'].replace([np.inf, -np.inf], np.nan).dropna()

    if not all_distances.empty:
        avg_dist = all_distances.mean()            # Mean of the finite distances
        max_dist = all_distances.max()             # Largest finite distance
        min_dist = all_distances.min()             # Smallest finite distance
        valid_distance_count = len(all_distances)  # Number of finite, non-missing distances
    else:
        # Nothing left after removing missing and infinite values
        avg_dist = None
        max_dist = None
        min_dist = None
        valid_distance_count = 0

    # GPS rows whose 'InfraLabel' is not 'Normal Track'. This is a label-based
    # count and is independent of the distance statistics above.
    infrastructure_points = df_gps[df_gps['InfraLabel'] != 'Normal Track'].copy()

else:
    # No segments: provide empty or zero placeholders so later code can still
    # format its summary lines.
    segment_label_counts = pd.Series(dtype=int)
    infrastructure_segments_count = 0
    infrastructure_points = pd.DataFrame()
    avg_dist = None
    max_dist = None
    min_dist = None
    valid_distance_count = 0

# ====================
# CALCULATE GPS-VIBRATION SYNCHRONIZATION OVERLAP
# ====================
"""
Determine how much temporal overlap exists between GPS and vibration recordings.
This is critical for ensuring the analysis covers periods where both data types are available.
"""

# Only compute the overlap when no earlier step has already defined it
if 'overlap_duration' not in locals():
    # Each stream's duration is taken as its largest timestamp
    gps_duration = df_gps['timestamp'].max()
    if df_vibration.empty:
        vib_duration = 0
    else:
        vib_duration = df_vibration['timestamp'].max()

    # The shorter stream bounds the overlap
    # (this treats both streams as starting at roughly the same time)
    overlap_duration = min(gps_duration, vib_duration)

# Report the values that the dashboard will display
for summary_line in (
    "📊 Dashboard statistics prepared:",
    f"   • Total vibration segments: {len(segments)}",
    f"   • Infrastructure segments detected: {infrastructure_segments_count}",
    f"   • GPS points labeled as infrastructure: {len(infrastructure_points)}",
    f"   • GPS points with valid distance measurements: {valid_distance_count}/{len(df_gps)}",
    f"   • GPS-Vibration temporal overlap: {overlap_duration:.1f} seconds",
):
    print(summary_line)

# ====================
# DASH WEB APPLICATION SETUP
# ====================
"""
Create the interactive web dashboard using Plotly Dash.
The dashboard provides:
1. Interactive GPS map showing the railway route with infrastructure points
2. Vibration plot that updates when users click on map points
3. Implementation details and analysis statistics
"""

print("\n🚀 Setting up interactive web dashboard...")

# Create the Dash application and give the browser tab a route-specific title
app = dash.Dash(__name__)
app.title = f"Railway Vibration Analysis - {route_name}"

# ====================
# DASHBOARD LAYOUT DEFINITION
# ====================
# The page is assembled from three parts, top to bottom:
#   1. a heading with the route name,
#   2. the GPS map and the vibration plot side by side,
#   3. an information panel made of summary paragraphs.

# --- Text fragments whose wording depends on which data is available ---
has_segments = len(segments) > 0

threshold_summary = ", ".join(
    f"{infrastructure_type}: {threshold}m"
    for infrastructure_type, threshold in TYPE_SPECIFIC_THRESHOLDS.items()
)

if has_segments:
    segment_share_text = f"({infrastructure_segments_count / len(segments) * 100:.1f}% of total)"
    distribution_text = ", ".join(
        f"{label}: {count}" for label, count in segment_label_counts.items()
    )
else:
    segment_share_text = "No segments processed"
    distribution_text = "No segments available"

if avg_dist is not None:
    proximity_text = (
        f"Distance to nearest infrastructure ranges from {min_dist:.1f}m to {max_dist:.1f}m "
        f"(average: {avg_dist:.1f}m) across {valid_distance_count} GPS measurements"
    )
else:
    proximity_text = "No valid distance measurements available"

latitudes = df_gps['Latitude']
longitudes = df_gps['Longitude']
coverage_text = (
    f"📍 Geographic Coverage: Latitude {latitudes.min():.4f}° to {latitudes.max():.4f}°, "
    f"Longitude {longitudes.min():.4f}° to {longitudes.max():.4f}°"
)

# --- Part 1: heading ---
page_header = html.H1(
    f"Railway Vibration Analysis - {route_name}",
    style={'text-align': 'center', 'margin-bottom': '20px', 'color': 'darkblue'},
)

# --- Part 2: map (left) and vibration plot (right), each just under half width ---
half_width_panel = {'width': '48%', 'display': 'inline-block', 'vertical-align': 'top'}

interactive_row = html.Div([
    # Left: GPS map; its click events drive the vibration plot
    html.Div([dcc.Graph(id="gps-map", figure=map_fig)], style=dict(half_width_panel)),
    # Right: vibration plot, initially the empty placeholder figure
    html.Div([dcc.Graph(id="vibration-plot", figure=vib_empty_fig)], style=dict(half_width_panel)),
])

# --- Part 3: information panel ---
info_paragraphs = [
    html.H3("🔧 ANALYSIS IMPLEMENTATION DETAILS:", style={'color': 'darkgreen'}),

    # Selected folder and number of candidate folders
    html.P(f"✅ Data Source: Selected '{selected_folder}' from {len(valid_folders)} available measurement folders"),

    # Threshold per infrastructure type
    html.P("✅ Adaptive Threshold Detection: " + threshold_summary),

    # Segment-level counts
    html.P(f"📈 Infrastructure Segment Detection: {infrastructure_segments_count} segments " + segment_share_text),

    # Distance statistics over all GPS rows
    html.P("📐 Proximity Analysis: " + proximity_text),

    # GPS rows labeled as infrastructure
    html.P(f"🏗️ GPS Point Classification: {len(infrastructure_points)} points identified as near infrastructure"),

    # Processing and synchronization notes
    html.P("✅ Data Integration: GPS-vibration segment mapping with consistent labeling applied"),
    html.P(f"✅ Segment Processing: {len(segments)} vibration segments created with categorical labels"),
    html.P(f"✅ Temporal Synchronization: {overlap_duration:.1f} seconds of coordinated GPS-vibration data"),
    html.P(f"✅ Infrastructure Database: {len(df_infra)} reference points loaded from infrastructure catalog"),
    html.P("✅ Interactive Documentation: HTML files saved with zoom, pan, hover features, and complete analysis metadata"),

    # Count per segment label
    html.P("📊 Infrastructure Distribution: " + distribution_text),

    # Latitude/longitude extent of the GPS track
    html.P(coverage_text),

    # How to use the page
    html.P("🎯 Interactive Usage: Click any point on the GPS map to display corresponding vibration data",
           style={'color': 'blue', 'font-weight': 'bold'}),
]

info_panel = html.Div(info_paragraphs, style={
    'text-align': 'center',
    'margin-top': '20px',
    'background-color': '#e8f5e8',  # Light green background for info panel
    'padding': '15px',
    'border-radius': '10px'
})

app.layout = html.Div([page_header, interactive_row, info_panel])

# ====================
# INTERACTIVE MAP CLICK HANDLER
# ====================
"""
This callback function enables the core interactivity of the dashboard.
When users click on any point in the GPS map, this function:
1. Identifies what type of point was clicked (GPS track point or infrastructure marker)
2. Maps the clicked location to the corresponding vibration data segment
3. Generates and displays a detailed vibration plot for that location

The mapping process handles timing synchronization between GPS and vibration data,
ensuring users see vibration data from the correct time and location.
"""

@app.callback(
    Output('vibration-plot', 'figure'),
    Input('gps-map', 'clickData')
)
def update_vibration_from_map(clickData):
    """
    Update the vibration plot when a user clicks on the GPS map.

    Two kinds of clickable points are told apart by their customdata:
    1. Infrastructure markers: exactly three entries, [latitude, longitude,
       infrastructure type], where the first two are numeric.
    2. GPS track points: at least two entries, [GPS row position,
       infrastructure label, ...].

    An infrastructure marker is first resolved to the nearest GPS row. The GPS
    row's timestamp is then matched to the vibration segment whose start time
    is closest, and columns 0 and 1 of that segment are plotted.

    Side effect: every successfully resolved click also writes the figure to
    an HTML file (relative path built from route name, label, GPS row position
    and segment index). Export failures are printed, not raised.

    Args:
        clickData (dict | None): Plotly click event. Only
            clickData['points'][0] and its 'customdata' entry are read.

    Returns:
        go.Figure: The vibration plot for the matched segment, or the
        module-level vib_empty_fig when the click cannot be resolved
        (no click, missing/unrecognised customdata, index out of range,
        parsing error, or no segments available).
    """

    def _as_float(value):
        """Return value converted to float, or None when it is not numeric."""
        try:
            return float(value)
        except (TypeError, ValueError):
            return None

    def format_distance_for_display(dist_value):
        """
        Convert a distance value to a display string.

        Args:
            dist_value: Distance in meters (may be None, NaN, inf or non-numeric)

        Returns:
            str: "<value>m" with one decimal, or "N/A" for unusable values
        """
        dist_float = _as_float(dist_value)
        if dist_float is None or not np.isfinite(dist_float):
            return "N/A"
        return f"{dist_float:.1f}m"

    # Nothing clicked yet, or the event carries no points
    if not clickData or 'points' not in clickData:
        return vib_empty_fig

    # ====================
    # PARSE CLICK DATA AND IDENTIFY POINT TYPE
    # ====================

    try:
        point_data = clickData['points'][0]

        # customdata is what identifies the clicked point; without it the
        # click cannot be mapped to a GPS row.
        if 'customdata' not in point_data or not point_data['customdata']:
            print(f"   ⚠️ No custom data found for clicked point")
            return vib_empty_fig

        customdata = point_data['customdata']

        # The marker test must run BEFORE the generic "two or more entries"
        # test, otherwise three-entry marker data is consumed by the GPS
        # branch. Requiring a numeric second entry keeps GPS track data with
        # extra fields (whose second entry is a text label) out of this branch.
        is_infrastructure_marker = (
            len(customdata) == 3
            and _as_float(customdata[0]) is not None
            and _as_float(customdata[1]) is not None
        )

        if is_infrastructure_marker:
            # INFRASTRUCTURE MARKER CLICKED
            # customdata[0]: latitude, customdata[1]: longitude,
            # customdata[2]: infrastructure type
            lat_click = float(customdata[0])
            lon_click = float(customdata[1])
            clicked_label = customdata[2]

            print(f"🏗️ Infrastructure marker clicked: {clicked_label} at ({lat_click:.6f}, {lon_click:.6f})")

            # Find the GPS row closest to the marker (plain Euclidean distance
            # in degrees). The POSITION of the minimum is used, not its index
            # label, because the row is fetched with .iloc below.
            coordinate_distances = np.sqrt((df_gps['Latitude'] - lat_click)**2 +
                                           (df_gps['Longitude'] - lon_click)**2)
            point_index = int(np.nanargmin(coordinate_distances.to_numpy()))
            gps_point = df_gps.iloc[point_index]
            gps_time = gps_point["timestamp"]
            actual_gps_label = gps_point["InfraLabel"]

            print(f"   📍 Mapped to nearest GPS point: Index {point_index}, Label='{actual_gps_label}'")
            # 111000 converts degrees to an approximate distance in meters
            print(f"   📏 Distance from infrastructure marker: {coordinate_distances.iloc[point_index]*111000:.1f}m")

        elif len(customdata) >= 2:
            # GPS TRACK POINT CLICKED
            # customdata[0]: GPS row position, customdata[1]: infrastructure label
            point_index = int(customdata[0])
            clicked_label = customdata[1]
            print(f"\n🎯 GPS track point clicked: Index {point_index}, Label: '{clicked_label}'")

            # Reject positions outside the table; a negative value would
            # otherwise silently select a row counted from the end.
            if not 0 <= point_index < len(df_gps):
                print(f"   ⚠️ ERROR: GPS index {point_index} out of range (max: {len(df_gps)-1})")
                return vib_empty_fig

            gps_point = df_gps.iloc[point_index]
            gps_time = gps_point["timestamp"]
            actual_gps_label = gps_point["InfraLabel"]

            print(f"   📍 GPS Point Details: Time={gps_time:.1f}s, Label='{actual_gps_label}'")

        else:
            # Neither of the two known customdata layouts
            print(f"   ⚠️ Unrecognized custom data format: {customdata}")
            return vib_empty_fig

    except Exception as e:
        # UI boundary: a malformed click must not break the dashboard
        print(f"⚠️ Error parsing click data: {e}")
        return vib_empty_fig

    # ====================
    # MAP GPS POINT TO VIBRATION SEGMENT
    # ====================
    # GPS rows are far denser in time than vibration segments (the file sets
    # GPS_SAMPLING_RATE to 20 Hz, i.e. one row every 0.05 s), so the clicked
    # row is matched to the segment whose start time is closest.

    if len(segments) == 0:
        print("   ⚠️ No vibration segments available for display")
        return vib_empty_fig

    time_differences = np.abs(np.array(segment_start_times) - gps_time)
    # argmin already lies in [0, len - 1]; int() gives a plain Python index
    segment_index = int(np.argmin(time_differences))

    selected_segment = segments[segment_index]              # Vibration samples of the segment
    segment_start_time = segment_start_times[segment_index] # Start time of the segment
    segment_actual_label = segment_labels[segment_index]    # Label assigned to the segment

    # ====================
    # VALIDATE GPS-VIBRATION SYNCHRONIZATION
    # ====================

    time_mismatch = abs(gps_time - segment_start_time)

    print(f"🔍 GPS-to-Vibration Mapping:")
    print(f"   📍 GPS Point {point_index}: Time={gps_time:.1f}s, Label='{actual_gps_label}'")
    print(f"   📊 Vibration Segment {segment_index}: Time={segment_start_time:.1f}s, Label='{segment_actual_label}'")
    print(f"   ⏱️ Time synchronization difference: {time_mismatch:.1f}s")

    # Warn when the clicked row is more than 3 s from the segment start
    if time_mismatch > 3:
        print("   ⚠️ WARNING: Large time difference detected - GPS and vibration may be poorly synchronized")
        print("   💡 Consider reviewing the timestamp alignment in your data preprocessing")

    # The segment's label is the one shown in the plot title
    display_label = segment_actual_label

    # ====================
    # HANDLE LABEL DISCREPANCIES
    # ====================
    # The GPS row and the segment carry separate labels, so they can differ;
    # the difference is reported and the segment label is used.

    if actual_gps_label != segment_actual_label:
        print(f"   ⚠️ Label discrepancy detected:")
        print(f"      🎯 GPS point label: '{actual_gps_label}' (instantaneous)")
        print(f"      📊 Segment label: '{segment_actual_label}' (10-second window average)")
        print(f"      ✅ Using segment label '{segment_actual_label}' as authoritative for vibration analysis")
    else:
        print(f"   ✅ Label consistency: GPS and segment both labeled as '{segment_actual_label}'")

    # ====================
    # CREATE VIBRATION TIME SERIES PLOT
    # ====================

    # Time axis in seconds, starting at 0, one step of dt_vibration per sample
    time_axis = np.arange(len(selected_segment)) * dt_vibration

    vib_fig = go.Figure()

    # Column 0 of the segment: left rail trace
    vib_fig.add_trace(go.Scatter(
        x=time_axis,
        y=selected_segment[:, 0],
        mode='lines',
        name='Left Rail (Accelerometer Ch1)',
        line=dict(color='blue', width=1.5),
        hovertemplate='Time: %{x:.2f}s<br>Acceleration: %{y:.3f} m/s²<extra></extra>'
    ))

    # Column 1 of the segment: right rail trace
    vib_fig.add_trace(go.Scatter(
        x=time_axis,
        y=selected_segment[:, 1],
        mode='lines',
        name='Right Rail (Accelerometer Ch2)',
        line=dict(color='red', width=1.5),
        hovertemplate='Time: %{x:.2f}s<br>Acceleration: %{y:.3f} m/s²<extra></extra>'
    ))

    # ====================
    # EXTRACT ADDITIONAL CONTEXT FOR PLOT TITLE
    # ====================

    # Location, speed and distance come from the GPS row, not from the segment
    lat, lon, speed = gps_point["Latitude"], gps_point["Longitude"], gps_point["Speed"]
    distance_to_infra = gps_point.get("InfraDistance_m", None)

    # Title colour per label; unknown labels fall back to black
    title_colors = {
        'Bridge': 'red',
        'RailJoint': 'blue',
        'Turnout': 'green',
        'Normal Track': 'black'
    }
    title_color = title_colors.get(display_label, 'black')

    distance_str = format_distance_for_display(distance_to_infra)

    # ====================
    # CONFIGURE PLOT LAYOUT AND STYLING
    # ====================

    vib_fig.update_layout(
        # Multi-line title: label, GPS-to-segment mapping, location context
        title=f"<span style='color:{title_color}'><b>{display_label}</b></span><br>" +
              f"<sub>GPS Point {point_index} (t={gps_time:.1f}s) → Vibration Segment {segment_index} (t={segment_start_time:.1f}s)<br>" +
              f"📍 Location: {lat:.6f}°, {lon:.6f}° | 🚄 Speed: {speed:.1f} km/h | 📏 Distance to infrastructure: {distance_str}</sub>",

        # Axis labels
        xaxis_title="Time within segment (seconds)",
        yaxis_title="Acceleration (m/s²)",

        # Plot height in pixels
        height=600,

        # Legend in the top-left corner of the plot area
        legend=dict(x=0.02, y=0.98),

        # One hover box for all traces at the same x position
        hovermode='x unified'
    )

    # ====================
    # SAVE INTERACTIVE DOCUMENTATION FILE
    # ====================
    # The figure is also exported as a standalone HTML file with Plotly.js
    # embedded, so it can be opened without the dashboard running.

    # Replace characters that are awkward or invalid in file names
    safe_route_name = route_name.replace(' ', '_').replace('/', '-').replace('–', '-')
    safe_label = display_label.replace(' ', '_').replace('/', '-')

    # Format: SL_RouteName_InfrastructureType_GPS_PointIndex_Seg_SegmentIndex.html
    filename = f"SL_{safe_route_name}_{safe_label}_GPS_{point_index}_Seg_{segment_index}.html"

    try:
        vib_fig.write_html(
            filename,
            include_plotlyjs=True,  # Embed Plotly.js for offline viewing
            config={
                'displayModeBar': True,  # Show toolbar for zoom, pan, save
                'displaylogo': False,    # Remove Plotly logo
                'modeBarButtonsToRemove': ['lasso2d', 'select2d']  # Remove unnecessary tools
            }
        )

        print(f"💾 Saved Interactive HTML documentation to: {filename}")

    except Exception as e:
        # Export is best-effort: the plot is still returned to the dashboard
        print(f"⚠️ HTML export failed: {e}")
        print(f"   📝 Note: Vibration plot is still displayed in dashboard")
        print(f"   🔧 Check file permissions and disk space if error persists")

    return vib_fig

# ====================
# DASH APPLICATION STARTUP AND SERVER CONFIGURATION
# ====================
"""
This section handles the startup of the Dash web server and provides
final implementation details to users before launching the interactive dashboard.
"""

if __name__ == "__main__":
    # Script entry point: print a launch banner and an implementation summary,
    # then start the Dash server and report how the session ended.
    #
    # Relies on module-level names created earlier in the file: app,
    # selected_folder, valid_folders, TYPE_SPECIFIC_THRESHOLDS, segments,
    # overlap_duration, df_gps, route_name, segment_labels, output_filename
    # (and, optionally, infrastructure_points).

    # Local import (entry point only) so the port-conflict check below can
    # compare error codes instead of matching platform-specific message text.
    import errno

    print("\n" + "="*73)
    print("🌐 LAUNCHING INTERACTIVE RAILWAY VIBRATION ANALYSIS DASHBOARD")
    print("="*73)

    print("\n🚀 Starting Dash web server...")
    print("📱 Dashboard will be available at: http://localhost:8060")
    print("💡 Click any point on the GPS map to view corresponding vibration data")

    # ====================
    # DISPLAY FINAL IMPLEMENTATION SUMMARY
    # ====================
    # Summarise the analysis configuration for verification and
    # documentation purposes before the server takes over the process.

    print(f"\n🔧 ANALYSIS IMPLEMENTATION SUMMARY:")
    print(f"   ✅ Data Source: Selected '{selected_folder}' from {len(valid_folders)} available measurement folders")
    print(f"   ✅ Infrastructure Detection: Adaptive thresholds per type - {TYPE_SPECIFIC_THRESHOLDS}")
    print(f"   ✅ Data Synchronization: GPS-vibration segment mapping with temporal alignment verification")
    print(f"   ✅ Label Consistency: Infrastructure categories applied consistently across {len(segments)} segments")

    # Safely display infrastructure point count (the variable may not exist).
    # At module level locals() is the module namespace, so this checks globals.
    infrastructure_count = len(infrastructure_points) if 'infrastructure_points' in locals() else 0
    print(f"   ✅ Infrastructure Detection: {infrastructure_count} GPS points identified as near infrastructure")

    print(f"   ✅ Segment Processing: {len(segments)} vibration segments created with proper GPS correspondence")
    print(f"   ✅ Temporal Coverage: {overlap_duration:.1f} seconds of synchronized GPS-vibration data")
    print(f"   ✅ Documentation: Interactive HTML exports with embedded functionality (superior to static images)")
    print(f"   ✅ External Data: Infrastructure database loaded from Code 1 output (valid_folders.txt & infrastructure_points.csv)")

    # ====================
    # START THE WEB SERVER
    # ====================
    # Launch the Dash application server with error handling.

    # Tracks whether the server ended normally (clean return or Ctrl+C) so the
    # session summary does not claim success after a startup/runtime failure.
    server_exited_cleanly = False

    try:
        print(f"\n🌐 Server starting on http://127.0.0.1:8060...")
        print(f"📊 Dashboard ready with {len(segments)} vibration segments and {len(df_gps)} GPS points")
        print(f"🎯 Click any map point to explore vibration data interactively")

        # Start the server (debug=False for production use).
        # This call blocks for the whole lifetime of the server.
        app.run(debug=False, port=8060, host='127.0.0.1')

        # Only reached once app.run() has returned, i.e. the server has shut
        # down - so report a stop, not a "running" state.
        server_exited_cleanly = True
        print("✅ Dashboard server stopped")

    except KeyboardInterrupt:
        # A user-initiated stop is a normal way to end the session.
        server_exited_cleanly = True
        print("\n👋 Dashboard server stopped by user (Ctrl+C)")
        print("📊 Analysis complete - all data and images have been saved")

    except OSError as e:
        # Prefer the error code (portable across platforms/locales); keep the
        # original message match as a fallback for wrapped errors.
        if e.errno == errno.EADDRINUSE or "Address already in use" in str(e):
            print(f"\n❌ Port 8060 is already in use")
            print(f"💡 Try one of these solutions:")
            print(f"   • Close any other Dash applications running on port 8060")
            print(f"   • Change the port number in the app.run() call (e.g., port=8061)")
            print(f"   • Kill the process using port 8060: lsof -ti:8060 | xargs kill")
        else:
            print(f"\n❌ Server error: {e}")

    except Exception as e:
        # Top-level boundary: report anything unexpected instead of crashing
        # without the session summary.
        print(f"\n❌ Unexpected server error: {e}")
        print(f"💡 Check your network configuration and try restarting")

    finally:
        print(f"\n📋 Session Summary:")
        print(f"   • Route analyzed: {route_name}")
        print(f"   • Vibration segments processed: {len(segments)}")
        print(f"   • GPS points analyzed: {len(df_gps)}")
        print(f"   • Infrastructure types detected: {len(set(segment_labels))}")
        print(f"   • CSV output saved: {output_filename}")
        if server_exited_cleanly:
            print(f"📊 Analysis pipeline completed successfully!")
        else:
            # Do not report success when the dashboard failed to start or crashed.
            print(f"⚠️ Dashboard server ended with an error - see the message above")

🚀 RAILWAY VIBRATION ANALYSIS - WITH GUI
Execution time: 2025-08-26 17:25:42

📂 Loaded 5 valid folders from Code 1
📂 First 5 folders: ['2024-12-10 10-00-00 (1)', '2024-12-10 12-00-00 (1)', '2024-12-10 16-00-00 (1)', '2024-12-12 10-00-00 (1)', '2024-12-12 12-00-00 (1)']

🖥️ Opening folder selection window...
   💡 A GUI window will appear - please select a folder to analyze
✅ User selected folder: 2024-12-12 12-00-00 (1)

🔍 Validating selected folder: 2024-12-12 12-00-00 (1)

✅ Selected folder validated: 2024-12-12 12-00-00 (1)
  • latitude: GPS.latitude.csv (0.4 MB)
  • longitude: GPS.longitude.csv (0.4 MB)
  • vibration1: CH1_ACCEL1Z1.csv (655.7 MB)
  • vibration2: CH2_ACCEL1Z2.csv (621.9 MB)
  • speed: GPS.speed.csv (0.6 MB)
  • satellites: GPS.satellites.csv (0.1 MB)

🌍 Loading GPS data from 2024-12-12 12-00-00 (1)...
📡 GPS satellites data loaded for quality assessment
📡 GPS Quality Assessment:
   • Satellite count range: 0 to 7
   • Average satellites: 5.3
   • Quality distribution:


✅ Server running successfully!

📋 Session Summary:
   • Route analyzed: Borlänge-Mora Route (60.71°N 14.54°E)
   • Vibration segments processed: 179
   • GPS points analyzed: 35669
   • Infrastructure types detected: 4
   • CSV output saved: SL_labeled_segments_Borlänge-Mora_Route_(60.71°N_14.54°E)_2024-12-12_12-00-00_1.csv
📊 Analysis pipeline completed successfully!



🎯 GPS track point clicked: Index 0, Label: 'Normal Track'
   📍 GPS Point Details: Time=0.0s, Label='Normal Track'
🔍 GPS-to-Vibration Mapping:
   📍 GPS Point 0: Time=0.0s, Label='Normal Track'
   📊 Vibration Segment 0: Time=0.0s, Label='Normal Track'
   ⏱️ Time synchronization difference: 0.0s
   ✅ Label consistency: GPS and segment both labeled as 'Normal Track'
💾 Saved Interactive HTML documentation to: SL_Borlänge-Mora_Route_(60.71°N_14.54°E)_Normal_Track_GPS_0_Seg_0.html

🎯 GPS track point clicked: Index 35668, Label: 'Normal Track'
   📍 GPS Point Details: Time=1799.9s, Label='Normal Track'
🔍 GPS-to-Vibration Mapping:
   📍 GPS Point 35668: Time=1799.9s, Label='Normal Track'
   📊 Vibration Segment 178: Time=1780.0s, Label='Normal Track'
   ⏱️ Time synchronization difference: 19.9s
   💡 Consider reviewing the timestamp alignment in your data preprocessing
   ✅ Label consistency: GPS and segment both labeled as 'Normal Track'
💾 Saved Interactive HTML documentation to: SL_Borlänge-Mora