From 0d5a2cb42f9ade227afb3b31678944aa40aac90c Mon Sep 17 00:00:00 2001 From: Alex Metelli Date: Thu, 28 Aug 2025 05:05:35 +0800 Subject: [PATCH] Add comprehensive crash logging and error handling MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Adds panic hook to log crashes with location, message, and backtrace instructions - Enhances service error logging with detailed failure context - Improves thread completion monitoring to identify which service failed - Adds error chain formatting to show root causes - Provides clear "INDEXER SHUTDOWN" messages explaining crash reasons Fixes silent ECS indexer crashes by ensuring all failures are logged with actionable details. 🤖 Generated with [Claude Code](https://claude.ai/code) Co-Authored-By: Claude --- src/indexer/lib.rs | 117 ++++++++++++++++++++++++++++++++++++++------ src/indexer/main.rs | 77 +++++++++++++++++++++++++++-- 2 files changed, 173 insertions(+), 21 deletions(-) diff --git a/src/indexer/lib.rs b/src/indexer/lib.rs index 98857c5..6074d83 100644 --- a/src/indexer/lib.rs +++ b/src/indexer/lib.rs @@ -267,11 +267,23 @@ async fn setup_database_and_rpc( fn spawn_router_service(should_terminate: Arc) -> tokio::task::JoinHandle> { tokio::spawn(async move { - if let Err(e) = router::initialize_router(should_terminate.clone()).await { - error!("[router] unexpected error {}", e); + info!("[router] Starting router service"); + match router::initialize_router(should_terminate.clone()).await { + Ok(()) => { + info!("[router] Router service completed normally"); + Ok(()) + } + Err(e) => { + error!( + "[router] CRITICAL: Router service failed with error: {:?}", + e + ); + error!("[router] This router failure may have caused the indexer to become unreachable"); + Err(BlockchainError::internal(format!( + "Router service failed: {e}" + ))) + } } - info!("[router] shutting down"); - Ok(()) }) } @@ -292,11 +304,23 @@ fn spawn_quick_indexer_service( let quick_indexer = QuickIndexer::new(quick_config, db, rpc_client, should_terminate); Ok(tokio::spawn(async move { - info!("Starting quick indexer"); - if let Err(e) = quick_indexer.index().await { - error!("[quick_index] unexpected error {}", e); + info!("[quick_index] Starting quick indexer service"); + match quick_indexer.index().await { + Ok(()) => { + info!("[quick_index] Quick indexer completed normally"); + Ok(()) + } + Err(e) => { + error!( + "[quick_index] CRITICAL: Quick indexer failed with error: {:?}", + e + ); + error!("[quick_index] Quick indexer handles real-time block indexing - this failure stops new block processing"); + Err(BlockchainError::internal(format!( + "Quick indexer failed: {e}" + ))) + } } - Ok(()) })) } @@ -319,11 +343,23 @@ fn spawn_batch_indexer_service( let batch_indexer = BatchIndexer::new(batch_config, db, rpc_client, should_terminate); Ok(tokio::spawn(async move { - info!("Starting batch indexer"); - if let Err(e) = batch_indexer.index().await { - error!("[batch_index] unexpected error {}", e); + info!("[batch_index] Starting batch indexer service"); + match batch_indexer.index().await { + Ok(()) => { + info!("[batch_index] Batch indexer completed normally"); + Ok(()) + } + Err(e) => { + error!( + "[batch_index] CRITICAL: Batch indexer failed with error: {:?}", + e + ); + error!("[batch_index] Batch indexer handles historical block indexing - this failure stops backfilling"); + Err(BlockchainError::internal(format!( + "Batch indexer failed: {e}" + ))) + } } - Ok(()) })) } @@ -389,20 +425,69 @@ async fn initialize_index_metadata( Err(BlockchainError::internal("Failed to get indexer metadata")) } +#[allow(clippy::cognitive_complexity)] async fn wait_for_thread_completion(handles: Vec>>) -> Result<()> { - for handle in handles { + let mut has_errors = false; + + for (index, handle) in handles.into_iter().enumerate() { + let service_name = match index { + 0 => "router", + 1 => "quick_indexer", + 2 => "batch_indexer", + _ => "unknown_service", + }; + match handle.await { Ok(Ok(())) => { - info!("Thread completed successfully"); + info!("[{}] Thread completed successfully", service_name); } Ok(Err(e)) => { - error!("Thread completed with an error: {:?}", e); + error!( + "[{}] CRITICAL: Thread completed with an error: {:?}", + service_name, e + ); + error!( + "[{}] Error details: {}", + service_name, + format_error_details(&e) + ); + has_errors = true; } Err(e) => { - error!("Thread panicked: {:?}", e); + error!("[{}] CRITICAL: Thread panicked: {:?}", service_name, e); + if e.is_panic() { + error!("[{}] This was a panic - check for unwrap(), expect(), or other panic sources", service_name); + } + if e.is_cancelled() { + error!("[{}] Task was cancelled", service_name); + } + has_errors = true; } } } + if has_errors { + error!( + "INDEXER SHUTDOWN: One or more services failed - this explains why the indexer stopped" + ); + return Err(BlockchainError::internal( + "One or more indexing services failed", + )); + } + + info!("All indexing services completed successfully"); Ok(()) } + +fn format_error_details(error: &BlockchainError) -> String { + let mut details = Vec::new(); + details.push(format!("Error: {error}")); + + let mut current_error: &dyn std::error::Error = error; + while let Some(source) = current_error.source() { + details.push(format!("Caused by: {source}")); + current_error = source; + } + + details.join("\n ") +} diff --git a/src/indexer/main.rs b/src/indexer/main.rs index fa130ad..c4cc808 100644 --- a/src/indexer/main.rs +++ b/src/indexer/main.rs @@ -1,13 +1,13 @@ use fossil_headers_db::errors::{BlockchainError, Result}; use fossil_headers_db::indexer::lib::{start_indexing_services, IndexingConfig}; use std::{ - env, + env, panic, sync::{ atomic::{AtomicBool, Ordering}, Arc, }, }; -use tracing::info; +use tracing::{error, info}; use tracing_subscriber::fmt; #[tokio::main] @@ -48,6 +48,9 @@ pub async fn main() -> Result<()> { .compact() .init(); + // Setup panic hook to log panics before the application crashes + setup_panic_hook(); + let should_terminate = Arc::new(AtomicBool::new(false)); setup_ctrlc_handler(Arc::clone(&should_terminate))?; @@ -63,9 +66,19 @@ pub async fn main() -> Result<()> { let indexing_config = indexing_config_builder.build()?; - start_indexing_services(indexing_config, should_terminate).await?; - - Ok(()) + // Main indexing operation with comprehensive error logging + match start_indexing_services(indexing_config, should_terminate).await { + Ok(()) => { + info!("Indexing services completed successfully"); + Ok(()) + } + Err(e) => { + error!("CRITICAL: Indexing services failed with error: {:?}", e); + error!("Error chain: {}", format_error_chain(&e)); + error!("This is a fatal error that caused the indexer to stop"); + Err(e) + } + } } fn setup_ctrlc_handler(should_terminate: Arc) -> Result<()> { @@ -76,3 +89,57 @@ fn setup_ctrlc_handler(should_terminate: Arc) -> Result<()> { }) .map_err(|e| BlockchainError::internal(format!("Failed to set Ctrl+C handler: {e}"))) } + +#[allow(clippy::cognitive_complexity)] +fn setup_panic_hook() { + panic::set_hook(Box::new(|panic_info| { + let location = panic_info.location().map_or_else( + || "unknown location".to_string(), + |loc| format!("{}:{}:{}", loc.file(), loc.line(), loc.column()), + ); + + let message = panic_info.payload().downcast_ref::<&str>().map_or_else( + || { + panic_info.payload().downcast_ref::().map_or_else( + || "unknown panic message".to_string(), + std::clone::Clone::clone, + ) + }, + |s| (*s).to_string(), + ); + + error!("PANIC OCCURRED - INDEXER CRASHING!"); + error!("Panic location: {}", location); + error!("Panic message: {}", message); + error!("This indicates a critical bug in the indexer"); + + // Try to log the backtrace if available + if let Ok(backtrace) = std::env::var("RUST_BACKTRACE") { + if !backtrace.is_empty() && backtrace != "0" { + error!( + "Backtrace logging is enabled (RUST_BACKTRACE={})", + backtrace + ); + } + } else { + error!("Enable RUST_BACKTRACE=1 for stack traces"); + } + + // Flush logs before panic continues + std::io::Write::flush(&mut std::io::stderr()).ok(); + })); +} + +fn format_error_chain(error: &BlockchainError) -> String { + let mut chain = Vec::new(); + let mut current_error: &dyn std::error::Error = error; + + chain.push(current_error.to_string()); + + while let Some(source) = current_error.source() { + chain.push(source.to_string()); + current_error = source; + } + + chain.join(" -> ") +}