From 065de0d6ac4499492291b27c104d54dfd0f04ac4 Mon Sep 17 00:00:00 2001
From: Kishore S N
Date: Fri, 5 Apr 2024 15:55:06 +0530
Subject: [PATCH] os/arch/arm/src/armv7-a, os/kernel: Fix crash issues in SMP

Random crashes (data abort, prefetch abort) were noticed when running
various commands such as ble_rmc and kernel_tc. Our analysis showed
that the crashes happen because the stack pointer gets updated with
wrong values after multiple context switches.

We found similar issues reported and fixed in nuttx, so we pick the
below commits from nuttx to fix our issues.

======================================================================
commit e2df52390ace192e8fe632ecb83c2fa44373259a

SMP: fix crash when switching to a new task which is still running

Situation: Assume we have 2 CPUs, and task0 is kept busy running.

CPU0                                 CPU1
task0 -> task1                       task2 -> task0

1. remove task0 from runninglist
2. take task1 as new tcb
3. add task0 to blocklist
4. clear spinlock
                                     4.1 remove task2 from runninglist
                                     4.2 take task0 as new tcb
                                     4.3 add task2 to blocklist
                                     4.4 use svc ISR to switch to task0
                                     4.5 crash
5. use svc ISR to switch to task1

Fix: Move the clearing of the spinlock to the end of the svc ISR.

======================================================================
commit 2241969e5ada2eaafee38ab6de6457aa15c8402c

SMP: fix crash when switching to a new task which is still running

cpu0 thread0:                        cpu1:
sched_yield()
nxsched_set_priority()
nxsched_running_setpriority()
nxsched_reprioritize_rtr()
nxsched_add_readytorun()
up_cpu_pause()
                                     IRQ enter
                                     arm64_pause_handler()
                                     enter_critical_section() begin
                                     up_cpu_paused() picks thread0
                                     arm64_restorestate() sets thread0
                                       tcb->xcp.regs to CURRENT_REGS
up_switch_context()
  thread0 -> thread1
  arm64_syscall()
  case SYS_switch_context:
    change thread0 tcb->xcp.regs
    restore_critical_section()
                                     enter_critical_section() done
                                     leave_critical_section()
                                     IRQ leave with restore of
                                     CURRENT_REGS  ERROR !!!

Reason: As described above, cpu0 switches task thread0 -> thread1, but
its syscall() executes slowly; meanwhile cpu1 picks thread0 to run in
up_cpu_paused(). When cpu0's syscall then changes thread0's
tcb->xcp.regs, cpu1 leaves its IRQ with a corrupted context.

Resolve: Move arm64_restorestate() to after enter_critical_section()
is done.
======================================================================

Signed-off-by: Kishore S N
---
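Note for reviewers (this text sits between the '---' separator and the
diffstat, so git am drops it from the commit): the common idea behind
both picked commits is that a CPU must keep holding the IRQ lock (its
bit in g_cpu_irqset) across a context switch, and release it only at
the very end of exception return, immediately before the new task's
registers are restored. The sketch below is a simplified model of that
ordering only; the name my_exception_return() and the saved_regs
parameter are hypothetical and not part of this patch:

    /* Simplified model of the exception-return tail as fixed by this
     * patch in arm_doirq() and arm_syscall().
     */

    uint32_t *my_exception_return(uint32_t *saved_regs)
    {
      /* A context switch happened iff the dispatched handler changed
       * CURRENT_REGS to point at another task's saved context.
       */

      if (saved_regs != CURRENT_REGS)
        {
          /* Buggy order: the CPU's g_cpu_irqset bit was cleared earlier,
           * in sched_resume_scheduler(), so another CPU could pick up the
           * outgoing task and run it while this CPU was still on its stack.
           *
           * Fixed order: defer the release to this point, after all
           * critical-section bookkeeping is complete and just before the
           * new context is restored.
           */

          restore_critical_section();
          saved_regs = (uint32_t *)CURRENT_REGS;
        }

      return saved_regs;  /* The exception return restores this context */
    }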
 os/arch/arm/src/armv7-a/arm_cpupause.c  | 122 ++++++++++++++++--------
 os/arch/arm/src/armv7-a/arm_doirq.c     |  10 +-
 os/arch/arm/src/armv7-a/arm_syscall.c   |   6 +-
 os/include/tinyara/irq.h                |  21 ++++
 os/include/tinyara/spinlock.h           |   2 +
 os/kernel/irq/irq_csection.c            |  56 ++++++++++-
 os/kernel/sched/sched_resumescheduler.c |  40 --------
 7 files changed, 172 insertions(+), 85 deletions(-)

diff --git a/os/arch/arm/src/armv7-a/arm_cpupause.c b/os/arch/arm/src/armv7-a/arm_cpupause.c
index 12c2ecc126..f149decf01 100644
--- a/os/arch/arm/src/armv7-a/arm_cpupause.c
+++ b/os/arch/arm/src/armv7-a/arm_cpupause.c
@@ -75,6 +75,7 @@
 
 static volatile spinlock_t g_cpu_wait[CONFIG_SMP_NCPUS];
 static volatile spinlock_t g_cpu_paused[CONFIG_SMP_NCPUS];
+static volatile spinlock_t g_cpu_resumed[CONFIG_SMP_NCPUS];
 
 /****************************************************************************
  * Public Functions
@@ -97,29 +98,21 @@ bool up_cpu_pausereq(int cpu)
 
 bool up_cpu_pausereq(int cpu)
 {
-  return spin_islocked(&g_cpu_paused[cpu]);
+  return spin_is_locked(&g_cpu_paused[cpu]);
 }
 
 /****************************************************************************
- * Name: up_cpu_paused
+ * Name: up_cpu_paused_save
  *
  * Description:
  *   Handle a pause request from another CPU.  Normally, this logic is
  *   executed from interrupt handling logic within the architecture-specific
- *   However, it is sometimes necessary necessary to perform the pending
+ *   However, it is sometimes necessary to perform the pending
  *   pause operation in other contexts where the interrupt cannot be taken
  *   in order to avoid deadlocks.
  *
- * This function performs the following operations:
- *
- * 1. It saves the current task state at the head of the current assigned
- *    task list.
- * 2. It waits on a spinlock, then
- * 3. Returns from interrupt, restoring the state of the new task at the
- *    head of the ready to run list.
- *
  * Input Parameters:
- *   cpu - The index of the CPU to be paused
+ *   None
  *
  * Returned Value:
  *   On success, OK is returned. Otherwise, a negated errno value indicating
@@ -127,13 +120,13 @@ bool up_cpu_pausereq(int cpu)
  *
  ****************************************************************************/
 
-int up_cpu_paused(int cpu)
+int up_cpu_paused_save(void)
 {
   struct tcb_s *tcb = this_task();
 
   /* Update scheduler parameters */
 
-  //sched_suspend_scheduler(tcb);
+  sched_suspend_scheduler(tcb);
 
 #ifdef CONFIG_SCHED_INSTRUMENTATION
   /* Notify that we are paused */
@@ -147,23 +140,79 @@ int up_cpu_paused(int cpu)
 
   arm_savestate(tcb->xcp.regs);
 
+  return OK;
+}
+
+/****************************************************************************
+ * Name: up_cpu_paused
+ *
+ * Description:
+ *   Handle a pause request from another CPU.  Normally, this logic is
+ *   executed from interrupt handling logic within the architecture-specific
+ *   However, it is sometimes necessary to perform the pending
+ *   pause operation in other contexts where the interrupt cannot be taken
+ *   in order to avoid deadlocks.
+ *
+ * This function performs the following operations:
+ *
+ * 1. It saves the current task state at the head of the current assigned
+ *    task list.
+ * 2. It waits on a spinlock, then
+ * 3. Returns from interrupt, restoring the state of the new task at the
+ *    head of the ready to run list.
+ *
+ * Input Parameters:
+ *   cpu - The index of the CPU to be paused
+ *
+ * Returned Value:
+ *   On success, OK is returned. Otherwise, a negated errno value indicating
+ *   the nature of the failure is returned.
+ *
+ ****************************************************************************/
+
+int up_cpu_paused(int cpu)
+{
   /* Release the g_cpu_paused spinlock to synchronize with the
    * requesting CPU.
    */
 
   spin_unlock(&g_cpu_paused[cpu]);
 
+  /* Ensure the CPU has been resumed to avoid causing a deadlock */
+
+  spin_lock(&g_cpu_resumed[cpu]);
+
   /* Wait for the spinlock to be released.  The requesting CPU will release
    * the spinlock when the CPU is resumed.
    */
 
   spin_lock(&g_cpu_wait[cpu]);
 
-  /* This CPU has been resumed. Restore the exception context of the TCB at
-   * the (new) head of the assigned task list.
-   */
+  spin_unlock(&g_cpu_wait[cpu]);
+  spin_unlock(&g_cpu_resumed[cpu]);
 
-  tcb = this_task();
+  return OK;
+}
+
+/****************************************************************************
+ * Name: up_cpu_paused_restore
+ *
+ * Description:
+ *   Restore the state of the CPU after it was paused via up_cpu_pause(),
+ *   and resume normal tasking.
+ *
+ * Input Parameters:
+ *   None
+ *
+ * Returned Value:
+ *   On success, OK is returned. Otherwise, a negated errno value indicating
+ *   the nature of the failure is returned.
+ *
+ ****************************************************************************/
+
+int up_cpu_paused_restore(void)
+{
+  struct tcb_s *tcb = this_task();
 
 #ifdef CONFIG_SCHED_INSTRUMENTATION
   /* Notify that we have resumed */
@@ -181,7 +230,6 @@ int up_cpu_paused(int cpu)
 
   up_restoretask(tcb);
   arm_restorestate(tcb->xcp.regs);
-  spin_unlock(&g_cpu_wait[cpu]);
 
   return OK;
 }
@@ -259,8 +307,6 @@ int arm_pause_handler(int irq, void *context, void *arg)
 
 int up_cpu_pause(int cpu)
 {
-  int ret;
-
   DEBUGASSERT(cpu >= 0 && cpu < CONFIG_SMP_NCPUS && cpu != this_cpu());
 
 #ifdef CONFIG_SCHED_INSTRUMENTATION
@@ -278,29 +324,20 @@ int up_cpu_pause(int cpu)
    * request.
    */
 
-  DEBUGASSERT(!spin_islocked(&g_cpu_paused[cpu]));
+  DEBUGASSERT(!spin_is_locked(&g_cpu_paused[cpu]));
 
   spin_lock(&g_cpu_wait[cpu]);
   spin_lock(&g_cpu_paused[cpu]);
 
   /* Execute SGI2 */
 
-  ret = arm_cpu_sgi(GIC_IRQ_SGI2, (1 << cpu));
-  if (ret < 0)
-    {
-      /* What happened? Unlock the g_cpu_wait spinlock */
+  arm_cpu_sgi(GIC_IRQ_SGI2, (1 << cpu));
 
-      spin_unlock(&g_cpu_wait[cpu]);
-    }
-  else
-    {
-      /* Wait for the other CPU to unlock g_cpu_paused meaning that
-       * it is fully paused and ready for up_cpu_resume();
-       */
-
-      spin_lock(&g_cpu_paused[cpu]);
-    }
+  /* Wait for the other CPU to unlock g_cpu_paused meaning that
+   * it is fully paused and ready for up_cpu_resume();
+   */
 
+  spin_lock(&g_cpu_paused[cpu]);
   spin_unlock(&g_cpu_paused[cpu]);
 
   /* On successful return g_cpu_wait will be locked, the other CPU will be
@@ -308,7 +345,7 @@ int up_cpu_pause(int cpu)
    * called. g_cpu_paused will be unlocked in any case.
    */
 
-  return ret;
+  return OK;
 }
 
 /****************************************************************************
@@ -345,10 +382,17 @@ int up_cpu_resume(int cpu)
    * established thread.
    */
 
-  DEBUGASSERT(spin_islocked(&g_cpu_wait[cpu]) &&
-              !spin_islocked(&g_cpu_paused[cpu]));
+  DEBUGASSERT(spin_is_locked(&g_cpu_wait[cpu]) &&
+              !spin_is_locked(&g_cpu_paused[cpu]));
 
   spin_unlock(&g_cpu_wait[cpu]);
+
+  /* Ensure the CPU has been resumed to avoid causing a deadlock */
+
+  spin_lock(&g_cpu_resumed[cpu]);
+
+  spin_unlock(&g_cpu_resumed[cpu]);
+
   return OK;
 }
 
diff --git a/os/arch/arm/src/armv7-a/arm_doirq.c b/os/arch/arm/src/armv7-a/arm_doirq.c
index 63abda7b79..6e2e1cb407 100644
--- a/os/arch/arm/src/armv7-a/arm_doirq.c
+++ b/os/arch/arm/src/armv7-a/arm_doirq.c
@@ -91,6 +91,8 @@ uint32_t *arm_doirq(int irq, uint32_t *regs)
 
       irq_dispatch(irq, regs);
 
+      if (regs != CURRENT_REGS)
+        {
 #ifdef CONFIG_ARCH_ADDRENV
       /* Check for a context switch. If a context switch occurred, then
        * CURRENT_REGS will have a different value than it did on entry. If an
@@ -98,8 +100,6 @@ uint32_t *arm_doirq(int irq, uint32_t *regs)
        * address environment before returning from the interrupt.
        */
 
-      if (regs != CURRENT_REGS)
-        {
           /* Make sure that the address environment for the previously
            * running task is closed down gracefully (data caches dump,
            * MMU flushed) and set up the address environment for the new
@@ -107,14 +107,16 @@ uint32_t *arm_doirq(int irq, uint32_t *regs)
            */
 
           group_addrenv(NULL);
-        }
 #endif
 
+          restore_critical_section();
+          regs = (uint32_t *)CURRENT_REGS;
+        }
+
       /* Set CURRENT_REGS to NULL to indicate that we are no longer in an
        * interrupt handler.
       */
 
-      regs = (uint32_t *)CURRENT_REGS;
       CURRENT_REGS = NULL;
 #endif
 
   /* Reset the interrupt number values */
 
diff --git a/os/arch/arm/src/armv7-a/arm_syscall.c b/os/arch/arm/src/armv7-a/arm_syscall.c
index 7d46f99f9d..9772410202 100644
--- a/os/arch/arm/src/armv7-a/arm_syscall.c
+++ b/os/arch/arm/src/armv7-a/arm_syscall.c
@@ -628,7 +628,11 @@ uint32_t *arm_syscall(uint32_t *regs)
     }
 #endif
 
-  regs = (uint32_t *)CURRENT_REGS;
+  if (regs != CURRENT_REGS)
+    {
+      restore_critical_section();
+      regs = (uint32_t *)CURRENT_REGS;
+    }
 
   /* Report what happened */
 
diff --git a/os/include/tinyara/irq.h b/os/include/tinyara/irq.h
index d161fba1c2..645f7cc644 100644
--- a/os/include/tinyara/irq.h
+++ b/os/include/tinyara/irq.h
@@ -150,6 +150,27 @@ void leave_critical_section(irqstate_t flags);
 # define leave_critical_section(f) irqrestore(f)
 #endif
 
+
+/****************************************************************************
+ * Name: restore_critical_section
+ *
+ * Description:
+ *   Restore the critical section.
+ *
+ * Input Parameters:
+ *   None
+ *
+ * Returned Value:
+ *   None
+ *
+ ****************************************************************************/
+
+#ifdef CONFIG_SMP
+void restore_critical_section(void);
+#else
+# define restore_critical_section()
+#endif
+
 /**
  * @cond
  * @internal
diff --git a/os/include/tinyara/spinlock.h b/os/include/tinyara/spinlock.h
index a88e91601e..0616b4d108 100644
--- a/os/include/tinyara/spinlock.h
+++ b/os/include/tinyara/spinlock.h
@@ -81,6 +81,8 @@ typedef uint8_t spinlock_t;
 #  define __SP_UNLOCK_FUNCTION 1
 #endif
 
+# define spin_is_locked(l) (*(l) == SP_LOCKED)
+
 /****************************************************************************
  * Public Function Prototypes
  ****************************************************************************/
diff --git a/os/kernel/irq/irq_csection.c b/os/kernel/irq/irq_csection.c
index de0e6fc368..105586a0b1 100755
--- a/os/kernel/irq/irq_csection.c
+++ b/os/kernel/irq/irq_csection.c
@@ -245,6 +245,7 @@ irqstate_t enter_critical_section(void)
             * interrupt handler.
             */
        } else {
+           int paused = false;
            /* Make sure that the g_cpu_irqset was not already set
             * by previous logic on this CPU that was executed by the
             * interrupt handler. We know that the bit in g_cpu_irqset
@@ -263,8 +264,13 @@ irqstate_t enter_critical_section(void)
                 * pause request interrupt. Break the deadlock by
                 * handling the pause request now.
                 */
+               if (!paused)
+               {
+                   up_cpu_paused_save();
+               }
 
                DEBUGVERIFY(up_cpu_paused(cpu));
+               paused = true;
 
                /* NOTE: As the result of up_cpu_paused(cpu), this CPU
                 * might set g_cpu_irqset in sched_resume_scheduler()
@@ -295,7 +301,11 @@ irqstate_t enter_critical_section(void)
 
            spin_setbit(&g_cpu_irqset, cpu, &g_cpu_irqsetlock, \
                        &g_cpu_irqlock);
-       }
+           if (paused)
+           {
+               up_cpu_paused_restore();
+           }
+       }
    } else {
        /* Normal tasking environment.
         *
@@ -644,4 +654,48 @@ bool irq_cpu_locked(int cpu)
 }
 #endif
 
+#ifdef CONFIG_SMP
+void restore_critical_section(void)
+{
+   /* NOTE: The following logic for adjusting the global IRQ controls
+    * was derived from nxsched_add_readytorun() and
+    * nxsched_remove_readytorun(). Here, we handle only the clearing
+    * logic, in order to defer unlocking the IRQ lock until after the
+    * context switch.
+    */
+
+   FAR struct tcb_s *tcb = this_task();
+   int me = this_cpu();
+
+   /* Adjust global IRQ controls. If irqcount is greater than zero,
+    * then this task/this CPU holds the IRQ lock
+    */
+
+   if (tcb->irqcount > 0)
+   {
+       /* Do nothing here
+        * NOTE: spin_setbit() is done in nxsched_add_readytorun()
+        * and nxsched_remove_readytorun()
+        */
+   }
+
+   /* No.. This CPU will be relinquishing the lock. But this works
+    * differently if we are performing a context switch from an
+    * interrupt handler and the interrupt handler has established
+    * a critical section. We can detect this case when
+    * g_cpu_nestcount[me] > 0.
+    */
+
+   else if (g_cpu_nestcount[me] <= 0)
+   {
+       /* Release our hold on the IRQ lock. */
+
+       if ((g_cpu_irqset & (1 << me)) != 0)
+       {
+           spin_clrbit(&g_cpu_irqset, me, &g_cpu_irqsetlock,
+                       &g_cpu_irqlock);
+       }
+   }
+}
+#endif /* CONFIG_SMP */
+
 #endif /* CONFIG_IRQCOUNT */
 
diff --git a/os/kernel/sched/sched_resumescheduler.c b/os/kernel/sched/sched_resumescheduler.c
index 248d185a29..22b716a9c8 100644
--- a/os/kernel/sched/sched_resumescheduler.c
+++ b/os/kernel/sched/sched_resumescheduler.c
@@ -74,46 +74,6 @@ void sched_resume_scheduler(FAR struct tcb_s *tcb)
 #ifdef CONFIG_SCHED_INSTRUMENTATION
    sched_note_resume(tcb);
 #endif
-
-#ifdef CONFIG_SMP
-   /* NOTE: The following logic for adjusting global IRQ controls were
-    * derived from sched_add_readytorun() and sched_removedreadytorun()
-    * Here, we only handles clearing logic to defer unlocking IRQ lock
-    * followed by context switching.
-    */
-
-   int me = this_cpu();
-
-   /* Adjust global IRQ controls. If irqcount is greater than zero,
-    * then this task/this CPU holds the IRQ lock
-    */
-
-   if (tcb->irqcount > 0)
-   {
-       /* Do nothing here
-        * NOTE: spin_setbit() is done in sched_add_readytorun()
-        * and sched_remove_readytorun()
-        */
-   }
-
-   /* No.. This CPU will be relinquishing the lock. But this works
-    * differently if we are performing a context switch from an
-    * interrupt handler and the interrupt handler has established
-    * a critical section. We can detect this case when
-    * g_cpu_nestcount[me] > 0.
-    */
-
-   else if (g_cpu_nestcount[me] <= 0)
-   {
-       /* Release our hold on the IRQ lock. */
-
-       if ((g_cpu_irqset & (1 << me)) != 0)
-       {
-           spin_clrbit(&g_cpu_irqset, me, &g_cpu_irqsetlock,
-                       &g_cpu_irqlock);
-       }
-   }
-#endif /* CONFIG_SMP */
 }
 
 #endif /* CONFIG_RR_INTERVAL > 0 || CONFIG_SCHED_RESUMESCHEDULER */
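-- 
Reviewer note, outside the diff (git am ignores trailing text after the
last hunk): the deadlock-avoidance path in enter_critical_section() now
brackets a pending pause in three phases: save the task state once,
service the pause (possibly repeatedly), then restore state only after
all pause requests are drained. A minimal sketch of that calling order,
assuming the patch above; the wrapper name handle_pending_pause() is
hypothetical, for illustration only:

    /* Sketch: how a CPU could service pending pause requests using the
     * split up_cpu_paused_save()/up_cpu_paused()/up_cpu_paused_restore()
     * API introduced by this patch.
     */

    static void handle_pending_pause(int cpu)
    {
      bool paused = false;

      while (up_cpu_pausereq(cpu))
        {
          /* Save the current task state once, before the first pause */

          if (!paused)
            {
              up_cpu_paused_save();
              paused = true;
            }

          /* Block until the requesting CPU calls up_cpu_resume() */

          up_cpu_paused(cpu);
        }

      /* Re-install the (possibly new) task state only after every
       * pending pause request has been served.
       */

      if (paused)
        {
          up_cpu_paused_restore();
        }
    }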