os/arch/arm/src/armv7-a, os/kernel: Fix crash issues in SMP
Random crashes (data abort, prefetch abort) were observed when running
various commands such as ble_rmc and kernel_tc. Our analysis showed that
the stack pointer gets updated with wrong values after multiple context
switches. Similar issues were reported and fixed in NuttX, so we pick
the NuttX commits below to fix our issues.

======================================================================
commit e2df52390ace192e8fe632ecb83c2fa44373259a

    SMP: fix crash when switch to new task which is still running

    Situation:

    Assume we have 2 CPUs, with task0 busy-running.

    CPU0                                CPU1
    task0 -> task1                      task2 -> task0
1. remove task0 from runninglist
2. take task1 as new tcb
3. add task0 to blocklist
4. clear spinlock
                                    4.1 remove task2 from runninglist
                                    4.2 take task0 as new tcb
                                    4.3 add task2 to blocklist
                                    4.4 use svc ISR to switch to task0
                                    4.5 crash
5. use svc ISR to switch to task1

    Fix:
    Move the spinlock clear to the end of the svc ISR
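
    A condensed, hypothetical sketch of the reordering. Every name here is
    a stand-in stub, not the real TizenRT/NuttX API:

    #include <stdio.h>

    static void remove_from_runninglist(int t) { (void)t; }
    static void add_to_blocklist(int t)        { (void)t; }
    static void svc_switch_to(int t)  { printf("svc: switch to task%d\n", t); }
    static void release_irq_lock(void) { puts("IRQ lock released"); }

    /* Old ordering: the lock is released before the svc ISR has saved
     * the outgoing task's context, so another CPU may pick up task0
     * while its registers are still live on this CPU. */
    static void switch_old(void)
    {
      remove_from_runninglist(0);
      add_to_blocklist(0);
      release_irq_lock();              /* too early */
      svc_switch_to(1);                /* context save happens in here */
    }

    /* New ordering: the lock is released only at the end of the svc
     * ISR, once the context save has completed. */
    static void switch_new(void)
    {
      remove_from_runninglist(0);
      add_to_blocklist(0);
      svc_switch_to(1);
      release_irq_lock();              /* end of svc ISR */
    }

    int main(void) { switch_old(); switch_new(); return 0; }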

commit 2241969e5ada2eaafee38ab6de6457aa15c8402c

    SMP: fix crash when switch to new task which is still running

    cpu0 thread0:                        cpu1:
    sched_yield()
    nxsched_set_priority()
    nxsched_running_setpriority()
    nxsched_reprioritize_rtr()
    nxsched_add_readytorun()
    up_cpu_pause()
                                         IRQ enter
                                         arm64_pause_handler()
                                         enter_critical_section() begin
                                         up_cpu_paused() pick thread0
                                         arm64_restorestate() set thread0 tcb->xcp.regs to CURRENT_REGS
    up_switch_context()
      thread0 -> thread1
    arm64_syscall()
        case SYS_switch_context
         change thread0 tcb->xcp.regs
        restore_critical_section()
                                         enter_critical_section() done
                                         leave_critical_section()
                                         IRQ leave with restore CURRENT_REGS
                                         ERROR !!!

    Reason:
    As described above, cpu0 switches task thread0 -> thread1, but the
    syscall() executes slowly; in that window cpu1 picks thread0 to run
    in up_cpu_paused(). When cpu0's syscall finally executes, cpu1's IRQ
    exit restores stale CURRENT_REGS and crashes.

    Resolve:
    Move arm64_restorestate() to after enter_critical_section() completes
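
    A minimal stub model of this ordering fix (hypothetical names, not the
    real API): restoring CURRENT_REGS must wait until the critical section
    is fully acquired, i.e. until the other CPU's context switch has
    finished rewriting the task's tcb->xcp.regs.

    #include <stdio.h>

    static void save_outgoing_context(void)    { puts("save context"); }
    static void acquire_critical_section(void) { puts("critical section acquired"); }
    static void restore_current_regs(void)     { puts("restore CURRENT_REGS"); }

    /* Old: CURRENT_REGS is loaded from tcb->xcp.regs while the other
     * CPU may still be changing it inside its syscall handler. */
    static void pause_handler_old(void)
    {
      save_outgoing_context();
      restore_current_regs();          /* too early */
      acquire_critical_section();
    }

    /* New: restore only after enter_critical_section() completes. */
    static void pause_handler_new(void)
    {
      save_outgoing_context();
      acquire_critical_section();
      restore_current_regs();
    }

    int main(void) { pause_handler_old(); pause_handler_new(); return 0; }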

======================================================================

Signed-off-by: Kishore S N <kishore.sn@samsung.com>
kishore-sn committed Apr 5, 2024
1 parent 8d3f6d2 commit 065de0d
Showing 7 changed files with 172 additions and 85 deletions.
122 changes: 83 additions & 39 deletions os/arch/arm/src/armv7-a/arm_cpupause.c
@@ -75,6 +75,7 @@

static volatile spinlock_t g_cpu_wait[CONFIG_SMP_NCPUS];
static volatile spinlock_t g_cpu_paused[CONFIG_SMP_NCPUS];
static volatile spinlock_t g_cpu_resumed[CONFIG_SMP_NCPUS];

/****************************************************************************
* Public Functions
@@ -97,43 +98,35 @@ static volatile spinlock_t g_cpu_paused[CONFIG_SMP_NCPUS];

bool up_cpu_pausereq(int cpu)
{
return spin_islocked(&g_cpu_paused[cpu]);
return spin_is_locked(&g_cpu_paused[cpu]);
}

/****************************************************************************
* Name: up_cpu_paused
* Name: up_cpu_paused_save
*
* Description:
* Handle a pause request from another CPU. Normally, this logic is
* executed from interrupt handling logic within the architecture-specific
* However, it is sometimes necessary necessary to perform the pending
* However, it is sometimes necessary to perform the pending
* pause operation in other contexts where the interrupt cannot be taken
* in order to avoid deadlocks.
*
* This function performs the following operations:
*
* 1. It saves the current task state at the head of the current assigned
* task list.
* 2. It waits on a spinlock, then
* 3. Returns from interrupt, restoring the state of the new task at the
* head of the ready to run list.
*
* Input Parameters:
* cpu - The index of the CPU to be paused
* None
*
* Returned Value:
* On success, OK is returned. Otherwise, a negated errno value indicating
* the nature of the failure is returned.
*
****************************************************************************/

int up_cpu_paused(int cpu)
int up_cpu_paused_save(void)
{
struct tcb_s *tcb = this_task();

/* Update scheduler parameters */

//sched_suspend_scheduler(tcb);
sched_suspend_scheduler(tcb);

#ifdef CONFIG_SCHED_INSTRUMENTATION
/* Notify that we are paused */
@@ -147,23 +140,79 @@ int up_cpu_paused(int cpu)

arm_savestate(tcb->xcp.regs);

return OK;
}

/****************************************************************************
* Name: up_cpu_paused
*
* Description:
* Handle a pause request from another CPU. Normally, this logic is
* executed from interrupt handling logic within the architecture-specific
* However, it is sometimes necessary to perform the pending
* pause operation in other contexts where the interrupt cannot be taken
* in order to avoid deadlocks.
*
* This function performs the following operations:
*
* 1. It saves the current task state at the head of the current assigned
* task list.
* 2. It waits on a spinlock, then
* 3. Returns from interrupt, restoring the state of the new task at the
* head of the ready to run list.
*
* Input Parameters:
* cpu - The index of the CPU to be paused
*
* Returned Value:
* On success, OK is returned. Otherwise, a negated errno value indicating
* the nature of the failure is returned.
*
****************************************************************************/

int up_cpu_paused(int cpu)
{
/* Release the g_cpu_paused spinlock to synchronize with the
* requesting CPU.
*/

spin_unlock(&g_cpu_paused[cpu]);

/* Ensure the CPU has been resumed to avoid causing a deadlock */

spin_lock(&g_cpu_resumed[cpu]);

/* Wait for the spinlock to be released. The requesting CPU will release
* the spinlock when the CPU is resumed.
*/

spin_lock(&g_cpu_wait[cpu]);

/* This CPU has been resumed. Restore the exception context of the TCB at
* the (new) head of the assigned task list.
*/
spin_unlock(&g_cpu_wait[cpu]);
spin_unlock(&g_cpu_resumed[cpu]);

tcb = this_task();
return OK;
}

/****************************************************************************
* Name: up_cpu_paused_restore
*
* Description:
* Restore the state of the CPU after it was paused via up_cpu_pause(),
* and resume normal tasking.
*
* Input Parameters:
* None
*
* Returned Value:
* On success, OK is returned. Otherwise, a negated errno value indicating
* the nature of the failure is returned.
*
****************************************************************************/

int up_cpu_paused_restore(void)
{
struct tcb_s *tcb = this_task();

#ifdef CONFIG_SCHED_INSTRUMENTATION
/* Notify that we have resumed */
@@ -181,7 +230,6 @@ int up_cpu_paused(int cpu)

up_restoretask(tcb);
arm_restorestate(tcb->xcp.regs);
spin_unlock(&g_cpu_wait[cpu]);

return OK;
}
@@ -259,8 +307,6 @@ int arm_pause_handler(int irq, void *context, void *arg)

int up_cpu_pause(int cpu)
{
int ret;

DEBUGASSERT(cpu >= 0 && cpu < CONFIG_SMP_NCPUS && cpu != this_cpu());

#ifdef CONFIG_SCHED_INSTRUMENTATION
@@ -278,37 +324,28 @@ int up_cpu_pause(int cpu)
* request.
*/

DEBUGASSERT(!spin_islocked(&g_cpu_paused[cpu]));
DEBUGASSERT(!spin_is_locked(&g_cpu_paused[cpu]));

spin_lock(&g_cpu_wait[cpu]);
spin_lock(&g_cpu_paused[cpu]);

/* Execute SGI2 */

ret = arm_cpu_sgi(GIC_IRQ_SGI2, (1 << cpu));
if (ret < 0)
{
/* What happened? Unlock the g_cpu_wait spinlock */
arm_cpu_sgi(GIC_IRQ_SGI2, (1 << cpu));

spin_unlock(&g_cpu_wait[cpu]);
}
else
{
/* Wait for the other CPU to unlock g_cpu_paused meaning that
* it is fully paused and ready for up_cpu_resume();
*/

spin_lock(&g_cpu_paused[cpu]);
}
/* Wait for the other CPU to unlock g_cpu_paused meaning that
* it is fully paused and ready for up_cpu_resume();
*/

spin_lock(&g_cpu_paused[cpu]);
spin_unlock(&g_cpu_paused[cpu]);

/* On successful return g_cpu_wait will be locked, the other CPU will be
* spinning on g_cpu_wait and will not continue until g_cpu_resume() is
* called. g_cpu_paused will be unlocked in any case.
*/

return ret;
return OK;
}

/****************************************************************************
@@ -345,10 +382,17 @@ int up_cpu_resume(int cpu)
* established thread.
*/

DEBUGASSERT(spin_islocked(&g_cpu_wait[cpu]) &&
!spin_islocked(&g_cpu_paused[cpu]));
DEBUGASSERT(spin_is_locked(&g_cpu_wait[cpu]) &&
!spin_is_locked(&g_cpu_paused[cpu]));

spin_unlock(&g_cpu_wait[cpu]);

/* Ensure the CPU has been resumed to avoid causing a deadlock */

spin_lock(&g_cpu_resumed[cpu]);

spin_unlock(&g_cpu_resumed[cpu]);

return OK;
}

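The three spinlocks now form a full handshake between the requesting CPU
and the paused CPU. Below is a user-space model of that protocol, using
C11 atomics in place of the kernel spinlocks; the names mirror the diff
above, but this is an illustrative sketch, not the kernel implementation:

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static atomic_flag g_wait    = ATOMIC_FLAG_INIT;  /* g_cpu_wait[cpu]    */
static atomic_flag g_paused  = ATOMIC_FLAG_INIT;  /* g_cpu_paused[cpu]  */
static atomic_flag g_resumed = ATOMIC_FLAG_INIT;  /* g_cpu_resumed[cpu] */

static void lock(atomic_flag *l)   { while (atomic_flag_test_and_set(l)) {} }
static void unlock(atomic_flag *l) { atomic_flag_clear(l); }

/* The paused CPU's side: what up_cpu_paused() does after this patch. */
static void *paused_cpu(void *arg)
{
  (void)arg;
  unlock(&g_paused);    /* signal the requester: fully paused          */
  lock(&g_resumed);     /* mark "not yet resumed"                      */
  lock(&g_wait);        /* spin here until up_cpu_resume() releases us */
  unlock(&g_wait);
  unlock(&g_resumed);   /* let up_cpu_resume() return                  */
  puts("paused CPU running again");
  return NULL;
}

int main(void)
{
  pthread_t t;

  /* up_cpu_pause(): lock both, then "send" the pause request.         */
  lock(&g_wait);
  lock(&g_paused);
  pthread_create(&t, NULL, paused_cpu, NULL);   /* stands in for SGI2  */

  lock(&g_paused);      /* wait until the other CPU is fully paused    */
  unlock(&g_paused);

  /* ... work done while the CPU is parked ...                         */

  /* up_cpu_resume(): release the CPU, then make sure it really ran.   */
  unlock(&g_wait);
  lock(&g_resumed);
  unlock(&g_resumed);

  pthread_join(t, NULL);
  return 0;
}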
10 changes: 6 additions & 4 deletions os/arch/arm/src/armv7-a/arm_doirq.c
@@ -91,30 +91,32 @@ uint32_t *arm_doirq(int irq, uint32_t *regs)

irq_dispatch(irq, regs);

if (regs != CURRENT_REGS)
{
#ifdef CONFIG_ARCH_ADDRENV
/* Check for a context switch. If a context switch occurred, then
* CURRENT_REGS will have a different value than it did on entry. If an
* interrupt level context switch has occurred, then establish the correct
* address environment before returning from the interrupt.
*/

if (regs != CURRENT_REGS)
{
/* Make sure that the address environment for the previously
* running task is closed down gracefully (data caches dump,
* MMU flushed) and set up the address environment for the new
* thread at the head of the ready-to-run list.
*/

group_addrenv(NULL);
}
#endif

restore_critical_section();
regs = (uint32_t *)CURRENT_REGS;
}

/* Set CURRENT_REGS to NULL to indicate that we are no longer in an
* interrupt handler.
*/

regs = (uint32_t *)CURRENT_REGS;
CURRENT_REGS = NULL;
#endif
/* Reset the interrupt number values */
6 changes: 5 additions & 1 deletion os/arch/arm/src/armv7-a/arm_syscall.c
@@ -628,7 +628,11 @@ uint32_t *arm_syscall(uint32_t *regs)
}
#endif

regs = (uint32_t *)CURRENT_REGS;
if (regs != CURRENT_REGS)
{
restore_critical_section();
regs = (uint32_t *)CURRENT_REGS;
}

/* Report what happened */

21 changes: 21 additions & 0 deletions os/include/tinyara/irq.h
@@ -150,6 +150,27 @@ void leave_critical_section(irqstate_t flags);
# define leave_critical_section(f) irqrestore(f)
#endif


/****************************************************************************
* Name: restore_critical_section
*
* Description:
* Restore the critical_section
*
* Input Parameters:
* None
*
* Returned Value:
* None
*
****************************************************************************/

#ifdef CONFIG_SMP
void restore_critical_section(void);
#else
# define restore_critical_section()
#endif

/**
* @cond
* @internal
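On non-SMP builds the header maps restore_critical_section() to a no-op,
so common exit paths can call it unconditionally. A self-contained sketch
of that pattern (stub body only, not the kernel logic):

#include <stdio.h>

#define CONFIG_SMP 1   /* remove to see the call compile away */

#ifdef CONFIG_SMP
static void restore_critical_section(void)
{
  puts("SMP: drop this CPU's bit in g_cpu_irqset if no longer held");
}
#else
#  define restore_critical_section()
#endif

static void irq_exit(int context_switched)
{
  if (context_switched)
    {
      restore_critical_section();   /* no-op on single-core builds */
    }
}

int main(void) { irq_exit(1); return 0; }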
2 changes: 2 additions & 0 deletions os/include/tinyara/spinlock.h
@@ -81,6 +81,8 @@ typedef uint8_t spinlock_t;
# define __SP_UNLOCK_FUNCTION 1
#endif

# define spin_is_locked(l) (*(l) == SP_LOCKED)

/****************************************************************************
* Public Function Prototypes
****************************************************************************/
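A minimal user-space check of the renamed spin_is_locked() predicate,
assuming SP_LOCKED is 1 and SP_UNLOCKED is 0 as in the upstream header
(assumed values, reproduced here only to make the sketch self-contained):

#include <assert.h>
#include <stdint.h>

typedef uint8_t spinlock_t;
#define SP_UNLOCKED 0          /* assumed value */
#define SP_LOCKED   1          /* assumed value */
#define spin_is_locked(l) (*(l) == SP_LOCKED)

int main(void)
{
  spinlock_t lock = SP_UNLOCKED;
  assert(!spin_is_locked(&lock));
  lock = SP_LOCKED;
  assert(spin_is_locked(&lock));
  return 0;
}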
56 changes: 55 additions & 1 deletion os/kernel/irq/irq_csection.c
@@ -245,6 +245,7 @@ irqstate_t enter_critical_section(void)
* interrupt handler.
*/
} else {
int paused = false;
/* Make sure that the g_cpu_irqset was not already set
* by previous logic on this CPU that was executed by the
* interrupt handler. We know that the bit in g_cpu_irqset
@@ -263,8 +264,13 @@
* pause request interrupt. Break the deadlock by
* handling the pause request now.
*/
if (!paused)
{
up_cpu_paused_save();
}

DEBUGVERIFY(up_cpu_paused(cpu));
paused = true;

/* NOTE: As the result of up_cpu_paused(cpu), this CPU
* might set g_cpu_irqset in sched_resume_scheduler()
@@ -295,7 +301,11 @@

spin_setbit(&g_cpu_irqset, cpu, &g_cpu_irqsetlock, \
&g_cpu_irqlock);
}
if (paused)
{
up_cpu_paused_restore();
}
}
} else {
/* Normal tasking environment.
*
@@ -644,4 +654,48 @@
}
#endif

#ifdef CONFIG_SMP
void restore_critical_section(void)
{
/* NOTE: The following logic for adjusting the global IRQ controls was
* derived from nxsched_add_readytorun() and nxsched_remove_readytorun().
* Here, we handle only the clearing logic, deferring the IRQ-lock
* release until after the context switch.
*/

FAR struct tcb_s *tcb = this_task();
int me = this_cpu();

/* Adjust global IRQ controls. If irqcount is greater than zero,
* then this task/this CPU holds the IRQ lock
*/

if (tcb->irqcount > 0)
{
/* Do nothing here
* NOTE: spin_setbit() is done in nxsched_add_readytorun()
* and nxsched_remove_readytorun()
*/
}

/* No.. This CPU will be relinquishing the lock. But this works
* differently if we are performing a context switch from an
* interrupt handler and the interrupt handler has established
* a critical section. We can detect this case when
* g_cpu_nestcount[me] > 0.
*/

else if (g_cpu_nestcount[me] <= 0)
{
/* Release our hold on the IRQ lock. */

if ((g_cpu_irqset & (1 << me)) != 0)
{
spin_clrbit(&g_cpu_irqset, me, &g_cpu_irqsetlock,
&g_cpu_irqlock);
}
}
}
#endif /* CONFIG_SMP */

#endif /* CONFIG_IRQCOUNT */
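
The decision restore_critical_section() makes can be modeled in user
space as below; g_cpu_irqset is a bitmask of CPUs holding the IRQ lock,
and the names mirror the kernel code (illustrative sketch only, not the
kernel implementation):

#include <stdint.h>
#include <stdio.h>

static uint32_t g_cpu_irqset;   /* bit n set => CPU n holds the IRQ lock */

static void restore_cs_model(int me, int tcb_irqcount, int nestcount)
{
  if (tcb_irqcount > 0)
    {
      /* Incoming task holds the lock: keep this CPU's bit set. */
    }
  else if (nestcount <= 0 && (g_cpu_irqset & (1u << me)) != 0)
    {
      g_cpu_irqset &= ~(1u << me);   /* relinquish the lock */
    }
}

int main(void)
{
  g_cpu_irqset = 1u << 0;          /* CPU0 entered with the lock held */
  restore_cs_model(0, 0, 0);       /* incoming task doesn't hold it   */
  printf("g_cpu_irqset = %#x\n", (unsigned)g_cpu_irqset);  /* prints 0 */
  return 0;
}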
