-
Notifications
You must be signed in to change notification settings - Fork 5.3k
Description
复现
CI 中会运行 atomic test case,时不时会失败。
在本地下参考 CI 的配置运行 utest 可以得到如下结果:
[D/utest] [ OK ] [ unit ] (test_atomic_api:120) is passed
[D/utest] [ OK ] [ unit ] (test_atomic_api:121) is passed
[E/utest] [ ASSERT ] [ unit ] at (atomic_tc.c); func: (test_atomic_add:152); msg: ((count == 3000000) is false)
[E/utest] [ FAILED ] [ result ] testcase (testcases.kernel.atomic_tc)
[I/utest] [----------] [ testcase ] (testcases.kernel.atomic_tc) finished
[I/utest] [----------] [ testcase ] (testcases.kernel.thread_tc) started

分析
首先,问题的直接原因在于测试用例 test_atomic_add。其代码如下:
static void ture_entry(void *parameter)
{
int i;
for (i = 0; i < 1000000; i++)
{
rt_atomic_add(&count, 1);
}
rt_sem_release(sem_t);
}
static void test_atomic_add(void)
{
rt_thread_t thread;
int i;
sem_t = rt_sem_create("atomic_sem", 0, RT_IPC_FLAG_PRIO);
count = 0;
thread = rt_thread_create("t1", ture_entry, RT_NULL, THREAD_STACKSIZE, THREAD_PRIORITY, THREAD_TIMESLICE);
rt_thread_startup(thread);
thread = rt_thread_create("t2", ture_entry, RT_NULL, THREAD_STACKSIZE, THREAD_PRIORITY, THREAD_TIMESLICE);
rt_thread_startup(thread);
thread = rt_thread_create("t3", ture_entry, RT_NULL, THREAD_STACKSIZE, THREAD_PRIORITY, THREAD_TIMESLICE);
rt_thread_startup(thread);
for (i = 0; i < 3; i++)
{
rt_sem_take(sem_t, RT_WAITING_FOREVER);
}
uassert_true(count == 3000000);
}

这份代码存在一点小问题:初始化全局变量 count 的时候没有使用原子操作。由于 ARM 体系采用松散(弱)内存模型,这样的行为会导致变量的值处于未定义(undefined)的状态。更正如下:
diff --git a/examples/utest/testcases/kernel/atomic_tc.c b/examples/utest/testcases/kernel/atomic_tc.c
index a83c05f73..88a883916 100644
--- a/examples/utest/testcases/kernel/atomic_tc.c
+++ b/examples/utest/testcases/kernel/atomic_tc.c
@@ -134,10 +134,11 @@ static void ture_entry(void *parameter)
static void test_atomic_add(void)
{
rt_thread_t thread;
- int i;
+ size_t i;
sem_t = rt_sem_create("atomic_sem", 0, RT_IPC_FLAG_PRIO);
- count = 0;
+ rt_atomic_store(&count, 0);
+
thread = rt_thread_create("t1", ture_entry, RT_NULL, THREAD_STACKSIZE, THREAD_PRIORITY, THREAD_TIMESLICE);
rt_thread_startup(thread);
thread = rt_thread_create("t2", ture_entry, RT_NULL, THREAD_STACKSIZE, THREAD_PRIORITY, THREAD_TIMESLICE);
@@ -149,7 +150,8 @@ static void test_atomic_add(void)
{
rt_sem_take(sem_t, RT_WAITING_FOREVER);
}
- uassert_true(count == 3000000);
+ i = rt_atomic_load(&count);
+ uassert_true(i == 3000000);
}
static rt_err_t utest_tc_init(void)

检查这部分时发现第二个问题:CI 选用了 USING STD ATOMIC 后,仍然没有将 rt_atomic_add 替换为 C11 API,而是使用了 rt_hw_atomic。这部分需要修改宏的顺序。
接下来检查 rt_hw_atomic_add 的实现:
60037360 <rt_hw_atomic_add>:
60037360: e52db004 push {fp} ; (str fp, [sp, #-4]!)
60037364: e28db000 add fp, sp, #0
60037368: e24dd044 sub sp, sp, #68 ; 0x44
6003736c: e50b0040 str r0, [fp, #-64] ; 0xffffffc0
60037370: e50b1044 str r1, [fp, #-68] ; 0xffffffbc
60037374: e51b3040 ldr r3, [fp, #-64] ; 0xffffffc0
60037378: e50b300c str r3, [fp, #-12]
6003737c: e51b300c ldr r3, [fp, #-12]
60037380: e1933f9f ldrex r3, [r3]
60037384: e50b3030 str r3, [fp, #-48] ; 0xffffffd0
60037388: f57ff05b dmb ish
6003738c: e51b3030 ldr r3, [fp, #-48] ; 0xffffffd0
60037390: f57ff05b dmb ish
60037394: e50b3010 str r3, [fp, #-16]
60037398: e51b3010 ldr r3, [fp, #-16]
6003739c: e50b3028 str r3, [fp, #-40] ; 0xffffffd8
600373a0: e51b3028 ldr r3, [fp, #-40] ; 0xffffffd8
600373a4: f57ff05b dmb ish
600373a8: e50b302c str r3, [fp, #-44] ; 0xffffffd4
600373ac: f57ff05b dmb ish
600373b0: f57ff05b dmb ish
600373b4: e51b302c ldr r3, [fp, #-44] ; 0xffffffd4
600373b8: f57ff05b dmb ish
600373bc: e50b3024 str r3, [fp, #-36] ; 0xffffffdc
600373c0: e51b2024 ldr r2, [fp, #-36] ; 0xffffffdc
600373c4: f57ff05b dmb ish
600373c8: e51b3044 ldr r3, [fp, #-68] ; 0xffffffbc
600373cc: f57ff05b dmb ish
600373d0: e50b3020 str r3, [fp, #-32] ; 0xffffffe0
600373d4: e51b3020 ldr r3, [fp, #-32] ; 0xffffffe0
600373d8: e0823003 add r3, r2, r3
600373dc: e50b3034 str r3, [fp, #-52] ; 0xffffffcc
600373e0: e51b3040 ldr r3, [fp, #-64] ; 0xffffffc0
600373e4: e50b3008 str r3, [fp, #-8]
600373e8: f57ff05b dmb ish
600373ec: e51b3034 ldr r3, [fp, #-52] ; 0xffffffcc
600373f0: f57ff05b dmb ish
600373f4: e50b3014 str r3, [fp, #-20] ; 0xffffffec
600373f8: e51b2014 ldr r2, [fp, #-20] ; 0xffffffec
600373fc: e51b1008 ldr r1, [fp, #-8]
60037400: e1813f92 strex r3, r2, [r1]
60037404: e50b3038 str r3, [fp, #-56] ; 0xffffffc8
60037408: f57ff05b dmb ish
6003740c: e51b3038 ldr r3, [fp, #-56] ; 0xffffffc8
60037410: f57ff05b dmb ish
60037414: e50b3018 str r3, [fp, #-24] ; 0xffffffe8
60037418: e51b3018 ldr r3, [fp, #-24] ; 0xffffffe8
6003741c: e3530000 cmp r3, #0
60037420: 1affffd3 bne 60037374 <rt_hw_atomic_add+0x14>
60037424: f57ff05b dmb ish
60037428: e51b302c ldr r3, [fp, #-44] ; 0xffffffd4
6003742c: f57ff05b dmb ish
60037430: e50b301c str r3, [fp, #-28] ; 0xffffffe4
60037434: e51b301c ldr r3, [fp, #-28] ; 0xffffffe4
60037438: e1a00003 mov r0, r3
6003743c: e28bd000 add sp, fp, #0
60037440: e49db004 pop {fp} ; (ldr fp, [sp], #4)
60037444: e12fff1e bx lr

在 ARMv7 体系下,原子操作通过 ldrex 和 strex 两条指令完成。考虑 ldrex 和 strex 的实现原理,可以知道独占性源自按照核的粒度实现的保护标志位(即 exclusive monitor)。而查看上面的反汇编可以发现,rt_hw_atomic_add 生成的指令流特别长;假如替换成 C11 atomic 就不会复现这个问题了。使用 C11 atomic 后,编译器自然会优化掉一大堆出入栈的指令,这样同样的操作只用三条指令完成:ldrex -> add -> strex。另外,主线上的 vexpress BSP 现在跑在单核系统上。
从以上的背景条件,可以猜测是一个核上,执行指令流 ldrex 和 strex 之间被中断,接着其它线程更新了值,导致了这个问题。顺着这个猜测,还发现调大测试用例线程的 ticks 也不会复现这个问题。基本可以怀疑就是中断里没有正确处理 monitor 导致同步异常,这有点类似多线程编程下非常经典的 ABA 问题。以下的时序图解释了产生这种情况的原因。
在 ARMv7 体系下,提供了指令 CLREX 来解决这个问题。它在线程切换时清空当前核的 monitor 标志,这样同一个核上多线程导致的 ABA 问题就不会发生。在 v7 内核的上下文切换中加入指令 CLREX 后,果然问题不再复现。
diff --git a/libcpu/arm/cortex-a/context_gcc.S b/libcpu/arm/cortex-a/context_gcc.S
index 98080de5c..da6e6507d 100644
--- a/libcpu/arm/cortex-a/context_gcc.S
+++ b/libcpu/arm/cortex-a/context_gcc.S
@@ -40,6 +40,7 @@ rt_hw_interrupt_enable:
*/
.globl rt_hw_context_switch_to
rt_hw_context_switch_to:
+ clrex
ldr sp, [r0] @ get new task stack pointer
#ifdef RT_USING_SMP
@@ -76,6 +77,7 @@ _guest_switch_lvl:
*/
.globl rt_hw_context_switch
rt_hw_context_switch:
+ clrex
stmfd sp!, {lr} @ push pc (lr should be pushed in place of PC)
stmfd sp!, {r0-r12, lr} @ push lr & register file
@@ -143,6 +145,7 @@ rt_hw_context_switch:
.globl rt_interrupt_to_thread
.globl rt_hw_context_switch_interrupt
rt_hw_context_switch_interrupt:
+ clrex
#ifdef RT_USING_SMP
/* r0 :svc_mod context
* r1 :addr of from_thread's sp

综上,本问题的根本原因是内核线程切换的实现没有处理好独占访问的竞争条件。这导致在多线程环境下,内核存在非常隐蔽的竞态问题。修改上下文切换的实现后就可以解决 CI 的异常。
