diff --git a/cpu/esp8266/ld/esp8266.riot-os.no_sdk.app.ld b/cpu/esp8266/ld/esp8266.riot-os.no_sdk.app.ld
index e7b7e0d550d7..fd555cae0172 100644
--- a/cpu/esp8266/ld/esp8266.riot-os.no_sdk.app.ld
+++ b/cpu/esp8266/ld/esp8266.riot-os.no_sdk.app.ld
@@ -206,6 +206,8 @@ SECTIONS
     LONG(0)
     LONG(0)
     . = ALIGN (16);
+    *(.UserExceptionTrampoline.text)
+    . = ALIGN (16);
     *(.entry.text)
     *(.init.literal)
     *(.init)
diff --git a/cpu/esp8266/ld/esp8266.riot-os.sdk.app.ld b/cpu/esp8266/ld/esp8266.riot-os.sdk.app.ld
index 5a3bf0f3da02..f9f779cff223 100644
--- a/cpu/esp8266/ld/esp8266.riot-os.sdk.app.ld
+++ b/cpu/esp8266/ld/esp8266.riot-os.sdk.app.ld
@@ -105,16 +105,43 @@ SECTIONS
     _data_end = ABSOLUTE(.);
   } >dram0_0_seg :dram0_0_phdr
 
+  /*
+   * .rodata sections that are placed in RAM
+   *
+   * Usually, all .rodata sections are placed in RAM by the Espressif SDK,
+   * since IROM (flash) access requires 32-bit word-aligned reads.
+   *
+   * However, thanks to the LoadStoreError handler from esp-open-rtos, which
+   * is also used in RIOT-OS, it is possible to place .rodata sections in
+   * IROM (flash) to save RAM resources.
+   *
+   * Only the .rodata sections of compilation units that may be executed
+   * while the SPI flash is not mapped have to be kept in RAM, that is,
+   * IRAM functions called from interrupt context and the SPI flash
+   * management functions. Such compilation units have to be listed here.
+   *
+   * Furthermore, compilation units whose constant data is performance-
+   * critical should be listed here as well.
+   */
+
   .rodata : ALIGN(4)
   {
     _rodata_start = ABSOLUTE(.);
 
     *(.sdk.version)
-    /* TODO put only necessary .rodata to dram
-    *libc.a:*.o(.rodata.* .rodata)
+    *core.a:*(.rodata.* .rodata)
     *cpu.a:*(.rodata .rodata.*)
-    */
-    *(.rodata .rodata.*)
+    *esp.a:*(.rodata .rodata.*)
+    *esp_now.a:*(.rodata .rodata.*)
+    *esp_wifi.a:*(.rodata .rodata.*)
+    *periph.a:*(.rodata.* .rodata)
+    *sdk.a:*(.rodata .rodata.*)
+    *xtensa.a:*(.rodata .rodata.*)
+
+    *libc.a:*.o(.rodata.* .rodata)
+    *libpp.a:wdev.o(.rodata.* .rodata)
+    *libmain.a:spi_flash.o(.rodata.* .rodata)
+
     *(.gnu.linkonce.r.*)
     *(.rodata1)
     __XT_EXCEPTION_TABLE__ = ABSOLUTE(.);
@@ -206,6 +233,8 @@ SECTIONS
     LONG(0)
     LONG(0)
     . = ALIGN (16);
+    *(.UserExceptionTrampoline.text)
+    . = ALIGN (16);
     *(.entry.text)
     *(.init.literal)
     *(.init)
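The practical effect of these archive lists is easiest to see from the C side. The sketch below is illustrative only (the file contents and names are invented, not part of the patch): constants of a compilation unit land in .rodata, and whether that section ends up in DRAM or in IROM (flash) is decided by whether the unit's archive is matched above. Byte-wise access to flash-resident constants only works because of the LoadStoreError handler added below, and each such access costs an exception, which is what the remark about performance-critical constant data refers to.

    #include <stddef.h>

    /* String literals and const tables land in .rodata. Whether that is
     * DRAM or IROM (flash) is decided by the archive lists above. */
    static const char greeting[] = "hello";

    size_t greeting_len(void)
    {
        size_t n = 0;
        /* Each greeting[n] is a byte load (l8ui). From IROM, every
         * iteration traps into the LoadStoreError handler; from DRAM,
         * it is a single ordinary load. */
        while (greeting[n] != '\0') {
            n++;
        }
        return n;
    }
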
diff --git a/cpu/esp8266/vendor/xtensa/xtensa_vectors.S b/cpu/esp8266/vendor/xtensa/xtensa_vectors.S
index e620f5facc0d..069d9b558464 100644
--- a/cpu/esp8266/vendor/xtensa/xtensa_vectors.S
+++ b/cpu/esp8266/vendor/xtensa/xtensa_vectors.S
@@ -488,12 +488,324 @@ User Exception (including Level 1 Interrupt from user mode).
 
 _UserExceptionVector:
 
+#ifdef MCU_ESP8266
+    wsr     a0, EXCSAVE_1               /* preserve a0 */
+    j       _UserExceptionTrampoline    /* jump to handler trampoline */
+#else
     wsr     a0, EXCSAVE_1               /* preserve a0 */
     call0   _xt_user_exc                /* user exception handler */
     /* never returns here - call0 is used as a jump (see note at top) */
+#endif
 
     .end    literal_prefix
 
+#ifdef MCU_ESP8266
+/*************************** LoadStoreError Handler BEGIN ********************/
+/*
+ * PLEASE NOTE: The code between the "LoadStoreError Handler BEGIN" and
+ * "LoadStoreError Handler END" markers was extracted from esp-open-rtos.
+ * It is under the following copyright:
+ *
+ * Original vector contents Copyright (C) 2014-2015 Espressif Systems
+ * Additions Copyright (C) Superhouse Automation Pty Ltd and Angus Gratton
+ * BSD Licensed as described in the file LICENSE
+ *
+ * Usually, access to the IROM (flash) memory requires 32-bit word-aligned
+ * reads. Attempts to access data in the IROM (flash) memory that are less
+ * than 32 bits in size trigger a LoadStoreError exception. Therefore, it
+ * is normally not possible to place .rodata sections in IROM (flash);
+ * they have to be placed in RAM instead. With the exception handler from
+ * esp-open-rtos, it becomes possible to access data in IROM (flash) with
+ * a size of less than 32 bits and thus to place .rodata sections in IROM
+ * (flash).
+ */
+
+#define CAUSE_LOADSTORE 3
+#define fatal_exception_handler _xt_user_exc
+
+/* LoadStoreError handler stack */
+
+    .section .bss
+    .balign 16
+
+_LoadStoreErrorHandlerStack:
+    .word   0                   # a0
+    .word   0                   # (a1 slot, unused)
+    .word   0                   # a2
+    .word   0                   # a3
+    .word   0                   # a4
+
+/* LoadStoreError Trampoline */
+
+    .section .UserExceptionTrampoline.text, "x"
+    .literal_position
+    .balign 4
+
+_UserExceptionTrampoline:
+
+    wsr     a1, EXCSAVE_2               /* preserve a1 */
+    rsr     a1, exccause
+    beqi    a1, CAUSE_LOADSTORE, _LoadStoreErrorHandler
+    rsr     a1, EXCSAVE_2               /* restore a1 */
+    call0   _xt_user_exc                /* user exception handler */
+    /* never returns here - call0 is used as a jump (see note at top) */
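+
+    /*
+     * What follows emulates the faulting load in software. For an l8ui,
+     * the core of the emulation corresponds to this C sketch (illustrative
+     * only, the helper name is made up; a little-endian core is assumed):
+     *
+     *   static uint8_t irom_read_u8(const void *addr)
+     *   {
+     *       uintptr_t a = (uintptr_t)addr;
+     *       // only aligned 32-bit reads are legal on IROM
+     *       uint32_t word = *(const uint32_t *)(a & ~(uintptr_t)3);
+     *       // extract the addressed byte from the fetched word
+     *       return (uint8_t)(word >> ((a & 3) * 8));
+     *   }
+     *
+     * In addition, the assembly below decodes the faulting instruction,
+     * stores the result into the instruction's target register, and
+     * advances EPC past the emulated instruction.
+     */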
+    /*
+     * Xtensa "Load/Store Exception" handler:
+     * Completes L8/L16 load instructions from the instruction address space,
+     * for which the architecture only supports 32-bit reads.
+     *
+     * Reached from _UserExceptionTrampoline if EXCCAUSE is LoadStoreErrorCause.
+     *
+     * (The fast path (no branches) is for L8UI.)
+     */
+    .literal_position
+    .balign 4
+    .type   _LoadStoreErrorHandler, @function
+
+_LoadStoreErrorHandler:
+
+    rsr     a1, EXCSAVE_2       /* restore a1 */
+    wsr     a1, EXCSAVE_1       /* and save it to excsave1 instead */
+    /* Registers are saved at the address corresponding to their register
+     * number times 4. This allows a quick and easy mapping later on when
+     * the read value needs to be stored to a particular register. */
+    movi    sp, _LoadStoreErrorHandlerStack
+    s32i    a0, sp, 0
+    s32i    a2, sp, 0x08
+    s32i    a3, sp, 0x0c
+    s32i    a4, sp, 0x10
+    rsr     a0, sar             # save SAR in a0 to restore later
+
+    /* Examine the opcode which generated the exception. */
+    /* Note: Instructions are in this order to avoid pipeline stalls. */
+    rsr     a2, epc1
+    movi    a3, ~3
+    ssa8l   a2                  # sar is now the correct shift for an aligned read
+    and     a2, a2, a3          # a2 is now the 4-byte aligned instruction address
+    l32i    a4, a2, 0
+    l32i    a2, a2, 4
+    movi    a3, 0x00700F        # opcode mask for l8ui/l16si/l16ui
+    src     a2, a2, a4          # a2 is now the instruction that failed
+    and     a3, a2, a3          # a3 is the masked instruction
+    bnei    a3, 0x000002, .LSE_check_l16
+
+    /* Note: At this point the opcode could technically be one of two things:
+     *   xx0xx2 (L8UI)
+     *   xx8xx2 (reserved, invalid opcode)
+     * It is assumed that we never get here from an illegal opcode, so we
+     * don't bother to check for that case and presume this is always an
+     * L8UI. */
+
+    movi    a4, ~3
+    rsr     a3, excvaddr        # read the faulting address
+    and     a4, a3, a4          # a4 is now the word-aligned read address
+
+    l32i    a4, a4, 0           # perform the actual read
+    ssa8l   a3                  # sar is now the shift to extract a3's byte
+    srl     a3, a4              # shift right by the correct distance
+    extui   a4, a3, 0, 8        # mask off the bits we need for an l8
+
+.LSE_post_fetch:
+    /* We jump back here after either the L8UI or the L16*I routines do the
+     * necessary work to read the value from memory.
+     * At this point, a2 holds the faulting instruction and a4 holds the
+     * correctly read value.
+     *
+     * Restore the original SAR value (saved in a0) and update EPC so we
+     * return to the instruction following the one we just emulated. */
+
+    /* Note: Instructions are in this order to avoid pipeline stalls. */
+    rsr     a3, epc1
+    wsr     a0, sar
+    addi    a3, a3, 0x3
+    wsr     a3, epc1
+
+    /* Stupid opcode tricks: The jumptable we use later on needs 16 bytes
+     * per entry (so we can avoid a second jump by just doing an RFE inside
+     * each entry). Unfortunately, Xtensa doesn't have an addx16 operation
+     * to make that easy for us. Luckily, all of the faulting opcodes we
+     * process are guaranteed to have bit 3 be zero, which means that if we
+     * just shift the register bits of the opcode down by 3 instead of 4,
+     * we get the register number multiplied by 2. This, combined with an
+     * addx8, gives us an effective addx16 without needing any extra shift
+     * operations. */
+    extui   a2, a2, 3, 5        # a2 is now the destination register 0-15 times 2
+
+    bgei    a2, 10, .LSE_assign_reg     # a5..a15 use the jumptable
+    beqi    a2, 2, .LSE_assign_a1       # a1 uses a special routine
+
+    /* We're storing into a0 or a2..a4, which are all saved in our "stack"
+     * area. Calculate the correct address and stick the value in there,
+     * then just do our normal restore and RFE (no jumps required, which
+     * actually makes a0..a4 substantially faster). */
+    addx2   a2, a2, sp
+    s32i    a4, a2, 0
+
+    /* Restore all registers and return. */
+    l32i    a0, sp, 0
+    l32i    a2, sp, 0x08
+    l32i    a3, sp, 0x0c
+    l32i    a4, sp, 0x10
+    rsr     a1, excsave1        # restore a1 from excsave1
+    rfe
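+
+    /*
+     * The jumptable dispatch prepared above corresponds to this C sketch
+     * (illustrative only, the name is made up):
+     *
+     *   uintptr_t lse_entry(uintptr_t table_base, uint32_t insn)
+     *   {
+     *       uint32_t reg_x2 = (insn >> 3) & 0x1f;  // extui a2, a2, 3, 5
+     *       return table_base + reg_x2 * 8;        // addx8 a2, a2, a3
+     *   }
+     *
+     * Because bit 3 of the masked opcodes is always 0, reg_x2 is the
+     * target register number times 2, so reg_x2 * 8 selects one 16-byte
+     * jumptable entry per register.
+     */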
+
+.LSE_assign_reg:
+    /* At this point, a2 contains the register number times 2 and a4 is the
+     * read value. */
+
+    /* Calculate the jumptable address, and restore all registers except a2
+     * and a4 so we have less to do after jumping. */
+    /* Note: Instructions are in this order to avoid pipeline stalls. */
+    movi    a3, .LSE_jumptable_base
+    l32i    a0, sp, 0
+    addx8   a2, a2, a3          # a2 is now the address to jump to
+    l32i    a3, sp, 0x0c
+
+    jx      a2
+
+    .balign 4
+.LSE_check_l16:
+    /* At this point, a2 contains the opcode and a3 is the masked opcode. */
+    movi    a4, 0x001002        # l16si or l16ui opcode after masking
+    bne     a3, a4, .LSE_wrong_opcode
+
+    /* Note: At this point, the opcode could be one of two things:
+     *   xx1xx2 (L16UI)
+     *   xx9xx2 (L16SI)
+     * Both of these we can handle. */
+
+    movi    a4, ~3
+    rsr     a3, excvaddr        # read the faulting address
+    and     a4, a3, a4          # a4 is now the word-aligned read address
+
+    l32i    a4, a4, 0           # perform the actual read
+    ssa8l   a3                  # sar is now the shift to extract a3's bytes
+    srl     a3, a4              # shift right by the correct distance
+    extui   a4, a3, 0, 16       # mask off the bits we need for an l16
+
+    bbci    a2, 15, .LSE_post_fetch     # not a signed op
+    bbci    a4, 15, .LSE_post_fetch     # value does not need sign extension
+
+    movi    a3, 0xFFFF0000
+    or      a4, a3, a4          # set the upper 16 sign bits
+    j       .LSE_post_fetch
+
+.LSE_wrong_opcode:
+    /* If we got here, it's not an opcode we can try to fix, so bomb out.
+     * Restore the registers so that any dump the fatal exception routine
+     * produces will have correct values. */
+    wsr     a0, sar
+    l32i    a0, sp, 0
+    /*l32i    a2, sp, 0x08*/
+    l32i    a3, sp, 0x0c
+    l32i    a4, sp, 0x10
+    rsr     a1, excsave1
+    mov     a2, a1
+    movi    a3, 0
+    call0   fatal_exception_handler
+
+    .balign 4
+.LSE_assign_a1:
+    /* a1 is saved in excsave1, so just update that with the value. */
+    wsr     a4, excsave1
+    /* Then restore all registers and return. */
+    l32i    a0, sp, 0
+    l32i    a2, sp, 0x08
+    l32i    a3, sp, 0x0c
+    l32i    a4, sp, 0x10
+    rsr     a1, excsave1
+    rfe
+
+    .balign 4
+.LSE_jumptable:
+    /* The first 5 entries (80 bytes) of this table are unused (registers
+     * a0..a4 are handled separately above). Rather than waste that space,
+     * we just pretend that the table starts 80 bytes earlier in memory. */
+    .set    .LSE_jumptable_base, .LSE_jumptable - (16 * 5)
+
+    .org    .LSE_jumptable_base + (16 * 5)
+    mov     a5, a4
+    l32i    a2, sp, 0x08
+    l32i    a4, sp, 0x10
+    rsr     a1, excsave1
+    rfe
+
+    .org    .LSE_jumptable_base + (16 * 6)
+    mov     a6, a4
+    l32i    a2, sp, 0x08
+    l32i    a4, sp, 0x10
+    rsr     a1, excsave1
+    rfe
+
+    .org    .LSE_jumptable_base + (16 * 7)
+    mov     a7, a4
+    l32i    a2, sp, 0x08
+    l32i    a4, sp, 0x10
+    rsr     a1, excsave1
+    rfe
+
+    .org    .LSE_jumptable_base + (16 * 8)
+    mov     a8, a4
+    l32i    a2, sp, 0x08
+    l32i    a4, sp, 0x10
+    rsr     a1, excsave1
+    rfe
+
+    .org    .LSE_jumptable_base + (16 * 9)
+    mov     a9, a4
+    l32i    a2, sp, 0x08
+    l32i    a4, sp, 0x10
+    rsr     a1, excsave1
+    rfe
+
+    .org    .LSE_jumptable_base + (16 * 10)
+    mov     a10, a4
+    l32i    a2, sp, 0x08
+    l32i    a4, sp, 0x10
+    rsr     a1, excsave1
+    rfe
+
+    .org    .LSE_jumptable_base + (16 * 11)
+    mov     a11, a4
+    l32i    a2, sp, 0x08
+    l32i    a4, sp, 0x10
+    rsr     a1, excsave1
+    rfe
+
+    .org    .LSE_jumptable_base + (16 * 12)
+    mov     a12, a4
+    l32i    a2, sp, 0x08
+    l32i    a4, sp, 0x10
+    rsr     a1, excsave1
+    rfe
+
+    .org    .LSE_jumptable_base + (16 * 13)
+    mov     a13, a4
+    l32i    a2, sp, 0x08
+    l32i    a4, sp, 0x10
+    rsr     a1, excsave1
+    rfe
+
+    .org    .LSE_jumptable_base + (16 * 14)
+    mov     a14, a4
+    l32i    a2, sp, 0x08
+    l32i    a4, sp, 0x10
+    rsr     a1, excsave1
+    rfe
+
+    .org    .LSE_jumptable_base + (16 * 15)
+    mov     a15, a4
+    l32i    a2, sp, 0x08
+    l32i    a4, sp, 0x10
+    rsr     a1, excsave1
+    rfe
+
+/*************************** LoadStoreError Handler END **********************/
+#endif
+
 /*
 --------------------------------------------------------------------------------
   Insert some waypoints for jumping beyond the signed 8-bit range of
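For completeness, the 16-bit path of the handler (.LSE_check_l16 above), including its sign-extension step for l16si, maps to the following C sketch. It is illustrative only; the helper name is made up and a little-endian core is assumed, mirroring the ssa8l/srl/extui sequence in the assembly.

    #include <stdint.h>

    static int32_t irom_read_16(const void *addr, int is_signed)
    {
        uintptr_t a = (uintptr_t)addr;
        /* only aligned 32-bit reads are legal on IROM */
        uint32_t word = *(const uint32_t *)(a & ~(uintptr_t)3);
        /* extract the addressed halfword (extui ..., 0, 16) */
        uint32_t half = (word >> ((a & 3) * 8)) & 0xffffu;
        if (is_signed && (half & 0x8000u)) {
            half |= 0xffff0000u;   /* movi a3, 0xFFFF0000; or a4, a3, a4 */
        }
        return (int32_t)half;
    }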