Skip to content

Commit

Permalink
memblock: add rest clockgate of reg (#3017)
Browse files Browse the repository at this point in the history
Co-authored-by: cai luoshan <cailuoshan@node005.bosccluster.com>
Co-authored-by: Cai Luoshan <cailuoshan18@mails.ucas.ac.cn>
Co-authored-by: good-circle <fenghaoyuan19@mails.ucas.ac.cn>
Co-authored-by: Ma-YX <71326427+Ma-YX@users.noreply.github.com>
Co-authored-by: Ma-YX <mayuexiao19@mails.ucas.ac.cn>
Co-authored-by: CharlieLiu <67408162+bosscharlie@users.noreply.github.com>
  • Loading branch information
7 people committed Jun 16, 2024
1 parent 0d257fb commit 5adc482
Show file tree
Hide file tree
Showing 22 changed files with 289 additions and 255 deletions.
18 changes: 10 additions & 8 deletions src/main/scala/xiangshan/backend/MemBlock.scala
Original file line number Diff line number Diff line change
Expand Up @@ -348,10 +348,10 @@ class MemBlockImp(outer: MemBlock) extends LazyModuleImp(outer)
val prefetcherOpt: Option[BasePrefecher] = coreParams.prefetcher.map {
case _: SMSParams =>
val sms = Module(new SMSPrefetcher())
sms.io_agt_en := RegNextN(io.ooo_to_mem.csrCtrl.l1D_pf_enable_agt, 2, Some(false.B))
sms.io_pht_en := RegNextN(io.ooo_to_mem.csrCtrl.l1D_pf_enable_pht, 2, Some(false.B))
sms.io_act_threshold := RegNextN(io.ooo_to_mem.csrCtrl.l1D_pf_active_threshold, 2, Some(12.U))
sms.io_act_stride := RegNextN(io.ooo_to_mem.csrCtrl.l1D_pf_active_stride, 2, Some(30.U))
sms.io_agt_en := GatedRegNextN(io.ooo_to_mem.csrCtrl.l1D_pf_enable_agt, 2, Some(false.B))
sms.io_pht_en := GatedRegNextN(io.ooo_to_mem.csrCtrl.l1D_pf_enable_pht, 2, Some(false.B))
sms.io_act_threshold := GatedRegNextN(io.ooo_to_mem.csrCtrl.l1D_pf_active_threshold, 2, Some(12.U))
sms.io_act_stride := GatedRegNextN(io.ooo_to_mem.csrCtrl.l1D_pf_active_stride, 2, Some(30.U))
sms.io_stride_en := false.B
sms.io_dcache_evict <> dcache.io.sms_agt_evict_req
sms
Expand Down Expand Up @@ -500,8 +500,8 @@ class MemBlockImp(outer: MemBlock) extends LazyModuleImp(outer)
// load/store prefetch to l2 cache
prefetcherOpt.foreach(sms_pf => {
l1PrefetcherOpt.foreach(l1_pf => {
val sms_pf_to_l2 = ValidIODelay(sms_pf.io.l2_req, 2)
val l1_pf_to_l2 = ValidIODelay(l1_pf.io.l2_req, 2)
val sms_pf_to_l2 = DelayNWithValid(sms_pf.io.l2_req, 2)
val l1_pf_to_l2 = DelayNWithValid(l1_pf.io.l2_req, 2)

outer.l2_pf_sender_opt.get.out.head._1.addr_valid := sms_pf_to_l2.valid || l1_pf_to_l2.valid
outer.l2_pf_sender_opt.get.out.head._1.addr := Mux(l1_pf_to_l2.valid, l1_pf_to_l2.bits.addr, sms_pf_to_l2.bits.addr)
Expand Down Expand Up @@ -613,6 +613,8 @@ class MemBlockImp(outer: MemBlock) extends LazyModuleImp(outer)
ptwio.resp.ready := true.B

val tlbreplay = WireInit(VecInit(Seq.fill(LdExuCnt)(false.B)))
val tlbreplay_reg = GatedValidRegNext(tlbreplay)
val dtlb_ld0_tlbreplay_reg = GatedValidRegNext(dtlb_ld(0).tlbreplay)
dontTouch(tlbreplay)
for (i <- 0 until LdExuCnt) {
tlbreplay(i) := dtlb_ld(0).ptw.req(i).valid && ptw_resp_next.vector(0) && ptw_resp_v &&
Expand Down Expand Up @@ -799,7 +801,7 @@ class MemBlockImp(outer: MemBlock) extends LazyModuleImp(outer)
loadUnits(i).io.l2_hint <> l2_hint
loadUnits(i).io.tlb_hint.id := dtlbRepeater.io.hint.get.req(i).id
loadUnits(i).io.tlb_hint.full := dtlbRepeater.io.hint.get.req(i).full ||
RegNext(tlbreplay(i)) || RegNext(dtlb_ld(0).tlbreplay(i))
tlbreplay_reg(i) || dtlb_ld0_tlbreplay_reg(i)

// passdown to lsq (load s2)
lsq.io.ldu.ldin(i) <> loadUnits(i).io.lsq.ldin
Expand Down Expand Up @@ -875,7 +877,7 @@ class MemBlockImp(outer: MemBlock) extends LazyModuleImp(outer)
// dcache refill req
hybridUnits(i).io.ldu_io.tlb_hint.id := dtlbRepeater.io.hint.get.req(LduCnt + i).id
hybridUnits(i).io.ldu_io.tlb_hint.full := dtlbRepeater.io.hint.get.req(LduCnt + i).full ||
RegNext(tlbreplay(LduCnt + i)) || RegNext(dtlb_ld(0).tlbreplay(LduCnt + i))
tlbreplay_reg(LduCnt + i) || dtlb_ld0_tlbreplay_reg(LduCnt + i)

// dtlb
hybridUnits(i).io.tlb <> dtlb_ld.head.requestor(LduCnt + i)
Expand Down
68 changes: 35 additions & 33 deletions src/main/scala/xiangshan/cache/dcache/DCacheWrapper.scala
Original file line number Diff line number Diff line change
Expand Up @@ -665,7 +665,9 @@ class DcacheToLduForwardIO(implicit p: Parameters) extends DCacheBundle {

forward_D := all_match
for (i <- 0 until VLEN/8) {
forwardData(i) := selected_data(8 * i + 7, 8 * i)
when (all_match) {
forwardData(i) := selected_data(8 * i + 7, 8 * i)
}
}

(forward_D, forwardData)
Expand All @@ -689,7 +691,7 @@ class MissEntryForwardIO(implicit p: Parameters) extends DCacheBundle {

// check if we can forward from mshr or D channel
/** Registered address-match check for a forwarding request.
  *
  * The condition is true when `req_valid` is set, `inflight` is set, and
  * bits [PAddrBits - 1 : blockOffBits] of `req_paddr` equal the same bits of
  * `paddr`; the low `blockOffBits` bits of both addresses are ignored.
  * `inflight` and `paddr` are not parameters: they are read from the
  * enclosing scope. The condition is passed through `RegNext`, so the value
  * returned here is the registered copy, not the combinational one.
  *
  * @param req_valid whether the incoming request is valid
  * @param req_paddr address of the incoming request
  * @return the `RegNext`-registered match condition
  */
def check(req_valid : Bool, req_paddr : UInt) = {
// NOTE(review): the two RegNext lines below are identical apart from the
// trailing TODO comment. This text comes from a diff view, so they are
// presumably the removed/added pair of one hunk rather than two intended
// statements -- confirm against the real file before relying on this body.
RegNext(req_valid && inflight && req_paddr(PAddrBits - 1, blockOffBits) === paddr(PAddrBits - 1, blockOffBits))
RegNext(req_valid && inflight && req_paddr(PAddrBits - 1, blockOffBits) === paddr(PAddrBits - 1, blockOffBits)) // TODO: clock gate(1-bit)
}

def forward(req_valid : Bool, req_paddr : UInt) = {
Expand Down Expand Up @@ -1243,11 +1245,9 @@ class DCacheImp(outer: DCache) extends LazyModuleImp(outer) with HasDCacheParame
//----------------------------------------
// atomics
// atomics not finished yet
// io.lsu.atomic <> atomicsReplayUnit.io.lsu
val atomicResp = RegNext(mainPipe.io.atomic_resp)
io.lsu.atomics.resp.valid := atomicResp.valid && atomicResp.bits.isAMO
io.lsu.atomics.resp.bits := atomicResp.bits

val atomic_resp_valid = mainPipe.io.atomic_resp.valid && mainPipe.io.atomic_resp.bits.isAMO
io.lsu.atomics.resp.valid := RegNext(atomic_resp_valid)
io.lsu.atomics.resp.bits := RegEnable(mainPipe.io.atomic_resp.bits, atomic_resp_valid)
io.lsu.atomics.block_lr := mainPipe.io.block_lr
// atomicsReplayUnit.io.pipe_resp := RegNext(mainPipe.io.atomic_resp)
// atomicsReplayUnit.io.block_lr <> mainPipe.io.block_lr
Expand Down Expand Up @@ -1327,7 +1327,8 @@ class DCacheImp(outer: DCache) extends LazyModuleImp(outer) with HasDCacheParame
bus.e <> missQueue.io.mem_finish
missQueue.io.probe_addr := bus.b.bits.address

missQueue.io.main_pipe_resp := RegNext(mainPipe.io.atomic_resp)
missQueue.io.main_pipe_resp.valid := RegNext(mainPipe.io.atomic_resp.valid)
missQueue.io.main_pipe_resp.bits := RegEnable(mainPipe.io.atomic_resp.bits, mainPipe.io.atomic_resp.valid)

//----------------------------------------
// probe
Expand All @@ -1346,7 +1347,8 @@ class DCacheImp(outer: DCache) extends LazyModuleImp(outer) with HasDCacheParame
// block_decoupled(io.lsu.store.req, mainPipe.io.store_req, refillPipe.io.req.valid)
block_decoupled(io.lsu.store.req, mainPipe.io.store_req, refill_req)

io.lsu.store.replay_resp := RegNext(mainPipe.io.store_replay_resp)
io.lsu.store.replay_resp.valid := RegNext(mainPipe.io.store_replay_resp.valid)
io.lsu.store.replay_resp.bits := RegEnable(mainPipe.io.store_replay_resp.bits, mainPipe.io.store_replay_resp.valid)
io.lsu.store.main_pipe_hit_resp := mainPipe.io.store_hit_resp

mainPipe.io.atomic_req <> io.lsu.atomics.req
Expand Down Expand Up @@ -1378,7 +1380,7 @@ class DCacheImp(outer: DCache) extends LazyModuleImp(outer) with HasDCacheParame
//wb.io.probe_ttob_check_resp <> mainPipe.io.probe_ttob_check_resp

io.lsu.release.valid := RegNext(wb.io.req.fire)
io.lsu.release.bits.paddr := RegNext(wb.io.req.bits.addr)
io.lsu.release.bits.paddr := RegEnable(wb.io.req.bits.addr, wb.io.req.fire)
// Note: RegNext() is required by:
// * load queue released flag update logic
// * load / load violation check logic
Expand Down Expand Up @@ -1529,29 +1531,29 @@ class DCacheImp(outer: DCache) extends LazyModuleImp(outer) with HasDCacheParame
io.mshrFull := missQueue.io.full

// performance counter
// val ld_access = Wire(Vec(LoadPipelineWidth, missQueue.io.debug_early_replace.last.cloneType))
// val st_access = Wire(ld_access.last.cloneType)
// ld_access.zip(ldu).foreach {
// case (a, u) =>
// a.valid := RegNext(u.io.lsu.req.fire()) && !u.io.lsu.s1_kill
// a.bits.idx := RegNext(get_idx(u.io.lsu.req.bits.vaddr))
// a.bits.tag := get_tag(u.io.lsu.s1_paddr_dup_dcache)
// }
// st_access.valid := RegNext(mainPipe.io.store_req.fire())
// st_access.bits.idx := RegNext(get_idx(mainPipe.io.store_req.bits.vaddr))
// st_access.bits.tag := RegNext(get_tag(mainPipe.io.store_req.bits.addr))
// val access_info = ld_access.toSeq ++ Seq(st_access)
// val early_replace = RegNext(missQueue.io.debug_early_replace)
// val access_early_replace = access_info.map {
// case acc =>
// Cat(early_replace.map {
// case r =>
// acc.valid && r.valid &&
// acc.bits.tag === r.bits.tag &&
// acc.bits.idx === r.bits.idx
// })
// }
// XSPerfAccumulate("access_early_replace", PopCount(Cat(access_early_replace)))
// val ld_access = Wire(Vec(LoadPipelineWidth, missQueue.io.debug_early_replace.last.cloneType))
// val st_access = Wire(ld_access.last.cloneType)
// ld_access.zip(ldu).foreach {
// case (a, u) =>
// a.valid := RegNext(u.io.lsu.req.fire) && !u.io.lsu.s1_kill
// a.bits.idx := RegEnable(get_idx(u.io.lsu.req.bits.vaddr), u.io.lsu.req.fire)
// a.bits.tag := get_tag(u.io.lsu.s1_paddr_dup_dcache)
// }
// st_access.valid := RegNext(mainPipe.io.store_req.fire)
// st_access.bits.idx := RegEnable(get_idx(mainPipe.io.store_req.bits.vaddr), mainPipe.io.store_req.fire)
// st_access.bits.tag := RegEnable(get_tag(mainPipe.io.store_req.bits.addr), mainPipe.io.store_req.fire)
// val access_info = ld_access.toSeq ++ Seq(st_access)
// val early_replace = RegNext(missQueue.io.debug_early_replace) // TODO: clock gate
// val access_early_replace = access_info.map {
// case acc =>
// Cat(early_replace.map {
// case r =>
// acc.valid && r.valid &&
// acc.bits.tag === r.bits.tag &&
// acc.bits.idx === r.bits.idx
// })
// }
// XSPerfAccumulate("access_early_replace", PopCount(Cat(access_early_replace)))

val perfEvents = (Seq(wb, mainPipe, missQueue, probeQueue) ++ ldu).flatMap(_.getPerfEvents)
generatePerfEvent()
Expand Down
38 changes: 20 additions & 18 deletions src/main/scala/xiangshan/cache/dcache/data/BankedDataArray.scala
Original file line number Diff line number Diff line change
Expand Up @@ -162,7 +162,7 @@ class DataSRAMBank(index: Int)(implicit p: Parameters) extends DCacheModule {
assert(RegNext(!io.w.en || PopCount(io.w.way_en) <= 1.U))
assert(RegNext(!io.r.en || PopCount(io.r.way_en) <= 1.U))

val r_way_en_reg = RegNext(io.r.way_en)
val r_way_en_reg = RegEnable(io.r.way_en, io.r.en)

// external controls do not read and write at the same time
val w_info = io.w
Expand Down Expand Up @@ -355,15 +355,15 @@ class SramedDataArray(implicit p: Parameters) extends AbstractBankedDataArray {
val line_div_addr = addr_to_dcache_div(io.readline.bits.addr)
// when WPU is enabled, all ways in line_way_en are enabled when reading data
val line_way_en = Fill(DCacheWays, 1.U) // val line_way_en = io.readline.bits.way_en
val line_way_en_reg = RegNext(io.readline.bits.way_en)
val line_way_en_reg = RegEnable(io.readline.bits.way_en, io.readline.valid)

val write_bank_mask_reg = RegNext(io.write.bits.wmask)
val write_data_reg = RegNext(io.write.bits.data)
val write_bank_mask_reg = RegEnable(io.write.bits.wmask, io.write.valid)
val write_data_reg = RegEnable(io.write.bits.data, io.write.valid)
val write_valid_reg = RegNext(io.write.valid)
val write_valid_dup_reg = io.write_dup.map(x => RegNext(x.valid))
val write_wayen_dup_reg = io.write_dup.map(x => RegNext(x.bits.way_en))
val write_set_addr_dup_reg = io.write_dup.map(x => RegNext(addr_to_dcache_div_set(x.bits.addr)))
val write_div_addr_dup_reg = io.write_dup.map(x => RegNext(addr_to_dcache_div(x.bits.addr)))
val write_wayen_dup_reg = io.write_dup.map(x => RegEnable(x.bits.way_en, x.valid))
val write_set_addr_dup_reg = io.write_dup.map(x => RegEnable(addr_to_dcache_div_set(x.bits.addr), x.valid))
val write_div_addr_dup_reg = io.write_dup.map(x => RegEnable(addr_to_dcache_div(x.bits.addr), x.valid))

// read data_banks and ecc_banks
// for single port SRAM, do not allow read and write in the same cycle
Expand Down Expand Up @@ -504,6 +504,7 @@ class SramedDataArray(implicit p: Parameters) extends AbstractBankedDataArray {
XSPerfAccumulate("data_read_counter", PopCount(Cat(data_read_oh)))

// read result: expose banked read result
// TODO: clock gate
val read_result_delayed = RegNext(read_result)
(0 until LoadPipelineWidth).map(i => {
// io.read_resp(i) := read_result(RegNext(bank_addrs(i)))(RegNext(OHToUInt(way_en(i))))
Expand Down Expand Up @@ -728,12 +729,12 @@ class BankedDataArray(implicit p: Parameters) extends AbstractBankedDataArray {
val line_way_en = io.readline.bits.way_en

val write_bank_mask_reg = RegNext(io.write.bits.wmask)
val write_data_reg = RegNext(io.write.bits.data)
val write_data_reg = RegEnable(io.write.bits.data, io.write.valid)
val write_valid_reg = RegNext(io.write.valid)
val write_valid_dup_reg = io.write_dup.map(x => RegNext(x.valid))
val write_wayen_dup_reg = io.write_dup.map(x => RegNext(x.bits.way_en))
val write_set_addr_dup_reg = io.write_dup.map(x => RegNext(addr_to_dcache_div_set(x.bits.addr)))
val write_div_addr_dup_reg = io.write_dup.map(x => RegNext(addr_to_dcache_div(x.bits.addr)))
val write_set_addr_dup_reg = io.write_dup.map(x => RegEnable(addr_to_dcache_div_set(x.bits.addr), x.valid))
val write_div_addr_dup_reg = io.write_dup.map(x => RegEnable(addr_to_dcache_div(x.bits.addr), x.valid))

// read data_banks and ecc_banks
// for single port SRAM, do not allow read and write in the same cycle
Expand All @@ -744,12 +745,12 @@ class BankedDataArray(implicit p: Parameters) extends AbstractBankedDataArray {
bank_addrs(rport_index)(0) := addr_to_dcache_bank(io.read(rport_index).bits.addr)
bank_addrs(rport_index)(1) := Mux(io.is128Req(rport_index), bank_addrs(rport_index)(0) + 1.U, DCacheBanks.asUInt)
set_addrs(rport_index) := addr_to_dcache_div_set(io.read(rport_index).bits.addr)
set_addrs_reg(rport_index) := RegNext(addr_to_dcache_div_set(io.read(rport_index).bits.addr))
set_addrs_reg(rport_index) := RegEnable(addr_to_dcache_div_set(io.read(rport_index).bits.addr), io.read(rport_index).valid)

// use way_en to select a way after the data has been read out
assert(!(RegNext(io.read(rport_index).fire && PopCount(io.read(rport_index).bits.way_en) > 1.U)))
way_en(rport_index) := io.read(rport_index).bits.way_en
way_en_reg(rport_index) := RegNext(io.read(rport_index).bits.way_en)
way_en_reg(rport_index) := RegEnable(io.read(rport_index).bits.way_en, io.read(rport_index).valid)
})

// read each bank, get bank result
Expand Down Expand Up @@ -895,10 +896,11 @@ class BankedDataArray(implicit p: Parameters) extends AbstractBankedDataArray {

val bank_result_delayed = RegNext(bank_result)
(0 until LoadPipelineWidth).map(i => {
val rr_read_fire = RegNext(RegNext(io.read(i).fire))
val rr_div_addr = RegNext(RegNext(div_addrs(i)))
val rr_bank_addr = RegNext(RegNext(bank_addrs(i)))
val rr_way_addr = RegNext(RegNext(OHToUInt(way_en(i))))
val r_read_fire = RegNext(io.read(i).fire)
val rr_read_fire = RegNext(r_read_fire)
val rr_div_addr = RegEnable(RegEnable(div_addrs(i), io.read(i).fire), r_read_fire)
val rr_bank_addr = RegEnable(RegEnable(bank_addrs(i), io.read(i).fire), r_read_fire)
val rr_way_addr = RegEnable(RegEnable(OHToUInt(way_en(i)), io.read(i).fire), r_read_fire)
(0 until VLEN/DCacheSRAMRowBits).map( j =>{
io.read_resp_delayed(i)(j) := bank_result_delayed(rr_div_addr)(rr_bank_addr(j))
// error detection
Expand All @@ -907,7 +909,7 @@ class BankedDataArray(implicit p: Parameters) extends AbstractBankedDataArray {
})

// read result: expose banked read result
io.readline_resp := bank_result(RegNext(line_div_addr))
io.readline_resp := bank_result(RegEnable(line_div_addr, io.readline.valid))
io.readline_error_delayed := RegNext(RegNext(io.readline.fire)) &&
VecInit((0 until DCacheBanks).map(i => io.readline_resp(i).error_delayed)).asUInt.orR

Expand All @@ -931,7 +933,7 @@ class BankedDataArray(implicit p: Parameters) extends AbstractBankedDataArray {
ecc_bank.io.w.req.valid := wen_reg
ecc_bank.io.w.req.bits.apply(
setIdx = write_set_addr_dup_reg(bank_index),
data = RegNext(getECCFromEncWord(cacheParams.dataCode.encode((io.write.bits.data(bank_index))))),
data = RegEnable(getECCFromEncWord(cacheParams.dataCode.encode((io.write.bits.data(bank_index)))), wen_reg),
waymask = write_wayen_dup_reg(bank_index)
)
when(ecc_bank.io.w.req.valid) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ class DuplicatedDataArray(implicit p: Parameters) extends AbstractDataArray {
val rdata = Output(UInt())
})

val r_way_en_reg = RegNext(io.r_way_en)
val r_way_en_reg = RegEnable(io.r_way_en, io.ren)
val data_array = Array.fill(nWays) {
Module(new SRAMTemplate(
Bits(rowBits.W),
Expand Down Expand Up @@ -110,7 +110,7 @@ class DuplicatedDataArray(implicit p: Parameters) extends AbstractDataArray {

// use way_en to select a way after the data has been read out
assert(!(RegNext(io.read(j).fire && PopCount(io.read(j).bits.way_en) > 1.U)))
val way_en = RegNext(io.read(j).bits.way_en)
val way_en = RegEnable(io.read(j).bits.way_en, io.read(j).fire)

val row_error = Wire(Vec(blockRows, Vec(rowWords, Bool())))
for (r <- 0 until blockRows) {
Expand Down Expand Up @@ -163,7 +163,7 @@ class DuplicatedDataArray(implicit p: Parameters) extends AbstractDataArray {
}
})
io.errors(j).bits.report_to_beu := RegNext(io.read(j).fire) && Cat(row_error.flatten).orR
io.errors(j).bits.paddr := RegNext(io.read(j).bits.addr)
io.errors(j).bits.paddr := RegEnable(io.read(j).bits.addr, io.read(j).fire)
}

io.nacks(j) := false.B
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -370,7 +370,7 @@ class LoadPipe(id: Int)(implicit p: Parameters) extends DCacheModule with HasPer
io.miss_req.valid := s2_valid && s2_can_send_miss_req
io.miss_req.bits := DontCare
io.miss_req.bits.source := s2_instrtype
io.miss_req.bits.pf_source := RegNext(RegNext(io.lsu.pf_source))
io.miss_req.bits.pf_source := RegNext(RegNext(io.lsu.pf_source)) // TODO: clock gate
io.miss_req.bits.cmd := s2_req.cmd
io.miss_req.bits.addr := get_block_addr(s2_paddr)
io.miss_req.bits.vaddr := s2_vaddr
Expand Down
Loading

0 comments on commit 5adc482

Please sign in to comment.