Skip to content

Commit

Permalink
New out-of-order vlsu for better vector performance (#2944)
Browse files Browse the repository at this point in the history
Support out-of-order non-Segment Unit-Stride load/store instructions
Support out-of-order non-Segment Stride load/store instructions
Support out-of-order non-Segment Ordered/Unordered Index load/store instructions
Use the LSQ to ensure the memory access order of ordered index instructions
Use an FSM to implement Segment Load/Store instructions, which ensures the segment access order

TODO: Except for Segment ordered index instructions, the other segment instructions can execute out-of-order and do not need an FSM to ensure memory access order.
  • Loading branch information
Tang-Haojin committed May 21, 2024
2 parents 2316cea + c11f007 commit 9f2c7f7
Show file tree
Hide file tree
Showing 72 changed files with 4,164 additions and 3,331 deletions.
2 changes: 1 addition & 1 deletion difftest
22 changes: 16 additions & 6 deletions src/main/scala/top/Configs.scala
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ class MinimalConfig(n: Int = 1) extends Config(
RobCommitWidth = 8,
FetchWidth = 4,
VirtualLoadQueueSize = 24,
LoadQueueRARSize = 16,
LoadQueueRARSize = 24,
LoadQueueRAWSize = 12,
LoadQueueReplaySize = 24,
LoadUncacheBufferSize = 8,
Expand All @@ -77,14 +77,20 @@ class MinimalConfig(n: Int = 1) extends Config(
StoreQueueSize = 20,
StoreQueueNWriteBanks = 4, // NOTE: make sure that StoreQueueSize is divided by StoreQueueNWriteBanks
StoreQueueForwardWithMask = true,
// ============ VLSU ============
VlMergeBufferSize = 8,
VsMergeBufferSize = 8,
UopWritebackWidth = 2,
SplitBufferSize = 8,
// ==============================
RobSize = 48,
RabSize = 96,
FtqSize = 8,
IBufSize = 24,
IBufNBank = 6,
StoreBufferSize = 4,
StoreBufferThreshold = 3,
IssueQueueSize = 8,
IssueQueueSize = 10,
IssueQueueCompEntrySize = 4,
dpParams = DispatchParameters(
IntDqSize = 12,
Expand Down Expand Up @@ -141,28 +147,32 @@ class MinimalConfig(n: Int = 1) extends Config(
NWays = 4,
partialStaticPMP = true,
outsideRecvFlush = true,
outReplace = false
outReplace = false,
lgMaxSize = 4
),
sttlbParameters = TLBParameters(
name = "sttlb",
NWays = 4,
partialStaticPMP = true,
outsideRecvFlush = true,
outReplace = false
outReplace = false,
lgMaxSize = 4
),
hytlbParameters = TLBParameters(
name = "hytlb",
NWays = 4,
partialStaticPMP = true,
outsideRecvFlush = true,
outReplace = false
outReplace = false,
lgMaxSize = 4
),
pftlbParameters = TLBParameters(
name = "pftlb",
NWays = 4,
partialStaticPMP = true,
outsideRecvFlush = true,
outReplace = false
outReplace = false,
lgMaxSize = 4
),
btlbParameters = TLBParameters(
name = "btlb",
Expand Down
9 changes: 5 additions & 4 deletions src/main/scala/xiangshan/Bundle.scala
Original file line number Diff line number Diff line change
Expand Up @@ -406,19 +406,20 @@ class SnapshotPort(implicit p: Parameters) extends XSBundle {
val flushVec = Vec(RenameSnapshotNum, Bool())
}

class RSFeedback(implicit p: Parameters) extends XSBundle {
class RSFeedback(isVector: Boolean = false)(implicit p: Parameters) extends XSBundle {
val robIdx = new RobPtr
val hit = Bool()
val flushState = Bool()
val sourceType = RSFeedbackType()
val dataInvalidSqIdx = new SqPtr
val uopIdx = OptionWrapper(isVector, UopIdx())
}

class MemRSFeedbackIO(implicit p: Parameters) extends XSBundle {
class MemRSFeedbackIO(isVector: Boolean = false)(implicit p: Parameters) extends XSBundle {
// Note: you need to update in implicit Parameters p before imp MemRSFeedbackIO
// for instance: MemRSFeedbackIO()(updateP)
val feedbackSlow = ValidIO(new RSFeedback()) // dcache miss queue full, dtlb miss
val feedbackFast = ValidIO(new RSFeedback()) // bank conflict
val feedbackSlow = ValidIO(new RSFeedback(isVector)) // dcache miss queue full, dtlb miss
val feedbackFast = ValidIO(new RSFeedback(isVector)) // bank conflict
}

class LoadCancelIO(implicit p: Parameters) extends XSBundle {
Expand Down
65 changes: 48 additions & 17 deletions src/main/scala/xiangshan/Parameters.scala
Original file line number Diff line number Diff line change
Expand Up @@ -199,17 +199,19 @@ case class XSCoreParameters
VecMemSrcInWidth: Int = 2,
VecMemInstWbWidth: Int = 1,
VecMemDispatchWidth: Int = 1,
VecMemDispatchMaxNumber: Int = 16,
StoreBufferSize: Int = 16,
StoreBufferThreshold: Int = 7,
EnsbufferWidth: Int = 2,
LoadDependencyWidth: Int = 2,
// ============ VLSU ============
UsQueueSize: Int = 8,
VlFlowSize: Int = 32,
VlUopSize: Int = 32,
VsFlowL1Size: Int = 128,
VsFlowL2Size: Int = 32,
VsUopSize: Int = 32,
VlMergeBufferSize: Int = 16,
VsMergeBufferSize: Int = 16,
UopWritebackWidth: Int = 2,
VLUopWritebackWidth: Int = 2,
VSUopWritebackWidth: Int = 1,
SplitBufferSize: Int = 8,
VSegmentBufferSize: Int = 8,
// ==============================
UncacheBufferSize: Int = 4,
EnableLoadToLoadForward: Boolean = false,
Expand Down Expand Up @@ -252,31 +254,35 @@ case class XSCoreParameters
outReplace = false,
partialStaticPMP = true,
outsideRecvFlush = true,
saveLevel = true
saveLevel = true,
lgMaxSize = 4
),
sttlbParameters: TLBParameters = TLBParameters(
name = "sttlb",
NWays = 48,
outReplace = false,
partialStaticPMP = true,
outsideRecvFlush = true,
saveLevel = true
saveLevel = true,
lgMaxSize = 4
),
hytlbParameters: TLBParameters = TLBParameters(
name = "hytlb",
NWays = 48,
outReplace = false,
partialStaticPMP = true,
outsideRecvFlush = true,
saveLevel = true
saveLevel = true,
lgMaxSize = 4
),
pftlbParameters: TLBParameters = TLBParameters(
name = "pftlb",
NWays = 48,
outReplace = false,
partialStaticPMP = true,
outsideRecvFlush = true,
saveLevel = true
saveLevel = true,
lgMaxSize = 4
),
l2ToL1tlbParameters: TLBParameters = TLBParameters(
name = "l2tlb",
Expand Down Expand Up @@ -325,6 +331,16 @@ case class XSCoreParameters
){
def vlWidth = log2Up(VLEN) + 1

/**
* the minimum element length of vector elements
*/
val minVecElen: Int = 8

/**
* the maximum number of elements in vector register
*/
val maxElemPerVreg: Int = VLEN / minVecElen

val allHistLens = SCHistLens ++ ITTageTableInfos.map(_._2) ++ TageTableInfos.map(_._2) :+ UbtbGHRLength
val HistoryLength = allHistLens.max + numBr * FtqSize + 9 // 256 for the predictor configs now

Expand Down Expand Up @@ -425,7 +441,10 @@ case class XSCoreParameters
ExeUnitParams("LDU2", Seq(LduCfg), Seq(IntWB(7, 0), FpWB(7, 0)), Seq(Seq(IntRD(14, 0))), true, 2),
), numEntries = IssueQueueSize, numEnq = 2, numComp = IssueQueueCompEntrySize),
IssueBlockParams(Seq(
ExeUnitParams("VLSU0", Seq(VlduCfg, VstuCfg), Seq(VfWB(0, 0)), Seq(Seq(VfRD(10, 0)), Seq(VfRD(11, 0)), Seq(VfRD(12, 0)), Seq(VfRD(13, 0)), Seq(VfRD(14, 0)))),
ExeUnitParams("VLSU0", Seq(VlduCfg, VstuCfg, VseglduSeg, VsegstuCfg), Seq(VfWB(0, 0)), Seq(Seq(VfRD(10, 0)), Seq(VfRD(11, 0)), Seq(VfRD(12, 0)), Seq(VfRD(13, 0)), Seq(VfRD(14, 0)))),
), numEntries = IssueQueueSize, numEnq = 2, numComp = IssueQueueCompEntrySize),
IssueBlockParams(Seq(
ExeUnitParams("VLSU1", Seq(VlduCfg, VstuCfg), Seq(VfWB(8, 0)), Seq(Seq(VfRD(15, 0)), Seq(VfRD(16, 0)), Seq(VfRD(17, 0)), Seq(VfRD(18, 0)), Seq(VfRD(19, 0)))),
), numEntries = IssueQueueSize, numEnq = 2, numComp = IssueQueueCompEntrySize),
IssueBlockParams(Seq(
ExeUnitParams("STD0", Seq(StdCfg, MoudCfg), Seq(), Seq(Seq(IntRD(10, 1), FpRD(14, 0)))),
Expand Down Expand Up @@ -638,6 +657,16 @@ trait HasXSParameter {
def RobSize = coreParams.RobSize
def RabSize = coreParams.RabSize
def VTypeBufferSize = coreParams.VTypeBufferSize
/**
* the minimum element length of vector elements
*/
def minVecElen: Int = coreParams.minVecElen

/**
* the maximum number of elements in vector register
*/
def maxElemPerVreg: Int = coreParams.maxElemPerVreg

def IntRefCounterWidth = log2Ceil(RobSize)
def LSQEnqWidth = coreParams.dpParams.LsDqDeqWidth
def LSQLdEnqWidth = LSQEnqWidth min backendParams.numLoadDp
Expand Down Expand Up @@ -668,16 +697,18 @@ trait HasXSParameter {
def VecMemSrcInWidth = coreParams.VecMemSrcInWidth
def VecMemInstWbWidth = coreParams.VecMemInstWbWidth
def VecMemDispatchWidth = coreParams.VecMemDispatchWidth
def VecMemDispatchMaxNumber = coreParams.VecMemDispatchMaxNumber
def StoreBufferSize = coreParams.StoreBufferSize
def StoreBufferThreshold = coreParams.StoreBufferThreshold
def EnsbufferWidth = coreParams.EnsbufferWidth
def LoadDependencyWidth = coreParams.LoadDependencyWidth
def UsQueueSize = coreParams.UsQueueSize
def VlFlowSize = coreParams.VlFlowSize
def VlUopSize = coreParams.VlUopSize
def VsFlowL1Size = coreParams.VsFlowL1Size
def VsFlowL2Size = coreParams.VsFlowL2Size
def VsUopSize = coreParams.VsUopSize
def VlMergeBufferSize = coreParams.VlMergeBufferSize
def VsMergeBufferSize = coreParams.VsMergeBufferSize
def UopWritebackWidth = coreParams.UopWritebackWidth
def VLUopWritebackWidth = coreParams.VLUopWritebackWidth
def VSUopWritebackWidth = coreParams.VSUopWritebackWidth
def SplitBufferSize = coreParams.SplitBufferSize
def VSegmentBufferSize = coreParams.VSegmentBufferSize
def UncacheBufferSize = coreParams.UncacheBufferSize
def EnableLoadToLoadForward = coreParams.EnableLoadToLoadForward
def EnableFastForward = coreParams.EnableFastForward
Expand Down
3 changes: 3 additions & 0 deletions src/main/scala/xiangshan/XSCore.scala
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,8 @@ class XSCoreImp(outer: XSCoreBase) extends LazyModuleImp(outer)
backend.io.mem.ldaIqFeedback <> memBlock.io.mem_to_ooo.ldaIqFeedback
backend.io.mem.staIqFeedback <> memBlock.io.mem_to_ooo.staIqFeedback
backend.io.mem.hyuIqFeedback <> memBlock.io.mem_to_ooo.hyuIqFeedback
backend.io.mem.vstuIqFeedback <> memBlock.io.mem_to_ooo.vstuIqFeedback
backend.io.mem.vlduIqFeedback <> memBlock.io.mem_to_ooo.vlduIqFeedback
backend.io.mem.ldCancel <> memBlock.io.mem_to_ooo.ldCancel
backend.io.mem.wakeup <> memBlock.io.mem_to_ooo.wakeup
backend.io.mem.writebackLda <> memBlock.io.mem_to_ooo.writebackLda
Expand Down Expand Up @@ -203,6 +205,7 @@ class XSCoreImp(outer: XSCoreBase) extends LazyModuleImp(outer)
memBlock.io.ooo_to_mem.lsqio.scommit := backend.io.mem.robLsqIO.scommit
memBlock.io.ooo_to_mem.lsqio.pendingld := backend.io.mem.robLsqIO.pendingld
memBlock.io.ooo_to_mem.lsqio.pendingst := backend.io.mem.robLsqIO.pendingst
memBlock.io.ooo_to_mem.lsqio.pendingVst := backend.io.mem.robLsqIO.pendingVst
memBlock.io.ooo_to_mem.lsqio.commit := backend.io.mem.robLsqIO.commit
memBlock.io.ooo_to_mem.lsqio.pendingPtr := backend.io.mem.robLsqIO.pendingPtr
memBlock.io.ooo_to_mem.lsqio.pendingPtrNext := backend.io.mem.robLsqIO.pendingPtrNext
Expand Down
30 changes: 30 additions & 0 deletions src/main/scala/xiangshan/backend/Backend.scala
Original file line number Diff line number Diff line change
Expand Up @@ -205,6 +205,8 @@ class BackendImp(override val wrapper: Backend)(implicit p: Parameters) extends
private val og1CancelOH: UInt = dataPath.io.og1CancelOH
private val og0CancelOH: UInt = dataPath.io.og0CancelOH
private val cancelToBusyTable = dataPath.io.cancelToBusyTable
private val vlIsZero = intExuBlock.io.vlIsZero.get
private val vlIsVlmax = intExuBlock.io.vlIsVlmax.get

ctrlBlock.io.IQValidNumVec := intScheduler.io.IQValidNumVec
ctrlBlock.io.fromTop.hartId := io.fromTop.hartId
Expand Down Expand Up @@ -241,6 +243,8 @@ class BackendImp(override val wrapper: Backend)(implicit p: Parameters) extends
intScheduler.io.fromDataPath.og1Cancel := og1CancelOH
intScheduler.io.ldCancel := io.mem.ldCancel
intScheduler.io.fromDataPath.cancelToBusyTable := cancelToBusyTable
intScheduler.io.vlWriteBack.vlIsZero := false.B
intScheduler.io.vlWriteBack.vlIsVlmax := false.B

fpScheduler.io.fromTop.hartId := io.fromTop.hartId
fpScheduler.io.fromCtrlBlock.flush := ctrlBlock.io.toIssueBlock.flush
Expand All @@ -255,6 +259,8 @@ class BackendImp(override val wrapper: Backend)(implicit p: Parameters) extends
fpScheduler.io.fromDataPath.og1Cancel := og1CancelOH
fpScheduler.io.ldCancel := io.mem.ldCancel
fpScheduler.io.fromDataPath.cancelToBusyTable := cancelToBusyTable
fpScheduler.io.vlWriteBack.vlIsZero := false.B
fpScheduler.io.vlWriteBack.vlIsVlmax := false.B

memScheduler.io.fromTop.hartId := io.fromTop.hartId
memScheduler.io.fromCtrlBlock.flush := ctrlBlock.io.toIssueBlock.flush
Expand All @@ -281,11 +287,15 @@ class BackendImp(override val wrapper: Backend)(implicit p: Parameters) extends
memScheduler.io.fromMem.get.ldaFeedback := io.mem.ldaIqFeedback
memScheduler.io.fromMem.get.staFeedback := io.mem.staIqFeedback
memScheduler.io.fromMem.get.hyuFeedback := io.mem.hyuIqFeedback
memScheduler.io.fromMem.get.vstuFeedback := io.mem.vstuIqFeedback
memScheduler.io.fromMem.get.vlduFeedback := io.mem.vlduIqFeedback
memScheduler.io.fromSchedulers.wakeupVec.foreach { wakeup => wakeup := iqWakeUpMappedBundle(wakeup.bits.exuIdx) }
memScheduler.io.fromDataPath.og0Cancel := og0CancelOH
memScheduler.io.fromDataPath.og1Cancel := og1CancelOH
memScheduler.io.ldCancel := io.mem.ldCancel
memScheduler.io.fromDataPath.cancelToBusyTable := cancelToBusyTable
memScheduler.io.vlWriteBack.vlIsZero := vlIsZero
memScheduler.io.vlWriteBack.vlIsVlmax := vlIsVlmax

vfScheduler.io.fromTop.hartId := io.fromTop.hartId
vfScheduler.io.fromCtrlBlock.flush := ctrlBlock.io.toIssueBlock.flush
Expand All @@ -300,6 +310,8 @@ class BackendImp(override val wrapper: Backend)(implicit p: Parameters) extends
vfScheduler.io.fromDataPath.og1Cancel := og1CancelOH
vfScheduler.io.ldCancel := io.mem.ldCancel
vfScheduler.io.fromDataPath.cancelToBusyTable := cancelToBusyTable
vfScheduler.io.vlWriteBack.vlIsZero := vlIsZero
vfScheduler.io.vlWriteBack.vlIsVlmax := vlIsVlmax
vfScheduler.io.fromOg2.get := og2ForVector.io.toVfIQ

dataPath.io.hartId := io.fromTop.hartId
Expand Down Expand Up @@ -487,7 +499,9 @@ class BackendImp(override val wrapper: Backend)(implicit p: Parameters) extends
// to mem
private val memIssueParams = params.memSchdParams.get.issueBlockParams
private val memExuBlocksHasLDU = memIssueParams.map(_.exuBlockParams.map(x => x.hasLoadFu || x.hasHyldaFu))
private val memExuBlocksHasVecLoad = memIssueParams.map(_.exuBlockParams.map(x => x.hasVLoadFu))
println(s"[Backend] memExuBlocksHasLDU: $memExuBlocksHasLDU")
println(s"[Backend] memExuBlocksHasVecLoad: $memExuBlocksHasVecLoad")

private val toMem = Wire(bypassNetwork.io.toExus.mem.cloneType)
for (i <- toMem.indices) {
Expand Down Expand Up @@ -523,6 +537,18 @@ class BackendImp(override val wrapper: Backend)(implicit p: Parameters) extends
memScheduler.io.memAddrIssueResp(i)(j).bits.robIdx := toMem(i)(j).bits.robIdx
memScheduler.io.memAddrIssueResp(i)(j).bits.resp := RespType.success // for load inst, firing at toMem means issuing successfully
}

if (memScheduler.io.vecLoadIssueResp(i).nonEmpty && memExuBlocksHasVecLoad(i)(j)) {
memScheduler.io.vecLoadIssueResp(i)(j) match {
case resp =>
resp.valid := toMem(i)(j).fire && LSUOpType.isVecLd(toMem(i)(j).bits.fuOpType)
resp.bits.fuType := toMem(i)(j).bits.fuType
resp.bits.robIdx := toMem(i)(j).bits.robIdx
resp.bits.uopIdx.get := toMem(i)(j).bits.vpu.get.vuopIdx
resp.bits.resp := RespType.success
}
dontTouch(memScheduler.io.vecLoadIssueResp(i)(j))
}
}
}

Expand Down Expand Up @@ -558,6 +584,8 @@ class BackendImp(override val wrapper: Backend)(implicit p: Parameters) extends
sink.bits.uop.debugInfo := source.bits.perfDebugInfo
sink.bits.uop.vpu := source.bits.vpu.getOrElse(0.U.asTypeOf(new VPUCtrlSignals))
sink.bits.uop.preDecodeInfo := source.bits.preDecode.getOrElse(0.U.asTypeOf(new PreDecodeInfo))
sink.bits.uop.numLsElem := source.bits.numLsElem.getOrElse(0.U) // Todo: remove this bundle, keep only the one below
sink.bits.flowNum.foreach(_ := source.bits.numLsElem.get)
}
io.mem.loadFastMatch := memScheduler.io.toMem.get.loadFastMatch.map(_.fastMatch)
io.mem.loadFastImm := memScheduler.io.toMem.get.loadFastMatch.map(_.fastImm)
Expand Down Expand Up @@ -627,6 +655,8 @@ class BackendMemIO(implicit p: Parameters, params: BackendParams) extends XSBund
val ldaIqFeedback = Vec(params.LduCnt, Flipped(new MemRSFeedbackIO))
val staIqFeedback = Vec(params.StaCnt, Flipped(new MemRSFeedbackIO))
val hyuIqFeedback = Vec(params.HyuCnt, Flipped(new MemRSFeedbackIO))
val vstuIqFeedback = Flipped(Vec(params.VstuCnt, new MemRSFeedbackIO(isVector = true)))
val vlduIqFeedback = Flipped(Vec(params.VlduCnt, new MemRSFeedbackIO(isVector = true)))
val ldCancel = Vec(params.LdExuCnt, Flipped(new LoadCancelIO))
val wakeup = Vec(params.LdExuCnt, Flipped(Valid(new DynInst)))
val loadPcRead = Vec(params.LduCnt, Output(UInt(VAddrBits.W)))
Expand Down
Loading

0 comments on commit 9f2c7f7

Please sign in to comment.