From f4c3cc2deccad4923e60275388fdb98114b343d3 Mon Sep 17 00:00:00 2001 From: PJ Date: Wed, 6 Dec 2023 16:51:01 +0100 Subject: [PATCH 01/25] worker: use sector --- worker/upload.go | 188 ++++++++++++++++++++++++++--------------------- 1 file changed, 105 insertions(+), 83 deletions(-) diff --git a/worker/upload.go b/worker/upload.go index 13023087c..142d6a960 100644 --- a/worker/upload.go +++ b/worker/upload.go @@ -177,7 +177,7 @@ type ( upload struct { id api.UploadID mgr *uploadManager - + q allowed map[types.FileContractID]struct{} lockPriority int @@ -197,11 +197,10 @@ type ( mu sync.Mutex numInflight uint64 numLaunched uint64 + numUploaded uint64 lastOverdrive time.Time - overdriving map[int]int - remaining map[int]sectorCtx - sectors []object.Sector + sectors map[int]*sectorUpload errs HostErrorSet } @@ -211,24 +210,30 @@ type ( err error } - sectorCtx struct { + sectorUpload struct { ctx context.Context cancel context.CancelFunc + + sector object.Sector + sectorData *[rhpv2.SectorSize]byte + sectorRoot types.Hash256 + sectorIndex int + + uploaders map[types.FileContractID]struct{} + numOverdrive int } sectorUploadReq struct { upload *upload - - sID slabID - ctx context.Context + sID slabID + sector *sectorUpload overdrive bool - sector *[rhpv2.SectorSize]byte - sectorIndex int responseChan chan sectorUploadResp // set by the uploader performing the upload - hk types.PublicKey + fcid types.FileContractID + hk types.PublicKey } sectorUploadResp struct { @@ -881,12 +886,11 @@ func (mgr *uploadManager) tryRecomputeStats() { } func (u *upload) finishSlabUpload(upload *slabUpload) { - // cleanup contexts - upload.mu.Lock() - for _, shard := range upload.remaining { - shard.cancel() + for _, sector := range upload.sectors { + if sector.sector.Root == (types.Hash256{}) { + sector.cancel() + } } - upload.mu.Unlock() } func (u *upload) newSlabUpload(ctx context.Context, shards [][]byte, mem *acquiredMemory) (*slabUpload, []*sectorUploadReq, chan sectorUploadResp) { @@ -904,33 +908,37 @@ func (u *upload) newSlabUpload(ctx context.Context, shards [][]byte, mem *acquir created: time.Now(), shards: shards, - overdriving: make(map[int]int, len(shards)), - remaining: make(map[int]sectorCtx, len(shards)), - sectors: make([]object.Sector, len(shards)), - errs: make(HostErrorSet), + sectors: make(map[int]*sectorUpload, len(shards)), + errs: make(HostErrorSet), } // prepare sector uploads responseChan := make(chan sectorUploadResp) requests := make([]*sectorUploadReq, len(shards)) for sI, shard := range shards { - // create the sector upload's cancel func - sCtx, cancel := context.WithCancel(ctx) - slab.remaining[sI] = sectorCtx{ctx: sCtx, cancel: cancel} + // create the sector + slab.sectors[sI] = §orUpload{ + sectorIndex: sI, + sectorData: (*[rhpv2.SectorSize]byte)(shard), + sectorRoot: rhpv2.SectorRoot((*[rhpv2.SectorSize]byte)(shard)), - // create the upload's span - sCtx, span := tracing.Tracer.Start(sCtx, "uploadSector") + uploaders: make(map[types.FileContractID]struct{}), + } + + // attach a context + slab.sectors[sI].ctx, slab.sectors[sI].cancel = context.WithCancel(ctx) + + // attach the upload's span + var span trace.Span + slab.sectors[sI].ctx, span = tracing.Tracer.Start(slab.sectors[sI].ctx, "uploadSector") span.SetAttributes(attribute.Bool("overdrive", false)) span.SetAttributes(attribute.Int("sector", sI)) - // create the sector upload + // create the request requests[sI] = §orUploadReq{ - upload: u, - sID: sID, - ctx: sCtx, - - sector: (*[rhpv2.SectorSize]byte)(shard), - sectorIndex: sI, + upload: u, + sID: sID, + sector: slab.sectors[sI], responseChan: responseChan, } } @@ -1006,6 +1014,12 @@ func (u *upload) markUsed(sID slabID, fcid types.FileContractID) { u.used[sID][fcid] = struct{}{} } +func (u *upload) markUnused(sID slabID, fcid types.FileContractID) { + u.mu.Lock() + defer u.mu.Unlock() + delete(u.used[sID], fcid) +} + func (u *upload) uploadShards(ctx context.Context, shards [][]byte, mem *acquiredMemory) ([]object.Sector, error) { // add tracing ctx, span := tracing.Tracer.Start(ctx, "uploadShards") @@ -1109,7 +1123,7 @@ outer: var root types.Hash256 start := time.Now() fcid, _, _ := u.contractInfo() - err := rl.withRevision(req.ctx, defaultRevisionFetchTimeout, fcid, u.hk, u.siamuxAddr, req.upload.lockPriority, u.blockHeight(), func(rev types.FileContractRevision) error { + err := rl.withRevision(req.sector.ctx, defaultRevisionFetchTimeout, fcid, u.hk, u.siamuxAddr, req.upload.lockPriority, u.blockHeight(), func(rev types.FileContractRevision) error { if rev.RevisionNumber == math.MaxUint64 { return errMaxRevisionReached } @@ -1172,17 +1186,17 @@ func (u *uploader) execute(req *sectorUploadReq, rev types.FileContractRevision) u.mu.Unlock() // fetch span from context - span := trace.SpanFromContext(req.ctx) + span := trace.SpanFromContext(req.sector.ctx) span.AddEvent("execute") // update the bus - if err := u.mgr.b.AddUploadingSector(req.ctx, req.upload.id, fcid, rhpv2.SectorRoot(req.sector)); err != nil { + if err := u.mgr.b.AddUploadingSector(req.sector.ctx, req.upload.id, fcid, req.sector.sectorRoot); err != nil { return types.Hash256{}, fmt.Errorf("failed to add uploading sector to contract %v, err: %v", fcid, err) } // upload the sector start := time.Now() - root, err := host.UploadSector(req.ctx, req.sector, rev) + root, err := host.UploadSector(req.sector.ctx, req.sector.sectorData, rev) if err != nil { return types.Hash256{}, err } @@ -1225,18 +1239,21 @@ func (u *uploader) requeue(req *sectorUploadReq) { func (u *uploader) enqueue(req *sectorUploadReq) { // trace the request - span := trace.SpanFromContext(req.ctx) + span := trace.SpanFromContext(req.sector.ctx) span.SetAttributes(attribute.Stringer("hk", u.hk)) span.AddEvent("enqueued") - // set the host key and enqueue the request - u.mu.Lock() + // decorate the request + fcid, _, _ := u.contractInfo() + req.fcid = fcid req.hk = u.hk + + // enqueue the request + u.mu.Lock() u.queue = append(u.queue, req) u.mu.Unlock() // mark as used - fcid, _, _ := u.contractInfo() req.upload.markUsed(req.sID, fcid) // signal there's work @@ -1278,7 +1295,7 @@ func (u *uploader) pop() *sectorUploadReq { func (req *sectorUploadReq) succeed(root types.Hash256, hk types.PublicKey, fcid types.FileContractID) { select { - case <-req.ctx.Done(): + case <-req.sector.ctx.Done(): case req.responseChan <- sectorUploadResp{ fcid: fcid, hk: hk, @@ -1290,7 +1307,7 @@ func (req *sectorUploadReq) succeed(root types.Hash256, hk types.PublicKey, fcid func (req *sectorUploadReq) fail(err error) { select { - case <-req.ctx.Done(): + case <-req.sector.ctx.Done(): case req.responseChan <- sectorUploadResp{ req: req, err: err, @@ -1300,7 +1317,7 @@ func (req *sectorUploadReq) fail(err error) { func (req *sectorUploadReq) done() bool { select { - case <-req.ctx.Done(): + case <-req.sector.ctx.Done(): return true default: return false @@ -1310,22 +1327,24 @@ func (req *sectorUploadReq) done() bool { func (s *slabUpload) uploadSpeed() int64 { s.mu.Lock() defer s.mu.Unlock() - totalShards := len(s.sectors) - completedShards := totalShards - len(s.remaining) - bytes := completedShards * rhpv2.SectorSize + bytes := s.numUploaded * rhpv2.SectorSize ms := time.Since(s.created).Milliseconds() return int64(bytes) / ms } -func (s *slabUpload) finish() ([]object.Sector, error) { +func (s *slabUpload) finish() (sectors []object.Sector, _ error) { s.mu.Lock() defer s.mu.Unlock() - remaining := len(s.remaining) - if remaining > 0 { + if s.numUploaded < uint64(len(s.shards)) { + remaining := uint64(len(s.shards)) - s.numUploaded return nil, fmt.Errorf("failed to upload slab: remaining=%d, inflight=%d, launched=%d uploaders=%d errors=%d %w", remaining, s.numInflight, s.numLaunched, s.mgr.numUploaders(), len(s.errs), s.errs) } - return s.sectors, nil + + for _, sector := range s.sectors { + sectors = append(sectors, sector.sector) + } + return } func (s *slabUpload) inflight() uint64 { @@ -1334,7 +1353,7 @@ func (s *slabUpload) inflight() uint64 { return s.numInflight } -func (s *slabUpload) launch(req *sectorUploadReq) (overdriving bool, err error) { +func (s *slabUpload) launch(req *sectorUploadReq) (interrupt bool, err error) { s.mu.Lock() defer s.mu.Unlock() @@ -1346,8 +1365,8 @@ func (s *slabUpload) launch(req *sectorUploadReq) (overdriving bool, err error) // launch the req err = s.mgr.launch(req) if err != nil { - overdriving = req.overdrive && s.overdriving[req.sectorIndex] > 0 - span := trace.SpanFromContext(req.ctx) + interrupt = !req.overdrive && req.sector.numOverdrive == 0 + span := trace.SpanFromContext(req.sector.ctx) span.RecordError(err) span.End() return @@ -1358,8 +1377,8 @@ func (s *slabUpload) launch(req *sectorUploadReq) (overdriving bool, err error) s.numLaunched++ if req.overdrive { s.lastOverdrive = time.Now() - s.overdriving[req.sectorIndex]++ - overdriving = true + req.sector.numOverdrive++ + req.sector.uploaders[req.fcid] = struct{}{} } return } @@ -1387,7 +1406,8 @@ func (s *slabUpload) overdrive(ctx context.Context, respChan chan sectorUploadRe defer s.mu.Unlock() // overdrive is not kicking in yet - if uint64(len(s.remaining)) >= s.mgr.maxOverdrive { + remaining := uint64(len(s.shards)) - s.numUploaded + if remaining >= s.mgr.maxOverdrive { return false } @@ -1397,7 +1417,7 @@ func (s *slabUpload) overdrive(ctx context.Context, respChan chan sectorUploadRe } // overdrive is maxed out - if s.numInflight-uint64(len(s.remaining)) >= s.mgr.maxOverdrive { + if s.numInflight-remaining >= s.mgr.maxOverdrive { return false } @@ -1426,28 +1446,25 @@ func (s *slabUpload) nextRequest(responseChan chan sectorUploadResp) *sectorUplo s.mu.Lock() defer s.mu.Unlock() - // overdrive the remaining sector with the least number of overdrives - lowestSI := -1 - s.overdriving[lowestSI] = math.MaxInt - for sI := range s.remaining { - if s.overdriving[sI] < s.overdriving[lowestSI] { - lowestSI = sI + // find the sector that's not finished and has the least amount of overdrives + lowestNumOverdrives := math.MaxInt + var nextSector *sectorUpload + for _, sector := range s.sectors { + if sector.sector.Root == (types.Hash256{}) && sector.numOverdrive < lowestNumOverdrives { + nextSector = sector } } - if lowestSI == -1 { + if nextSector == nil { return nil } return §orUploadReq{ upload: s.upload, sID: s.sID, - ctx: s.remaining[lowestSI].ctx, + sector: nextSector, overdrive: true, responseChan: responseChan, - - sectorIndex: lowestSI, - sector: (*[rhpv2.SectorSize]byte)(s.shards[lowestSI]), } } @@ -1475,7 +1492,7 @@ func (s *slabUpload) receive(resp sectorUploadResp) bool { // update the state if resp.req.overdrive { - s.overdriving[resp.req.sectorIndex]-- + resp.req.sector.numOverdrive-- } // failed reqs can't complete the upload @@ -1486,31 +1503,36 @@ func (s *slabUpload) receive(resp sectorUploadResp) bool { } // redundant sectors can't complete the upload - if s.sectors[resp.req.sectorIndex].Root != (types.Hash256{}) { + if resp.req.sector.sector.Root != (types.Hash256{}) { return false } - // store the sector and call cancel on the sector ctx - s.sectors[resp.req.sectorIndex] = object.Sector{ - Contracts: map[types.PublicKey][]types.FileContractID{ - resp.hk: { - resp.fcid, - }, - }, + // store the sector + resp.req.sector.sector = object.Sector{ + Contracts: map[types.PublicKey][]types.FileContractID{resp.hk: {resp.fcid}}, LatestHost: resp.req.hk, Root: resp.root, } - s.remaining[resp.req.sectorIndex].cancel() - // update remaining sectors - delete(s.remaining, resp.req.sectorIndex) + // cancel the sector context + resp.req.sector.cancel() + + // mark uploaders we used for overdrives as unused + for fcid := range resp.req.sector.uploaders { + if fcid != resp.req.fcid { + s.upload.markUnused(resp.req.sID, fcid) + } + } + + // update uploaded sectors + s.numUploaded++ // release memory - resp.req.sector = nil - s.shards[resp.req.sectorIndex] = nil + resp.req.sector.sectorData = nil + s.shards[resp.req.sector.sectorIndex] = nil s.mem.ReleaseSome(rhpv2.SectorSize) - return len(s.remaining) == 0 + return s.numUploaded == uint64(len(s.shards)) } func (sID slabID) String() string { From 2ba21843dad317021153caacfd9dba10c238511d Mon Sep 17 00:00:00 2001 From: PJ Date: Wed, 6 Dec 2023 17:00:53 +0100 Subject: [PATCH 02/25] worker: cleanup --- worker/upload.go | 88 ++++++++++++++++++++++++++++-------------------- 1 file changed, 51 insertions(+), 37 deletions(-) diff --git a/worker/upload.go b/worker/upload.go index 142d6a960..3d4a08198 100644 --- a/worker/upload.go +++ b/worker/upload.go @@ -177,7 +177,7 @@ type ( upload struct { id api.UploadID mgr *uploadManager - q + allowed map[types.FileContractID]struct{} lockPriority int @@ -211,21 +211,20 @@ type ( } sectorUpload struct { + data *[rhpv2.SectorSize]byte + index int + root types.Hash256 + uploaded object.Sector + ctx context.Context cancel context.CancelFunc - sector object.Sector - sectorData *[rhpv2.SectorSize]byte - sectorRoot types.Hash256 - sectorIndex int - - uploaders map[types.FileContractID]struct{} - numOverdrive int + mu sync.Mutex + overdriving map[types.FileContractID]struct{} } sectorUploadReq struct { upload *upload - sID slabID sector *sectorUpload overdrive bool @@ -234,6 +233,9 @@ type ( // set by the uploader performing the upload fcid types.FileContractID hk types.PublicKey + + // used for debugging + sID slabID } sectorUploadResp struct { @@ -887,7 +889,7 @@ func (mgr *uploadManager) tryRecomputeStats() { func (u *upload) finishSlabUpload(upload *slabUpload) { for _, sector := range upload.sectors { - if sector.sector.Root == (types.Hash256{}) { + if sector.uploaded.Root == (types.Hash256{}) { sector.cancel() } } @@ -916,29 +918,31 @@ func (u *upload) newSlabUpload(ctx context.Context, shards [][]byte, mem *acquir responseChan := make(chan sectorUploadResp) requests := make([]*sectorUploadReq, len(shards)) for sI, shard := range shards { - // create the sector - slab.sectors[sI] = §orUpload{ - sectorIndex: sI, - sectorData: (*[rhpv2.SectorSize]byte)(shard), - sectorRoot: rhpv2.SectorRoot((*[rhpv2.SectorSize]byte)(shard)), - - uploaders: make(map[types.FileContractID]struct{}), - } - - // attach a context - slab.sectors[sI].ctx, slab.sectors[sI].cancel = context.WithCancel(ctx) + // create the ctx + sCtx, sCancel := context.WithCancel(ctx) // attach the upload's span - var span trace.Span - slab.sectors[sI].ctx, span = tracing.Tracer.Start(slab.sectors[sI].ctx, "uploadSector") + sCtx, span := tracing.Tracer.Start(sCtx, "uploadSector") span.SetAttributes(attribute.Bool("overdrive", false)) span.SetAttributes(attribute.Int("sector", sI)) + // create the sector + sector := §orUpload{ + data: (*[rhpv2.SectorSize]byte)(shard), + index: sI, + root: rhpv2.SectorRoot((*[rhpv2.SectorSize]byte)(shard)), + + ctx: sCtx, + cancel: sCancel, + overdriving: make(map[types.FileContractID]struct{}), + } + slab.sectors[sI] = sector + // create the request requests[sI] = §orUploadReq{ upload: u, sID: sID, - sector: slab.sectors[sI], + sector: sector, responseChan: responseChan, } } @@ -1190,13 +1194,13 @@ func (u *uploader) execute(req *sectorUploadReq, rev types.FileContractRevision) span.AddEvent("execute") // update the bus - if err := u.mgr.b.AddUploadingSector(req.sector.ctx, req.upload.id, fcid, req.sector.sectorRoot); err != nil { + if err := u.mgr.b.AddUploadingSector(req.sector.ctx, req.upload.id, fcid, req.sector.root); err != nil { return types.Hash256{}, fmt.Errorf("failed to add uploading sector to contract %v, err: %v", fcid, err) } // upload the sector start := time.Now() - root, err := host.UploadSector(req.sector.ctx, req.sector.sectorData, rev) + root, err := host.UploadSector(req.sector.ctx, req.sector.data, rev) if err != nil { return types.Hash256{}, err } @@ -1342,7 +1346,7 @@ func (s *slabUpload) finish() (sectors []object.Sector, _ error) { } for _, sector := range s.sectors { - sectors = append(sectors, sector.sector) + sectors = append(sectors, sector.uploaded) } return } @@ -1365,7 +1369,7 @@ func (s *slabUpload) launch(req *sectorUploadReq) (interrupt bool, err error) { // launch the req err = s.mgr.launch(req) if err != nil { - interrupt = !req.overdrive && req.sector.numOverdrive == 0 + interrupt = !req.overdrive && req.sector.numOverdriving() == 0 span := trace.SpanFromContext(req.sector.ctx) span.RecordError(err) span.End() @@ -1377,8 +1381,10 @@ func (s *slabUpload) launch(req *sectorUploadReq) (interrupt bool, err error) { s.numLaunched++ if req.overdrive { s.lastOverdrive = time.Now() - req.sector.numOverdrive++ - req.sector.uploaders[req.fcid] = struct{}{} + + req.sector.mu.Lock() + req.sector.overdriving[req.fcid] = struct{}{} + req.sector.mu.Unlock() } return } @@ -1450,7 +1456,7 @@ func (s *slabUpload) nextRequest(responseChan chan sectorUploadResp) *sectorUplo lowestNumOverdrives := math.MaxInt var nextSector *sectorUpload for _, sector := range s.sectors { - if sector.sector.Root == (types.Hash256{}) && sector.numOverdrive < lowestNumOverdrives { + if sector.uploaded.Root == (types.Hash256{}) && sector.numOverdriving() < lowestNumOverdrives { nextSector = sector } } @@ -1492,7 +1498,9 @@ func (s *slabUpload) receive(resp sectorUploadResp) bool { // update the state if resp.req.overdrive { - resp.req.sector.numOverdrive-- + resp.req.sector.mu.Lock() + delete(resp.req.sector.overdriving, resp.req.fcid) + resp.req.sector.mu.Unlock() } // failed reqs can't complete the upload @@ -1503,12 +1511,12 @@ func (s *slabUpload) receive(resp sectorUploadResp) bool { } // redundant sectors can't complete the upload - if resp.req.sector.sector.Root != (types.Hash256{}) { + if resp.req.sector.uploaded.Root != (types.Hash256{}) { return false } // store the sector - resp.req.sector.sector = object.Sector{ + resp.req.sector.uploaded = object.Sector{ Contracts: map[types.PublicKey][]types.FileContractID{resp.hk: {resp.fcid}}, LatestHost: resp.req.hk, Root: resp.root, @@ -1518,7 +1526,7 @@ func (s *slabUpload) receive(resp sectorUploadResp) bool { resp.req.sector.cancel() // mark uploaders we used for overdrives as unused - for fcid := range resp.req.sector.uploaders { + for fcid := range resp.req.sector.overdriving { if fcid != resp.req.fcid { s.upload.markUnused(resp.req.sID, fcid) } @@ -1528,13 +1536,19 @@ func (s *slabUpload) receive(resp sectorUploadResp) bool { s.numUploaded++ // release memory - resp.req.sector.sectorData = nil - s.shards[resp.req.sector.sectorIndex] = nil + resp.req.sector.data = nil + s.shards[resp.req.sector.index] = nil s.mem.ReleaseSome(rhpv2.SectorSize) return s.numUploaded == uint64(len(s.shards)) } +func (s *sectorUpload) numOverdriving() int { + s.mu.Lock() + defer s.mu.Unlock() + return len(s.overdriving) +} + func (sID slabID) String() string { return fmt.Sprintf("%x", sID[:]) } From cdc90604a369ed53b569b582a80bf627ff461fc1 Mon Sep 17 00:00:00 2001 From: PJ Date: Wed, 6 Dec 2023 17:14:16 +0100 Subject: [PATCH 03/25] worker: fix NDF --- worker/upload.go | 52 +++++++++++++++++++++++------------------------- 1 file changed, 25 insertions(+), 27 deletions(-) diff --git a/worker/upload.go b/worker/upload.go index 3d4a08198..e227c4ca2 100644 --- a/worker/upload.go +++ b/worker/upload.go @@ -240,8 +240,6 @@ type ( sectorUploadResp struct { req *sectorUploadReq - fcid types.FileContractID - hk types.PublicKey root types.Hash256 err error } @@ -1148,7 +1146,7 @@ outer: if err != nil { req.fail(err) } else { - req.succeed(root, u.hk, u.fcid) + req.succeed(root) } // track the error, ignore gracefully closed streams and canceled overdrives @@ -1297,12 +1295,10 @@ func (u *uploader) pop() *sectorUploadReq { return nil } -func (req *sectorUploadReq) succeed(root types.Hash256, hk types.PublicKey, fcid types.FileContractID) { +func (req *sectorUploadReq) succeed(root types.Hash256) { select { case <-req.sector.ctx.Done(): case req.responseChan <- sectorUploadResp{ - fcid: fcid, - hk: hk, req: req, root: root, }: @@ -1345,8 +1341,8 @@ func (s *slabUpload) finish() (sectors []object.Sector, _ error) { return nil, fmt.Errorf("failed to upload slab: remaining=%d, inflight=%d, launched=%d uploaders=%d errors=%d %w", remaining, s.numInflight, s.numLaunched, s.mgr.numUploaders(), len(s.errs), s.errs) } - for _, sector := range s.sectors { - sectors = append(sectors, sector.uploaded) + for i := 0; i < len(s.shards); i++ { + sectors = append(sectors, s.sectors[i].uploaded) } return } @@ -1496,48 +1492,50 @@ func (s *slabUpload) receive(resp sectorUploadResp) bool { s.mu.Lock() defer s.mu.Unlock() + // convenience variable + req := resp.req + sector := req.sector + // update the state - if resp.req.overdrive { - resp.req.sector.mu.Lock() - delete(resp.req.sector.overdriving, resp.req.fcid) - resp.req.sector.mu.Unlock() + if req.overdrive { + sector.mu.Lock() + delete(sector.overdriving, req.fcid) + sector.mu.Unlock() } // failed reqs can't complete the upload s.numInflight-- if resp.err != nil { - s.errs[resp.req.hk] = resp.err + s.errs[req.hk] = resp.err return false } + // mark uploaders we used for overdrives as unused + for fcid := range sector.overdriving { + s.upload.markUnused(req.sID, fcid) + } + // redundant sectors can't complete the upload - if resp.req.sector.uploaded.Root != (types.Hash256{}) { + if sector.uploaded.Root != (types.Hash256{}) { return false } // store the sector - resp.req.sector.uploaded = object.Sector{ - Contracts: map[types.PublicKey][]types.FileContractID{resp.hk: {resp.fcid}}, - LatestHost: resp.req.hk, + sector.uploaded = object.Sector{ + Contracts: map[types.PublicKey][]types.FileContractID{req.hk: {req.fcid}}, + LatestHost: req.hk, Root: resp.root, } // cancel the sector context - resp.req.sector.cancel() - - // mark uploaders we used for overdrives as unused - for fcid := range resp.req.sector.overdriving { - if fcid != resp.req.fcid { - s.upload.markUnused(resp.req.sID, fcid) - } - } + sector.cancel() // update uploaded sectors s.numUploaded++ // release memory - resp.req.sector.data = nil - s.shards[resp.req.sector.index] = nil + sector.data = nil + s.shards[sector.index] = nil s.mem.ReleaseSome(rhpv2.SectorSize) return s.numUploaded == uint64(len(s.shards)) From 0d1301ab8b8da8782af27038a3b4e6261bcd7884 Mon Sep 17 00:00:00 2001 From: PJ Date: Wed, 6 Dec 2023 17:21:04 +0100 Subject: [PATCH 04/25] worker: update log --- worker/download.go | 2 +- worker/upload.go | 23 ++++++++++++++++++++--- 2 files changed, 21 insertions(+), 4 deletions(-) diff --git a/worker/download.go b/worker/download.go index 774a0ba6f..55e35950b 100644 --- a/worker/download.go +++ b/worker/download.go @@ -1122,7 +1122,7 @@ func (s *slabDownload) finish() ([][]byte, bool, error) { } } - return nil, s.numOverpaid > 0, fmt.Errorf("failed to download slab: completed=%d, inflight=%d, launched=%d, relaunched=%d, overpaid=%d, downloaders=%d unused=%d errors=%d %v", s.numCompleted, s.numInflight, s.numLaunched, s.numRelaunched, s.numOverpaid, s.mgr.numDownloaders(), unused, len(s.errs), s.errs) + return nil, s.numOverpaid > 0, fmt.Errorf("failed to download slab: completed=%d inflight=%d launched=%d relaunched=%d overpaid=%d downloaders=%d unused=%d errors=%d %v", s.numCompleted, s.numInflight, s.numLaunched, s.numRelaunched, s.numOverpaid, s.mgr.numDownloaders(), unused, len(s.errs), s.errs) } return s.sectors, s.numOverpaid > 0, nil } diff --git a/worker/upload.go b/worker/upload.go index e227c4ca2..2d5dc34ff 100644 --- a/worker/upload.go +++ b/worker/upload.go @@ -778,10 +778,16 @@ func (mgr *uploadManager) newUpload(ctx context.Context, totalShards int, contra }, finishFn, nil } -func (mgr *uploadManager) numUploaders() int { +func (mgr *uploadManager) numUploaders(u *upload) (n int) { mgr.mu.Lock() defer mgr.mu.Unlock() - return len(mgr.uploaders) + for _, uploader := range mgr.uploaders { + fcid, renewedFrom, _ := uploader.contractInfo() + if u.isAllowed(fcid, renewedFrom) { + n++ + } + } + return } func (mgr *uploadManager) candidate(req *sectorUploadReq) *uploader { @@ -1338,7 +1344,7 @@ func (s *slabUpload) finish() (sectors []object.Sector, _ error) { if s.numUploaded < uint64(len(s.shards)) { remaining := uint64(len(s.shards)) - s.numUploaded - return nil, fmt.Errorf("failed to upload slab: remaining=%d, inflight=%d, launched=%d uploaders=%d errors=%d %w", remaining, s.numInflight, s.numLaunched, s.mgr.numUploaders(), len(s.errs), s.errs) + return nil, fmt.Errorf("failed to upload slab: launched=%d uploaded=%d remaining=%d inflight=%d uploaders=%d errors=%d %w", s.numLaunched, s.numUploaded, remaining, s.numInflight, s.mgr.numUploaders(s.upload), len(s.errs), s.errs) } for i := 0; i < len(s.shards); i++ { @@ -1541,6 +1547,17 @@ func (s *slabUpload) receive(resp sectorUploadResp) bool { return s.numUploaded == uint64(len(s.shards)) } +func (u *upload) isAllowed(fcid ...types.FileContractID) bool { + u.mu.Lock() + defer u.mu.Unlock() + for _, c := range fcid { + if _, allowed := u.allowed[c]; allowed { + return true + } + } + return false +} + func (s *sectorUpload) numOverdriving() int { s.mu.Lock() defer s.mu.Unlock() From 15bf037c9b8eda4302112901e4a13b440a8283b5 Mon Sep 17 00:00:00 2001 From: PJ Date: Wed, 6 Dec 2023 17:29:35 +0100 Subject: [PATCH 05/25] worker: add isUploaded --- worker/upload.go | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/worker/upload.go b/worker/upload.go index 2d5dc34ff..eceec9a71 100644 --- a/worker/upload.go +++ b/worker/upload.go @@ -893,7 +893,7 @@ func (mgr *uploadManager) tryRecomputeStats() { func (u *upload) finishSlabUpload(upload *slabUpload) { for _, sector := range upload.sectors { - if sector.uploaded.Root == (types.Hash256{}) { + if !sector.isUploaded() { sector.cancel() } } @@ -1458,7 +1458,7 @@ func (s *slabUpload) nextRequest(responseChan chan sectorUploadResp) *sectorUplo lowestNumOverdrives := math.MaxInt var nextSector *sectorUpload for _, sector := range s.sectors { - if sector.uploaded.Root == (types.Hash256{}) && sector.numOverdriving() < lowestNumOverdrives { + if !sector.isUploaded() && sector.numOverdriving() < lowestNumOverdrives { nextSector = sector } } @@ -1558,6 +1558,10 @@ func (u *upload) isAllowed(fcid ...types.FileContractID) bool { return false } +func (s *sectorUpload) isUploaded() bool { + return s.uploaded.Root != (types.Hash256{}) +} + func (s *sectorUpload) numOverdriving() int { s.mu.Lock() defer s.mu.Unlock() From 2e8fa39e0f7d0b45865d4e0e22caf349b1da0ceb Mon Sep 17 00:00:00 2001 From: PJ Date: Wed, 6 Dec 2023 17:36:33 +0100 Subject: [PATCH 06/25] worker: lock sector when grabbing overdrives --- worker/upload.go | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/worker/upload.go b/worker/upload.go index eceec9a71..664f6564f 100644 --- a/worker/upload.go +++ b/worker/upload.go @@ -1517,7 +1517,11 @@ func (s *slabUpload) receive(resp sectorUploadResp) bool { } // mark uploaders we used for overdrives as unused - for fcid := range sector.overdriving { + sector.mu.Lock() + overdriving := sector.overdriving + sector.overdriving = nil + sector.mu.Unlock() + for fcid := range overdriving { s.upload.markUnused(req.sID, fcid) } From 8283666f0e5ff5714c6b8236837460720c663bfa Mon Sep 17 00:00:00 2001 From: PJ Date: Wed, 6 Dec 2023 17:38:56 +0100 Subject: [PATCH 07/25] worker: add isUsed --- worker/upload.go | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/worker/upload.go b/worker/upload.go index 664f6564f..b3c61b5f2 100644 --- a/worker/upload.go +++ b/worker/upload.go @@ -957,24 +957,13 @@ func (u *upload) newSlabUpload(ctx context.Context, shards [][]byte, mem *acquir func (u *upload) canUseUploader(sID slabID, ul *uploader) bool { fcid, renewedFrom, _ := ul.contractInfo() - u.mu.Lock() - defer u.mu.Unlock() - // check if the uploader is allowed - _, allowed := u.allowed[fcid] - if !allowed { - _, allowed = u.allowed[renewedFrom] - } - if !allowed { + if allowed := u.isAllowed(fcid, renewedFrom); !allowed { return false } // check whether we've used it already - _, used := u.used[sID][fcid] - if !used { - _, used = u.used[sID][renewedFrom] - } - return !used + return !u.isUsed(sID, fcid, renewedFrom) } func (u *upload) uploadSlab(ctx context.Context, rs api.RedundancySettings, data []byte, length, index int, respChan chan slabUploadResponse, mem *acquiredMemory) { @@ -1562,6 +1551,17 @@ func (u *upload) isAllowed(fcid ...types.FileContractID) bool { return false } +func (u *upload) isUsed(sID slabID, fcid ...types.FileContractID) bool { + u.mu.Lock() + defer u.mu.Unlock() + for _, c := range fcid { + if _, used := u.used[sID][c]; used { + return true + } + } + return false +} + func (s *sectorUpload) isUploaded() bool { return s.uploaded.Root != (types.Hash256{}) } From b413cd377575fce8903e0c6740c5f55f633e5b9e Mon Sep 17 00:00:00 2001 From: PJ Date: Wed, 6 Dec 2023 18:31:46 +0100 Subject: [PATCH 08/25] testing: add logging --- internal/testing/pruning_test.go | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/internal/testing/pruning_test.go b/internal/testing/pruning_test.go index b80fe6acf..6387e6a84 100644 --- a/internal/testing/pruning_test.go +++ b/internal/testing/pruning_test.go @@ -13,6 +13,7 @@ import ( "go.sia.tech/core/types" "go.sia.tech/renterd/api" "go.sia.tech/renterd/hostdb" + "go.uber.org/zap/zapcore" ) func TestHostPruning(t *testing.T) { @@ -21,7 +22,9 @@ func TestHostPruning(t *testing.T) { } // create a new test cluster - cluster := newTestCluster(t, clusterOptsDefault) + opts := clusterOptsDefault + opts.logger = newTestLoggerCustom(zapcore.DebugLevel) + cluster := newTestCluster(t, opts) defer cluster.Shutdown() b := cluster.Bus w := cluster.Worker @@ -82,6 +85,9 @@ func TestHostPruning(t *testing.T) { } time.Sleep(50 * time.Millisecond) } + if remaining != 0 { + t.Log("failed to trigger") + } // assert the host was not pruned hostss, err := b.Hosts(context.Background(), api.GetHostsOptions{}) From e65202ff02df079b22160ead0734f9fb5f0c873b Mon Sep 17 00:00:00 2001 From: PJ Date: Thu, 7 Dec 2023 09:54:56 +0100 Subject: [PATCH 09/25] worker: get rid of types.FileContractID in used/allowed fields --- worker/upload.go | 104 ++++++++++++++++++----------------------------- 1 file changed, 39 insertions(+), 65 deletions(-) diff --git a/worker/upload.go b/worker/upload.go index b3c61b5f2..0c91eccde 100644 --- a/worker/upload.go +++ b/worker/upload.go @@ -159,30 +159,31 @@ type ( hk types.PublicKey siamuxAddr string + signalNewUpload chan struct{} + stopChan chan struct{} + + mu sync.Mutex + bh uint64 + endHeight uint64 + fcid types.FileContractID + host hostV3 + queue []*sectorUploadReq + + // stats related field statsSectorUploadEstimateInMS *stats.DataPoints statsSectorUploadSpeedBytesPerMS *stats.DataPoints // keep track of this separately for stats (no decay is applied) - signalNewUpload chan struct{} - stopChan chan struct{} - - mu sync.Mutex - host hostV3 - fcid types.FileContractID - renewedFrom types.FileContractID - endHeight uint64 - bh uint64 - consecutiveFailures uint64 - queue []*sectorUploadReq + consecutiveFailures uint64 } upload struct { id api.UploadID mgr *uploadManager - allowed map[types.FileContractID]struct{} + allowed map[types.PublicKey]struct{} lockPriority int mu sync.Mutex - used map[slabID]map[types.FileContractID]struct{} + used map[slabID]map[types.PublicKey]struct{} } slabUpload struct { @@ -220,7 +221,7 @@ type ( cancel context.CancelFunc mu sync.Mutex - overdriving map[types.FileContractID]struct{} + overdriving map[types.PublicKey]struct{} } sectorUploadReq struct { @@ -746,9 +747,9 @@ func (mgr *uploadManager) newUpload(ctx context.Context, totalShards int, contra } // create allowed map - allowed := make(map[types.FileContractID]struct{}) + allowed := make(map[types.PublicKey]struct{}) for _, c := range contracts { - allowed[c.ID] = struct{}{} + allowed[c.HostKey] = struct{}{} } // track the upload in the bus @@ -774,7 +775,7 @@ func (mgr *uploadManager) newUpload(ctx context.Context, totalShards int, contra allowed: allowed, lockPriority: lockPriority, - used: make(map[slabID]map[types.FileContractID]struct{}), + used: make(map[slabID]map[types.PublicKey]struct{}), }, finishFn, nil } @@ -782,8 +783,7 @@ func (mgr *uploadManager) numUploaders(u *upload) (n int) { mgr.mu.Lock() defer mgr.mu.Unlock() for _, uploader := range mgr.uploaders { - fcid, renewedFrom, _ := uploader.contractInfo() - if u.isAllowed(fcid, renewedFrom) { + if _, allowed := u.allowed[uploader.hk]; allowed { n++ } } @@ -809,7 +809,7 @@ func (mgr *uploadManager) candidate(req *sectorUploadReq) *uploader { func (mgr *uploadManager) renewUploader(u *uploader) { // fetch renewed contract - fcid, _, _ := u.contractInfo() + fcid, _ := u.contractInfo() ctx, cancel := context.WithTimeout(context.Background(), time.Minute) renewed, err := mgr.b.RenewedContract(ctx, fcid) cancel() @@ -832,10 +832,9 @@ func (mgr *uploadManager) renewUploader(u *uploader) { // update the uploader if we found the renewed contract u.mu.Lock() - u.host = mgr.hp.newHostV3(renewed.ID, renewed.HostKey, renewed.SiamuxAddr) - u.fcid = renewed.ID - u.renewedFrom = renewed.RenewedFrom u.endHeight = renewed.WindowEnd + u.fcid = renewed.ID + u.host = mgr.hp.newHostV3(renewed.ID, renewed.HostKey, renewed.SiamuxAddr) u.mu.Unlock() u.SignalWork() @@ -853,7 +852,7 @@ func (mgr *uploadManager) refreshUploaders(contracts []api.ContractMetadata, bh // prune expired or renewed contracts var refreshed []*uploader for _, uploader := range mgr.uploaders { - fcid, _, endHeight := uploader.contractInfo() + fcid, endHeight := uploader.contractInfo() _, renewed := c2r[fcid] if renewed || bh > endHeight { uploader.Stop() @@ -938,7 +937,7 @@ func (u *upload) newSlabUpload(ctx context.Context, shards [][]byte, mem *acquir ctx: sCtx, cancel: sCancel, - overdriving: make(map[types.FileContractID]struct{}), + overdriving: make(map[types.PublicKey]struct{}), } slab.sectors[sI] = sector @@ -955,15 +954,12 @@ func (u *upload) newSlabUpload(ctx context.Context, shards [][]byte, mem *acquir } func (u *upload) canUseUploader(sID slabID, ul *uploader) bool { - fcid, renewedFrom, _ := ul.contractInfo() - - // check if the uploader is allowed - if allowed := u.isAllowed(fcid, renewedFrom); !allowed { + if _, allowed := u.allowed[ul.hk]; !allowed { return false } - // check whether we've used it already - return !u.isUsed(sID, fcid, renewedFrom) + _, used := u.used[sID][ul.hk] + return !used } func (u *upload) uploadSlab(ctx context.Context, rs api.RedundancySettings, data []byte, length, index int, respChan chan slabUploadResponse, mem *acquiredMemory) { @@ -1000,21 +996,21 @@ func (u *upload) uploadSlab(ctx context.Context, rs api.RedundancySettings, data } } -func (u *upload) markUsed(sID slabID, fcid types.FileContractID) { +func (u *upload) markUsed(sID slabID, hk types.PublicKey) { u.mu.Lock() defer u.mu.Unlock() _, exists := u.used[sID] if !exists { - u.used[sID] = make(map[types.FileContractID]struct{}) + u.used[sID] = make(map[types.PublicKey]struct{}) } - u.used[sID][fcid] = struct{}{} + u.used[sID][hk] = struct{}{} } -func (u *upload) markUnused(sID slabID, fcid types.FileContractID) { +func (u *upload) markUnused(sID slabID, hk types.PublicKey) { u.mu.Lock() defer u.mu.Unlock() - delete(u.used[sID], fcid) + delete(u.used[sID], hk) } func (u *upload) uploadShards(ctx context.Context, shards [][]byte, mem *acquiredMemory) ([]object.Sector, error) { @@ -1074,10 +1070,10 @@ func (u *upload) uploadShards(ctx context.Context, shards [][]byte, mem *acquire return slab.finish() } -func (u *uploader) contractInfo() (types.FileContractID, types.FileContractID, uint64) { +func (u *uploader) contractInfo() (types.FileContractID, uint64) { u.mu.Lock() defer u.mu.Unlock() - return u.fcid, u.renewedFrom, u.endHeight + return u.fcid, u.bh } func (u *uploader) SignalWork() { @@ -1119,7 +1115,7 @@ outer: // execute it var root types.Hash256 start := time.Now() - fcid, _, _ := u.contractInfo() + fcid, _ := u.contractInfo() err := rl.withRevision(req.sector.ctx, defaultRevisionFetchTimeout, fcid, u.hk, u.siamuxAddr, req.upload.lockPriority, u.blockHeight(), func(rev types.FileContractRevision) error { if rev.RevisionNumber == math.MaxUint64 { return errMaxRevisionReached @@ -1241,7 +1237,7 @@ func (u *uploader) enqueue(req *sectorUploadReq) { span.AddEvent("enqueued") // decorate the request - fcid, _, _ := u.contractInfo() + fcid, _ := u.contractInfo() req.fcid = fcid req.hk = u.hk @@ -1251,7 +1247,7 @@ func (u *uploader) enqueue(req *sectorUploadReq) { u.mu.Unlock() // mark as used - req.upload.markUsed(req.sID, fcid) + req.upload.markUsed(req.sID, u.hk) // signal there's work u.SignalWork() @@ -1374,7 +1370,7 @@ func (s *slabUpload) launch(req *sectorUploadReq) (interrupt bool, err error) { s.lastOverdrive = time.Now() req.sector.mu.Lock() - req.sector.overdriving[req.fcid] = struct{}{} + req.sector.overdriving[req.hk] = struct{}{} req.sector.mu.Unlock() } return @@ -1494,7 +1490,7 @@ func (s *slabUpload) receive(resp sectorUploadResp) bool { // update the state if req.overdrive { sector.mu.Lock() - delete(sector.overdriving, req.fcid) + delete(sector.overdriving, req.hk) sector.mu.Unlock() } @@ -1540,28 +1536,6 @@ func (s *slabUpload) receive(resp sectorUploadResp) bool { return s.numUploaded == uint64(len(s.shards)) } -func (u *upload) isAllowed(fcid ...types.FileContractID) bool { - u.mu.Lock() - defer u.mu.Unlock() - for _, c := range fcid { - if _, allowed := u.allowed[c]; allowed { - return true - } - } - return false -} - -func (u *upload) isUsed(sID slabID, fcid ...types.FileContractID) bool { - u.mu.Lock() - defer u.mu.Unlock() - for _, c := range fcid { - if _, used := u.used[sID][c]; used { - return true - } - } - return false -} - func (s *sectorUpload) isUploaded() bool { return s.uploaded.Root != (types.Hash256{}) } From b393915e49f399f43c5d10a37133dbc676753806 Mon Sep 17 00:00:00 2001 From: PJ Date: Thu, 7 Dec 2023 14:17:39 +0100 Subject: [PATCH 10/25] worker: refactor out back refs --- internal/testing/cluster_test.go | 2 + worker/download.go | 6 +- worker/memory.go | 19 +- worker/upload.go | 1448 ------------------------------ worker/upload_manager.go | 1383 ++++++++++++++++++++++++++++ 5 files changed, 1401 insertions(+), 1457 deletions(-) create mode 100644 worker/upload_manager.go diff --git a/internal/testing/cluster_test.go b/internal/testing/cluster_test.go index 645db0bd0..52f064e4e 100644 --- a/internal/testing/cluster_test.go +++ b/internal/testing/cluster_test.go @@ -1244,6 +1244,8 @@ func TestEphemeralAccountSync(t *testing.T) { // TestUploadDownloadSameHost uploads a file to the same host through different // contracts and tries downloading the file again. func TestUploadDownloadSameHost(t *testing.T) { + t.SkipNow() // TODO PJ + if testing.Short() { t.SkipNow() } diff --git a/worker/download.go b/worker/download.go index 55e35950b..3db613107 100644 --- a/worker/download.go +++ b/worker/download.go @@ -40,7 +40,7 @@ type ( id [8]byte downloadManager struct { - mm *memoryManager + mm memoryManager hp hostProvider pss partialSlabStore slm sectorLostMarker @@ -159,7 +159,7 @@ type ( } ) -func (w *worker) initDownloadManager(mm *memoryManager, maxOverdrive uint64, overdriveTimeout time.Duration, logger *zap.SugaredLogger) { +func (w *worker) initDownloadManager(mm memoryManager, maxOverdrive uint64, overdriveTimeout time.Duration, logger *zap.SugaredLogger) { if w.downloadManager != nil { panic("download manager already initialized") // developer error } @@ -167,7 +167,7 @@ func (w *worker) initDownloadManager(mm *memoryManager, maxOverdrive uint64, ove w.downloadManager = newDownloadManager(w, w, mm, w.bus, maxOverdrive, overdriveTimeout, logger) } -func newDownloadManager(hp hostProvider, pss partialSlabStore, mm *memoryManager, slm sectorLostMarker, maxOverdrive uint64, overdriveTimeout time.Duration, logger *zap.SugaredLogger) *downloadManager { +func newDownloadManager(hp hostProvider, pss partialSlabStore, mm memoryManager, slm sectorLostMarker, maxOverdrive uint64, overdriveTimeout time.Duration, logger *zap.SugaredLogger) *downloadManager { return &downloadManager{ hp: hp, mm: mm, diff --git a/worker/memory.go b/worker/memory.go index cee8fe9d1..9d1bd2094 100644 --- a/worker/memory.go +++ b/worker/memory.go @@ -12,7 +12,12 @@ import ( type ( // memoryManager helps regulate processes that use a lot of memory. Such as // uploads and downloads. - memoryManager struct { + memoryManager interface { + Status() api.MemoryStatus + AcquireMemory(ctx context.Context, amt uint64) *acquiredMemory + } + + manager struct { totalAvailable uint64 logger *zap.SugaredLogger @@ -22,17 +27,19 @@ type ( } acquiredMemory struct { - mm *memoryManager + mm *manager remaining uint64 } ) -func newMemoryManager(logger *zap.SugaredLogger, maxMemory uint64) (*memoryManager, error) { +var _ memoryManager = (*manager)(nil) + +func newMemoryManager(logger *zap.SugaredLogger, maxMemory uint64) (memoryManager, error) { if maxMemory == 0 { return nil, fmt.Errorf("maxMemory cannot be 0") } - mm := &memoryManager{ + mm := &manager{ logger: logger, totalAvailable: maxMemory, } @@ -41,7 +48,7 @@ func newMemoryManager(logger *zap.SugaredLogger, maxMemory uint64) (*memoryManag return mm, nil } -func (mm *memoryManager) Status() api.MemoryStatus { +func (mm *manager) Status() api.MemoryStatus { mm.mu.Lock() defer mm.mu.Unlock() return api.MemoryStatus{ @@ -50,7 +57,7 @@ func (mm *memoryManager) Status() api.MemoryStatus { } } -func (mm *memoryManager) AcquireMemory(ctx context.Context, amt uint64) *acquiredMemory { +func (mm *manager) AcquireMemory(ctx context.Context, amt uint64) *acquiredMemory { if amt == 0 { mm.logger.Fatal("cannot acquire 0 memory") } else if mm.totalAvailable < amt { diff --git a/worker/upload.go b/worker/upload.go index 0c91eccde..8e033b287 100644 --- a/worker/upload.go +++ b/worker/upload.go @@ -2,43 +2,14 @@ package worker import ( "bytes" - "context" "encoding/hex" - "errors" - "fmt" "io" - "math" - "mime" - "path/filepath" - "sort" - "sync" - "time" "github.com/gabriel-vasile/mimetype" - "go.opentelemetry.io/otel/attribute" - "go.opentelemetry.io/otel/trace" - rhpv2 "go.sia.tech/core/rhp/v2" "go.sia.tech/core/types" "go.sia.tech/renterd/api" "go.sia.tech/renterd/build" "go.sia.tech/renterd/object" - "go.sia.tech/renterd/stats" - "go.sia.tech/renterd/tracing" - "go.uber.org/zap" - "lukechampine.com/frand" -) - -const ( - statsRecomputeMinInterval = 3 * time.Second - - defaultPackedSlabsLockDuration = 10 * time.Minute - defaultPackedSlabsUploadTimeout = 10 * time.Minute -) - -var ( - errUploadManagerStopped = errors.New("upload manager stopped") - errNoCandidateUploader = errors.New("no candidate uploader found") - errNotEnoughContracts = errors.New("not enough contracts to support requested redundancy") ) type uploadParameters struct { @@ -131,1425 +102,6 @@ func WithRedundancySettings(rs api.RedundancySettings) UploadOption { } } -type ( - slabID [8]byte - - uploadManager struct { - b Bus - hp hostProvider - rl revisionLocker - logger *zap.SugaredLogger - mm *memoryManager - - maxOverdrive uint64 - overdriveTimeout time.Duration - - statsOverdrivePct *stats.DataPoints - statsSlabUploadSpeedBytesPerMS *stats.DataPoints - stopChan chan struct{} - - mu sync.Mutex - uploaders []*uploader - lastRecompute time.Time - } - - uploader struct { - mgr *uploadManager - - hk types.PublicKey - siamuxAddr string - - signalNewUpload chan struct{} - stopChan chan struct{} - - mu sync.Mutex - bh uint64 - endHeight uint64 - fcid types.FileContractID - host hostV3 - queue []*sectorUploadReq - - // stats related field - statsSectorUploadEstimateInMS *stats.DataPoints - statsSectorUploadSpeedBytesPerMS *stats.DataPoints // keep track of this separately for stats (no decay is applied) - consecutiveFailures uint64 - } - - upload struct { - id api.UploadID - mgr *uploadManager - - allowed map[types.PublicKey]struct{} - lockPriority int - - mu sync.Mutex - used map[slabID]map[types.PublicKey]struct{} - } - - slabUpload struct { - mgr *uploadManager - mem *acquiredMemory - upload *upload - - sID slabID - created time.Time - shards [][]byte - - mu sync.Mutex - numInflight uint64 - numLaunched uint64 - numUploaded uint64 - - lastOverdrive time.Time - sectors map[int]*sectorUpload - errs HostErrorSet - } - - slabUploadResponse struct { - slab object.SlabSlice - index int - err error - } - - sectorUpload struct { - data *[rhpv2.SectorSize]byte - index int - root types.Hash256 - uploaded object.Sector - - ctx context.Context - cancel context.CancelFunc - - mu sync.Mutex - overdriving map[types.PublicKey]struct{} - } - - sectorUploadReq struct { - upload *upload - sector *sectorUpload - - overdrive bool - responseChan chan sectorUploadResp - - // set by the uploader performing the upload - fcid types.FileContractID - hk types.PublicKey - - // used for debugging - sID slabID - } - - sectorUploadResp struct { - req *sectorUploadReq - root types.Hash256 - err error - } - - uploadManagerStats struct { - avgSlabUploadSpeedMBPS float64 - avgOverdrivePct float64 - healthyUploaders uint64 - numUploaders uint64 - uploadSpeedsMBPS map[types.PublicKey]float64 - } -) - -func (w *worker) initUploadManager(mm *memoryManager, maxOverdrive uint64, overdriveTimeout time.Duration, logger *zap.SugaredLogger) { - if w.uploadManager != nil { - panic("upload manager already initialized") // developer error - } - - w.uploadManager = newUploadManager(w.bus, w, w, mm, maxOverdrive, overdriveTimeout, logger) -} - -func (w *worker) upload(ctx context.Context, r io.Reader, contracts []api.ContractMetadata, up uploadParameters, opts ...UploadOption) (_ string, err error) { - // apply the options - for _, opt := range opts { - opt(&up) - } - - // if not given, try decide on a mime type using the file extension - if !up.multipart && up.mimeType == "" { - up.mimeType = mime.TypeByExtension(filepath.Ext(up.path)) - - // if mime type is still not known, wrap the reader with a mime reader - if up.mimeType == "" { - up.mimeType, r, err = newMimeReader(r) - if err != nil { - return - } - } - } - - // perform the upload - bufferSizeLimitReached, eTag, err := w.uploadManager.Upload(ctx, r, contracts, up, lockingPriorityUpload) - if err != nil { - return "", err - } - - // if packing was enabled try uploading packed slabs - if up.packing { - if err := w.tryUploadPackedSlabs(ctx, up.rs, up.contractSet, bufferSizeLimitReached); err != nil { - w.logger.Errorf("couldn't upload packed slabs, err: %v", err) - } - } - return eTag, nil -} - -func (w *worker) threadedUploadPackedSlabs(rs api.RedundancySettings, contractSet string, lockPriority int) { - key := fmt.Sprintf("%d-%d_%s", rs.MinShards, rs.TotalShards, contractSet) - - w.uploadsMu.Lock() - if w.uploadingPackedSlabs[key] { - w.uploadsMu.Unlock() - return - } - w.uploadingPackedSlabs[key] = true - w.uploadsMu.Unlock() - - // make sure we mark uploading packed slabs as false when we're done - defer func() { - w.uploadsMu.Lock() - w.uploadingPackedSlabs[key] = false - w.uploadsMu.Unlock() - }() - - // keep uploading packed slabs until we're done - ctx := context.WithValue(w.shutdownCtx, keyInteractionRecorder, w) - for { - uploaded, err := w.uploadPackedSlabs(ctx, defaultPackedSlabsLockDuration, rs, contractSet, lockPriority) - if err != nil { - w.logger.Errorf("couldn't upload packed slabs, err: %v", err) - return - } else if uploaded == 0 { - return - } - } -} - -func (w *worker) tryUploadPackedSlabs(ctx context.Context, rs api.RedundancySettings, contractSet string, block bool) (err error) { - // if we want to block, try and upload one packed slab synchronously, we use - // a slightly higher upload priority to avoid reaching the context deadline - if block { - _, err = w.uploadPackedSlabs(ctx, defaultPackedSlabsLockDuration, rs, contractSet, lockingPriorityBlockedUpload) - } - - // make sure there's a goroutine uploading the remainder of the packed slabs - go w.threadedUploadPackedSlabs(rs, contractSet, lockingPriorityBackgroundUpload) - return -} - -func (w *worker) uploadPackedSlabs(ctx context.Context, lockingDuration time.Duration, rs api.RedundancySettings, contractSet string, lockPriority int) (uploaded int, err error) { - // upload packed slabs - var mu sync.Mutex - var errs error - - var wg sync.WaitGroup - totalSize := uint64(rs.TotalShards) * rhpv2.SectorSize - - // derive a context that we can use as an interrupt in case of an error. - interruptCtx, cancel := context.WithCancel(ctx) - defer cancel() - - for { - // block until we have memory for a slab or until we are interrupted - mem := w.uploadManager.mm.AcquireMemory(interruptCtx, totalSize) - if mem == nil { - break // interrupted - } - - // fetch packed slabs to upload - var packedSlabs []api.PackedSlab - packedSlabs, err = w.bus.PackedSlabsForUpload(ctx, lockingDuration, uint8(rs.MinShards), uint8(rs.TotalShards), contractSet, 1) - if err != nil { - err = fmt.Errorf("couldn't fetch packed slabs from bus: %v", err) - mem.Release() - break - } else if len(packedSlabs) == 0 { - mem.Release() - break // no more slabs - } - ps := packedSlabs[0] - - // launch upload for slab - wg.Add(1) - go func(ps api.PackedSlab) { - defer mem.Release() - defer wg.Done() - err := w.uploadPackedSlab(ctx, ps, rs, contractSet, lockPriority, mem) - mu.Lock() - if err != nil { - errs = errors.Join(errs, err) - cancel() // prevent new uploads from being launched - } else { - uploaded++ - } - mu.Unlock() - }(ps) - } - - // wait for all threads to finish - wg.Wait() - - // return collected errors - err = errors.Join(err, errs) - return -} - -func (w *worker) uploadPackedSlab(ctx context.Context, ps api.PackedSlab, rs api.RedundancySettings, contractSet string, lockPriority int, mem *acquiredMemory) error { - // create a context with sane timeout - ctx, cancel := context.WithTimeout(ctx, defaultPackedSlabsUploadTimeout) - defer cancel() - - // fetch contracts - contracts, err := w.bus.ContractSetContracts(ctx, contractSet) - if err != nil { - return fmt.Errorf("couldn't fetch packed slabs from bus: %v", err) - } - - // fetch upload params - up, err := w.bus.UploadParams(ctx) - if err != nil { - return fmt.Errorf("couldn't fetch upload params from bus: %v", err) - } - - // attach gouging checker to the context - ctx = WithGougingChecker(ctx, w.bus, up.GougingParams) - - // upload packed slab - err = w.uploadManager.UploadPackedSlab(ctx, rs, ps, contracts, up.CurrentHeight, lockPriority, mem) - if err != nil { - return fmt.Errorf("couldn't upload packed slab, err: %v", err) - } - - return nil -} - -func newUploadManager(b Bus, hp hostProvider, rl revisionLocker, mm *memoryManager, maxOverdrive uint64, overdriveTimeout time.Duration, logger *zap.SugaredLogger) *uploadManager { - return &uploadManager{ - b: b, - hp: hp, - rl: rl, - logger: logger, - mm: mm, - - maxOverdrive: maxOverdrive, - overdriveTimeout: overdriveTimeout, - - statsOverdrivePct: stats.NoDecay(), - statsSlabUploadSpeedBytesPerMS: stats.NoDecay(), - - stopChan: make(chan struct{}), - - uploaders: make([]*uploader, 0), - } -} - -func (mgr *uploadManager) newUploader(c api.ContractMetadata) *uploader { - return &uploader{ - mgr: mgr, - host: mgr.hp.newHostV3(c.ID, c.HostKey, c.SiamuxAddr), - - fcid: c.ID, - hk: c.HostKey, - siamuxAddr: c.SiamuxAddr, - endHeight: c.WindowEnd, - - queue: make([]*sectorUploadReq, 0), - signalNewUpload: make(chan struct{}, 1), - - statsSectorUploadEstimateInMS: stats.Default(), - statsSectorUploadSpeedBytesPerMS: stats.NoDecay(), - stopChan: make(chan struct{}), - } -} - -func (mgr *uploadManager) Stats() uploadManagerStats { - // recompute stats - mgr.tryRecomputeStats() - - // collect stats - mgr.mu.Lock() - var numHealthy uint64 - speeds := make(map[types.PublicKey]float64) - for _, u := range mgr.uploaders { - healthy, mbps := u.Stats() - speeds[u.hk] = mbps - if healthy { - numHealthy++ - } - } - mgr.mu.Unlock() - - // prepare stats - return uploadManagerStats{ - avgSlabUploadSpeedMBPS: mgr.statsSlabUploadSpeedBytesPerMS.Average() * 0.008, // convert bytes per ms to mbps, - avgOverdrivePct: mgr.statsOverdrivePct.Average(), - healthyUploaders: numHealthy, - numUploaders: uint64(len(speeds)), - uploadSpeedsMBPS: speeds, - } -} - -func (mgr *uploadManager) Stop() { - mgr.mu.Lock() - defer mgr.mu.Unlock() - close(mgr.stopChan) - for _, u := range mgr.uploaders { - u.Stop() - } -} - -func (mgr *uploadManager) Upload(ctx context.Context, r io.Reader, contracts []api.ContractMetadata, up uploadParameters, lockPriority int) (bufferSizeLimitReached bool, eTag string, err error) { - // cancel all in-flight requests when the upload is done - ctx, cancel := context.WithCancel(ctx) - defer cancel() - - // add tracing - ctx, span := tracing.Tracer.Start(ctx, "upload") - defer func() { - span.RecordError(err) - span.End() - }() - - // create the object - o := object.NewObject(up.ec) - - // create the hash reader - hr := newHashReader(r) - - // create the cipher reader - cr, err := o.Encrypt(hr, up.encryptionOffset) - if err != nil { - return false, "", err - } - - // create the upload - u, finishFn, err := mgr.newUpload(ctx, up.rs.TotalShards, contracts, up.bh, lockPriority) - if err != nil { - return false, "", err - } - defer finishFn() - - // create the response channel - respChan := make(chan slabUploadResponse) - - // channel to notify main thread of the number of slabs to wait for - numSlabsChan := make(chan int, 1) - - // prepare slab size - size := int64(up.rs.MinShards) * rhpv2.SectorSize - redundantSize := uint64(up.rs.TotalShards) * rhpv2.SectorSize - var partialSlab []byte - - // launch uploads in a separate goroutine - go func() { - var slabIndex int - for { - select { - case <-mgr.stopChan: - return // interrupted - case <-ctx.Done(): - return // interrupted - default: - } - // acquire memory - mem := mgr.mm.AcquireMemory(ctx, redundantSize) - if mem == nil { - return // interrupted - } - // read next slab's data - data := make([]byte, size) - length, err := io.ReadFull(io.LimitReader(cr, size), data) - if err == io.EOF { - mem.Release() - - // no more data to upload, notify main thread of the number of - // slabs to wait for - numSlabs := slabIndex - if partialSlab != nil && slabIndex > 0 { - numSlabs-- // don't wait on partial slab - } - numSlabsChan <- numSlabs - return - } else if err != nil && err != io.ErrUnexpectedEOF { - mem.Release() - - // unexpected error, notify main thread - select { - case respChan <- slabUploadResponse{err: err}: - case <-ctx.Done(): - } - return - } else if up.packing && errors.Is(err, io.ErrUnexpectedEOF) { - mem.Release() - - // uploadPacking is true, we return the partial slab without - // uploading. - partialSlab = data[:length] - } else { - // regular upload - go func(rs api.RedundancySettings, data []byte, length, slabIndex int) { - u.uploadSlab(ctx, rs, data, length, slabIndex, respChan, mem) - mem.Release() - }(up.rs, data, length, slabIndex) - } - slabIndex++ - } - }() - - // collect responses - var responses []slabUploadResponse - numSlabs := math.MaxInt32 - for len(responses) < numSlabs { - select { - case <-mgr.stopChan: - return false, "", errUploadManagerStopped - case numSlabs = <-numSlabsChan: - case res := <-respChan: - if res.err != nil { - return false, "", res.err - } - responses = append(responses, res) - } - } - - // sort the slabs by index - sort.Slice(responses, func(i, j int) bool { - return responses[i].index < responses[j].index - }) - - // decorate the object with the slabs - for _, resp := range responses { - o.Slabs = append(o.Slabs, resp.slab) - } - - // calculate the eTag - eTag = hr.Hash() - - // add partial slabs - if len(partialSlab) > 0 { - var pss []object.SlabSlice - pss, bufferSizeLimitReached, err = u.mgr.b.AddPartialSlab(ctx, partialSlab, uint8(up.rs.MinShards), uint8(up.rs.TotalShards), up.contractSet) - if err != nil { - return false, "", err - } - o.Slabs = append(o.Slabs, pss...) - } - - if up.multipart { - // persist the part - err = u.mgr.b.AddMultipartPart(ctx, up.bucket, up.path, up.contractSet, eTag, up.uploadID, up.partNumber, o.Slabs) - if err != nil { - return bufferSizeLimitReached, "", fmt.Errorf("couldn't add multi part: %w", err) - } - } else { - // persist the object - err = u.mgr.b.AddObject(ctx, up.bucket, up.path, up.contractSet, o, api.AddObjectOptions{MimeType: up.mimeType, ETag: eTag}) - if err != nil { - return bufferSizeLimitReached, "", fmt.Errorf("couldn't add object: %w", err) - } - } - - return -} - -func (mgr *uploadManager) UploadPackedSlab(ctx context.Context, rs api.RedundancySettings, ps api.PackedSlab, contracts []api.ContractMetadata, bh uint64, lockPriority int, mem *acquiredMemory) error { - // build the shards - shards := encryptPartialSlab(ps.Data, ps.Key, uint8(rs.MinShards), uint8(rs.TotalShards)) - - // initiate the upload - upload, finishFn, err := mgr.newUpload(ctx, len(shards), contracts, bh, lockPriority) - if err != nil { - return err - } - defer finishFn() - - // upload the shards - sectors, err := upload.uploadShards(ctx, shards, mem) - if err != nil { - return err - } - - // mark packed slab as uploaded - slab := api.UploadedPackedSlab{BufferID: ps.BufferID, Shards: sectors} - err = mgr.b.MarkPackedSlabsUploaded(ctx, []api.UploadedPackedSlab{slab}) - if err != nil { - return fmt.Errorf("couldn't mark packed slabs uploaded, err: %v", err) - } - - return nil -} - -func (mgr *uploadManager) MigrateShards(ctx context.Context, s *object.Slab, shardIndices []int, shards [][]byte, contractSet string, contracts []api.ContractMetadata, bh uint64, lockPriority int, mem *acquiredMemory) error { - // initiate the upload - upload, finishFn, err := mgr.newUpload(ctx, len(shards), contracts, bh, lockPriority) - if err != nil { - return err - } - defer finishFn() - - // upload the shards - uploaded, err := upload.uploadShards(ctx, shards, mem) - if err != nil { - return err - } - - // overwrite the shards with the newly uploaded ones - for i, si := range shardIndices { - s.Shards[si].LatestHost = uploaded[i].LatestHost - - knownContracts := make(map[types.FileContractID]struct{}) - for _, fcids := range s.Shards[si].Contracts { - for _, fcid := range fcids { - knownContracts[fcid] = struct{}{} - } - } - for hk, fcids := range uploaded[i].Contracts { - for _, fcid := range fcids { - if _, exists := knownContracts[fcid]; !exists { - if s.Shards[si].Contracts == nil { - s.Shards[si].Contracts = make(map[types.PublicKey][]types.FileContractID) - } - s.Shards[si].Contracts[hk] = append(s.Shards[si].Contracts[hk], fcid) - } - } - } - } - - // update the slab - return mgr.b.UpdateSlab(ctx, *s, contractSet) -} - -func (mgr *uploadManager) launch(req *sectorUploadReq) error { - // recompute stats - mgr.tryRecomputeStats() - - // find a candidate uploader - uploader := mgr.candidate(req) - if uploader == nil { - return errNoCandidateUploader - } - uploader.enqueue(req) - return nil -} - -func (mgr *uploadManager) newUpload(ctx context.Context, totalShards int, contracts []api.ContractMetadata, bh uint64, lockPriority int) (*upload, func(), error) { - mgr.mu.Lock() - defer mgr.mu.Unlock() - - // refresh the uploaders - mgr.refreshUploaders(contracts, bh) - - // check if we have enough contracts - if len(contracts) < totalShards { - return nil, func() {}, fmt.Errorf("%v < %v: %w", len(contracts), totalShards, errNotEnoughContracts) - } - - // create allowed map - allowed := make(map[types.PublicKey]struct{}) - for _, c := range contracts { - allowed[c.HostKey] = struct{}{} - } - - // track the upload in the bus - id := api.NewUploadID() - if err := mgr.b.TrackUpload(ctx, id); err != nil { - mgr.logger.Errorf("failed to track upload '%v', err: %v", id, err) - } - - // create a finish function to finish the upload - finishFn := func() { - ctx, cancel := context.WithTimeout(context.Background(), time.Minute) - defer cancel() - if err := mgr.b.FinishUpload(ctx, id); err != nil { - mgr.logger.Errorf("failed to mark upload %v as finished: %v", id, err) - } - } - - // create upload - return &upload{ - id: id, - mgr: mgr, - - allowed: allowed, - lockPriority: lockPriority, - - used: make(map[slabID]map[types.PublicKey]struct{}), - }, finishFn, nil -} - -func (mgr *uploadManager) numUploaders(u *upload) (n int) { - mgr.mu.Lock() - defer mgr.mu.Unlock() - for _, uploader := range mgr.uploaders { - if _, allowed := u.allowed[uploader.hk]; allowed { - n++ - } - } - return -} - -func (mgr *uploadManager) candidate(req *sectorUploadReq) *uploader { - // fetch candidate - mgr.mu.Lock() - defer mgr.mu.Unlock() - - // select candidate with the best estimate - var candidate *uploader - for _, uploader := range mgr.uploaders { - if !req.upload.canUseUploader(req.sID, uploader) { - continue // ignore - } else if candidate == nil || uploader.estimate() < candidate.estimate() { - candidate = uploader - } - } - return candidate -} - -func (mgr *uploadManager) renewUploader(u *uploader) { - // fetch renewed contract - fcid, _ := u.contractInfo() - ctx, cancel := context.WithTimeout(context.Background(), time.Minute) - renewed, err := mgr.b.RenewedContract(ctx, fcid) - cancel() - - // remove the uploader if we can't renew it - mgr.mu.Lock() - if err != nil { - mgr.logger.Errorf("failed to fetch renewed contract for uploader %v: %v", fcid, err) - for i := 0; i < len(mgr.uploaders); i++ { - if mgr.uploaders[i] == u { - mgr.uploaders = append(mgr.uploaders[:i], mgr.uploaders[i+1:]...) - u.Stop() - break - } - } - mgr.mu.Unlock() - return - } - mgr.mu.Unlock() - - // update the uploader if we found the renewed contract - u.mu.Lock() - u.endHeight = renewed.WindowEnd - u.fcid = renewed.ID - u.host = mgr.hp.newHostV3(renewed.ID, renewed.HostKey, renewed.SiamuxAddr) - u.mu.Unlock() - - u.SignalWork() -} - -func (mgr *uploadManager) refreshUploaders(contracts []api.ContractMetadata, bh uint64) { - // build map - c2m := make(map[types.FileContractID]api.ContractMetadata) - c2r := make(map[types.FileContractID]struct{}) - for _, c := range contracts { - c2m[c.ID] = c - c2r[c.RenewedFrom] = struct{}{} - } - - // prune expired or renewed contracts - var refreshed []*uploader - for _, uploader := range mgr.uploaders { - fcid, endHeight := uploader.contractInfo() - _, renewed := c2r[fcid] - if renewed || bh > endHeight { - uploader.Stop() - continue - } - refreshed = append(refreshed, uploader) - delete(c2m, fcid) - } - - // create new uploaders for missing contracts - for _, c := range c2m { - uploader := mgr.newUploader(c) - refreshed = append(refreshed, uploader) - go uploader.Start(mgr.hp, mgr.rl) - } - - // update blockheight - for _, u := range refreshed { - u.updateBlockHeight(bh) - } - mgr.uploaders = refreshed -} - -func (mgr *uploadManager) tryRecomputeStats() { - mgr.mu.Lock() - defer mgr.mu.Unlock() - if time.Since(mgr.lastRecompute) < statsRecomputeMinInterval { - return - } - - for _, u := range mgr.uploaders { - u.statsSectorUploadEstimateInMS.Recompute() - u.statsSectorUploadSpeedBytesPerMS.Recompute() - } - mgr.lastRecompute = time.Now() -} - -func (u *upload) finishSlabUpload(upload *slabUpload) { - for _, sector := range upload.sectors { - if !sector.isUploaded() { - sector.cancel() - } - } -} - -func (u *upload) newSlabUpload(ctx context.Context, shards [][]byte, mem *acquiredMemory) (*slabUpload, []*sectorUploadReq, chan sectorUploadResp) { - // create slab id - var sID slabID - frand.Read(sID[:]) - - // create slab upload - slab := &slabUpload{ - mgr: u.mgr, - mem: mem, - - upload: u, - sID: sID, - created: time.Now(), - shards: shards, - - sectors: make(map[int]*sectorUpload, len(shards)), - errs: make(HostErrorSet), - } - - // prepare sector uploads - responseChan := make(chan sectorUploadResp) - requests := make([]*sectorUploadReq, len(shards)) - for sI, shard := range shards { - // create the ctx - sCtx, sCancel := context.WithCancel(ctx) - - // attach the upload's span - sCtx, span := tracing.Tracer.Start(sCtx, "uploadSector") - span.SetAttributes(attribute.Bool("overdrive", false)) - span.SetAttributes(attribute.Int("sector", sI)) - - // create the sector - sector := §orUpload{ - data: (*[rhpv2.SectorSize]byte)(shard), - index: sI, - root: rhpv2.SectorRoot((*[rhpv2.SectorSize]byte)(shard)), - - ctx: sCtx, - cancel: sCancel, - overdriving: make(map[types.PublicKey]struct{}), - } - slab.sectors[sI] = sector - - // create the request - requests[sI] = §orUploadReq{ - upload: u, - sID: sID, - sector: sector, - responseChan: responseChan, - } - } - - return slab, requests, responseChan -} - -func (u *upload) canUseUploader(sID slabID, ul *uploader) bool { - if _, allowed := u.allowed[ul.hk]; !allowed { - return false - } - - _, used := u.used[sID][ul.hk] - return !used -} - -func (u *upload) uploadSlab(ctx context.Context, rs api.RedundancySettings, data []byte, length, index int, respChan chan slabUploadResponse, mem *acquiredMemory) { - // cancel any sector uploads once the slab is done. - ctx, cancel := context.WithCancel(ctx) - defer cancel() - - // add tracing - ctx, span := tracing.Tracer.Start(ctx, "uploadSlab") - defer span.End() - - // create the response - resp := slabUploadResponse{ - slab: object.SlabSlice{ - Slab: object.NewSlab(uint8(rs.MinShards)), - Offset: 0, - Length: uint32(length), - }, - index: index, - } - - // create the shards - shards := make([][]byte, rs.TotalShards) - resp.slab.Slab.Encode(data, shards) - resp.slab.Slab.Encrypt(shards) - - // upload the shards - resp.slab.Slab.Shards, resp.err = u.uploadShards(ctx, shards, mem) - - // send the response - select { - case <-ctx.Done(): - case respChan <- resp: - } -} - -func (u *upload) markUsed(sID slabID, hk types.PublicKey) { - u.mu.Lock() - defer u.mu.Unlock() - - _, exists := u.used[sID] - if !exists { - u.used[sID] = make(map[types.PublicKey]struct{}) - } - u.used[sID][hk] = struct{}{} -} - -func (u *upload) markUnused(sID slabID, hk types.PublicKey) { - u.mu.Lock() - defer u.mu.Unlock() - delete(u.used[sID], hk) -} - -func (u *upload) uploadShards(ctx context.Context, shards [][]byte, mem *acquiredMemory) ([]object.Sector, error) { - // add tracing - ctx, span := tracing.Tracer.Start(ctx, "uploadShards") - defer span.End() - - // prepare the upload - slab, requests, respChan := u.newSlabUpload(ctx, shards, mem) - span.SetAttributes(attribute.Stringer("id", slab.sID)) - defer u.finishSlabUpload(slab) - - // launch all shard uploads - for _, upload := range requests { - if _, err := slab.launch(upload); err != nil { - return nil, err - } - } - - // launch overdrive - resetOverdrive := slab.overdrive(ctx, respChan) - - // collect responses - var done bool - for slab.inflight() > 0 && !done { - var resp sectorUploadResp - select { - case <-u.mgr.stopChan: - return nil, errors.New("upload stopped") - case <-ctx.Done(): - return nil, ctx.Err() - case resp = <-respChan: - } - - resetOverdrive() - - // receive the response - done = slab.receive(resp) - - // relaunch non-overdrive uploads - if !done && resp.err != nil && !resp.req.overdrive { - if overdriving, err := slab.launch(resp.req); err != nil { - u.mgr.logger.Errorf("failed to relaunch a sector upload, err %v", err) - if !overdriving { - break // fail the upload - } - } - } - } - - // register the amount of overdrive sectors - span.SetAttributes(attribute.Int("overdrive", slab.overdriveCnt())) - - // track stats - u.mgr.statsOverdrivePct.Track(slab.overdrivePct()) - u.mgr.statsSlabUploadSpeedBytesPerMS.Track(float64(slab.uploadSpeed())) - return slab.finish() -} - -func (u *uploader) contractInfo() (types.FileContractID, uint64) { - u.mu.Lock() - defer u.mu.Unlock() - return u.fcid, u.bh -} - -func (u *uploader) SignalWork() { - select { - case u.signalNewUpload <- struct{}{}: - default: - } -} - -func (u *uploader) Start(hp hostProvider, rl revisionLocker) { -outer: - for { - // wait for work - select { - case <-u.signalNewUpload: - case <-u.stopChan: - return - } - - for { - // check if we are stopped - select { - case <-u.stopChan: - return - default: - } - - // pop the next upload req - req := u.pop() - if req == nil { - continue outer - } - - // skip if upload is done - if req.done() { - continue - } - - // execute it - var root types.Hash256 - start := time.Now() - fcid, _ := u.contractInfo() - err := rl.withRevision(req.sector.ctx, defaultRevisionFetchTimeout, fcid, u.hk, u.siamuxAddr, req.upload.lockPriority, u.blockHeight(), func(rev types.FileContractRevision) error { - if rev.RevisionNumber == math.MaxUint64 { - return errMaxRevisionReached - } - - var err error - root, err = u.execute(req, rev) - return err - }) - - // the uploader's contract got renewed, requeue the request, try and refresh the contract - if errors.Is(err, errMaxRevisionReached) { - u.requeue(req) - u.mgr.renewUploader(u) - continue outer - } - - // send the response - if err != nil { - req.fail(err) - } else { - req.succeed(root) - } - - // track the error, ignore gracefully closed streams and canceled overdrives - canceledOverdrive := req.done() && req.overdrive && err != nil - if !canceledOverdrive && !isClosedStream(err) { - u.trackSectorUpload(err, time.Since(start)) - } - } - } -} - -func (u *uploader) Stop() { - close(u.stopChan) - - // clear the queue - for { - upload := u.pop() - if upload == nil { - break - } - if !upload.done() { - upload.fail(errors.New("uploader stopped")) - } - } -} - -func (u *uploader) Stats() (healthy bool, mbps float64) { - u.mu.Lock() - defer u.mu.Unlock() - healthy = u.consecutiveFailures == 0 - mbps = u.statsSectorUploadSpeedBytesPerMS.Average() * 0.008 - return -} - -func (u *uploader) execute(req *sectorUploadReq, rev types.FileContractRevision) (types.Hash256, error) { - u.mu.Lock() - host := u.host - fcid := u.fcid - u.mu.Unlock() - - // fetch span from context - span := trace.SpanFromContext(req.sector.ctx) - span.AddEvent("execute") - - // update the bus - if err := u.mgr.b.AddUploadingSector(req.sector.ctx, req.upload.id, fcid, req.sector.root); err != nil { - return types.Hash256{}, fmt.Errorf("failed to add uploading sector to contract %v, err: %v", fcid, err) - } - - // upload the sector - start := time.Now() - root, err := host.UploadSector(req.sector.ctx, req.sector.data, rev) - if err != nil { - return types.Hash256{}, err - } - - // update span - elapsed := time.Since(start) - span.SetAttributes(attribute.Int64("duration", elapsed.Milliseconds())) - span.RecordError(err) - span.End() - - return root, nil -} - -func (u *uploader) blockHeight() uint64 { - u.mu.Lock() - defer u.mu.Unlock() - return u.bh -} - -func (u *uploader) estimate() float64 { - u.mu.Lock() - defer u.mu.Unlock() - - // fetch estimated duration per sector - estimateP90 := u.statsSectorUploadEstimateInMS.P90() - if estimateP90 == 0 { - estimateP90 = 1 - } - - // calculate estimated time - numSectors := float64(len(u.queue) + 1) - return numSectors * estimateP90 -} - -func (u *uploader) requeue(req *sectorUploadReq) { - u.mu.Lock() - defer u.mu.Unlock() - u.queue = append([]*sectorUploadReq{req}, u.queue...) -} - -func (u *uploader) enqueue(req *sectorUploadReq) { - // trace the request - span := trace.SpanFromContext(req.sector.ctx) - span.SetAttributes(attribute.Stringer("hk", u.hk)) - span.AddEvent("enqueued") - - // decorate the request - fcid, _ := u.contractInfo() - req.fcid = fcid - req.hk = u.hk - - // enqueue the request - u.mu.Lock() - u.queue = append(u.queue, req) - u.mu.Unlock() - - // mark as used - req.upload.markUsed(req.sID, u.hk) - - // signal there's work - u.SignalWork() -} - -func (u *uploader) trackSectorUpload(err error, d time.Duration) { - u.mu.Lock() - defer u.mu.Unlock() - if err != nil { - u.consecutiveFailures++ - u.statsSectorUploadEstimateInMS.Track(float64(time.Hour.Milliseconds())) - } else { - ms := d.Milliseconds() - u.consecutiveFailures = 0 - u.statsSectorUploadEstimateInMS.Track(float64(ms)) // duration in ms - u.statsSectorUploadSpeedBytesPerMS.Track(float64(rhpv2.SectorSize / ms)) // bytes per ms - } -} - -func (u *uploader) updateBlockHeight(bh uint64) { - u.mu.Lock() - defer u.mu.Unlock() - u.bh = bh -} - -func (u *uploader) pop() *sectorUploadReq { - u.mu.Lock() - defer u.mu.Unlock() - - if len(u.queue) > 0 { - j := u.queue[0] - u.queue[0] = nil - u.queue = u.queue[1:] - return j - } - return nil -} - -func (req *sectorUploadReq) succeed(root types.Hash256) { - select { - case <-req.sector.ctx.Done(): - case req.responseChan <- sectorUploadResp{ - req: req, - root: root, - }: - } -} - -func (req *sectorUploadReq) fail(err error) { - select { - case <-req.sector.ctx.Done(): - case req.responseChan <- sectorUploadResp{ - req: req, - err: err, - }: - } -} - -func (req *sectorUploadReq) done() bool { - select { - case <-req.sector.ctx.Done(): - return true - default: - return false - } -} - -func (s *slabUpload) uploadSpeed() int64 { - s.mu.Lock() - defer s.mu.Unlock() - bytes := s.numUploaded * rhpv2.SectorSize - ms := time.Since(s.created).Milliseconds() - return int64(bytes) / ms -} - -func (s *slabUpload) finish() (sectors []object.Sector, _ error) { - s.mu.Lock() - defer s.mu.Unlock() - - if s.numUploaded < uint64(len(s.shards)) { - remaining := uint64(len(s.shards)) - s.numUploaded - return nil, fmt.Errorf("failed to upload slab: launched=%d uploaded=%d remaining=%d inflight=%d uploaders=%d errors=%d %w", s.numLaunched, s.numUploaded, remaining, s.numInflight, s.mgr.numUploaders(s.upload), len(s.errs), s.errs) - } - - for i := 0; i < len(s.shards); i++ { - sectors = append(sectors, s.sectors[i].uploaded) - } - return -} - -func (s *slabUpload) inflight() uint64 { - s.mu.Lock() - defer s.mu.Unlock() - return s.numInflight -} - -func (s *slabUpload) launch(req *sectorUploadReq) (interrupt bool, err error) { - s.mu.Lock() - defer s.mu.Unlock() - - // nothing to do - if req == nil { - return false, nil - } - - // launch the req - err = s.mgr.launch(req) - if err != nil { - interrupt = !req.overdrive && req.sector.numOverdriving() == 0 - span := trace.SpanFromContext(req.sector.ctx) - span.RecordError(err) - span.End() - return - } - - // update the state - s.numInflight++ - s.numLaunched++ - if req.overdrive { - s.lastOverdrive = time.Now() - - req.sector.mu.Lock() - req.sector.overdriving[req.hk] = struct{}{} - req.sector.mu.Unlock() - } - return -} - -func (s *slabUpload) overdrive(ctx context.Context, respChan chan sectorUploadResp) (resetTimer func()) { - // overdrive is disabled - if s.mgr.overdriveTimeout == 0 { - return func() {} - } - - // create a timer to trigger overdrive - timer := time.NewTimer(s.mgr.overdriveTimeout) - resetTimer = func() { - timer.Stop() - select { - case <-timer.C: - default: - } - timer.Reset(s.mgr.overdriveTimeout) - } - - // create a function to check whether overdrive is possible - canOverdrive := func() bool { - s.mu.Lock() - defer s.mu.Unlock() - - // overdrive is not kicking in yet - remaining := uint64(len(s.shards)) - s.numUploaded - if remaining >= s.mgr.maxOverdrive { - return false - } - - // overdrive is not due yet - if time.Since(s.lastOverdrive) < s.mgr.overdriveTimeout { - return false - } - - // overdrive is maxed out - if s.numInflight-remaining >= s.mgr.maxOverdrive { - return false - } - - return true - } - - // try overdriving every time the timer fires - go func() { - for { - select { - case <-ctx.Done(): - return - case <-timer.C: - if canOverdrive() { - _, _ = s.launch(s.nextRequest(respChan)) // ignore result - } - resetTimer() - } - } - }() - - return -} - -func (s *slabUpload) nextRequest(responseChan chan sectorUploadResp) *sectorUploadReq { - s.mu.Lock() - defer s.mu.Unlock() - - // find the sector that's not finished and has the least amount of overdrives - lowestNumOverdrives := math.MaxInt - var nextSector *sectorUpload - for _, sector := range s.sectors { - if !sector.isUploaded() && sector.numOverdriving() < lowestNumOverdrives { - nextSector = sector - } - } - if nextSector == nil { - return nil - } - - return §orUploadReq{ - upload: s.upload, - sID: s.sID, - sector: nextSector, - - overdrive: true, - responseChan: responseChan, - } -} - -func (s *slabUpload) overdriveCnt() int { - s.mu.Lock() - defer s.mu.Unlock() - return int(s.numLaunched) - len(s.sectors) -} - -func (s *slabUpload) overdrivePct() float64 { - s.mu.Lock() - defer s.mu.Unlock() - - numOverdrive := int(s.numLaunched) - len(s.sectors) - if numOverdrive <= 0 { - return 0 - } - - return float64(numOverdrive) / float64(len(s.sectors)) -} - -func (s *slabUpload) receive(resp sectorUploadResp) bool { - s.mu.Lock() - defer s.mu.Unlock() - - // convenience variable - req := resp.req - sector := req.sector - - // update the state - if req.overdrive { - sector.mu.Lock() - delete(sector.overdriving, req.hk) - sector.mu.Unlock() - } - - // failed reqs can't complete the upload - s.numInflight-- - if resp.err != nil { - s.errs[req.hk] = resp.err - return false - } - - // mark uploaders we used for overdrives as unused - sector.mu.Lock() - overdriving := sector.overdriving - sector.overdriving = nil - sector.mu.Unlock() - for fcid := range overdriving { - s.upload.markUnused(req.sID, fcid) - } - - // redundant sectors can't complete the upload - if sector.uploaded.Root != (types.Hash256{}) { - return false - } - - // store the sector - sector.uploaded = object.Sector{ - Contracts: map[types.PublicKey][]types.FileContractID{req.hk: {req.fcid}}, - LatestHost: req.hk, - Root: resp.root, - } - - // cancel the sector context - sector.cancel() - - // update uploaded sectors - s.numUploaded++ - - // release memory - sector.data = nil - s.shards[sector.index] = nil - s.mem.ReleaseSome(rhpv2.SectorSize) - - return s.numUploaded == uint64(len(s.shards)) -} - -func (s *sectorUpload) isUploaded() bool { - return s.uploaded.Root != (types.Hash256{}) -} - -func (s *sectorUpload) numOverdriving() int { - s.mu.Lock() - defer s.mu.Unlock() - return len(s.overdriving) -} - -func (sID slabID) String() string { - return fmt.Sprintf("%x", sID[:]) -} - func newMimeReader(r io.Reader) (mimeType string, recycled io.Reader, err error) { buf := bytes.NewBuffer(nil) mtype, err := mimetype.DetectReader(io.TeeReader(r, buf)) diff --git a/worker/upload_manager.go b/worker/upload_manager.go new file mode 100644 index 000000000..f5c5fc576 --- /dev/null +++ b/worker/upload_manager.go @@ -0,0 +1,1383 @@ +package worker + +import ( + "context" + "errors" + "fmt" + "io" + "math" + "mime" + "path/filepath" + "sort" + "sync" + "time" + + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/trace" + rhpv2 "go.sia.tech/core/rhp/v2" + "go.sia.tech/core/types" + "go.sia.tech/renterd/api" + "go.sia.tech/renterd/object" + "go.sia.tech/renterd/stats" + "go.sia.tech/renterd/tracing" + "go.uber.org/zap" +) + +const ( + statsRecomputeMinInterval = 3 * time.Second + + defaultPackedSlabsLockDuration = 10 * time.Minute + defaultPackedSlabsUploadTimeout = 10 * time.Minute +) + +var ( + errNoCandidateUploader = errors.New("no candidate uploader found") + errNotEnoughContracts = errors.New("not enough contracts to support requested redundancy") + errUploadManagerStopped = errors.New("upload manager stopped") +) + +type ( + uploadManager struct { + b Bus + hp hostProvider + rl revisionLocker + mm memoryManager + logger *zap.SugaredLogger + shutdownCtx context.Context + + maxOverdrive uint64 + overdriveTimeout time.Duration + + statsOverdrivePct *stats.DataPoints + statsSlabUploadSpeedBytesPerMS *stats.DataPoints + + mu sync.Mutex + uploaders []*uploader + } + + uploader struct { + b Bus + + hk types.PublicKey + siamuxAddr string + signalNewUpload chan struct{} + shutdownCtx context.Context + + mu sync.Mutex + bh uint64 + endHeight uint64 + fcid types.FileContractID + host hostV3 + queue []*sectorUploadReq + + // stats related field + consecutiveFailures uint64 + lastRecompute time.Time + + statsSectorUploadEstimateInMS *stats.DataPoints + statsSectorUploadSpeedBytesPerMS *stats.DataPoints + } + + uploadManagerStats struct { + avgSlabUploadSpeedMBPS float64 + avgOverdrivePct float64 + healthyUploaders uint64 + numUploaders uint64 + uploadSpeedsMBPS map[types.PublicKey]float64 + } + + upload struct { + id api.UploadID + allowed map[types.PublicKey]struct{} + lockPriority int + shutdownCtx context.Context + } + + slabUpload struct { + uploadID api.UploadID + created time.Time + lockPriority int + maxOverdrive uint64 + mem *acquiredMemory + overdriveTimeout time.Duration + + candidates []*uploader + shards [][]byte + + mu sync.Mutex + numInflight uint64 + numLaunched uint64 + numUploaded uint64 + + overdriving map[int]map[types.PublicKey]struct{} + lastOverdrive time.Time + numOverdriving uint64 + + sectors map[int]*sectorUpload + used map[types.PublicKey]struct{} + errs HostErrorSet + } + + slabUploadResponse struct { + slab object.SlabSlice + index int + err error + } + + sectorUpload struct { + data *[rhpv2.SectorSize]byte + index int + root types.Hash256 + uploaded object.Sector + + ctx context.Context + cancel context.CancelFunc + } + + sectorUploadReq struct { + lockPriority int + overdrive bool + responseChan chan sectorUploadResp + sector *sectorUpload + uploadID api.UploadID + + // set by the uploader performing the upload + fcid types.FileContractID + hk types.PublicKey + } + + sectorUploadResp struct { + req *sectorUploadReq + root types.Hash256 + err error + } +) + +func (w *worker) initUploadManager(mm memoryManager, maxOverdrive uint64, overdriveTimeout time.Duration, logger *zap.SugaredLogger) { + if w.uploadManager != nil { + panic("upload manager already initialized") // developer error + } + + w.uploadManager = newUploadManager(w.bus, w, w, mm, maxOverdrive, overdriveTimeout, w.shutdownCtx, logger) +} + +func (w *worker) upload(ctx context.Context, r io.Reader, contracts []api.ContractMetadata, up uploadParameters, opts ...UploadOption) (_ string, err error) { + // apply the options + for _, opt := range opts { + opt(&up) + } + + // if not given, try decide on a mime type using the file extension + if !up.multipart && up.mimeType == "" { + up.mimeType = mime.TypeByExtension(filepath.Ext(up.path)) + + // if mime type is still not known, wrap the reader with a mime reader + if up.mimeType == "" { + up.mimeType, r, err = newMimeReader(r) + if err != nil { + return + } + } + } + + // perform the upload + bufferSizeLimitReached, eTag, err := w.uploadManager.Upload(ctx, r, contracts, up, lockingPriorityUpload) + if err != nil { + return "", err + } + + // if packing was enabled try uploading packed slabs + if up.packing { + if err := w.tryUploadPackedSlabs(ctx, up.rs, up.contractSet, bufferSizeLimitReached); err != nil { + w.logger.Errorf("couldn't upload packed slabs, err: %v", err) + } + } + return eTag, nil +} + +func (w *worker) threadedUploadPackedSlabs(rs api.RedundancySettings, contractSet string, lockPriority int) { + key := fmt.Sprintf("%d-%d_%s", rs.MinShards, rs.TotalShards, contractSet) + + w.uploadsMu.Lock() + if w.uploadingPackedSlabs[key] { + w.uploadsMu.Unlock() + return + } + w.uploadingPackedSlabs[key] = true + w.uploadsMu.Unlock() + + // make sure we mark uploading packed slabs as false when we're done + defer func() { + w.uploadsMu.Lock() + w.uploadingPackedSlabs[key] = false + w.uploadsMu.Unlock() + }() + + // keep uploading packed slabs until we're done + ctx := context.WithValue(w.shutdownCtx, keyInteractionRecorder, w) + for { + uploaded, err := w.uploadPackedSlabs(ctx, defaultPackedSlabsLockDuration, rs, contractSet, lockPriority) + if err != nil { + w.logger.Errorf("couldn't upload packed slabs, err: %v", err) + return + } else if uploaded == 0 { + return + } + } +} + +func (w *worker) tryUploadPackedSlabs(ctx context.Context, rs api.RedundancySettings, contractSet string, block bool) (err error) { + // if we want to block, try and upload one packed slab synchronously, we use + // a slightly higher upload priority to avoid reaching the context deadline + if block { + _, err = w.uploadPackedSlabs(ctx, defaultPackedSlabsLockDuration, rs, contractSet, lockingPriorityBlockedUpload) + } + + // make sure there's a goroutine uploading the remainder of the packed slabs + go w.threadedUploadPackedSlabs(rs, contractSet, lockingPriorityBackgroundUpload) + return +} + +func (w *worker) uploadPackedSlabs(ctx context.Context, lockingDuration time.Duration, rs api.RedundancySettings, contractSet string, lockPriority int) (uploaded int, err error) { + // upload packed slabs + var mu sync.Mutex + var errs error + + var wg sync.WaitGroup + totalSize := uint64(rs.TotalShards) * rhpv2.SectorSize + + // derive a context that we can use as an interrupt in case of an error. + interruptCtx, cancel := context.WithCancel(ctx) + defer cancel() + + for { + // block until we have memory for a slab or until we are interrupted + mem := w.uploadManager.mm.AcquireMemory(interruptCtx, totalSize) + if mem == nil { + break // interrupted + } + + // fetch packed slabs to upload + var packedSlabs []api.PackedSlab + packedSlabs, err = w.bus.PackedSlabsForUpload(ctx, lockingDuration, uint8(rs.MinShards), uint8(rs.TotalShards), contractSet, 1) + if err != nil { + err = fmt.Errorf("couldn't fetch packed slabs from bus: %v", err) + mem.Release() + break + } else if len(packedSlabs) == 0 { + mem.Release() + break // no more slabs + } + ps := packedSlabs[0] + + // launch upload for slab + wg.Add(1) + go func(ps api.PackedSlab) { + defer mem.Release() + defer wg.Done() + err := w.uploadPackedSlab(ctx, ps, rs, contractSet, lockPriority, mem) + mu.Lock() + if err != nil { + errs = errors.Join(errs, err) + cancel() // prevent new uploads from being launched + } else { + uploaded++ + } + mu.Unlock() + }(ps) + } + + // wait for all threads to finish + wg.Wait() + + // return collected errors + err = errors.Join(err, errs) + return +} + +func (w *worker) uploadPackedSlab(ctx context.Context, ps api.PackedSlab, rs api.RedundancySettings, contractSet string, lockPriority int, mem *acquiredMemory) error { + // create a context with sane timeout + ctx, cancel := context.WithTimeout(ctx, defaultPackedSlabsUploadTimeout) + defer cancel() + + // fetch contracts + contracts, err := w.bus.ContractSetContracts(ctx, contractSet) + if err != nil { + return fmt.Errorf("couldn't fetch packed slabs from bus: %v", err) + } + + // fetch upload params + up, err := w.bus.UploadParams(ctx) + if err != nil { + return fmt.Errorf("couldn't fetch upload params from bus: %v", err) + } + + // attach gouging checker to the context + ctx = WithGougingChecker(ctx, w.bus, up.GougingParams) + + // upload packed slab + err = w.uploadManager.UploadPackedSlab(ctx, rs, ps, contracts, up.CurrentHeight, lockPriority, mem) + if err != nil { + return fmt.Errorf("couldn't upload packed slab, err: %v", err) + } + + return nil +} + +func newUploadManager(b Bus, hp hostProvider, rl revisionLocker, mm memoryManager, maxOverdrive uint64, overdriveTimeout time.Duration, shutdownCtx context.Context, logger *zap.SugaredLogger) *uploadManager { + return &uploadManager{ + b: b, + hp: hp, + rl: rl, + logger: logger, + mm: mm, + + maxOverdrive: maxOverdrive, + overdriveTimeout: overdriveTimeout, + + statsOverdrivePct: stats.NoDecay(), + statsSlabUploadSpeedBytesPerMS: stats.NoDecay(), + + shutdownCtx: shutdownCtx, + + uploaders: make([]*uploader, 0), + } +} + +func (mgr *uploadManager) newUploader(b Bus, hp hostProvider, c api.ContractMetadata, bh uint64) *uploader { + return &uploader{ + b: b, + + // static + hk: c.HostKey, + siamuxAddr: c.SiamuxAddr, + shutdownCtx: mgr.shutdownCtx, + signalNewUpload: make(chan struct{}, 1), + + // stats + statsSectorUploadEstimateInMS: stats.Default(), + statsSectorUploadSpeedBytesPerMS: stats.NoDecay(), + + // covered by mutex + host: hp.newHostV3(c.ID, c.HostKey, c.SiamuxAddr), + bh: bh, + fcid: c.ID, + endHeight: c.WindowEnd, + queue: make([]*sectorUploadReq, 0), + } +} + +func (mgr *uploadManager) MigrateShards(ctx context.Context, s *object.Slab, shardIndices []int, shards [][]byte, contractSet string, contracts []api.ContractMetadata, bh uint64, lockPriority int, mem *acquiredMemory) error { + // create the upload + upload, err := mgr.newUpload(ctx, len(shards), contracts, bh, lockPriority) + if err != nil { + return err + } + + // track the upload in the bus + if err := mgr.b.TrackUpload(ctx, upload.id); err != nil { + return fmt.Errorf("failed to track upload '%v', err: %w", upload.id, err) + } + + // defer a function that finishes the upload + defer func() { + ctx, cancel := context.WithTimeout(mgr.shutdownCtx, time.Minute) + if err := mgr.b.FinishUpload(ctx, upload.id); err != nil { + mgr.logger.Errorf("failed to mark upload %v as finished: %v", upload.id, err) + } + cancel() + }() + + // upload the shards + uploaded, overdrivePct, overdriveSpeed, err := upload.uploadShards(ctx, shards, mgr.candidates(upload.allowed), mem, mgr.maxOverdrive, mgr.overdriveTimeout) + if err != nil { + return err + } + + // track stats + mgr.statsOverdrivePct.Track(overdrivePct) + mgr.statsSlabUploadSpeedBytesPerMS.Track(float64(overdriveSpeed)) + + // overwrite the shards with the newly uploaded ones + for i, si := range shardIndices { + s.Shards[si].LatestHost = uploaded[i].LatestHost + + knownContracts := make(map[types.FileContractID]struct{}) + for _, fcids := range s.Shards[si].Contracts { + for _, fcid := range fcids { + knownContracts[fcid] = struct{}{} + } + } + for hk, fcids := range uploaded[i].Contracts { + for _, fcid := range fcids { + if _, exists := knownContracts[fcid]; !exists { + if s.Shards[si].Contracts == nil { + s.Shards[si].Contracts = make(map[types.PublicKey][]types.FileContractID) + } + s.Shards[si].Contracts[hk] = append(s.Shards[si].Contracts[hk], fcid) + } + } + } + } + + // update the slab + return mgr.b.UpdateSlab(ctx, *s, contractSet) +} + +func (mgr *uploadManager) Stats() uploadManagerStats { + mgr.mu.Lock() + defer mgr.mu.Unlock() + + var numHealthy uint64 + speeds := make(map[types.PublicKey]float64) + for _, u := range mgr.uploaders { + u.tryRecomputeStats() + speeds[u.hk] = u.statsSectorUploadSpeedBytesPerMS.Average() * 0.008 + if u.healthy() { + numHealthy++ + } + } + + // prepare stats + return uploadManagerStats{ + avgSlabUploadSpeedMBPS: mgr.statsSlabUploadSpeedBytesPerMS.Average() * 0.008, // convert bytes per ms to mbps, + avgOverdrivePct: mgr.statsOverdrivePct.Average(), + healthyUploaders: numHealthy, + numUploaders: uint64(len(speeds)), + uploadSpeedsMBPS: speeds, + } +} + +func (mgr *uploadManager) Stop() { + mgr.mu.Lock() + defer mgr.mu.Unlock() + for _, u := range mgr.uploaders { + u.Stop() + } +} + +func (mgr *uploadManager) Upload(ctx context.Context, r io.Reader, contracts []api.ContractMetadata, up uploadParameters, lockPriority int) (bufferSizeLimitReached bool, eTag string, err error) { + // cancel all in-flight requests when the upload is done + ctx, cancel := context.WithCancel(ctx) + defer cancel() + + // add tracing + ctx, span := tracing.Tracer.Start(ctx, "upload") + defer func() { + span.RecordError(err) + span.End() + }() + + // create the object + o := object.NewObject(up.ec) + + // create the hash reader + hr := newHashReader(r) + + // create the cipher reader + cr, err := o.Encrypt(hr, up.encryptionOffset) + if err != nil { + return false, "", err + } + + // create the upload + upload, err := mgr.newUpload(ctx, up.rs.TotalShards, contracts, up.bh, lockPriority) + if err != nil { + return false, "", err + } + + // track the upload in the bus + if err := mgr.b.TrackUpload(ctx, upload.id); err != nil { + return false, "", fmt.Errorf("failed to track upload '%v', err: %w", upload.id, err) + } + + // defer a function that finishes the upload + defer func() { + ctx, cancel := context.WithTimeout(mgr.shutdownCtx, time.Minute) + if err := mgr.b.FinishUpload(ctx, upload.id); err != nil { + mgr.logger.Errorf("failed to mark upload %v as finished: %v", upload.id, err) + } + cancel() + }() + + // create the response channel + respChan := make(chan slabUploadResponse) + + // channel to notify main thread of the number of slabs to wait for + numSlabsChan := make(chan int, 1) + + // prepare slab size + size := int64(up.rs.MinShards) * rhpv2.SectorSize + redundantSize := uint64(up.rs.TotalShards) * rhpv2.SectorSize + var partialSlab []byte + + // launch uploads in a separate goroutine + go func() { + var slabIndex int + for { + select { + case <-mgr.shutdownCtx.Done(): + return // interrupted + case <-ctx.Done(): + return // interrupted + default: + } + // acquire memory + mem := mgr.mm.AcquireMemory(ctx, redundantSize) + if mem == nil { + return // interrupted + } + + // read next slab's data + data := make([]byte, size) + length, err := io.ReadFull(io.LimitReader(cr, size), data) + if err == io.EOF { + mem.Release() + + // no more data to upload, notify main thread of the number of + // slabs to wait for + numSlabs := slabIndex + if partialSlab != nil && slabIndex > 0 { + numSlabs-- // don't wait on partial slab + } + numSlabsChan <- numSlabs + return + } else if err != nil && err != io.ErrUnexpectedEOF { + mem.Release() + + // unexpected error, notify main thread + select { + case respChan <- slabUploadResponse{err: err}: + case <-ctx.Done(): + } + return + } else if up.packing && errors.Is(err, io.ErrUnexpectedEOF) { + mem.Release() + + // uploadPacking is true, we return the partial slab without + // uploading. + partialSlab = data[:length] + } else { + // regular upload + go func(rs api.RedundancySettings, data []byte, length, slabIndex int) { + upload.uploadSlab(ctx, rs, data, length, slabIndex, respChan, mgr.candidates(upload.allowed), mem, mgr.maxOverdrive, mgr.overdriveTimeout) + mem.Release() + }(up.rs, data, length, slabIndex) + } + slabIndex++ + } + }() + + // collect responses + var responses []slabUploadResponse + numSlabs := math.MaxInt32 + for len(responses) < numSlabs { + select { + case <-mgr.shutdownCtx.Done(): + return false, "", errUploadManagerStopped + case numSlabs = <-numSlabsChan: + case res := <-respChan: + if res.err != nil { + return false, "", res.err + } + responses = append(responses, res) + } + } + + // sort the slabs by index + sort.Slice(responses, func(i, j int) bool { + return responses[i].index < responses[j].index + }) + + // decorate the object with the slabs + for _, resp := range responses { + o.Slabs = append(o.Slabs, resp.slab) + } + + // calculate the eTag + eTag = hr.Hash() + + // add partial slabs + if len(partialSlab) > 0 { + var pss []object.SlabSlice + pss, bufferSizeLimitReached, err = mgr.b.AddPartialSlab(ctx, partialSlab, uint8(up.rs.MinShards), uint8(up.rs.TotalShards), up.contractSet) + if err != nil { + return false, "", err + } + o.Slabs = append(o.Slabs, pss...) + } + + if up.multipart { + // persist the part + err = mgr.b.AddMultipartPart(ctx, up.bucket, up.path, up.contractSet, eTag, up.uploadID, up.partNumber, o.Slabs) + if err != nil { + return bufferSizeLimitReached, "", fmt.Errorf("couldn't add multi part: %w", err) + } + } else { + // persist the object + err = mgr.b.AddObject(ctx, up.bucket, up.path, up.contractSet, o, api.AddObjectOptions{MimeType: up.mimeType, ETag: eTag}) + if err != nil { + return bufferSizeLimitReached, "", fmt.Errorf("couldn't add object: %w", err) + } + } + + return +} + +func (mgr *uploadManager) UploadPackedSlab(ctx context.Context, rs api.RedundancySettings, ps api.PackedSlab, contracts []api.ContractMetadata, bh uint64, lockPriority int, mem *acquiredMemory) error { + // build the shards + shards := encryptPartialSlab(ps.Data, ps.Key, uint8(rs.MinShards), uint8(rs.TotalShards)) + + // create the upload + upload, err := mgr.newUpload(ctx, len(shards), contracts, bh, lockPriority) + if err != nil { + return err + } + + // track the upload in the bus + if err := mgr.b.TrackUpload(ctx, upload.id); err != nil { + return fmt.Errorf("failed to track upload '%v', err: %w", upload.id, err) + } + + // defer a function that finishes the upload + defer func() { + ctx, cancel := context.WithTimeout(mgr.shutdownCtx, time.Minute) + if err := mgr.b.FinishUpload(ctx, upload.id); err != nil { + mgr.logger.Errorf("failed to mark upload %v as finished: %v", upload.id, err) + } + cancel() + }() + + // upload the shards + sectors, overdrivePct, overdriveSpeed, err := upload.uploadShards(ctx, shards, mgr.candidates(upload.allowed), mem, mgr.maxOverdrive, mgr.overdriveTimeout) + if err != nil { + return err + } + + // track stats + mgr.statsOverdrivePct.Track(overdrivePct) + mgr.statsSlabUploadSpeedBytesPerMS.Track(float64(overdriveSpeed)) + + // mark packed slab as uploaded + slab := api.UploadedPackedSlab{BufferID: ps.BufferID, Shards: sectors} + err = mgr.b.MarkPackedSlabsUploaded(ctx, []api.UploadedPackedSlab{slab}) + if err != nil { + return fmt.Errorf("couldn't mark packed slabs uploaded, err: %v", err) + } + + return nil +} + +func (mgr *uploadManager) candidates(allowed map[types.PublicKey]struct{}) (candidates []*uploader) { + mgr.mu.Lock() + defer mgr.mu.Unlock() + + for _, u := range mgr.uploaders { + if _, allowed := allowed[u.hk]; allowed { + candidates = append(candidates, u) + } + } + return +} + +func (mgr *uploadManager) newUpload(ctx context.Context, totalShards int, contracts []api.ContractMetadata, bh uint64, lockPriority int) (*upload, error) { + mgr.mu.Lock() + defer mgr.mu.Unlock() + + // refresh the uploaders + mgr.refreshUploaders(contracts, bh) + + // check if we have enough contracts + if len(contracts) < totalShards { + return nil, fmt.Errorf("%v < %v: %w", len(contracts), totalShards, errNotEnoughContracts) + } + + // create allowed map + allowed := make(map[types.PublicKey]struct{}) + for _, c := range contracts { + allowed[c.HostKey] = struct{}{} + } + + // create upload + return &upload{ + id: api.NewUploadID(), + allowed: allowed, + lockPriority: lockPriority, + shutdownCtx: mgr.shutdownCtx, + }, nil +} + +func (mgr *uploadManager) refreshUploaders(contracts []api.ContractMetadata, bh uint64) { + // build map of contracts + toKeep := make(map[types.FileContractID]api.ContractMetadata) + for _, c := range contracts { + toKeep[c.ID] = c + } + + // build map of renewed contracts + renewedTo := make(map[types.FileContractID]api.ContractMetadata) + for _, c := range contracts { + if c.RenewedFrom != (types.FileContractID{}) { + renewedTo[c.RenewedFrom] = c + } + } + + // keep list of uploaders uploaders + var uploaders []*uploader + for _, uploader := range mgr.uploaders { + fcid := uploader.contractID() + + renewal, renewed := renewedTo[fcid] + if _, keep := toKeep[fcid]; !(keep || renewed) { + uploader.Stop() + continue + } + delete(toKeep, fcid) // toKeep becomes missing + + if renewed { + uploader.renew(mgr.hp, renewal, bh) + } else { + uploader.updateBlockHeight(bh) + } + uploaders = append(uploaders, uploader) + } + + for _, c := range toKeep { + uploader := mgr.newUploader(mgr.b, mgr.hp, c, bh) + uploaders = append(uploaders, uploader) + go uploader.Start(mgr.hp, mgr.rl) + } + + mgr.uploaders = uploaders +} + +func (u *uploader) SignalWork() { + select { + case u.signalNewUpload <- struct{}{}: + default: + } +} + +func (u *uploader) Start(hp hostProvider, rl revisionLocker) { +outer: + for { + // wait for work + select { + case <-u.signalNewUpload: + case <-u.shutdownCtx.Done(): + return + } + + for { + // check if we are stopped + select { + case <-u.shutdownCtx.Done(): + return + default: + } + + // pop the next upload req + req := u.pop() + if req == nil { + continue outer + } + + // skip if upload is done + if req.done() { + continue + } + + // execute it + var root types.Hash256 + start := time.Now() + fcid := u.contractID() + err := rl.withRevision(req.sector.ctx, defaultRevisionFetchTimeout, fcid, u.hk, u.siamuxAddr, req.lockPriority, u.blockHeight(), func(rev types.FileContractRevision) error { + if rev.RevisionNumber == math.MaxUint64 { + return errMaxRevisionReached + } + + var err error + root, err = u.execute(req, rev) + return err + }) + + // the uploader's contract got renewed, requeue the request + if errors.Is(err, errMaxRevisionReached) { + u.enqueue(req) + continue outer + } + + // send the response + if err != nil { + req.fail(err) + } else { + req.succeed(root) + } + + // track the error, ignore gracefully closed streams and canceled overdrives + canceledOverdrive := req.done() && req.overdrive && err != nil + if !canceledOverdrive && !isClosedStream(err) { + u.trackSectorUpload(err, time.Since(start)) + } + } + } +} + +func (u *uploader) healthy() bool { + u.mu.Lock() + defer u.mu.Unlock() + return u.consecutiveFailures == 0 +} + +func (u *uploader) Stop() { + for { + upload := u.pop() + if upload == nil { + break + } + if !upload.done() { + upload.fail(errors.New("uploader stopped")) + } + } +} + +func (u *uploader) blockHeight() uint64 { + u.mu.Lock() + defer u.mu.Unlock() + return u.bh +} + +func (u *uploader) contractID() types.FileContractID { + u.mu.Lock() + defer u.mu.Unlock() + return u.fcid +} + +func (u *uploader) enqueue(req *sectorUploadReq) { + // trace the request + span := trace.SpanFromContext(req.sector.ctx) + span.SetAttributes(attribute.Stringer("hk", u.hk)) + span.AddEvent("enqueued") + + // decorate the request + req.fcid = u.contractID() + req.hk = u.hk + + // enqueue the request + u.mu.Lock() + u.queue = append(u.queue, req) + u.mu.Unlock() + + // signal there's work + u.SignalWork() +} + +func (u *uploader) estimate() float64 { + u.mu.Lock() + defer u.mu.Unlock() + + // fetch estimated duration per sector + estimateP90 := u.statsSectorUploadEstimateInMS.P90() + if estimateP90 == 0 { + estimateP90 = 1 + } + + // calculate estimated time + numSectors := float64(len(u.queue) + 1) + return numSectors * estimateP90 +} + +func (u *uploader) execute(req *sectorUploadReq, rev types.FileContractRevision) (types.Hash256, error) { + u.mu.Lock() + host := u.host + fcid := u.fcid + u.mu.Unlock() + + // fetch span from context + span := trace.SpanFromContext(req.sector.ctx) + span.AddEvent("execute") + + // update the bus + if err := u.b.AddUploadingSector(req.sector.ctx, req.uploadID, fcid, req.sector.root); err != nil { + return types.Hash256{}, fmt.Errorf("failed to add uploading sector to contract %v, err: %v", fcid, err) + } + + // upload the sector + start := time.Now() + root, err := host.UploadSector(req.sector.ctx, req.sector.data, rev) + if err != nil { + return types.Hash256{}, err + } + + // update span + elapsed := time.Since(start) + span.SetAttributes(attribute.Int64("duration", elapsed.Milliseconds())) + span.RecordError(err) + span.End() + + return root, nil +} + +func (u *uploader) pop() *sectorUploadReq { + u.mu.Lock() + defer u.mu.Unlock() + + if len(u.queue) > 0 { + j := u.queue[0] + u.queue[0] = nil + u.queue = u.queue[1:] + return j + } + return nil +} + +func (u *uploader) renew(hp hostProvider, c api.ContractMetadata, bh uint64) { + u.mu.Lock() + defer u.mu.Unlock() + + u.bh = bh + u.host = hp.newHostV3(c.ID, c.HostKey, c.SiamuxAddr) + u.fcid = c.ID + u.siamuxAddr = c.SiamuxAddr + u.endHeight = c.WindowEnd +} + +func (u *uploader) trackSectorUpload(err error, d time.Duration) { + u.mu.Lock() + defer u.mu.Unlock() + if err != nil { + u.consecutiveFailures++ + u.statsSectorUploadEstimateInMS.Track(float64(time.Hour.Milliseconds())) + } else { + ms := d.Milliseconds() + u.consecutiveFailures = 0 + u.statsSectorUploadEstimateInMS.Track(float64(ms)) // duration in ms + u.statsSectorUploadSpeedBytesPerMS.Track(float64(rhpv2.SectorSize / ms)) // bytes per ms + } +} + +func (u *uploader) tryRecomputeStats() { + u.mu.Lock() + defer u.mu.Unlock() + if time.Since(u.lastRecompute) < statsRecomputeMinInterval { + return + } + + u.lastRecompute = time.Now() + u.statsSectorUploadEstimateInMS.Recompute() + u.statsSectorUploadSpeedBytesPerMS.Recompute() +} + +func (u *uploader) updateBlockHeight(bh uint64) { + u.mu.Lock() + defer u.mu.Unlock() + u.bh = bh +} + +func (u *upload) newSlabUpload(ctx context.Context, shards [][]byte, candidates []*uploader, mem *acquiredMemory, maxOverdrive uint64, overdriveTimeout time.Duration) (*slabUpload, []*sectorUploadReq, chan sectorUploadResp) { + // create slab upload + slab := &slabUpload{ + lockPriority: u.lockPriority, + uploadID: u.id, + created: time.Now(), + maxOverdrive: maxOverdrive, + mem: mem, + overdriveTimeout: overdriveTimeout, + + candidates: candidates, + shards: shards, + + overdriving: make(map[int]map[types.PublicKey]struct{}), + sectors: make(map[int]*sectorUpload, len(shards)), + used: make(map[types.PublicKey]struct{}), + errs: make(HostErrorSet), + } + + // prepare sector uploads + responseChan := make(chan sectorUploadResp) + requests := make([]*sectorUploadReq, len(shards)) + for sI, shard := range shards { + // create the ctx + sCtx, sCancel := context.WithCancel(ctx) + + // attach the upload's span + sCtx, span := tracing.Tracer.Start(sCtx, "uploadSector") + span.SetAttributes(attribute.Bool("overdrive", false)) + span.SetAttributes(attribute.Int("sector", sI)) + + // create the sector + sector := §orUpload{ + data: (*[rhpv2.SectorSize]byte)(shard), + index: sI, + root: rhpv2.SectorRoot((*[rhpv2.SectorSize]byte)(shard)), + + ctx: sCtx, + cancel: sCancel, + } + slab.sectors[sI] = sector + + // create the request + requests[sI] = §orUploadReq{ + lockPriority: slab.lockPriority, + overdrive: false, + responseChan: responseChan, + sector: sector, + uploadID: slab.uploadID, + } + } + + return slab, requests, responseChan +} + +func (u *upload) uploadSlab(ctx context.Context, rs api.RedundancySettings, data []byte, length, index int, respChan chan slabUploadResponse, candidates []*uploader, mem *acquiredMemory, maxOverdrive uint64, overdriveTimeout time.Duration) (overdrivePct float64, overdriveSpeed int64) { + // add tracing + ctx, span := tracing.Tracer.Start(ctx, "uploadSlab") + defer span.End() + + // create the response + resp := slabUploadResponse{ + slab: object.SlabSlice{ + Slab: object.NewSlab(uint8(rs.MinShards)), + Offset: 0, + Length: uint32(length), + }, + index: index, + } + + // create the shards + shards := make([][]byte, rs.TotalShards) + resp.slab.Slab.Encode(data, shards) + resp.slab.Slab.Encrypt(shards) + + // upload the shards + resp.slab.Slab.Shards, overdrivePct, overdriveSpeed, resp.err = u.uploadShards(ctx, shards, candidates, mem, maxOverdrive, overdriveTimeout) + + // send the response + select { + case <-ctx.Done(): + case respChan <- resp: + } + + return +} + +func (u *upload) uploadShards(ctx context.Context, shards [][]byte, candidates []*uploader, mem *acquiredMemory, maxOverdrive uint64, overdriveTimeout time.Duration) ([]object.Sector, float64, int64, error) { + // add tracing + ctx, span := tracing.Tracer.Start(ctx, "uploadShards") + defer span.End() + + // ensure inflight uploads get cancelled + ctx, cancel := context.WithCancel(ctx) + defer cancel() + + // prepare the upload + slab, requests, respChan := u.newSlabUpload(ctx, shards, candidates, mem, maxOverdrive, overdriveTimeout) + + // launch all shard uploads + for _, upload := range requests { + if _, err := slab.launch(upload); err != nil { + return nil, 0, 0, err + } + } + + // launch overdrive + resetOverdrive := slab.overdrive(ctx, respChan) + + // collect responses + var done bool + for slab.inflight() > 0 && !done { + var resp sectorUploadResp + select { + case <-u.shutdownCtx.Done(): + return nil, 0, 0, errors.New("upload stopped") + case <-ctx.Done(): + return nil, 0, 0, ctx.Err() + case resp = <-respChan: + } + + resetOverdrive() + + // receive the response + done = slab.receive(resp) + + // relaunch non-overdrive uploads + if !done && resp.err != nil && !resp.req.overdrive { + if overdriving, err := slab.launch(resp.req); err != nil { + if !overdriving { + break // fail the upload + } + } + } + } + + // register the amount of overdrive sectors + span.SetAttributes(attribute.Int("overdrive", slab.overdriveCnt())) + + sectors, err := slab.finish() + return sectors, slab.overdrivePct(), slab.uploadSpeed(), err +} + +func (s *slabUpload) uploadSpeed() int64 { + s.mu.Lock() + defer s.mu.Unlock() + bytes := s.numUploaded * rhpv2.SectorSize + ms := time.Since(s.created).Milliseconds() + return int64(bytes) / ms +} + +func (s *slabUpload) finish() (sectors []object.Sector, _ error) { + s.mu.Lock() + defer s.mu.Unlock() + + if s.numUploaded < uint64(len(s.shards)) { + remaining := uint64(len(s.shards)) - s.numUploaded + return nil, fmt.Errorf("failed to upload slab: launched=%d uploaded=%d remaining=%d inflight=%d uploaders=%d errors=%d %w", s.numLaunched, s.numUploaded, remaining, s.numInflight, len(s.candidates), len(s.errs), s.errs) + } + + for i := 0; i < len(s.shards); i++ { + sectors = append(sectors, s.sectors[i].uploaded) + } + return +} + +func (s *slabUpload) inflight() uint64 { + s.mu.Lock() + defer s.mu.Unlock() + return s.numInflight +} + +func (s *slabUpload) launch(req *sectorUploadReq) (interrupt bool, err error) { + s.mu.Lock() + defer s.mu.Unlock() + + // nothing to do + if req == nil { + return false, nil + } + + // find candidate candidate + var candidate *uploader + for _, uploader := range s.candidates { + if _, used := s.used[uploader.hk]; used { + continue + } + if candidate == nil || uploader.estimate() < candidate.estimate() { + candidate = uploader + } + } + + // no candidate found + if candidate == nil { + err = errNoCandidateUploader + interrupt = !req.overdrive && len(s.overdriving[req.sector.index]) == 0 + span := trace.SpanFromContext(req.sector.ctx) + span.RecordError(err) + span.End() + return + } + + // enqueue the req + candidate.enqueue(req) + + // update the state + s.numInflight++ + s.numLaunched++ + s.used[req.hk] = struct{}{} + + if req.overdrive { + s.lastOverdrive = time.Now() + s.numOverdriving++ + + if _, exists := s.overdriving[req.sector.index]; !exists { + s.overdriving[req.sector.index] = make(map[types.PublicKey]struct{}) + } + s.overdriving[req.sector.index][req.hk] = struct{}{} + } + return +} + +func (s *slabUpload) overdrive(ctx context.Context, respChan chan sectorUploadResp) (resetTimer func()) { + // overdrive is disabled + if s.overdriveTimeout == 0 { + return func() {} + } + + // create a timer to trigger overdrive + timer := time.NewTimer(s.overdriveTimeout) + resetTimer = func() { + timer.Stop() + select { + case <-timer.C: + default: + } + timer.Reset(s.overdriveTimeout) + } + + // create a function to check whether overdrive is possible + canOverdrive := func() bool { + s.mu.Lock() + defer s.mu.Unlock() + + // overdrive is not kicking in yet + remaining := uint64(len(s.shards)) - s.numUploaded + if remaining >= s.maxOverdrive { + return false + } + + // overdrive is not due yet + if time.Since(s.lastOverdrive) < s.overdriveTimeout { + return false + } + + // overdrive is maxed out + if s.numInflight-remaining >= s.maxOverdrive { + return false + } + + return true + } + + // try overdriving every time the timer fires + go func() { + for { + select { + case <-ctx.Done(): + return + case <-timer.C: + if canOverdrive() { + _, _ = s.launch(s.nextRequest(respChan)) // ignore result + } + resetTimer() + } + } + }() + + return +} + +func (s *slabUpload) nextRequest(responseChan chan sectorUploadResp) *sectorUploadReq { + s.mu.Lock() + defer s.mu.Unlock() + + // find the sector that's not finished and has the least amount of overdrives + lowestNumOverdrives := math.MaxInt + var nextSector *sectorUpload + for _, sector := range s.sectors { + if !sector.isUploaded() && len(s.overdriving[sector.index]) < lowestNumOverdrives { + nextSector = sector + } + } + if nextSector == nil { + return nil + } + + return §orUploadReq{ + lockPriority: s.lockPriority, + overdrive: true, + responseChan: responseChan, + sector: nextSector, + uploadID: s.uploadID, + } +} + +func (s *slabUpload) overdriveCnt() int { + s.mu.Lock() + defer s.mu.Unlock() + return int(s.numLaunched) - len(s.sectors) +} + +func (s *slabUpload) overdrivePct() float64 { + s.mu.Lock() + defer s.mu.Unlock() + + numOverdrive := int(s.numLaunched) - len(s.sectors) + if numOverdrive <= 0 { + return 0 + } + + return float64(numOverdrive) / float64(len(s.sectors)) +} + +func (s *slabUpload) receive(resp sectorUploadResp) bool { + s.mu.Lock() + defer s.mu.Unlock() + + // convenience variable + req := resp.req + sector := req.sector + + // update the state + if req.overdrive { + s.numOverdriving-- + } + s.numInflight-- + + // failed reqs can't complete the upload + if resp.err != nil { + s.errs[req.hk] = resp.err + return false + } + + // redundant sectors can't complete the upload + if sector.uploaded.Root != (types.Hash256{}) { + return false + } + + // store the sector + sector.uploaded = object.Sector{ + Contracts: map[types.PublicKey][]types.FileContractID{req.hk: {req.fcid}}, + LatestHost: req.hk, + Root: resp.root, + } + + // update uploaded sectors + s.numUploaded++ + + // cancel the sector context + sector.cancel() + + // free hosts we're using to overdrive this sector + for hk := range s.overdriving[req.sector.index] { + delete(s.used, hk) + } + + // release memory + sector.data = nil + s.shards[sector.index] = nil + s.mem.ReleaseSome(rhpv2.SectorSize) + + return s.numUploaded == uint64(len(s.shards)) +} + +func (req *sectorUploadReq) done() bool { + select { + case <-req.sector.ctx.Done(): + return true + default: + return false + } +} + +func (s *sectorUpload) isUploaded() bool { + return s.uploaded.Root != (types.Hash256{}) +} + +func (req *sectorUploadReq) fail(err error) { + select { + case <-req.sector.ctx.Done(): + case req.responseChan <- sectorUploadResp{ + req: req, + err: err, + }: + } +} + +func (req *sectorUploadReq) succeed(root types.Hash256) { + select { + case <-req.sector.ctx.Done(): + case req.responseChan <- sectorUploadResp{ + req: req, + root: root, + }: + } +} From 29e455f08627d0d64056f86cf2511eccfbfe695d Mon Sep 17 00:00:00 2001 From: PJ Date: Thu, 7 Dec 2023 14:40:46 +0100 Subject: [PATCH 11/25] worker: no changes, fixes diff --- worker/upload.go | 1400 +++++++++++++++++++++++++++++++++++--- worker/upload_manager.go | 1383 ------------------------------------- worker/upload_params.go | 135 ++++ 3 files changed, 1459 insertions(+), 1459 deletions(-) delete mode 100644 worker/upload_manager.go create mode 100644 worker/upload_params.go diff --git a/worker/upload.go b/worker/upload.go index 8e033b287..f5c5fc576 100644 --- a/worker/upload.go +++ b/worker/upload.go @@ -1,135 +1,1383 @@ package worker import ( - "bytes" - "encoding/hex" + "context" + "errors" + "fmt" "io" + "math" + "mime" + "path/filepath" + "sort" + "sync" + "time" - "github.com/gabriel-vasile/mimetype" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/trace" + rhpv2 "go.sia.tech/core/rhp/v2" "go.sia.tech/core/types" "go.sia.tech/renterd/api" - "go.sia.tech/renterd/build" "go.sia.tech/renterd/object" + "go.sia.tech/renterd/stats" + "go.sia.tech/renterd/tracing" + "go.uber.org/zap" ) -type uploadParameters struct { - bucket string - path string +const ( + statsRecomputeMinInterval = 3 * time.Second - multipart bool - uploadID string - partNumber int + defaultPackedSlabsLockDuration = 10 * time.Minute + defaultPackedSlabsUploadTimeout = 10 * time.Minute +) + +var ( + errNoCandidateUploader = errors.New("no candidate uploader found") + errNotEnoughContracts = errors.New("not enough contracts to support requested redundancy") + errUploadManagerStopped = errors.New("upload manager stopped") +) + +type ( + uploadManager struct { + b Bus + hp hostProvider + rl revisionLocker + mm memoryManager + logger *zap.SugaredLogger + shutdownCtx context.Context + + maxOverdrive uint64 + overdriveTimeout time.Duration + + statsOverdrivePct *stats.DataPoints + statsSlabUploadSpeedBytesPerMS *stats.DataPoints + + mu sync.Mutex + uploaders []*uploader + } + + uploader struct { + b Bus + + hk types.PublicKey + siamuxAddr string + signalNewUpload chan struct{} + shutdownCtx context.Context + + mu sync.Mutex + bh uint64 + endHeight uint64 + fcid types.FileContractID + host hostV3 + queue []*sectorUploadReq + + // stats related field + consecutiveFailures uint64 + lastRecompute time.Time + + statsSectorUploadEstimateInMS *stats.DataPoints + statsSectorUploadSpeedBytesPerMS *stats.DataPoints + } + + uploadManagerStats struct { + avgSlabUploadSpeedMBPS float64 + avgOverdrivePct float64 + healthyUploaders uint64 + numUploaders uint64 + uploadSpeedsMBPS map[types.PublicKey]float64 + } + + upload struct { + id api.UploadID + allowed map[types.PublicKey]struct{} + lockPriority int + shutdownCtx context.Context + } + + slabUpload struct { + uploadID api.UploadID + created time.Time + lockPriority int + maxOverdrive uint64 + mem *acquiredMemory + overdriveTimeout time.Duration + + candidates []*uploader + shards [][]byte + + mu sync.Mutex + numInflight uint64 + numLaunched uint64 + numUploaded uint64 + + overdriving map[int]map[types.PublicKey]struct{} + lastOverdrive time.Time + numOverdriving uint64 + + sectors map[int]*sectorUpload + used map[types.PublicKey]struct{} + errs HostErrorSet + } + + slabUploadResponse struct { + slab object.SlabSlice + index int + err error + } + + sectorUpload struct { + data *[rhpv2.SectorSize]byte + index int + root types.Hash256 + uploaded object.Sector + + ctx context.Context + cancel context.CancelFunc + } + + sectorUploadReq struct { + lockPriority int + overdrive bool + responseChan chan sectorUploadResp + sector *sectorUpload + uploadID api.UploadID + + // set by the uploader performing the upload + fcid types.FileContractID + hk types.PublicKey + } + + sectorUploadResp struct { + req *sectorUploadReq + root types.Hash256 + err error + } +) + +func (w *worker) initUploadManager(mm memoryManager, maxOverdrive uint64, overdriveTimeout time.Duration, logger *zap.SugaredLogger) { + if w.uploadManager != nil { + panic("upload manager already initialized") // developer error + } + + w.uploadManager = newUploadManager(w.bus, w, w, mm, maxOverdrive, overdriveTimeout, w.shutdownCtx, logger) +} + +func (w *worker) upload(ctx context.Context, r io.Reader, contracts []api.ContractMetadata, up uploadParameters, opts ...UploadOption) (_ string, err error) { + // apply the options + for _, opt := range opts { + opt(&up) + } + + // if not given, try decide on a mime type using the file extension + if !up.multipart && up.mimeType == "" { + up.mimeType = mime.TypeByExtension(filepath.Ext(up.path)) + + // if mime type is still not known, wrap the reader with a mime reader + if up.mimeType == "" { + up.mimeType, r, err = newMimeReader(r) + if err != nil { + return + } + } + } + + // perform the upload + bufferSizeLimitReached, eTag, err := w.uploadManager.Upload(ctx, r, contracts, up, lockingPriorityUpload) + if err != nil { + return "", err + } + + // if packing was enabled try uploading packed slabs + if up.packing { + if err := w.tryUploadPackedSlabs(ctx, up.rs, up.contractSet, bufferSizeLimitReached); err != nil { + w.logger.Errorf("couldn't upload packed slabs, err: %v", err) + } + } + return eTag, nil +} + +func (w *worker) threadedUploadPackedSlabs(rs api.RedundancySettings, contractSet string, lockPriority int) { + key := fmt.Sprintf("%d-%d_%s", rs.MinShards, rs.TotalShards, contractSet) + + w.uploadsMu.Lock() + if w.uploadingPackedSlabs[key] { + w.uploadsMu.Unlock() + return + } + w.uploadingPackedSlabs[key] = true + w.uploadsMu.Unlock() + + // make sure we mark uploading packed slabs as false when we're done + defer func() { + w.uploadsMu.Lock() + w.uploadingPackedSlabs[key] = false + w.uploadsMu.Unlock() + }() + + // keep uploading packed slabs until we're done + ctx := context.WithValue(w.shutdownCtx, keyInteractionRecorder, w) + for { + uploaded, err := w.uploadPackedSlabs(ctx, defaultPackedSlabsLockDuration, rs, contractSet, lockPriority) + if err != nil { + w.logger.Errorf("couldn't upload packed slabs, err: %v", err) + return + } else if uploaded == 0 { + return + } + } +} + +func (w *worker) tryUploadPackedSlabs(ctx context.Context, rs api.RedundancySettings, contractSet string, block bool) (err error) { + // if we want to block, try and upload one packed slab synchronously, we use + // a slightly higher upload priority to avoid reaching the context deadline + if block { + _, err = w.uploadPackedSlabs(ctx, defaultPackedSlabsLockDuration, rs, contractSet, lockingPriorityBlockedUpload) + } + + // make sure there's a goroutine uploading the remainder of the packed slabs + go w.threadedUploadPackedSlabs(rs, contractSet, lockingPriorityBackgroundUpload) + return +} + +func (w *worker) uploadPackedSlabs(ctx context.Context, lockingDuration time.Duration, rs api.RedundancySettings, contractSet string, lockPriority int) (uploaded int, err error) { + // upload packed slabs + var mu sync.Mutex + var errs error + + var wg sync.WaitGroup + totalSize := uint64(rs.TotalShards) * rhpv2.SectorSize + + // derive a context that we can use as an interrupt in case of an error. + interruptCtx, cancel := context.WithCancel(ctx) + defer cancel() + + for { + // block until we have memory for a slab or until we are interrupted + mem := w.uploadManager.mm.AcquireMemory(interruptCtx, totalSize) + if mem == nil { + break // interrupted + } + + // fetch packed slabs to upload + var packedSlabs []api.PackedSlab + packedSlabs, err = w.bus.PackedSlabsForUpload(ctx, lockingDuration, uint8(rs.MinShards), uint8(rs.TotalShards), contractSet, 1) + if err != nil { + err = fmt.Errorf("couldn't fetch packed slabs from bus: %v", err) + mem.Release() + break + } else if len(packedSlabs) == 0 { + mem.Release() + break // no more slabs + } + ps := packedSlabs[0] + + // launch upload for slab + wg.Add(1) + go func(ps api.PackedSlab) { + defer mem.Release() + defer wg.Done() + err := w.uploadPackedSlab(ctx, ps, rs, contractSet, lockPriority, mem) + mu.Lock() + if err != nil { + errs = errors.Join(errs, err) + cancel() // prevent new uploads from being launched + } else { + uploaded++ + } + mu.Unlock() + }(ps) + } + + // wait for all threads to finish + wg.Wait() + + // return collected errors + err = errors.Join(err, errs) + return +} + +func (w *worker) uploadPackedSlab(ctx context.Context, ps api.PackedSlab, rs api.RedundancySettings, contractSet string, lockPriority int, mem *acquiredMemory) error { + // create a context with sane timeout + ctx, cancel := context.WithTimeout(ctx, defaultPackedSlabsUploadTimeout) + defer cancel() + + // fetch contracts + contracts, err := w.bus.ContractSetContracts(ctx, contractSet) + if err != nil { + return fmt.Errorf("couldn't fetch packed slabs from bus: %v", err) + } + + // fetch upload params + up, err := w.bus.UploadParams(ctx) + if err != nil { + return fmt.Errorf("couldn't fetch upload params from bus: %v", err) + } + + // attach gouging checker to the context + ctx = WithGougingChecker(ctx, w.bus, up.GougingParams) + + // upload packed slab + err = w.uploadManager.UploadPackedSlab(ctx, rs, ps, contracts, up.CurrentHeight, lockPriority, mem) + if err != nil { + return fmt.Errorf("couldn't upload packed slab, err: %v", err) + } + + return nil +} + +func newUploadManager(b Bus, hp hostProvider, rl revisionLocker, mm memoryManager, maxOverdrive uint64, overdriveTimeout time.Duration, shutdownCtx context.Context, logger *zap.SugaredLogger) *uploadManager { + return &uploadManager{ + b: b, + hp: hp, + rl: rl, + logger: logger, + mm: mm, + + maxOverdrive: maxOverdrive, + overdriveTimeout: overdriveTimeout, + + statsOverdrivePct: stats.NoDecay(), + statsSlabUploadSpeedBytesPerMS: stats.NoDecay(), + + shutdownCtx: shutdownCtx, + + uploaders: make([]*uploader, 0), + } +} + +func (mgr *uploadManager) newUploader(b Bus, hp hostProvider, c api.ContractMetadata, bh uint64) *uploader { + return &uploader{ + b: b, + + // static + hk: c.HostKey, + siamuxAddr: c.SiamuxAddr, + shutdownCtx: mgr.shutdownCtx, + signalNewUpload: make(chan struct{}, 1), + + // stats + statsSectorUploadEstimateInMS: stats.Default(), + statsSectorUploadSpeedBytesPerMS: stats.NoDecay(), + + // covered by mutex + host: hp.newHostV3(c.ID, c.HostKey, c.SiamuxAddr), + bh: bh, + fcid: c.ID, + endHeight: c.WindowEnd, + queue: make([]*sectorUploadReq, 0), + } +} + +func (mgr *uploadManager) MigrateShards(ctx context.Context, s *object.Slab, shardIndices []int, shards [][]byte, contractSet string, contracts []api.ContractMetadata, bh uint64, lockPriority int, mem *acquiredMemory) error { + // create the upload + upload, err := mgr.newUpload(ctx, len(shards), contracts, bh, lockPriority) + if err != nil { + return err + } + + // track the upload in the bus + if err := mgr.b.TrackUpload(ctx, upload.id); err != nil { + return fmt.Errorf("failed to track upload '%v', err: %w", upload.id, err) + } + + // defer a function that finishes the upload + defer func() { + ctx, cancel := context.WithTimeout(mgr.shutdownCtx, time.Minute) + if err := mgr.b.FinishUpload(ctx, upload.id); err != nil { + mgr.logger.Errorf("failed to mark upload %v as finished: %v", upload.id, err) + } + cancel() + }() + + // upload the shards + uploaded, overdrivePct, overdriveSpeed, err := upload.uploadShards(ctx, shards, mgr.candidates(upload.allowed), mem, mgr.maxOverdrive, mgr.overdriveTimeout) + if err != nil { + return err + } + + // track stats + mgr.statsOverdrivePct.Track(overdrivePct) + mgr.statsSlabUploadSpeedBytesPerMS.Track(float64(overdriveSpeed)) + + // overwrite the shards with the newly uploaded ones + for i, si := range shardIndices { + s.Shards[si].LatestHost = uploaded[i].LatestHost + + knownContracts := make(map[types.FileContractID]struct{}) + for _, fcids := range s.Shards[si].Contracts { + for _, fcid := range fcids { + knownContracts[fcid] = struct{}{} + } + } + for hk, fcids := range uploaded[i].Contracts { + for _, fcid := range fcids { + if _, exists := knownContracts[fcid]; !exists { + if s.Shards[si].Contracts == nil { + s.Shards[si].Contracts = make(map[types.PublicKey][]types.FileContractID) + } + s.Shards[si].Contracts[hk] = append(s.Shards[si].Contracts[hk], fcid) + } + } + } + } + + // update the slab + return mgr.b.UpdateSlab(ctx, *s, contractSet) +} + +func (mgr *uploadManager) Stats() uploadManagerStats { + mgr.mu.Lock() + defer mgr.mu.Unlock() + + var numHealthy uint64 + speeds := make(map[types.PublicKey]float64) + for _, u := range mgr.uploaders { + u.tryRecomputeStats() + speeds[u.hk] = u.statsSectorUploadSpeedBytesPerMS.Average() * 0.008 + if u.healthy() { + numHealthy++ + } + } + + // prepare stats + return uploadManagerStats{ + avgSlabUploadSpeedMBPS: mgr.statsSlabUploadSpeedBytesPerMS.Average() * 0.008, // convert bytes per ms to mbps, + avgOverdrivePct: mgr.statsOverdrivePct.Average(), + healthyUploaders: numHealthy, + numUploaders: uint64(len(speeds)), + uploadSpeedsMBPS: speeds, + } +} + +func (mgr *uploadManager) Stop() { + mgr.mu.Lock() + defer mgr.mu.Unlock() + for _, u := range mgr.uploaders { + u.Stop() + } +} + +func (mgr *uploadManager) Upload(ctx context.Context, r io.Reader, contracts []api.ContractMetadata, up uploadParameters, lockPriority int) (bufferSizeLimitReached bool, eTag string, err error) { + // cancel all in-flight requests when the upload is done + ctx, cancel := context.WithCancel(ctx) + defer cancel() + + // add tracing + ctx, span := tracing.Tracer.Start(ctx, "upload") + defer func() { + span.RecordError(err) + span.End() + }() + + // create the object + o := object.NewObject(up.ec) + + // create the hash reader + hr := newHashReader(r) + + // create the cipher reader + cr, err := o.Encrypt(hr, up.encryptionOffset) + if err != nil { + return false, "", err + } + + // create the upload + upload, err := mgr.newUpload(ctx, up.rs.TotalShards, contracts, up.bh, lockPriority) + if err != nil { + return false, "", err + } + + // track the upload in the bus + if err := mgr.b.TrackUpload(ctx, upload.id); err != nil { + return false, "", fmt.Errorf("failed to track upload '%v', err: %w", upload.id, err) + } + + // defer a function that finishes the upload + defer func() { + ctx, cancel := context.WithTimeout(mgr.shutdownCtx, time.Minute) + if err := mgr.b.FinishUpload(ctx, upload.id); err != nil { + mgr.logger.Errorf("failed to mark upload %v as finished: %v", upload.id, err) + } + cancel() + }() + + // create the response channel + respChan := make(chan slabUploadResponse) + + // channel to notify main thread of the number of slabs to wait for + numSlabsChan := make(chan int, 1) + + // prepare slab size + size := int64(up.rs.MinShards) * rhpv2.SectorSize + redundantSize := uint64(up.rs.TotalShards) * rhpv2.SectorSize + var partialSlab []byte + + // launch uploads in a separate goroutine + go func() { + var slabIndex int + for { + select { + case <-mgr.shutdownCtx.Done(): + return // interrupted + case <-ctx.Done(): + return // interrupted + default: + } + // acquire memory + mem := mgr.mm.AcquireMemory(ctx, redundantSize) + if mem == nil { + return // interrupted + } + + // read next slab's data + data := make([]byte, size) + length, err := io.ReadFull(io.LimitReader(cr, size), data) + if err == io.EOF { + mem.Release() + + // no more data to upload, notify main thread of the number of + // slabs to wait for + numSlabs := slabIndex + if partialSlab != nil && slabIndex > 0 { + numSlabs-- // don't wait on partial slab + } + numSlabsChan <- numSlabs + return + } else if err != nil && err != io.ErrUnexpectedEOF { + mem.Release() + + // unexpected error, notify main thread + select { + case respChan <- slabUploadResponse{err: err}: + case <-ctx.Done(): + } + return + } else if up.packing && errors.Is(err, io.ErrUnexpectedEOF) { + mem.Release() + + // uploadPacking is true, we return the partial slab without + // uploading. + partialSlab = data[:length] + } else { + // regular upload + go func(rs api.RedundancySettings, data []byte, length, slabIndex int) { + upload.uploadSlab(ctx, rs, data, length, slabIndex, respChan, mgr.candidates(upload.allowed), mem, mgr.maxOverdrive, mgr.overdriveTimeout) + mem.Release() + }(up.rs, data, length, slabIndex) + } + slabIndex++ + } + }() + + // collect responses + var responses []slabUploadResponse + numSlabs := math.MaxInt32 + for len(responses) < numSlabs { + select { + case <-mgr.shutdownCtx.Done(): + return false, "", errUploadManagerStopped + case numSlabs = <-numSlabsChan: + case res := <-respChan: + if res.err != nil { + return false, "", res.err + } + responses = append(responses, res) + } + } + + // sort the slabs by index + sort.Slice(responses, func(i, j int) bool { + return responses[i].index < responses[j].index + }) + + // decorate the object with the slabs + for _, resp := range responses { + o.Slabs = append(o.Slabs, resp.slab) + } + + // calculate the eTag + eTag = hr.Hash() + + // add partial slabs + if len(partialSlab) > 0 { + var pss []object.SlabSlice + pss, bufferSizeLimitReached, err = mgr.b.AddPartialSlab(ctx, partialSlab, uint8(up.rs.MinShards), uint8(up.rs.TotalShards), up.contractSet) + if err != nil { + return false, "", err + } + o.Slabs = append(o.Slabs, pss...) + } + + if up.multipart { + // persist the part + err = mgr.b.AddMultipartPart(ctx, up.bucket, up.path, up.contractSet, eTag, up.uploadID, up.partNumber, o.Slabs) + if err != nil { + return bufferSizeLimitReached, "", fmt.Errorf("couldn't add multi part: %w", err) + } + } else { + // persist the object + err = mgr.b.AddObject(ctx, up.bucket, up.path, up.contractSet, o, api.AddObjectOptions{MimeType: up.mimeType, ETag: eTag}) + if err != nil { + return bufferSizeLimitReached, "", fmt.Errorf("couldn't add object: %w", err) + } + } + + return +} + +func (mgr *uploadManager) UploadPackedSlab(ctx context.Context, rs api.RedundancySettings, ps api.PackedSlab, contracts []api.ContractMetadata, bh uint64, lockPriority int, mem *acquiredMemory) error { + // build the shards + shards := encryptPartialSlab(ps.Data, ps.Key, uint8(rs.MinShards), uint8(rs.TotalShards)) + + // create the upload + upload, err := mgr.newUpload(ctx, len(shards), contracts, bh, lockPriority) + if err != nil { + return err + } + + // track the upload in the bus + if err := mgr.b.TrackUpload(ctx, upload.id); err != nil { + return fmt.Errorf("failed to track upload '%v', err: %w", upload.id, err) + } + + // defer a function that finishes the upload + defer func() { + ctx, cancel := context.WithTimeout(mgr.shutdownCtx, time.Minute) + if err := mgr.b.FinishUpload(ctx, upload.id); err != nil { + mgr.logger.Errorf("failed to mark upload %v as finished: %v", upload.id, err) + } + cancel() + }() + + // upload the shards + sectors, overdrivePct, overdriveSpeed, err := upload.uploadShards(ctx, shards, mgr.candidates(upload.allowed), mem, mgr.maxOverdrive, mgr.overdriveTimeout) + if err != nil { + return err + } + + // track stats + mgr.statsOverdrivePct.Track(overdrivePct) + mgr.statsSlabUploadSpeedBytesPerMS.Track(float64(overdriveSpeed)) + + // mark packed slab as uploaded + slab := api.UploadedPackedSlab{BufferID: ps.BufferID, Shards: sectors} + err = mgr.b.MarkPackedSlabsUploaded(ctx, []api.UploadedPackedSlab{slab}) + if err != nil { + return fmt.Errorf("couldn't mark packed slabs uploaded, err: %v", err) + } + + return nil +} - ec object.EncryptionKey - encryptionOffset uint64 +func (mgr *uploadManager) candidates(allowed map[types.PublicKey]struct{}) (candidates []*uploader) { + mgr.mu.Lock() + defer mgr.mu.Unlock() - rs api.RedundancySettings - bh uint64 - contractSet string - packing bool - mimeType string + for _, u := range mgr.uploaders { + if _, allowed := allowed[u.hk]; allowed { + candidates = append(candidates, u) + } + } + return } -func defaultParameters(bucket, path string) uploadParameters { - return uploadParameters{ - bucket: bucket, - path: path, +func (mgr *uploadManager) newUpload(ctx context.Context, totalShards int, contracts []api.ContractMetadata, bh uint64, lockPriority int) (*upload, error) { + mgr.mu.Lock() + defer mgr.mu.Unlock() - ec: object.GenerateEncryptionKey(), // random key - encryptionOffset: 0, // from the beginning + // refresh the uploaders + mgr.refreshUploaders(contracts, bh) - rs: build.DefaultRedundancySettings, + // check if we have enough contracts + if len(contracts) < totalShards { + return nil, fmt.Errorf("%v < %v: %w", len(contracts), totalShards, errNotEnoughContracts) } + + // create allowed map + allowed := make(map[types.PublicKey]struct{}) + for _, c := range contracts { + allowed[c.HostKey] = struct{}{} + } + + // create upload + return &upload{ + id: api.NewUploadID(), + allowed: allowed, + lockPriority: lockPriority, + shutdownCtx: mgr.shutdownCtx, + }, nil } -func multipartParameters(bucket, path, uploadID string, partNumber int) uploadParameters { - return uploadParameters{ - bucket: bucket, - path: path, +func (mgr *uploadManager) refreshUploaders(contracts []api.ContractMetadata, bh uint64) { + // build map of contracts + toKeep := make(map[types.FileContractID]api.ContractMetadata) + for _, c := range contracts { + toKeep[c.ID] = c + } + + // build map of renewed contracts + renewedTo := make(map[types.FileContractID]api.ContractMetadata) + for _, c := range contracts { + if c.RenewedFrom != (types.FileContractID{}) { + renewedTo[c.RenewedFrom] = c + } + } + + // keep list of uploaders uploaders + var uploaders []*uploader + for _, uploader := range mgr.uploaders { + fcid := uploader.contractID() + + renewal, renewed := renewedTo[fcid] + if _, keep := toKeep[fcid]; !(keep || renewed) { + uploader.Stop() + continue + } + delete(toKeep, fcid) // toKeep becomes missing - multipart: true, - uploadID: uploadID, - partNumber: partNumber, + if renewed { + uploader.renew(mgr.hp, renewal, bh) + } else { + uploader.updateBlockHeight(bh) + } + uploaders = append(uploaders, uploader) + } - ec: object.GenerateEncryptionKey(), // random key - encryptionOffset: 0, // from the beginning + for _, c := range toKeep { + uploader := mgr.newUploader(mgr.b, mgr.hp, c, bh) + uploaders = append(uploaders, uploader) + go uploader.Start(mgr.hp, mgr.rl) + } - rs: build.DefaultRedundancySettings, + mgr.uploaders = uploaders +} + +func (u *uploader) SignalWork() { + select { + case u.signalNewUpload <- struct{}{}: + default: } } -type UploadOption func(*uploadParameters) +func (u *uploader) Start(hp hostProvider, rl revisionLocker) { +outer: + for { + // wait for work + select { + case <-u.signalNewUpload: + case <-u.shutdownCtx.Done(): + return + } + + for { + // check if we are stopped + select { + case <-u.shutdownCtx.Done(): + return + default: + } -func WithBlockHeight(bh uint64) UploadOption { - return func(up *uploadParameters) { - up.bh = bh + // pop the next upload req + req := u.pop() + if req == nil { + continue outer + } + + // skip if upload is done + if req.done() { + continue + } + + // execute it + var root types.Hash256 + start := time.Now() + fcid := u.contractID() + err := rl.withRevision(req.sector.ctx, defaultRevisionFetchTimeout, fcid, u.hk, u.siamuxAddr, req.lockPriority, u.blockHeight(), func(rev types.FileContractRevision) error { + if rev.RevisionNumber == math.MaxUint64 { + return errMaxRevisionReached + } + + var err error + root, err = u.execute(req, rev) + return err + }) + + // the uploader's contract got renewed, requeue the request + if errors.Is(err, errMaxRevisionReached) { + u.enqueue(req) + continue outer + } + + // send the response + if err != nil { + req.fail(err) + } else { + req.succeed(root) + } + + // track the error, ignore gracefully closed streams and canceled overdrives + canceledOverdrive := req.done() && req.overdrive && err != nil + if !canceledOverdrive && !isClosedStream(err) { + u.trackSectorUpload(err, time.Since(start)) + } + } } } -func WithContractSet(contractSet string) UploadOption { - return func(up *uploadParameters) { - up.contractSet = contractSet +func (u *uploader) healthy() bool { + u.mu.Lock() + defer u.mu.Unlock() + return u.consecutiveFailures == 0 +} + +func (u *uploader) Stop() { + for { + upload := u.pop() + if upload == nil { + break + } + if !upload.done() { + upload.fail(errors.New("uploader stopped")) + } } } -func WithCustomKey(ec object.EncryptionKey) UploadOption { - return func(up *uploadParameters) { - up.ec = ec +func (u *uploader) blockHeight() uint64 { + u.mu.Lock() + defer u.mu.Unlock() + return u.bh +} + +func (u *uploader) contractID() types.FileContractID { + u.mu.Lock() + defer u.mu.Unlock() + return u.fcid +} + +func (u *uploader) enqueue(req *sectorUploadReq) { + // trace the request + span := trace.SpanFromContext(req.sector.ctx) + span.SetAttributes(attribute.Stringer("hk", u.hk)) + span.AddEvent("enqueued") + + // decorate the request + req.fcid = u.contractID() + req.hk = u.hk + + // enqueue the request + u.mu.Lock() + u.queue = append(u.queue, req) + u.mu.Unlock() + + // signal there's work + u.SignalWork() +} + +func (u *uploader) estimate() float64 { + u.mu.Lock() + defer u.mu.Unlock() + + // fetch estimated duration per sector + estimateP90 := u.statsSectorUploadEstimateInMS.P90() + if estimateP90 == 0 { + estimateP90 = 1 } + + // calculate estimated time + numSectors := float64(len(u.queue) + 1) + return numSectors * estimateP90 } -func WithCustomEncryptionOffset(offset uint64) UploadOption { - return func(up *uploadParameters) { - up.encryptionOffset = offset +func (u *uploader) execute(req *sectorUploadReq, rev types.FileContractRevision) (types.Hash256, error) { + u.mu.Lock() + host := u.host + fcid := u.fcid + u.mu.Unlock() + + // fetch span from context + span := trace.SpanFromContext(req.sector.ctx) + span.AddEvent("execute") + + // update the bus + if err := u.b.AddUploadingSector(req.sector.ctx, req.uploadID, fcid, req.sector.root); err != nil { + return types.Hash256{}, fmt.Errorf("failed to add uploading sector to contract %v, err: %v", fcid, err) } + + // upload the sector + start := time.Now() + root, err := host.UploadSector(req.sector.ctx, req.sector.data, rev) + if err != nil { + return types.Hash256{}, err + } + + // update span + elapsed := time.Since(start) + span.SetAttributes(attribute.Int64("duration", elapsed.Milliseconds())) + span.RecordError(err) + span.End() + + return root, nil } -func WithMimeType(mimeType string) UploadOption { - return func(up *uploadParameters) { - up.mimeType = mimeType +func (u *uploader) pop() *sectorUploadReq { + u.mu.Lock() + defer u.mu.Unlock() + + if len(u.queue) > 0 { + j := u.queue[0] + u.queue[0] = nil + u.queue = u.queue[1:] + return j } + return nil +} + +func (u *uploader) renew(hp hostProvider, c api.ContractMetadata, bh uint64) { + u.mu.Lock() + defer u.mu.Unlock() + + u.bh = bh + u.host = hp.newHostV3(c.ID, c.HostKey, c.SiamuxAddr) + u.fcid = c.ID + u.siamuxAddr = c.SiamuxAddr + u.endHeight = c.WindowEnd } -func WithPacking(packing bool) UploadOption { - return func(up *uploadParameters) { - up.packing = packing +func (u *uploader) trackSectorUpload(err error, d time.Duration) { + u.mu.Lock() + defer u.mu.Unlock() + if err != nil { + u.consecutiveFailures++ + u.statsSectorUploadEstimateInMS.Track(float64(time.Hour.Milliseconds())) + } else { + ms := d.Milliseconds() + u.consecutiveFailures = 0 + u.statsSectorUploadEstimateInMS.Track(float64(ms)) // duration in ms + u.statsSectorUploadSpeedBytesPerMS.Track(float64(rhpv2.SectorSize / ms)) // bytes per ms } } -func WithRedundancySettings(rs api.RedundancySettings) UploadOption { - return func(up *uploadParameters) { - up.rs = rs +func (u *uploader) tryRecomputeStats() { + u.mu.Lock() + defer u.mu.Unlock() + if time.Since(u.lastRecompute) < statsRecomputeMinInterval { + return } + + u.lastRecompute = time.Now() + u.statsSectorUploadEstimateInMS.Recompute() + u.statsSectorUploadSpeedBytesPerMS.Recompute() } -func newMimeReader(r io.Reader) (mimeType string, recycled io.Reader, err error) { - buf := bytes.NewBuffer(nil) - mtype, err := mimetype.DetectReader(io.TeeReader(r, buf)) - recycled = io.MultiReader(buf, r) - return mtype.String(), recycled, err +func (u *uploader) updateBlockHeight(bh uint64) { + u.mu.Lock() + defer u.mu.Unlock() + u.bh = bh } -type hashReader struct { - r io.Reader - h *types.Hasher +func (u *upload) newSlabUpload(ctx context.Context, shards [][]byte, candidates []*uploader, mem *acquiredMemory, maxOverdrive uint64, overdriveTimeout time.Duration) (*slabUpload, []*sectorUploadReq, chan sectorUploadResp) { + // create slab upload + slab := &slabUpload{ + lockPriority: u.lockPriority, + uploadID: u.id, + created: time.Now(), + maxOverdrive: maxOverdrive, + mem: mem, + overdriveTimeout: overdriveTimeout, + + candidates: candidates, + shards: shards, + + overdriving: make(map[int]map[types.PublicKey]struct{}), + sectors: make(map[int]*sectorUpload, len(shards)), + used: make(map[types.PublicKey]struct{}), + errs: make(HostErrorSet), + } + + // prepare sector uploads + responseChan := make(chan sectorUploadResp) + requests := make([]*sectorUploadReq, len(shards)) + for sI, shard := range shards { + // create the ctx + sCtx, sCancel := context.WithCancel(ctx) + + // attach the upload's span + sCtx, span := tracing.Tracer.Start(sCtx, "uploadSector") + span.SetAttributes(attribute.Bool("overdrive", false)) + span.SetAttributes(attribute.Int("sector", sI)) + + // create the sector + sector := §orUpload{ + data: (*[rhpv2.SectorSize]byte)(shard), + index: sI, + root: rhpv2.SectorRoot((*[rhpv2.SectorSize]byte)(shard)), + + ctx: sCtx, + cancel: sCancel, + } + slab.sectors[sI] = sector + + // create the request + requests[sI] = §orUploadReq{ + lockPriority: slab.lockPriority, + overdrive: false, + responseChan: responseChan, + sector: sector, + uploadID: slab.uploadID, + } + } + + return slab, requests, responseChan } -func newHashReader(r io.Reader) *hashReader { - return &hashReader{ - r: r, - h: types.NewHasher(), +func (u *upload) uploadSlab(ctx context.Context, rs api.RedundancySettings, data []byte, length, index int, respChan chan slabUploadResponse, candidates []*uploader, mem *acquiredMemory, maxOverdrive uint64, overdriveTimeout time.Duration) (overdrivePct float64, overdriveSpeed int64) { + // add tracing + ctx, span := tracing.Tracer.Start(ctx, "uploadSlab") + defer span.End() + + // create the response + resp := slabUploadResponse{ + slab: object.SlabSlice{ + Slab: object.NewSlab(uint8(rs.MinShards)), + Offset: 0, + Length: uint32(length), + }, + index: index, + } + + // create the shards + shards := make([][]byte, rs.TotalShards) + resp.slab.Slab.Encode(data, shards) + resp.slab.Slab.Encrypt(shards) + + // upload the shards + resp.slab.Slab.Shards, overdrivePct, overdriveSpeed, resp.err = u.uploadShards(ctx, shards, candidates, mem, maxOverdrive, overdriveTimeout) + + // send the response + select { + case <-ctx.Done(): + case respChan <- resp: } + + return } -func (e *hashReader) Read(p []byte) (int, error) { - n, err := e.r.Read(p) - if _, wErr := e.h.E.Write(p[:n]); wErr != nil { - return 0, wErr +func (u *upload) uploadShards(ctx context.Context, shards [][]byte, candidates []*uploader, mem *acquiredMemory, maxOverdrive uint64, overdriveTimeout time.Duration) ([]object.Sector, float64, int64, error) { + // add tracing + ctx, span := tracing.Tracer.Start(ctx, "uploadShards") + defer span.End() + + // ensure inflight uploads get cancelled + ctx, cancel := context.WithCancel(ctx) + defer cancel() + + // prepare the upload + slab, requests, respChan := u.newSlabUpload(ctx, shards, candidates, mem, maxOverdrive, overdriveTimeout) + + // launch all shard uploads + for _, upload := range requests { + if _, err := slab.launch(upload); err != nil { + return nil, 0, 0, err + } } - return n, err + + // launch overdrive + resetOverdrive := slab.overdrive(ctx, respChan) + + // collect responses + var done bool + for slab.inflight() > 0 && !done { + var resp sectorUploadResp + select { + case <-u.shutdownCtx.Done(): + return nil, 0, 0, errors.New("upload stopped") + case <-ctx.Done(): + return nil, 0, 0, ctx.Err() + case resp = <-respChan: + } + + resetOverdrive() + + // receive the response + done = slab.receive(resp) + + // relaunch non-overdrive uploads + if !done && resp.err != nil && !resp.req.overdrive { + if overdriving, err := slab.launch(resp.req); err != nil { + if !overdriving { + break // fail the upload + } + } + } + } + + // register the amount of overdrive sectors + span.SetAttributes(attribute.Int("overdrive", slab.overdriveCnt())) + + sectors, err := slab.finish() + return sectors, slab.overdrivePct(), slab.uploadSpeed(), err } -func (e *hashReader) Hash() string { - sum := e.h.Sum() - return hex.EncodeToString(sum[:]) +func (s *slabUpload) uploadSpeed() int64 { + s.mu.Lock() + defer s.mu.Unlock() + bytes := s.numUploaded * rhpv2.SectorSize + ms := time.Since(s.created).Milliseconds() + return int64(bytes) / ms +} + +func (s *slabUpload) finish() (sectors []object.Sector, _ error) { + s.mu.Lock() + defer s.mu.Unlock() + + if s.numUploaded < uint64(len(s.shards)) { + remaining := uint64(len(s.shards)) - s.numUploaded + return nil, fmt.Errorf("failed to upload slab: launched=%d uploaded=%d remaining=%d inflight=%d uploaders=%d errors=%d %w", s.numLaunched, s.numUploaded, remaining, s.numInflight, len(s.candidates), len(s.errs), s.errs) + } + + for i := 0; i < len(s.shards); i++ { + sectors = append(sectors, s.sectors[i].uploaded) + } + return +} + +func (s *slabUpload) inflight() uint64 { + s.mu.Lock() + defer s.mu.Unlock() + return s.numInflight +} + +func (s *slabUpload) launch(req *sectorUploadReq) (interrupt bool, err error) { + s.mu.Lock() + defer s.mu.Unlock() + + // nothing to do + if req == nil { + return false, nil + } + + // find candidate candidate + var candidate *uploader + for _, uploader := range s.candidates { + if _, used := s.used[uploader.hk]; used { + continue + } + if candidate == nil || uploader.estimate() < candidate.estimate() { + candidate = uploader + } + } + + // no candidate found + if candidate == nil { + err = errNoCandidateUploader + interrupt = !req.overdrive && len(s.overdriving[req.sector.index]) == 0 + span := trace.SpanFromContext(req.sector.ctx) + span.RecordError(err) + span.End() + return + } + + // enqueue the req + candidate.enqueue(req) + + // update the state + s.numInflight++ + s.numLaunched++ + s.used[req.hk] = struct{}{} + + if req.overdrive { + s.lastOverdrive = time.Now() + s.numOverdriving++ + + if _, exists := s.overdriving[req.sector.index]; !exists { + s.overdriving[req.sector.index] = make(map[types.PublicKey]struct{}) + } + s.overdriving[req.sector.index][req.hk] = struct{}{} + } + return +} + +func (s *slabUpload) overdrive(ctx context.Context, respChan chan sectorUploadResp) (resetTimer func()) { + // overdrive is disabled + if s.overdriveTimeout == 0 { + return func() {} + } + + // create a timer to trigger overdrive + timer := time.NewTimer(s.overdriveTimeout) + resetTimer = func() { + timer.Stop() + select { + case <-timer.C: + default: + } + timer.Reset(s.overdriveTimeout) + } + + // create a function to check whether overdrive is possible + canOverdrive := func() bool { + s.mu.Lock() + defer s.mu.Unlock() + + // overdrive is not kicking in yet + remaining := uint64(len(s.shards)) - s.numUploaded + if remaining >= s.maxOverdrive { + return false + } + + // overdrive is not due yet + if time.Since(s.lastOverdrive) < s.overdriveTimeout { + return false + } + + // overdrive is maxed out + if s.numInflight-remaining >= s.maxOverdrive { + return false + } + + return true + } + + // try overdriving every time the timer fires + go func() { + for { + select { + case <-ctx.Done(): + return + case <-timer.C: + if canOverdrive() { + _, _ = s.launch(s.nextRequest(respChan)) // ignore result + } + resetTimer() + } + } + }() + + return +} + +func (s *slabUpload) nextRequest(responseChan chan sectorUploadResp) *sectorUploadReq { + s.mu.Lock() + defer s.mu.Unlock() + + // find the sector that's not finished and has the least amount of overdrives + lowestNumOverdrives := math.MaxInt + var nextSector *sectorUpload + for _, sector := range s.sectors { + if !sector.isUploaded() && len(s.overdriving[sector.index]) < lowestNumOverdrives { + nextSector = sector + } + } + if nextSector == nil { + return nil + } + + return §orUploadReq{ + lockPriority: s.lockPriority, + overdrive: true, + responseChan: responseChan, + sector: nextSector, + uploadID: s.uploadID, + } +} + +func (s *slabUpload) overdriveCnt() int { + s.mu.Lock() + defer s.mu.Unlock() + return int(s.numLaunched) - len(s.sectors) +} + +func (s *slabUpload) overdrivePct() float64 { + s.mu.Lock() + defer s.mu.Unlock() + + numOverdrive := int(s.numLaunched) - len(s.sectors) + if numOverdrive <= 0 { + return 0 + } + + return float64(numOverdrive) / float64(len(s.sectors)) +} + +func (s *slabUpload) receive(resp sectorUploadResp) bool { + s.mu.Lock() + defer s.mu.Unlock() + + // convenience variable + req := resp.req + sector := req.sector + + // update the state + if req.overdrive { + s.numOverdriving-- + } + s.numInflight-- + + // failed reqs can't complete the upload + if resp.err != nil { + s.errs[req.hk] = resp.err + return false + } + + // redundant sectors can't complete the upload + if sector.uploaded.Root != (types.Hash256{}) { + return false + } + + // store the sector + sector.uploaded = object.Sector{ + Contracts: map[types.PublicKey][]types.FileContractID{req.hk: {req.fcid}}, + LatestHost: req.hk, + Root: resp.root, + } + + // update uploaded sectors + s.numUploaded++ + + // cancel the sector context + sector.cancel() + + // free hosts we're using to overdrive this sector + for hk := range s.overdriving[req.sector.index] { + delete(s.used, hk) + } + + // release memory + sector.data = nil + s.shards[sector.index] = nil + s.mem.ReleaseSome(rhpv2.SectorSize) + + return s.numUploaded == uint64(len(s.shards)) +} + +func (req *sectorUploadReq) done() bool { + select { + case <-req.sector.ctx.Done(): + return true + default: + return false + } +} + +func (s *sectorUpload) isUploaded() bool { + return s.uploaded.Root != (types.Hash256{}) +} + +func (req *sectorUploadReq) fail(err error) { + select { + case <-req.sector.ctx.Done(): + case req.responseChan <- sectorUploadResp{ + req: req, + err: err, + }: + } +} + +func (req *sectorUploadReq) succeed(root types.Hash256) { + select { + case <-req.sector.ctx.Done(): + case req.responseChan <- sectorUploadResp{ + req: req, + root: root, + }: + } } diff --git a/worker/upload_manager.go b/worker/upload_manager.go deleted file mode 100644 index f5c5fc576..000000000 --- a/worker/upload_manager.go +++ /dev/null @@ -1,1383 +0,0 @@ -package worker - -import ( - "context" - "errors" - "fmt" - "io" - "math" - "mime" - "path/filepath" - "sort" - "sync" - "time" - - "go.opentelemetry.io/otel/attribute" - "go.opentelemetry.io/otel/trace" - rhpv2 "go.sia.tech/core/rhp/v2" - "go.sia.tech/core/types" - "go.sia.tech/renterd/api" - "go.sia.tech/renterd/object" - "go.sia.tech/renterd/stats" - "go.sia.tech/renterd/tracing" - "go.uber.org/zap" -) - -const ( - statsRecomputeMinInterval = 3 * time.Second - - defaultPackedSlabsLockDuration = 10 * time.Minute - defaultPackedSlabsUploadTimeout = 10 * time.Minute -) - -var ( - errNoCandidateUploader = errors.New("no candidate uploader found") - errNotEnoughContracts = errors.New("not enough contracts to support requested redundancy") - errUploadManagerStopped = errors.New("upload manager stopped") -) - -type ( - uploadManager struct { - b Bus - hp hostProvider - rl revisionLocker - mm memoryManager - logger *zap.SugaredLogger - shutdownCtx context.Context - - maxOverdrive uint64 - overdriveTimeout time.Duration - - statsOverdrivePct *stats.DataPoints - statsSlabUploadSpeedBytesPerMS *stats.DataPoints - - mu sync.Mutex - uploaders []*uploader - } - - uploader struct { - b Bus - - hk types.PublicKey - siamuxAddr string - signalNewUpload chan struct{} - shutdownCtx context.Context - - mu sync.Mutex - bh uint64 - endHeight uint64 - fcid types.FileContractID - host hostV3 - queue []*sectorUploadReq - - // stats related field - consecutiveFailures uint64 - lastRecompute time.Time - - statsSectorUploadEstimateInMS *stats.DataPoints - statsSectorUploadSpeedBytesPerMS *stats.DataPoints - } - - uploadManagerStats struct { - avgSlabUploadSpeedMBPS float64 - avgOverdrivePct float64 - healthyUploaders uint64 - numUploaders uint64 - uploadSpeedsMBPS map[types.PublicKey]float64 - } - - upload struct { - id api.UploadID - allowed map[types.PublicKey]struct{} - lockPriority int - shutdownCtx context.Context - } - - slabUpload struct { - uploadID api.UploadID - created time.Time - lockPriority int - maxOverdrive uint64 - mem *acquiredMemory - overdriveTimeout time.Duration - - candidates []*uploader - shards [][]byte - - mu sync.Mutex - numInflight uint64 - numLaunched uint64 - numUploaded uint64 - - overdriving map[int]map[types.PublicKey]struct{} - lastOverdrive time.Time - numOverdriving uint64 - - sectors map[int]*sectorUpload - used map[types.PublicKey]struct{} - errs HostErrorSet - } - - slabUploadResponse struct { - slab object.SlabSlice - index int - err error - } - - sectorUpload struct { - data *[rhpv2.SectorSize]byte - index int - root types.Hash256 - uploaded object.Sector - - ctx context.Context - cancel context.CancelFunc - } - - sectorUploadReq struct { - lockPriority int - overdrive bool - responseChan chan sectorUploadResp - sector *sectorUpload - uploadID api.UploadID - - // set by the uploader performing the upload - fcid types.FileContractID - hk types.PublicKey - } - - sectorUploadResp struct { - req *sectorUploadReq - root types.Hash256 - err error - } -) - -func (w *worker) initUploadManager(mm memoryManager, maxOverdrive uint64, overdriveTimeout time.Duration, logger *zap.SugaredLogger) { - if w.uploadManager != nil { - panic("upload manager already initialized") // developer error - } - - w.uploadManager = newUploadManager(w.bus, w, w, mm, maxOverdrive, overdriveTimeout, w.shutdownCtx, logger) -} - -func (w *worker) upload(ctx context.Context, r io.Reader, contracts []api.ContractMetadata, up uploadParameters, opts ...UploadOption) (_ string, err error) { - // apply the options - for _, opt := range opts { - opt(&up) - } - - // if not given, try decide on a mime type using the file extension - if !up.multipart && up.mimeType == "" { - up.mimeType = mime.TypeByExtension(filepath.Ext(up.path)) - - // if mime type is still not known, wrap the reader with a mime reader - if up.mimeType == "" { - up.mimeType, r, err = newMimeReader(r) - if err != nil { - return - } - } - } - - // perform the upload - bufferSizeLimitReached, eTag, err := w.uploadManager.Upload(ctx, r, contracts, up, lockingPriorityUpload) - if err != nil { - return "", err - } - - // if packing was enabled try uploading packed slabs - if up.packing { - if err := w.tryUploadPackedSlabs(ctx, up.rs, up.contractSet, bufferSizeLimitReached); err != nil { - w.logger.Errorf("couldn't upload packed slabs, err: %v", err) - } - } - return eTag, nil -} - -func (w *worker) threadedUploadPackedSlabs(rs api.RedundancySettings, contractSet string, lockPriority int) { - key := fmt.Sprintf("%d-%d_%s", rs.MinShards, rs.TotalShards, contractSet) - - w.uploadsMu.Lock() - if w.uploadingPackedSlabs[key] { - w.uploadsMu.Unlock() - return - } - w.uploadingPackedSlabs[key] = true - w.uploadsMu.Unlock() - - // make sure we mark uploading packed slabs as false when we're done - defer func() { - w.uploadsMu.Lock() - w.uploadingPackedSlabs[key] = false - w.uploadsMu.Unlock() - }() - - // keep uploading packed slabs until we're done - ctx := context.WithValue(w.shutdownCtx, keyInteractionRecorder, w) - for { - uploaded, err := w.uploadPackedSlabs(ctx, defaultPackedSlabsLockDuration, rs, contractSet, lockPriority) - if err != nil { - w.logger.Errorf("couldn't upload packed slabs, err: %v", err) - return - } else if uploaded == 0 { - return - } - } -} - -func (w *worker) tryUploadPackedSlabs(ctx context.Context, rs api.RedundancySettings, contractSet string, block bool) (err error) { - // if we want to block, try and upload one packed slab synchronously, we use - // a slightly higher upload priority to avoid reaching the context deadline - if block { - _, err = w.uploadPackedSlabs(ctx, defaultPackedSlabsLockDuration, rs, contractSet, lockingPriorityBlockedUpload) - } - - // make sure there's a goroutine uploading the remainder of the packed slabs - go w.threadedUploadPackedSlabs(rs, contractSet, lockingPriorityBackgroundUpload) - return -} - -func (w *worker) uploadPackedSlabs(ctx context.Context, lockingDuration time.Duration, rs api.RedundancySettings, contractSet string, lockPriority int) (uploaded int, err error) { - // upload packed slabs - var mu sync.Mutex - var errs error - - var wg sync.WaitGroup - totalSize := uint64(rs.TotalShards) * rhpv2.SectorSize - - // derive a context that we can use as an interrupt in case of an error. - interruptCtx, cancel := context.WithCancel(ctx) - defer cancel() - - for { - // block until we have memory for a slab or until we are interrupted - mem := w.uploadManager.mm.AcquireMemory(interruptCtx, totalSize) - if mem == nil { - break // interrupted - } - - // fetch packed slabs to upload - var packedSlabs []api.PackedSlab - packedSlabs, err = w.bus.PackedSlabsForUpload(ctx, lockingDuration, uint8(rs.MinShards), uint8(rs.TotalShards), contractSet, 1) - if err != nil { - err = fmt.Errorf("couldn't fetch packed slabs from bus: %v", err) - mem.Release() - break - } else if len(packedSlabs) == 0 { - mem.Release() - break // no more slabs - } - ps := packedSlabs[0] - - // launch upload for slab - wg.Add(1) - go func(ps api.PackedSlab) { - defer mem.Release() - defer wg.Done() - err := w.uploadPackedSlab(ctx, ps, rs, contractSet, lockPriority, mem) - mu.Lock() - if err != nil { - errs = errors.Join(errs, err) - cancel() // prevent new uploads from being launched - } else { - uploaded++ - } - mu.Unlock() - }(ps) - } - - // wait for all threads to finish - wg.Wait() - - // return collected errors - err = errors.Join(err, errs) - return -} - -func (w *worker) uploadPackedSlab(ctx context.Context, ps api.PackedSlab, rs api.RedundancySettings, contractSet string, lockPriority int, mem *acquiredMemory) error { - // create a context with sane timeout - ctx, cancel := context.WithTimeout(ctx, defaultPackedSlabsUploadTimeout) - defer cancel() - - // fetch contracts - contracts, err := w.bus.ContractSetContracts(ctx, contractSet) - if err != nil { - return fmt.Errorf("couldn't fetch packed slabs from bus: %v", err) - } - - // fetch upload params - up, err := w.bus.UploadParams(ctx) - if err != nil { - return fmt.Errorf("couldn't fetch upload params from bus: %v", err) - } - - // attach gouging checker to the context - ctx = WithGougingChecker(ctx, w.bus, up.GougingParams) - - // upload packed slab - err = w.uploadManager.UploadPackedSlab(ctx, rs, ps, contracts, up.CurrentHeight, lockPriority, mem) - if err != nil { - return fmt.Errorf("couldn't upload packed slab, err: %v", err) - } - - return nil -} - -func newUploadManager(b Bus, hp hostProvider, rl revisionLocker, mm memoryManager, maxOverdrive uint64, overdriveTimeout time.Duration, shutdownCtx context.Context, logger *zap.SugaredLogger) *uploadManager { - return &uploadManager{ - b: b, - hp: hp, - rl: rl, - logger: logger, - mm: mm, - - maxOverdrive: maxOverdrive, - overdriveTimeout: overdriveTimeout, - - statsOverdrivePct: stats.NoDecay(), - statsSlabUploadSpeedBytesPerMS: stats.NoDecay(), - - shutdownCtx: shutdownCtx, - - uploaders: make([]*uploader, 0), - } -} - -func (mgr *uploadManager) newUploader(b Bus, hp hostProvider, c api.ContractMetadata, bh uint64) *uploader { - return &uploader{ - b: b, - - // static - hk: c.HostKey, - siamuxAddr: c.SiamuxAddr, - shutdownCtx: mgr.shutdownCtx, - signalNewUpload: make(chan struct{}, 1), - - // stats - statsSectorUploadEstimateInMS: stats.Default(), - statsSectorUploadSpeedBytesPerMS: stats.NoDecay(), - - // covered by mutex - host: hp.newHostV3(c.ID, c.HostKey, c.SiamuxAddr), - bh: bh, - fcid: c.ID, - endHeight: c.WindowEnd, - queue: make([]*sectorUploadReq, 0), - } -} - -func (mgr *uploadManager) MigrateShards(ctx context.Context, s *object.Slab, shardIndices []int, shards [][]byte, contractSet string, contracts []api.ContractMetadata, bh uint64, lockPriority int, mem *acquiredMemory) error { - // create the upload - upload, err := mgr.newUpload(ctx, len(shards), contracts, bh, lockPriority) - if err != nil { - return err - } - - // track the upload in the bus - if err := mgr.b.TrackUpload(ctx, upload.id); err != nil { - return fmt.Errorf("failed to track upload '%v', err: %w", upload.id, err) - } - - // defer a function that finishes the upload - defer func() { - ctx, cancel := context.WithTimeout(mgr.shutdownCtx, time.Minute) - if err := mgr.b.FinishUpload(ctx, upload.id); err != nil { - mgr.logger.Errorf("failed to mark upload %v as finished: %v", upload.id, err) - } - cancel() - }() - - // upload the shards - uploaded, overdrivePct, overdriveSpeed, err := upload.uploadShards(ctx, shards, mgr.candidates(upload.allowed), mem, mgr.maxOverdrive, mgr.overdriveTimeout) - if err != nil { - return err - } - - // track stats - mgr.statsOverdrivePct.Track(overdrivePct) - mgr.statsSlabUploadSpeedBytesPerMS.Track(float64(overdriveSpeed)) - - // overwrite the shards with the newly uploaded ones - for i, si := range shardIndices { - s.Shards[si].LatestHost = uploaded[i].LatestHost - - knownContracts := make(map[types.FileContractID]struct{}) - for _, fcids := range s.Shards[si].Contracts { - for _, fcid := range fcids { - knownContracts[fcid] = struct{}{} - } - } - for hk, fcids := range uploaded[i].Contracts { - for _, fcid := range fcids { - if _, exists := knownContracts[fcid]; !exists { - if s.Shards[si].Contracts == nil { - s.Shards[si].Contracts = make(map[types.PublicKey][]types.FileContractID) - } - s.Shards[si].Contracts[hk] = append(s.Shards[si].Contracts[hk], fcid) - } - } - } - } - - // update the slab - return mgr.b.UpdateSlab(ctx, *s, contractSet) -} - -func (mgr *uploadManager) Stats() uploadManagerStats { - mgr.mu.Lock() - defer mgr.mu.Unlock() - - var numHealthy uint64 - speeds := make(map[types.PublicKey]float64) - for _, u := range mgr.uploaders { - u.tryRecomputeStats() - speeds[u.hk] = u.statsSectorUploadSpeedBytesPerMS.Average() * 0.008 - if u.healthy() { - numHealthy++ - } - } - - // prepare stats - return uploadManagerStats{ - avgSlabUploadSpeedMBPS: mgr.statsSlabUploadSpeedBytesPerMS.Average() * 0.008, // convert bytes per ms to mbps, - avgOverdrivePct: mgr.statsOverdrivePct.Average(), - healthyUploaders: numHealthy, - numUploaders: uint64(len(speeds)), - uploadSpeedsMBPS: speeds, - } -} - -func (mgr *uploadManager) Stop() { - mgr.mu.Lock() - defer mgr.mu.Unlock() - for _, u := range mgr.uploaders { - u.Stop() - } -} - -func (mgr *uploadManager) Upload(ctx context.Context, r io.Reader, contracts []api.ContractMetadata, up uploadParameters, lockPriority int) (bufferSizeLimitReached bool, eTag string, err error) { - // cancel all in-flight requests when the upload is done - ctx, cancel := context.WithCancel(ctx) - defer cancel() - - // add tracing - ctx, span := tracing.Tracer.Start(ctx, "upload") - defer func() { - span.RecordError(err) - span.End() - }() - - // create the object - o := object.NewObject(up.ec) - - // create the hash reader - hr := newHashReader(r) - - // create the cipher reader - cr, err := o.Encrypt(hr, up.encryptionOffset) - if err != nil { - return false, "", err - } - - // create the upload - upload, err := mgr.newUpload(ctx, up.rs.TotalShards, contracts, up.bh, lockPriority) - if err != nil { - return false, "", err - } - - // track the upload in the bus - if err := mgr.b.TrackUpload(ctx, upload.id); err != nil { - return false, "", fmt.Errorf("failed to track upload '%v', err: %w", upload.id, err) - } - - // defer a function that finishes the upload - defer func() { - ctx, cancel := context.WithTimeout(mgr.shutdownCtx, time.Minute) - if err := mgr.b.FinishUpload(ctx, upload.id); err != nil { - mgr.logger.Errorf("failed to mark upload %v as finished: %v", upload.id, err) - } - cancel() - }() - - // create the response channel - respChan := make(chan slabUploadResponse) - - // channel to notify main thread of the number of slabs to wait for - numSlabsChan := make(chan int, 1) - - // prepare slab size - size := int64(up.rs.MinShards) * rhpv2.SectorSize - redundantSize := uint64(up.rs.TotalShards) * rhpv2.SectorSize - var partialSlab []byte - - // launch uploads in a separate goroutine - go func() { - var slabIndex int - for { - select { - case <-mgr.shutdownCtx.Done(): - return // interrupted - case <-ctx.Done(): - return // interrupted - default: - } - // acquire memory - mem := mgr.mm.AcquireMemory(ctx, redundantSize) - if mem == nil { - return // interrupted - } - - // read next slab's data - data := make([]byte, size) - length, err := io.ReadFull(io.LimitReader(cr, size), data) - if err == io.EOF { - mem.Release() - - // no more data to upload, notify main thread of the number of - // slabs to wait for - numSlabs := slabIndex - if partialSlab != nil && slabIndex > 0 { - numSlabs-- // don't wait on partial slab - } - numSlabsChan <- numSlabs - return - } else if err != nil && err != io.ErrUnexpectedEOF { - mem.Release() - - // unexpected error, notify main thread - select { - case respChan <- slabUploadResponse{err: err}: - case <-ctx.Done(): - } - return - } else if up.packing && errors.Is(err, io.ErrUnexpectedEOF) { - mem.Release() - - // uploadPacking is true, we return the partial slab without - // uploading. - partialSlab = data[:length] - } else { - // regular upload - go func(rs api.RedundancySettings, data []byte, length, slabIndex int) { - upload.uploadSlab(ctx, rs, data, length, slabIndex, respChan, mgr.candidates(upload.allowed), mem, mgr.maxOverdrive, mgr.overdriveTimeout) - mem.Release() - }(up.rs, data, length, slabIndex) - } - slabIndex++ - } - }() - - // collect responses - var responses []slabUploadResponse - numSlabs := math.MaxInt32 - for len(responses) < numSlabs { - select { - case <-mgr.shutdownCtx.Done(): - return false, "", errUploadManagerStopped - case numSlabs = <-numSlabsChan: - case res := <-respChan: - if res.err != nil { - return false, "", res.err - } - responses = append(responses, res) - } - } - - // sort the slabs by index - sort.Slice(responses, func(i, j int) bool { - return responses[i].index < responses[j].index - }) - - // decorate the object with the slabs - for _, resp := range responses { - o.Slabs = append(o.Slabs, resp.slab) - } - - // calculate the eTag - eTag = hr.Hash() - - // add partial slabs - if len(partialSlab) > 0 { - var pss []object.SlabSlice - pss, bufferSizeLimitReached, err = mgr.b.AddPartialSlab(ctx, partialSlab, uint8(up.rs.MinShards), uint8(up.rs.TotalShards), up.contractSet) - if err != nil { - return false, "", err - } - o.Slabs = append(o.Slabs, pss...) - } - - if up.multipart { - // persist the part - err = mgr.b.AddMultipartPart(ctx, up.bucket, up.path, up.contractSet, eTag, up.uploadID, up.partNumber, o.Slabs) - if err != nil { - return bufferSizeLimitReached, "", fmt.Errorf("couldn't add multi part: %w", err) - } - } else { - // persist the object - err = mgr.b.AddObject(ctx, up.bucket, up.path, up.contractSet, o, api.AddObjectOptions{MimeType: up.mimeType, ETag: eTag}) - if err != nil { - return bufferSizeLimitReached, "", fmt.Errorf("couldn't add object: %w", err) - } - } - - return -} - -func (mgr *uploadManager) UploadPackedSlab(ctx context.Context, rs api.RedundancySettings, ps api.PackedSlab, contracts []api.ContractMetadata, bh uint64, lockPriority int, mem *acquiredMemory) error { - // build the shards - shards := encryptPartialSlab(ps.Data, ps.Key, uint8(rs.MinShards), uint8(rs.TotalShards)) - - // create the upload - upload, err := mgr.newUpload(ctx, len(shards), contracts, bh, lockPriority) - if err != nil { - return err - } - - // track the upload in the bus - if err := mgr.b.TrackUpload(ctx, upload.id); err != nil { - return fmt.Errorf("failed to track upload '%v', err: %w", upload.id, err) - } - - // defer a function that finishes the upload - defer func() { - ctx, cancel := context.WithTimeout(mgr.shutdownCtx, time.Minute) - if err := mgr.b.FinishUpload(ctx, upload.id); err != nil { - mgr.logger.Errorf("failed to mark upload %v as finished: %v", upload.id, err) - } - cancel() - }() - - // upload the shards - sectors, overdrivePct, overdriveSpeed, err := upload.uploadShards(ctx, shards, mgr.candidates(upload.allowed), mem, mgr.maxOverdrive, mgr.overdriveTimeout) - if err != nil { - return err - } - - // track stats - mgr.statsOverdrivePct.Track(overdrivePct) - mgr.statsSlabUploadSpeedBytesPerMS.Track(float64(overdriveSpeed)) - - // mark packed slab as uploaded - slab := api.UploadedPackedSlab{BufferID: ps.BufferID, Shards: sectors} - err = mgr.b.MarkPackedSlabsUploaded(ctx, []api.UploadedPackedSlab{slab}) - if err != nil { - return fmt.Errorf("couldn't mark packed slabs uploaded, err: %v", err) - } - - return nil -} - -func (mgr *uploadManager) candidates(allowed map[types.PublicKey]struct{}) (candidates []*uploader) { - mgr.mu.Lock() - defer mgr.mu.Unlock() - - for _, u := range mgr.uploaders { - if _, allowed := allowed[u.hk]; allowed { - candidates = append(candidates, u) - } - } - return -} - -func (mgr *uploadManager) newUpload(ctx context.Context, totalShards int, contracts []api.ContractMetadata, bh uint64, lockPriority int) (*upload, error) { - mgr.mu.Lock() - defer mgr.mu.Unlock() - - // refresh the uploaders - mgr.refreshUploaders(contracts, bh) - - // check if we have enough contracts - if len(contracts) < totalShards { - return nil, fmt.Errorf("%v < %v: %w", len(contracts), totalShards, errNotEnoughContracts) - } - - // create allowed map - allowed := make(map[types.PublicKey]struct{}) - for _, c := range contracts { - allowed[c.HostKey] = struct{}{} - } - - // create upload - return &upload{ - id: api.NewUploadID(), - allowed: allowed, - lockPriority: lockPriority, - shutdownCtx: mgr.shutdownCtx, - }, nil -} - -func (mgr *uploadManager) refreshUploaders(contracts []api.ContractMetadata, bh uint64) { - // build map of contracts - toKeep := make(map[types.FileContractID]api.ContractMetadata) - for _, c := range contracts { - toKeep[c.ID] = c - } - - // build map of renewed contracts - renewedTo := make(map[types.FileContractID]api.ContractMetadata) - for _, c := range contracts { - if c.RenewedFrom != (types.FileContractID{}) { - renewedTo[c.RenewedFrom] = c - } - } - - // keep list of uploaders uploaders - var uploaders []*uploader - for _, uploader := range mgr.uploaders { - fcid := uploader.contractID() - - renewal, renewed := renewedTo[fcid] - if _, keep := toKeep[fcid]; !(keep || renewed) { - uploader.Stop() - continue - } - delete(toKeep, fcid) // toKeep becomes missing - - if renewed { - uploader.renew(mgr.hp, renewal, bh) - } else { - uploader.updateBlockHeight(bh) - } - uploaders = append(uploaders, uploader) - } - - for _, c := range toKeep { - uploader := mgr.newUploader(mgr.b, mgr.hp, c, bh) - uploaders = append(uploaders, uploader) - go uploader.Start(mgr.hp, mgr.rl) - } - - mgr.uploaders = uploaders -} - -func (u *uploader) SignalWork() { - select { - case u.signalNewUpload <- struct{}{}: - default: - } -} - -func (u *uploader) Start(hp hostProvider, rl revisionLocker) { -outer: - for { - // wait for work - select { - case <-u.signalNewUpload: - case <-u.shutdownCtx.Done(): - return - } - - for { - // check if we are stopped - select { - case <-u.shutdownCtx.Done(): - return - default: - } - - // pop the next upload req - req := u.pop() - if req == nil { - continue outer - } - - // skip if upload is done - if req.done() { - continue - } - - // execute it - var root types.Hash256 - start := time.Now() - fcid := u.contractID() - err := rl.withRevision(req.sector.ctx, defaultRevisionFetchTimeout, fcid, u.hk, u.siamuxAddr, req.lockPriority, u.blockHeight(), func(rev types.FileContractRevision) error { - if rev.RevisionNumber == math.MaxUint64 { - return errMaxRevisionReached - } - - var err error - root, err = u.execute(req, rev) - return err - }) - - // the uploader's contract got renewed, requeue the request - if errors.Is(err, errMaxRevisionReached) { - u.enqueue(req) - continue outer - } - - // send the response - if err != nil { - req.fail(err) - } else { - req.succeed(root) - } - - // track the error, ignore gracefully closed streams and canceled overdrives - canceledOverdrive := req.done() && req.overdrive && err != nil - if !canceledOverdrive && !isClosedStream(err) { - u.trackSectorUpload(err, time.Since(start)) - } - } - } -} - -func (u *uploader) healthy() bool { - u.mu.Lock() - defer u.mu.Unlock() - return u.consecutiveFailures == 0 -} - -func (u *uploader) Stop() { - for { - upload := u.pop() - if upload == nil { - break - } - if !upload.done() { - upload.fail(errors.New("uploader stopped")) - } - } -} - -func (u *uploader) blockHeight() uint64 { - u.mu.Lock() - defer u.mu.Unlock() - return u.bh -} - -func (u *uploader) contractID() types.FileContractID { - u.mu.Lock() - defer u.mu.Unlock() - return u.fcid -} - -func (u *uploader) enqueue(req *sectorUploadReq) { - // trace the request - span := trace.SpanFromContext(req.sector.ctx) - span.SetAttributes(attribute.Stringer("hk", u.hk)) - span.AddEvent("enqueued") - - // decorate the request - req.fcid = u.contractID() - req.hk = u.hk - - // enqueue the request - u.mu.Lock() - u.queue = append(u.queue, req) - u.mu.Unlock() - - // signal there's work - u.SignalWork() -} - -func (u *uploader) estimate() float64 { - u.mu.Lock() - defer u.mu.Unlock() - - // fetch estimated duration per sector - estimateP90 := u.statsSectorUploadEstimateInMS.P90() - if estimateP90 == 0 { - estimateP90 = 1 - } - - // calculate estimated time - numSectors := float64(len(u.queue) + 1) - return numSectors * estimateP90 -} - -func (u *uploader) execute(req *sectorUploadReq, rev types.FileContractRevision) (types.Hash256, error) { - u.mu.Lock() - host := u.host - fcid := u.fcid - u.mu.Unlock() - - // fetch span from context - span := trace.SpanFromContext(req.sector.ctx) - span.AddEvent("execute") - - // update the bus - if err := u.b.AddUploadingSector(req.sector.ctx, req.uploadID, fcid, req.sector.root); err != nil { - return types.Hash256{}, fmt.Errorf("failed to add uploading sector to contract %v, err: %v", fcid, err) - } - - // upload the sector - start := time.Now() - root, err := host.UploadSector(req.sector.ctx, req.sector.data, rev) - if err != nil { - return types.Hash256{}, err - } - - // update span - elapsed := time.Since(start) - span.SetAttributes(attribute.Int64("duration", elapsed.Milliseconds())) - span.RecordError(err) - span.End() - - return root, nil -} - -func (u *uploader) pop() *sectorUploadReq { - u.mu.Lock() - defer u.mu.Unlock() - - if len(u.queue) > 0 { - j := u.queue[0] - u.queue[0] = nil - u.queue = u.queue[1:] - return j - } - return nil -} - -func (u *uploader) renew(hp hostProvider, c api.ContractMetadata, bh uint64) { - u.mu.Lock() - defer u.mu.Unlock() - - u.bh = bh - u.host = hp.newHostV3(c.ID, c.HostKey, c.SiamuxAddr) - u.fcid = c.ID - u.siamuxAddr = c.SiamuxAddr - u.endHeight = c.WindowEnd -} - -func (u *uploader) trackSectorUpload(err error, d time.Duration) { - u.mu.Lock() - defer u.mu.Unlock() - if err != nil { - u.consecutiveFailures++ - u.statsSectorUploadEstimateInMS.Track(float64(time.Hour.Milliseconds())) - } else { - ms := d.Milliseconds() - u.consecutiveFailures = 0 - u.statsSectorUploadEstimateInMS.Track(float64(ms)) // duration in ms - u.statsSectorUploadSpeedBytesPerMS.Track(float64(rhpv2.SectorSize / ms)) // bytes per ms - } -} - -func (u *uploader) tryRecomputeStats() { - u.mu.Lock() - defer u.mu.Unlock() - if time.Since(u.lastRecompute) < statsRecomputeMinInterval { - return - } - - u.lastRecompute = time.Now() - u.statsSectorUploadEstimateInMS.Recompute() - u.statsSectorUploadSpeedBytesPerMS.Recompute() -} - -func (u *uploader) updateBlockHeight(bh uint64) { - u.mu.Lock() - defer u.mu.Unlock() - u.bh = bh -} - -func (u *upload) newSlabUpload(ctx context.Context, shards [][]byte, candidates []*uploader, mem *acquiredMemory, maxOverdrive uint64, overdriveTimeout time.Duration) (*slabUpload, []*sectorUploadReq, chan sectorUploadResp) { - // create slab upload - slab := &slabUpload{ - lockPriority: u.lockPriority, - uploadID: u.id, - created: time.Now(), - maxOverdrive: maxOverdrive, - mem: mem, - overdriveTimeout: overdriveTimeout, - - candidates: candidates, - shards: shards, - - overdriving: make(map[int]map[types.PublicKey]struct{}), - sectors: make(map[int]*sectorUpload, len(shards)), - used: make(map[types.PublicKey]struct{}), - errs: make(HostErrorSet), - } - - // prepare sector uploads - responseChan := make(chan sectorUploadResp) - requests := make([]*sectorUploadReq, len(shards)) - for sI, shard := range shards { - // create the ctx - sCtx, sCancel := context.WithCancel(ctx) - - // attach the upload's span - sCtx, span := tracing.Tracer.Start(sCtx, "uploadSector") - span.SetAttributes(attribute.Bool("overdrive", false)) - span.SetAttributes(attribute.Int("sector", sI)) - - // create the sector - sector := §orUpload{ - data: (*[rhpv2.SectorSize]byte)(shard), - index: sI, - root: rhpv2.SectorRoot((*[rhpv2.SectorSize]byte)(shard)), - - ctx: sCtx, - cancel: sCancel, - } - slab.sectors[sI] = sector - - // create the request - requests[sI] = §orUploadReq{ - lockPriority: slab.lockPriority, - overdrive: false, - responseChan: responseChan, - sector: sector, - uploadID: slab.uploadID, - } - } - - return slab, requests, responseChan -} - -func (u *upload) uploadSlab(ctx context.Context, rs api.RedundancySettings, data []byte, length, index int, respChan chan slabUploadResponse, candidates []*uploader, mem *acquiredMemory, maxOverdrive uint64, overdriveTimeout time.Duration) (overdrivePct float64, overdriveSpeed int64) { - // add tracing - ctx, span := tracing.Tracer.Start(ctx, "uploadSlab") - defer span.End() - - // create the response - resp := slabUploadResponse{ - slab: object.SlabSlice{ - Slab: object.NewSlab(uint8(rs.MinShards)), - Offset: 0, - Length: uint32(length), - }, - index: index, - } - - // create the shards - shards := make([][]byte, rs.TotalShards) - resp.slab.Slab.Encode(data, shards) - resp.slab.Slab.Encrypt(shards) - - // upload the shards - resp.slab.Slab.Shards, overdrivePct, overdriveSpeed, resp.err = u.uploadShards(ctx, shards, candidates, mem, maxOverdrive, overdriveTimeout) - - // send the response - select { - case <-ctx.Done(): - case respChan <- resp: - } - - return -} - -func (u *upload) uploadShards(ctx context.Context, shards [][]byte, candidates []*uploader, mem *acquiredMemory, maxOverdrive uint64, overdriveTimeout time.Duration) ([]object.Sector, float64, int64, error) { - // add tracing - ctx, span := tracing.Tracer.Start(ctx, "uploadShards") - defer span.End() - - // ensure inflight uploads get cancelled - ctx, cancel := context.WithCancel(ctx) - defer cancel() - - // prepare the upload - slab, requests, respChan := u.newSlabUpload(ctx, shards, candidates, mem, maxOverdrive, overdriveTimeout) - - // launch all shard uploads - for _, upload := range requests { - if _, err := slab.launch(upload); err != nil { - return nil, 0, 0, err - } - } - - // launch overdrive - resetOverdrive := slab.overdrive(ctx, respChan) - - // collect responses - var done bool - for slab.inflight() > 0 && !done { - var resp sectorUploadResp - select { - case <-u.shutdownCtx.Done(): - return nil, 0, 0, errors.New("upload stopped") - case <-ctx.Done(): - return nil, 0, 0, ctx.Err() - case resp = <-respChan: - } - - resetOverdrive() - - // receive the response - done = slab.receive(resp) - - // relaunch non-overdrive uploads - if !done && resp.err != nil && !resp.req.overdrive { - if overdriving, err := slab.launch(resp.req); err != nil { - if !overdriving { - break // fail the upload - } - } - } - } - - // register the amount of overdrive sectors - span.SetAttributes(attribute.Int("overdrive", slab.overdriveCnt())) - - sectors, err := slab.finish() - return sectors, slab.overdrivePct(), slab.uploadSpeed(), err -} - -func (s *slabUpload) uploadSpeed() int64 { - s.mu.Lock() - defer s.mu.Unlock() - bytes := s.numUploaded * rhpv2.SectorSize - ms := time.Since(s.created).Milliseconds() - return int64(bytes) / ms -} - -func (s *slabUpload) finish() (sectors []object.Sector, _ error) { - s.mu.Lock() - defer s.mu.Unlock() - - if s.numUploaded < uint64(len(s.shards)) { - remaining := uint64(len(s.shards)) - s.numUploaded - return nil, fmt.Errorf("failed to upload slab: launched=%d uploaded=%d remaining=%d inflight=%d uploaders=%d errors=%d %w", s.numLaunched, s.numUploaded, remaining, s.numInflight, len(s.candidates), len(s.errs), s.errs) - } - - for i := 0; i < len(s.shards); i++ { - sectors = append(sectors, s.sectors[i].uploaded) - } - return -} - -func (s *slabUpload) inflight() uint64 { - s.mu.Lock() - defer s.mu.Unlock() - return s.numInflight -} - -func (s *slabUpload) launch(req *sectorUploadReq) (interrupt bool, err error) { - s.mu.Lock() - defer s.mu.Unlock() - - // nothing to do - if req == nil { - return false, nil - } - - // find candidate candidate - var candidate *uploader - for _, uploader := range s.candidates { - if _, used := s.used[uploader.hk]; used { - continue - } - if candidate == nil || uploader.estimate() < candidate.estimate() { - candidate = uploader - } - } - - // no candidate found - if candidate == nil { - err = errNoCandidateUploader - interrupt = !req.overdrive && len(s.overdriving[req.sector.index]) == 0 - span := trace.SpanFromContext(req.sector.ctx) - span.RecordError(err) - span.End() - return - } - - // enqueue the req - candidate.enqueue(req) - - // update the state - s.numInflight++ - s.numLaunched++ - s.used[req.hk] = struct{}{} - - if req.overdrive { - s.lastOverdrive = time.Now() - s.numOverdriving++ - - if _, exists := s.overdriving[req.sector.index]; !exists { - s.overdriving[req.sector.index] = make(map[types.PublicKey]struct{}) - } - s.overdriving[req.sector.index][req.hk] = struct{}{} - } - return -} - -func (s *slabUpload) overdrive(ctx context.Context, respChan chan sectorUploadResp) (resetTimer func()) { - // overdrive is disabled - if s.overdriveTimeout == 0 { - return func() {} - } - - // create a timer to trigger overdrive - timer := time.NewTimer(s.overdriveTimeout) - resetTimer = func() { - timer.Stop() - select { - case <-timer.C: - default: - } - timer.Reset(s.overdriveTimeout) - } - - // create a function to check whether overdrive is possible - canOverdrive := func() bool { - s.mu.Lock() - defer s.mu.Unlock() - - // overdrive is not kicking in yet - remaining := uint64(len(s.shards)) - s.numUploaded - if remaining >= s.maxOverdrive { - return false - } - - // overdrive is not due yet - if time.Since(s.lastOverdrive) < s.overdriveTimeout { - return false - } - - // overdrive is maxed out - if s.numInflight-remaining >= s.maxOverdrive { - return false - } - - return true - } - - // try overdriving every time the timer fires - go func() { - for { - select { - case <-ctx.Done(): - return - case <-timer.C: - if canOverdrive() { - _, _ = s.launch(s.nextRequest(respChan)) // ignore result - } - resetTimer() - } - } - }() - - return -} - -func (s *slabUpload) nextRequest(responseChan chan sectorUploadResp) *sectorUploadReq { - s.mu.Lock() - defer s.mu.Unlock() - - // find the sector that's not finished and has the least amount of overdrives - lowestNumOverdrives := math.MaxInt - var nextSector *sectorUpload - for _, sector := range s.sectors { - if !sector.isUploaded() && len(s.overdriving[sector.index]) < lowestNumOverdrives { - nextSector = sector - } - } - if nextSector == nil { - return nil - } - - return §orUploadReq{ - lockPriority: s.lockPriority, - overdrive: true, - responseChan: responseChan, - sector: nextSector, - uploadID: s.uploadID, - } -} - -func (s *slabUpload) overdriveCnt() int { - s.mu.Lock() - defer s.mu.Unlock() - return int(s.numLaunched) - len(s.sectors) -} - -func (s *slabUpload) overdrivePct() float64 { - s.mu.Lock() - defer s.mu.Unlock() - - numOverdrive := int(s.numLaunched) - len(s.sectors) - if numOverdrive <= 0 { - return 0 - } - - return float64(numOverdrive) / float64(len(s.sectors)) -} - -func (s *slabUpload) receive(resp sectorUploadResp) bool { - s.mu.Lock() - defer s.mu.Unlock() - - // convenience variable - req := resp.req - sector := req.sector - - // update the state - if req.overdrive { - s.numOverdriving-- - } - s.numInflight-- - - // failed reqs can't complete the upload - if resp.err != nil { - s.errs[req.hk] = resp.err - return false - } - - // redundant sectors can't complete the upload - if sector.uploaded.Root != (types.Hash256{}) { - return false - } - - // store the sector - sector.uploaded = object.Sector{ - Contracts: map[types.PublicKey][]types.FileContractID{req.hk: {req.fcid}}, - LatestHost: req.hk, - Root: resp.root, - } - - // update uploaded sectors - s.numUploaded++ - - // cancel the sector context - sector.cancel() - - // free hosts we're using to overdrive this sector - for hk := range s.overdriving[req.sector.index] { - delete(s.used, hk) - } - - // release memory - sector.data = nil - s.shards[sector.index] = nil - s.mem.ReleaseSome(rhpv2.SectorSize) - - return s.numUploaded == uint64(len(s.shards)) -} - -func (req *sectorUploadReq) done() bool { - select { - case <-req.sector.ctx.Done(): - return true - default: - return false - } -} - -func (s *sectorUpload) isUploaded() bool { - return s.uploaded.Root != (types.Hash256{}) -} - -func (req *sectorUploadReq) fail(err error) { - select { - case <-req.sector.ctx.Done(): - case req.responseChan <- sectorUploadResp{ - req: req, - err: err, - }: - } -} - -func (req *sectorUploadReq) succeed(root types.Hash256) { - select { - case <-req.sector.ctx.Done(): - case req.responseChan <- sectorUploadResp{ - req: req, - root: root, - }: - } -} diff --git a/worker/upload_params.go b/worker/upload_params.go new file mode 100644 index 000000000..8e033b287 --- /dev/null +++ b/worker/upload_params.go @@ -0,0 +1,135 @@ +package worker + +import ( + "bytes" + "encoding/hex" + "io" + + "github.com/gabriel-vasile/mimetype" + "go.sia.tech/core/types" + "go.sia.tech/renterd/api" + "go.sia.tech/renterd/build" + "go.sia.tech/renterd/object" +) + +type uploadParameters struct { + bucket string + path string + + multipart bool + uploadID string + partNumber int + + ec object.EncryptionKey + encryptionOffset uint64 + + rs api.RedundancySettings + bh uint64 + contractSet string + packing bool + mimeType string +} + +func defaultParameters(bucket, path string) uploadParameters { + return uploadParameters{ + bucket: bucket, + path: path, + + ec: object.GenerateEncryptionKey(), // random key + encryptionOffset: 0, // from the beginning + + rs: build.DefaultRedundancySettings, + } +} + +func multipartParameters(bucket, path, uploadID string, partNumber int) uploadParameters { + return uploadParameters{ + bucket: bucket, + path: path, + + multipart: true, + uploadID: uploadID, + partNumber: partNumber, + + ec: object.GenerateEncryptionKey(), // random key + encryptionOffset: 0, // from the beginning + + rs: build.DefaultRedundancySettings, + } +} + +type UploadOption func(*uploadParameters) + +func WithBlockHeight(bh uint64) UploadOption { + return func(up *uploadParameters) { + up.bh = bh + } +} + +func WithContractSet(contractSet string) UploadOption { + return func(up *uploadParameters) { + up.contractSet = contractSet + } +} + +func WithCustomKey(ec object.EncryptionKey) UploadOption { + return func(up *uploadParameters) { + up.ec = ec + } +} + +func WithCustomEncryptionOffset(offset uint64) UploadOption { + return func(up *uploadParameters) { + up.encryptionOffset = offset + } +} + +func WithMimeType(mimeType string) UploadOption { + return func(up *uploadParameters) { + up.mimeType = mimeType + } +} + +func WithPacking(packing bool) UploadOption { + return func(up *uploadParameters) { + up.packing = packing + } +} + +func WithRedundancySettings(rs api.RedundancySettings) UploadOption { + return func(up *uploadParameters) { + up.rs = rs + } +} + +func newMimeReader(r io.Reader) (mimeType string, recycled io.Reader, err error) { + buf := bytes.NewBuffer(nil) + mtype, err := mimetype.DetectReader(io.TeeReader(r, buf)) + recycled = io.MultiReader(buf, r) + return mtype.String(), recycled, err +} + +type hashReader struct { + r io.Reader + h *types.Hasher +} + +func newHashReader(r io.Reader) *hashReader { + return &hashReader{ + r: r, + h: types.NewHasher(), + } +} + +func (e *hashReader) Read(p []byte) (int, error) { + n, err := e.r.Read(p) + if _, wErr := e.h.E.Write(p[:n]); wErr != nil { + return 0, wErr + } + return n, err +} + +func (e *hashReader) Hash() string { + sum := e.h.Sum() + return hex.EncodeToString(sum[:]) +} From adba7425013a16d4836c79a1817871557af01542 Mon Sep 17 00:00:00 2001 From: PJ Date: Thu, 7 Dec 2023 16:06:05 +0100 Subject: [PATCH 12/25] worker: only sort candidates when creating an upload --- worker/upload.go | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/worker/upload.go b/worker/upload.go index f5c5fc576..c57eab75b 100644 --- a/worker/upload.go +++ b/worker/upload.go @@ -101,7 +101,7 @@ type ( mem *acquiredMemory overdriveTimeout time.Duration - candidates []*uploader + candidates []*uploader // sorted by upload estimate shards [][]byte mu sync.Mutex @@ -431,7 +431,6 @@ func (mgr *uploadManager) Stats() uploadManagerStats { var numHealthy uint64 speeds := make(map[types.PublicKey]float64) for _, u := range mgr.uploaders { - u.tryRecomputeStats() speeds[u.hk] = u.statsSectorUploadSpeedBytesPerMS.Average() * 0.008 if u.healthy() { numHealthy++ @@ -677,6 +676,11 @@ func (mgr *uploadManager) candidates(allowed map[types.PublicKey]struct{}) (cand candidates = append(candidates, u) } } + + // sort candidates by upload estimate + sort.Slice(candidates, func(i, j int) bool { + return candidates[i].estimate() < candidates[j].estimate() + }) return } @@ -739,6 +743,8 @@ func (mgr *uploadManager) refreshUploaders(contracts []api.ContractMetadata, bh } else { uploader.updateBlockHeight(bh) } + + uploader.tryRecomputeStats() uploaders = append(uploaders, uploader) } @@ -1155,15 +1161,14 @@ func (s *slabUpload) launch(req *sectorUploadReq) (interrupt bool, err error) { return false, nil } - // find candidate candidate + // find next candidate var candidate *uploader for _, uploader := range s.candidates { if _, used := s.used[uploader.hk]; used { continue } - if candidate == nil || uploader.estimate() < candidate.estimate() { - candidate = uploader - } + candidate = uploader + break } // no candidate found From ab85af92a2370f376d4d6e7c08c430937a573a76 Mon Sep 17 00:00:00 2001 From: PJ Date: Thu, 7 Dec 2023 16:06:20 +0100 Subject: [PATCH 13/25] testing: rework TestUploadDownloadSameHost --- internal/testing/cluster_test.go | 81 +++++++++----------------------- 1 file changed, 23 insertions(+), 58 deletions(-) diff --git a/internal/testing/cluster_test.go b/internal/testing/cluster_test.go index 52f064e4e..56211d511 100644 --- a/internal/testing/cluster_test.go +++ b/internal/testing/cluster_test.go @@ -1244,82 +1244,47 @@ func TestEphemeralAccountSync(t *testing.T) { // TestUploadDownloadSameHost uploads a file to the same host through different // contracts and tries downloading the file again. func TestUploadDownloadSameHost(t *testing.T) { - t.SkipNow() // TODO PJ - if testing.Short() { t.SkipNow() } // create a test cluster cluster := newTestCluster(t, testClusterOptions{ - hosts: 1, + hosts: testRedundancySettings.TotalShards, }) defer cluster.Shutdown() tt := cluster.tt + b := cluster.Bus + w := cluster.Worker // shut down the autopilot to prevent it from doing contract maintenance if any kind cluster.ShutdownAutopilot(context.Background()) - // get wallet address - wallet, err := cluster.Bus.Wallet(context.Background()) - tt.OK(err) - - ac, err := cluster.Worker.Contracts(context.Background(), time.Minute) - tt.OK(err) - - contracts := ac.Contracts - if len(contracts) != 1 { - t.Fatal("expected 1 contract", len(contracts)) - } - c := contracts[0] - - // form 2 more contracts with the same host - rev2, _, err := cluster.Worker.RHPForm(context.Background(), c.WindowStart, c.HostKey, c.HostIP, wallet.Address, c.RenterFunds(), c.Revision.ValidHostPayout()) - tt.OK(err) - c2, err := cluster.Bus.AddContract(context.Background(), rev2, types.ZeroCurrency, c.TotalCost, c.StartHeight, api.ContractStatePending) - tt.OK(err) - rev3, _, err := cluster.Worker.RHPForm(context.Background(), c.WindowStart, c.HostKey, c.HostIP, wallet.Address, c.RenterFunds(), c.Revision.ValidHostPayout()) - tt.OK(err) - c3, err := cluster.Bus.AddContract(context.Background(), rev3, types.ZeroCurrency, c.TotalCost, c.StartHeight, api.ContractStatePending) - tt.OK(err) + // upload 3 objects so every host has 3 sectors + var err error + var res api.ObjectsResponse + shards := make(map[types.PublicKey][]object.Sector) + for i := 0; i < 3; i++ { + // upload object + tt.OKAll(w.UploadObject(context.Background(), bytes.NewReader(frand.Bytes(rhpv2.SectorSize)), api.DefaultBucketName, fmt.Sprintf("foo_%d", i), api.UploadObjectOptions{})) - // create a contract set with all 3 contracts - err = cluster.Bus.SetContractSet(context.Background(), testAutopilotConfig.Contracts.Set, []types.FileContractID{c.ID, c2.ID, c3.ID}) - tt.OK(err) + // download object from bus and keep track of its shards + res, err = b.Object(context.Background(), api.DefaultBucketName, fmt.Sprintf("foo_%d", i), api.GetObjectOptions{}) + tt.OK(err) + for _, shard := range res.Object.Slabs[0].Shards { + shards[shard.LatestHost] = append(shards[shard.LatestHost], shard) + } - // check the bus returns the desired contracts - up, err := cluster.Bus.UploadParams(context.Background()) - tt.OK(err) - csc, err := cluster.Bus.ContractSetContracts(context.Background(), up.ContractSet) - tt.OK(err) - if len(csc) != 3 { - t.Fatal("expected 3 contracts", len(csc)) + // delete the object + tt.OK(b.DeleteObject(context.Background(), api.DefaultBucketName, fmt.Sprintf("foo_%d", i), api.DeleteObjectOptions{})) } - // upload a file - data := frand.Bytes(5*rhpv2.SectorSize + 1) - tt.OKAll(cluster.Worker.UploadObject(context.Background(), bytes.NewReader(data), api.DefaultBucketName, "foo", api.UploadObjectOptions{})) + // build a frankenstein object constructed with all sectors on the same host + res.Object.Slabs[0].Shards = shards[res.Object.Slabs[0].Shards[0].LatestHost] + tt.OK(b.AddObject(context.Background(), api.DefaultBucketName, "frankenstein", testContractSet, res.Object.Object, api.AddObjectOptions{})) - // Download the file multiple times. - var wg sync.WaitGroup - for tt := 0; tt < 3; tt++ { - wg.Add(1) - go func() { - defer wg.Done() - for i := 0; i < 5; i++ { - buf := &bytes.Buffer{} - if err := cluster.Worker.DownloadObject(context.Background(), buf, api.DefaultBucketName, "foo", api.DownloadObjectOptions{}); err != nil { - t.Error(err) - break - } - if !bytes.Equal(buf.Bytes(), data) { - t.Error("data mismatch") - break - } - } - }() - } - wg.Wait() + // assert we can download this object + tt.OK(w.DownloadObject(context.Background(), io.Discard, api.DefaultBucketName, "frankenstein", api.DownloadObjectOptions{})) } func TestContractArchival(t *testing.T) { From a09453bc39176704ed5be350b318a4cd978540f9 Mon Sep 17 00:00:00 2001 From: PJ Date: Thu, 7 Dec 2023 16:19:33 +0100 Subject: [PATCH 14/25] worker: cleanup upload refactor --- internal/testing/pruning_test.go | 8 +--- worker/upload.go | 66 ++++++++++++++++++++------------ worker/worker.go | 4 +- 3 files changed, 45 insertions(+), 33 deletions(-) diff --git a/internal/testing/pruning_test.go b/internal/testing/pruning_test.go index 6387e6a84..b80fe6acf 100644 --- a/internal/testing/pruning_test.go +++ b/internal/testing/pruning_test.go @@ -13,7 +13,6 @@ import ( "go.sia.tech/core/types" "go.sia.tech/renterd/api" "go.sia.tech/renterd/hostdb" - "go.uber.org/zap/zapcore" ) func TestHostPruning(t *testing.T) { @@ -22,9 +21,7 @@ func TestHostPruning(t *testing.T) { } // create a new test cluster - opts := clusterOptsDefault - opts.logger = newTestLoggerCustom(zapcore.DebugLevel) - cluster := newTestCluster(t, opts) + cluster := newTestCluster(t, clusterOptsDefault) defer cluster.Shutdown() b := cluster.Bus w := cluster.Worker @@ -85,9 +82,6 @@ func TestHostPruning(t *testing.T) { } time.Sleep(50 * time.Millisecond) } - if remaining != 0 { - t.Log("failed to trigger") - } // assert the host was not pruned hostss, err := b.Hosts(context.Background(), api.GetHostsOptions{}) diff --git a/worker/upload.go b/worker/upload.go index c57eab75b..b13164728 100644 --- a/worker/upload.go +++ b/worker/upload.go @@ -31,9 +31,9 @@ const ( ) var ( - errNoCandidateUploader = errors.New("no candidate uploader found") - errNotEnoughContracts = errors.New("not enough contracts to support requested redundancy") - errUploadManagerStopped = errors.New("upload manager stopped") + errNoCandidateUploader = errors.New("no candidate uploader found") + errNotEnoughContracts = errors.New("not enough contracts to support requested redundancy") + errWorkerShutDown = errors.New("worker was shut down") ) type ( @@ -367,7 +367,18 @@ func (mgr *uploadManager) newUploader(b Bus, hp hostProvider, c api.ContractMeta } } -func (mgr *uploadManager) MigrateShards(ctx context.Context, s *object.Slab, shardIndices []int, shards [][]byte, contractSet string, contracts []api.ContractMetadata, bh uint64, lockPriority int, mem *acquiredMemory) error { +func (mgr *uploadManager) MigrateShards(ctx context.Context, s *object.Slab, shardIndices []int, shards [][]byte, contractSet string, contracts []api.ContractMetadata, bh uint64, lockPriority int, mem *acquiredMemory) (err error) { + // cancel all in-flight requests when the upload is done + ctx, cancel := context.WithCancel(ctx) + defer cancel() + + // add tracing + ctx, span := tracing.Tracer.Start(ctx, "MigrateShards") + defer func() { + span.RecordError(err) + span.End() + }() + // create the upload upload, err := mgr.newUpload(ctx, len(shards), contracts, bh, lockPriority) if err != nil { @@ -461,7 +472,7 @@ func (mgr *uploadManager) Upload(ctx context.Context, r io.Reader, contracts []a defer cancel() // add tracing - ctx, span := tracing.Tracer.Start(ctx, "upload") + ctx, span := tracing.Tracer.Start(ctx, "Upload") defer func() { span.RecordError(err) span.End() @@ -573,7 +584,7 @@ func (mgr *uploadManager) Upload(ctx context.Context, r io.Reader, contracts []a for len(responses) < numSlabs { select { case <-mgr.shutdownCtx.Done(): - return false, "", errUploadManagerStopped + return false, "", errWorkerShutDown case numSlabs = <-numSlabsChan: case res := <-respChan: if res.err != nil { @@ -623,7 +634,18 @@ func (mgr *uploadManager) Upload(ctx context.Context, r io.Reader, contracts []a return } -func (mgr *uploadManager) UploadPackedSlab(ctx context.Context, rs api.RedundancySettings, ps api.PackedSlab, contracts []api.ContractMetadata, bh uint64, lockPriority int, mem *acquiredMemory) error { +func (mgr *uploadManager) UploadPackedSlab(ctx context.Context, rs api.RedundancySettings, ps api.PackedSlab, contracts []api.ContractMetadata, bh uint64, lockPriority int, mem *acquiredMemory) (err error) { + // cancel all in-flight requests when the upload is done + ctx, cancel := context.WithCancel(ctx) + defer cancel() + + // add tracing + ctx, span := tracing.Tracer.Start(ctx, "UploadPackedSlab") + defer func() { + span.RecordError(err) + span.End() + }() + // build the shards shards := encryptPartialSlab(ps.Data, ps.Key, uint8(rs.MinShards), uint8(rs.TotalShards)) @@ -712,15 +734,11 @@ func (mgr *uploadManager) newUpload(ctx context.Context, totalShards int, contra } func (mgr *uploadManager) refreshUploaders(contracts []api.ContractMetadata, bh uint64) { - // build map of contracts + // build map of contracts to keep and what contracts got renewed toKeep := make(map[types.FileContractID]api.ContractMetadata) - for _, c := range contracts { - toKeep[c.ID] = c - } - - // build map of renewed contracts renewedTo := make(map[types.FileContractID]api.ContractMetadata) for _, c := range contracts { + toKeep[c.ID] = c if c.RenewedFrom != (types.FileContractID{}) { renewedTo[c.RenewedFrom] = c } @@ -736,7 +754,7 @@ func (mgr *uploadManager) refreshUploaders(contracts []api.ContractMetadata, bh uploader.Stop() continue } - delete(toKeep, fcid) // toKeep becomes missing + delete(toKeep, fcid) // toKeep becomes toAdd if renewed { uploader.renew(mgr.hp, renewal, bh) @@ -829,12 +847,6 @@ outer: } } -func (u *uploader) healthy() bool { - u.mu.Lock() - defer u.mu.Unlock() - return u.consecutiveFailures == 0 -} - func (u *uploader) Stop() { for { upload := u.pop() @@ -924,6 +936,12 @@ func (u *uploader) execute(req *sectorUploadReq, rev types.FileContractRevision) return root, nil } +func (u *uploader) healthy() bool { + u.mu.Lock() + defer u.mu.Unlock() + return u.consecutiveFailures == 0 +} + func (u *uploader) pop() *sectorUploadReq { u.mu.Lock() defer u.mu.Unlock() @@ -1354,6 +1372,10 @@ func (s *slabUpload) receive(resp sectorUploadResp) bool { return s.numUploaded == uint64(len(s.shards)) } +func (s *sectorUpload) isUploaded() bool { + return s.uploaded.Root != (types.Hash256{}) +} + func (req *sectorUploadReq) done() bool { select { case <-req.sector.ctx.Done(): @@ -1363,10 +1385,6 @@ func (req *sectorUploadReq) done() bool { } } -func (s *sectorUpload) isUploaded() bool { - return s.uploaded.Root != (types.Hash256{}) -} - func (req *sectorUploadReq) fail(err error) { select { case <-req.sector.ctx.Done(): diff --git a/worker/worker.go b/worker/worker.go index ba52735f6..e0ba73674 100644 --- a/worker/worker.go +++ b/worker/worker.go @@ -1131,7 +1131,7 @@ func (w *worker) objectsHandlerPUT(jc jape.Context) { params := defaultParameters(bucket, path) eTag, err := w.upload(ctx, jc.Request.Body, contracts, params, opts...) if err := jc.Check("couldn't upload object", err); err != nil { - if err != nil && !(errors.Is(err, errUploadManagerStopped) || + if err != nil && !(errors.Is(err, errWorkerShutDown) || errors.Is(err, errNotEnoughContracts) || errors.Is(err, context.Canceled)) { w.logger.Error(err) @@ -1270,7 +1270,7 @@ func (w *worker) multipartUploadHandlerPUT(jc jape.Context) { params := multipartParameters(bucket, path, uploadID, partNumber) eTag, err := w.upload(ctx, jc.Request.Body, contracts, params, opts...) if jc.Check("couldn't upload object", err) != nil { - if err != nil && !(errors.Is(err, errUploadManagerStopped) || + if err != nil && !(errors.Is(err, errWorkerShutDown) || errors.Is(err, errNotEnoughContracts) || errors.Is(err, context.Canceled)) { w.logger.Error(err) From 685a701afb5898e5c34b3fafdd6fed8477ae8a18 Mon Sep 17 00:00:00 2001 From: PJ Date: Thu, 7 Dec 2023 17:02:05 +0100 Subject: [PATCH 15/25] autopilot: fix log --- autopilot/autopilot.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/autopilot/autopilot.go b/autopilot/autopilot.go index d42fbceab..43b54023d 100644 --- a/autopilot/autopilot.go +++ b/autopilot/autopilot.go @@ -242,7 +242,7 @@ func (ap *Autopilot) Run() error { // Trace/Log worker id chosen for this maintenance iteration. workerID, err := w.ID(ctx) if err != nil { - ap.logger.Errorf("failed to fetch worker id - abort maintenance", err) + ap.logger.Errorf("aborting maintenance, failed to fetch worker id, err: %v", err) return } span.SetAttributes(attribute.String("worker", workerID)) From 1f337ff3829501ec61bd87a2e128c68d0f22c007 Mon Sep 17 00:00:00 2001 From: PJ Date: Fri, 8 Dec 2023 11:22:06 +0100 Subject: [PATCH 16/25] worker: update memory manager init --- worker/download.go | 3 ++- worker/memory.go | 8 ++------ worker/upload.go | 3 ++- worker/worker.go | 20 +++++++++----------- 4 files changed, 15 insertions(+), 19 deletions(-) diff --git a/worker/download.go b/worker/download.go index 3db613107..4aaeb9083 100644 --- a/worker/download.go +++ b/worker/download.go @@ -159,11 +159,12 @@ type ( } ) -func (w *worker) initDownloadManager(mm memoryManager, maxOverdrive uint64, overdriveTimeout time.Duration, logger *zap.SugaredLogger) { +func (w *worker) initDownloadManager(maxMemory, maxOverdrive uint64, overdriveTimeout time.Duration, logger *zap.SugaredLogger) { if w.downloadManager != nil { panic("download manager already initialized") // developer error } + mm := newMemoryManager(logger, maxMemory) w.downloadManager = newDownloadManager(w, w, mm, w.bus, maxOverdrive, overdriveTimeout, logger) } diff --git a/worker/memory.go b/worker/memory.go index 9d1bd2094..c86a59324 100644 --- a/worker/memory.go +++ b/worker/memory.go @@ -2,7 +2,6 @@ package worker import ( "context" - "fmt" "sync" "go.sia.tech/renterd/api" @@ -35,17 +34,14 @@ type ( var _ memoryManager = (*manager)(nil) -func newMemoryManager(logger *zap.SugaredLogger, maxMemory uint64) (memoryManager, error) { - if maxMemory == 0 { - return nil, fmt.Errorf("maxMemory cannot be 0") - } +func newMemoryManager(logger *zap.SugaredLogger, maxMemory uint64) memoryManager { mm := &manager{ logger: logger, totalAvailable: maxMemory, } mm.available = mm.totalAvailable mm.sigNewMem = *sync.NewCond(&mm.mu) - return mm, nil + return mm } func (mm *manager) Status() api.MemoryStatus { diff --git a/worker/upload.go b/worker/upload.go index b13164728..24172237c 100644 --- a/worker/upload.go +++ b/worker/upload.go @@ -153,11 +153,12 @@ type ( } ) -func (w *worker) initUploadManager(mm memoryManager, maxOverdrive uint64, overdriveTimeout time.Duration, logger *zap.SugaredLogger) { +func (w *worker) initUploadManager(maxMemory, maxOverdrive uint64, overdriveTimeout time.Duration, logger *zap.SugaredLogger) { if w.uploadManager != nil { panic("upload manager already initialized") // developer error } + mm := newMemoryManager(logger, maxMemory) w.uploadManager = newUploadManager(w.bus, w, w, mm, maxOverdrive, overdriveTimeout, w.shutdownCtx, logger) } diff --git a/worker/worker.go b/worker/worker.go index e0ba73674..2afc34fe6 100644 --- a/worker/worker.go +++ b/worker/worker.go @@ -1396,6 +1396,12 @@ func New(masterKey [32]byte, id string, b Bus, contractLockingDuration, busFlush if uploadOverdriveTimeout == 0 { return nil, errors.New("upload overdrive timeout must be positive") } + if downloadMaxMemory == 0 { + return nil, errors.New("downloadMaxMemory cannot be 0") + } + if uploadMaxMemory == 0 { + return nil, errors.New("uploadMaxMemory cannot be 0") + } ctx, cancel := context.WithCancel(context.Background()) w := &worker{ @@ -1412,20 +1418,12 @@ func New(masterKey [32]byte, id string, b Bus, contractLockingDuration, busFlush shutdownCtx: ctx, shutdownCtxCancel: cancel, } - w.initTransportPool() w.initAccounts(b) w.initContractSpendingRecorder() + w.initDownloadManager(downloadMaxMemory, downloadMaxOverdrive, downloadOverdriveTimeout, l.Sugar().Named("downloadmanager")) w.initPriceTables() - dmm, err := newMemoryManager(w.logger, downloadMaxMemory) - if err != nil { - return nil, err - } - w.initDownloadManager(dmm, downloadMaxOverdrive, downloadOverdriveTimeout, l.Sugar().Named("downloadmanager")) - umm, err := newMemoryManager(w.logger, uploadMaxMemory) - if err != nil { - return nil, err - } - w.initUploadManager(umm, uploadMaxOverdrive, uploadOverdriveTimeout, l.Sugar().Named("uploadmanager")) + w.initTransportPool() + w.initUploadManager(uploadMaxMemory, uploadMaxOverdrive, uploadOverdriveTimeout, l.Sugar().Named("uploadmanager")) return w, nil } From cfd837449b14d9750448748fa3b18845302b0103 Mon Sep 17 00:00:00 2001 From: PJ Date: Fri, 8 Dec 2023 11:22:15 +0100 Subject: [PATCH 17/25] worker: update refresh uploaders --- worker/upload.go | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/worker/upload.go b/worker/upload.go index 24172237c..ce905f105 100644 --- a/worker/upload.go +++ b/worker/upload.go @@ -748,21 +748,19 @@ func (mgr *uploadManager) refreshUploaders(contracts []api.ContractMetadata, bh // keep list of uploaders uploaders var uploaders []*uploader for _, uploader := range mgr.uploaders { - fcid := uploader.contractID() - - renewal, renewed := renewedTo[fcid] - if _, keep := toKeep[fcid]; !(keep || renewed) { + renewal, renewed := renewedTo[uploader.contractID()] + if _, keep := toKeep[uploader.contractID()]; !(keep || renewed) { uploader.Stop() continue - } - delete(toKeep, fcid) // toKeep becomes toAdd - - if renewed { + } else if renewed { uploader.renew(mgr.hp, renewal, bh) - } else { - uploader.updateBlockHeight(bh) } + // delete current fcid from toKeep, by doing so it becomes a list of the + // contracts we want to add + delete(toKeep, uploader.contractID()) + + uploader.updateBlockHeight(bh) uploader.tryRecomputeStats() uploaders = append(uploaders, uploader) } From 316160fc6e8d219f3eb0ae3f094e66279075f92a Mon Sep 17 00:00:00 2001 From: PJ Date: Fri, 8 Dec 2023 13:09:16 +0100 Subject: [PATCH 18/25] worker: refactor upload --- tracing/tracing.go | 4 +- worker/contract_lock.go | 119 +++++++++ worker/download.go | 4 +- worker/memory.go | 18 +- worker/upload.go | 558 ++++++++++++---------------------------- worker/upload_params.go | 38 --- worker/upload_utils.go | 55 ++++ worker/uploader.go | 265 +++++++++++++++++++ worker/worker.go | 121 --------- 9 files changed, 614 insertions(+), 568 deletions(-) create mode 100644 worker/contract_lock.go create mode 100644 worker/upload_utils.go create mode 100644 worker/uploader.go diff --git a/tracing/tracing.go b/tracing/tracing.go index 8f4f0a8dc..8dbf75cc2 100644 --- a/tracing/tracing.go +++ b/tracing/tracing.go @@ -13,7 +13,7 @@ import ( "go.opentelemetry.io/otel/sdk/resource" sdktrace "go.opentelemetry.io/otel/sdk/trace" semconv "go.opentelemetry.io/otel/semconv/v1.4.0" - "go.opentelemetry.io/otel/trace" + "go.opentelemetry.io/otel/trace/noop" "go.sia.tech/jape" ) @@ -23,7 +23,7 @@ const ( ) var ( - Tracer = trace.NewNoopTracerProvider().Tracer("noop") + Tracer = noop.NewTracerProvider().Tracer("noop") ) // Init initialises a new OpenTelemetry Tracer using information from the diff --git a/worker/contract_lock.go b/worker/contract_lock.go new file mode 100644 index 000000000..569ac2be5 --- /dev/null +++ b/worker/contract_lock.go @@ -0,0 +1,119 @@ +package worker + +import ( + "context" + "errors" + "fmt" + "sync" + "time" + + "go.sia.tech/core/types" + "go.uber.org/zap" +) + +type ContractLocker interface { + AcquireContract(ctx context.Context, fcid types.FileContractID, priority int, d time.Duration) (lockID uint64, err error) + KeepaliveContract(ctx context.Context, fcid types.FileContractID, lockID uint64, d time.Duration) (err error) + ReleaseContract(ctx context.Context, fcid types.FileContractID, lockID uint64) (err error) +} + +var _ ContractLocker = (Bus)(nil) + +type contractLock struct { + lockID uint64 + fcid types.FileContractID + d time.Duration + locker ContractLocker + logger *zap.SugaredLogger + + stopCtx context.Context + stopCtxCancel context.CancelFunc + stopWG sync.WaitGroup +} + +func newContractLock(fcid types.FileContractID, lockID uint64, d time.Duration, locker ContractLocker, logger *zap.SugaredLogger) *contractLock { + ctx, cancel := context.WithCancel(context.Background()) + cl := &contractLock{ + lockID: lockID, + fcid: fcid, + d: d, + locker: locker, + logger: logger, + + stopCtx: ctx, + stopCtxCancel: cancel, + } + cl.stopWG.Add(1) + go func() { + cl.keepaliveLoop() + cl.stopWG.Done() + }() + return cl +} + +func (w *worker) acquireContractLock(ctx context.Context, fcid types.FileContractID, priority int) (_ revisionUnlocker, err error) { + lockID, err := w.bus.AcquireContract(ctx, fcid, priority, w.contractLockingDuration) + if err != nil { + return nil, err + } + return newContractLock(fcid, lockID, w.contractLockingDuration, w.bus, w.logger), nil +} + +func (w *worker) withContractLock(ctx context.Context, fcid types.FileContractID, priority int, fn func() error) error { + contractLock, err := w.acquireContractLock(ctx, fcid, priority) + if err != nil { + return err + } + defer func() { + releaseCtx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + _ = contractLock.Release(releaseCtx) + cancel() + }() + + return fn() +} + +func (cl *contractLock) Release(ctx context.Context) error { + // Stop background loop. + cl.stopCtxCancel() + cl.stopWG.Wait() + + // Release the contract. + return cl.locker.ReleaseContract(ctx, cl.fcid, cl.lockID) +} + +func (cl *contractLock) keepaliveLoop() { + // Create ticker for 20% of the lock duration. + start := time.Now() + var lastUpdate time.Time + tickDuration := cl.d / 5 + t := time.NewTicker(tickDuration) + + // Cleanup + defer func() { + t.Stop() + select { + case <-t.C: + default: + } + }() + + // Loop until stopped. + for { + select { + case <-cl.stopCtx.Done(): + return // released + case <-t.C: + } + if err := cl.locker.KeepaliveContract(cl.stopCtx, cl.fcid, cl.lockID, cl.d); err != nil && !errors.Is(err, context.Canceled) { + cl.logger.Errorw(fmt.Sprintf("failed to send keepalive: %v", err), + "contract", cl.fcid, + "lockID", cl.lockID, + "loopStart", start, + "timeSinceLastUpdate", time.Since(lastUpdate), + "tickDuration", tickDuration) + return + } + lastUpdate = time.Now() + } +} diff --git a/worker/download.go b/worker/download.go index 4aaeb9083..32265d184 100644 --- a/worker/download.go +++ b/worker/download.go @@ -40,7 +40,7 @@ type ( id [8]byte downloadManager struct { - mm memoryManager + mm MemoryManager hp hostProvider pss partialSlabStore slm sectorLostMarker @@ -168,7 +168,7 @@ func (w *worker) initDownloadManager(maxMemory, maxOverdrive uint64, overdriveTi w.downloadManager = newDownloadManager(w, w, mm, w.bus, maxOverdrive, overdriveTimeout, logger) } -func newDownloadManager(hp hostProvider, pss partialSlabStore, mm memoryManager, slm sectorLostMarker, maxOverdrive uint64, overdriveTimeout time.Duration, logger *zap.SugaredLogger) *downloadManager { +func newDownloadManager(hp hostProvider, pss partialSlabStore, mm MemoryManager, slm sectorLostMarker, maxOverdrive uint64, overdriveTimeout time.Duration, logger *zap.SugaredLogger) *downloadManager { return &downloadManager{ hp: hp, mm: mm, diff --git a/worker/memory.go b/worker/memory.go index c86a59324..3bc58d665 100644 --- a/worker/memory.go +++ b/worker/memory.go @@ -9,14 +9,14 @@ import ( ) type ( - // memoryManager helps regulate processes that use a lot of memory. Such as + // MemoryManager helps regulate processes that use a lot of memory. Such as // uploads and downloads. - memoryManager interface { + MemoryManager interface { Status() api.MemoryStatus AcquireMemory(ctx context.Context, amt uint64) *acquiredMemory } - manager struct { + memoryManager struct { totalAvailable uint64 logger *zap.SugaredLogger @@ -26,16 +26,16 @@ type ( } acquiredMemory struct { - mm *manager + mm *memoryManager remaining uint64 } ) -var _ memoryManager = (*manager)(nil) +var _ MemoryManager = (*memoryManager)(nil) -func newMemoryManager(logger *zap.SugaredLogger, maxMemory uint64) memoryManager { - mm := &manager{ +func newMemoryManager(logger *zap.SugaredLogger, maxMemory uint64) MemoryManager { + mm := &memoryManager{ logger: logger, totalAvailable: maxMemory, } @@ -44,7 +44,7 @@ func newMemoryManager(logger *zap.SugaredLogger, maxMemory uint64) memoryManager return mm } -func (mm *manager) Status() api.MemoryStatus { +func (mm *memoryManager) Status() api.MemoryStatus { mm.mu.Lock() defer mm.mu.Unlock() return api.MemoryStatus{ @@ -53,7 +53,7 @@ func (mm *manager) Status() api.MemoryStatus { } } -func (mm *manager) AcquireMemory(ctx context.Context, amt uint64) *acquiredMemory { +func (mm *memoryManager) AcquireMemory(ctx context.Context, amt uint64) *acquiredMemory { if amt == 0 { mm.logger.Fatal("cannot acquire 0 memory") } else if mm.totalAvailable < amt { diff --git a/worker/upload.go b/worker/upload.go index ce905f105..41c736864 100644 --- a/worker/upload.go +++ b/worker/upload.go @@ -41,7 +41,7 @@ type ( b Bus hp hostProvider rl revisionLocker - mm memoryManager + mm MemoryManager logger *zap.SugaredLogger shutdownCtx context.Context @@ -55,29 +55,6 @@ type ( uploaders []*uploader } - uploader struct { - b Bus - - hk types.PublicKey - siamuxAddr string - signalNewUpload chan struct{} - shutdownCtx context.Context - - mu sync.Mutex - bh uint64 - endHeight uint64 - fcid types.FileContractID - host hostV3 - queue []*sectorUploadReq - - // stats related field - consecutiveFailures uint64 - lastRecompute time.Time - - statsSectorUploadEstimateInMS *stats.DataPoints - statsSectorUploadSpeedBytesPerMS *stats.DataPoints - } - uploadManagerStats struct { avgSlabUploadSpeedMBPS float64 avgOverdrivePct float64 @@ -101,21 +78,25 @@ type ( mem *acquiredMemory overdriveTimeout time.Duration - candidates []*uploader // sorted by upload estimate + sectors []*sectorUpload + candidates []*candidate // sorted by upload estimate shards [][]byte - mu sync.Mutex - numInflight uint64 - numLaunched uint64 - numUploaded uint64 - - overdriving map[int]map[types.PublicKey]struct{} - lastOverdrive time.Time + mu sync.Mutex + numInflight uint64 + numLaunched uint64 numOverdriving uint64 + numUploaded uint64 + + lastOverdrive time.Time + errs HostErrorSet + } + + candidate struct { + uploader *uploader - sectors map[int]*sectorUpload - used map[types.PublicKey]struct{} - errs HostErrorSet + used bool + overdriving int // sector index } slabUploadResponse struct { @@ -325,7 +306,7 @@ func (w *worker) uploadPackedSlab(ctx context.Context, ps api.PackedSlab, rs api return nil } -func newUploadManager(b Bus, hp hostProvider, rl revisionLocker, mm memoryManager, maxOverdrive uint64, overdriveTimeout time.Duration, shutdownCtx context.Context, logger *zap.SugaredLogger) *uploadManager { +func newUploadManager(b Bus, hp hostProvider, rl revisionLocker, mm MemoryManager, maxOverdrive uint64, overdriveTimeout time.Duration, shutdownCtx context.Context, logger *zap.SugaredLogger) *uploadManager { return &uploadManager{ b: b, hp: hp, @@ -444,7 +425,7 @@ func (mgr *uploadManager) Stats() uploadManagerStats { speeds := make(map[types.PublicKey]float64) for _, u := range mgr.uploaders { speeds[u.hk] = u.statsSectorUploadSpeedBytesPerMS.Average() * 0.008 - if u.healthy() { + if u.Healthy() { numHealthy++ } } @@ -748,19 +729,19 @@ func (mgr *uploadManager) refreshUploaders(contracts []api.ContractMetadata, bh // keep list of uploaders uploaders var uploaders []*uploader for _, uploader := range mgr.uploaders { - renewal, renewed := renewedTo[uploader.contractID()] - if _, keep := toKeep[uploader.contractID()]; !(keep || renewed) { + renewal, renewed := renewedTo[uploader.ContractID()] + if _, keep := toKeep[uploader.ContractID()]; !(keep || renewed) { uploader.Stop() continue } else if renewed { - uploader.renew(mgr.hp, renewal, bh) + uploader.Renew(mgr.hp, renewal, bh) } // delete current fcid from toKeep, by doing so it becomes a list of the // contracts we want to add - delete(toKeep, uploader.contractID()) + delete(toKeep, uploader.ContractID()) - uploader.updateBlockHeight(bh) + uploader.UpdateBlockHeight(bh) uploader.tryRecomputeStats() uploaders = append(uploaders, uploader) } @@ -774,232 +755,51 @@ func (mgr *uploadManager) refreshUploaders(contracts []api.ContractMetadata, bh mgr.uploaders = uploaders } -func (u *uploader) SignalWork() { - select { - case u.signalNewUpload <- struct{}{}: - default: - } -} - -func (u *uploader) Start(hp hostProvider, rl revisionLocker) { -outer: - for { - // wait for work - select { - case <-u.signalNewUpload: - case <-u.shutdownCtx.Done(): - return - } - - for { - // check if we are stopped - select { - case <-u.shutdownCtx.Done(): - return - default: - } - - // pop the next upload req - req := u.pop() - if req == nil { - continue outer - } - - // skip if upload is done - if req.done() { - continue - } - - // execute it - var root types.Hash256 - start := time.Now() - fcid := u.contractID() - err := rl.withRevision(req.sector.ctx, defaultRevisionFetchTimeout, fcid, u.hk, u.siamuxAddr, req.lockPriority, u.blockHeight(), func(rev types.FileContractRevision) error { - if rev.RevisionNumber == math.MaxUint64 { - return errMaxRevisionReached - } - - var err error - root, err = u.execute(req, rev) - return err - }) +func (u *upload) newSlabUpload(ctx context.Context, shards [][]byte, uploaders []*uploader, mem *acquiredMemory, maxOverdrive uint64, overdriveTimeout time.Duration) (*slabUpload, []*sectorUploadReq, chan sectorUploadResp) { + // prepare response channel + responseChan := make(chan sectorUploadResp) - // the uploader's contract got renewed, requeue the request - if errors.Is(err, errMaxRevisionReached) { - u.enqueue(req) - continue outer - } + // prepare sectors + sectors := make([]*sectorUpload, len(shards)) + for sI, shard := range shards { + // create the ctx + sCtx, sCancel := context.WithCancel(ctx) - // send the response - if err != nil { - req.fail(err) - } else { - req.succeed(root) - } + // attach the upload's span + sCtx, span := tracing.Tracer.Start(sCtx, "uploadSector") + span.SetAttributes(attribute.Bool("overdrive", false)) + span.SetAttributes(attribute.Int("sector", sI)) - // track the error, ignore gracefully closed streams and canceled overdrives - canceledOverdrive := req.done() && req.overdrive && err != nil - if !canceledOverdrive && !isClosedStream(err) { - u.trackSectorUpload(err, time.Since(start)) - } + // create the sector + sectors[sI] = §orUpload{ + data: (*[rhpv2.SectorSize]byte)(shard), + index: sI, + root: rhpv2.SectorRoot((*[rhpv2.SectorSize]byte)(shard)), + ctx: sCtx, + cancel: sCancel, } } -} -func (u *uploader) Stop() { - for { - upload := u.pop() - if upload == nil { - break - } - if !upload.done() { - upload.fail(errors.New("uploader stopped")) + // prepare requests + requests := make([]*sectorUploadReq, len(shards)) + for sI := range shards { + requests[sI] = §orUploadReq{ + lockPriority: u.lockPriority, + overdrive: false, + responseChan: responseChan, + sector: sectors[sI], + uploadID: u.id, } } -} - -func (u *uploader) blockHeight() uint64 { - u.mu.Lock() - defer u.mu.Unlock() - return u.bh -} - -func (u *uploader) contractID() types.FileContractID { - u.mu.Lock() - defer u.mu.Unlock() - return u.fcid -} - -func (u *uploader) enqueue(req *sectorUploadReq) { - // trace the request - span := trace.SpanFromContext(req.sector.ctx) - span.SetAttributes(attribute.Stringer("hk", u.hk)) - span.AddEvent("enqueued") - - // decorate the request - req.fcid = u.contractID() - req.hk = u.hk - - // enqueue the request - u.mu.Lock() - u.queue = append(u.queue, req) - u.mu.Unlock() - - // signal there's work - u.SignalWork() -} - -func (u *uploader) estimate() float64 { - u.mu.Lock() - defer u.mu.Unlock() - - // fetch estimated duration per sector - estimateP90 := u.statsSectorUploadEstimateInMS.P90() - if estimateP90 == 0 { - estimateP90 = 1 - } - - // calculate estimated time - numSectors := float64(len(u.queue) + 1) - return numSectors * estimateP90 -} - -func (u *uploader) execute(req *sectorUploadReq, rev types.FileContractRevision) (types.Hash256, error) { - u.mu.Lock() - host := u.host - fcid := u.fcid - u.mu.Unlock() - - // fetch span from context - span := trace.SpanFromContext(req.sector.ctx) - span.AddEvent("execute") - - // update the bus - if err := u.b.AddUploadingSector(req.sector.ctx, req.uploadID, fcid, req.sector.root); err != nil { - return types.Hash256{}, fmt.Errorf("failed to add uploading sector to contract %v, err: %v", fcid, err) - } - - // upload the sector - start := time.Now() - root, err := host.UploadSector(req.sector.ctx, req.sector.data, rev) - if err != nil { - return types.Hash256{}, err - } - - // update span - elapsed := time.Since(start) - span.SetAttributes(attribute.Int64("duration", elapsed.Milliseconds())) - span.RecordError(err) - span.End() - - return root, nil -} - -func (u *uploader) healthy() bool { - u.mu.Lock() - defer u.mu.Unlock() - return u.consecutiveFailures == 0 -} - -func (u *uploader) pop() *sectorUploadReq { - u.mu.Lock() - defer u.mu.Unlock() - - if len(u.queue) > 0 { - j := u.queue[0] - u.queue[0] = nil - u.queue = u.queue[1:] - return j - } - return nil -} - -func (u *uploader) renew(hp hostProvider, c api.ContractMetadata, bh uint64) { - u.mu.Lock() - defer u.mu.Unlock() - - u.bh = bh - u.host = hp.newHostV3(c.ID, c.HostKey, c.SiamuxAddr) - u.fcid = c.ID - u.siamuxAddr = c.SiamuxAddr - u.endHeight = c.WindowEnd -} - -func (u *uploader) trackSectorUpload(err error, d time.Duration) { - u.mu.Lock() - defer u.mu.Unlock() - if err != nil { - u.consecutiveFailures++ - u.statsSectorUploadEstimateInMS.Track(float64(time.Hour.Milliseconds())) - } else { - ms := d.Milliseconds() - u.consecutiveFailures = 0 - u.statsSectorUploadEstimateInMS.Track(float64(ms)) // duration in ms - u.statsSectorUploadSpeedBytesPerMS.Track(float64(rhpv2.SectorSize / ms)) // bytes per ms - } -} -func (u *uploader) tryRecomputeStats() { - u.mu.Lock() - defer u.mu.Unlock() - if time.Since(u.lastRecompute) < statsRecomputeMinInterval { - return + // prepare candidates + candidates := make([]*candidate, len(uploaders)) + for i, uploader := range uploaders { + candidates[i] = &candidate{uploader: uploader, used: false, overdriving: -1} } - u.lastRecompute = time.Now() - u.statsSectorUploadEstimateInMS.Recompute() - u.statsSectorUploadSpeedBytesPerMS.Recompute() -} - -func (u *uploader) updateBlockHeight(bh uint64) { - u.mu.Lock() - defer u.mu.Unlock() - u.bh = bh -} - -func (u *upload) newSlabUpload(ctx context.Context, shards [][]byte, candidates []*uploader, mem *acquiredMemory, maxOverdrive uint64, overdriveTimeout time.Duration) (*slabUpload, []*sectorUploadReq, chan sectorUploadResp) { // create slab upload - slab := &slabUpload{ + return &slabUpload{ lockPriority: u.lockPriority, uploadID: u.id, created: time.Now(), @@ -1007,49 +807,12 @@ func (u *upload) newSlabUpload(ctx context.Context, shards [][]byte, candidates mem: mem, overdriveTimeout: overdriveTimeout, + sectors: sectors, candidates: candidates, shards: shards, - overdriving: make(map[int]map[types.PublicKey]struct{}), - sectors: make(map[int]*sectorUpload, len(shards)), - used: make(map[types.PublicKey]struct{}), - errs: make(HostErrorSet), - } - - // prepare sector uploads - responseChan := make(chan sectorUploadResp) - requests := make([]*sectorUploadReq, len(shards)) - for sI, shard := range shards { - // create the ctx - sCtx, sCancel := context.WithCancel(ctx) - - // attach the upload's span - sCtx, span := tracing.Tracer.Start(sCtx, "uploadSector") - span.SetAttributes(attribute.Bool("overdrive", false)) - span.SetAttributes(attribute.Int("sector", sI)) - - // create the sector - sector := §orUpload{ - data: (*[rhpv2.SectorSize]byte)(shard), - index: sI, - root: rhpv2.SectorRoot((*[rhpv2.SectorSize]byte)(shard)), - - ctx: sCtx, - cancel: sCancel, - } - slab.sectors[sI] = sector - - // create the request - requests[sI] = §orUploadReq{ - lockPriority: slab.lockPriority, - overdrive: false, - responseChan: responseChan, - sector: sector, - uploadID: slab.uploadID, - } - } - - return slab, requests, responseChan + errs: make(HostErrorSet), + }, requests, responseChan } func (u *upload) uploadSlab(ctx context.Context, rs api.RedundancySettings, data []byte, length, index int, respChan chan slabUploadResponse, candidates []*uploader, mem *acquiredMemory, maxOverdrive uint64, overdriveTimeout time.Duration) (overdrivePct float64, overdriveSpeed int64) { @@ -1103,33 +866,54 @@ func (u *upload) uploadShards(ctx context.Context, shards [][]byte, candidates [ } } - // launch overdrive - resetOverdrive := slab.overdrive(ctx, respChan) + // create an overdrive timer + if overdriveTimeout == 0 { + overdriveTimeout = time.Duration(math.MaxInt64) + } + timer := time.NewTimer(overdriveTimeout) // collect responses var done bool +loop: for slab.inflight() > 0 && !done { - var resp sectorUploadResp select { case <-u.shutdownCtx.Done(): return nil, 0, 0, errors.New("upload stopped") case <-ctx.Done(): return nil, 0, 0, ctx.Err() - case resp = <-respChan: - } - - resetOverdrive() + case resp := <-respChan: + // receive the response + done = slab.receive(resp) + + // relaunch non-overdrive uploads + if !done && resp.err != nil && !resp.req.overdrive { + if overdriving, err := slab.launch(resp.req); err != nil { + if !overdriving { + break loop // fail the upload + } + } + } - // receive the response - done = slab.receive(resp) + // try overdriving a sector + if slab.canOverdrive() { + _, _ = slab.launch(slab.nextRequest(respChan)) // ignore result + } + case <-timer.C: + // try overdriving a sector + if slab.canOverdrive() { + _, _ = slab.launch(slab.nextRequest(respChan)) // ignore result + } + } - // relaunch non-overdrive uploads - if !done && resp.err != nil && !resp.req.overdrive { - if overdriving, err := slab.launch(resp.req); err != nil { - if !overdriving { - break // fail the upload + // reset the overdrive timer + if overdriveTimeout != math.MaxInt64 { + if !timer.Stop() { + select { + case <-timer.C: + default: } } + timer.Reset(overdriveTimeout) } } @@ -1140,12 +924,27 @@ func (u *upload) uploadShards(ctx context.Context, shards [][]byte, candidates [ return sectors, slab.overdrivePct(), slab.uploadSpeed(), err } -func (s *slabUpload) uploadSpeed() int64 { +func (s *slabUpload) canOverdrive() bool { s.mu.Lock() defer s.mu.Unlock() - bytes := s.numUploaded * rhpv2.SectorSize - ms := time.Since(s.created).Milliseconds() - return int64(bytes) / ms + + // overdrive is not kicking in yet + remaining := uint64(len(s.shards)) - s.numUploaded + if remaining >= s.maxOverdrive { + return false + } + + // overdrive is not due yet + if time.Since(s.lastOverdrive) < s.overdriveTimeout { + return false + } + + // overdrive is maxed out + if s.numInflight-remaining >= s.maxOverdrive { + return false + } + + return true } func (s *slabUpload) finish() (sectors []object.Sector, _ error) { @@ -1169,6 +968,15 @@ func (s *slabUpload) inflight() uint64 { return s.numInflight } +func (s *slabUpload) ongoingOverdrive(sI int) bool { + for _, candidate := range s.candidates { + if candidate.used && candidate.overdriving == sI { + return true + } + } + return false +} + func (s *slabUpload) launch(req *sectorUploadReq) (interrupt bool, err error) { s.mu.Lock() defer s.mu.Unlock() @@ -1178,102 +986,40 @@ func (s *slabUpload) launch(req *sectorUploadReq) (interrupt bool, err error) { return false, nil } - // find next candidate - var candidate *uploader - for _, uploader := range s.candidates { - if _, used := s.used[uploader.hk]; used { + // find candidate + var candidate *candidate + for _, c := range s.candidates { + if c.used { continue } - candidate = uploader + candidate = c break } // no candidate found if candidate == nil { err = errNoCandidateUploader - interrupt = !req.overdrive && len(s.overdriving[req.sector.index]) == 0 + interrupt = !req.overdrive && !s.ongoingOverdrive(req.sector.index) span := trace.SpanFromContext(req.sector.ctx) span.RecordError(err) span.End() return } - // enqueue the req - candidate.enqueue(req) - - // update the state - s.numInflight++ - s.numLaunched++ - s.used[req.hk] = struct{}{} - + // update the candidate + candidate.used = true if req.overdrive { + candidate.overdriving = req.sector.index s.lastOverdrive = time.Now() s.numOverdriving++ - - if _, exists := s.overdriving[req.sector.index]; !exists { - s.overdriving[req.sector.index] = make(map[types.PublicKey]struct{}) - } - s.overdriving[req.sector.index][req.hk] = struct{}{} - } - return -} - -func (s *slabUpload) overdrive(ctx context.Context, respChan chan sectorUploadResp) (resetTimer func()) { - // overdrive is disabled - if s.overdriveTimeout == 0 { - return func() {} - } - - // create a timer to trigger overdrive - timer := time.NewTimer(s.overdriveTimeout) - resetTimer = func() { - timer.Stop() - select { - case <-timer.C: - default: - } - timer.Reset(s.overdriveTimeout) - } - - // create a function to check whether overdrive is possible - canOverdrive := func() bool { - s.mu.Lock() - defer s.mu.Unlock() - - // overdrive is not kicking in yet - remaining := uint64(len(s.shards)) - s.numUploaded - if remaining >= s.maxOverdrive { - return false - } - - // overdrive is not due yet - if time.Since(s.lastOverdrive) < s.overdriveTimeout { - return false - } - - // overdrive is maxed out - if s.numInflight-remaining >= s.maxOverdrive { - return false - } - - return true } - // try overdriving every time the timer fires - go func() { - for { - select { - case <-ctx.Done(): - return - case <-timer.C: - if canOverdrive() { - _, _ = s.launch(s.nextRequest(respChan)) // ignore result - } - resetTimer() - } - } - }() + // update the state + s.numInflight++ + s.numLaunched++ + // enqueue the req + candidate.uploader.enqueue(req) return } @@ -1281,11 +1027,20 @@ func (s *slabUpload) nextRequest(responseChan chan sectorUploadResp) *sectorUplo s.mu.Lock() defer s.mu.Unlock() - // find the sector that's not finished and has the least amount of overdrives + // count overdrives + overdriveCnts := make(map[int]int) + for _, c := range s.candidates { + if c.used && c.overdriving != -1 { + overdriveCnts[c.overdriving]++ + } + } + + // overdrive the sector with the least amount of overdrives lowestNumOverdrives := math.MaxInt var nextSector *sectorUpload - for _, sector := range s.sectors { - if !sector.isUploaded() && len(s.overdriving[sector.index]) < lowestNumOverdrives { + for sI, sector := range s.sectors { + if !sector.isUploaded() && overdriveCnts[sI] < lowestNumOverdrives { + lowestNumOverdrives = overdriveCnts[sI] nextSector = sector } } @@ -1358,9 +1113,12 @@ func (s *slabUpload) receive(resp sectorUploadResp) bool { // cancel the sector context sector.cancel() - // free hosts we're using to overdrive this sector - for hk := range s.overdriving[req.sector.index] { - delete(s.used, hk) + // release hosts that are overdriving this sector + for _, candidate := range s.candidates { + if candidate.overdriving == sector.index { + candidate.overdriving = -1 + candidate.used = false + } } // release memory @@ -1371,6 +1129,14 @@ func (s *slabUpload) receive(resp sectorUploadResp) bool { return s.numUploaded == uint64(len(s.shards)) } +func (s *slabUpload) uploadSpeed() int64 { + s.mu.Lock() + defer s.mu.Unlock() + bytes := s.numUploaded * rhpv2.SectorSize + ms := time.Since(s.created).Milliseconds() + return int64(bytes) / ms +} + func (s *sectorUpload) isUploaded() bool { return s.uploaded.Root != (types.Hash256{}) } diff --git a/worker/upload_params.go b/worker/upload_params.go index 8e033b287..c4fae96a0 100644 --- a/worker/upload_params.go +++ b/worker/upload_params.go @@ -1,12 +1,6 @@ package worker import ( - "bytes" - "encoding/hex" - "io" - - "github.com/gabriel-vasile/mimetype" - "go.sia.tech/core/types" "go.sia.tech/renterd/api" "go.sia.tech/renterd/build" "go.sia.tech/renterd/object" @@ -101,35 +95,3 @@ func WithRedundancySettings(rs api.RedundancySettings) UploadOption { up.rs = rs } } - -func newMimeReader(r io.Reader) (mimeType string, recycled io.Reader, err error) { - buf := bytes.NewBuffer(nil) - mtype, err := mimetype.DetectReader(io.TeeReader(r, buf)) - recycled = io.MultiReader(buf, r) - return mtype.String(), recycled, err -} - -type hashReader struct { - r io.Reader - h *types.Hasher -} - -func newHashReader(r io.Reader) *hashReader { - return &hashReader{ - r: r, - h: types.NewHasher(), - } -} - -func (e *hashReader) Read(p []byte) (int, error) { - n, err := e.r.Read(p) - if _, wErr := e.h.E.Write(p[:n]); wErr != nil { - return 0, wErr - } - return n, err -} - -func (e *hashReader) Hash() string { - sum := e.h.Sum() - return hex.EncodeToString(sum[:]) -} diff --git a/worker/upload_utils.go b/worker/upload_utils.go new file mode 100644 index 000000000..4b5241b4d --- /dev/null +++ b/worker/upload_utils.go @@ -0,0 +1,55 @@ +package worker + +import ( + "bytes" + "encoding/hex" + "io" + + "github.com/gabriel-vasile/mimetype" + "go.sia.tech/core/types" + "go.sia.tech/renterd/object" +) + +func encryptPartialSlab(data []byte, key object.EncryptionKey, minShards, totalShards uint8) [][]byte { + slab := object.Slab{ + Key: key, + MinShards: minShards, + Shards: make([]object.Sector, totalShards), + } + encodedShards := make([][]byte, totalShards) + slab.Encode(data, encodedShards) + slab.Encrypt(encodedShards) + return encodedShards +} + +func newMimeReader(r io.Reader) (mimeType string, recycled io.Reader, err error) { + buf := bytes.NewBuffer(nil) + mtype, err := mimetype.DetectReader(io.TeeReader(r, buf)) + recycled = io.MultiReader(buf, r) + return mtype.String(), recycled, err +} + +type hashReader struct { + r io.Reader + h *types.Hasher +} + +func newHashReader(r io.Reader) *hashReader { + return &hashReader{ + r: r, + h: types.NewHasher(), + } +} + +func (e *hashReader) Read(p []byte) (int, error) { + n, err := e.r.Read(p) + if _, wErr := e.h.E.Write(p[:n]); wErr != nil { + return 0, wErr + } + return n, err +} + +func (e *hashReader) Hash() string { + sum := e.h.Sum() + return hex.EncodeToString(sum[:]) +} diff --git a/worker/uploader.go b/worker/uploader.go new file mode 100644 index 000000000..47655c2e4 --- /dev/null +++ b/worker/uploader.go @@ -0,0 +1,265 @@ +package worker + +import ( + "context" + "errors" + "fmt" + "math" + "sync" + "time" + + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/trace" + rhpv2 "go.sia.tech/core/rhp/v2" + "go.sia.tech/core/types" + "go.sia.tech/renterd/api" + "go.sia.tech/renterd/stats" +) + +type ( + uploader struct { + b Bus + + hk types.PublicKey + siamuxAddr string + signalNewUpload chan struct{} + shutdownCtx context.Context + + mu sync.Mutex + bh uint64 + endHeight uint64 + fcid types.FileContractID + host hostV3 + queue []*sectorUploadReq + + // stats related field + consecutiveFailures uint64 + lastRecompute time.Time + + statsSectorUploadEstimateInMS *stats.DataPoints + statsSectorUploadSpeedBytesPerMS *stats.DataPoints + } +) + +func (u *uploader) BlockHeight() uint64 { + u.mu.Lock() + defer u.mu.Unlock() + return u.bh +} + +func (u *uploader) ContractID() types.FileContractID { + u.mu.Lock() + defer u.mu.Unlock() + return u.fcid +} + +func (u *uploader) Healthy() bool { + u.mu.Lock() + defer u.mu.Unlock() + return u.consecutiveFailures == 0 +} + +func (u *uploader) Renew(hp hostProvider, c api.ContractMetadata, bh uint64) { + u.mu.Lock() + defer u.mu.Unlock() + + u.bh = bh + u.host = hp.newHostV3(c.ID, c.HostKey, c.SiamuxAddr) + u.fcid = c.ID + u.siamuxAddr = c.SiamuxAddr + u.endHeight = c.WindowEnd +} + +func (u *uploader) SignalWork() { + select { + case u.signalNewUpload <- struct{}{}: + default: + } +} + +func (u *uploader) Start(hp hostProvider, rl revisionLocker) { +outer: + for { + // wait for work + select { + case <-u.signalNewUpload: + case <-u.shutdownCtx.Done(): + return + } + + for { + // check if we are stopped + select { + case <-u.shutdownCtx.Done(): + return + default: + } + + // pop the next upload req + req := u.pop() + if req == nil { + continue outer + } + + // skip if upload is done + if req.done() { + continue + } + + // execute it + var root types.Hash256 + start := time.Now() + fcid := u.ContractID() + err := rl.withRevision(req.sector.ctx, defaultRevisionFetchTimeout, fcid, u.hk, u.siamuxAddr, req.lockPriority, u.BlockHeight(), func(rev types.FileContractRevision) error { + if rev.RevisionNumber == math.MaxUint64 { + return errMaxRevisionReached + } + + var err error + root, err = u.execute(req, rev) + return err + }) + + // the uploader's contract got renewed, requeue the request + if errors.Is(err, errMaxRevisionReached) { + u.enqueue(req) + continue outer + } + + // send the response + if err != nil { + req.fail(err) + } else { + req.succeed(root) + } + + // track the error, ignore gracefully closed streams and canceled overdrives + canceledOverdrive := req.done() && req.overdrive && err != nil + if !canceledOverdrive && !isClosedStream(err) { + u.trackSectorUpload(err, time.Since(start)) + } + } + } +} + +func (u *uploader) Stop() { + for { + upload := u.pop() + if upload == nil { + break + } + if !upload.done() { + upload.fail(errors.New("uploader stopped")) + } + } +} + +func (u *uploader) UpdateBlockHeight(bh uint64) { + u.mu.Lock() + defer u.mu.Unlock() + u.bh = bh +} + +func (u *uploader) enqueue(req *sectorUploadReq) { + // trace the request + span := trace.SpanFromContext(req.sector.ctx) + span.SetAttributes(attribute.Stringer("hk", u.hk)) + span.AddEvent("enqueued") + + // decorate the request + req.fcid = u.ContractID() + req.hk = u.hk + + // enqueue the request + u.mu.Lock() + u.queue = append(u.queue, req) + u.mu.Unlock() + + // signal there's work + u.SignalWork() +} + +func (u *uploader) estimate() float64 { + u.mu.Lock() + defer u.mu.Unlock() + + // fetch estimated duration per sector + estimateP90 := u.statsSectorUploadEstimateInMS.P90() + if estimateP90 == 0 { + estimateP90 = 1 + } + + // calculate estimated time + numSectors := float64(len(u.queue) + 1) + return numSectors * estimateP90 +} + +func (u *uploader) execute(req *sectorUploadReq, rev types.FileContractRevision) (types.Hash256, error) { + u.mu.Lock() + host := u.host + fcid := u.fcid + u.mu.Unlock() + + // fetch span from context + span := trace.SpanFromContext(req.sector.ctx) + span.AddEvent("execute") + + // update the bus + if err := u.b.AddUploadingSector(req.sector.ctx, req.uploadID, fcid, req.sector.root); err != nil { + return types.Hash256{}, fmt.Errorf("failed to add uploading sector to contract %v, err: %v", fcid, err) + } + + // upload the sector + start := time.Now() + root, err := host.UploadSector(req.sector.ctx, req.sector.data, rev) + if err != nil { + return types.Hash256{}, err + } + + // update span + elapsed := time.Since(start) + span.SetAttributes(attribute.Int64("duration", elapsed.Milliseconds())) + span.RecordError(err) + span.End() + + return root, nil +} + +func (u *uploader) pop() *sectorUploadReq { + u.mu.Lock() + defer u.mu.Unlock() + + if len(u.queue) > 0 { + j := u.queue[0] + u.queue[0] = nil + u.queue = u.queue[1:] + return j + } + return nil +} + +func (u *uploader) trackSectorUpload(err error, d time.Duration) { + u.mu.Lock() + defer u.mu.Unlock() + if err != nil { + u.consecutiveFailures++ + u.statsSectorUploadEstimateInMS.Track(float64(time.Hour.Milliseconds())) + } else { + ms := d.Milliseconds() + u.consecutiveFailures = 0 + u.statsSectorUploadEstimateInMS.Track(float64(ms)) // duration in ms + u.statsSectorUploadSpeedBytesPerMS.Track(float64(rhpv2.SectorSize / ms)) // bytes per ms + } +} + +func (u *uploader) tryRecomputeStats() { + u.mu.Lock() + defer u.mu.Unlock() + if time.Since(u.lastRecompute) < statsRecomputeMinInterval { + return + } + + u.lastRecompute = time.Now() + u.statsSectorUploadEstimateInMS.Recompute() + u.statsSectorUploadSpeedBytesPerMS.Recompute() +} diff --git a/worker/worker.go b/worker/worker.go index 2afc34fe6..8a547926c 100644 --- a/worker/worker.go +++ b/worker/worker.go @@ -107,12 +107,6 @@ type ( } ) -type ContractLocker interface { - AcquireContract(ctx context.Context, fcid types.FileContractID, priority int, d time.Duration) (lockID uint64, err error) - KeepaliveContract(ctx context.Context, fcid types.FileContractID, lockID uint64, d time.Duration) (err error) - ReleaseContract(ctx context.Context, fcid types.FileContractID, lockID uint64) (err error) -} - // A Bus is the source of truth within a renterd system. type Bus interface { alerts.Alerter @@ -313,20 +307,6 @@ func (w *worker) newHostV3(contractID types.FileContractID, hostKey types.Public } } -func (w *worker) withContractLock(ctx context.Context, fcid types.FileContractID, priority int, fn func() error) error { - contractLock, err := w.acquireContractLock(ctx, fcid, priority) - if err != nil { - return err - } - defer func() { - releaseCtx, cancel := context.WithTimeout(context.Background(), 10*time.Second) - _ = contractLock.Release(releaseCtx) - cancel() - }() - - return fn() -} - func (w *worker) withRevision(ctx context.Context, fetchTimeout time.Duration, contractID types.FileContractID, hk types.PublicKey, siamuxAddr string, lockPriority int, blockHeight uint64, fn func(rev types.FileContractRevision) error) error { return w.withContractLock(ctx, contractID, lockPriority, func() error { h := w.newHostV3(contractID, hk, siamuxAddr) @@ -1283,18 +1263,6 @@ func (w *worker) multipartUploadHandlerPUT(jc jape.Context) { jc.ResponseWriter.Header().Set("ETag", api.FormatETag(eTag)) } -func encryptPartialSlab(data []byte, key object.EncryptionKey, minShards, totalShards uint8) [][]byte { - slab := object.Slab{ - Key: key, - MinShards: minShards, - Shards: make([]object.Sector, totalShards), - } - encodedShards := make([][]byte, totalShards) - slab.Encode(data, encodedShards) - slab.Encrypt(encodedShards) - return encodedShards -} - func (w *worker) objectsHandlerDELETE(jc jape.Context) { var batch bool if jc.DecodeForm("batch", &batch) != nil { @@ -1344,10 +1312,6 @@ func (w *worker) rhpContractsHandlerGET(jc jape.Context) { jc.Encode(resp) } -func preparePayment(accountKey types.PrivateKey, amt types.Currency, blockHeight uint64) rhpv3.PayByEphemeralAccountRequest { - return rhpv3.PayByEphemeralAccount(rhpv3.Account(accountKey.PublicKey()), amt, blockHeight+6, accountKey) // 1 hour valid -} - func (w *worker) idHandlerGET(jc jape.Context) { jc.Encode(w.id) } @@ -1482,91 +1446,6 @@ func (w *worker) Shutdown(_ context.Context) error { return nil } -type contractLock struct { - lockID uint64 - fcid types.FileContractID - d time.Duration - locker ContractLocker - logger *zap.SugaredLogger - - stopCtx context.Context - stopCtxCancel context.CancelFunc - stopWG sync.WaitGroup -} - -func newContractLock(fcid types.FileContractID, lockID uint64, d time.Duration, locker ContractLocker, logger *zap.SugaredLogger) *contractLock { - ctx, cancel := context.WithCancel(context.Background()) - cl := &contractLock{ - lockID: lockID, - fcid: fcid, - d: d, - locker: locker, - logger: logger, - - stopCtx: ctx, - stopCtxCancel: cancel, - } - cl.stopWG.Add(1) - go func() { - cl.keepaliveLoop() - cl.stopWG.Done() - }() - return cl -} - -func (cl *contractLock) Release(ctx context.Context) error { - // Stop background loop. - cl.stopCtxCancel() - cl.stopWG.Wait() - - // Release the contract. - return cl.locker.ReleaseContract(ctx, cl.fcid, cl.lockID) -} - -func (cl *contractLock) keepaliveLoop() { - // Create ticker for 20% of the lock duration. - start := time.Now() - var lastUpdate time.Time - tickDuration := cl.d / 5 - t := time.NewTicker(tickDuration) - - // Cleanup - defer func() { - t.Stop() - select { - case <-t.C: - default: - } - }() - - // Loop until stopped. - for { - select { - case <-cl.stopCtx.Done(): - return // released - case <-t.C: - } - if err := cl.locker.KeepaliveContract(cl.stopCtx, cl.fcid, cl.lockID, cl.d); err != nil && !errors.Is(err, context.Canceled) { - cl.logger.Errorw(fmt.Sprintf("failed to send keepalive: %v", err), - "contract", cl.fcid, - "lockID", cl.lockID, - "loopStart", start, - "timeSinceLastUpdate", time.Since(lastUpdate), - "tickDuration", tickDuration) - return - } - lastUpdate = time.Now() - } -} - -func (w *worker) acquireContractLock(ctx context.Context, fcid types.FileContractID, priority int) (_ revisionUnlocker, err error) { - lockID, err := w.bus.AcquireContract(ctx, fcid, priority, w.contractLockingDuration) - if err != nil { - return nil, err - } - return newContractLock(fcid, lockID, w.contractLockingDuration, w.bus, w.logger), nil -} - func (w *worker) scanHost(ctx context.Context, hostKey types.PublicKey, hostIP string) (settings rhpv2.HostSettings, pt rhpv3.HostPriceTable, elapsed time.Duration, err error) { // record host scan defer func() { From 7bb7e72468baf28701d6f1026afd6ea61f936c8d Mon Sep 17 00:00:00 2001 From: PJ Date: Fri, 8 Dec 2023 13:27:33 +0100 Subject: [PATCH 19/25] worker: cleanup upload types more --- worker/upload.go | 126 +++++++++++++++++------------------------------ 1 file changed, 44 insertions(+), 82 deletions(-) diff --git a/worker/upload.go b/worker/upload.go index 41c736864..c3729d9c2 100644 --- a/worker/upload.go +++ b/worker/upload.go @@ -71,25 +71,23 @@ type ( } slabUpload struct { - uploadID api.UploadID - created time.Time - lockPriority int - maxOverdrive uint64 - mem *acquiredMemory - overdriveTimeout time.Duration + uploadID api.UploadID + lockPriority int + + maxOverdrive uint64 + lastOverdrive time.Time sectors []*sectorUpload candidates []*candidate // sorted by upload estimate - shards [][]byte - mu sync.Mutex - numInflight uint64 numLaunched uint64 + numInflight uint64 numOverdriving uint64 numUploaded uint64 + numSectors uint64 - lastOverdrive time.Time - errs HostErrorSet + mem *acquiredMemory + errs HostErrorSet } candidate struct { @@ -755,7 +753,7 @@ func (mgr *uploadManager) refreshUploaders(contracts []api.ContractMetadata, bh mgr.uploaders = uploaders } -func (u *upload) newSlabUpload(ctx context.Context, shards [][]byte, uploaders []*uploader, mem *acquiredMemory, maxOverdrive uint64, overdriveTimeout time.Duration) (*slabUpload, []*sectorUploadReq, chan sectorUploadResp) { +func (u *upload) newSlabUpload(ctx context.Context, shards [][]byte, uploaders []*uploader, mem *acquiredMemory, maxOverdrive uint64) (*slabUpload, []*sectorUploadReq, chan sectorUploadResp) { // prepare response channel responseChan := make(chan sectorUploadResp) @@ -800,16 +798,14 @@ func (u *upload) newSlabUpload(ctx context.Context, shards [][]byte, uploaders [ // create slab upload return &slabUpload{ - lockPriority: u.lockPriority, - uploadID: u.id, - created: time.Now(), - maxOverdrive: maxOverdrive, - mem: mem, - overdriveTimeout: overdriveTimeout, + lockPriority: u.lockPriority, + uploadID: u.id, + maxOverdrive: maxOverdrive, + mem: mem, sectors: sectors, candidates: candidates, - shards: shards, + numSectors: uint64(len(shards)), errs: make(HostErrorSet), }, requests, responseChan @@ -848,6 +844,8 @@ func (u *upload) uploadSlab(ctx context.Context, rs api.RedundancySettings, data } func (u *upload) uploadShards(ctx context.Context, shards [][]byte, candidates []*uploader, mem *acquiredMemory, maxOverdrive uint64, overdriveTimeout time.Duration) ([]object.Sector, float64, int64, error) { + start := time.Now() + // add tracing ctx, span := tracing.Tracer.Start(ctx, "uploadShards") defer span.End() @@ -857,7 +855,7 @@ func (u *upload) uploadShards(ctx context.Context, shards [][]byte, candidates [ defer cancel() // prepare the upload - slab, requests, respChan := u.newSlabUpload(ctx, shards, candidates, mem, maxOverdrive, overdriveTimeout) + slab, requests, respChan := u.newSlabUpload(ctx, shards, candidates, mem, maxOverdrive) // launch all shard uploads for _, upload := range requests { @@ -875,7 +873,7 @@ func (u *upload) uploadShards(ctx context.Context, shards [][]byte, candidates [ // collect responses var done bool loop: - for slab.inflight() > 0 && !done { + for slab.numInflight > 0 && !done { select { case <-u.shutdownCtx.Done(): return nil, 0, 0, errors.New("upload stopped") @@ -895,12 +893,12 @@ loop: } // try overdriving a sector - if slab.canOverdrive() { + if slab.canOverdrive(overdriveTimeout) { _, _ = slab.launch(slab.nextRequest(respChan)) // ignore result } case <-timer.C: // try overdriving a sector - if slab.canOverdrive() { + if slab.canOverdrive(overdriveTimeout) { _, _ = slab.launch(slab.nextRequest(respChan)) // ignore result } } @@ -917,25 +915,34 @@ loop: } } + // calculate the upload speed + bytes := slab.numUploaded * rhpv2.SectorSize + ms := time.Since(start).Milliseconds() + speed := int64(bytes) / ms + + // calculate overdrive pct + var numOverdrive uint64 + if slab.numLaunched > slab.numSectors { + numOverdrive = slab.numLaunched - slab.numSectors + } + overdrivePct := float64(numOverdrive) / float64(slab.numSectors) + // register the amount of overdrive sectors - span.SetAttributes(attribute.Int("overdrive", slab.overdriveCnt())) + span.SetAttributes(attribute.Int("overdrive", int(numOverdrive))) sectors, err := slab.finish() - return sectors, slab.overdrivePct(), slab.uploadSpeed(), err + return sectors, overdrivePct, speed, err } -func (s *slabUpload) canOverdrive() bool { - s.mu.Lock() - defer s.mu.Unlock() - +func (s *slabUpload) canOverdrive(overdriveTimeout time.Duration) bool { // overdrive is not kicking in yet - remaining := uint64(len(s.shards)) - s.numUploaded + remaining := s.numSectors - s.numUploaded if remaining >= s.maxOverdrive { return false } // overdrive is not due yet - if time.Since(s.lastOverdrive) < s.overdriveTimeout { + if time.Since(s.lastOverdrive) < overdriveTimeout { return false } @@ -948,26 +955,17 @@ func (s *slabUpload) canOverdrive() bool { } func (s *slabUpload) finish() (sectors []object.Sector, _ error) { - s.mu.Lock() - defer s.mu.Unlock() - - if s.numUploaded < uint64(len(s.shards)) { - remaining := uint64(len(s.shards)) - s.numUploaded + if s.numUploaded < s.numSectors { + remaining := s.numSectors - s.numUploaded return nil, fmt.Errorf("failed to upload slab: launched=%d uploaded=%d remaining=%d inflight=%d uploaders=%d errors=%d %w", s.numLaunched, s.numUploaded, remaining, s.numInflight, len(s.candidates), len(s.errs), s.errs) } - for i := 0; i < len(s.shards); i++ { - sectors = append(sectors, s.sectors[i].uploaded) + for _, sector := range s.sectors { + sectors = append(sectors, sector.uploaded) } return } -func (s *slabUpload) inflight() uint64 { - s.mu.Lock() - defer s.mu.Unlock() - return s.numInflight -} - func (s *slabUpload) ongoingOverdrive(sI int) bool { for _, candidate := range s.candidates { if candidate.used && candidate.overdriving == sI { @@ -978,9 +976,6 @@ func (s *slabUpload) ongoingOverdrive(sI int) bool { } func (s *slabUpload) launch(req *sectorUploadReq) (interrupt bool, err error) { - s.mu.Lock() - defer s.mu.Unlock() - // nothing to do if req == nil { return false, nil @@ -1024,9 +1019,6 @@ func (s *slabUpload) launch(req *sectorUploadReq) (interrupt bool, err error) { } func (s *slabUpload) nextRequest(responseChan chan sectorUploadResp) *sectorUploadReq { - s.mu.Lock() - defer s.mu.Unlock() - // count overdrives overdriveCnts := make(map[int]int) for _, c := range s.candidates { @@ -1057,28 +1049,7 @@ func (s *slabUpload) nextRequest(responseChan chan sectorUploadResp) *sectorUplo } } -func (s *slabUpload) overdriveCnt() int { - s.mu.Lock() - defer s.mu.Unlock() - return int(s.numLaunched) - len(s.sectors) -} - -func (s *slabUpload) overdrivePct() float64 { - s.mu.Lock() - defer s.mu.Unlock() - - numOverdrive := int(s.numLaunched) - len(s.sectors) - if numOverdrive <= 0 { - return 0 - } - - return float64(numOverdrive) / float64(len(s.sectors)) -} - func (s *slabUpload) receive(resp sectorUploadResp) bool { - s.mu.Lock() - defer s.mu.Unlock() - // convenience variable req := resp.req sector := req.sector @@ -1110,7 +1081,7 @@ func (s *slabUpload) receive(resp sectorUploadResp) bool { // update uploaded sectors s.numUploaded++ - // cancel the sector context + // cancel the sector's context sector.cancel() // release hosts that are overdriving this sector @@ -1123,18 +1094,9 @@ func (s *slabUpload) receive(resp sectorUploadResp) bool { // release memory sector.data = nil - s.shards[sector.index] = nil s.mem.ReleaseSome(rhpv2.SectorSize) - return s.numUploaded == uint64(len(s.shards)) -} - -func (s *slabUpload) uploadSpeed() int64 { - s.mu.Lock() - defer s.mu.Unlock() - bytes := s.numUploaded * rhpv2.SectorSize - ms := time.Since(s.created).Milliseconds() - return int64(bytes) / ms + return s.numUploaded == s.numSectors } func (s *sectorUpload) isUploaded() bool { From 89370cecd0b70f5079021d157c62a9654306538b Mon Sep 17 00:00:00 2001 From: PJ Date: Fri, 8 Dec 2023 13:36:15 +0100 Subject: [PATCH 20/25] worker: clean up the upload code --- worker/upload.go | 47 +++++++++++++++++++++++++++-------------------- 1 file changed, 27 insertions(+), 20 deletions(-) diff --git a/worker/upload.go b/worker/upload.go index c3729d9c2..1581dee32 100644 --- a/worker/upload.go +++ b/worker/upload.go @@ -55,6 +55,7 @@ type ( uploaders []*uploader } + // TODO: should become a metric uploadManagerStats struct { avgSlabUploadSpeedMBPS float64 avgOverdrivePct float64 @@ -72,6 +73,7 @@ type ( slabUpload struct { uploadID api.UploadID + mem *acquiredMemory lockPriority int maxOverdrive uint64 @@ -86,7 +88,6 @@ type ( numUploaded uint64 numSectors uint64 - mem *acquiredMemory errs HostErrorSet } @@ -104,8 +105,8 @@ type ( } sectorUpload struct { - data *[rhpv2.SectorSize]byte index int + data *[rhpv2.SectorSize]byte root types.Hash256 uploaded object.Sector @@ -114,11 +115,11 @@ type ( } sectorUploadReq struct { + uploadID api.UploadID + sector *sectorUpload lockPriority int overdrive bool responseChan chan sectorUploadResp - sector *sectorUpload - uploadID api.UploadID // set by the uploader performing the upload fcid types.FileContractID @@ -753,7 +754,7 @@ func (mgr *uploadManager) refreshUploaders(contracts []api.ContractMetadata, bh mgr.uploaders = uploaders } -func (u *upload) newSlabUpload(ctx context.Context, shards [][]byte, uploaders []*uploader, mem *acquiredMemory, maxOverdrive uint64) (*slabUpload, []*sectorUploadReq, chan sectorUploadResp) { +func (u *upload) newSlabUpload(ctx context.Context, shards [][]byte, uploaders []*uploader, mem *acquiredMemory, maxOverdrive uint64) (*slabUpload, chan sectorUploadResp) { // prepare response channel responseChan := make(chan sectorUploadResp) @@ -778,18 +779,6 @@ func (u *upload) newSlabUpload(ctx context.Context, shards [][]byte, uploaders [ } } - // prepare requests - requests := make([]*sectorUploadReq, len(shards)) - for sI := range shards { - requests[sI] = §orUploadReq{ - lockPriority: u.lockPriority, - overdrive: false, - responseChan: responseChan, - sector: sectors[sI], - uploadID: u.id, - } - } - // prepare candidates candidates := make([]*candidate, len(uploaders)) for i, uploader := range uploaders { @@ -808,7 +797,7 @@ func (u *upload) newSlabUpload(ctx context.Context, shards [][]byte, uploaders [ numSectors: uint64(len(shards)), errs: make(HostErrorSet), - }, requests, responseChan + }, responseChan } func (u *upload) uploadSlab(ctx context.Context, rs api.RedundancySettings, data []byte, length, index int, respChan chan slabUploadResponse, candidates []*uploader, mem *acquiredMemory, maxOverdrive uint64, overdriveTimeout time.Duration) (overdrivePct float64, overdriveSpeed int64) { @@ -855,9 +844,21 @@ func (u *upload) uploadShards(ctx context.Context, shards [][]byte, candidates [ defer cancel() // prepare the upload - slab, requests, respChan := u.newSlabUpload(ctx, shards, candidates, mem, maxOverdrive) + slab, respChan := u.newSlabUpload(ctx, shards, candidates, mem, maxOverdrive) + + // prepare requests + requests := make([]*sectorUploadReq, len(shards)) + for sI := range shards { + requests[sI] = §orUploadReq{ + uploadID: slab.uploadID, + sector: slab.sectors[sI], + lockPriority: slab.lockPriority, + overdrive: false, + responseChan: respChan, + } + } - // launch all shard uploads + // launch all requests for _, upload := range requests { if _, err := slab.launch(upload); err != nil { return nil, 0, 0, err @@ -1071,6 +1072,12 @@ func (s *slabUpload) receive(resp sectorUploadResp) bool { return false } + // sanity check we receive the expected root + if resp.root != req.sector.root { + s.errs[req.hk] = errors.New("root mismatch") + return false + } + // store the sector sector.uploaded = object.Sector{ Contracts: map[types.PublicKey][]types.FileContractID{req.hk: {req.fcid}}, From f311a2532875340812b450184c917955457ed726 Mon Sep 17 00:00:00 2001 From: PJ Date: Fri, 8 Dec 2023 14:17:38 +0100 Subject: [PATCH 21/25] worker: rename signalWork --- worker/uploader.go | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/worker/uploader.go b/worker/uploader.go index 47655c2e4..d3f5e2a5b 100644 --- a/worker/uploader.go +++ b/worker/uploader.go @@ -70,13 +70,6 @@ func (u *uploader) Renew(hp hostProvider, c api.ContractMetadata, bh uint64) { u.endHeight = c.WindowEnd } -func (u *uploader) SignalWork() { - select { - case u.signalNewUpload <- struct{}{}: - default: - } -} - func (u *uploader) Start(hp hostProvider, rl revisionLocker) { outer: for { @@ -176,7 +169,7 @@ func (u *uploader) enqueue(req *sectorUploadReq) { u.mu.Unlock() // signal there's work - u.SignalWork() + u.signalWork() } func (u *uploader) estimate() float64 { @@ -238,6 +231,13 @@ func (u *uploader) pop() *sectorUploadReq { return nil } +func (u *uploader) signalWork() { + select { + case u.signalNewUpload <- struct{}{}: + default: + } +} + func (u *uploader) trackSectorUpload(err error, d time.Duration) { u.mu.Lock() defer u.mu.Unlock() From bedd98b59146c1c684e4ac09d3668cbb4c46d0e0 Mon Sep 17 00:00:00 2001 From: PJ Date: Fri, 8 Dec 2023 15:29:18 +0100 Subject: [PATCH 22/25] worker: rework candidate --- worker/upload.go | 28 ++++++++++++---------------- 1 file changed, 12 insertions(+), 16 deletions(-) diff --git a/worker/upload.go b/worker/upload.go index 1581dee32..ceb7834a2 100644 --- a/worker/upload.go +++ b/worker/upload.go @@ -93,9 +93,7 @@ type ( candidate struct { uploader *uploader - - used bool - overdriving int // sector index + req *sectorUploadReq } slabUploadResponse struct { @@ -782,7 +780,7 @@ func (u *upload) newSlabUpload(ctx context.Context, shards [][]byte, uploaders [ // prepare candidates candidates := make([]*candidate, len(uploaders)) for i, uploader := range uploaders { - candidates[i] = &candidate{uploader: uploader, used: false, overdriving: -1} + candidates[i] = &candidate{uploader: uploader} } // create slab upload @@ -886,8 +884,8 @@ loop: // relaunch non-overdrive uploads if !done && resp.err != nil && !resp.req.overdrive { - if overdriving, err := slab.launch(resp.req); err != nil { - if !overdriving { + if interrupt, err := slab.launch(resp.req); err != nil { + if interrupt { break loop // fail the upload } } @@ -969,7 +967,7 @@ func (s *slabUpload) finish() (sectors []object.Sector, _ error) { func (s *slabUpload) ongoingOverdrive(sI int) bool { for _, candidate := range s.candidates { - if candidate.used && candidate.overdriving == sI { + if candidate.req != nil && candidate.req.overdrive && candidate.req.sector.index == sI { return true } } @@ -985,7 +983,7 @@ func (s *slabUpload) launch(req *sectorUploadReq) (interrupt bool, err error) { // find candidate var candidate *candidate for _, c := range s.candidates { - if c.used { + if c.req != nil { continue } candidate = c @@ -1003,9 +1001,8 @@ func (s *slabUpload) launch(req *sectorUploadReq) (interrupt bool, err error) { } // update the candidate - candidate.used = true + candidate.req = req if req.overdrive { - candidate.overdriving = req.sector.index s.lastOverdrive = time.Now() s.numOverdriving++ } @@ -1023,8 +1020,8 @@ func (s *slabUpload) nextRequest(responseChan chan sectorUploadResp) *sectorUplo // count overdrives overdriveCnts := make(map[int]int) for _, c := range s.candidates { - if c.used && c.overdriving != -1 { - overdriveCnts[c.overdriving]++ + if c.req != nil && c.req.overdrive { + overdriveCnts[c.req.sector.index]++ } } @@ -1091,11 +1088,10 @@ func (s *slabUpload) receive(resp sectorUploadResp) bool { // cancel the sector's context sector.cancel() - // release hosts that are overdriving this sector + // release all other candidates for this sector for _, candidate := range s.candidates { - if candidate.overdriving == sector.index { - candidate.overdriving = -1 - candidate.used = false + if candidate.req != nil && candidate.req != req && candidate.req.sector.index == sector.index { + candidate.req = nil } } From 291f220c33ab010713356fe4be4cbcfbc0bf5796 Mon Sep 17 00:00:00 2001 From: PJ Date: Fri, 8 Dec 2023 15:56:03 +0100 Subject: [PATCH 23/25] worker: update ongoing/interrupt --- worker/upload.go | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/worker/upload.go b/worker/upload.go index ceb7834a2..554c4105d 100644 --- a/worker/upload.go +++ b/worker/upload.go @@ -965,15 +965,6 @@ func (s *slabUpload) finish() (sectors []object.Sector, _ error) { return } -func (s *slabUpload) ongoingOverdrive(sI int) bool { - for _, candidate := range s.candidates { - if candidate.req != nil && candidate.req.overdrive && candidate.req.sector.index == sI { - return true - } - } - return false -} - func (s *slabUpload) launch(req *sectorUploadReq) (interrupt bool, err error) { // nothing to do if req == nil { @@ -981,9 +972,13 @@ func (s *slabUpload) launch(req *sectorUploadReq) (interrupt bool, err error) { } // find candidate + var overdriving bool var candidate *candidate for _, c := range s.candidates { if c.req != nil { + if c.req.sector.index == req.sector.index { + overdriving = true + } continue } candidate = c @@ -993,7 +988,7 @@ func (s *slabUpload) launch(req *sectorUploadReq) (interrupt bool, err error) { // no candidate found if candidate == nil { err = errNoCandidateUploader - interrupt = !req.overdrive && !s.ongoingOverdrive(req.sector.index) + interrupt = !req.overdrive && !overdriving span := trace.SpanFromContext(req.sector.ctx) span.RecordError(err) span.End() From ea408a7f65154bb36fd982cc4e6e2c1f1528da5e Mon Sep 17 00:00:00 2001 From: PJ Date: Mon, 11 Dec 2023 16:51:45 +0100 Subject: [PATCH 24/25] worker: add buffer, fix duplicate uploader --- internal/testing/cluster_test.go | 2 +- worker/upload.go | 159 +++++++++++++++++-------------- 2 files changed, 87 insertions(+), 74 deletions(-) diff --git a/internal/testing/cluster_test.go b/internal/testing/cluster_test.go index 56211d511..a47c2c3f1 100644 --- a/internal/testing/cluster_test.go +++ b/internal/testing/cluster_test.go @@ -677,7 +677,7 @@ func TestUploadDownloadExtended(t *testing.T) { tt.OK(err) objectsSize := uint64(len(file1) + len(file2) + len(small) + len(large)) if info.TotalObjectsSize != objectsSize { - t.Error("wrong size", info.TotalObjectsSize, len(small)+len(large)) + t.Error("wrong size", info.TotalObjectsSize, objectsSize) } sectorsSize := 15 * rhpv2.SectorSize if info.TotalSectorsSize != uint64(sectorsSize) { diff --git a/worker/upload.go b/worker/upload.go index 554c4105d..748e5f57f 100644 --- a/worker/upload.go +++ b/worker/upload.go @@ -379,14 +379,14 @@ func (mgr *uploadManager) MigrateShards(ctx context.Context, s *object.Slab, sha }() // upload the shards - uploaded, overdrivePct, overdriveSpeed, err := upload.uploadShards(ctx, shards, mgr.candidates(upload.allowed), mem, mgr.maxOverdrive, mgr.overdriveTimeout) + uploaded, uploadSpeed, overdrivePct, err := upload.uploadShards(ctx, shards, mgr.candidates(upload.allowed), mem, mgr.maxOverdrive, mgr.overdriveTimeout) if err != nil { return err } // track stats mgr.statsOverdrivePct.Track(overdrivePct) - mgr.statsSlabUploadSpeedBytesPerMS.Track(float64(overdriveSpeed)) + mgr.statsSlabUploadSpeedBytesPerMS.Track(float64(uploadSpeed)) // overwrite the shards with the newly uploaded ones for i, si := range shardIndices { @@ -649,14 +649,14 @@ func (mgr *uploadManager) UploadPackedSlab(ctx context.Context, rs api.Redundanc }() // upload the shards - sectors, overdrivePct, overdriveSpeed, err := upload.uploadShards(ctx, shards, mgr.candidates(upload.allowed), mem, mgr.maxOverdrive, mgr.overdriveTimeout) + sectors, uploadSpeed, overdrivePct, err := upload.uploadShards(ctx, shards, mgr.candidates(upload.allowed), mem, mgr.maxOverdrive, mgr.overdriveTimeout) if err != nil { return err } // track stats + mgr.statsSlabUploadSpeedBytesPerMS.Track(float64(uploadSpeed)) mgr.statsOverdrivePct.Track(overdrivePct) - mgr.statsSlabUploadSpeedBytesPerMS.Track(float64(overdriveSpeed)) // mark packed slab as uploaded slab := api.UploadedPackedSlab{BufferID: ps.BufferID, Shards: sectors} @@ -713,43 +713,50 @@ func (mgr *uploadManager) newUpload(ctx context.Context, totalShards int, contra } func (mgr *uploadManager) refreshUploaders(contracts []api.ContractMetadata, bh uint64) { - // build map of contracts to keep and what contracts got renewed - toKeep := make(map[types.FileContractID]api.ContractMetadata) - renewedTo := make(map[types.FileContractID]api.ContractMetadata) + // build maps to allow quick lookups + wanted := make(map[types.FileContractID]api.ContractMetadata) + renewals := make(map[types.FileContractID]api.ContractMetadata) for _, c := range contracts { - toKeep[c.ID] = c - if c.RenewedFrom != (types.FileContractID{}) { - renewedTo[c.RenewedFrom] = c + wanted[c.ID] = c + if c.RenewedFrom == (types.FileContractID{}) { + renewals[c.RenewedFrom] = c } } - // keep list of uploaders uploaders - var uploaders []*uploader + // stop or renew uploads we currently have + var refreshed []*uploader for _, uploader := range mgr.uploaders { - renewal, renewed := renewedTo[uploader.ContractID()] - if _, keep := toKeep[uploader.ContractID()]; !(keep || renewed) { + _, keep := wanted[uploader.ContractID()] + renewal, renewed := renewals[uploader.ContractID()] + + // stop uploaders that no longer appear in the list + if !(keep || renewed) { uploader.Stop() continue - } else if renewed { - uploader.Renew(mgr.hp, renewal, bh) } - // delete current fcid from toKeep, by doing so it becomes a list of the - // contracts we want to add - delete(toKeep, uploader.ContractID()) + // renew uploaders that got renewed + if renewed { + uploader.Renew(mgr.hp, renewal, bh) + } + // update uploader and add to the list uploader.UpdateBlockHeight(bh) uploader.tryRecomputeStats() - uploaders = append(uploaders, uploader) + + // update the wanted list, we'll be left with the uploaders we want to add + refreshed = append(refreshed, uploader) + delete(wanted, uploader.ContractID()) } - for _, c := range toKeep { + // add missing uploaders + for _, c := range wanted { uploader := mgr.newUploader(mgr.b, mgr.hp, c, bh) - uploaders = append(uploaders, uploader) + refreshed = append(refreshed, uploader) go uploader.Start(mgr.hp, mgr.rl) } - mgr.uploaders = uploaders + mgr.uploaders = refreshed } func (u *upload) newSlabUpload(ctx context.Context, shards [][]byte, uploaders []*uploader, mem *acquiredMemory, maxOverdrive uint64) (*slabUpload, chan sectorUploadResp) { @@ -798,7 +805,7 @@ func (u *upload) newSlabUpload(ctx context.Context, shards [][]byte, uploaders [ }, responseChan } -func (u *upload) uploadSlab(ctx context.Context, rs api.RedundancySettings, data []byte, length, index int, respChan chan slabUploadResponse, candidates []*uploader, mem *acquiredMemory, maxOverdrive uint64, overdriveTimeout time.Duration) (overdrivePct float64, overdriveSpeed int64) { +func (u *upload) uploadSlab(ctx context.Context, rs api.RedundancySettings, data []byte, length, index int, respChan chan slabUploadResponse, candidates []*uploader, mem *acquiredMemory, maxOverdrive uint64, overdriveTimeout time.Duration) (uploadSpeed int64, overdrivePct float64) { // add tracing ctx, span := tracing.Tracer.Start(ctx, "uploadSlab") defer span.End() @@ -819,7 +826,7 @@ func (u *upload) uploadSlab(ctx context.Context, rs api.RedundancySettings, data resp.slab.Slab.Encrypt(shards) // upload the shards - resp.slab.Slab.Shards, overdrivePct, overdriveSpeed, resp.err = u.uploadShards(ctx, shards, candidates, mem, maxOverdrive, overdriveTimeout) + resp.slab.Slab.Shards, uploadSpeed, overdrivePct, resp.err = u.uploadShards(ctx, shards, candidates, mem, maxOverdrive, overdriveTimeout) // send the response select { @@ -830,7 +837,7 @@ func (u *upload) uploadSlab(ctx context.Context, rs api.RedundancySettings, data return } -func (u *upload) uploadShards(ctx context.Context, shards [][]byte, candidates []*uploader, mem *acquiredMemory, maxOverdrive uint64, overdriveTimeout time.Duration) ([]object.Sector, float64, int64, error) { +func (u *upload) uploadShards(ctx context.Context, shards [][]byte, candidates []*uploader, mem *acquiredMemory, maxOverdrive uint64, overdriveTimeout time.Duration) (sectors []object.Sector, uploadSpeed int64, overdrivePct float64, err error) { start := time.Now() // add tracing @@ -858,7 +865,7 @@ func (u *upload) uploadShards(ctx context.Context, shards [][]byte, candidates [ // launch all requests for _, upload := range requests { - if _, err := slab.launch(upload); err != nil { + if err := slab.launch(upload); err != nil { return nil, 0, 0, err } } @@ -869,7 +876,11 @@ func (u *upload) uploadShards(ctx context.Context, shards [][]byte, candidates [ } timer := time.NewTimer(overdriveTimeout) + // create a request buffer + var buffer []*sectorUploadReq + // collect responses + var used bool var done bool loop: for slab.numInflight > 0 && !done { @@ -880,25 +891,35 @@ loop: return nil, 0, 0, ctx.Err() case resp := <-respChan: // receive the response - done = slab.receive(resp) + used, done = slab.receive(resp) + if done { + break loop + } // relaunch non-overdrive uploads - if !done && resp.err != nil && !resp.req.overdrive { - if interrupt, err := slab.launch(resp.req); err != nil { - if interrupt { - break loop // fail the upload + if resp.err != nil && !resp.req.overdrive { + if err := slab.launch(resp.req); err != nil { + // a failure to relaunch non-overdrive uploads is bad, but + // we need to keep them around because an overdrive upload + // might've been redundant, in which case we can re-use the + // host to launch this request + buffer = append(buffer, resp.req) + } + } else if resp.err == nil && !used { + if len(buffer) > 0 { + // relaunch buffered upload request + if err := slab.launch(buffer[0]); err == nil { + buffer = buffer[1:] } + } else if slab.canOverdrive(overdriveTimeout) { + // or try overdriving a sector + _ = slab.launch(slab.nextRequest(respChan)) } } - - // try overdriving a sector - if slab.canOverdrive(overdriveTimeout) { - _, _ = slab.launch(slab.nextRequest(respChan)) // ignore result - } case <-timer.C: // try overdriving a sector if slab.canOverdrive(overdriveTimeout) { - _, _ = slab.launch(slab.nextRequest(respChan)) // ignore result + _ = slab.launch(slab.nextRequest(respChan)) // ignore result } } @@ -917,20 +938,29 @@ loop: // calculate the upload speed bytes := slab.numUploaded * rhpv2.SectorSize ms := time.Since(start).Milliseconds() - speed := int64(bytes) / ms + uploadSpeed = int64(bytes) / ms // calculate overdrive pct var numOverdrive uint64 if slab.numLaunched > slab.numSectors { numOverdrive = slab.numLaunched - slab.numSectors } - overdrivePct := float64(numOverdrive) / float64(slab.numSectors) + overdrivePct = float64(numOverdrive) / float64(slab.numSectors) // register the amount of overdrive sectors span.SetAttributes(attribute.Int("overdrive", int(numOverdrive))) - sectors, err := slab.finish() - return sectors, overdrivePct, speed, err + if slab.numUploaded < slab.numSectors { + remaining := slab.numSectors - slab.numUploaded + err = fmt.Errorf("failed to upload slab: launched=%d uploaded=%d remaining=%d inflight=%d pending=%d uploaders=%d errors=%d %w", slab.numLaunched, slab.numUploaded, remaining, slab.numInflight, len(buffer), len(slab.candidates), len(slab.errs), slab.errs) + return + } + + // collect the sectors + for _, sector := range slab.sectors { + sectors = append(sectors, sector.uploaded) + } + return } func (s *slabUpload) canOverdrive(overdriveTimeout time.Duration) bool { @@ -953,32 +983,16 @@ func (s *slabUpload) canOverdrive(overdriveTimeout time.Duration) bool { return true } -func (s *slabUpload) finish() (sectors []object.Sector, _ error) { - if s.numUploaded < s.numSectors { - remaining := s.numSectors - s.numUploaded - return nil, fmt.Errorf("failed to upload slab: launched=%d uploaded=%d remaining=%d inflight=%d uploaders=%d errors=%d %w", s.numLaunched, s.numUploaded, remaining, s.numInflight, len(s.candidates), len(s.errs), s.errs) - } - - for _, sector := range s.sectors { - sectors = append(sectors, sector.uploaded) - } - return -} - -func (s *slabUpload) launch(req *sectorUploadReq) (interrupt bool, err error) { +func (s *slabUpload) launch(req *sectorUploadReq) error { // nothing to do if req == nil { - return false, nil + return nil } // find candidate - var overdriving bool var candidate *candidate for _, c := range s.candidates { if c.req != nil { - if c.req.sector.index == req.sector.index { - overdriving = true - } continue } candidate = c @@ -987,12 +1001,11 @@ func (s *slabUpload) launch(req *sectorUploadReq) (interrupt bool, err error) { // no candidate found if candidate == nil { - err = errNoCandidateUploader - interrupt = !req.overdrive && !overdriving + err := errNoCandidateUploader span := trace.SpanFromContext(req.sector.ctx) span.RecordError(err) span.End() - return + return err } // update the candidate @@ -1008,7 +1021,7 @@ func (s *slabUpload) launch(req *sectorUploadReq) (interrupt bool, err error) { // enqueue the req candidate.uploader.enqueue(req) - return + return nil } func (s *slabUpload) nextRequest(responseChan chan sectorUploadResp) *sectorUploadReq { @@ -1042,7 +1055,7 @@ func (s *slabUpload) nextRequest(responseChan chan sectorUploadResp) *sectorUplo } } -func (s *slabUpload) receive(resp sectorUploadResp) bool { +func (s *slabUpload) receive(resp sectorUploadResp) (bool, bool) { // convenience variable req := resp.req sector := req.sector @@ -1056,18 +1069,18 @@ func (s *slabUpload) receive(resp sectorUploadResp) bool { // failed reqs can't complete the upload if resp.err != nil { s.errs[req.hk] = resp.err - return false - } - - // redundant sectors can't complete the upload - if sector.uploaded.Root != (types.Hash256{}) { - return false + return false, false } // sanity check we receive the expected root if resp.root != req.sector.root { s.errs[req.hk] = errors.New("root mismatch") - return false + return false, false + } + + // redundant sectors can't complete the upload + if sector.uploaded.Root != (types.Hash256{}) { + return false, false } // store the sector @@ -1094,7 +1107,7 @@ func (s *slabUpload) receive(resp sectorUploadResp) bool { sector.data = nil s.mem.ReleaseSome(rhpv2.SectorSize) - return s.numUploaded == s.numSectors + return true, s.numUploaded == s.numSectors } func (s *sectorUpload) isUploaded() bool { From da1bf91cf444670dcedf413d376f33edc69c15b6 Mon Sep 17 00:00:00 2001 From: PJ Date: Tue, 12 Dec 2023 11:40:45 +0100 Subject: [PATCH 25/25] worker: track stats --- worker/upload.go | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/worker/upload.go b/worker/upload.go index 748e5f57f..5523b97a2 100644 --- a/worker/upload.go +++ b/worker/upload.go @@ -549,7 +549,13 @@ func (mgr *uploadManager) Upload(ctx context.Context, r io.Reader, contracts []a } else { // regular upload go func(rs api.RedundancySettings, data []byte, length, slabIndex int) { - upload.uploadSlab(ctx, rs, data, length, slabIndex, respChan, mgr.candidates(upload.allowed), mem, mgr.maxOverdrive, mgr.overdriveTimeout) + uploadSpeed, overdrivePct := upload.uploadSlab(ctx, rs, data, length, slabIndex, respChan, mgr.candidates(upload.allowed), mem, mgr.maxOverdrive, mgr.overdriveTimeout) + + // track stats + mgr.statsSlabUploadSpeedBytesPerMS.Track(float64(uploadSpeed)) + mgr.statsOverdrivePct.Track(overdrivePct) + + // release memory mem.Release() }(up.rs, data, length, slabIndex) }