diff --git a/autopilot/accounts.go b/autopilot/accounts.go index 75b89341a..e659c3c0f 100644 --- a/autopilot/accounts.go +++ b/autopilot/accounts.go @@ -12,18 +12,14 @@ import ( "go.opentelemetry.io/otel/codes" rhpv3 "go.sia.tech/core/rhp/v3" "go.sia.tech/core/types" - "go.sia.tech/renterd/alerts" "go.sia.tech/renterd/api" "go.sia.tech/renterd/tracing" "go.uber.org/zap" - "lukechampine.com/frand" ) var errMaxDriftExceeded = errors.New("drift on account is too large") var ( - alertAccountRefillID = frand.Entropy256() // constant across restarts - minBalance = types.Siacoins(1).Div64(2).Big() maxBalance = types.Siacoins(1) maxNegDrift = new(big.Int).Neg(types.Siacoins(10).Big()) @@ -158,44 +154,29 @@ func (a *accounts) refillWorkerAccounts(w Worker) { if a.markRefillInProgress(workerID, c.HostKey) { go func(contract api.ContractMetadata, inSet bool) { rCtx, cancel := context.WithTimeout(ctx, 5*time.Minute) - accountID, refilled, rerr := refillWorkerAccount(rCtx, a.a, a.ap.bus, w, workerID, contract) - shouldLog := rerr != nil && (inSet || rerr.Is(errMaxDriftExceeded)) - if shouldLog { - a.l.Errorw(rerr.err.Error(), rerr.keysAndValues...) - } else if err == nil && refilled { - a.l.Infow("Successfully funded account", - "account", accountID, - "host", contract.HostKey, - "balance", maxBalance, - ) - } - - // handle registering alert. - alertID := types.HashBytes(append(alertAccountRefillID[:], accountID[:]...)) - if shouldLog { - data := map[string]interface{}{ - "accountID": accountID.String(), - "contractID": contract.ID.String(), - "hostKey": contract.HostKey.String(), + defer cancel() + accountID, refilled, rerr := refillWorkerAccount(rCtx, a.a, w, workerID, contract) + if rerr != nil { + // register the alert on failure + a.ap.RegisterAlert(ctx, newAccountRefillAlert(accountID, contract, *rerr)) + if inSet || rerr.Is(errMaxDriftExceeded) { + a.l.Errorw(rerr.err.Error(), rerr.keysAndValues...) } - for i := 0; i < len(rerr.keysAndValues); i += 2 { - data[fmt.Sprint(rerr.keysAndValues[i])] = rerr.keysAndValues[i+1] + } else { + // dismiss alerts on success + a.ap.DismissAlert(ctx, alertIDForAccount(alertAccountRefillID, accountID)) + + // log success + if refilled { + a.l.Infow("Successfully funded account", + "account", accountID, + "host", contract.HostKey, + "balance", maxBalance, + ) } - err := a.ap.alerts.RegisterAlert(ctx, alerts.Alert{ - ID: alertID, - Severity: alerts.SeverityError, - Message: fmt.Sprintf("failed to refill account: %v", rerr), - Data: data, - Timestamp: time.Now(), - }) - if err != nil { - a.ap.logger.Errorf("failed to register alert: %v", err) - } - } else if err := a.ap.alerts.DismissAlerts(ctx, alertID); err != nil { - a.ap.logger.Errorf("failed to dismiss alert: %v", err) } + a.markRefillDone(workerID, contract.HostKey) - cancel() }(c, inSet) } } @@ -217,7 +198,7 @@ func (err *refillError) Is(target error) bool { return errors.Is(err.err, target) } -func refillWorkerAccount(ctx context.Context, a AccountStore, am alerts.Alerter, w Worker, workerID string, contract api.ContractMetadata) (accountID rhpv3.Account, refilled bool, rerr *refillError) { +func refillWorkerAccount(ctx context.Context, a AccountStore, w Worker, workerID string, contract api.ContractMetadata) (accountID rhpv3.Account, refilled bool, rerr *refillError) { wrapErr := func(err error, keysAndValues ...interface{}) *refillError { if err == nil { return nil @@ -262,8 +243,8 @@ func refillWorkerAccount(ctx context.Context, a AccountStore, am alerts.Alerter, // expected. if account.Drift.Cmp(maxNegDrift) < 0 { rerr = wrapErr(fmt.Errorf("not refilling account since host is potentially cheating: %w", errMaxDriftExceeded), - "account", account.ID, - "host", contract.HostKey, + "accountID", account.ID, + "hostKey", contract.HostKey, "balance", account.Balance, "drift", account.Drift, ) @@ -276,8 +257,8 @@ func refillWorkerAccount(ctx context.Context, a AccountStore, am alerts.Alerter, err = w.RHPSync(ctx, contract.ID, contract.HostKey, contract.HostIP, contract.SiamuxAddr) if err != nil { rerr = wrapErr(fmt.Errorf("failed to sync account's balance: %w", err), - "account", account.ID, - "host", contract.HostKey, + "accountID", account.ID, + "hostKey", contract.HostKey, ) return } @@ -300,8 +281,8 @@ func refillWorkerAccount(ctx context.Context, a AccountStore, am alerts.Alerter, err = w.RHPFund(ctx, contract.ID, contract.HostKey, contract.HostIP, contract.SiamuxAddr, maxBalance) if err != nil { rerr = wrapErr(fmt.Errorf("failed to fund account: %w", err), - "account", account.ID, - "host", contract.HostKey, + "accountID", account.ID, + "hostKey", contract.HostKey, "balance", account.Balance, "expected", maxBalance, ) diff --git a/autopilot/alerts.go b/autopilot/alerts.go new file mode 100644 index 000000000..8561b7b30 --- /dev/null +++ b/autopilot/alerts.go @@ -0,0 +1,169 @@ +package autopilot + +import ( + "context" + "fmt" + "time" + + rhpv3 "go.sia.tech/core/rhp/v3" + "go.sia.tech/core/types" + "go.sia.tech/renterd/alerts" + "go.sia.tech/renterd/api" + "go.sia.tech/renterd/object" + "lukechampine.com/frand" +) + +var ( + alertAccountRefillID = frand.Entropy256() // constant until restarted + alertLowBalanceID = frand.Entropy256() // constant until restarted + alertMigrationID = frand.Entropy256() // constant until restarted + alertRenewalFailedID = frand.Entropy256() // constant until restarted +) + +func alertIDForAccount(alertID [32]byte, id rhpv3.Account) types.Hash256 { + return types.HashBytes(append(alertID[:], id[:]...)) +} + +func alertIDForContract(alertID [32]byte, contract api.ContractMetadata) types.Hash256 { + return types.HashBytes(append(alertID[:], contract.ID[:]...)) +} + +func alertIDForSlab(alertID [32]byte, slab object.Slab) types.Hash256 { + return types.HashBytes(append(alertID[:], []byte(slab.Key.String())...)) +} + +func randomAlertID() types.Hash256 { + return frand.Entropy256() +} + +func (ap *Autopilot) RegisterAlert(ctx context.Context, a alerts.Alert) { + if err := ap.alerts.RegisterAlert(ctx, a); err != nil { + ap.logger.Errorf("failed to register alert: %v", err) + } +} + +func (ap *Autopilot) DismissAlert(ctx context.Context, id types.Hash256) { + if err := ap.alerts.DismissAlerts(ctx, id); err != nil { + ap.logger.Errorf("failed to dismiss alert: %v", err) + } +} + +func newAccountLowBalanceAlert(address types.Address, balance, allowance types.Currency, bh, renewWindow, endHeight uint64) alerts.Alert { + severity := alerts.SeverityInfo + if bh+renewWindow/2 >= endHeight { + severity = alerts.SeverityCritical + } else if bh+renewWindow >= endHeight { + severity = alerts.SeverityWarning + } + + return alerts.Alert{ + ID: alertLowBalanceID, + Severity: severity, + Message: "Wallet is low on funds", + Data: map[string]any{ + "address": address, + "balance": balance, + "allowance": allowance, + "hint": fmt.Sprintf("The current wallet balance of %v is less than the configured allowance of %v. Ideally, a wallet holds at least one allowance worth of funds to make sure it can renew all its contracts.", balance, allowance), + }, + Timestamp: time.Now(), + } +} + +func newAccountRefillAlert(id rhpv3.Account, contract api.ContractMetadata, err refillError) alerts.Alert { + data := map[string]interface{}{ + "error": err, + "accountID": id.String(), + "contractID": contract.ID.String(), + "hostKey": contract.HostKey.String(), + } + for i := 0; i < len(err.keysAndValues); i += 2 { + data[fmt.Sprint(err.keysAndValues[i])] = err.keysAndValues[i+1] + } + + return alerts.Alert{ + ID: alertIDForAccount(alertAccountRefillID, id), + Severity: alerts.SeverityError, + Message: "Ephemeral account refill failed", + Data: data, + Timestamp: time.Now(), + } +} + +func newContractRenewalFailedAlert(contract api.ContractMetadata, interrupted bool, err error) alerts.Alert { + severity := alerts.SeverityWarning + if interrupted { + severity = alerts.SeverityCritical + } + + return alerts.Alert{ + ID: alertIDForContract(alertRenewalFailedID, contract), + Severity: severity, + Message: "Contract renewal failed", + Data: map[string]interface{}{ + "error": err, + "renewalsInterrupted": interrupted, + "contractID": contract.ID.String(), + "hostKey": contract.HostKey.String(), + }, + Timestamp: time.Now(), + } +} + +func newContractSetChangeAlert(name string, added, removed int, removedReasons map[string]string) alerts.Alert { + return alerts.Alert{ + ID: randomAlertID(), + Severity: alerts.SeverityInfo, + Message: "Contract set changed", + Data: map[string]any{ + "name": name, + "added": added, + "removed": removed, + "removals": removedReasons, + "hint": "A high churn rate can lead to a lot of unnecessary migrations, it might be necessary to tweak your configuration depending on the reason hosts are being discarded from the set.", + }, + Timestamp: time.Now(), + } +} + +func newOngoingMigrationsAlert(n int) alerts.Alert { + return alerts.Alert{ + ID: alertMigrationID, + Severity: alerts.SeverityInfo, + Message: fmt.Sprintf("Migrating %d slabs", n), + Timestamp: time.Now(), + } +} + +func newSlabMigrationFailedAlert(slab object.Slab, health float64, err error) alerts.Alert { + severity := alerts.SeverityWarning + if health < 0.5 { + severity = alerts.SeverityCritical + } + + return alerts.Alert{ + ID: alertIDForSlab(alertMigrationID, slab), + Severity: severity, + Message: "Slab migration failed", + Data: map[string]interface{}{ + "error": err, + "health": health, + "slabKey": slab.Key.String(), + "hint": "Migration failures can be temporary, but if they persist it can eventually lead to data loss and should therefor be taken very seriously.", + }, + Timestamp: time.Now(), + } +} + +func newRefreshHealthFailedAlert(err error) alerts.Alert { + return alerts.Alert{ + ID: randomAlertID(), + Severity: alerts.SeverityCritical, + Message: "Health refresh failed", + Data: map[string]interface{}{ + "migrationsInterrupted": true, + "error": err, + }, + Timestamp: time.Now(), + } +} diff --git a/autopilot/contractor.go b/autopilot/contractor.go index 0f9c09258..d30addadb 100644 --- a/autopilot/contractor.go +++ b/autopilot/contractor.go @@ -16,19 +16,12 @@ import ( rhpv2 "go.sia.tech/core/rhp/v2" rhpv3 "go.sia.tech/core/rhp/v3" "go.sia.tech/core/types" - "go.sia.tech/renterd/alerts" "go.sia.tech/renterd/api" "go.sia.tech/renterd/hostdb" "go.sia.tech/renterd/tracing" "go.sia.tech/renterd/wallet" "go.sia.tech/renterd/worker" "go.uber.org/zap" - "lukechampine.com/frand" -) - -var ( - alertLowBalanceID = frand.Entropy256() // constant until restarted - alertRenewalFailedID = frand.Entropy256() // constant until restarted ) const ( @@ -392,10 +385,10 @@ func (c *contractor) performContractMaintenance(ctx context.Context, w Worker) ( } // return whether the maintenance changed the contract set - return c.computeContractSetChanged(currentSet, updatedSet, formed, refreshed, renewed, toStopUsing, contractData), nil + return c.computeContractSetChanged(state.cfg.Contracts.Set, currentSet, updatedSet, formed, refreshed, renewed, toStopUsing, contractData), nil } -func (c *contractor) computeContractSetChanged(oldSet []api.ContractMetadata, newSet, formed []types.FileContractID, refreshed, renewed []renewal, toStopUsing map[types.FileContractID]string, contractData map[types.FileContractID]uint64) bool { +func (c *contractor) computeContractSetChanged(name string, oldSet []api.ContractMetadata, newSet, formed []types.FileContractID, refreshed, renewed []renewal, toStopUsing map[types.FileContractID]string, contractData map[types.FileContractID]uint64) bool { // build some maps for easier lookups previous := make(map[types.FileContractID]struct{}) for _, c := range oldSet { @@ -464,19 +457,7 @@ func (c *contractor) computeContractSetChanged(oldSet []api.ContractMetadata, ne ) hasChanged := len(added)+len(removed) > 0 if hasChanged { - err := c.ap.alerts.RegisterAlert(context.Background(), alerts.Alert{ - ID: frand.Entropy256(), - Severity: alerts.SeverityInfo, - Message: fmt.Sprintf("The contract set has changed: %v contracts added and %v removed", len(added), len(removed)), - Data: map[string]any{ - "additions": added, - "removals": removedReasons, - }, - Timestamp: time.Now(), - }) - if err != nil { - logFn("failed to register alert", "error", err) - } + c.ap.RegisterAlert(context.Background(), newContractSetChangeAlert(name, len(added), len(removed), removedReasons)) } return hasChanged } @@ -517,7 +498,6 @@ func (c *contractor) performWalletMaintenance(ctx context.Context) error { l.Warnf("wallet maintenance skipped, fetching consensus state failed with err: %v", err) return err } - bh := cs.BlockHeight // fetch wallet balance wallet, err := b.Wallet(ctx) @@ -529,27 +509,9 @@ func (c *contractor) performWalletMaintenance(ctx context.Context) error { // register an alert if balance is low if balance.Cmp(cfg.Contracts.Allowance) < 0 { - // increase severity as we progress through renew window - severity := alerts.SeverityInfo - if bh+renewWindow/2 >= endHeight(cfg, period) { - severity = alerts.SeverityCritical - } else if bh+renewWindow >= endHeight(cfg, period) { - severity = alerts.SeverityWarning - } - - err = c.ap.alerts.RegisterAlert(ctx, alerts.Alert{ - ID: alertLowBalanceID, - Severity: severity, - Message: "wallet is low on funds", - Data: map[string]any{ - "address": state.address, - "balance": balance, - }, - Timestamp: time.Now(), - }) - if err != nil { - l.Errorf("failed to register alert: err %v", err) - } + c.ap.RegisterAlert(ctx, newAccountLowBalanceAlert(state.address, balance, cfg.Contracts.Allowance, cs.BlockHeight, renewWindow, endHeight(cfg, period))) + } else { + c.ap.DismissAlert(ctx, alertLowBalanceID) } // pending maintenance transaction - nothing to do @@ -1002,28 +964,20 @@ func (c *contractor) runContractRenewals(ctx context.Context, w Worker, toRenew } // renew and add if it succeeds or if its usable + contract := toRenew[i].contract.ContractMetadata renewed, proceed, err := c.renewContract(ctx, w, toRenew[i], budget) - if err == nil { - renewals = append(renewals, renewal{from: toRenew[i].contract.ID, to: renewed.ID, ci: toRenew[i]}) - } else if toRenew[i].usable { - toKeep = append(toKeep, toRenew[i]) + if err != nil { + c.ap.RegisterAlert(ctx, newContractRenewalFailedAlert(contract, !proceed, err)) + if toRenew[i].usable { + toKeep = append(toKeep, toRenew[i]) + } + } else { + c.ap.DismissAlert(ctx, alertIDForContract(alertRenewalFailedID, contract)) + renewals = append(renewals, renewal{from: contract.ID, to: renewed.ID, ci: toRenew[i]}) } // break if we don't want to proceed if !proceed { - rerr := c.ap.alerts.RegisterAlert(ctx, alerts.Alert{ - ID: alertRenewalFailedID, - Severity: alerts.SeverityCritical, - Message: fmt.Sprintf("Contract renewals were interrupted due to latest error: %v", err), - Data: map[string]interface{}{ - "contractID": toRenew[i].contract.ID.String(), - "hostKey": toRenew[i].contract.HostKey.String(), - }, - Timestamp: time.Now(), - }) - if rerr != nil { - c.logger.Errorf("failed to register alert: err %v", rerr) - } break } } diff --git a/autopilot/migrator.go b/autopilot/migrator.go index 733a3d3e9..31b5c4135 100644 --- a/autopilot/migrator.go +++ b/autopilot/migrator.go @@ -2,23 +2,15 @@ package autopilot import ( "context" - "fmt" "math" "sort" "sync" "time" - "go.sia.tech/core/types" - "go.sia.tech/renterd/alerts" "go.sia.tech/renterd/api" "go.sia.tech/renterd/object" "go.sia.tech/renterd/tracing" "go.uber.org/zap" - "lukechampine.com/frand" -) - -var ( - alertMigrationID = frand.Entropy256() // constant until restarted ) const ( @@ -127,21 +119,11 @@ func (m *migrator) performMigrations(p *workerPool) { } res, err := w.MigrateSlab(ctx, slab, ap.Config.Contracts.Set) if err != nil { - errMsg := fmt.Sprintf("%v: failed to migrate slab %d/%d, health: %v, err: %v", id, j.slabIdx+1, j.batchSize, j.Health, err) - rerr := m.ap.alerts.RegisterAlert(ctx, alerts.Alert{ - ID: types.HashBytes([]byte(slab.Key.String())), - Severity: alerts.SeverityCritical, - Message: errMsg, - Data: map[string]interface{}{ - "slabKey": slab.Key.String(), - }, - Timestamp: time.Now(), - }) - if rerr != nil { - m.logger.Errorf("failed to register alert: err %v", rerr) - } - m.logger.Errorf(errMsg) + m.ap.RegisterAlert(ctx, newSlabMigrationFailedAlert(slab, j.Health, err)) + m.logger.Errorf("%v: failed to migrate slab %d/%d, health: %v, err: %v", id, j.slabIdx+1, j.batchSize, j.Health, err) continue + } else { + m.ap.DismissAlert(ctx, alertIDForSlab(alertMigrationID, slab)) } m.logger.Debugf("%v: successfully migrated slab (health: %v migrated shards: %d) %d/%d", id, j.Health, res.NumShardsMigrated, j.slabIdx+1, j.batchSize) } @@ -169,12 +151,7 @@ OUTER: // recompute health. start := time.Now() if err := b.RefreshHealth(ctx); err != nil { - rerr := m.ap.alerts.RegisterAlert(ctx, alerts.Alert{ - ID: frand.Entropy256(), - Severity: alerts.SeverityCritical, - Message: fmt.Sprintf("migrations interrupted - failed to refresh cached slab health: %v", err), - Timestamp: time.Now(), - }) + rerr := m.ap.alerts.RegisterAlert(ctx, newRefreshHealthFailedAlert(err)) if rerr != nil { m.logger.Errorf("failed to register alert: err %v", rerr) } @@ -227,14 +204,11 @@ OUTER: m.logger.Debugf("%d slabs to migrate", len(toMigrate)) // register an alert to notify users about ongoing migrations. - err = m.ap.alerts.RegisterAlert(ctx, alerts.Alert{ - ID: alertMigrationID, - Severity: alerts.SeverityInfo, - Message: fmt.Sprintf("Migrating %d slabs", len(toMigrate)), - Timestamp: time.Now(), - }) - if err != nil { - m.logger.Errorf("failed to register alert: err %v", err) + if len(toMigrate) > 0 { + err = m.ap.alerts.RegisterAlert(ctx, newOngoingMigrationsAlert(len(toMigrate))) + if err != nil { + m.logger.Errorf("failed to register alert: err %v", err) + } } // return if there are no slabs to migrate diff --git a/worker/download.go b/worker/download.go index fa91681aa..dacf3db18 100644 --- a/worker/download.go +++ b/worker/download.go @@ -1098,7 +1098,7 @@ func (s *slabDownload) finish() ([][]byte, error) { unused++ } } - return nil, fmt.Errorf("failed to download slab: completed=%d, inflight=%d, launched=%d downloaders=%d unused=%d errors=%w", s.numCompleted, s.numInflight, s.numLaunched, s.mgr.numDownloaders(), unused, s.errs) + return nil, fmt.Errorf("failed to download slab: completed=%d, inflight=%d, launched=%d downloaders=%d unused=%d errors=%d %w", s.numCompleted, s.numInflight, s.numLaunched, s.mgr.numDownloaders(), unused, len(s.errs), s.errs) } return s.sectors, nil }