Skip to content

Commit

Permalink
Refactor Alerts (#684)
Browse files Browse the repository at this point in the history
* autopilot: improve alerting

* autopilot: tweak severity

* autopilot: add alert hints
  • Loading branch information
peterjan committed Oct 24, 2023
1 parent 7131fa7 commit 4dee09d
Show file tree
Hide file tree
Showing 5 changed files with 221 additions and 143 deletions.
71 changes: 26 additions & 45 deletions autopilot/accounts.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,18 +12,14 @@ import (
"go.opentelemetry.io/otel/codes"
rhpv3 "go.sia.tech/core/rhp/v3"
"go.sia.tech/core/types"
"go.sia.tech/renterd/alerts"
"go.sia.tech/renterd/api"
"go.sia.tech/renterd/tracing"
"go.uber.org/zap"
"lukechampine.com/frand"
)

var errMaxDriftExceeded = errors.New("drift on account is too large")

var (
alertAccountRefillID = frand.Entropy256() // constant across restarts

minBalance = types.Siacoins(1).Div64(2).Big()
maxBalance = types.Siacoins(1)
maxNegDrift = new(big.Int).Neg(types.Siacoins(10).Big())
Expand Down Expand Up @@ -158,44 +154,29 @@ func (a *accounts) refillWorkerAccounts(w Worker) {
if a.markRefillInProgress(workerID, c.HostKey) {
go func(contract api.ContractMetadata, inSet bool) {
rCtx, cancel := context.WithTimeout(ctx, 5*time.Minute)
accountID, refilled, rerr := refillWorkerAccount(rCtx, a.a, a.ap.bus, w, workerID, contract)
shouldLog := rerr != nil && (inSet || rerr.Is(errMaxDriftExceeded))
if shouldLog {
a.l.Errorw(rerr.err.Error(), rerr.keysAndValues...)
} else if err == nil && refilled {
a.l.Infow("Successfully funded account",
"account", accountID,
"host", contract.HostKey,
"balance", maxBalance,
)
}

// handle registering alert.
alertID := types.HashBytes(append(alertAccountRefillID[:], accountID[:]...))
if shouldLog {
data := map[string]interface{}{
"accountID": accountID.String(),
"contractID": contract.ID.String(),
"hostKey": contract.HostKey.String(),
defer cancel()
accountID, refilled, rerr := refillWorkerAccount(rCtx, a.a, w, workerID, contract)
if rerr != nil {
// register the alert on failure
a.ap.RegisterAlert(ctx, newAccountRefillAlert(accountID, contract, *rerr))
if inSet || rerr.Is(errMaxDriftExceeded) {
a.l.Errorw(rerr.err.Error(), rerr.keysAndValues...)
}
for i := 0; i < len(rerr.keysAndValues); i += 2 {
data[fmt.Sprint(rerr.keysAndValues[i])] = rerr.keysAndValues[i+1]
} else {
// dismiss alerts on success
a.ap.DismissAlert(ctx, alertIDForAccount(alertAccountRefillID, accountID))

// log success
if refilled {
a.l.Infow("Successfully funded account",
"account", accountID,
"host", contract.HostKey,
"balance", maxBalance,
)
}
err := a.ap.alerts.RegisterAlert(ctx, alerts.Alert{
ID: alertID,
Severity: alerts.SeverityError,
Message: fmt.Sprintf("failed to refill account: %v", rerr),
Data: data,
Timestamp: time.Now(),
})
if err != nil {
a.ap.logger.Errorf("failed to register alert: %v", err)
}
} else if err := a.ap.alerts.DismissAlerts(ctx, alertID); err != nil {
a.ap.logger.Errorf("failed to dismiss alert: %v", err)
}

a.markRefillDone(workerID, contract.HostKey)
cancel()
}(c, inSet)
}
}
Expand All @@ -217,7 +198,7 @@ func (err *refillError) Is(target error) bool {
return errors.Is(err.err, target)
}

func refillWorkerAccount(ctx context.Context, a AccountStore, am alerts.Alerter, w Worker, workerID string, contract api.ContractMetadata) (accountID rhpv3.Account, refilled bool, rerr *refillError) {
func refillWorkerAccount(ctx context.Context, a AccountStore, w Worker, workerID string, contract api.ContractMetadata) (accountID rhpv3.Account, refilled bool, rerr *refillError) {
wrapErr := func(err error, keysAndValues ...interface{}) *refillError {
if err == nil {
return nil
Expand Down Expand Up @@ -262,8 +243,8 @@ func refillWorkerAccount(ctx context.Context, a AccountStore, am alerts.Alerter,
// expected.
if account.Drift.Cmp(maxNegDrift) < 0 {
rerr = wrapErr(fmt.Errorf("not refilling account since host is potentially cheating: %w", errMaxDriftExceeded),
"account", account.ID,
"host", contract.HostKey,
"accountID", account.ID,
"hostKey", contract.HostKey,
"balance", account.Balance,
"drift", account.Drift,
)
Expand All @@ -276,8 +257,8 @@ func refillWorkerAccount(ctx context.Context, a AccountStore, am alerts.Alerter,
err = w.RHPSync(ctx, contract.ID, contract.HostKey, contract.HostIP, contract.SiamuxAddr)
if err != nil {
rerr = wrapErr(fmt.Errorf("failed to sync account's balance: %w", err),
"account", account.ID,
"host", contract.HostKey,
"accountID", account.ID,
"hostKey", contract.HostKey,
)
return
}
Expand All @@ -300,8 +281,8 @@ func refillWorkerAccount(ctx context.Context, a AccountStore, am alerts.Alerter,
err = w.RHPFund(ctx, contract.ID, contract.HostKey, contract.HostIP, contract.SiamuxAddr, maxBalance)
if err != nil {
rerr = wrapErr(fmt.Errorf("failed to fund account: %w", err),
"account", account.ID,
"host", contract.HostKey,
"accountID", account.ID,
"hostKey", contract.HostKey,
"balance", account.Balance,
"expected", maxBalance,
)
Expand Down
169 changes: 169 additions & 0 deletions autopilot/alerts.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,169 @@
package autopilot

import (
"context"
"fmt"
"time"

rhpv3 "go.sia.tech/core/rhp/v3"
"go.sia.tech/core/types"
"go.sia.tech/renterd/alerts"
"go.sia.tech/renterd/api"
"go.sia.tech/renterd/object"
"lukechampine.com/frand"
)

// Base identifiers, one per alert category. Each is assigned the result of
// frand.Entropy256() when the package variables are initialised, so a value
// stays the same for as long as the process runs and is regenerated on the
// next start. They are either used directly as an alert ID or combined with a
// resource identifier by the alertIDFor* helpers.
var (
alertAccountRefillID = frand.Entropy256() // constant until restarted
alertLowBalanceID = frand.Entropy256() // constant until restarted
alertMigrationID = frand.Entropy256() // constant until restarted
alertRenewalFailedID = frand.Entropy256() // constant until restarted
)

// alertIDForAccount derives the ID of an account-specific alert by hashing
// the category identifier alertID followed by the bytes of the account id.
func alertIDForAccount(alertID [32]byte, id rhpv3.Account) types.Hash256 {
	preimage := make([]byte, 0, len(alertID)+len(id))
	preimage = append(preimage, alertID[:]...)
	preimage = append(preimage, id[:]...)
	return types.HashBytes(preimage)
}

// alertIDForContract derives the ID of a contract-specific alert by hashing
// the category identifier alertID followed by the bytes of the contract's ID.
func alertIDForContract(alertID [32]byte, contract api.ContractMetadata) types.Hash256 {
	preimage := make([]byte, 0, len(alertID)+len(contract.ID))
	preimage = append(preimage, alertID[:]...)
	preimage = append(preimage, contract.ID[:]...)
	return types.HashBytes(preimage)
}

// alertIDForSlab derives the ID of a slab-specific alert by hashing the
// category identifier alertID followed by the string form of the slab's key.
func alertIDForSlab(alertID [32]byte, slab object.Slab) types.Hash256 {
	key := slab.Key.String()
	preimage := make([]byte, 0, len(alertID)+len(key))
	preimage = append(preimage, alertID[:]...)
	preimage = append(preimage, key...)
	return types.HashBytes(preimage)
}

// randomAlertID returns a fresh ID obtained from frand.Entropy256(); it is
// used for alerts that are not tied to a fixed category or resource.
func randomAlertID() types.Hash256 {
	var id types.Hash256 = frand.Entropy256()
	return id
}

// RegisterAlert registers the alert a through the autopilot's alerter. A
// failure to register is logged and otherwise ignored; nothing is returned to
// the caller.
func (ap *Autopilot) RegisterAlert(ctx context.Context, a alerts.Alert) {
	err := ap.alerts.RegisterAlert(ctx, a)
	if err == nil {
		return
	}
	ap.logger.Errorf("failed to register alert: %v", err)
}

// DismissAlert dismisses the alert with the given id through the autopilot's
// alerter. A failure to dismiss is logged and otherwise ignored; nothing is
// returned to the caller.
func (ap *Autopilot) DismissAlert(ctx context.Context, id types.Hash256) {
	err := ap.alerts.DismissAlerts(ctx, id)
	if err == nil {
		return
	}
	ap.logger.Errorf("failed to dismiss alert: %v", err)
}

// newAccountLowBalanceAlert builds the "Wallet is low on funds" alert. It uses
// the fixed alertLowBalanceID, so there is a single alert of this kind.
//
// The severity depends on how close the block height bh is to endHeight:
//   - critical when bh plus half of renewWindow has reached endHeight,
//   - warning when bh plus the full renewWindow has reached endHeight,
//   - info otherwise.
//
// address, balance and allowance are attached as alert data together with a
// human-readable hint.
func newAccountLowBalanceAlert(address types.Address, balance, allowance types.Currency, bh, renewWindow, endHeight uint64) alerts.Alert {
	severity := alerts.SeverityInfo
	switch {
	case bh+renewWindow/2 >= endHeight:
		severity = alerts.SeverityCritical
	case bh+renewWindow >= endHeight:
		severity = alerts.SeverityWarning
	}

	hint := fmt.Sprintf("The current wallet balance of %v is less than the configured allowance of %v. Ideally, a wallet holds at least one allowance worth of funds to make sure it can renew all its contracts.", balance, allowance)

	return alerts.Alert{
		ID:       alertLowBalanceID,
		Severity: severity,
		Message:  "Wallet is low on funds",
		Data: map[string]any{
			"address":   address,
			"balance":   balance,
			"allowance": allowance,
			"hint":      hint,
		},
		Timestamp: time.Now(),
	}
}

// newAccountRefillAlert builds the error-severity alert registered when
// refilling the ephemeral account id, funded through contract, has failed. The
// alert ID is derived from alertAccountRefillID and the account id, so each
// account has its own refill alert.
//
// The alert data holds the error message, the account ID, the contract ID and
// the host key, followed by every key/value pair carried by err. Pairs are
// written last and therefore replace a default entry that uses the same key.
func newAccountRefillAlert(id rhpv3.Account, contract api.ContractMetadata, err refillError) alerts.Alert {
	// Record the message rather than the refillError value itself: the struct
	// only has unexported fields, which encoding/json would render as "{}",
	// hiding the cause from whoever reads the alert (presumably the data is
	// JSON-encoded when the alert is handed to the alerter).
	var errMsg string
	if err.err != nil {
		errMsg = err.err.Error()
	}

	data := map[string]interface{}{
		"error":      errMsg,
		"accountID":  id.String(),
		"contractID": contract.ID.String(),
		"hostKey":    contract.HostKey.String(),
	}

	// keysAndValues alternates key, value, key, value, ... The loop condition
	// stops before a trailing key that has no value so an odd-length slice
	// cannot cause an out-of-range panic.
	for i := 0; i+1 < len(err.keysAndValues); i += 2 {
		data[fmt.Sprint(err.keysAndValues[i])] = err.keysAndValues[i+1]
	}

	return alerts.Alert{
		ID:        alertIDForAccount(alertAccountRefillID, id),
		Severity:  alerts.SeverityError,
		Message:   "Ephemeral account refill failed",
		Data:      data,
		Timestamp: time.Now(),
	}
}

// newContractRenewalFailedAlert builds the alert registered when renewing
// contract has failed with err. The alert ID is derived from
// alertRenewalFailedID and the contract ID, so each contract has its own
// renewal alert.
//
// The severity is warning, raised to critical when interrupted is true. The
// alert data holds the error message, the interrupted flag, the contract ID
// and the host key.
func newContractRenewalFailedAlert(contract api.ContractMetadata, interrupted bool, err error) alerts.Alert {
	severity := alerts.SeverityWarning
	if interrupted {
		severity = alerts.SeverityCritical
	}

	// Record the message rather than the error value: most error
	// implementations have no exported fields and are rendered as "{}" by
	// encoding/json, which would hide the cause (presumably the data is
	// JSON-encoded when the alert is handed to the alerter). A nil err yields
	// an empty message.
	var errMsg string
	if err != nil {
		errMsg = err.Error()
	}

	return alerts.Alert{
		ID:       alertIDForContract(alertRenewalFailedID, contract),
		Severity: severity,
		Message:  "Contract renewal failed",
		Data: map[string]interface{}{
			"error":               errMsg,
			"renewalsInterrupted": interrupted,
			"contractID":          contract.ID.String(),
			"hostKey":             contract.HostKey.String(),
		},
		Timestamp: time.Now(),
	}
}

// newContractSetChangeAlert builds the info-severity "Contract set changed"
// alert for the contract set called name. Every call gets a new random ID.
//
// The alert data holds the set name, the added and removed counts, the
// removedReasons map under the "removals" key, and a hint about churn.
func newContractSetChangeAlert(name string, added, removed int, removedReasons map[string]string) alerts.Alert {
	data := map[string]any{
		"name":     name,
		"added":    added,
		"removed":  removed,
		"removals": removedReasons,
		"hint":     "A high churn rate can lead to a lot of unnecessary migrations, it might be necessary to tweak your configuration depending on the reason hosts are being discarded from the set.",
	}

	return alerts.Alert{
		ID:        randomAlertID(),
		Severity:  alerts.SeverityInfo,
		Message:   "Contract set changed",
		Data:      data,
		Timestamp: time.Now(),
	}
}

// newOngoingMigrationsAlert builds the info-severity alert announcing that n
// slabs are being migrated. It uses the fixed alertMigrationID and carries no
// additional data.
func newOngoingMigrationsAlert(n int) alerts.Alert {
	message := fmt.Sprintf("Migrating %d slabs", n)

	return alerts.Alert{
		ID:        alertMigrationID,
		Severity:  alerts.SeverityInfo,
		Message:   message,
		Timestamp: time.Now(),
	}
}

// newSlabMigrationFailedAlert builds the alert registered when migrating slab
// has failed with err. The alert ID is derived from alertMigrationID and the
// slab key, so each slab has its own migration alert.
//
// The severity is warning, raised to critical when health is below 0.5. The
// alert data holds the error message, the health, the slab key and a hint.
func newSlabMigrationFailedAlert(slab object.Slab, health float64, err error) alerts.Alert {
	severity := alerts.SeverityWarning
	if health < 0.5 {
		severity = alerts.SeverityCritical
	}

	// Record the message rather than the error value: most error
	// implementations have no exported fields and are rendered as "{}" by
	// encoding/json, which would hide the cause (presumably the data is
	// JSON-encoded when the alert is handed to the alerter). A nil err yields
	// an empty message.
	var errMsg string
	if err != nil {
		errMsg = err.Error()
	}

	return alerts.Alert{
		ID:       alertIDForSlab(alertMigrationID, slab),
		Severity: severity,
		Message:  "Slab migration failed",
		Data: map[string]interface{}{
			"error":   errMsg,
			"health":  health,
			"slabKey": slab.Key.String(),
			"hint":    "Migration failures can be temporary, but if they persist it can eventually lead to data loss and should therefor be taken very seriously.",
		},
		Timestamp: time.Now(),
	}
}

// newRefreshHealthFailedAlert builds the critical-severity "Health refresh
// failed" alert for err. Every call gets a new random ID. The alert data
// flags that migrations were interrupted and holds the error message.
func newRefreshHealthFailedAlert(err error) alerts.Alert {
	// Record the message rather than the error value: most error
	// implementations have no exported fields and are rendered as "{}" by
	// encoding/json, which would hide the cause (presumably the data is
	// JSON-encoded when the alert is handed to the alerter). A nil err yields
	// an empty message.
	var errMsg string
	if err != nil {
		errMsg = err.Error()
	}

	return alerts.Alert{
		ID:       randomAlertID(),
		Severity: alerts.SeverityCritical,
		Message:  "Health refresh failed",
		Data: map[string]interface{}{
			"migrationsInterrupted": true,
			"error":                 errMsg,
		},
		Timestamp: time.Now(),
	}
}
Loading

0 comments on commit 4dee09d

Please sign in to comment.