diff --git a/api/autopilot.go b/api/autopilot.go index 3c4ceb688..7e3f9b7f7 100644 --- a/api/autopilot.go +++ b/api/autopilot.go @@ -57,9 +57,10 @@ type ( // HostsConfig contains all hosts settings used in the autopilot. HostsConfig struct { - AllowRedundantIPs bool `json:"allowRedundantIPs"` - MaxDowntimeHours uint64 `json:"maxDowntimeHours"` - ScoreOverrides map[types.PublicKey]float64 `json:"scoreOverrides"` + AllowRedundantIPs bool `json:"allowRedundantIPs"` + MaxDowntimeHours uint64 `json:"maxDowntimeHours"` + MinRecentScanFailures uint64 `json:"minRecentScanFailures"` + ScoreOverrides map[types.PublicKey]float64 `json:"scoreOverrides"` } // WalletConfig contains all wallet settings used in the autopilot. diff --git a/autopilot/autopilot.go b/autopilot/autopilot.go index b522c3acd..589a5a5f9 100644 --- a/autopilot/autopilot.go +++ b/autopilot/autopilot.go @@ -590,7 +590,7 @@ func (ap *Autopilot) triggerHandlerPOST(jc jape.Context) { } // New initializes an Autopilot. -func New(id string, bus Bus, workers []Worker, logger *zap.Logger, heartbeat time.Duration, scannerScanInterval time.Duration, scannerBatchSize, scannerMinRecentFailures, scannerNumThreads uint64, migrationHealthCutoff float64, accountsRefillInterval time.Duration, revisionSubmissionBuffer, migratorParallelSlabsPerWorker uint64, revisionBroadcastInterval time.Duration) (*Autopilot, error) { +func New(id string, bus Bus, workers []Worker, logger *zap.Logger, heartbeat time.Duration, scannerScanInterval time.Duration, scannerBatchSize, scannerNumThreads uint64, migrationHealthCutoff float64, accountsRefillInterval time.Duration, revisionSubmissionBuffer, migratorParallelSlabsPerWorker uint64, revisionBroadcastInterval time.Duration) (*Autopilot, error) { ap := &Autopilot{ alerts: alerts.WithOrigin(bus, fmt.Sprintf("autopilot.%s", id)), id: id, @@ -603,7 +603,6 @@ func New(id string, bus Bus, workers []Worker, logger *zap.Logger, heartbeat tim scanner, err := newScanner( ap, scannerBatchSize, - 
scannerMinRecentFailures, scannerNumThreads, scannerScanInterval, scannerTimeoutInterval, diff --git a/autopilot/hostscore_test.go b/autopilot/hostscore_test.go index c352f9339..8246cd499 100644 --- a/autopilot/hostscore_test.go +++ b/autopilot/hostscore_test.go @@ -26,7 +26,8 @@ var cfg = api.AutopilotConfig{ Set: api.DefaultAutopilotID, }, Hosts: api.HostsConfig{ - MaxDowntimeHours: 24 * 7 * 2, + MaxDowntimeHours: 24 * 7 * 2, + MinRecentScanFailures: 10, }, Wallet: api.WalletConfig{ DefragThreshold: 1000, diff --git a/autopilot/scanner.go b/autopilot/scanner.go index 7f04fe116..a0bbbce49 100644 --- a/autopilot/scanner.go +++ b/autopilot/scanner.go @@ -40,10 +40,9 @@ type ( ap *Autopilot wg sync.WaitGroup - scanBatchSize uint64 - scanThreads uint64 - scanMinInterval time.Duration - scanMinRecentFailures uint64 + scanBatchSize uint64 + scanThreads uint64 + scanMinInterval time.Duration timeoutMinInterval time.Duration timeoutMinTimeout time.Duration @@ -119,7 +118,7 @@ func (t *tracker) timeout() time.Duration { return time.Duration(percentile) * time.Millisecond } -func newScanner(ap *Autopilot, scanBatchSize, scanMinRecentFailures, scanThreads uint64, scanMinInterval, timeoutMinInterval, timeoutMinTimeout time.Duration) (*scanner, error) { +func newScanner(ap *Autopilot, scanBatchSize, scanThreads uint64, scanMinInterval, timeoutMinInterval, timeoutMinTimeout time.Duration) (*scanner, error) { if scanBatchSize == 0 { return nil, errors.New("scanner batch size has to be greater than zero") } @@ -139,10 +138,9 @@ func newScanner(ap *Autopilot, scanBatchSize, scanMinRecentFailures, scanThreads interruptScanChan: make(chan struct{}), - scanBatchSize: scanBatchSize, - scanThreads: scanThreads, - scanMinInterval: scanMinInterval, - scanMinRecentFailures: scanMinRecentFailures, + scanBatchSize: scanBatchSize, + scanThreads: scanThreads, + scanMinInterval: scanMinInterval, timeoutMinInterval: timeoutMinInterval, timeoutMinTimeout: timeoutMinTimeout, @@ -193,7 +191,9 @@ 
func (s *scanner) tryPerformHostScan(ctx context.Context, w scanWorker, force bo s.mu.Unlock() s.logger.Infof("%s started", scanType) - maxDowntime := time.Duration(s.ap.State().cfg.Hosts.MaxDowntimeHours) * time.Hour + hostCfg := s.ap.State().cfg.Hosts + maxDowntime := time.Duration(hostCfg.MaxDowntimeHours) * time.Hour + minRecentScanFailures := hostCfg.MinRecentScanFailures s.wg.Add(1) go func(st string) { @@ -212,7 +212,7 @@ func (s *scanner) tryPerformHostScan(ctx context.Context, w scanWorker, force bo if !interrupted && maxDowntime > 0 { s.logger.Debugf("removing hosts that have been offline for more than %v", maxDowntime) - removed, err := s.bus.RemoveOfflineHosts(ctx, s.scanMinRecentFailures, maxDowntime) + removed, err := s.bus.RemoveOfflineHosts(ctx, minRecentScanFailures, maxDowntime) if err != nil { s.logger.Errorf("error occurred while removing offline hosts, err: %v", err) } else if removed > 0 { diff --git a/bus/bus.go b/bus/bus.go index 014de6bcd..656524bc3 100644 --- a/bus/bus.go +++ b/bus/bus.go @@ -597,6 +597,10 @@ func (b *bus) hostsRemoveHandlerPOST(jc jape.Context) { jc.Error(errors.New("maxDowntime must be non-zero"), http.StatusBadRequest) return } + if hrr.MinRecentScanFailures == 0 { + jc.Error(errors.New("minRecentScanFailures must be non-zero"), http.StatusBadRequest) + return + } removed, err := b.hdb.RemoveOfflineHosts(jc.Request.Context(), hrr.MinRecentScanFailures, time.Duration(hrr.MaxDowntimeHours)) if jc.Check("couldn't remove offline hosts", err) != nil { return diff --git a/cmd/renterd/main.go b/cmd/renterd/main.go index 0d2907b7f..a9802f88e 100644 --- a/cmd/renterd/main.go +++ b/cmd/renterd/main.go @@ -101,7 +101,6 @@ var ( RevisionBroadcastInterval: 7 * 24 * time.Hour, ScannerBatchSize: 1000, ScannerInterval: 24 * time.Hour, - ScannerMinRecentFailures: 10, ScannerNumThreads: 100, MigratorParallelSlabsPerWorker: 1, }, @@ -302,7 +301,6 @@ func main() { flag.DurationVar(&cfg.Autopilot.RevisionBroadcastInterval, 
"autopilot.revisionBroadcastInterval", cfg.Autopilot.RevisionBroadcastInterval, "interval at which the autopilot broadcasts contract revisions to be mined - can be overwritten using the RENTERD_AUTOPILOT_REVISION_BROADCAST_INTERVAL environment variable - setting it to 0 will disable this feature") flag.Uint64Var(&cfg.Autopilot.ScannerBatchSize, "autopilot.scannerBatchSize", cfg.Autopilot.ScannerBatchSize, "size of the batch with which hosts are scanned") flag.DurationVar(&cfg.Autopilot.ScannerInterval, "autopilot.scannerInterval", cfg.Autopilot.ScannerInterval, "interval at which hosts are scanned") - flag.Uint64Var(&cfg.Autopilot.ScannerMinRecentFailures, "autopilot.scannerMinRecentFailures", cfg.Autopilot.ScannerMinRecentFailures, "minimum amount of consesutive failed scans a host must have before it is removed for exceeding the max downtime") flag.Uint64Var(&cfg.Autopilot.ScannerNumThreads, "autopilot.scannerNumThreads", cfg.Autopilot.ScannerNumThreads, "number of threads that scan hosts") flag.Uint64Var(&cfg.Autopilot.MigratorParallelSlabsPerWorker, "autopilot.migratorParallelSlabsPerWorker", cfg.Autopilot.MigratorParallelSlabsPerWorker, "number of slabs that the autopilot migrates in parallel per worker. 
Can be overwritten using the RENTERD_MIGRATOR_PARALLEL_SLABS_PER_WORKER environment variable") flag.BoolVar(&cfg.Autopilot.Enabled, "autopilot.enabled", cfg.Autopilot.Enabled, "enable/disable the autopilot - can be overwritten using the RENTERD_AUTOPILOT_ENABLED environment variable") diff --git a/config/config.go b/config/config.go index 83f3586be..b5e50d26d 100644 --- a/config/config.go +++ b/config/config.go @@ -111,7 +111,6 @@ type ( RevisionSubmissionBuffer uint64 `yaml:"revisionSubmissionBuffer"` ScannerInterval time.Duration `yaml:"scannerInterval"` ScannerBatchSize uint64 `yaml:"scannerBatchSize"` - ScannerMinRecentFailures uint64 `yaml:"scannerMinRecentFailures"` ScannerNumThreads uint64 `yaml:"scannerNumThreads"` MigratorParallelSlabsPerWorker uint64 `yaml:"migratorParallelSlabsPerWorker"` } diff --git a/internal/node/node.go b/internal/node/node.go index ba0f0aa9a..73eb0af58 100644 --- a/internal/node/node.go +++ b/internal/node/node.go @@ -176,7 +176,7 @@ func NewWorker(cfg config.Worker, b worker.Bus, seed types.PrivateKey, l *zap.Lo } func NewAutopilot(cfg AutopilotConfig, b autopilot.Bus, workers []autopilot.Worker, l *zap.Logger) (http.Handler, RunFn, ShutdownFn, error) { - ap, err := autopilot.New(cfg.ID, b, workers, l, cfg.Heartbeat, cfg.ScannerInterval, cfg.ScannerBatchSize, cfg.ScannerMinRecentFailures, cfg.ScannerNumThreads, cfg.MigrationHealthCutoff, cfg.AccountsRefillInterval, cfg.RevisionSubmissionBuffer, cfg.MigratorParallelSlabsPerWorker, cfg.RevisionBroadcastInterval) + ap, err := autopilot.New(cfg.ID, b, workers, l, cfg.Heartbeat, cfg.ScannerInterval, cfg.ScannerBatchSize, cfg.ScannerNumThreads, cfg.MigrationHealthCutoff, cfg.AccountsRefillInterval, cfg.RevisionSubmissionBuffer, cfg.MigratorParallelSlabsPerWorker, cfg.RevisionBroadcastInterval) if err != nil { return nil, nil, nil, err } diff --git a/internal/testing/cluster.go b/internal/testing/cluster.go index 6b7bdca27..23be69419 100644 --- a/internal/testing/cluster.go +++ 
b/internal/testing/cluster.go @@ -62,8 +62,9 @@ var ( Set: testContractSet, }, Hosts: api.HostsConfig{ - MaxDowntimeHours: 10, - AllowRedundantIPs: true, // allow for integration tests by default + MaxDowntimeHours: 10, + MinRecentScanFailures: 10, + AllowRedundantIPs: true, // allow for integration tests by default }, } @@ -960,7 +961,6 @@ func testApCfg() node.AutopilotConfig { ScannerInterval: time.Second, ScannerBatchSize: 10, ScannerNumThreads: 1, - ScannerMinRecentFailures: 5, }, } } diff --git a/stores/autopilot_test.go b/stores/autopilot_test.go index 1d8d5262a..6ea62edd7 100644 --- a/stores/autopilot_test.go +++ b/stores/autopilot_test.go @@ -37,8 +37,9 @@ func TestAutopilotStore(t *testing.T) { Set: testContractSet, }, Hosts: api.HostsConfig{ - MaxDowntimeHours: 10, - AllowRedundantIPs: true, // allow for integration tests by default + MaxDowntimeHours: 10, + MinRecentScanFailures: 10, + AllowRedundantIPs: true, // allow for integration tests by default }, Wallet: api.WalletConfig{ DefragThreshold: 1234, diff --git a/stores/migrations.go b/stores/migrations.go index 82fdaa449..998bcec78 100644 --- a/stores/migrations.go +++ b/stores/migrations.go @@ -273,6 +273,12 @@ func performMigrations(db *gorm.DB, logger *zap.SugaredLogger) error { return performMigration00022_extendObjectID(tx, logger) }, }, + { + ID: "00023_defaultMinRecentScanFailures", + Migrate: func(tx *gorm.DB) error { + return performMigration00023_defaultMinRecentScanFailures(tx, logger) + }, + }, } // Create migrator. 
m := gormigrate.New(db, gormigrate.DefaultOptions, migrations)
@@ -995,3 +1001,36 @@ func performMigration00022_extendObjectID(txn *gorm.DB, logger *zap.SugaredLogge
 	logger.Info("migration 00022_extendObjectID complete")
 	return nil
 }
+
+// performMigration00023_defaultMinRecentScanFailures sets
+// Hosts.MinRecentScanFailures to 10 on every fetched autopilot whose config
+// still has it at zero and saves each autopilot it changes. The value 10
+// matches the default of the removed autopilot.scannerMinRecentFailures flag.
+// It returns the first error encountered while fetching or saving.
+func performMigration00023_defaultMinRecentScanFailures(txn *gorm.DB, logger *zap.SugaredLogger) error {
+	logger.Info("performing migration 00023_defaultMinRecentScanFailures")
+
+	// fetch the dbAutopilot records
+	var stored []dbAutopilot
+	err := txn.Model(&dbAutopilot{}).Find(&stored).Error
+	if err != nil {
+		return err
+	}
+
+	for _, ap := range stored {
+		// leave autopilots that already carry a non-zero value untouched
+		if ap.Config.Hosts.MinRecentScanFailures != 0 {
+			continue
+		}
+
+		ap.Config.Hosts.MinRecentScanFailures = 10
+		if err := txn.Save(&ap).Error; err != nil {
+			logger.Errorf("failed to set default value for MinRecentScanFailures on autopilot '%v', err: %v", ap.Identifier, err)
+			return err
+		}
+		logger.Debugf("successfully defaulted MinRecentScanFailures to 10 on autopilot '%v'", ap.Identifier)
+	}
+
+	logger.Info("migration 00023_defaultMinRecentScanFailures complete")
+	return nil
+}