Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix deletion of model in cases of ModeFailed and ScheduleFailed with server disconnects #4882

Merged
merged 4 commits into from
Jun 1, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions scheduler/pkg/agent/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -465,6 +465,14 @@ func (s *Server) removeServerReplicaImpl(serverName string, serverReplicaIdx int
s.logger.Debugf("Failed to reschedule model %s when server %s replica %d disconnected", modelName, serverName, serverReplicaIdx)
}
}
// retry failed models
// this is perhaps counterintuitive, but we want to retry failed models on other servers
// specifically in the case of model state `LoadFailed` and the server replica disconnects, we want to reconcile
// the model state with th new set of active servers
// note that this will also retry `ScheduleFailed`, which is a side effect of calling `ScheduleFailedModels`
if _, err := s.scheduler.ScheduleFailedModels(); err != nil {
s.logger.WithError(err).Errorf("Failed to reschedule failed models when server %s replica %d disconnected", serverName, serverReplicaIdx)
}
}

func (s *Server) drainServerReplicaImpl(serverName string, serverReplicaIdx int) {
Expand Down
2 changes: 1 addition & 1 deletion scheduler/pkg/scheduler/scheduler.go
Original file line number Diff line number Diff line change
Expand Up @@ -134,7 +134,7 @@ func (s *SimpleScheduler) scheduleToServer(modelName string) error {
logger.Debugf("Model %s is deleted ensuring removed", modelName)
err = s.store.UpdateLoadedModels(modelName, latestModel.GetVersion(), server, []*store.ServerReplica{})
if err != nil {
logger.Warnf("Failed to unschedule model replicas for model %s on server %s", modelName, server)
logger.WithError(err).Warnf("Failed to unschedule model replicas for model %s on server %s", modelName, server)
}

} else {
Expand Down
2 changes: 1 addition & 1 deletion scheduler/pkg/store/memory.go
Original file line number Diff line number Diff line change
Expand Up @@ -511,7 +511,7 @@ func (m *MemoryStore) updateModelStateImpl(
modelKey, version, serverKey, replicaIdx, desiredState.String(),
)

// Update models loaded onto replica if loaded or unloaded is state
// Update models loaded onto replica for relevant state
if desiredState == Loaded || desiredState == Loading || desiredState == Unloaded || desiredState == LoadFailed {
server, ok := m.store.servers[serverKey]
if ok {
Expand Down
3 changes: 2 additions & 1 deletion scheduler/pkg/store/memory_status.go
Original file line number Diff line number Diff line change
Expand Up @@ -118,7 +118,8 @@ func (m *MemoryStore) FailedScheduling(modelVersion *ModelVersion, reason string
AvailableReplicas: modelVersion.state.AvailableReplicas,
UnavailableReplicas: modelVersion.GetModel().GetDeploymentSpec().GetReplicas() - modelVersion.state.AvailableReplicas,
}

// make sure we reset server
modelVersion.server = ""
m.eventHub.PublishModelEvent(
modelFailureEventSource,
coordinator.ModelEventMsg{
Expand Down
Loading