Skip to content

Commit

Permalink
Don't consider ContainerCannotRun with a 128 exit code as doomed (#694)
Browse files Browse the repository at this point in the history

* Don't consider `ContainerCannotRun` with a 128 exit code as doomed

* Check state before lastState when testing for doomed pods
  • Loading branch information
DazWorrall committed Feb 19, 2020
1 parent e4e6e34 commit 59da174
Show file tree
Hide file tree
Showing 2 changed files with 52 additions and 8 deletions.
20 changes: 12 additions & 8 deletions lib/krane/kubernetes_resource/pod.rb
Original file line number Diff line number Diff line change
Expand Up @@ -219,21 +219,25 @@ def doom_reason
limbo_reason = @status.dig("state", "waiting", "reason")
limbo_message = @status.dig("state", "waiting", "message")

if @status.dig("lastState", "terminated", "reason") == "ContainerCannotRun"
# ref: https://github.com/kubernetes/kubernetes/blob/562e721ece8a16e05c7e7d6bdd6334c910733ab2/pkg/kubelet/dockershim/docker_container.go#L353
exit_code = @status.dig('lastState', 'terminated', 'exitCode')
"Failed to start (exit #{exit_code}): #{@status.dig('lastState', 'terminated', 'message')}"
elsif @status.dig("state", "terminated", "reason") == "ContainerCannotRun"
exit_code = @status.dig('state', 'terminated', 'exitCode')
"Failed to start (exit #{exit_code}): #{@status.dig('state', 'terminated', 'message')}"
elsif limbo_reason == "CrashLoopBackOff"
if limbo_reason == "CrashLoopBackOff"
exit_code = @status.dig('lastState', 'terminated', 'exitCode')
"Crashing repeatedly (exit #{exit_code}). See logs for more information."
elsif limbo_reason == "ErrImagePull" && limbo_message.match(/not found/i)
"Failed to pull image #{@image}. "\
"Did you wait for it to be built and pushed to the registry before deploying?"
elsif limbo_reason == "CreateContainerConfigError"
"Failed to generate container configuration: #{limbo_message}"
elsif @status.dig("lastState", "terminated", "reason") == "ContainerCannotRun"
# ref: https://github.com/kubernetes/kubernetes/blob/562e721ece8a16e05c7e7d6bdd6334c910733ab2/pkg/kubelet/dockershim/docker_container.go#L353
exit_code = @status.dig('lastState', 'terminated', 'exitCode')
# We've observed failures here that are actually issues with the node or kube infra, and not with the
# container. These issues have been transient and result in a 128 exit code, so do not treat these as fatal.
return if exit_code == 128
"Failed to start (exit #{exit_code}): #{@status.dig('lastState', 'terminated', 'message')}"
elsif @status.dig("state", "terminated", "reason") == "ContainerCannotRun"
exit_code = @status.dig('state', 'terminated', 'exitCode')
return if exit_code == 128
"Failed to start (exit #{exit_code}): #{@status.dig('state', 'terminated', 'message')}"
end
end

Expand Down
40 changes: 40 additions & 0 deletions test/unit/krane/kubernetes_resource/pod_test.rb
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,46 @@ def test_deploy_failed_is_true_for_container_cannot_run_error
assert_equal(expected_msg.strip, pod.failure_message)
end

# A container terminated with reason ContainerCannotRun and exit code 128 must
# not mark the pod as failed: the pod should report neither deploy_failed? nor
# a failure message.
def test_deploy_failed_is_false_for_container_cannot_run_error_with_128_exit_code
  container_state = {
    "state" => {
      "terminated" => {
        # The trailing space keeps the two adjacent literals from being glued
        # together as "clientconnection".
        "message" => "Error: failed to start container 'foo': Error response from daemon: grpc: the client " \
          "connection is closing",
        "reason" => "ContainerCannotRun",
        "exitCode" => 128,
      },
    },
  }
  pod = build_synced_pod(build_pod_template(container_state: container_state))

  refute_predicate(pod, :deploy_failed?)
  assert_nil(pod.failure_message)
end

# When the current state is waiting in CrashLoopBackOff, the pod must be
# reported as failed with the "Crashing repeatedly" message, even though the
# previous termination (lastState) was ContainerCannotRun with exit code 128.
def test_deploy_failed_is_true_for_container_cannot_run_with_crash_loop_backoff
  container_state = {
    "state" => {
      "waiting" => {
        "message" => "Back-off 2m40s restarting failed container=myapp-container pod=myapp-pod_default",
        "reason" => "CrashLoopBackOff",
      },
    },
    "lastState" => {
      "terminated" => {
        # The trailing space keeps the two adjacent literals from being glued
        # together as "clientconnection".
        "message" => "Error: failed to start container 'foo': Error response from daemon: grpc: the client " \
          "connection is closing",
        "reason" => "ContainerCannotRun",
        "exitCode" => 128,
      },
    },
  }
  pod = build_synced_pod(build_pod_template(container_state: container_state))

  assert_predicate(pod, :deploy_failed?)
  assert_includes(pod.failure_message, 'Crashing repeatedly')
end

def test_deploy_failed_is_true_for_evicted_unmanaged_pods
template = pod_spec.merge(
"status" => {
Expand Down

0 comments on commit 59da174

Please sign in to comment.