Skip to content

Commit

Permalink
Merge pull request #258 from Shopify/fix_summary_actions
Browse files Browse the repository at this point in the history
Fix action summaries with failures/timeouts
  • Loading branch information
KnVerey committed Apr 3, 2018
2 parents d152372 + 6560b6e commit 2c1abc1
Show file tree
Hide file tree
Showing 7 changed files with 56 additions and 21 deletions.
6 changes: 6 additions & 0 deletions .rubocop.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,9 @@ inherit_from:

AllCops:
TargetRubyVersion: 2.3

Style/TrailingCommaInArrayLiteral:
Enabled: false

Style/TrailingCommaInHashLiteral:
Enabled: false
7 changes: 3 additions & 4 deletions lib/kubernetes-deploy/deploy_task.rb
Original file line number Diff line number Diff line change
Expand Up @@ -158,7 +158,7 @@ def run!(verify_result: true, allow_protected_ns: false, prune: true)
failed_resources = resources.reject(&:deploy_succeeded?)
success = failed_resources.empty?
if !success && failed_resources.all?(&:deploy_timed_out?)
raise DeploymentTimeoutError, failed_resources
raise DeploymentTimeoutError
end
raise FatalDeploymentError unless success
else
Expand All @@ -172,13 +172,12 @@ def run!(verify_result: true, allow_protected_ns: false, prune: true)
end
::StatsD.measure('all_resources.duration', StatsD.duration(start), tags: statsd_tags << "status:success")
@logger.print_summary(:success)
rescue DeploymentTimeoutError => error
@logger.summary.add_action(error.message)
rescue DeploymentTimeoutError
@logger.print_summary(:timed_out)
::StatsD.measure('all_resources.duration', StatsD.duration(start), tags: statsd_tags << "status:timeout")
raise
rescue FatalDeploymentError => error
@logger.summary.add_action(error.message) if error.message.present?
@logger.summary.add_action(error.message) if error.message != error.class.to_s
@logger.print_summary(:failure)
::StatsD.measure('all_resources.duration', StatsD.duration(start), tags: statsd_tags << "status:failed")
raise
Expand Down
8 changes: 1 addition & 7 deletions lib/kubernetes-deploy/errors.rb
Original file line number Diff line number Diff line change
Expand Up @@ -9,13 +9,7 @@ def initialize(name, context)
end
end

class DeploymentTimeoutError < FatalDeploymentError
attr_reader :resources
def initialize(resources)
@resources = resources
super("Timed out waiting for #{@resources.count} resources.")
end
end
class DeploymentTimeoutError < FatalDeploymentError; end

module Errors
extend self
Expand Down
13 changes: 12 additions & 1 deletion lib/kubernetes-deploy/resource_watcher.rb
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,18 @@ def record_statuses_for_summary(resources)
end

if fail_count > 0
@logger.summary.add_action("failed to #{@operation_name} #{fail_count} #{'resource'.pluralize(fail_count)}")
timeouts, failures = failed_resources.partition(&:deploy_timed_out?)
if timeouts.present?
@logger.summary.add_action(
"timed out waiting for #{timeouts.length} #{'resource'.pluralize(timeouts.length)} to #{@operation_name}"
)
end

if failures.present?
@logger.summary.add_action(
"failed to #{@operation_name} #{failures.length} #{'resource'.pluralize(failures.length)}"
)
end
KubernetesDeploy::Concurrency.split_across_threads(failed_resources, &:sync_debug_info)
failed_resources.each { |r| @logger.summary.add_paragraph(r.debug_message) }
end
Expand Down
7 changes: 3 additions & 4 deletions lib/kubernetes-deploy/restart_task.rb
Original file line number Diff line number Diff line change
Expand Up @@ -52,19 +52,18 @@ def perform!(deployments_names = nil)
failed_resources = resources.reject(&:deploy_succeeded?)
success = failed_resources.empty?
if !success && failed_resources.all?(&:deploy_timed_out?)
raise DeploymentTimeoutError, failed_resources
raise DeploymentTimeoutError
end
raise FatalDeploymentError unless success
::StatsD.measure('restart.duration', StatsD.duration(start), tags: tags('success', deployments))
@logger.print_summary(:success)
rescue DeploymentTimeoutError => error
rescue DeploymentTimeoutError
::StatsD.measure('restart.duration', StatsD.duration(start), tags: tags('timeout', deployments))
@logger.summary.add_action(error.message)
@logger.print_summary(:timed_out)
raise
rescue FatalDeploymentError => error
::StatsD.measure('restart.duration', StatsD.duration(start), tags: tags('failure', deployments))
@logger.summary.add_action(error.message) if error.message.present?
@logger.summary.add_action(error.message) if error.message != error.class.to_s
@logger.print_summary(:failure)
raise
end
Expand Down
34 changes: 30 additions & 4 deletions test/integration/kubernetes_deploy_test.rb
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,10 @@ def test_refuses_deploy_to_protected_namespace_with_override_if_pruning_enabled
generated_ns = @namespace
@namespace = 'default'
assert_deploy_failure(deploy_fixtures("hello-cloud", allow_protected_ns: true, prune: true))
assert_logs_match(/Refusing to deploy to protected namespace 'default' with pruning enabled/)
assert_logs_match_all([
"Configuration invalid",
"- Refusing to deploy to protected namespace 'default' with pruning enabled"
], in_order: true)
ensure
@namespace = generated_ns
end
Expand All @@ -100,7 +103,10 @@ def test_refuses_deploy_to_protected_namespace_without_override
generated_ns = @namespace
@namespace = 'default'
assert_deploy_failure(deploy_fixtures("hello-cloud", prune: false))
assert_logs_match(/Refusing to deploy to protected namespace/)
assert_logs_match_all([
"Configuration invalid",
"- Refusing to deploy to protected namespace"
], in_order: true)
ensure
@namespace = generated_ns
end
Expand Down Expand Up @@ -301,6 +307,7 @@ def test_deployment_container_mounting_secret_that_does_not_exist_as_env_var_fai
assert_deploy_failure(result)

assert_logs_match_all([
"Failed to deploy 1 resource",
"Deployment/web: FAILED",
"The following containers are in a state that is unlikely to be recoverable:",
"app: Failed to generate container configuration: secrets \"monitoring-token\" not found",
Expand All @@ -318,6 +325,7 @@ def test_bad_container_image_on_deployment_pod_fails_quickly
assert_deploy_failure(result)

assert_logs_match_all([
"Failed to deploy 1 resource",
"Deployment/cannot-run: FAILED",
"The following containers are in a state that is unlikely to be recoverable:",
"container-cannot-run: Failed to pull image some-invalid-image:badtag.",
Expand All @@ -339,6 +347,7 @@ def test_crashing_container_on_deployment_fails_quickly
assert_deploy_failure(deploy_fixtures("invalid", subset: ["crash_loop.yml"]))

assert_logs_match_all([
"Failed to deploy 1 resource",
"Deployment/crash-loop: FAILED",
"The following containers are in a state that is unlikely to be recoverable:",
"crash-loop-back-off: Crashing repeatedly (exit 1). See logs for more information.",
Expand All @@ -350,6 +359,7 @@ def test_unrunnable_container_on_deployment_pod_fails_quickly
assert_deploy_failure(deploy_fixtures("invalid", subset: ["cannot_run.yml"]))

assert_logs_match_all([
"Failed to deploy 1 resource",
"Deployment/cannot-run: FAILED",
"The following containers are in a state that is unlikely to be recoverable:",
%r{container-cannot-run: Failed to start \(exit 127\): .*/some/bad/path},
Expand Down Expand Up @@ -385,6 +395,7 @@ def test_deployment_with_progress_times_out_for_short_duration
assert_deploy_failure(result, :timed_out)

assert_logs_match_all([
"Successfully deployed 1 resource and timed out waiting for 1 resource to deploy",
'Deployment/undying: TIMED OUT (progress deadline: 10s)',
'Timeout reason: ProgressDeadlineExceeded'
])
Expand Down Expand Up @@ -442,7 +453,7 @@ def test_create_ejson_secrets_with_malformed_secret_data
fixtures["secrets.ejson"]["kubernetes_secrets"]["monitoring-token"]["data"] = malformed
end
assert_deploy_failure(result)
assert_logs_match(/data for secret monitoring-token was invalid/)
assert_logs_match("Creation of kubernetes secrets from ejson failed: data for secret monitoring-token was invalid")
end

def test_pruning_of_secrets_created_from_ejson
Expand Down Expand Up @@ -489,8 +500,20 @@ def test_pruning_of_existing_managed_secrets_when_ejson_file_has_been_deleted
end

def test_deploy_result_logging_for_mixed_result_deploy
result = deploy_fixtures("invalid", subset: ["bad_probe.yml", "init_crash.yml", "missing_volumes.yml"])
result = deploy_fixtures("invalid", subset: ["bad_probe.yml", "init_crash.yml", "missing_volumes.yml"]) do |f|
f["bad_probe.yml"]["ConfigMap"] = {
"apiVersion" => "v1",
"kind" => "ConfigMap",
"metadata" => { "name" => "test" },
"data" => { "datapoint1" => "value1" }
}
end
assert_deploy_failure(result)
assert_logs_match_all([
"Successfully deployed 1 resource, timed out waiting for 2 resources to deploy, and failed to deploy 1 resource",
"Successful resources",
%r{ConfigMap/test\s+Available}
], in_order: true)

# Debug info for bad probe timeout
assert_logs_match_all([
Expand Down Expand Up @@ -706,6 +729,7 @@ def test_output_when_unmanaged_pod_preexists
def test_bad_container_on_daemon_sets_fails
assert_deploy_failure(deploy_fixtures("invalid", subset: ["crash_loop_daemon_set.yml"]))
assert_logs_match_all([
"Failed to deploy 1 resource",
"DaemonSet/crash-loop: FAILED",
"crash-loop-back-off: Crashing repeatedly (exit 1). See logs for more information.",
"Final status: 1 currentNumberScheduled, 1 desiredNumberScheduled, 0 numberReady",
Expand All @@ -728,6 +752,7 @@ def test_bad_container_on_stateful_sets_fails_with_rolling_update

assert_deploy_failure(result)
assert_logs_match_all([
"Failed to deploy 1 resource",
"StatefulSet/stateful-busybox: FAILED",
"app: Crashing repeatedly (exit 1). See logs for more information.",
"Events (common success events excluded):",
Expand Down Expand Up @@ -770,6 +795,7 @@ def test_resource_quotas_are_deployed_first
"Predeploying priority resources",
"Deploying ResourceQuota/resource-quotas (timeout: 30s)",
"Deployment/web rollout timed out",
"Successfully deployed 1 resource and timed out waiting for 1 resource to deploy",
"Successful resources",
"ResourceQuota/resource-quotas",
%r{Deployment/web: TIMED OUT \(progress deadline: \d+s\)},
Expand Down
2 changes: 1 addition & 1 deletion test/integration/restart_task_test.rb
Original file line number Diff line number Diff line change
Expand Up @@ -180,7 +180,7 @@ def test_restart_failure
"Triggered `web` restart",
"Deployment/web rollout timed out",
"Result: TIMED OUT",
"Failed to restart 1 resource",
"Timed out waiting for 1 resource to restart",
"Deployment/web: TIMED OUT",
"The following containers have not passed their readiness probes",
"app must exit 0 from the following command",
Expand Down

0 comments on commit 2c1abc1

Please sign in to comment.