6 changes: 6 additions & 0 deletions CHANGELOG.md
@@ -34,10 +34,16 @@ and this project adheres to
[#1588](https://github.com/OpenFn/lightning/issues/1588)

### Fixed

- Tooltip gets stuck when switching pages
[#3559](https://github.com/OpenFn/lightning/pull/3559)
- Current run dataclip stuck when switching nodes
[#3560](https://github.com/OpenFn/lightning/pull/3560)
- Release `:claimed` runs back to the `:available` state if their `run_channel`
  is never joined because of worker timeout or network issues. This is safe
  because, if the run channel is never joined, the worker never receives any
  data and cannot have started the run (see the sketch below).
[#3565](https://github.com/OpenFn/lightning/issues/3565)

## [v2.14.4] - 2025-09-09

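To picture the behaviour described in the `Fixed` entry above, here is a minimal sketch of the claim/join-timeout flow. It assumes a watcher process on the worker socket, and assumes `Lightning.subscribe/1` exists as the counterpart to the `Lightning.broadcast/2` call added in run_channel.ex below; the module name and message handling are illustrative, not the implementation in this PR.

```elixir
# Hypothetical sketch: track claimed runs and release any whose run channel
# is never joined within the configured timeout.
defmodule Sketch.ClaimWatcher do
  use GenServer

  def start_link({worker_id, claimed_runs}),
    do: GenServer.start_link(__MODULE__, {worker_id, claimed_runs})

  @impl true
  def init({worker_id, claimed_runs}) do
    # Assumed PubSub subscription; the run channel broadcasts to this topic.
    Lightning.subscribe("worker_channel:#{worker_id}")

    timeout_ms = Lightning.Config.run_channel_join_timeout_seconds() * 1000
    Process.send_after(self(), :join_timeout, timeout_ms)

    {:ok, %{pending: Map.new(claimed_runs, &{&1.id, &1})}}
  end

  @impl true
  def handle_info({:run_channel_joined, run_id, _worker_id}, state) do
    # The worker joined this run's channel in time; stop tracking it.
    {:noreply, update_in(state.pending, &Map.delete(&1, run_id))}
  end

  def handle_info(:join_timeout, %{pending: pending} = state) do
    # Anything still pending goes back to :available for another worker.
    {:ok, _count} = Lightning.Runs.rollback_claimed_runs(Map.values(pending))
    {:noreply, %{state | pending: %{}}}
  end
end
```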
26 changes: 18 additions & 8 deletions lib/lightning/config.ex
@@ -61,12 +61,12 @@ defmodule Lightning.Config do
end

@impl true
def grace_period do
def grace_period_seconds do
Application.get_env(:lightning, :run_grace_period_seconds)
end

@impl true
def default_max_run_duration do
def default_max_run_duration_seconds do
Application.get_env(:lightning, :max_run_duration_seconds)
end

@@ -324,6 +324,11 @@ defmodule Lightning.Config do
Application.get_env(:lightning, :per_workflow_claim_limit, 50)
end

@impl true
def run_channel_join_timeout_seconds do
Application.get_env(:lightning, :run_channel_join_timeout_seconds, 30)
end

@impl true
def metrics_run_performance_age_seconds do
metrics_config() |> Keyword.get(:run_performance_age_seconds)
@@ -397,11 +402,11 @@
@callback apollo(key :: atom() | nil) :: map()
@callback check_flag?(atom()) :: boolean() | nil
@callback cors_origin() :: list()
@callback default_max_run_duration() :: integer()
@callback default_max_run_duration_seconds() :: integer()
@callback email_sender_name() :: String.t()
@callback get_extension_mod(key :: atom()) :: any()
@callback google(key :: atom()) :: any()
@callback grace_period() :: integer()
@callback grace_period_seconds() :: integer()
@callback instance_admin_email() :: String.t()
@callback kafka_alternate_storage_enabled?() :: boolean()
@callback kafka_alternate_storage_file_path() :: String.t()
@@ -447,6 +452,7 @@
@callback external_metrics_module() :: module() | nil
@callback ai_assistant_modes() :: %{atom() => module()}
@callback per_workflow_claim_limit() :: pos_integer()
@callback run_channel_join_timeout_seconds() :: pos_integer()
@callback sentry() :: module()
@callback webhook_retry() :: Keyword.t()
@callback webhook_retry(key :: atom()) :: any()
@@ -493,15 +499,15 @@

The returned value is in seconds.
"""
def grace_period do
impl().grace_period()
def grace_period_seconds do
impl().grace_period_seconds()
end

@doc """
Returns the default maximum run duration in seconds.
"""
def default_max_run_duration do
impl().default_max_run_duration()
def default_max_run_duration_seconds do
impl().default_max_run_duration_seconds()
end

def repo_connection_token_signer do
@@ -684,6 +690,10 @@
impl().per_workflow_claim_limit()
end

def run_channel_join_timeout_seconds do
impl().run_channel_join_timeout_seconds()
end

def sentry do
impl().sentry()
end
4 changes: 4 additions & 0 deletions lib/lightning/config/bootstrap.ex
@@ -278,6 +278,10 @@ defmodule Lightning.Config.Bootstrap do
:max_run_duration_seconds,
env!("WORKER_MAX_RUN_DURATION_SECONDS", :integer, 300)

config :lightning,
:run_channel_join_timeout_seconds,
env!("RUN_CHANNEL_JOIN_TIMEOUT_SECONDS", :integer, 30)

config :lightning,
:max_dataclip_size_bytes,
env!("MAX_DATACLIP_SIZE_MB", :integer, 10) * 1_000_000
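For orientation, a minimal sketch of how the new setting flows from the environment to callers, assuming Bootstrap has run and `RUN_CHANNEL_JOIN_TIMEOUT_SECONDS` is unset (the `#=>` values are the defaults shown in this diff; the session itself is hypothetical):

```elixir
# Bootstrap writes the env var (or its 30-second default) into the app env:
Application.get_env(:lightning, :run_channel_join_timeout_seconds)
#=> 30

# Callers read it through the behaviour-backed facade added in config.ex,
# which itself falls back to 30 if the key is missing:
Lightning.Config.run_channel_join_timeout_seconds()
#=> 30
```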
2 changes: 1 addition & 1 deletion lib/lightning/extensions/usage_limiter.ex
@@ -17,7 +17,7 @@ defmodule Lightning.Extensions.UsageLimiter do
def get_run_options(context) do
[
save_dataclips: Lightning.Projects.save_dataclips?(context.project_id),
run_timeout_ms: Lightning.Config.default_max_run_duration() * 1000
run_timeout_ms: Lightning.Config.default_max_run_duration_seconds() * 1000
]
end

25 changes: 25 additions & 0 deletions lib/lightning/runs.ex
@@ -345,4 +345,29 @@ defmodule Lightning.Runs do
|> order_by([{^order, :timestamp}])
|> Repo.stream()
end

@doc """
Rolls back claimed runs to available state.

This is used when a worker socket disconnects after runs have been claimed
but before the worker receives the response. The runs are set back to :available
so they can be claimed by another worker.
"""
@spec rollback_claimed_runs([Run.t()]) :: {:ok, non_neg_integer()}
def rollback_claimed_runs(runs) do
# Set the runs back to :available state so they can be claimed by another worker
run_ids = Enum.map(runs, & &1.id)

{count, _} =
from(r in Run, where: r.id in ^run_ids)
|> Repo.update_all(
Collaborator: @taylordowns2000 To avoid race conditions, it may be worth locking each record, checking that the record should be rolled back and then rolling it back. (A sketch of this locking approach appears after this file's diff.)
set: [state: :available, claimed_at: nil, worker_name: nil]
)

Logger.info(
"Successfully rolled back #{count} claimed runs to :available state"
)

{:ok, count}
end
end
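As a companion to the inline review comment above, here is a hedged sketch of the suggested lock-and-check variant. It is not code from this PR: the function name, the `FOR UPDATE` lock, and the assumption that only `:claimed` runs may be rolled back are illustrative, and it assumes the same `import Ecto.Query`, `Run`, `Repo`, and `Logger` setup as `Lightning.Runs`.

```elixir
# Hypothetical variant: re-check state under a row lock so a run a worker has
# since started cannot be flipped back to :available concurrently.
def rollback_claimed_runs_checked(runs) do
  run_ids = Enum.map(runs, & &1.id)

  Repo.transaction(fn ->
    still_claimed_ids =
      from(r in Run,
        where: r.id in ^run_ids and r.state == ^:claimed,
        lock: "FOR UPDATE",
        select: r.id
      )
      |> Repo.all()

    {count, _} =
      from(r in Run, where: r.id in ^still_claimed_ids)
      |> Repo.update_all(
        set: [state: :available, claimed_at: nil, worker_name: nil]
      )

    Logger.info("Rolled back #{count} runs that were still :claimed")
    count
  end)
end
```

Like the function above, this returns `{:ok, count}`, but only counts runs that were still `:claimed` at the moment of the locked check.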
9 changes: 5 additions & 4 deletions lib/lightning/runs/query.ex
@@ -21,15 +21,16 @@ defmodule Lightning.Runs.Query do
def lost do
now = Lightning.current_time()

grace_period_ms = Lightning.Config.grace_period() * 1000
grace_period_seconds = Lightning.Config.grace_period_seconds()
grace_period_ms = grace_period_seconds * 1000

# TODO: Remove after live deployment rollouts are done. ====================
fallback_max = Lightning.Config.default_max_run_duration()
fallback_max_seconds = Lightning.Config.default_max_run_duration_seconds()

fallback_oldest_claim =
now
|> DateTime.add(-fallback_max, :second)
|> DateTime.add(-grace_period_ms, :millisecond)
|> DateTime.add(-fallback_max_seconds, :second)
|> DateTime.add(-grace_period_seconds, :second)

# ==========================================================================

2 changes: 1 addition & 1 deletion lib/lightning/workers.ex
@@ -82,7 +82,7 @@ defmodule Lightning.Workers do
defp calculate_token_expiry(run_timeout_ms) do
Lightning.current_time()
|> DateTime.add(run_timeout_ms, :millisecond)
|> DateTime.add(Lightning.Config.grace_period())
|> DateTime.add(Lightning.Config.grace_period_seconds(), :second)
|> DateTime.to_unix()
end

14 changes: 14 additions & 0 deletions lib/lightning_web/channels/run_channel.ex
@@ -31,6 +31,20 @@ defmodule LightningWeb.RunChannel do
Runs.get_project_id_for_run(run) do
Sentry.Context.set_extra_context(%{run_id: id})

# Notify the worker channel that this run channel has been joined
# Use PubSub for cross-node communication in clustered environments
case socket.assigns[:worker_id] do
Collaborator: @taylordowns2000 run_channel_test.exs does not seem to have been updated to cover this additional functionality? (A hypothetical test sketch appears after this file's diff.)

nil ->
# No worker ID available, continue normally
:ok

worker_id ->
Collaborator: @taylordowns2000 Are we vulnerable to a race condition here? E.g., we retrieve the run, broadcast a message that the timer can be reset, and then send data back to the worker? In the gap between the broadcast and something acting on it, the timeout could kick in, unclaiming the run and potentially allowing a situation where two workers are running it.

This feels like the 'ack-ack' scenario (although we can use the run channel response as the 'ack-ack'), and I think that addresses your question about the Worker A / Worker B scenario.

When a worker joins the run channel, I think that Lightning should confirm that the run is still claimed, lock the run and apply a change that makes rollback impossible (it feels like the responsible way to do this would be to introduce a new state), and then only send the :ok back to the worker.

If these conditions are not met, the Worker gets an error. (A sketch of this confirm-on-join idea appears after this file's diff.)

Member Author: @rorymckinley, would this be handled better by ensuring that a run channel can only be joined once? Or by a single worker? (This was a thought I had in the initial PR.)

Member Author: Eish, or would that prevent workers from re-connecting to the run channel if they got temporarily disconnected?

Collaborator: @taylordowns2000 Even if a single-connect is an option, I do not think it would solve the problem entirely, and it may introduce a new problem:

Showing my workings, apologies for the repetition of what I said before.

I can't see any obvious place where we are checking the state of a run before we send stuff back to the worker (probably never needed it before?) - so race 1 would be that the run has already been rolled back (essentially Worker A sans Worker B). A single connect does not seem to solve this.

Race 2 would be that the run has not been rolled back, but in between the broadcast and the action the run gets rolled back. Using the Highlander approach you suggest would prevent another worker picking up the run once it has rolled back, but:

  • The first worker would think that it is ok to work on this, but Lightning thinks it is available. What happens when the worker sends through 'started' - would Lightning need a moment to compose itself?
  • If we only allow one connection to the run, would that block legitimate cases where we do want another Worker to connect? So, let us say that Worker A does connect too late: the run has been rolled back but not claimed yet. In this case, we would tell Worker A to fly a kite and we would want Worker B to be able to connect, but the Highlander principle would block it?

Collaborator: And, oops: I forgot to add that I do not know enough about workers to know if it will reconnect mid-run to the run channel - @josephjclark?

Member Author: Eish. Yeah, it does sound like adding another state (:locked?) might be useful. Let's see what @stuartc and @josephjclark think here before I invest more time in this.

Collaborator: :confirmed?

Collaborator: I believe the worker will just reconnect when the connection is lost. The socket should handle this. I'll test locally later and see if I can confirm.

Lightning.broadcast(
"worker_channel:#{worker_id}",
{:run_channel_joined, id, worker_id}
)
end

{:ok,
socket
|> assign(%{
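To make the thread above concrete, here is a hedged sketch of the "confirm on join" idea being discussed. It is not code from this PR: the `:confirmed` state, the function name, and the error shapes are assumptions, and it assumes the usual `import Ecto.Query` plus `Repo` and `Run` aliases.

```elixir
# Hypothetical sketch: under a row lock, move a still-:claimed run to a new
# :confirmed state before replying :ok to the worker, so the join-timeout
# rollback can no longer apply to it.
def confirm_claimed_run(run_id) do
  Repo.transaction(fn ->
    run =
      from(r in Run, where: r.id == ^run_id, lock: "FOR UPDATE")
      |> Repo.one()

    case run do
      %Run{state: :claimed} = run ->
        run
        |> Ecto.Changeset.change(state: :confirmed)
        |> Repo.update!()

      %Run{state: state} ->
        # Already rolled back, started elsewhere, or finished.
        Repo.rollback({:not_claimed, state})

      nil ->
        Repo.rollback(:not_found)
    end
  end)
end
```

In `join/3`, the channel would then only reply `:ok` (and broadcast) when this returns `{:ok, run}`, and reply an error to the worker otherwise.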
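And for the earlier comment about test coverage, a hypothetical shape such a test could take. The `join_run_channel/2` helper and the `worker_id` assign stand in for whatever setup run_channel_test.exs already uses, and `Lightning.subscribe/1` is assumed as the counterpart to `Lightning.broadcast/2`.

```elixir
# Hypothetical test sketch, not part of this PR.
test "joining the run channel notifies the worker channel topic",
     %{socket: socket, run: run} do
  Lightning.subscribe("worker_channel:#{socket.assigns.worker_id}")

  # Stand-in for however the suite already joins the "run:<id>" topic.
  {:ok, _reply, _socket} = join_run_channel(socket, run)

  assert_receive {:run_channel_joined, run_id, worker_id}
  assert run_id == run.id
  assert worker_id == socket.assigns.worker_id
end
```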