2 changes: 1 addition & 1 deletion README.md
@@ -53,7 +53,7 @@ Models that are already supported by `vec-inf` would be launched using the cache
#### Other commands

* `batch-launch`: Launch multiple model inference servers at once; currently only single-node models are supported.
* `status`: Check the model status by providing its Slurm job ID.
* `status`: Check the status of all `vec-inf` jobs, or a specific job by providing its job ID.
* `metrics`: Streams performance metrics to the console.
* `shutdown`: Shut down a model by providing its Slurm job ID.
* `list`: List all available model names, or view the default/cached configuration of a specific model.
53 changes: 35 additions & 18 deletions docs/user_guide.md
@@ -149,35 +149,52 @@ Since batch launches use heterogeneous jobs, users can request different partitions

### `status` command

You can check the inference server status by providing the Slurm job ID to the `status` command:
You can check the status of all inference servers launched through `vec-inf` by running the `status` command:
```bash
vec-inf status
```

You should see an output like this:
```
┏━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━┓
┃ Job ID    ┃ Model Name ┃ Status  ┃ Base URL              ┃
┡━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━┩
│ 1434429   │ Qwen3-8B   │ READY   │ http://gpu113:8080/v1 │
│ 1434584   │ Qwen3-14B  │ READY   │ http://gpu053:8080/v1 │
│ 1435035+0 │ Qwen3-32B  │ PENDING │ UNAVAILABLE           │
│ 1435035+1 │ Qwen3-14B  │ PENDING │ UNAVAILABLE           │
└───────────┴────────────┴─────────┴───────────────────────┘
```
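
The same information is available programmatically through the Python client that backs the CLI. Below is a minimal sketch; it assumes `VecInfClient` is importable from `vec_inf.client` and exposes the `fetch_running_jobs()` and `get_status()` methods that the CLI tests exercise:

```python
# A minimal sketch: list the status of all running vec-inf jobs.
# Assumes VecInfClient lives at vec_inf.client (import path may differ)
# and provides fetch_running_jobs() and get_status(job_id).
from vec_inf.client import VecInfClient

client = VecInfClient()
for job_id in client.fetch_running_jobs():
    status = client.get_status(job_id)
    base_url = status.base_url or "UNAVAILABLE"
    print(f"{job_id}: {status.model_name} [{status.server_status}] {base_url}")
```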

If you want to check why a specific job is pending or failing, append the job ID to the `status` command:

```bash
vec-inf status 15373800
vec-inf status 1435035+1
```

If the server is still waiting for resources, you should see an output like this:

```
┏━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
┃ Job Status     ┃ Value                      ┃
┡━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
│ Model Name     │ Meta-Llama-3.1-8B-Instruct │
│ Model Status   │ PENDING                    │
│ Pending Reason │ Resources                  │
│ Base URL       │ UNAVAILABLE                │
└────────────────┴────────────────────────────┘
┏━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┓
┃ Job Status     ┃ Value       ┃
┡━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━┩
│ Model Name     │ Qwen3-14B   │
│ Model Status   │ PENDING     │
│ Pending Reason │ Resources   │
│ Base URL       │ UNAVAILABLE │
└────────────────┴─────────────┘
```
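
If you are scripting around a pending launch, you can poll until the job leaves the `PENDING` state. Here is a sketch under the same assumptions as the client example above; the job ID is taken from the sample output:

```python
import time

from vec_inf.client import VecInfClient  # import path assumed, as above

# Poll a job until it becomes READY; raise if the launch fails.
client = VecInfClient()
job_id = "1435035+1"
while True:
    status = client.get_status(job_id)
    if status.server_status == "READY":
        print(f"Server ready at {status.base_url}")
        break
    if status.server_status == "FAILED":
        raise RuntimeError(f"Launch failed: {status.failed_reason}")
    print(f"Still {status.server_status} ({status.pending_reason})")
    time.sleep(30)
```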

When the server is ready, you should see an output like this:

```
┏━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━
┃ Job Status ┃ Value
┡━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━
│ Model Name │ Meta-Llama-3.1-8B-Instruct
│ Model Status │ READY
│ Base URL │ http://gpu042:8080/v1
└──────────────┴────────────────────────────
┏━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━┓
┃ Job Status   ┃ Value                 ┃
┡━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━┩
│ Model Name   │ Qwen3-14B             │
│ Model Status │ READY                 │
│ Base URL     │ http://gpu105:8080/v1 │
└──────────────┴───────────────────────┘
```
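
Once the server reports `READY`, the base URL points at an OpenAI-compatible endpoint, so any OpenAI client can talk to it. Below is a hedged example using the `openai` Python package, with the model name and URL taken from the sample output above; the API key is a placeholder, since these servers typically do not validate it:

```python
from openai import OpenAI

# Point an OpenAI-compatible client at the inference server.
client = OpenAI(base_url="http://gpu105:8080/v1", api_key="EMPTY")
response = client.chat.completions.create(
    model="Qwen3-14B",
    messages=[{"role": "user", "content": "Hello!"}],
)
print(response.choices[0].message.content)
```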

There are 5 possible states:
@@ -190,7 +207,7 @@ There are 5 possible states:

**Note**
* The base URL is only available when the model is in the `READY` state.
* For servers launched with `batch-launch`, the job ID should follow the format of "MAIN_JOB_ID+OFFSET" (e.g. 17480109+0, 17480109+1).
* For servers launched with `batch-launch`, the job ID should follow the format of "MAIN_JOB_ID+OFFSET" (e.g. 1435035+0, 1435035+1).

### `metrics` command

107 changes: 106 additions & 1 deletion tests/vec_inf/cli/test_cli.py
@@ -135,7 +135,7 @@ def test_list_single_model(runner):


def test_status_command(runner):
"""Test status command."""
"""Test status command with job ID argument."""
with patch("vec_inf.cli._cli.VecInfClient") as mock_client_class:
mock_client = MagicMock()
mock_client_class.return_value = mock_client
@@ -154,6 +154,111 @@ def test_status_command(runner):
assert "Meta-Llama-3.1-8B" in result.output


def test_status_command_no_job_id_no_running_jobs(runner):
    """Test status command with no argument when no jobs are running."""
    with patch("vec_inf.cli._cli.VecInfClient") as mock_client_class:
        mock_client = MagicMock()
        mock_client_class.return_value = mock_client
        mock_client.fetch_running_jobs.return_value = []

        result = runner.invoke(cli, ["status"])

        assert result.exit_code == 0
        assert "No running jobs found." in result.output


def test_status_command_no_job_id_single_running_job(runner):
    """Test status command with no argument when one job is running."""
    with patch("vec_inf.cli._cli.VecInfClient") as mock_client_class:
        mock_client = MagicMock()
        mock_client_class.return_value = mock_client
        mock_client.fetch_running_jobs.return_value = ["12345"]

        mock_status = MagicMock()
        mock_status.model_name = "test-model-1"
        mock_status.server_status = "READY"
        mock_status.base_url = "http://localhost:8000"
        mock_status.pending_reason = None
        mock_status.failed_reason = None
        mock_client.get_status.return_value = mock_status

        result = runner.invoke(cli, ["status"])

        assert result.exit_code == 0
        assert "test-model-1" in result.output
        mock_client.fetch_running_jobs.assert_called_once()
        mock_client.get_status.assert_called_once_with("12345")


def test_status_command_no_job_id_multiple_running_jobs(runner):
    """Test status command with no argument when multiple jobs are running."""
    with patch("vec_inf.cli._cli.VecInfClient") as mock_client_class:
        mock_client = MagicMock()
        mock_client_class.return_value = mock_client
        mock_client.fetch_running_jobs.return_value = ["12345", "67890"]

        mock_status_1 = MagicMock()
        mock_status_1.model_name = "test-model-1"
        mock_status_1.server_status = "READY"
        mock_status_1.base_url = "http://localhost:8000"
        mock_status_1.pending_reason = None
        mock_status_1.failed_reason = None

        mock_status_2 = MagicMock()
        mock_status_2.model_name = "test-model-2"
        mock_status_2.server_status = "PENDING"
        mock_status_2.base_url = None
        mock_status_2.pending_reason = "Waiting for resources"
        mock_status_2.failed_reason = None

        mock_client.get_status.side_effect = [mock_status_1, mock_status_2]

        result = runner.invoke(cli, ["status"])

        assert result.exit_code == 0
        assert "test-model-1" in result.output
        assert "test-model-2" in result.output
        assert "12345" in result.output
        assert "67890" in result.output
        mock_client.fetch_running_jobs.assert_called_once()
        assert mock_client.get_status.call_count == 2


def test_status_command_no_job_id_multiple_jobs_json_mode(runner):
    """Test status command with no argument and JSON mode for multiple jobs."""
    with patch("vec_inf.cli._cli.VecInfClient") as mock_client_class:
        mock_client = MagicMock()
        mock_client_class.return_value = mock_client
        mock_client.fetch_running_jobs.return_value = ["12345", "67890"]

        mock_status_1 = MagicMock()
        mock_status_1.model_name = "test-model-1"
        mock_status_1.server_status = "READY"
        mock_status_1.base_url = "http://localhost:8000"
        mock_status_1.pending_reason = None
        mock_status_1.failed_reason = None

        mock_status_2 = MagicMock()
        mock_status_2.model_name = "test-model-2"
        mock_status_2.server_status = "FAILED"
        mock_status_2.base_url = None
        mock_status_2.pending_reason = None
        mock_status_2.failed_reason = "Out of memory"

        mock_client.get_status.side_effect = [mock_status_1, mock_status_2]

        result = runner.invoke(cli, ["status", "--json-mode"])

        assert result.exit_code == 0
        output = json.loads(result.output)
        assert isinstance(output, list)
        assert len(output) == 2
        assert output[0]["model_name"] == "test-model-1"
        assert output[0]["model_status"] == "READY"
        assert output[1]["model_name"] == "test-model-2"
        assert output[1]["model_status"] == "FAILED"


def test_shutdown_command(runner):
"""Test shutdown command."""
with patch("vec_inf.cli._cli.VecInfClient") as mock_client_class: