From 1af5eb73c68b70a717c033cbcc7adfd297ac3483 Mon Sep 17 00:00:00 2001 From: mwrothbe Date: Mon, 13 Oct 2025 13:07:08 -0700 Subject: [PATCH 1/5] added reranker api --- src/cli/openarc_cli.py | 4 +- src/engine/optimum/optimum_rr.py | 114 +++++++++++++++++++++++++++++++ src/server/launch.py | 1 + src/server/main.py | 68 +++++++++++++++++- src/server/model_registry.py | 5 +- src/server/models/optimum.py | 55 ++++++++++++--- src/server/worker_registry.py | 94 ++++++++++++++++++++++++- 7 files changed, 325 insertions(+), 16 deletions(-) create mode 100644 src/engine/optimum/optimum_rr.py diff --git a/src/cli/openarc_cli.py b/src/cli/openarc_cli.py index 98ab976..84d3fef 100644 --- a/src/cli/openarc_cli.py +++ b/src/cli/openarc_cli.py @@ -169,9 +169,9 @@ def cli(): required=True, help='Engine used to load the model (ovgenai, openvino, optimum)') @click.option('--model-type', '--mt', - type=click.Choice(['llm', 'vlm', 'whisper', 'kokoro', 'emb']), + type=click.Choice(['llm', 'vlm', 'whisper', 'kokoro', 'emb', 'rerank']), required=True, - help='Model type (llm, vlm, whisper, kokoro, emb)') + help='Model type (llm, vlm, whisper, kokoro, emb, rerank)') @click.option('--device', '--d', required=True, help='Device(s) to load the model on.') diff --git a/src/engine/optimum/optimum_rr.py b/src/engine/optimum/optimum_rr.py new file mode 100644 index 0000000..3718f5c --- /dev/null +++ b/src/engine/optimum/optimum_rr.py @@ -0,0 +1,114 @@ + + +import asyncio +import gc +import logging +from typing import Any, AsyncIterator, Dict, List, Union + +import torch +import torch.nn.functional as F +from torch import Tensor + +from transformers import AutoTokenizer +from optimum.intel import OVModelForCausalLM + +from src.server.models.optimum import RerankerConfig + +from typing import Any, AsyncIterator, Dict + +from src.server.model_registry import ModelLoadConfig, ModelRegistry + +class Optimum_RR: + + def __init__(self, load_config: ModelLoadConfig): + self.model_path = None + self.encoder_tokenizer = None + self.load_config = load_config + + def compute_logits(self, inputs, **kwargs): + batch_scores = self.model(**inputs).logits[:, -1, :] + true_vector = batch_scores[:, self.token_true_id] + false_vector = batch_scores[:, self.token_false_id] + batch_scores = torch.stack([false_vector, true_vector], dim=1) + batch_scores = torch.nn.functional.log_softmax(batch_scores, dim=1) + scores = batch_scores[:, 1].exp().tolist() + return scores + + def format_instruction(self, instruction, query, doc): + if instruction is None: + instruction = "Given a search query, retrieve relevant passages that answer the query" + output = ": {instruction}\n: {query}\n: {doc}".format(instruction=instruction, query=query, doc=doc) + return output + + async def generate_rerankings(self, rr_config: RerankerConfig) -> AsyncIterator[Union[Dict[str, Any], str]]: + prefix_tokens = self.tokenizer.encode(rr_config.prefix, add_special_tokens=False) + suffix_tokens = self.tokenizer.encode(rr_config.suffix, add_special_tokens=False) + self.max_length = rr_config.max_length + pairs = [self.format_instruction(rr_config.task, rr_config.query, doc) for doc in rr_config.documents] + print(pairs) + # Currently hard coding tokenizer args. If these are model independent than it is fine. Otherwise + # implement the rr_config PreTrainedTokenizerConfig args. + max_length = 8192 + inputs = self.tokenizer( + pairs, padding=False, truncation="longest_first", return_attention_mask=False, max_length=max_length - len(prefix_tokens) - len(suffix_tokens) + ) + + for i, ele in enumerate(inputs["input_ids"]): + inputs["input_ids"][i] = prefix_tokens + ele + suffix_tokens + + # Currently hard coding tokenizer args. If these are model independent than it is fine. Otherwise + # implement the rr_config PreTrainedTokenizerConfig args. + inputs = self.tokenizer.pad(inputs, padding=True, return_tensors="pt", max_length=self.max_length) + for key in inputs: + inputs[key] = inputs[key].to(self.model.device) + + scores = self.compute_logits(inputs) + + ranked_documents = [{"doc":doc, "score":score} for score, doc in sorted(zip(scores, rr_config.documents), reverse=True)] + + yield ranked_documents + + #not implemented + def collect_metrics(self, rr_config: RerankerConfig, perf_metrics) -> Dict[str, Any]: + pass + + def load_model(self, loader: ModelLoadConfig): + """Load model using a ModelLoadConfig configuration and cache the tokenizer. + + Args: + loader: ModelLoadConfig containing model_path, device, engine, and runtime_config. + """ + + self.model = OVModelForCausalLM.from_pretrained(loader.model_path, + device=loader.device, + export=False, + use_cache=False) + + self.tokenizer = AutoTokenizer.from_pretrained(loader.model_path) + self.token_false_id = self.tokenizer.convert_tokens_to_ids("no") + self.token_true_id = self.tokenizer.convert_tokens_to_ids("yes") + logging.info(f"Model loaded successfully: {loader.model_name}") + + async def unload_model(self, registry: ModelRegistry, model_name: str) -> bool: + """Unregister model from registry and free memory resources. + + Args: + registry: ModelRegistry to unregister from + model_id: Private model identifier returned by register_load + + Returns: + True if the model was found and unregistered, else False. + """ + removed = await registry.register_unload(model_name) + + if self.model is not None: + del self.model + self.model = None + + if self.tokenizer is not None: + del self.tokenizer + self.tokenizer = None + + gc.collect() + logging.info(f"[{self.load_config.model_name}] weights and tokenizer unloaded and memory cleaned up") + return removed \ No newline at end of file diff --git a/src/server/launch.py b/src/server/launch.py index 1a42863..2b889d3 100644 --- a/src/server/launch.py +++ b/src/server/launch.py @@ -91,6 +91,7 @@ def start_server(host: str = "0.0.0.0", openarc_port: int = 8001, reload: bool = logger.info(" - POST /v1/audio/transcriptions: Whisper only") logger.info(" - POST /v1/audio/speech: Kokoro only") logger.info(" - POST /v1/embeddings") + logger.info(" - POST /v1/rerank") uvicorn.run( diff --git a/src/server/main.py b/src/server/main.py index 299fbfb..0783f68 100644 --- a/src/server/main.py +++ b/src/server/main.py @@ -21,7 +21,7 @@ from src.server.worker_registry import WorkerRegistry from src.server.models.openvino import OV_KokoroGenConfig from src.server.models.ov_genai import OVGenAI_GenConfig, OVGenAI_WhisperGenConfig -from src.server.models.optimum import PreTrainedTokenizerConfig +from src.server.models.optimum import PreTrainedTokenizerConfig, RerankerConfig #===============================================================# # Logging @@ -162,6 +162,16 @@ class EmbeddingsRequest(BaseModel): #end of openai api config: Optional[PreTrainedTokenizerConfig] = None +# No openai api to reference +class RerankRequest(BaseModel): + model: str + query: str + documents: List[str] + prefix:Optional[str] = None + suffix:Optional[str] = None + task:Optional[str] = None + config: Optional[PreTrainedTokenizerConfig] = None #not implemented + @app.get("/v1/models", dependencies=[Depends(verify_api_key)]) async def openai_list_models(): """OpenAI-compatible endpoint that lists available models.""" @@ -399,4 +409,58 @@ async def embeddings(request: EmbeddingsRequest): raise HTTPException(status_code=400, detail=str(exc)) except Exception as exc: traceback.print_exc() - raise HTTPException(status_code=500, detail=f"Embedding failed: {str(exc)}") \ No newline at end of file + raise HTTPException(status_code=500, detail=f"Embedding failed: {str(exc)}") + +@app.post("/v1/rerank", dependencies=[Depends(verify_api_key)]) +async def rerank(request: RerankRequest): + + try: + if request.config: + + tok_config = PreTrainedTokenizerConfig.model_validate(request.config) + base_data = tok_config.model_dump() + rr_config = RerankerConfig.model_validate(base_data | {"query":request.query,"documents":request.documents}) + if request.prefix: + rr_config.prefix = request.prefix + if request.suffix: + rr_config.suffix = request.suffix + if request.task: + rr_config.task = request.task + + model_name = request.model + created_ts = int(time.time()) + request_id = f"ov-{uuid.uuid4().hex[:24]}" + + result = await _workers.rerank(model_name, rr_config) + data = result.get("data", None) + metrics = result.get("metrics", {}) or {} + + prompt_tokens = metrics.get("input_token", 0) + total_tokens = metrics.get("total_token", prompt_tokens) + + docs = [] + for i in range(len(data)): + docs.append({ + "index":i, + "object":"ranked_documents", + "ranked_documents":data[i] + }) + + response = { + "id": request_id, + "object": "list", + "created": created_ts, + "model": model_name, + "data": docs, + "usage": { + "prompt_tokens": prompt_tokens, + "total_tokens": total_tokens, + }, + } + + return response + except ValueError as exc: + raise HTTPException(status_code=400, detail=str(exc)) + except Exception as exc: + traceback.print_exc() + raise HTTPException(status_code=500, detail=f"Reranking failed: {str(exc)}") \ No newline at end of file diff --git a/src/server/model_registry.py b/src/server/model_registry.py index eb9192b..7a8d5d5 100644 --- a/src/server/model_registry.py +++ b/src/server/model_registry.py @@ -64,13 +64,15 @@ class ModelType(str, Enum): - vlm: Image-to-text VLM models - whisper: Whisper ASR models - kokoro: Kokoro TTS models - - emb: Text-to-vector models""" + - emb: Text-to-vector models + - rerank: Reranker models""" LLM = "llm" VLM = "vlm" WHISPER = "whisper" KOKORO = "kokoro" EMB = "emb" + RERANK = "rerank" class EngineType(str, Enum): """Engine used to load the model. @@ -291,6 +293,7 @@ async def status(self) -> dict: (EngineType.OV_GENAI, ModelType.WHISPER): "src.engine.ov_genai.whisper.OVGenAI_Whisper", (EngineType.OPENVINO, ModelType.KOKORO): "src.engine.openvino.kokoro.OV_Kokoro", (EngineType.OV_OPTIMUM, ModelType.EMB): "src.engine.optimum.optimum_emb.Optimum_EMB", + (EngineType.OV_OPTIMUM, ModelType.RERANK): "src.engine.optimum.optimum_rr.Optimum_RR", } async def create_model_instance(load_config: ModelLoadConfig) -> Any: diff --git a/src/server/models/optimum.py b/src/server/models/optimum.py index 215cf52..6d81e42 100644 --- a/src/server/models/optimum.py +++ b/src/server/models/optimum.py @@ -7,7 +7,7 @@ class PreTrainedTokenizerConfig(BaseModel): Configuration for tokenizer. """ - text: Union[str, List[str], List[List[str]]] = Field( + text: Union[str, List[str], List[List[str]]] | None = Field( default=None, description=( "The sequence or batch of sequences to be encoded. Each sequence can be a string " @@ -17,7 +17,7 @@ class PreTrainedTokenizerConfig(BaseModel): ) ) - text_pair: Union[str, List[str], List[List[str]]] = Field( + text_pair: Union[str, List[str], List[List[str]]] | None = Field( default=None, description=( "The sequence or batch of sequences to be encoded. Each sequence can be a string " @@ -27,7 +27,7 @@ class PreTrainedTokenizerConfig(BaseModel): ) ) - text_target: Union[str, List[str], List[List[str]]] = Field( + text_target: Union[str, List[str], List[List[str]]] | None = Field( default=None, description=( "The sequence or batch of sequences to be encoded as target texts. Each sequence can be " @@ -37,7 +37,7 @@ class PreTrainedTokenizerConfig(BaseModel): ) ) - text_pair_target: Union[str, List[str], List[List[str]]] = Field( + text_pair_target: Union[str, List[str], List[List[str]]] | None = Field( default=None, description=( "The sequence or batch of sequences to be encoded as target texts. Each sequence can be " @@ -85,7 +85,7 @@ class PreTrainedTokenizerConfig(BaseModel): ) ) - max_length: int = Field( + max_length: int | None = Field( default=None, description=( "Controls the maximum length to use by one of the truncation/padding parameters. If left unset or set to " @@ -113,7 +113,7 @@ class PreTrainedTokenizerConfig(BaseModel): ) ) - pad_to_multiple_of: int = Field( + pad_to_multiple_of: int | None = Field( default=None, description=( "If set will pad the sequence to a multiple of the provided value. Requires padding to be activated. " @@ -122,7 +122,7 @@ class PreTrainedTokenizerConfig(BaseModel): ) ) - padding_side: str = Field( + padding_side: str | None = Field( default=None, description=( "The side on which the model should have padding applied. Should be selected between ['right', 'left']. " @@ -138,7 +138,7 @@ class PreTrainedTokenizerConfig(BaseModel): ) ) - return_token_type_ids: bool = Field( + return_token_type_ids: bool | None = Field( default=None, description=( "Whether to return token type IDs. If left to the default, will return the token type IDs according to the specific " @@ -146,7 +146,7 @@ class PreTrainedTokenizerConfig(BaseModel): ) ) - return_attention_mask: bool = Field( + return_attention_mask: bool | None = Field( default=None, description=( "Whether to return the attention mask. If left to the default, will return the attention mask according to the " @@ -190,3 +190,40 @@ class PreTrainedTokenizerConfig(BaseModel): "Whether or not to print more information and warnings." ) ) + +class RerankerConfig(PreTrainedTokenizerConfig): + + query: str = Field( + default=None, + description=( + "Phrase to compare documents to." + ) + ) + + documents: List[str] = Field( + default=None, + description=( + "Documents to rank." + ) + ) + + prefix: str = Field( + default='<|im_start|>system\nJudge whether the Document meets the requirements based on the Query and the Instruct provided. Note that the answer can only be "yes" or "no".<|im_end|>\n<|im_start|>user\n', + description=( + "Text to append to start of query. This is model specific." + ) + ) + + suffix: str = Field( + default="<|im_end|>\n<|im_start|>assistant\n\n\n\n\n", + description=( + "Text to append to end of query. This is model specific." + ) + ) + + task: str = Field( + default="Given a search query, retrieve relevant passages that answer the query", + description=( + "Prompt command delivered to the model." + ) + ) diff --git a/src/server/worker_registry.py b/src/server/worker_registry.py index 30269f6..b11228f 100644 --- a/src/server/worker_registry.py +++ b/src/server/worker_registry.py @@ -9,10 +9,11 @@ from src.engine.ov_genai.whisper import OVGenAI_Whisper from src.engine.openvino.kokoro import OV_Kokoro from src.engine.optimum.optimum_emb import Optimum_EMB +from src.engine.optimum.optimum_rr import Optimum_RR from src.server.models.openvino import OV_KokoroGenConfig from src.server.models.ov_genai import OVGenAI_GenConfig, OVGenAI_WhisperGenConfig -from src.server.models.optimum import PreTrainedTokenizerConfig +from src.server.models.optimum import PreTrainedTokenizerConfig, RerankerConfig from src.server.model_registry import ModelRecord, ModelRegistry, ModelType logger = logging.getLogger(__name__) @@ -68,7 +69,7 @@ class InferWorker: - infer_whisper: Process audio transcription requests - infer_kokoro: Process speech generation requests - infer_emb: Process embedding requests - + - infer_rerank: Process reranking requests """ @staticmethod @@ -249,6 +250,34 @@ async def infer_emb(packet: WorkerPacket, emb_instance: Optimum_EMB) -> WorkerPa await packet.stream_queue.put(None) return packet + + @staticmethod + async def infer_rerank(packet: WorkerPacket, rerank_instance: Optimum_RR) -> WorkerPacket: + """Generate reranking for a single packet using the optimum pipeline""" + metrics = None + final_data = None + + try: + async for item in rerank_instance.generate_rerankings(packet.gen_config): + if isinstance(item, dict): + metrics = item + else: + final_data = item + + packet.response = final_data + packet.metrics = metrics + + except Exception as e: + # Log the full exception with traceback + logger.error("Reranking failed!", exc_info=True) + # Store error in packet response + packet.response = f"Error: {str(e)}" + packet.metrics = None + # Signal error to stream if streaming + if packet.gen_config.stream and packet.stream_queue is not None: + await packet.stream_queue.put(None) + + return packet class QueueWorker: """ @@ -409,6 +438,28 @@ async def queue_worker_emb(model_name: str, model_queue: asyncio.Queue, emb_mode packet.result_future.set_result(completed_packet) model_queue.task_done() + @staticmethod + async def queue_worker_rr(model_name: str, model_queue: asyncio.Queue, rr_model: Optimum_RR, registry: ModelRegistry): + """Reranker model inference worker that processes packets from queue""" + logger.info(f"[{model_name} Reranker Worker] Started, waiting for packets...") + while True: + packet = await model_queue.get() + if packet is None: + logger.info(f"[{model_name} Reranker Worker] Shutdown signal received.") + break + + completed_packet = await InferWorker.infer_rerank(packet, rr_model) + # Check if inference failed and trigger model unload + if not completed_packet.response: + logger.error(f"[{model_name} Reranker Worker] Inference failed, triggering model unload...") + asyncio.create_task(registry.register_unload(model_name)) + break + if completed_packet.metrics: + logger.info(f"[{model_name} Reranker Worker] Metrics: {completed_packet.metrics}") + if packet.result_future is not None and not packet.result_future.done(): + packet.result_future.set_result(completed_packet) + model_queue.task_done() + class WorkerRegistry: """ Central orchestrator for managing per-model inference workers and request routing. @@ -464,6 +515,9 @@ def __init__(self, model_registry: ModelRegistry): self._model_queues_emb: Dict[str, asyncio.Queue] = {} self._model_tasks_emb: Dict[str, asyncio.Task] = {} + self._model_queues_rerank: Dict[str, asyncio.Queue] = {} + self._model_tasks_rerank: Dict[str, asyncio.Task] = {} + self._lock = asyncio.Lock() self._model_registry.add_on_loaded(self._on_model_loaded) @@ -520,6 +574,13 @@ async def _on_model_loaded(self, record: ModelRecord) -> None: self._model_queues_emb[record.model_name] = q task = asyncio.create_task(QueueWorker.queue_worker_emb(record.model_name, q, instance, self._model_registry)) self._model_tasks_emb[record.model_name] = task + + elif mt == ModelType.RERANK and isinstance(instance, Optimum_RR): + if record.model_name not in self._model_queues_rerank: + q: asyncio.Queue = asyncio.Queue() + self._model_queues_rerank[record.model_name] = q + task = asyncio.create_task(QueueWorker.queue_worker_rr(record.model_name, q, instance, self._model_registry)) + self._model_tasks_rerank[record.model_name] = task else: logger.info(f"[WorkerRegistry] Model type/instance mismatch for {record.model_name}: {record.model_type}, {type(instance)}") @@ -564,6 +625,14 @@ async def _on_model_unloaded(self, record: ModelRecord) -> None: await q.put(None) if t is not None and not t.done(): t.cancel() + + # Try rerank dicts + q = self._model_queues_rerank.pop(record.model_name, None) + t = self._model_tasks_rerank.pop(record.model_name, None) + if q is not None: + await q.put(None) + if t is not None and not t.done(): + t.cancel() def _get_model_queue(self, model_name: str) -> asyncio.Queue: q = self._model_queues_llm.get(model_name) @@ -591,6 +660,12 @@ def _get_emb_queue(self, model_name: str) -> asyncio.Queue: if q is not None: return q raise ValueError(f"Embedding model '{model_name}' is not loaded or no worker is available") + + def _get_rerank_queue(self, model_name: str) -> asyncio.Queue: + q = self._model_queues_rerank.get(model_name) + if q is not None: + return q + raise ValueError(f"Rerank model '{model_name}' is not loaded or no worker is available") async def generate(self, model_name: str, gen_config: OVGenAI_GenConfig) -> Dict[str, Any]: """Generate text without streaming.""" @@ -674,4 +749,19 @@ async def embed(self, model_name: str, tok_config: PreTrainedTokenizerConfig) -> q = self._get_emb_queue(model_name) await q.put(packet) completed = await result_future + return {"data": completed.response, "metrics": completed.metrics or {}} + + async def rerank(self, model_name: str, rr_config: RerankerConfig) -> Dict[str, Any]: + """Rerank documents.""" + request_id = uuid.uuid4().hex + result_future: asyncio.Future = asyncio.get_running_loop().create_future() + packet = WorkerPacket( + request_id=request_id, + id_model=model_name, + gen_config=rr_config, + result_future=result_future, + ) + q = self._get_rerank_queue(model_name) + await q.put(packet) + completed = await result_future return {"data": completed.response, "metrics": completed.metrics or {}} \ No newline at end of file From bc50a709f98e546f2402b8cef79abb3813a759f6 Mon Sep 17 00:00:00 2001 From: mwrothbe Date: Mon, 13 Oct 2025 17:21:49 -0700 Subject: [PATCH 2/5] Bug fix when no tok config is provided --- src/server/main.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/server/main.py b/src/server/main.py index 0783f68..3795245 100644 --- a/src/server/main.py +++ b/src/server/main.py @@ -416,10 +416,12 @@ async def rerank(request: RerankRequest): try: if request.config: - tok_config = PreTrainedTokenizerConfig.model_validate(request.config) base_data = tok_config.model_dump() rr_config = RerankerConfig.model_validate(base_data | {"query":request.query,"documents":request.documents}) + else: + rr_config = RerankerConfig.model_validate({"query":request.query,"documents":request.documents}) + if request.prefix: rr_config.prefix = request.prefix if request.suffix: From 22d4284f54c772728b27ae1a046336c0d30349ec Mon Sep 17 00:00:00 2001 From: mwrothbe Date: Wed, 15 Oct 2025 09:28:13 -0700 Subject: [PATCH 3/5] Added load all option to cli --- src/cli/openarc_cli.py | 63 +++++++++++++++++++++++++++--------------- 1 file changed, 40 insertions(+), 23 deletions(-) diff --git a/src/cli/openarc_cli.py b/src/cli/openarc_cli.py index 84d3fef..170cec2 100644 --- a/src/cli/openarc_cli.py +++ b/src/cli/openarc_cli.py @@ -76,6 +76,12 @@ def get_model_config(model_name: str): models = config.get("models", {}) return models.get(model_name) +def get_model_configs(): + """Get model configuration by name.""" + config = load_full_config() + models = config.get("models", {}) + return models + def remove_model_config(model_name: str): """Remove model configuration by name.""" config = load_full_config() @@ -199,42 +205,53 @@ def add(ctx, model_path, model_name, engine, model_type, device, runtime_config) @cli.command() @click.option('--model-name', '--mn', - required=True, + required=False, help='Model name to load from saved configuration.') +@click.option('--all-models', '--am', + required=False, + is_flag=True, + help='Load all models from saved configuration.') @click.pass_context -def load(ctx, model_name): +def load(ctx, model_name, all_models): """- Load a model from saved configuration.""" cli_instance = OpenArcCLI() + configs = {} # Get saved configuration - saved_config = get_model_config(model_name) - - if not saved_config: + if all_models: + configs = get_model_configs() + elif model_name: + configs[model_name]=get_model_config(model_name) + else: + console.print(f"❌ [red]A model name or all-models must be selected.[/red] {model_name}") + console.print("[yellow]Tip: Use 'openarc list' to see saved configurations, or 'openarc add' to create a new one.[/yellow]") + ctx.exit(1) + + if len(configs)==0 or (model_name and not configs[model_name]): console.print(f"❌ [red]Model configuration not found:[/red] {model_name}") console.print("[yellow]Tip: Use 'openarc list' to see saved configurations, or 'openarc add' to create a new one.[/yellow]") ctx.exit(1) - load_config = saved_config.copy() - - # Make API request to load the model - url = f"{cli_instance.base_url}/openarc/load" - - try: - console.print("[cyan]working...[/cyan]") - response = requests.post(url, json=load_config, headers=cli_instance.get_headers()) + for key, value in configs.items(): + # Make API request to load the model + url = f"{cli_instance.base_url}/openarc/load" - if response.status_code == 200: + try: + console.print("[cyan]working...[/cyan]") + response = requests.post(url, json=value, headers=cli_instance.get_headers()) + + if response.status_code == 200: - console.print("[green]Done![/green]") - console.print("[dim]Use 'openarc status' to check the status of loaded models.[/dim]") - else: - console.print(f"❌ [red]error: {response.status_code}[/red]") - console.print(f"[red]Response:[/red] {response.text}") + console.print("[green]Done![/green]") + console.print("[dim]Use 'openarc status' to check the status of loaded models.[/dim]") + else: + console.print(f"❌ [red]error: {response.status_code}[/red]") + console.print(f"[red]Response:[/red] {response.text}") + ctx.exit(1) + + except requests.exceptions.RequestException as e: + console.print(f"❌ [red]Request failed:[/red] {e}") ctx.exit(1) - - except requests.exceptions.RequestException as e: - console.print(f"❌ [red]Request failed:[/red] {e}") - ctx.exit(1) @cli.command() @click.option('--model-name', '--mn', required=True, help='Model name to unload') From e8d6e71f5104d727a6c2424283c070fd10920739 Mon Sep 17 00:00:00 2001 From: mwrothbe Date: Mon, 20 Oct 2025 17:37:15 -0700 Subject: [PATCH 4/5] Reverting CLI to pre-commit --- src/cli/openarc_cli.py | 607 +++++++++++++++++++++++------------------ 1 file changed, 343 insertions(+), 264 deletions(-) diff --git a/src/cli/openarc_cli.py b/src/cli/openarc_cli.py index 170cec2..9938cc0 100644 --- a/src/cli/openarc_cli.py +++ b/src/cli/openarc_cli.py @@ -2,108 +2,67 @@ """ OpenArc CLI Tool - Command-line interface for OpenArc server operations. """ -import os import json +import os +import sys +import traceback +import requests +import yaml from pathlib import Path -import requests import rich_click as click +from rich import print as rprint from rich.console import Console from rich.panel import Panel from rich.table import Table from rich.text import Text -from src.server.launch import start_server -from src.cli.device_query import DeviceDataQuery, DeviceDiagnosticQuery +from api.launcher import start_server +from cli.device_query import DeviceDataQuery, DeviceDiagnosticQuery click.rich_click.STYLE_OPTIONS_TABLE_LEADING = 1 click.rich_click.STYLE_OPTIONS_TABLE_BOX = "SIMPLE" + +# click.rich_click.STYLE_OPTIONS_TABLE_ROW_STYLES = ["bold", ""] click.rich_click.STYLE_COMMANDS_TABLE_SHOW_LINES = True +# click.rich_click.STYLE_COMMANDS_TABLE_PAD_EDGE = True +#click.rich_click.STYLE_COMMANDS_TABLE_BOX = "DOUBLE" click.rich_click.STYLE_COMMANDS_TABLE_BORDER_STYLE = "red" click.rich_click.STYLE_COMMANDS_TABLE_ROW_STYLES = ["magenta", "yellow", "cyan", "green"] console = Console() -PROJECT_ROOT = Path(__file__).parent.parent.parent -CONFIG_FILE = PROJECT_ROOT / "openarc-config.json" +# Configuration handling - use project root directory +PROJECT_ROOT = Path(__file__).parent +CONFIG_FILE = PROJECT_ROOT / "openarc-cli-config.yaml" def save_cli_config(host: str, port: int): - """Save server configuration to JSON config file.""" - config = load_full_config() # Load existing config first - config.update({ + """Save server configuration to YAML config file.""" + config = { "server": { "host": host, "port": port }, "created_by": "openarc-cli", "version": "1.0" - }) + } with open(CONFIG_FILE, "w") as f: - json.dump(config, f, indent=2) + yaml.dump(config, f, default_flow_style=False, indent=2) console.print(f"📝 [dim]Configuration saved to: {CONFIG_FILE}[/dim]") -def save_model_config(model_name: str, load_config: dict): - """Save model configuration to JSON config file.""" - config = load_full_config() - - if "models" not in config: - config["models"] = {} - - config["models"][model_name] = load_config - - with open(CONFIG_FILE, "w") as f: - json.dump(config, f, indent=2) - - console.print(f"💾 [green]Model configuration saved:[/green] {model_name}") - -def load_full_config(): - """Load full configuration from JSON config file.""" +def load_cli_config(): + """Load server configuration from YAML config file.""" if CONFIG_FILE.exists(): try: with open(CONFIG_FILE, "r") as f: - config = json.load(f) - return config if config else {} - except (json.JSONDecodeError, FileNotFoundError): + config = yaml.safe_load(f) + if config and "server" in config: + return config["server"] + except (yaml.YAMLError, FileNotFoundError, KeyError): console.print(f"[yellow]Warning: Could not read config file {CONFIG_FILE}[/yellow]") - return {} - -def get_model_config(model_name: str): - """Get model configuration by name.""" - config = load_full_config() - models = config.get("models", {}) - return models.get(model_name) - -def get_model_configs(): - """Get model configuration by name.""" - config = load_full_config() - models = config.get("models", {}) - return models - -def remove_model_config(model_name: str): - """Remove model configuration by name.""" - config = load_full_config() - models = config.get("models", {}) - - if model_name not in models: - return False - - del models[model_name] - config["models"] = models - - with open(CONFIG_FILE, "w") as f: - json.dump(config, f, indent=2) - - return True - -def load_cli_config(): - """Load server configuration from YAML config file.""" - config = load_full_config() - if config and "server" in config: - return config["server"] - return {"host": "localhost", "port": 8000} # defaults class OpenArcCLI: @@ -134,7 +93,7 @@ def get_help(self, ctx): art.append(" | | \n", style="white") art.append(" |_| \n", style="white") art.append(" \n", style="white") - art.append(" The CLI application \n", style="white") + art.append("The CLI application \n", style="white") console.print(art) return super().get_help(ctx) @@ -144,12 +103,10 @@ def cli(): Use this application to interface with the OpenArc server. Features: - + • Start the OpenArc server. • Load models into the OpenArc server. - - • List models from saved configurations. • Check the status of loaded models. @@ -164,116 +121,277 @@ def cli(): """ @cli.command() -@click.option('--model-name', '--mn', - required=True, - help='Public facing name of the model.') -@click.option('--model-path', '--m', - required=True, - help='Path to OpenVINO IR converted model.') -@click.option('--engine', '--en', - type=click.Choice(['ovgenai', 'openvino', 'optimum']), - required=True, - help='Engine used to load the model (ovgenai, openvino, optimum)') -@click.option('--model-type', '--mt', - type=click.Choice(['llm', 'vlm', 'whisper', 'kokoro', 'emb', 'rerank']), - required=True, - help='Model type (llm, vlm, whisper, kokoro, emb, rerank)') -@click.option('--device', '--d', - required=True, - help='Device(s) to load the model on.') -@click.option("--runtime-config", "--rtc", - type=dict, - default={}, - help='OpenVINO runtime configuration (e.g., performance hints). These are checked serverside at runtime.') + +@click.option('--model', + required=True, + help=""" + - Absolute path to model. + + - The dir name which stores the openvino model files is used in the API to identify the model. + + - The dir name is the same as the model name. + """) + +@click.option('--model-type', + type=click.Choice(['TEXT', 'VISION']), + required=True, + default='TEXT', + help=""" + + - Type of model. + + """) + +@click.option('--device', + required=True, + default='CPU', + help=""" + - Device: CPU, GPU.0, GPU.1, GPU.2, GPU.3, GPU.4, AUTO + + - GPU.0 is the first GPU, GPU.1 is the second GPU, etc. + + - AUTO will automatically select the best device. + """) + +@click.option('--use-cache/--no-use-cache', + required=True, + default=True, + help=""" + - Use cache for stateful models. + + - Edge cases may require disabling cache, probably based on model architecture. + + """) + +@click.option('--dynamic-shapes/--no-dynamic-shapes', + required=True, + default=True, + help=""" + - Use dynamic shapes. + + - If false, the model will be loaded with static shapes. + + - OpenVINO IR usually use dynamic shapes but for NPU it must be disabled. + + """) + +@click.option('--pad-token-id', + required=False, + type=int, + help=""" + - (pad)pad token ID + + - AutoTokenizers usually infers this from config.json but it's useful to configure explicitly. + + """ + ) + +@click.option('--eos-token-id', + required=False, + type=int, + help=""" + - (eos)end of sequence token id + + - AutoTokenizers usually infers this from config.json but it's useful to configure explicitly. + + - When the eos token is set to the *incorrect* token the model will continue to generate tokens. + + - Pairing this with a target max_length is a good way to test performance. + """ + + ) + +@click.option('--bos-token-id', + required=False, + type=int, + help='beginning of sequence token ID') + +@click.option('--num-streams', + required=False, + type=int, + default=None, + show_default=True, + help='Number of inference streams') + +@click.option('--performance-hint', + required=False, + type=click.Choice(['LATENCY', 'THROUGHPUT', 'CUMULATIVE_THROUGHPUT']), + default=None, + show_default=True, + help=""" + --- + + - High level performance hint. + + - Usually I use 'LATENCY' which locks to one CPU or one CPU socket. + + - It's best to use the documentation for this. + + https://docs.openvino.ai/2025/openvino-workflow/running-inference/optimize-inference/high-level-performance-hints.html + + --- + + """ + ) + +@click.option('--inference-precision-hint', + required=False, + type=click.Choice(['fp32', 'f16', 'bf16', 'dynamic']), + default=None, + show_default=True, + help=""" + --- + + - Controls precision during inference, at inference time. + + - Works on CPU and GPU. + + - Target device specific features. + + - Ex:'bf16' is probably best on CPUs which support AMX. + + """ + ) + +@click.option('--enable-hyper-threading', + required=False, + type=bool, + default=None, + help=""" + --- + + - CPU ONLY --> Cannot be used with GPU. + + - Enable hyper-threading + + - This is only relevant for Intel CPUs with hyperthreading i.e, two virtual cores per physical core. + + """ + ) + +@click.option('--inference-num-threads', + required=False, + type=int, + default=None, + show_default=True, + help=""" + --- + + - CPU ONLY --> Cannot be used with GPU. + + - Number of inference threads + + - More threads usually means faster inference. + + - Therefore this can be used to constrain the number of threads used for inference. + """ + ) + +@click.option('--scheduling-core-type', + required=False, + type=click.Choice(['ANY_CORE', 'PCORE_ONLY', 'ECORE_ONLY']), + default=None, + show_default=True, + help=""" + --- + + - Advanced option to target p-cores or e-cores on CPUs which support it. + + - CPU ONLY --> Cannot be used with GPU. + + - [ANY_CORE]: Any core, so default for 'older' Intel CPUs. Default for most chips but no need to set. + + - [PCORE_ONLY]: Only run inference on threads which are performance cores. + + - [ECORE_ONLY]: Only run inference on threads which are efficency cores. + --- + """ + ) + @click.pass_context -def add(ctx, model_path, model_name, engine, model_type, device, runtime_config): - """- Add a model configuration to the config file.""" +def load(ctx, model, model_type, device, use_cache, dynamic_shapes, + pad_token_id, eos_token_id, bos_token_id, num_streams, performance_hint, + inference_precision_hint, enable_hyper_threading, inference_num_threads, + scheduling_core_type): + """- Load a model.""" + cli_instance = OpenArcCLI() - # Build and save configuration + # Build load_config from arguments load_config = { - "model_name": model_name, - "model_path": model_path, - "model_type": model_type, - "engine": engine, + "id_model": model, + "architecture_type": model_type, + "use_cache": use_cache, "device": device, - "runtime_config": runtime_config if runtime_config else {} + "dynamic_shapes": dynamic_shapes, } - save_model_config(model_name, load_config) - console.print(f"✅ [green]Saved configuration for:[/green] {model_name}") - console.print(f"[dim]Use 'openarc load --mn {model_name}' to load this model.[/dim]") - -@cli.command() -@click.option('--model-name', '--mn', - required=False, - help='Model name to load from saved configuration.') -@click.option('--all-models', '--am', - required=False, - is_flag=True, - help='Load all models from saved configuration.') -@click.pass_context -def load(ctx, model_name, all_models): - """- Load a model from saved configuration.""" - cli_instance = OpenArcCLI() + # Add optional token IDs if provided + if pad_token_id is not None: + load_config["pad_token_id"] = pad_token_id + if eos_token_id is not None: + load_config["eos_token_id"] = eos_token_id + if bos_token_id is not None: + load_config["bos_token_id"] = bos_token_id - configs = {} - # Get saved configuration - if all_models: - configs = get_model_configs() - elif model_name: - configs[model_name]=get_model_config(model_name) - else: - console.print(f"❌ [red]A model name or all-models must be selected.[/red] {model_name}") - console.print("[yellow]Tip: Use 'openarc list' to see saved configurations, or 'openarc add' to create a new one.[/yellow]") - ctx.exit(1) - - if len(configs)==0 or (model_name and not configs[model_name]): - console.print(f"❌ [red]Model configuration not found:[/red] {model_name}") - console.print("[yellow]Tip: Use 'openarc list' to see saved configurations, or 'openarc add' to create a new one.[/yellow]") - ctx.exit(1) + # Build ov_config from arguments + ov_config = {} + if performance_hint is not None: + ov_config["PERFORMANCE_HINT"] = performance_hint + if inference_precision_hint is not None: + ov_config["INFERENCE_PRECISION_HINT"] = inference_precision_hint + if enable_hyper_threading is not None: + ov_config["ENABLE_HYPER_THREADING"] = enable_hyper_threading + if inference_num_threads not in (None, False): + ov_config["INFERENCE_NUM_THREADS"] = inference_num_threads + if scheduling_core_type is not None: + ov_config["SCHEDULING_CORE_TYPE"] = scheduling_core_type + if num_streams is not None: + ov_config["NUM_STREAMS"] = num_streams - for key, value in configs.items(): - # Make API request to load the model - url = f"{cli_instance.base_url}/openarc/load" + # Prepare payload + payload = { + "load_config": load_config, + "ov_config": ov_config if ov_config else {} + } + + # Make API request + url = f"{cli_instance.base_url}/optimum/model/load" + + try: + console.print(f"🚀 [blue]Loading model:[/blue] {model}") + response = requests.post(url, json=payload, headers=cli_instance.get_headers()) - try: - console.print("[cyan]working...[/cyan]") - response = requests.post(url, json=value, headers=cli_instance.get_headers()) - - if response.status_code == 200: - - console.print("[green]Done![/green]") - console.print("[dim]Use 'openarc status' to check the status of loaded models.[/dim]") - else: - console.print(f"❌ [red]error: {response.status_code}[/red]") - console.print(f"[red]Response:[/red] {response.text}") - ctx.exit(1) - - except requests.exceptions.RequestException as e: - console.print(f"❌ [red]Request failed:[/red] {e}") + if response.status_code == 200: + console.print("✅ [green]Model loaded successfully![/green]") + else: + console.print(f"❌ [red]Error loading model: {response.status_code}[/red]") + console.print(f"[red]Response:[/red] {response.text}") ctx.exit(1) + + except requests.exceptions.RequestException as e: + console.print(f"❌ [red]Request failed:[/red] {e}") + ctx.exit(1) @cli.command() -@click.option('--model-name', '--mn', required=True, help='Model name to unload') +@click.option('--model-id', required=True, help='Model ID to unload') @click.pass_context -def unload(ctx, model_name): +def unload(ctx, model_id): """ - - POST Delete a model from registry and unload from memory. + - DELETE a model from memory. """ cli_instance = OpenArcCLI() - url = f"{cli_instance.base_url}/openarc/unload" - payload = {"model_name": model_name} + # Make API request + url = f"{cli_instance.base_url}/optimum/model/unload" + params = {"model_id": model_id} try: - console.print(f"🗑️ [blue]Unloading model:[/blue] {model_name}") - response = requests.post(url, json=payload, headers=cli_instance.get_headers()) + console.print(f"🗑️ [blue]Unloading model:[/blue] {model_id}") + response = requests.delete(url, params=params, headers=cli_instance.get_headers()) if response.status_code == 200: result = response.json() - # Handle different possible response formats - message = result.get('message', f"Model '{model_name}' unloaded successfully") - console.print(f"✅ [green]{message}[/green]") + console.print(f"✅ [green]{result['message']}[/green]") else: console.print(f"❌ [red]Error unloading model: {response.status_code}[/red]") console.print(f"[red]Response:[/red] {response.text}") @@ -283,79 +401,13 @@ def unload(ctx, model_name): console.print(f"❌ [red]Request failed:[/red] {e}") ctx.exit(1) -@cli.command() -@click.option('--model-name','--mn', help='Model name to remove (used with --rm).') -@click.option('--rm', is_flag=True, help='Remove a model configuration.') -@click.pass_context -def list(ctx, rm, model_name): - """- List saved model configurations. - - - Remove a model configuration.""" - if rm: - if not model_name: - console.print("❌ [red]Error:[/red] --model-name is required when using --rm") - - ctx.exit(1) - - # Check if model exists before trying to remove - existing_config = get_model_config(model_name) - if not existing_config: - console.print(f"❌ {model_name}[red] not found:[/red]") - console.print("[yellow]Use 'openarc list' to see available configurations.[/yellow]") - ctx.exit(1) - - # Remove the configuration - if remove_model_config(model_name): - console.print(f"🗑️ [green]Model configuration removed:[/green] {model_name}") - else: - console.print(f"❌ [red]Failed to remove model configuration:[/red] {model_name}") - ctx.exit(1) - return - - config = load_full_config() - models = config.get("models", {}) - - if not models: - console.print("[yellow]No model configurations found.[/yellow]") - console.print("[dim]Use 'openarc add --help' to see how to save configurations.[/dim]") - return - - console.print(f"📋 [blue]Saved Model Configurations ({len(models)}):[/blue]\n") - - for model_name, model_config in models.items(): - # Create a table for each model configuration - config_table = Table(show_header=False, box=None, pad_edge=False) - - - config_table.add_row("model_name", f"[cyan]{model_name}[/cyan]") - config_table.add_row("device", f"[blue]{model_config.get('device')}[/blue]") - config_table.add_row("engine", f"[green]{model_config.get('engine')}[/green]") - config_table.add_row("model_type", f"[magenta]{model_config.get('model_type')}[/magenta]") - - - rtc = model_config.get('runtime_config', {}) - if rtc: - config_table.add_row("", "") - config_table.add_row(Text("runtime_config", style="bold underline yellow"), "") - for key, value in rtc.items(): - config_table.add_row(f" {key}", f"[dim]{value}[/dim]") - - panel = Panel( - config_table, - border_style="green" - ) - console.print(panel) - - console.print("\n[dim]To load a saved configuration: openarc load --model-name [/dim]") - console.print("[dim]To remove a configuration: openarc list --remove --model-name [/dim]") - @cli.command() @click.pass_context def status(ctx): """- GET Status of loaded models.""" cli_instance = OpenArcCLI() - url = f"{cli_instance.base_url}/openarc/status" + url = f"{cli_instance.base_url}/optimum/status" try: console.print("📊 [blue]Getting model status...[/blue]") @@ -363,39 +415,66 @@ def status(ctx): if response.status_code == 200: result = response.json() - models = result.get("models", []) - total_models = result.get("total_loaded_models", 0) + loaded_models = result.get("loaded_models", {}) + total_models = result.get("total_models_loaded", 0) - if not models: + if not loaded_models: console.print("[yellow]No models currently loaded.[/yellow]") else: - # Create a table for all models - status_table = Table(title=f"📊 Loaded Models ({total_models})") - status_table.add_column("model_name", style="cyan", width=20) - status_table.add_column("device", style="blue", width=10) - status_table.add_column("model_type", style="magenta", width=15) - status_table.add_column("engine", style="green", width=10) - status_table.add_column("status", style="yellow", width=10) - status_table.add_column("time_loaded", style="dim", width=20) - - for model in models: - model_name = model.get("model_name") - device = model.get("device") - model_type = model.get("model_type") - engine = model.get("engine") - status = model.get("status") - time_loaded = model.get("time_loaded") - - status_table.add_row( - model_name, - device, - model_type, - engine, - status, - time_loaded + for model_id, model_info in loaded_models.items(): + device = model_info.get("device", "unknown") + status_val = model_info.get("status", "unknown") + metadata = model_info.get("model_metadata", {}) + model_type = metadata.get("architecture_type", "unknown") + use_cache = str(metadata.get("use_cache", "-")) + dynamic_shapes = str(metadata.get("dynamic_shapes", "-")) + pad_token_id = str(metadata.get("pad_token_id", "-")) + eos_token_id = str(metadata.get("eos_token_id", "-")) + bos_token_id = str(metadata.get("bos_token_id", "-")) + num_streams = str(metadata.get("NUM_STREAMS", "-")) + precision = str(metadata.get("INFERENCE_PRECISION_HINT", "-")) + perf_hint = str(metadata.get("PERFORMANCE_HINT", "-")) + inf_num_threads = str(metadata.get("INFERENCE_NUM_THREADS", "-")) + enable_ht = str(metadata.get("ENABLE_HYPER_THREADING", "-")) + sched_core_type = str(metadata.get("SCHEDULING_CORE_TYPE", "-")) + + model_table = Table(show_header=False, box=None, pad_edge=False) + + model_table.add_row(Text("Model Info", style="bold underline cyan"), "") + model_table.add_row("Model ID", f"[cyan]{model_id}[/cyan]") + model_table.add_row("Type", f"[yellow]{model_type}[/yellow]") + model_table.add_row("Status", f"[green]{status_val}[/green]" if status_val == "loaded" else f"[red]{status_val}[/red]") + model_table.add_row("", "") + + # Device Info Section + model_table.add_row(Text("Device Info", style="bold underline magenta"), "") + model_table.add_row("Device", f"[magenta]{device}[/magenta]") + model_table.add_row("Use Cache", use_cache) + model_table.add_row("Dynamic Shapes", dynamic_shapes) + model_table.add_row("", "") + + # Token IDs Section + model_table.add_row(Text("Token IDs", style="bold underline yellow"), "") + model_table.add_row("Pad Token ID", pad_token_id) + model_table.add_row("EOS Token ID", eos_token_id) + model_table.add_row("BOS Token ID", bos_token_id) + model_table.add_row("", "") + + # Performance Settings Section + model_table.add_row(Text("Performance Settings", style="bold underline green"), "") + model_table.add_row("NUM_STREAMS", num_streams) + model_table.add_row("INFERENCE_PRECISION_HINT", precision) + model_table.add_row("PERFORMANCE_HINT", perf_hint) + model_table.add_row("INFERENCE_NUM_THREADS", inf_num_threads) + model_table.add_row("ENABLE_HYPER_THREADING", enable_ht) + model_table.add_row("SCHEDULING_CORE_TYPE", sched_core_type) + + panel = Panel( + model_table, + title=f"🧩 Model: [bold]{model_id}[/bold]", + border_style="blue" if status_val == "loaded" else "red" ) - - console.print(status_table) + console.print(panel) console.print(f"\n[green]Total models loaded: {total_models}[/green]") else: @@ -413,11 +492,11 @@ def tool(ctx): """- Utility scripts.""" pass -@tool.command('device-props') +@tool.command('device-properties') @click.pass_context def device_properties(ctx): """ - - Query OpenVINO device properties for all available devices. + - Query device properties for all devices. """ try: @@ -462,22 +541,22 @@ def device_detect(ctx): diagnostic = DeviceDiagnosticQuery() available_devices = diagnostic.get_available_devices() - table = Table() - table.add_column("Index", style="cyan", width=2) + table = Table(title="📋 Available Devices") + table.add_column("#", style="cyan", width=4) table.add_column("Device", style="green") if not available_devices: - console.print("❌ [red] Sanity test failed: No OpenVINO devices found! Maybe check your drivers?[/red]") + console.print("❌ [red]No OpenVINO devices found![/red]") ctx.exit(1) for i, device in enumerate(available_devices, 1): table.add_row(str(i), device) console.print(table) - console.print(f"\n✅ [green] Sanity test passed: found {len(available_devices)} device(s)[/green]") + console.print(f"\n✅ [green]OpenVINO runtime found {len(available_devices)} device(s)[/green]") except Exception as e: - console.print(f"❌ [red]Sanity test failed: No OpenVINO devices found! Maybe check your drivers?[/red] {e}") + console.print(f"❌ [red]Error during device diagnosis:[/red] {e}") ctx.exit(1) @cli.group() @@ -488,10 +567,12 @@ def serve(): pass @serve.command("start") + @click.option("--host", type=str, default="0.0.0.0", show_default=True, help=""" - Host to bind the server to """) + @click.option("--openarc-port", type=int, default=8000, @@ -502,7 +583,7 @@ def serve(): def start(host, openarc_port): """ - - 'start' reads --host and --openarc-port from config or defaults to 0.0.0.0:8000 + - 'start' reads --host and --openarc-port and saves them to the config file. Then it starts the server and will read """ # Save server configuration for other CLI commands to use save_cli_config(host, openarc_port) @@ -514,5 +595,3 @@ def start(host, openarc_port): if __name__ == "__main__": cli() - - From b547cac541356257e77c85566c03f6b10eb58423 Mon Sep 17 00:00:00 2001 From: Emerson Tatelbaum <164939384+SearchSavior@users.noreply.github.com> Date: Mon, 20 Oct 2025 21:20:20 -0400 Subject: [PATCH 5/5] Refactor OpenArc CLI for JSON config and model handling copy and pasted cli from main 1.0.6 into fork. essentially I've used a screwdriver as a chisel --- src/cli/openarc_cli.py | 665 ++++++++++++++++++++--------------------- 1 file changed, 321 insertions(+), 344 deletions(-) diff --git a/src/cli/openarc_cli.py b/src/cli/openarc_cli.py index 9938cc0..222abd6 100644 --- a/src/cli/openarc_cli.py +++ b/src/cli/openarc_cli.py @@ -2,67 +2,102 @@ """ OpenArc CLI Tool - Command-line interface for OpenArc server operations. """ -import json import os -import sys -import traceback -import requests -import yaml +import json from pathlib import Path +import requests import rich_click as click -from rich import print as rprint from rich.console import Console from rich.panel import Panel from rich.table import Table from rich.text import Text -from api.launcher import start_server -from cli.device_query import DeviceDataQuery, DeviceDiagnosticQuery +from src.server.launch import start_server +from src.cli.device_query import DeviceDataQuery, DeviceDiagnosticQuery click.rich_click.STYLE_OPTIONS_TABLE_LEADING = 1 click.rich_click.STYLE_OPTIONS_TABLE_BOX = "SIMPLE" - -# click.rich_click.STYLE_OPTIONS_TABLE_ROW_STYLES = ["bold", ""] click.rich_click.STYLE_COMMANDS_TABLE_SHOW_LINES = True -# click.rich_click.STYLE_COMMANDS_TABLE_PAD_EDGE = True -#click.rich_click.STYLE_COMMANDS_TABLE_BOX = "DOUBLE" click.rich_click.STYLE_COMMANDS_TABLE_BORDER_STYLE = "red" click.rich_click.STYLE_COMMANDS_TABLE_ROW_STYLES = ["magenta", "yellow", "cyan", "green"] console = Console() -# Configuration handling - use project root directory -PROJECT_ROOT = Path(__file__).parent -CONFIG_FILE = PROJECT_ROOT / "openarc-cli-config.yaml" +PROJECT_ROOT = Path(__file__).parent.parent.parent +CONFIG_FILE = PROJECT_ROOT / "openarc-config.json" def save_cli_config(host: str, port: int): - """Save server configuration to YAML config file.""" - config = { + """Save server configuration to JSON config file.""" + config = load_full_config() # Load existing config first + config.update({ "server": { "host": host, "port": port }, "created_by": "openarc-cli", "version": "1.0" - } + }) with open(CONFIG_FILE, "w") as f: - yaml.dump(config, f, default_flow_style=False, indent=2) + json.dump(config, f, indent=2) console.print(f"📝 [dim]Configuration saved to: {CONFIG_FILE}[/dim]") -def load_cli_config(): - """Load server configuration from YAML config file.""" +def save_model_config(model_name: str, load_config: dict): + """Save model configuration to JSON config file.""" + config = load_full_config() + + if "models" not in config: + config["models"] = {} + + config["models"][model_name] = load_config + + with open(CONFIG_FILE, "w") as f: + json.dump(config, f, indent=2) + + console.print(f"💾 [green]Model configuration saved:[/green] {model_name}") + +def load_full_config(): + """Load full configuration from JSON config file.""" if CONFIG_FILE.exists(): try: with open(CONFIG_FILE, "r") as f: - config = yaml.safe_load(f) - if config and "server" in config: - return config["server"] - except (yaml.YAMLError, FileNotFoundError, KeyError): + config = json.load(f) + return config if config else {} + except (json.JSONDecodeError, FileNotFoundError): console.print(f"[yellow]Warning: Could not read config file {CONFIG_FILE}[/yellow]") + return {} + +def get_model_config(model_name: str): + """Get model configuration by name.""" + config = load_full_config() + models = config.get("models", {}) + return models.get(model_name) + +def remove_model_config(model_name: str): + """Remove model configuration by name.""" + config = load_full_config() + models = config.get("models", {}) + + if model_name not in models: + return False + + del models[model_name] + config["models"] = models + + with open(CONFIG_FILE, "w") as f: + json.dump(config, f, indent=2) + + return True + +def load_cli_config(): + """Load server configuration from YAML config file.""" + config = load_full_config() + if config and "server" in config: + return config["server"] + return {"host": "localhost", "port": 8000} # defaults class OpenArcCLI: @@ -93,7 +128,7 @@ def get_help(self, ctx): art.append(" | | \n", style="white") art.append(" |_| \n", style="white") art.append(" \n", style="white") - art.append("The CLI application \n", style="white") + art.append(" The CLI application \n", style="white") console.print(art) return super().get_help(ctx) @@ -103,10 +138,12 @@ def cli(): Use this application to interface with the OpenArc server. Features: - + • Start the OpenArc server. • Load models into the OpenArc server. + + • List models from saved configurations. • Check the status of loaded models. @@ -121,277 +158,149 @@ def cli(): """ @cli.command() - -@click.option('--model', - required=True, - help=""" - - Absolute path to model. - - - The dir name which stores the openvino model files is used in the API to identify the model. - - - The dir name is the same as the model name. - """) - -@click.option('--model-type', - type=click.Choice(['TEXT', 'VISION']), - required=True, - default='TEXT', - help=""" - - - Type of model. - - """) - -@click.option('--device', - required=True, - default='CPU', - help=""" - - Device: CPU, GPU.0, GPU.1, GPU.2, GPU.3, GPU.4, AUTO - - - GPU.0 is the first GPU, GPU.1 is the second GPU, etc. - - - AUTO will automatically select the best device. - """) - -@click.option('--use-cache/--no-use-cache', - required=True, - default=True, - help=""" - - Use cache for stateful models. - - - Edge cases may require disabling cache, probably based on model architecture. - - """) - -@click.option('--dynamic-shapes/--no-dynamic-shapes', - required=True, - default=True, - help=""" - - Use dynamic shapes. - - - If false, the model will be loaded with static shapes. - - - OpenVINO IR usually use dynamic shapes but for NPU it must be disabled. - - """) - -@click.option('--pad-token-id', - required=False, - type=int, - help=""" - - (pad)pad token ID - - - AutoTokenizers usually infers this from config.json but it's useful to configure explicitly. - - """ - ) - -@click.option('--eos-token-id', - required=False, - type=int, - help=""" - - (eos)end of sequence token id - - - AutoTokenizers usually infers this from config.json but it's useful to configure explicitly. - - - When the eos token is set to the *incorrect* token the model will continue to generate tokens. - - - Pairing this with a target max_length is a good way to test performance. - """ - - ) - -@click.option('--bos-token-id', - required=False, - type=int, - help='beginning of sequence token ID') - -@click.option('--num-streams', - required=False, - type=int, - default=None, - show_default=True, - help='Number of inference streams') - -@click.option('--performance-hint', - required=False, - type=click.Choice(['LATENCY', 'THROUGHPUT', 'CUMULATIVE_THROUGHPUT']), - default=None, - show_default=True, - help=""" - --- - - - High level performance hint. - - - Usually I use 'LATENCY' which locks to one CPU or one CPU socket. - - - It's best to use the documentation for this. - - https://docs.openvino.ai/2025/openvino-workflow/running-inference/optimize-inference/high-level-performance-hints.html - - --- - - """ - ) - -@click.option('--inference-precision-hint', - required=False, - type=click.Choice(['fp32', 'f16', 'bf16', 'dynamic']), - default=None, - show_default=True, - help=""" - --- - - - Controls precision during inference, at inference time. - - - Works on CPU and GPU. - - - Target device specific features. - - - Ex:'bf16' is probably best on CPUs which support AMX. - - """ - ) - -@click.option('--enable-hyper-threading', - required=False, - type=bool, - default=None, - help=""" - --- - - - CPU ONLY --> Cannot be used with GPU. - - - Enable hyper-threading - - - This is only relevant for Intel CPUs with hyperthreading i.e, two virtual cores per physical core. - - """ - ) - -@click.option('--inference-num-threads', - required=False, - type=int, - default=None, - show_default=True, - help=""" - --- - - - CPU ONLY --> Cannot be used with GPU. - - - Number of inference threads - - - More threads usually means faster inference. - - - Therefore this can be used to constrain the number of threads used for inference. - """ - ) - -@click.option('--scheduling-core-type', - required=False, - type=click.Choice(['ANY_CORE', 'PCORE_ONLY', 'ECORE_ONLY']), - default=None, - show_default=True, - help=""" - --- - - - Advanced option to target p-cores or e-cores on CPUs which support it. - - - CPU ONLY --> Cannot be used with GPU. - - - [ANY_CORE]: Any core, so default for 'older' Intel CPUs. Default for most chips but no need to set. - - - [PCORE_ONLY]: Only run inference on threads which are performance cores. - - - [ECORE_ONLY]: Only run inference on threads which are efficency cores. - --- - """ - ) - +@click.option('--model-name', '--mn', + required=True, + help='Public facing name of the model.') +@click.option('--model-path', '--m', + required=True, + help='Path to OpenVINO IR converted model.') +@click.option('--engine', '--en', + type=click.Choice(['ovgenai', 'openvino', 'optimum']), + required=True, + help='Engine used to load the model (ovgenai, openvino, optimum)') +@click.option('--model-type', '--mt', + type=click.Choice(['llm', 'vlm', 'whisper', 'kokoro', 'emb']), + required=True, + help='Model type (llm, vlm, whisper, kokoro, emb)') +@click.option('--device', '--d', + required=True, + help='Device(s) to load the model on.') +@click.option("--runtime-config", "--rtc", + default=None, + help='OpenVINO runtime configuration (e.g., performance hints). These are checked serverside at runtime.') +@click.option('--vlm-type', '--vt', + type=click.Choice(['internvl2', 'llava15', 'llavanext', 'minicpmv26', 'phi3vision', 'phi4mm', 'qwen2vl', 'qwen25vl', 'gemma3']), + required=False, + default=None, + help='Vision model type. Used to map correct vision tokens.') @click.pass_context -def load(ctx, model, model_type, device, use_cache, dynamic_shapes, - pad_token_id, eos_token_id, bos_token_id, num_streams, performance_hint, - inference_precision_hint, enable_hyper_threading, inference_num_threads, - scheduling_core_type): - """- Load a model.""" - cli_instance = OpenArcCLI() +def add(ctx, model_path, model_name, engine, model_type, device, runtime_config, vlm_type): + """- Add a model configuration to the config file.""" - # Build load_config from arguments + # Build and save configuration load_config = { - "id_model": model, - "architecture_type": model_type, - "use_cache": use_cache, + "model_name": model_name, + "model_path": model_path, + "model_type": model_type, + "engine": engine, "device": device, - "dynamic_shapes": dynamic_shapes, + "runtime_config": runtime_config if runtime_config else {}, + "vlm_type": vlm_type if vlm_type else None } - # Add optional token IDs if provided - if pad_token_id is not None: - load_config["pad_token_id"] = pad_token_id - if eos_token_id is not None: - load_config["eos_token_id"] = eos_token_id - if bos_token_id is not None: - load_config["bos_token_id"] = bos_token_id - - # Build ov_config from arguments - ov_config = {} - if performance_hint is not None: - ov_config["PERFORMANCE_HINT"] = performance_hint - if inference_precision_hint is not None: - ov_config["INFERENCE_PRECISION_HINT"] = inference_precision_hint - if enable_hyper_threading is not None: - ov_config["ENABLE_HYPER_THREADING"] = enable_hyper_threading - if inference_num_threads not in (None, False): - ov_config["INFERENCE_NUM_THREADS"] = inference_num_threads - if scheduling_core_type is not None: - ov_config["SCHEDULING_CORE_TYPE"] = scheduling_core_type - if num_streams is not None: - ov_config["NUM_STREAMS"] = num_streams - - # Prepare payload - payload = { - "load_config": load_config, - "ov_config": ov_config if ov_config else {} - } + save_model_config(model_name, load_config) + console.print(f"✅ [green]Saved configuration for:[/green] {model_name}") + console.print(f"[dim]Use 'openarc load {model_name}' to load this model.[/dim]") + +@cli.command() +@click.argument('model_names', nargs=-1, required=True) +@click.pass_context +def load(ctx, model_names): + """- Load one or more models from saved configuration. - # Make API request - url = f"{cli_instance.base_url}/optimum/model/load" + Examples: + openarc load model1 + openarc load Dolphin-X1 kokoro whisper + """ + cli_instance = OpenArcCLI() - try: - console.print(f"🚀 [blue]Loading model:[/blue] {model}") - response = requests.post(url, json=payload, headers=cli_instance.get_headers()) - - if response.status_code == 200: - console.print("✅ [green]Model loaded successfully![/green]") + model_names = list(model_names) + + # Track results + successful_loads = [] + failed_loads = [] + + # Start loading queue + if len(model_names) > 1: + console.print(f"🚀 [blue]Starting load queue...[/blue] ({len(model_names)} models)\n") + + # Load each model + for idx, name in enumerate(model_names, 1): + # Show progress indicator for multiple models + if len(model_names) > 1: + console.print(f"[cyan]({idx}/{len(model_names)})[/cyan] [blue]loading[/blue] {name}") else: - console.print(f"❌ [red]Error loading model: {response.status_code}[/red]") - console.print(f"[red]Response:[/red] {response.text}") - ctx.exit(1) + console.print(f"[blue]loading[/blue] {name}") + + # Get saved configuration + saved_config = get_model_config(name) + + if not saved_config: + console.print(f"❌ [red]Model configuration not found:[/red] {name}") + console.print("[yellow]Tip: Use 'openarc list' to see saved configurations.[/yellow]\n") + failed_loads.append(name) + continue + + load_config = saved_config.copy() + + # Make API request to load the model + url = f"{cli_instance.base_url}/openarc/load" + + try: + console.print("[cyan]...working[/cyan]") + response = requests.post(url, json=load_config, headers=cli_instance.get_headers()) - except requests.exceptions.RequestException as e: - console.print(f"❌ [red]Request failed:[/red] {e}") + if response.status_code == 200: + console.print(f"✅ [green]{name} loaded![/green]\n") + successful_loads.append(name) + else: + console.print(f"❌ [red]error: {response.status_code}[/red]") + console.print(f"[red]Response:[/red] {response.text}\n") + failed_loads.append(name) + + except requests.exceptions.RequestException as e: + console.print(f"❌ [red]Request failed:[/red] {e}\n") + failed_loads.append(name) + + # Summary + console.print("─" * 60) + if successful_loads and not failed_loads: + console.print(f"🎉 [green]All models loaded![/green] ({len(successful_loads)}/{len(model_names)})") + elif successful_loads and failed_loads: + console.print(f"⚠️ [yellow]Partial success:[/yellow] {len(successful_loads)}/{len(model_names)} models loaded") + console.print(f" [green]✓ Loaded:[/green] {', '.join(successful_loads)}") + console.print(f" [red]✗ Failed:[/red] {', '.join(failed_loads)}") + else: + console.print(f"❌ [red]All models failed to load![/red] (0/{len(model_names)})") + console.print(f" [red]✗ Failed:[/red] {', '.join(failed_loads)}") + + console.print("[dim]Use 'openarc status' to see loaded models.[/dim]") + + # Exit with error code if any loads failed + if failed_loads: ctx.exit(1) @cli.command() -@click.option('--model-id', required=True, help='Model ID to unload') +@click.option('--model-name', '--mn', required=True, help='Model name to unload') @click.pass_context -def unload(ctx, model_id): +def unload(ctx, model_name): """ - - DELETE a model from memory. + - POST Delete a model from registry and unload from memory. """ cli_instance = OpenArcCLI() - # Make API request - url = f"{cli_instance.base_url}/optimum/model/unload" - params = {"model_id": model_id} + url = f"{cli_instance.base_url}/openarc/unload" + payload = {"model_name": model_name} try: - console.print(f"🗑️ [blue]Unloading model:[/blue] {model_id}") - response = requests.delete(url, params=params, headers=cli_instance.get_headers()) + console.print(f"🗑️ [blue]Unloading model:[/blue] {model_name}") + response = requests.post(url, json=payload, headers=cli_instance.get_headers()) if response.status_code == 200: result = response.json() - console.print(f"✅ [green]{result['message']}[/green]") + # Handle different possible response formats + message = result.get('message', f"Model '{model_name}' unloaded successfully") + console.print(f"✅ [green]{message}[/green]") else: console.print(f"❌ [red]Error unloading model: {response.status_code}[/red]") console.print(f"[red]Response:[/red] {response.text}") @@ -401,13 +310,79 @@ def unload(ctx, model_id): console.print(f"❌ [red]Request failed:[/red] {e}") ctx.exit(1) +@cli.command() +@click.option('--model-name','--mn', help='Model name to remove (used with --rm).') +@click.option('--rm', is_flag=True, help='Remove a model configuration.') +@click.pass_context +def list(ctx, rm, model_name): + """- List saved model configurations. + + - Remove a model configuration.""" + if rm: + if not model_name: + console.print("❌ [red]Error:[/red] --model-name is required when using --rm") + + ctx.exit(1) + + # Check if model exists before trying to remove + existing_config = get_model_config(model_name) + if not existing_config: + console.print(f"❌ {model_name}[red] not found:[/red]") + console.print("[yellow]Use 'openarc list' to see available configurations.[/yellow]") + ctx.exit(1) + + # Remove the configuration + if remove_model_config(model_name): + console.print(f"🗑️ [green]Model configuration removed:[/green] {model_name}") + else: + console.print(f"❌ [red]Failed to remove model configuration:[/red] {model_name}") + ctx.exit(1) + return + + config = load_full_config() + models = config.get("models", {}) + + if not models: + console.print("[yellow]No model configurations found.[/yellow]") + console.print("[dim]Use 'openarc add --help' to see how to save configurations.[/dim]") + return + + console.print(f"📋 [blue]Saved Model Configurations ({len(models)}):[/blue]\n") + + for model_name, model_config in models.items(): + # Create a table for each model configuration + config_table = Table(show_header=False, box=None, pad_edge=False) + + + config_table.add_row("model_name", f"[cyan]{model_name}[/cyan]") + config_table.add_row("device", f"[blue]{model_config.get('device')}[/blue]") + config_table.add_row("engine", f"[green]{model_config.get('engine')}[/green]") + config_table.add_row("model_type", f"[magenta]{model_config.get('model_type')}[/magenta]") + + + rtc = model_config.get('runtime_config', {}) + if rtc: + config_table.add_row("", "") + config_table.add_row(Text("runtime_config", style="bold underline yellow"), "") + for key, value in rtc.items(): + config_table.add_row(f" {key}", f"[dim]{value}[/dim]") + + panel = Panel( + config_table, + border_style="green" + ) + console.print(panel) + + console.print("\n[dim]To load saved configurations: openarc load [model_name2 ...][/dim]") + console.print("[dim]To remove a configuration: openarc list --remove --model-name [/dim]") + @cli.command() @click.pass_context def status(ctx): """- GET Status of loaded models.""" cli_instance = OpenArcCLI() - url = f"{cli_instance.base_url}/optimum/status" + url = f"{cli_instance.base_url}/openarc/status" try: console.print("📊 [blue]Getting model status...[/blue]") @@ -415,66 +390,39 @@ def status(ctx): if response.status_code == 200: result = response.json() - loaded_models = result.get("loaded_models", {}) - total_models = result.get("total_models_loaded", 0) + models = result.get("models", []) + total_models = result.get("total_loaded_models", 0) - if not loaded_models: + if not models: console.print("[yellow]No models currently loaded.[/yellow]") else: - for model_id, model_info in loaded_models.items(): - device = model_info.get("device", "unknown") - status_val = model_info.get("status", "unknown") - metadata = model_info.get("model_metadata", {}) - model_type = metadata.get("architecture_type", "unknown") - use_cache = str(metadata.get("use_cache", "-")) - dynamic_shapes = str(metadata.get("dynamic_shapes", "-")) - pad_token_id = str(metadata.get("pad_token_id", "-")) - eos_token_id = str(metadata.get("eos_token_id", "-")) - bos_token_id = str(metadata.get("bos_token_id", "-")) - num_streams = str(metadata.get("NUM_STREAMS", "-")) - precision = str(metadata.get("INFERENCE_PRECISION_HINT", "-")) - perf_hint = str(metadata.get("PERFORMANCE_HINT", "-")) - inf_num_threads = str(metadata.get("INFERENCE_NUM_THREADS", "-")) - enable_ht = str(metadata.get("ENABLE_HYPER_THREADING", "-")) - sched_core_type = str(metadata.get("SCHEDULING_CORE_TYPE", "-")) - - model_table = Table(show_header=False, box=None, pad_edge=False) - - model_table.add_row(Text("Model Info", style="bold underline cyan"), "") - model_table.add_row("Model ID", f"[cyan]{model_id}[/cyan]") - model_table.add_row("Type", f"[yellow]{model_type}[/yellow]") - model_table.add_row("Status", f"[green]{status_val}[/green]" if status_val == "loaded" else f"[red]{status_val}[/red]") - model_table.add_row("", "") - - # Device Info Section - model_table.add_row(Text("Device Info", style="bold underline magenta"), "") - model_table.add_row("Device", f"[magenta]{device}[/magenta]") - model_table.add_row("Use Cache", use_cache) - model_table.add_row("Dynamic Shapes", dynamic_shapes) - model_table.add_row("", "") - - # Token IDs Section - model_table.add_row(Text("Token IDs", style="bold underline yellow"), "") - model_table.add_row("Pad Token ID", pad_token_id) - model_table.add_row("EOS Token ID", eos_token_id) - model_table.add_row("BOS Token ID", bos_token_id) - model_table.add_row("", "") - - # Performance Settings Section - model_table.add_row(Text("Performance Settings", style="bold underline green"), "") - model_table.add_row("NUM_STREAMS", num_streams) - model_table.add_row("INFERENCE_PRECISION_HINT", precision) - model_table.add_row("PERFORMANCE_HINT", perf_hint) - model_table.add_row("INFERENCE_NUM_THREADS", inf_num_threads) - model_table.add_row("ENABLE_HYPER_THREADING", enable_ht) - model_table.add_row("SCHEDULING_CORE_TYPE", sched_core_type) - - panel = Panel( - model_table, - title=f"🧩 Model: [bold]{model_id}[/bold]", - border_style="blue" if status_val == "loaded" else "red" + # Create a table for all models + status_table = Table(title=f"📊 Loaded Models ({total_models})") + status_table.add_column("model_name", style="cyan", width=20) + status_table.add_column("device", style="blue", width=10) + status_table.add_column("model_type", style="magenta", width=15) + status_table.add_column("engine", style="green", width=10) + status_table.add_column("status", style="yellow", width=10) + status_table.add_column("time_loaded", style="dim", width=20) + + for model in models: + model_name = model.get("model_name") + device = model.get("device") + model_type = model.get("model_type") + engine = model.get("engine") + status = model.get("status") + time_loaded = model.get("time_loaded") + + status_table.add_row( + model_name, + device, + model_type, + engine, + status, + time_loaded ) - console.print(panel) + + console.print(status_table) console.print(f"\n[green]Total models loaded: {total_models}[/green]") else: @@ -492,11 +440,11 @@ def tool(ctx): """- Utility scripts.""" pass -@tool.command('device-properties') +@tool.command('device-props') @click.pass_context def device_properties(ctx): """ - - Query device properties for all devices. + - Query OpenVINO device properties for all available devices. """ try: @@ -541,22 +489,22 @@ def device_detect(ctx): diagnostic = DeviceDiagnosticQuery() available_devices = diagnostic.get_available_devices() - table = Table(title="📋 Available Devices") - table.add_column("#", style="cyan", width=4) + table = Table() + table.add_column("Index", style="cyan", width=2) table.add_column("Device", style="green") if not available_devices: - console.print("❌ [red]No OpenVINO devices found![/red]") + console.print("❌ [red] Sanity test failed: No OpenVINO devices found! Maybe check your drivers?[/red]") ctx.exit(1) for i, device in enumerate(available_devices, 1): table.add_row(str(i), device) console.print(table) - console.print(f"\n✅ [green]OpenVINO runtime found {len(available_devices)} device(s)[/green]") + console.print(f"\n✅ [green] Sanity test passed: found {len(available_devices)} device(s)[/green]") except Exception as e: - console.print(f"❌ [red]Error during device diagnosis:[/red] {e}") + console.print(f"❌ [red]Sanity test failed: No OpenVINO devices found! Maybe check your drivers?[/red] {e}") ctx.exit(1) @cli.group() @@ -567,12 +515,10 @@ def serve(): pass @serve.command("start") - @click.option("--host", type=str, default="0.0.0.0", show_default=True, help=""" - Host to bind the server to """) - @click.option("--openarc-port", type=int, default=8000, @@ -580,14 +526,43 @@ def serve(): help=""" - Port to bind the server to """) - -def start(host, openarc_port): +@click.option("--load-models", "--lm", + required=False, + help="Load models on startup. Specify once followed by space-separated model names.") +@click.argument('startup_models', nargs=-1, required=False) +def start(host, openarc_port, load_models, startup_models): """ - - 'start' reads --host and --openarc-port and saves them to the config file. Then it starts the server and will read + - 'start' reads --host and --openarc-port from config or defaults to 0.0.0.0:8000 + + Examples: + openarc serve start + openarc serve start --load-models model1 model2 + openarc serve start --lm Dolphin-X1 kokoro whisper """ # Save server configuration for other CLI commands to use save_cli_config(host, openarc_port) + # Handle startup models + models_to_load = [] + if load_models: + models_to_load.append(load_models) + if startup_models: + models_to_load.extend(startup_models) + + if models_to_load: + config = load_full_config() + saved_models = config.get("models", {}) + missing = [m for m in models_to_load if m not in saved_models] + + if missing: + console.print("⚠️ [yellow]Warning: Models not in config (will be skipped):[/yellow]") + for m in missing: + console.print(f" • {m}") + console.print("[dim]Use 'openarc list' to see saved configurations.[/dim]\n") + + os.environ["OPENARC_STARTUP_MODELS"] = ",".join(models_to_load) + console.print(f"📋 [blue]Models to load on startup:[/blue] {', '.join(models_to_load)}\n") + console.print(f"🚀 [green]Starting OpenArc server on {host}:{openarc_port}[/green]") start_server(host=host, openarc_port=openarc_port) @@ -595,3 +570,5 @@ def start(host, openarc_port): if __name__ == "__main__": cli() + +