From 1af5eb73c68b70a717c033cbcc7adfd297ac3483 Mon Sep 17 00:00:00 2001
From: mwrothbe <michaelrothberg@hotmail.com>
Date: Mon, 13 Oct 2025 13:07:08 -0700
Subject: [PATCH 1/5] added reranker api

---
 src/cli/openarc_cli.py           |   4 +-
 src/engine/optimum/optimum_rr.py | 114 +++++++++++++++++++++++++++++++
 src/server/launch.py             |   1 +
 src/server/main.py               |  68 +++++++++++++++++-
 src/server/model_registry.py     |   5 +-
 src/server/models/optimum.py     |  55 ++++++++++++---
 src/server/worker_registry.py    |  94 ++++++++++++++++++++++++-
 7 files changed, 325 insertions(+), 16 deletions(-)
 create mode 100644 src/engine/optimum/optimum_rr.py
diff --git a/src/cli/openarc_cli.py b/src/cli/openarc_cli.py
index 98ab976..84d3fef 100644
--- a/src/cli/openarc_cli.py
+++ b/src/cli/openarc_cli.py
@@ -169,9 +169,9 @@ def cli():
     required=True,
     help='Engine used to load the model (ovgenai, openvino, optimum)')
 @click.option('--model-type', '--mt',
-    type=click.Choice(['llm', 'vlm', 'whisper', 'kokoro', 'emb']),
+    type=click.Choice(['llm', 'vlm', 'whisper', 'kokoro', 'emb', 'rerank']),
     required=True,
-    help='Model type (llm, vlm, whisper, kokoro, emb)')
+    help='Model type (llm, vlm, whisper, kokoro, emb, rerank)')
 @click.option('--device', '--d',
     required=True,
     help='Device(s) to load the model on.')
diff --git a/src/engine/optimum/optimum_rr.py b/src/engine/optimum/optimum_rr.py
new file mode 100644
index 0000000..3718f5c
--- /dev/null
+++ b/src/engine/optimum/optimum_rr.py
@@ -0,0 +1,114 @@
+
+
+import asyncio
+import gc
+import logging
+from typing import Any, AsyncIterator, Dict, List, Union
+
+import torch
+import torch.nn.functional as F
+from torch import Tensor
+
+from transformers import AutoTokenizer
+from optimum.intel import OVModelForCausalLM
+
+from src.server.models.optimum import RerankerConfig
+
+from typing import Any, AsyncIterator, Dict
+
+from src.server.model_registry import ModelLoadConfig, ModelRegistry
+
+class Optimum_RR:
+    
+    def __init__(self, load_config: ModelLoadConfig):
+        self.model_path = None
+        self.encoder_tokenizer = None
+        self.load_config = load_config
+
+    def compute_logits(self, inputs, **kwargs):
+        batch_scores = self.model(**inputs).logits[:, -1, :]
+        true_vector = batch_scores[:, self.token_true_id]
+        false_vector = batch_scores[:, self.token_false_id]
+        batch_scores = torch.stack([false_vector, true_vector], dim=1)
+        batch_scores = torch.nn.functional.log_softmax(batch_scores, dim=1)
+        scores = batch_scores[:, 1].exp().tolist()
+        return scores
+    
+    def format_instruction(self, instruction, query, doc):
+        if instruction is None:
+            instruction = "Given a search query, retrieve relevant passages that answer the query"
+        output = "<Instruct>: {instruction}\n<Query>: {query}\n<Document>: {doc}".format(instruction=instruction, query=query, doc=doc)
+        return output
+
+    async def generate_rerankings(self, rr_config: RerankerConfig) -> AsyncIterator[Union[Dict[str, Any], str]]:
+        prefix_tokens = self.tokenizer.encode(rr_config.prefix, add_special_tokens=False)
+        suffix_tokens = self.tokenizer.encode(rr_config.suffix, add_special_tokens=False)
+        self.max_length = rr_config.max_length
+        pairs = [self.format_instruction(rr_config.task, rr_config.query, doc) for doc in rr_config.documents]
+        print(pairs)
+        # Tokenizer args are currently hard-coded. If they are model-independent, then that is fine; otherwise,
+        # implement the rr_config PreTrainedTokenizerConfig args.
+        max_length = 8192
+        inputs = self.tokenizer(
+            pairs, padding=False, truncation="longest_first", return_attention_mask=False, max_length=max_length - len(prefix_tokens) - len(suffix_tokens)
+        )
+
+        for i, ele in enumerate(inputs["input_ids"]):
+            inputs["input_ids"][i] = prefix_tokens + ele + suffix_tokens
+
+        # Padding args are currently hard-coded. If they are model-independent, then that is fine; otherwise,
+        # implement the rr_config PreTrainedTokenizerConfig args.
+        inputs = self.tokenizer.pad(inputs, padding=True, return_tensors="pt", max_length=self.max_length)
+        for key in inputs:
+            inputs[key] = inputs[key].to(self.model.device)
+
+        scores = self.compute_logits(inputs)
+
+        ranked_documents = [{"doc":doc, "score":score} for score, doc in sorted(zip(scores,  rr_config.documents), reverse=True)]
+
+        yield ranked_documents
+
+    #not implemented
+    def collect_metrics(self, rr_config: RerankerConfig, perf_metrics) -> Dict[str, Any]:
+        pass
+
+    def load_model(self, loader: ModelLoadConfig):
+        """Load model using a ModelLoadConfig configuration and cache the tokenizer.
+
+        Args:
+            loader: ModelLoadConfig containing model_path, device, engine, and runtime_config.
+        """
+
+        self.model = OVModelForCausalLM.from_pretrained(loader.model_path, 
+            device=loader.device, 
+            export=False,
+            use_cache=False)
+
+        self.tokenizer = AutoTokenizer.from_pretrained(loader.model_path)
+        self.token_false_id = self.tokenizer.convert_tokens_to_ids("no")
+        self.token_true_id = self.tokenizer.convert_tokens_to_ids("yes")
+        logging.info(f"Model loaded successfully: {loader.model_name}")
+
+    async def unload_model(self, registry: ModelRegistry, model_name: str) -> bool:
+        """Unregister model from registry and free memory resources.
+
+        Args:
+            registry: ModelRegistry to unregister from
+            model_name: Name of the model to unregister; passed to registry.register_unload
+
+        Returns:
+            True if the model was found and unregistered, else False.
+        """
+        removed = await registry.register_unload(model_name)
+
+        if self.model is not None:
+            del self.model
+            self.model = None
+        
+        if self.tokenizer is not None:
+            del self.tokenizer
+            self.tokenizer = None
+        
+        gc.collect()
+        logging.info(f"[{self.load_config.model_name}] weights and tokenizer unloaded and memory cleaned up")
+        return removed
\ No newline at end of file
diff --git a/src/server/launch.py b/src/server/launch.py
index 1a42863..2b889d3 100644
--- a/src/server/launch.py
+++ b/src/server/launch.py
@@ -91,6 +91,7 @@ def start_server(host: str = "0.0.0.0", openarc_port: int = 8001, reload: bool =
     logger.info("  - POST   /v1/audio/transcriptions: Whisper only")
     logger.info("  - POST   /v1/audio/speech: Kokoro only")
     logger.info("  - POST   /v1/embeddings")
+    logger.info("  - POST   /v1/rerank")
     
 
     uvicorn.run(
diff --git a/src/server/main.py b/src/server/main.py
index 299fbfb..0783f68 100644
--- a/src/server/main.py
+++ b/src/server/main.py
@@ -21,7 +21,7 @@
 from src.server.worker_registry import WorkerRegistry
 from src.server.models.openvino import OV_KokoroGenConfig
 from src.server.models.ov_genai import OVGenAI_GenConfig, OVGenAI_WhisperGenConfig
-from src.server.models.optimum import PreTrainedTokenizerConfig
+from src.server.models.optimum import PreTrainedTokenizerConfig, RerankerConfig
 
 #===============================================================#
 # Logging
@@ -162,6 +162,16 @@ class EmbeddingsRequest(BaseModel):
     #end of openai api
     config: Optional[PreTrainedTokenizerConfig] = None
 
+# There is no OpenAI API to reference for reranking.
+class RerankRequest(BaseModel):
+    model: str
+    query: str
+    documents: List[str]
+    prefix:Optional[str] = None
+    suffix:Optional[str] = None
+    task:Optional[str] = None
+    config: Optional[PreTrainedTokenizerConfig] = None #not implemented
+
 @app.get("/v1/models", dependencies=[Depends(verify_api_key)])
 async def openai_list_models():
     """OpenAI-compatible endpoint that lists available models."""
@@ -399,4 +409,58 @@ async def embeddings(request: EmbeddingsRequest):
         raise HTTPException(status_code=400, detail=str(exc))
     except Exception as exc:
         traceback.print_exc()
-        raise HTTPException(status_code=500, detail=f"Embedding failed: {str(exc)}")
\ No newline at end of file
+        raise HTTPException(status_code=500, detail=f"Embedding failed: {str(exc)}")
+    
+@app.post("/v1/rerank", dependencies=[Depends(verify_api_key)])
+async def rerank(request: RerankRequest):
+
+    try:
+        if request.config:
+
+            tok_config = PreTrainedTokenizerConfig.model_validate(request.config)
+            base_data = tok_config.model_dump()
+            rr_config = RerankerConfig.model_validate(base_data | {"query":request.query,"documents":request.documents})
+        if request.prefix:
+            rr_config.prefix = request.prefix
+        if request.suffix:
+            rr_config.suffix = request.suffix
+        if request.task:
+            rr_config.task = request.task
+            
+        model_name = request.model
+        created_ts = int(time.time())
+        request_id = f"ov-{uuid.uuid4().hex[:24]}"
+
+        result = await _workers.rerank(model_name, rr_config)
+        data = result.get("data", None)
+        metrics = result.get("metrics", {}) or {}
+
+        prompt_tokens = metrics.get("input_token", 0)
+        total_tokens = metrics.get("total_token", prompt_tokens)
+
+        docs = []
+        for i in range(len(data)):
+            docs.append({
+                "index":i,
+                "object":"ranked_documents",
+                "ranked_documents":data[i]
+            })
+
+        response = {
+            "id": request_id,
+            "object": "list",
+            "created": created_ts,
+            "model": model_name,
+            "data": docs,
+            "usage": {
+                "prompt_tokens": prompt_tokens,
+                "total_tokens": total_tokens,
+            },
+        }
+
+        return response
+    except ValueError as exc:
+        raise HTTPException(status_code=400, detail=str(exc))
+    except Exception as exc:
+        traceback.print_exc()
+        raise HTTPException(status_code=500, detail=f"Reranking failed: {str(exc)}")
\ No newline at end of file
diff --git a/src/server/model_registry.py b/src/server/model_registry.py
index eb9192b..7a8d5d5 100644
--- a/src/server/model_registry.py
+++ b/src/server/model_registry.py
@@ -64,13 +64,15 @@ class ModelType(str, Enum):
     - vlm: Image-to-text VLM models
     - whisper: Whisper ASR models
     - kokoro: Kokoro TTS models
-    - emb: Text-to-vector models"""    
+    - emb: Text-to-vector models    
+    - rerank: Reranker models"""    
     
     LLM = "llm"
     VLM = "vlm"
     WHISPER = "whisper"
     KOKORO = "kokoro"
     EMB = "emb"
+    RERANK = "rerank"
 
 class EngineType(str, Enum):
     """Engine used to load the model.
@@ -291,6 +293,7 @@ async def status(self) -> dict:
     (EngineType.OV_GENAI, ModelType.WHISPER): "src.engine.ov_genai.whisper.OVGenAI_Whisper",
     (EngineType.OPENVINO, ModelType.KOKORO): "src.engine.openvino.kokoro.OV_Kokoro",
     (EngineType.OV_OPTIMUM, ModelType.EMB): "src.engine.optimum.optimum_emb.Optimum_EMB",
+    (EngineType.OV_OPTIMUM, ModelType.RERANK): "src.engine.optimum.optimum_rr.Optimum_RR",
 }
 
 async def create_model_instance(load_config: ModelLoadConfig) -> Any:
diff --git a/src/server/models/optimum.py b/src/server/models/optimum.py
index 215cf52..6d81e42 100644
--- a/src/server/models/optimum.py
+++ b/src/server/models/optimum.py
@@ -7,7 +7,7 @@ class PreTrainedTokenizerConfig(BaseModel):
     Configuration for tokenizer.
     """
 
-    text: Union[str, List[str], List[List[str]]] = Field(
+    text: Union[str, List[str], List[List[str]]] | None = Field(
         default=None,
         description=(
             "The sequence or batch of sequences to be encoded. Each sequence can be a string "
@@ -17,7 +17,7 @@ class PreTrainedTokenizerConfig(BaseModel):
         )
     )
 
-    text_pair: Union[str, List[str], List[List[str]]] = Field(
+    text_pair: Union[str, List[str], List[List[str]]] | None = Field(
         default=None,
         description=(
             "The sequence or batch of sequences to be encoded. Each sequence can be a string "
@@ -27,7 +27,7 @@ class PreTrainedTokenizerConfig(BaseModel):
         )
     )
 
-    text_target: Union[str, List[str], List[List[str]]] = Field(
+    text_target: Union[str, List[str], List[List[str]]] | None = Field(
         default=None,
         description=(
             "The sequence or batch of sequences to be encoded as target texts. Each sequence can be "
@@ -37,7 +37,7 @@ class PreTrainedTokenizerConfig(BaseModel):
         )
     )
 
-    text_pair_target: Union[str, List[str], List[List[str]]] = Field(
+    text_pair_target: Union[str, List[str], List[List[str]]] | None = Field(
         default=None,
         description=(
             "The sequence or batch of sequences to be encoded as target texts. Each sequence can be "
@@ -85,7 +85,7 @@ class PreTrainedTokenizerConfig(BaseModel):
         )
     )
 
-    max_length: int = Field(
+    max_length: int | None = Field(
         default=None,
         description=(
             "Controls the maximum length to use by one of the truncation/padding parameters. If left unset or set to "
@@ -113,7 +113,7 @@ class PreTrainedTokenizerConfig(BaseModel):
         )
     )
 
-    pad_to_multiple_of: int = Field(
+    pad_to_multiple_of: int | None = Field(
         default=None,
         description=(
             "If set will pad the sequence to a multiple of the provided value. Requires padding to be activated. "
@@ -122,7 +122,7 @@ class PreTrainedTokenizerConfig(BaseModel):
         )
     )
 
-    padding_side: str = Field(
+    padding_side: str | None = Field(
         default=None,
         description=(
             "The side on which the model should have padding applied. Should be selected between ['right', 'left']. "
@@ -138,7 +138,7 @@ class PreTrainedTokenizerConfig(BaseModel):
         )
     )
 
-    return_token_type_ids: bool = Field(
+    return_token_type_ids: bool | None  = Field(
         default=None,
         description=(
             "Whether to return token type IDs. If left to the default, will return the token type IDs according to the specific "
@@ -146,7 +146,7 @@ class PreTrainedTokenizerConfig(BaseModel):
         )
     )
 
-    return_attention_mask: bool = Field(
+    return_attention_mask: bool | None  = Field(
         default=None,
         description=(
             "Whether to return the attention mask. If left to the default, will return the attention mask according to the "
@@ -190,3 +190,40 @@ class PreTrainedTokenizerConfig(BaseModel):
             "Whether or not to print more information and warnings."
         )
     )
+
+class RerankerConfig(PreTrainedTokenizerConfig):
+
+    query: str = Field(
+        default=None,
+        description=(
+            "Phrase to compare documents to."
+        )
+    )
+
+    documents:  List[str] = Field(
+        default=None,
+        description=(
+            "Documents to rank."
+        )
+    )
+
+    prefix: str = Field(
+        default='<|im_start|>system\nJudge whether the Document meets the requirements based on the Query and the Instruct provided. Note that the answer can only be "yes" or "no".<|im_end|>\n<|im_start|>user\n',
+        description=(
+            "Text to append to start of query. This is model specific."
+        )
+    )
+
+    suffix: str = Field(
+        default="<|im_end|>\n<|im_start|>assistant\n<think>\n\n</think>\n\n",
+        description=(
+            "Text to append to end of query. This is model specific."
+        )
+    )
+
+    task: str = Field(
+        default="Given a search query, retrieve relevant passages that answer the query",
+        description=(
+            "Prompt command delivered to the model."
+        )
+    )
diff --git a/src/server/worker_registry.py b/src/server/worker_registry.py
index 30269f6..b11228f 100644
--- a/src/server/worker_registry.py
+++ b/src/server/worker_registry.py
@@ -9,10 +9,11 @@
 from src.engine.ov_genai.whisper import OVGenAI_Whisper
 from src.engine.openvino.kokoro import OV_Kokoro
 from src.engine.optimum.optimum_emb import Optimum_EMB
+from src.engine.optimum.optimum_rr import Optimum_RR
 
 from src.server.models.openvino import OV_KokoroGenConfig
 from src.server.models.ov_genai import OVGenAI_GenConfig, OVGenAI_WhisperGenConfig
-from src.server.models.optimum import PreTrainedTokenizerConfig
+from src.server.models.optimum import PreTrainedTokenizerConfig, RerankerConfig
 from src.server.model_registry import ModelRecord, ModelRegistry, ModelType
 
 logger = logging.getLogger(__name__)
@@ -68,7 +69,7 @@ class InferWorker:
     - infer_whisper: Process audio transcription requests
     - infer_kokoro: Process speech generation requests
     - infer_emb: Process embedding requests
-
+    - infer_rerank: Process reranking requests
     """
     
     @staticmethod
@@ -249,6 +250,34 @@ async def infer_emb(packet: WorkerPacket, emb_instance: Optimum_EMB) -> WorkerPa
                 await packet.stream_queue.put(None)
                 
         return packet
+
+    @staticmethod
+    async def infer_rerank(packet: WorkerPacket, rerank_instance: Optimum_RR) -> WorkerPacket:
+        """Generate reranking for a single packet using the optimum pipeline"""
+        metrics = None
+        final_data = None
+
+        try:
+            async for item in rerank_instance.generate_rerankings(packet.gen_config):
+                if isinstance(item, dict):
+                    metrics = item
+                else:
+                    final_data = item
+
+            packet.response = final_data
+            packet.metrics = metrics
+            
+        except Exception as e:
+            # Log the full exception with traceback
+            logger.error("Reranking failed!", exc_info=True)
+            # Store error in packet response
+            packet.response = f"Error: {str(e)}"
+            packet.metrics = None
+            # Signal error to stream if streaming
+            if packet.gen_config.stream and packet.stream_queue is not None:
+                await packet.stream_queue.put(None)
+                
+        return packet
     
 class QueueWorker:
     """
@@ -409,6 +438,28 @@ async def queue_worker_emb(model_name: str, model_queue: asyncio.Queue, emb_mode
                 packet.result_future.set_result(completed_packet)
             model_queue.task_done()
 
+    @staticmethod
+    async def queue_worker_rr(model_name: str, model_queue: asyncio.Queue, rr_model: Optimum_RR, registry: ModelRegistry):
+        """Reranker model inference worker that processes packets from queue"""
+        logger.info(f"[{model_name} Reranker Worker] Started, waiting for packets...")
+        while True:
+            packet = await model_queue.get()
+            if packet is None:
+                logger.info(f"[{model_name} Reranker Worker] Shutdown signal received.")
+                break
+
+            completed_packet = await InferWorker.infer_rerank(packet, rr_model)
+            # Check if inference failed and trigger model unload
+            if not completed_packet.response:
+                logger.error(f"[{model_name} Reranker Worker] Inference failed, triggering model unload...")
+                asyncio.create_task(registry.register_unload(model_name))
+                break
+            if completed_packet.metrics:
+                logger.info(f"[{model_name} Reranker Worker] Metrics: {completed_packet.metrics}")
+            if packet.result_future is not None and not packet.result_future.done():
+                packet.result_future.set_result(completed_packet)
+            model_queue.task_done()
+
 class WorkerRegistry:
     """
     Central orchestrator for managing per-model inference workers and request routing.
@@ -464,6 +515,9 @@ def __init__(self, model_registry: ModelRegistry):
         self._model_queues_emb: Dict[str, asyncio.Queue] = {}
         self._model_tasks_emb: Dict[str, asyncio.Task] = {}
 
+        self._model_queues_rerank: Dict[str, asyncio.Queue] = {}
+        self._model_tasks_rerank: Dict[str, asyncio.Task] = {}
+
         self._lock = asyncio.Lock()
 
         self._model_registry.add_on_loaded(self._on_model_loaded)
@@ -520,6 +574,13 @@ async def _on_model_loaded(self, record: ModelRecord) -> None:
                     self._model_queues_emb[record.model_name] = q
                     task = asyncio.create_task(QueueWorker.queue_worker_emb(record.model_name, q, instance, self._model_registry))
                     self._model_tasks_emb[record.model_name] = task
+            
+            elif mt == ModelType.RERANK and isinstance(instance, Optimum_RR):
+                if record.model_name not in self._model_queues_rerank:
+                    q: asyncio.Queue = asyncio.Queue()
+                    self._model_queues_rerank[record.model_name] = q
+                    task = asyncio.create_task(QueueWorker.queue_worker_rr(record.model_name, q, instance, self._model_registry))
+                    self._model_tasks_rerank[record.model_name] = task
             else:
                 logger.info(f"[WorkerRegistry] Model type/instance mismatch for {record.model_name}: {record.model_type}, {type(instance)}")
 
@@ -564,6 +625,14 @@ async def _on_model_unloaded(self, record: ModelRecord) -> None:
                 await q.put(None)
             if t is not None and not t.done():
                 t.cancel()
+                
+            # Try rerank dicts
+            q = self._model_queues_rerank.pop(record.model_name, None)
+            t = self._model_tasks_rerank.pop(record.model_name, None)
+            if q is not None:
+                await q.put(None)
+            if t is not None and not t.done():
+                t.cancel()
 
     def _get_model_queue(self, model_name: str) -> asyncio.Queue:
         q = self._model_queues_llm.get(model_name)
@@ -591,6 +660,12 @@ def _get_emb_queue(self, model_name: str) -> asyncio.Queue:
         if q is not None:
             return q
         raise ValueError(f"Embedding model '{model_name}' is not loaded or no worker is available")
+
+    def _get_rerank_queue(self, model_name: str) -> asyncio.Queue:
+        q = self._model_queues_rerank.get(model_name)
+        if q is not None:
+            return q
+        raise ValueError(f"Rerank model '{model_name}' is not loaded or no worker is available")
     
     async def generate(self, model_name: str, gen_config: OVGenAI_GenConfig) -> Dict[str, Any]:
         """Generate text without streaming."""
@@ -674,4 +749,19 @@ async def embed(self, model_name: str, tok_config: PreTrainedTokenizerConfig) ->
         q = self._get_emb_queue(model_name)
         await q.put(packet)
         completed = await result_future
+        return {"data": completed.response, "metrics": completed.metrics or {}}
+    
+    async def rerank(self, model_name: str, rr_config: RerankerConfig) -> Dict[str, Any]:
+        """Rerank documents."""
+        request_id = uuid.uuid4().hex
+        result_future: asyncio.Future = asyncio.get_running_loop().create_future()
+        packet = WorkerPacket(
+            request_id=request_id,
+            id_model=model_name,
+            gen_config=rr_config,
+            result_future=result_future,
+        )
+        q = self._get_rerank_queue(model_name)
+        await q.put(packet)
+        completed = await result_future
         return {"data": completed.response, "metrics": completed.metrics or {}}
\ No newline at end of file

From bc50a709f98e546f2402b8cef79abb3813a759f6 Mon Sep 17 00:00:00 2001
From: mwrothbe <michaelrothberg@hotmail.com>
Date: Mon, 13 Oct 2025 17:21:49 -0700
Subject: [PATCH 2/5] Bug fix when no tok config is provided

---
 src/server/main.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/server/main.py b/src/server/main.py
index 0783f68..3795245 100644
--- a/src/server/main.py
+++ b/src/server/main.py
@@ -416,10 +416,12 @@ async def rerank(request: RerankRequest):
 
     try:
         if request.config:
-
             tok_config = PreTrainedTokenizerConfig.model_validate(request.config)
             base_data = tok_config.model_dump()
             rr_config = RerankerConfig.model_validate(base_data | {"query":request.query,"documents":request.documents})
+        else:
+            rr_config = RerankerConfig.model_validate({"query":request.query,"documents":request.documents})
+            
         if request.prefix:
             rr_config.prefix = request.prefix
         if request.suffix:

From 22d4284f54c772728b27ae1a046336c0d30349ec Mon Sep 17 00:00:00 2001
From: mwrothbe <michaelrothberg@hotmail.com>
Date: Wed, 15 Oct 2025 09:28:13 -0700
Subject: [PATCH 3/5] Added load all option to cli

---
 src/cli/openarc_cli.py | 63 +++++++++++++++++++++++++++---------------
 1 file changed, 40 insertions(+), 23 deletions(-)

diff --git a/src/cli/openarc_cli.py b/src/cli/openarc_cli.py
index 84d3fef..170cec2 100644
--- a/src/cli/openarc_cli.py
+++ b/src/cli/openarc_cli.py
@@ -76,6 +76,12 @@ def get_model_config(model_name: str):
     models = config.get("models", {})
     return models.get(model_name)
 
+def get_model_configs():
+    """Get model configuration by name."""
+    config = load_full_config()
+    models = config.get("models", {})
+    return models
+
 def remove_model_config(model_name: str):
     """Remove model configuration by name."""
     config = load_full_config()
@@ -199,42 +205,53 @@ def add(ctx, model_path, model_name, engine, model_type, device, runtime_config)
 
 @cli.command()
 @click.option('--model-name', '--mn',
-    required=True,
+    required=False,
     help='Model name to load from saved configuration.')
+@click.option('--all-models', '--am',
+    required=False,
+    is_flag=True,
+    help='Load all models from saved configuration.')
 @click.pass_context
-def load(ctx, model_name):
+def load(ctx, model_name, all_models):
     """- Load a model from saved configuration."""
     cli_instance = OpenArcCLI()
     
+    configs = {}
     # Get saved configuration
-    saved_config = get_model_config(model_name)
-    
-    if not saved_config:
+    if all_models:
+        configs = get_model_configs()
+    elif model_name:
+        configs[model_name]=get_model_config(model_name)
+    else:
+        console.print(f"❌ [red]A model name or all-models must be selected.[/red] {model_name}")
+        console.print("[yellow]Tip: Use 'openarc list' to see saved configurations, or 'openarc add' to create a new one.[/yellow]")
+        ctx.exit(1)
+        
+    if len(configs)==0 or (model_name and not configs[model_name]):
         console.print(f"❌ [red]Model configuration not found:[/red] {model_name}")
         console.print("[yellow]Tip: Use 'openarc list' to see saved configurations, or 'openarc add' to create a new one.[/yellow]")
         ctx.exit(1)
     
-    load_config = saved_config.copy()
-    
-    # Make API request to load the model
-    url = f"{cli_instance.base_url}/openarc/load"
-    
-    try:
-        console.print("[cyan]working...[/cyan]")
-        response = requests.post(url, json=load_config, headers=cli_instance.get_headers())
+    for key, value in configs.items():
+        # Make API request to load the model
+        url = f"{cli_instance.base_url}/openarc/load"
         
-        if response.status_code == 200:
+        try:
+            console.print("[cyan]working...[/cyan]")
+            response = requests.post(url, json=value, headers=cli_instance.get_headers())
+            
+            if response.status_code == 200:
 
-            console.print("[green]Done![/green]")
-            console.print("[dim]Use 'openarc status' to check the status of loaded models.[/dim]")
-        else:
-            console.print(f"❌ [red]error: {response.status_code}[/red]")
-            console.print(f"[red]Response:[/red] {response.text}")
+                console.print("[green]Done![/green]")
+                console.print("[dim]Use 'openarc status' to check the status of loaded models.[/dim]")
+            else:
+                console.print(f"❌ [red]error: {response.status_code}[/red]")
+                console.print(f"[red]Response:[/red] {response.text}")
+                ctx.exit(1)
+                
+        except requests.exceptions.RequestException as e:
+            console.print(f"❌ [red]Request failed:[/red] {e}")
             ctx.exit(1)
-            
-    except requests.exceptions.RequestException as e:
-        console.print(f"❌ [red]Request failed:[/red] {e}")
-        ctx.exit(1)
 
 @cli.command()
 @click.option('--model-name', '--mn', required=True, help='Model name to unload')

From e8d6e71f5104d727a6c2424283c070fd10920739 Mon Sep 17 00:00:00 2001
From: mwrothbe <michaelrothberg@hotmail.com>
Date: Mon, 20 Oct 2025 17:37:15 -0700
Subject: [PATCH 4/5] Reverting CLI to pre-commit

---
 src/cli/openarc_cli.py | 607 +++++++++++++++++++++++------------------
 1 file changed, 343 insertions(+), 264 deletions(-)

diff --git a/src/cli/openarc_cli.py b/src/cli/openarc_cli.py
index 170cec2..9938cc0 100644
--- a/src/cli/openarc_cli.py
+++ b/src/cli/openarc_cli.py
@@ -2,108 +2,67 @@
 """
 OpenArc CLI Tool - Command-line interface for OpenArc server operations.
 """
-import os
 import json
+import os
+import sys
+import traceback
+import requests
+import yaml
 from pathlib import Path
 
-import requests
 import rich_click as click
+from rich import print as rprint
 from rich.console import Console
 from rich.panel import Panel
 from rich.table import Table
 from rich.text import Text
 
-from src.server.launch import start_server
-from src.cli.device_query import DeviceDataQuery, DeviceDiagnosticQuery
+from api.launcher import start_server
+from cli.device_query import DeviceDataQuery, DeviceDiagnosticQuery
 
 click.rich_click.STYLE_OPTIONS_TABLE_LEADING = 1
 click.rich_click.STYLE_OPTIONS_TABLE_BOX = "SIMPLE"
+
+# click.rich_click.STYLE_OPTIONS_TABLE_ROW_STYLES = ["bold", ""]
 click.rich_click.STYLE_COMMANDS_TABLE_SHOW_LINES = True
+# click.rich_click.STYLE_COMMANDS_TABLE_PAD_EDGE = True
+#click.rich_click.STYLE_COMMANDS_TABLE_BOX = "DOUBLE"
 click.rich_click.STYLE_COMMANDS_TABLE_BORDER_STYLE = "red"
 click.rich_click.STYLE_COMMANDS_TABLE_ROW_STYLES = ["magenta", "yellow", "cyan", "green"]
 
 console = Console()
 
-PROJECT_ROOT = Path(__file__).parent.parent.parent
-CONFIG_FILE = PROJECT_ROOT / "openarc-config.json"
+# Configuration handling - use project root directory
+PROJECT_ROOT = Path(__file__).parent
+CONFIG_FILE = PROJECT_ROOT / "openarc-cli-config.yaml"
 
 def save_cli_config(host: str, port: int):
-    """Save server configuration to JSON config file."""
-    config = load_full_config()  # Load existing config first
-    config.update({
+    """Save server configuration to YAML config file."""
+    config = {
         "server": {
             "host": host,
             "port": port
         },
         "created_by": "openarc-cli",
         "version": "1.0"
-    })
+    }
     
     with open(CONFIG_FILE, "w") as f:
-        json.dump(config, f, indent=2)
+        yaml.dump(config, f, default_flow_style=False, indent=2)
     
     console.print(f"📝 [dim]Configuration saved to: {CONFIG_FILE}[/dim]")
 
-def save_model_config(model_name: str, load_config: dict):
-    """Save model configuration to JSON config file."""
-    config = load_full_config()
-    
-    if "models" not in config:
-        config["models"] = {}
-    
-    config["models"][model_name] = load_config
-    
-    with open(CONFIG_FILE, "w") as f:
-        json.dump(config, f, indent=2)
-    
-    console.print(f"💾 [green]Model configuration saved:[/green] {model_name}")
-
-def load_full_config():
-    """Load full configuration from JSON config file."""
+def load_cli_config():
+    """Load server configuration from YAML config file."""
     if CONFIG_FILE.exists():
         try:
             with open(CONFIG_FILE, "r") as f:
-                config = json.load(f)
-                return config if config else {}
-        except (json.JSONDecodeError, FileNotFoundError):
+                config = yaml.safe_load(f)
+                if config and "server" in config:
+                    return config["server"]
+        except (yaml.YAMLError, FileNotFoundError, KeyError):
             console.print(f"[yellow]Warning: Could not read config file {CONFIG_FILE}[/yellow]")
     
-    return {}
-
-def get_model_config(model_name: str):
-    """Get model configuration by name."""
-    config = load_full_config()
-    models = config.get("models", {})
-    return models.get(model_name)
-
-def get_model_configs():
-    """Get model configuration by name."""
-    config = load_full_config()
-    models = config.get("models", {})
-    return models
-
-def remove_model_config(model_name: str):
-    """Remove model configuration by name."""
-    config = load_full_config()
-    models = config.get("models", {})
-    
-    if model_name not in models:
-        return False
-    
-    del models[model_name]
-    config["models"] = models
-    
-    with open(CONFIG_FILE, "w") as f:
-        json.dump(config, f, indent=2)
-    
-    return True
-
-def load_cli_config():
-    """Load server configuration from YAML config file."""
-    config = load_full_config()
-    if config and "server" in config:
-        return config["server"]
-    
     return {"host": "localhost", "port": 8000}  # defaults
 
 class OpenArcCLI:
@@ -134,7 +93,7 @@ def get_help(self, ctx):
         art.append("      | |                              \n", style="white")
         art.append("      |_|                              \n", style="white")
         art.append(" \n", style="white")
-        art.append(" The CLI application   \n", style="white")
+        art.append("The CLI application   \n", style="white")
         console.print(art)
         return super().get_help(ctx)
 
@@ -144,12 +103,10 @@ def cli():
     Use this application to interface with the OpenArc server.
     
     Features:
-
+    
     • Start the OpenArc server.
     
     • Load models into the OpenArc server.
-    
-    • List models from saved configurations.
 
     • Check the status of loaded models.
 
@@ -164,116 +121,277 @@ def cli():
     """
 
 @cli.command()
-@click.option('--model-name', '--mn',
-    required=True,
-    help='Public facing name of the model.')
-@click.option('--model-path', '--m',
-    required=True, 
-    help='Path to OpenVINO IR converted model.')
-@click.option('--engine', '--en',
-    type=click.Choice(['ovgenai', 'openvino', 'optimum']),
-    required=True,
-    help='Engine used to load the model (ovgenai, openvino, optimum)')
-@click.option('--model-type', '--mt',
-    type=click.Choice(['llm', 'vlm', 'whisper', 'kokoro', 'emb', 'rerank']),
-    required=True,
-    help='Model type (llm, vlm, whisper, kokoro, emb, rerank)')
-@click.option('--device', '--d',
-    required=True,
-    help='Device(s) to load the model on.')
-@click.option("--runtime-config", "--rtc",
-    type=dict,
-    default={},
-    help='OpenVINO runtime configuration (e.g., performance hints). These are checked serverside at runtime.')
+
+@click.option('--model', 
+              required=True, 
+              help="""
+              - Absolute path to model.
+
+              - The dir name which stores the openvino model files is used in the API to identify the model.
+
+              - The dir name is the same as the model name.
+              """)
+
+@click.option('--model-type', 
+              type=click.Choice(['TEXT', 'VISION']),
+              required=True, 
+              default='TEXT',
+              help="""
+
+              - Type of model.
+
+              """)
+
+@click.option('--device', 
+              required=True, 
+              default='CPU', 
+              help="""
+              - Device: CPU, GPU.0, GPU.1, GPU.2, GPU.3, GPU.4, AUTO
+
+              - GPU.0 is the first GPU, GPU.1 is the second GPU, etc.
+
+              - AUTO will automatically select the best device.
+              """)
+
+@click.option('--use-cache/--no-use-cache',
+              required=True, 
+              default=True,
+              help="""
+              - Use cache for stateful models.
+
+              - Edge cases may require disabling cache, probably based on model architecture.
+
+              """)
+
+@click.option('--dynamic-shapes/--no-dynamic-shapes',
+              required=True, 
+              default=True,
+              help="""
+              - Use dynamic shapes.
+               
+              - If false, the model will be loaded with static shapes.
+
+              - OpenVINO IR usually use dynamic shapes but for NPU it must be disabled.
+
+              """)
+
+@click.option('--pad-token-id', 
+              required=False, 
+              type=int, 
+              help="""
+              - (pad)pad token ID
+
+              - AutoTokenizers usually infers this from config.json but it's useful to configure explicitly.
+
+              """
+              )
+
+@click.option('--eos-token-id', 
+              required=False, 
+              type=int, 
+              help="""
+                - (eos)end of sequence token id
+
+                - AutoTokenizers usually infers this from config.json but it's useful to configure explicitly.  
+
+                - When the eos token is set to the *incorrect* token the model will continue to generate tokens.
+                
+                - Pairing this with a target max_length is a good way to test performance.
+                """
+              
+              )
+
+@click.option('--bos-token-id', 
+              required=False, 
+              type=int, 
+              help='beginning of sequence token ID')
+
+@click.option('--num-streams', 
+              required=False, 
+              type=int, 
+              default=None,
+              show_default=True,
+              help='Number of inference streams')
+
+@click.option('--performance-hint', 
+              required=False, 
+              type=click.Choice(['LATENCY', 'THROUGHPUT', 'CUMULATIVE_THROUGHPUT']),
+              default=None,
+              show_default=True,
+              help="""
+              ---
+
+              - High level performance hint.
+
+              - Usually I use 'LATENCY' which locks to one CPU or one CPU socket.
+
+              - It's best to use the documentation for this.
+
+              https://docs.openvino.ai/2025/openvino-workflow/running-inference/optimize-inference/high-level-performance-hints.html
+
+              ---
+
+              """
+              )
+
+@click.option('--inference-precision-hint', 
+              required=False, 
+              type=click.Choice(['fp32', 'f16', 'bf16', 'dynamic']),
+              default=None,
+              show_default=True,
+              help="""
+              ---
+
+              - Controls precision during inference, at inference time.
+
+                - Works on CPU and GPU.
+
+              - Target device specific features.
+
+              - Ex:'bf16' is probably best on CPUs which support AMX.
+              
+              """
+              )
+
+@click.option('--enable-hyper-threading', 
+              required=False, 
+              type=bool, 
+              default=None,
+              help="""
+              ---
+
+              - CPU ONLY --> Cannot be used with GPU.
+
+              - Enable hyper-threading 
+
+              - This is only relevant for Intel CPUs with hyperthreading i.e, two virtual cores per physical core.
+
+              """
+              )
+
+@click.option('--inference-num-threads', 
+              required=False, 
+              type=int, 
+              default=None,
+              show_default=True,
+              help="""
+              ---
+
+              - CPU ONLY --> Cannot be used with GPU.
+
+              - Number of inference threads
+
+              - More threads usually means faster inference. 
+
+              - Therefore this can be used to constrain the number of threads used for inference.
+              """
+              )
+
+@click.option('--scheduling-core-type', 
+              required=False, 
+              type=click.Choice(['ANY_CORE', 'PCORE_ONLY', 'ECORE_ONLY']),
+              default=None,
+              show_default=True,
+              help="""
+              ---
+
+              - Advanced option to target p-cores or e-cores on CPUs which support it.
+
+              - CPU ONLY --> Cannot be used with GPU.
+
+              - [ANY_CORE]: Any core, so default for 'older' Intel CPUs. Default for most chips but no need to set.
+
+              - [PCORE_ONLY]: Only run inference on threads which are performance cores.
+
+              - [ECORE_ONLY]: Only run inference on threads which are efficency cores.
+              ---
+                """
+              )
+
 @click.pass_context
-def add(ctx, model_path, model_name, engine, model_type, device, runtime_config):
-    """- Add a model configuration to the config file."""
+def load(ctx, model, model_type, device, use_cache, dynamic_shapes,
+         pad_token_id, eos_token_id, bos_token_id, num_streams, performance_hint,
+         inference_precision_hint, enable_hyper_threading, inference_num_threads,
+         scheduling_core_type):
+    """- Load a model."""
+    cli_instance = OpenArcCLI()
     
-    # Build and save configuration
+    # Build load_config from arguments
     load_config = {
-        "model_name": model_name,
-        "model_path": model_path,  
-        "model_type": model_type,  
-        "engine": engine,    
+        "id_model": model,
+        "architecture_type": model_type,
+        "use_cache": use_cache,
         "device": device,
-        "runtime_config": runtime_config if runtime_config else {}
+        "dynamic_shapes": dynamic_shapes,
     }
     
-    save_model_config(model_name, load_config)
-    console.print(f"✅ [green]Saved configuration for:[/green] {model_name}")
-    console.print(f"[dim]Use 'openarc load --mn {model_name}' to load this model.[/dim]")
-
-@cli.command()
-@click.option('--model-name', '--mn',
-    required=False,
-    help='Model name to load from saved configuration.')
-@click.option('--all-models', '--am',
-    required=False,
-    is_flag=True,
-    help='Load all models from saved configuration.')
-@click.pass_context
-def load(ctx, model_name, all_models):
-    """- Load a model from saved configuration."""
-    cli_instance = OpenArcCLI()
+    # Add optional token IDs if provided
+    if pad_token_id is not None:
+        load_config["pad_token_id"] = pad_token_id
+    if eos_token_id is not None:
+        load_config["eos_token_id"] = eos_token_id
+    if bos_token_id is not None:
+        load_config["bos_token_id"] = bos_token_id
     
-    configs = {}
-    # Get saved configuration
-    if all_models:
-        configs = get_model_configs()
-    elif model_name:
-        configs[model_name]=get_model_config(model_name)
-    else:
-        console.print(f"❌ [red]A model name or all-models must be selected.[/red] {model_name}")
-        console.print("[yellow]Tip: Use 'openarc list' to see saved configurations, or 'openarc add' to create a new one.[/yellow]")
-        ctx.exit(1)
-        
-    if len(configs)==0 or (model_name and not configs[model_name]):
-        console.print(f"❌ [red]Model configuration not found:[/red] {model_name}")
-        console.print("[yellow]Tip: Use 'openarc list' to see saved configurations, or 'openarc add' to create a new one.[/yellow]")
-        ctx.exit(1)
+    # Build ov_config from arguments
+    ov_config = {}
+    if performance_hint is not None:
+        ov_config["PERFORMANCE_HINT"] = performance_hint
+    if inference_precision_hint is not None:
+        ov_config["INFERENCE_PRECISION_HINT"] = inference_precision_hint
+    if enable_hyper_threading is not None:
+        ov_config["ENABLE_HYPER_THREADING"] = enable_hyper_threading
+    if inference_num_threads not in (None, False):
+        ov_config["INFERENCE_NUM_THREADS"] = inference_num_threads
+    if scheduling_core_type is not None:
+        ov_config["SCHEDULING_CORE_TYPE"] = scheduling_core_type
+    if num_streams is not None:
+        ov_config["NUM_STREAMS"] = num_streams
     
-    for key, value in configs.items():
-        # Make API request to load the model
-        url = f"{cli_instance.base_url}/openarc/load"
+    # Prepare payload
+    payload = {
+        "load_config": load_config,
+        "ov_config": ov_config if ov_config else {}
+    }
+    
+    # Make API request
+    url = f"{cli_instance.base_url}/optimum/model/load"
+    
+    try:
+        console.print(f"🚀 [blue]Loading model:[/blue] {model}")
+        response = requests.post(url, json=payload, headers=cli_instance.get_headers())
         
-        try:
-            console.print("[cyan]working...[/cyan]")
-            response = requests.post(url, json=value, headers=cli_instance.get_headers())
-            
-            if response.status_code == 200:
-
-                console.print("[green]Done![/green]")
-                console.print("[dim]Use 'openarc status' to check the status of loaded models.[/dim]")
-            else:
-                console.print(f"❌ [red]error: {response.status_code}[/red]")
-                console.print(f"[red]Response:[/red] {response.text}")
-                ctx.exit(1)
-                
-        except requests.exceptions.RequestException as e:
-            console.print(f"❌ [red]Request failed:[/red] {e}")
+        if response.status_code == 200:
+            console.print("✅ [green]Model loaded successfully![/green]")
+        else:
+            console.print(f"❌ [red]Error loading model: {response.status_code}[/red]")
+            console.print(f"[red]Response:[/red] {response.text}")
             ctx.exit(1)
+            
+    except requests.exceptions.RequestException as e:
+        console.print(f"❌ [red]Request failed:[/red] {e}")
+        ctx.exit(1)
 
 @cli.command()
-@click.option('--model-name', '--mn', required=True, help='Model name to unload')
+@click.option('--model-id', required=True, help='Model ID to unload')
 @click.pass_context
-def unload(ctx, model_name):
+def unload(ctx, model_id):
     """
-    - POST Delete a model from registry and unload from memory.
+    - DELETE a model from memory. 
     """
     cli_instance = OpenArcCLI()
 
-    url = f"{cli_instance.base_url}/openarc/unload"
-    payload = {"model_name": model_name}
+    # Make API request
+    url = f"{cli_instance.base_url}/optimum/model/unload"
+    params = {"model_id": model_id}
     
     try:
-        console.print(f"🗑️  [blue]Unloading model:[/blue] {model_name}")
-        response = requests.post(url, json=payload, headers=cli_instance.get_headers())
+        console.print(f"🗑️  [blue]Unloading model:[/blue] {model_id}")
+        response = requests.delete(url, params=params, headers=cli_instance.get_headers())
         
         if response.status_code == 200:
             result = response.json()
-            # Handle different possible response formats
-            message = result.get('message', f"Model '{model_name}' unloaded successfully")
-            console.print(f"✅ [green]{message}[/green]")
+            console.print(f"✅ [green]{result['message']}[/green]")
         else:
             console.print(f"❌ [red]Error unloading model: {response.status_code}[/red]")
             console.print(f"[red]Response:[/red] {response.text}")
@@ -283,79 +401,13 @@ def unload(ctx, model_name):
         console.print(f"❌ [red]Request failed:[/red] {e}")
         ctx.exit(1)
 
-@cli.command()
-@click.option('--model-name','--mn', help='Model name to remove (used with --rm).')
-@click.option('--rm', is_flag=True, help='Remove a model configuration.')
-@click.pass_context
-def list(ctx, rm, model_name):
-    """- List saved model configurations.
-       
-       - Remove a model configuration."""
-    if rm:
-        if not model_name:
-            console.print("❌ [red]Error:[/red] --model-name is required when using --rm")
-
-            ctx.exit(1)
-        
-        # Check if model exists before trying to remove
-        existing_config = get_model_config(model_name)
-        if not existing_config:
-            console.print(f"❌ {model_name}[red] not found:[/red]")
-            console.print("[yellow]Use 'openarc list' to see available configurations.[/yellow]")
-            ctx.exit(1)
-        
-        # Remove the configuration
-        if remove_model_config(model_name):
-            console.print(f"🗑️  [green]Model configuration removed:[/green] {model_name}")
-        else:
-            console.print(f"❌ [red]Failed to remove model configuration:[/red] {model_name}")
-            ctx.exit(1)
-        return
-    
-    config = load_full_config()
-    models = config.get("models", {})
-    
-    if not models:
-        console.print("[yellow]No model configurations found.[/yellow]")
-        console.print("[dim]Use 'openarc add --help' to see how to save configurations.[/dim]")
-        return
-    
-    console.print(f"📋 [blue]Saved Model Configurations ({len(models)}):[/blue]\n")
-    
-    for model_name, model_config in models.items():
-        # Create a table for each model configuration
-        config_table = Table(show_header=False, box=None, pad_edge=False)
-        
-
-        config_table.add_row("model_name", f"[cyan]{model_name}[/cyan]")
-        config_table.add_row("device", f"[blue]{model_config.get('device')}[/blue]")
-        config_table.add_row("engine", f"[green]{model_config.get('engine')}[/green]")
-        config_table.add_row("model_type", f"[magenta]{model_config.get('model_type')}[/magenta]")
-        
-        
-        rtc = model_config.get('runtime_config', {})
-        if rtc:
-            config_table.add_row("", "")
-            config_table.add_row(Text("runtime_config", style="bold underline yellow"), "")
-            for key, value in rtc.items():
-                config_table.add_row(f"  {key}", f"[dim]{value}[/dim]")
-        
-        panel = Panel(
-            config_table,
-            border_style="green"
-        )
-        console.print(panel)
-    
-    console.print("\n[dim]To load a saved configuration: openarc load --model-name <model_name>[/dim]")
-    console.print("[dim]To remove a configuration: openarc list --remove --model-name <model_name>[/dim]")
-
 @cli.command()
 @click.pass_context
 def status(ctx):
     """- GET Status of loaded models."""
     cli_instance = OpenArcCLI()
     
-    url = f"{cli_instance.base_url}/openarc/status"
+    url = f"{cli_instance.base_url}/optimum/status"
     
     try:
         console.print("📊 [blue]Getting model status...[/blue]")
@@ -363,39 +415,66 @@ def status(ctx):
         
         if response.status_code == 200:
             result = response.json()
-            models = result.get("models", [])
-            total_models = result.get("total_loaded_models", 0)
+            loaded_models = result.get("loaded_models", {})
+            total_models = result.get("total_models_loaded", 0)
             
-            if not models:
+            if not loaded_models:
                 console.print("[yellow]No models currently loaded.[/yellow]")
             else:
-                # Create a table for all models
-                status_table = Table(title=f"📊 Loaded Models ({total_models})")
-                status_table.add_column("model_name", style="cyan", width=20)
-                status_table.add_column("device", style="blue", width=10)
-                status_table.add_column("model_type", style="magenta", width=15)
-                status_table.add_column("engine", style="green", width=10)
-                status_table.add_column("status", style="yellow", width=10)
-                status_table.add_column("time_loaded", style="dim", width=20)
-                
-                for model in models:
-                    model_name = model.get("model_name")
-                    device = model.get("device")
-                    model_type = model.get("model_type")
-                    engine = model.get("engine")
-                    status = model.get("status")
-                    time_loaded = model.get("time_loaded")
-                    
-                    status_table.add_row(
-                        model_name,
-                        device,
-                        model_type,
-                        engine,
-                        status,
-                        time_loaded
+                for model_id, model_info in loaded_models.items():
+                    device = model_info.get("device", "unknown")
+                    status_val = model_info.get("status", "unknown")
+                    metadata = model_info.get("model_metadata", {})
+                    model_type = metadata.get("architecture_type", "unknown")
+                    use_cache = str(metadata.get("use_cache", "-"))
+                    dynamic_shapes = str(metadata.get("dynamic_shapes", "-"))
+                    pad_token_id = str(metadata.get("pad_token_id", "-"))
+                    eos_token_id = str(metadata.get("eos_token_id", "-"))
+                    bos_token_id = str(metadata.get("bos_token_id", "-"))
+                    num_streams = str(metadata.get("NUM_STREAMS", "-"))
+                    precision = str(metadata.get("INFERENCE_PRECISION_HINT", "-"))
+                    perf_hint = str(metadata.get("PERFORMANCE_HINT", "-"))
+                    inf_num_threads = str(metadata.get("INFERENCE_NUM_THREADS", "-"))
+                    enable_ht = str(metadata.get("ENABLE_HYPER_THREADING", "-"))
+                    sched_core_type = str(metadata.get("SCHEDULING_CORE_TYPE", "-"))
+
+                    model_table = Table(show_header=False, box=None, pad_edge=False)
+
+                    model_table.add_row(Text("Model Info", style="bold underline cyan"), "")
+                    model_table.add_row("Model ID", f"[cyan]{model_id}[/cyan]")
+                    model_table.add_row("Type", f"[yellow]{model_type}[/yellow]")
+                    model_table.add_row("Status", f"[green]{status_val}[/green]" if status_val == "loaded" else f"[red]{status_val}[/red]")
+                    model_table.add_row("", "")
+
+                    # Device Info Section
+                    model_table.add_row(Text("Device Info", style="bold underline magenta"), "")
+                    model_table.add_row("Device", f"[magenta]{device}[/magenta]")
+                    model_table.add_row("Use Cache", use_cache)
+                    model_table.add_row("Dynamic Shapes", dynamic_shapes)
+                    model_table.add_row("", "")
+
+                    # Token IDs Section
+                    model_table.add_row(Text("Token IDs", style="bold underline yellow"), "")
+                    model_table.add_row("Pad Token ID", pad_token_id)
+                    model_table.add_row("EOS Token ID", eos_token_id)
+                    model_table.add_row("BOS Token ID", bos_token_id)
+                    model_table.add_row("", "")
+
+                    # Performance Settings Section
+                    model_table.add_row(Text("Performance Settings", style="bold underline green"), "")
+                    model_table.add_row("NUM_STREAMS", num_streams)
+                    model_table.add_row("INFERENCE_PRECISION_HINT", precision)
+                    model_table.add_row("PERFORMANCE_HINT", perf_hint)
+                    model_table.add_row("INFERENCE_NUM_THREADS", inf_num_threads)
+                    model_table.add_row("ENABLE_HYPER_THREADING", enable_ht)
+                    model_table.add_row("SCHEDULING_CORE_TYPE", sched_core_type)
+
+                    panel = Panel(
+                        model_table,
+                        title=f"🧩 Model: [bold]{model_id}[/bold]",
+                        border_style="blue" if status_val == "loaded" else "red"
                     )
-                
-                console.print(status_table)
+                    console.print(panel)
                 console.print(f"\n[green]Total models loaded: {total_models}[/green]")
             
         else:
@@ -413,11 +492,11 @@ def tool(ctx):
     """- Utility scripts."""
     pass
 
-@tool.command('device-props')
+@tool.command('device-properties')
 @click.pass_context
 def device_properties(ctx):
     """
-    - Query OpenVINO device properties for all available devices.
+    - Query device properties for all devices.
     """
     
     try:
@@ -462,22 +541,22 @@ def device_detect(ctx):
         diagnostic = DeviceDiagnosticQuery()
         available_devices = diagnostic.get_available_devices()
         
-        table = Table()
-        table.add_column("Index", style="cyan", width=2)
+        table = Table(title="📋 Available Devices")
+        table.add_column("#", style="cyan", width=4)
         table.add_column("Device", style="green")
         
         if not available_devices:
-            console.print("❌ [red] Sanity test failed: No OpenVINO devices found! Maybe check your drivers?[/red]")
+            console.print("❌ [red]No OpenVINO devices found![/red]")
             ctx.exit(1)
         
         for i, device in enumerate(available_devices, 1):
             table.add_row(str(i), device)
         
         console.print(table)
-        console.print(f"\n✅ [green] Sanity test passed: found {len(available_devices)} device(s)[/green]")
+        console.print(f"\n✅ [green]OpenVINO runtime found {len(available_devices)} device(s)[/green]")
             
     except Exception as e:
-        console.print(f"❌ [red]Sanity test failed: No OpenVINO devices found! Maybe check your drivers?[/red] {e}")
+        console.print(f"❌ [red]Error during device diagnosis:[/red] {e}")
         ctx.exit(1)
 
 @cli.group()
@@ -488,10 +567,12 @@ def serve():
     pass
 
 @serve.command("start")
+
 @click.option("--host", type=str, default="0.0.0.0", show_default=True,
               help="""
               - Host to bind the server to
               """)
+
 @click.option("--openarc-port", 
               type=int, 
               default=8000, 
@@ -502,7 +583,7 @@ def serve():
 
 def start(host, openarc_port):
     """
-    - 'start' reads --host and --openarc-port from config or defaults to 0.0.0.0:8000
+    - 'start' reads --host and --openarc-port and saves them to the config file. Then it starts the server and will read
     """
     # Save server configuration for other CLI commands to use
     save_cli_config(host, openarc_port)
@@ -514,5 +595,3 @@ def start(host, openarc_port):
 if __name__ == "__main__":
     cli()
 
-
-

From b547cac541356257e77c85566c03f6b10eb58423 Mon Sep 17 00:00:00 2001
From: Emerson Tatelbaum <164939384+SearchSavior@users.noreply.github.com>
Date: Mon, 20 Oct 2025 21:20:20 -0400
Subject: [PATCH 5/5] Refactor OpenArc CLI for JSON config and model handling

Copied and pasted the CLI from main 1.0.6 into the fork; essentially, I've used a screwdriver as a chisel.
---
 src/cli/openarc_cli.py | 665 ++++++++++++++++++++---------------------
 1 file changed, 321 insertions(+), 344 deletions(-)

diff --git a/src/cli/openarc_cli.py b/src/cli/openarc_cli.py
index 9938cc0..222abd6 100644
--- a/src/cli/openarc_cli.py
+++ b/src/cli/openarc_cli.py
@@ -2,67 +2,102 @@
 """
 OpenArc CLI Tool - Command-line interface for OpenArc server operations.
 """
-import json
 import os
-import sys
-import traceback
-import requests
-import yaml
+import json
 from pathlib import Path
 
+import requests
 import rich_click as click
-from rich import print as rprint
 from rich.console import Console
 from rich.panel import Panel
 from rich.table import Table
 from rich.text import Text
 
-from api.launcher import start_server
-from cli.device_query import DeviceDataQuery, DeviceDiagnosticQuery
+from src.server.launch import start_server
+from src.cli.device_query import DeviceDataQuery, DeviceDiagnosticQuery
 
 click.rich_click.STYLE_OPTIONS_TABLE_LEADING = 1
 click.rich_click.STYLE_OPTIONS_TABLE_BOX = "SIMPLE"
-
-# click.rich_click.STYLE_OPTIONS_TABLE_ROW_STYLES = ["bold", ""]
 click.rich_click.STYLE_COMMANDS_TABLE_SHOW_LINES = True
-# click.rich_click.STYLE_COMMANDS_TABLE_PAD_EDGE = True
-#click.rich_click.STYLE_COMMANDS_TABLE_BOX = "DOUBLE"
 click.rich_click.STYLE_COMMANDS_TABLE_BORDER_STYLE = "red"
 click.rich_click.STYLE_COMMANDS_TABLE_ROW_STYLES = ["magenta", "yellow", "cyan", "green"]
 
 console = Console()
 
-# Configuration handling - use project root directory
-PROJECT_ROOT = Path(__file__).parent
-CONFIG_FILE = PROJECT_ROOT / "openarc-cli-config.yaml"
+PROJECT_ROOT = Path(__file__).parent.parent.parent
+CONFIG_FILE = PROJECT_ROOT / "openarc-config.json"
 
 def save_cli_config(host: str, port: int):
-    """Save server configuration to YAML config file."""
-    config = {
+    """Save server configuration to JSON config file."""
+    config = load_full_config()  # Load existing config first
+    config.update({
         "server": {
             "host": host,
             "port": port
         },
         "created_by": "openarc-cli",
         "version": "1.0"
-    }
+    })
     
     with open(CONFIG_FILE, "w") as f:
-        yaml.dump(config, f, default_flow_style=False, indent=2)
+        json.dump(config, f, indent=2)
     
     console.print(f"📝 [dim]Configuration saved to: {CONFIG_FILE}[/dim]")
 
-def load_cli_config():
-    """Load server configuration from YAML config file."""
+def save_model_config(model_name: str, load_config: dict):
+    """Save model configuration to JSON config file."""
+    config = load_full_config()
+    
+    if "models" not in config:
+        config["models"] = {}
+    
+    config["models"][model_name] = load_config
+    
+    with open(CONFIG_FILE, "w") as f:
+        json.dump(config, f, indent=2)
+    
+    console.print(f"💾 [green]Model configuration saved:[/green] {model_name}")
+
+def load_full_config():
+    """Load full configuration from JSON config file."""
     if CONFIG_FILE.exists():
         try:
             with open(CONFIG_FILE, "r") as f:
-                config = yaml.safe_load(f)
-                if config and "server" in config:
-                    return config["server"]
-        except (yaml.YAMLError, FileNotFoundError, KeyError):
+                config = json.load(f)
+                return config if config else {}
+        except (json.JSONDecodeError, FileNotFoundError):
             console.print(f"[yellow]Warning: Could not read config file {CONFIG_FILE}[/yellow]")
     
+    return {}
+
+def get_model_config(model_name: str):
+    """Get model configuration by name."""
+    config = load_full_config()
+    models = config.get("models", {})
+    return models.get(model_name)
+
+def remove_model_config(model_name: str):
+    """Remove model configuration by name."""
+    config = load_full_config()
+    models = config.get("models", {})
+    
+    if model_name not in models:
+        return False
+    
+    del models[model_name]
+    config["models"] = models
+    
+    with open(CONFIG_FILE, "w") as f:
+        json.dump(config, f, indent=2)
+    
+    return True
+
+def load_cli_config():
+    """Load server configuration from JSON config file."""
+    config = load_full_config()
+    if config and "server" in config:
+        return config["server"]
+    
     return {"host": "localhost", "port": 8000}  # defaults
 
 class OpenArcCLI:
@@ -93,7 +128,7 @@ def get_help(self, ctx):
         art.append("      | |                              \n", style="white")
         art.append("      |_|                              \n", style="white")
         art.append(" \n", style="white")
-        art.append("The CLI application   \n", style="white")
+        art.append(" The CLI application   \n", style="white")
         console.print(art)
         return super().get_help(ctx)
 
@@ -103,10 +138,12 @@ def cli():
     Use this application to interface with the OpenArc server.
     
     Features:
-    
+
     • Start the OpenArc server.
     
     • Load models into the OpenArc server.
+    
+    • List models from saved configurations.
 
     • Check the status of loaded models.
 
@@ -121,277 +158,149 @@ def cli():
     """
 
 @cli.command()
-
-@click.option('--model', 
-              required=True, 
-              help="""
-              - Absolute path to model.
-
-              - The dir name which stores the openvino model files is used in the API to identify the model.
-
-              - The dir name is the same as the model name.
-              """)
-
-@click.option('--model-type', 
-              type=click.Choice(['TEXT', 'VISION']),
-              required=True, 
-              default='TEXT',
-              help="""
-
-              - Type of model.
-
-              """)
-
-@click.option('--device', 
-              required=True, 
-              default='CPU', 
-              help="""
-              - Device: CPU, GPU.0, GPU.1, GPU.2, GPU.3, GPU.4, AUTO
-
-              - GPU.0 is the first GPU, GPU.1 is the second GPU, etc.
-
-              - AUTO will automatically select the best device.
-              """)
-
-@click.option('--use-cache/--no-use-cache',
-              required=True, 
-              default=True,
-              help="""
-              - Use cache for stateful models.
-
-              - Edge cases may require disabling cache, probably based on model architecture.
-
-              """)
-
-@click.option('--dynamic-shapes/--no-dynamic-shapes',
-              required=True, 
-              default=True,
-              help="""
-              - Use dynamic shapes.
-               
-              - If false, the model will be loaded with static shapes.
-
-              - OpenVINO IR usually use dynamic shapes but for NPU it must be disabled.
-
-              """)
-
-@click.option('--pad-token-id', 
-              required=False, 
-              type=int, 
-              help="""
-              - (pad)pad token ID
-
-              - AutoTokenizers usually infers this from config.json but it's useful to configure explicitly.
-
-              """
-              )
-
-@click.option('--eos-token-id', 
-              required=False, 
-              type=int, 
-              help="""
-                - (eos)end of sequence token id
-
-                - AutoTokenizers usually infers this from config.json but it's useful to configure explicitly.  
-
-                - When the eos token is set to the *incorrect* token the model will continue to generate tokens.
-                
-                - Pairing this with a target max_length is a good way to test performance.
-                """
-              
-              )
-
-@click.option('--bos-token-id', 
-              required=False, 
-              type=int, 
-              help='beginning of sequence token ID')
-
-@click.option('--num-streams', 
-              required=False, 
-              type=int, 
-              default=None,
-              show_default=True,
-              help='Number of inference streams')
-
-@click.option('--performance-hint', 
-              required=False, 
-              type=click.Choice(['LATENCY', 'THROUGHPUT', 'CUMULATIVE_THROUGHPUT']),
-              default=None,
-              show_default=True,
-              help="""
-              ---
-
-              - High level performance hint.
-
-              - Usually I use 'LATENCY' which locks to one CPU or one CPU socket.
-
-              - It's best to use the documentation for this.
-
-              https://docs.openvino.ai/2025/openvino-workflow/running-inference/optimize-inference/high-level-performance-hints.html
-
-              ---
-
-              """
-              )
-
-@click.option('--inference-precision-hint', 
-              required=False, 
-              type=click.Choice(['fp32', 'f16', 'bf16', 'dynamic']),
-              default=None,
-              show_default=True,
-              help="""
-              ---
-
-              - Controls precision during inference, at inference time.
-
-                - Works on CPU and GPU.
-
-              - Target device specific features.
-
-              - Ex:'bf16' is probably best on CPUs which support AMX.
-              
-              """
-              )
-
-@click.option('--enable-hyper-threading', 
-              required=False, 
-              type=bool, 
-              default=None,
-              help="""
-              ---
-
-              - CPU ONLY --> Cannot be used with GPU.
-
-              - Enable hyper-threading 
-
-              - This is only relevant for Intel CPUs with hyperthreading i.e, two virtual cores per physical core.
-
-              """
-              )
-
-@click.option('--inference-num-threads', 
-              required=False, 
-              type=int, 
-              default=None,
-              show_default=True,
-              help="""
-              ---
-
-              - CPU ONLY --> Cannot be used with GPU.
-
-              - Number of inference threads
-
-              - More threads usually means faster inference. 
-
-              - Therefore this can be used to constrain the number of threads used for inference.
-              """
-              )
-
-@click.option('--scheduling-core-type', 
-              required=False, 
-              type=click.Choice(['ANY_CORE', 'PCORE_ONLY', 'ECORE_ONLY']),
-              default=None,
-              show_default=True,
-              help="""
-              ---
-
-              - Advanced option to target p-cores or e-cores on CPUs which support it.
-
-              - CPU ONLY --> Cannot be used with GPU.
-
-              - [ANY_CORE]: Any core, so default for 'older' Intel CPUs. Default for most chips but no need to set.
-
-              - [PCORE_ONLY]: Only run inference on threads which are performance cores.
-
-              - [ECORE_ONLY]: Only run inference on threads which are efficency cores.
-              ---
-                """
-              )
-
+@click.option('--model-name', '--mn',
+    required=True,
+    help='Public facing name of the model.')
+@click.option('--model-path', '--m',
+    required=True, 
+    help='Path to OpenVINO IR converted model.')
+@click.option('--engine', '--en',
+    type=click.Choice(['ovgenai', 'openvino', 'optimum']),
+    required=True,
+    help='Engine used to load the model (ovgenai, openvino, optimum)')
+@click.option('--model-type', '--mt',
+    type=click.Choice(['llm', 'vlm', 'whisper', 'kokoro', 'emb']),
+    required=True,
+    help='Model type (llm, vlm, whisper, kokoro, emb)')
+@click.option('--device', '--d',
+    required=True,
+    help='Device(s) to load the model on.')
+@click.option("--runtime-config", "--rtc",
+    default=None,
+    help='OpenVINO runtime configuration (e.g., performance hints). These are checked serverside at runtime.')
+@click.option('--vlm-type', '--vt',
+    type=click.Choice(['internvl2', 'llava15', 'llavanext', 'minicpmv26', 'phi3vision', 'phi4mm', 'qwen2vl', 'qwen25vl', 'gemma3']),
+    required=False,
+    default=None,
+    help='Vision model type. Used to map correct vision tokens.')
 @click.pass_context
-def load(ctx, model, model_type, device, use_cache, dynamic_shapes,
-         pad_token_id, eos_token_id, bos_token_id, num_streams, performance_hint,
-         inference_precision_hint, enable_hyper_threading, inference_num_threads,
-         scheduling_core_type):
-    """- Load a model."""
-    cli_instance = OpenArcCLI()
+def add(ctx, model_path, model_name, engine, model_type, device, runtime_config, vlm_type):
+    """- Add a model configuration to the config file."""
     
-    # Build load_config from arguments
+    # Build and save configuration
     load_config = {
-        "id_model": model,
-        "architecture_type": model_type,
-        "use_cache": use_cache,
+        "model_name": model_name,
+        "model_path": model_path,  
+        "model_type": model_type,  
+        "engine": engine,    
         "device": device,
-        "dynamic_shapes": dynamic_shapes,
+        "runtime_config": runtime_config if runtime_config else {},
+        "vlm_type": vlm_type if vlm_type else None
     }
     
-    # Add optional token IDs if provided
-    if pad_token_id is not None:
-        load_config["pad_token_id"] = pad_token_id
-    if eos_token_id is not None:
-        load_config["eos_token_id"] = eos_token_id
-    if bos_token_id is not None:
-        load_config["bos_token_id"] = bos_token_id
-    
-    # Build ov_config from arguments
-    ov_config = {}
-    if performance_hint is not None:
-        ov_config["PERFORMANCE_HINT"] = performance_hint
-    if inference_precision_hint is not None:
-        ov_config["INFERENCE_PRECISION_HINT"] = inference_precision_hint
-    if enable_hyper_threading is not None:
-        ov_config["ENABLE_HYPER_THREADING"] = enable_hyper_threading
-    if inference_num_threads not in (None, False):
-        ov_config["INFERENCE_NUM_THREADS"] = inference_num_threads
-    if scheduling_core_type is not None:
-        ov_config["SCHEDULING_CORE_TYPE"] = scheduling_core_type
-    if num_streams is not None:
-        ov_config["NUM_STREAMS"] = num_streams
-    
-    # Prepare payload
-    payload = {
-        "load_config": load_config,
-        "ov_config": ov_config if ov_config else {}
-    }
+    save_model_config(model_name, load_config)
+    console.print(f"✅ [green]Saved configuration for:[/green] {model_name}")
+    console.print(f"[dim]Use 'openarc load {model_name}' to load this model.[/dim]")
+
+@cli.command()
+@click.argument('model_names', nargs=-1, required=True)
+@click.pass_context
+def load(ctx, model_names):
+    """- Load one or more models from saved configuration.
     
-    # Make API request
-    url = f"{cli_instance.base_url}/optimum/model/load"
+    Examples:
+        openarc load model1
+        openarc load Dolphin-X1 kokoro whisper
+    """
+    cli_instance = OpenArcCLI()
     
-    try:
-        console.print(f"🚀 [blue]Loading model:[/blue] {model}")
-        response = requests.post(url, json=payload, headers=cli_instance.get_headers())
-        
-        if response.status_code == 200:
-            console.print("✅ [green]Model loaded successfully![/green]")
+    model_names = [*model_names]  # not list(...): this module rebinds `list` to the 'list' CLI command
+    
+    # Track results
+    successful_loads = []
+    failed_loads = []
+    
+    # Start loading queue
+    if len(model_names) > 1:
+        console.print(f"🚀 [blue]Starting load queue...[/blue] ({len(model_names)} models)\n")
+    
+    # Load each model
+    for idx, name in enumerate(model_names, 1):
+        # Show progress indicator for multiple models
+        if len(model_names) > 1:
+            console.print(f"[cyan]({idx}/{len(model_names)})[/cyan] [blue]loading[/blue] {name}")
         else:
-            console.print(f"❌ [red]Error loading model: {response.status_code}[/red]")
-            console.print(f"[red]Response:[/red] {response.text}")
-            ctx.exit(1)
+            console.print(f"[blue]loading[/blue] {name}")
+        
+        # Get saved configuration
+        saved_config = get_model_config(name)
+        
+        if not saved_config:
+            console.print(f"❌ [red]Model configuration not found:[/red] {name}")
+            console.print("[yellow]Tip: Use 'openarc list' to see saved configurations.[/yellow]\n")
+            failed_loads.append(name)
+            continue
+        
+        load_config = saved_config.copy()
+        
+        # Make API request to load the model
+        url = f"{cli_instance.base_url}/openarc/load"
+        
+        try:
+            console.print("[cyan]...working[/cyan]")
+            response = requests.post(url, json=load_config, headers=cli_instance.get_headers())
             
-    except requests.exceptions.RequestException as e:
-        console.print(f"❌ [red]Request failed:[/red] {e}")
+            if response.status_code == 200:
+                console.print(f"✅ [green]{name} loaded![/green]\n")
+                successful_loads.append(name)
+            else:
+                console.print(f"❌ [red]error: {response.status_code}[/red]")
+                console.print(f"[red]Response:[/red] {response.text}\n")
+                failed_loads.append(name)
+                
+        except requests.exceptions.RequestException as e:
+            console.print(f"❌ [red]Request failed:[/red] {e}\n")
+            failed_loads.append(name)
+    
+    # Summary
+    console.print("─" * 60)
+    if successful_loads and not failed_loads:
+        console.print(f"🎉 [green]All models loaded![/green] ({len(successful_loads)}/{len(model_names)})")
+    elif successful_loads and failed_loads:
+        console.print(f"⚠️  [yellow]Partial success:[/yellow] {len(successful_loads)}/{len(model_names)} models loaded")
+        console.print(f"   [green]✓ Loaded:[/green] {', '.join(successful_loads)}")
+        console.print(f"   [red]✗ Failed:[/red] {', '.join(failed_loads)}")
+    else:
+        console.print(f"❌ [red]All models failed to load![/red] (0/{len(model_names)})")
+        console.print(f"   [red]✗ Failed:[/red] {', '.join(failed_loads)}")
+    
+    console.print("[dim]Use 'openarc status' to see loaded models.[/dim]")
+    
+    # Exit with error code if any loads failed
+    if failed_loads:
         ctx.exit(1)
 
 @cli.command()
-@click.option('--model-id', required=True, help='Model ID to unload')
+@click.option('--model-name', '--mn', required=True, help='Model name to unload')
 @click.pass_context
-def unload(ctx, model_id):
+def unload(ctx, model_name):
     """
-    - DELETE a model from memory. 
+    - POST Delete a model from registry and unload from memory.
     """
     cli_instance = OpenArcCLI()
 
-    # Make API request
-    url = f"{cli_instance.base_url}/optimum/model/unload"
-    params = {"model_id": model_id}
+    url = f"{cli_instance.base_url}/openarc/unload"
+    payload = {"model_name": model_name}
     
     try:
-        console.print(f"🗑️  [blue]Unloading model:[/blue] {model_id}")
-        response = requests.delete(url, params=params, headers=cli_instance.get_headers())
+        console.print(f"🗑️  [blue]Unloading model:[/blue] {model_name}")
+        response = requests.post(url, json=payload, headers=cli_instance.get_headers())
         
         if response.status_code == 200:
             result = response.json()
-            console.print(f"✅ [green]{result['message']}[/green]")
+            # Handle different possible response formats
+            message = result.get('message', f"Model '{model_name}' unloaded successfully")
+            console.print(f"✅ [green]{message}[/green]")
         else:
             console.print(f"❌ [red]Error unloading model: {response.status_code}[/red]")
             console.print(f"[red]Response:[/red] {response.text}")
@@ -401,13 +310,79 @@ def unload(ctx, model_id):
         console.print(f"❌ [red]Request failed:[/red] {e}")
         ctx.exit(1)
 
+@cli.command()
+@click.option('--model-name','--mn', help='Model name to remove (used with --rm).')
+@click.option('--rm', is_flag=True, help='Remove a model configuration.')
+@click.pass_context
+def list(ctx, rm, model_name):
+    """- List saved model configurations.
+       
+       - Remove a model configuration."""
+    if rm:
+        if not model_name:
+            console.print("❌ [red]Error:[/red] --model-name is required when using --rm")
+
+            ctx.exit(1)
+        
+        # Check if model exists before trying to remove
+        existing_config = get_model_config(model_name)
+        if not existing_config:
+            console.print(f"❌ {model_name}[red] not found:[/red]")
+            console.print("[yellow]Use 'openarc list' to see available configurations.[/yellow]")
+            ctx.exit(1)
+        
+        # Remove the configuration
+        if remove_model_config(model_name):
+            console.print(f"🗑️  [green]Model configuration removed:[/green] {model_name}")
+        else:
+            console.print(f"❌ [red]Failed to remove model configuration:[/red] {model_name}")
+            ctx.exit(1)
+        return
+    
+    config = load_full_config()
+    models = config.get("models", {})
+    
+    if not models:
+        console.print("[yellow]No model configurations found.[/yellow]")
+        console.print("[dim]Use 'openarc add --help' to see how to save configurations.[/dim]")
+        return
+    
+    console.print(f"📋 [blue]Saved Model Configurations ({len(models)}):[/blue]\n")
+    
+    for model_name, model_config in models.items():
+        # Create a table for each model configuration
+        config_table = Table(show_header=False, box=None, pad_edge=False)
+        
+
+        config_table.add_row("model_name", f"[cyan]{model_name}[/cyan]")
+        config_table.add_row("device", f"[blue]{model_config.get('device')}[/blue]")
+        config_table.add_row("engine", f"[green]{model_config.get('engine')}[/green]")
+        config_table.add_row("model_type", f"[magenta]{model_config.get('model_type')}[/magenta]")
+        
+        
+        rtc = model_config.get('runtime_config', {})
+        if rtc:
+            config_table.add_row("", "")
+            config_table.add_row(Text("runtime_config", style="bold underline yellow"), "")
+            for key, value in rtc.items():
+                config_table.add_row(f"  {key}", f"[dim]{value}[/dim]")
+        
+        panel = Panel(
+            config_table,
+            border_style="green"
+        )
+        console.print(panel)
+    
+    console.print("\n[dim]To load saved configurations: openarc load <model_name> [model_name2 ...][/dim]")
+    console.print("[dim]To remove a configuration: openarc list --rm --model-name <model_name>[/dim]")
+
 @cli.command()
 @click.pass_context
 def status(ctx):
     """- GET Status of loaded models."""
     cli_instance = OpenArcCLI()
     
-    url = f"{cli_instance.base_url}/optimum/status"
+    url = f"{cli_instance.base_url}/openarc/status"
     
     try:
         console.print("📊 [blue]Getting model status...[/blue]")
@@ -415,66 +390,39 @@ def status(ctx):
         
         if response.status_code == 200:
             result = response.json()
-            loaded_models = result.get("loaded_models", {})
-            total_models = result.get("total_models_loaded", 0)
+            models = result.get("models", [])
+            total_models = result.get("total_loaded_models", 0)
             
-            if not loaded_models:
+            if not models:
                 console.print("[yellow]No models currently loaded.[/yellow]")
             else:
-                for model_id, model_info in loaded_models.items():
-                    device = model_info.get("device", "unknown")
-                    status_val = model_info.get("status", "unknown")
-                    metadata = model_info.get("model_metadata", {})
-                    model_type = metadata.get("architecture_type", "unknown")
-                    use_cache = str(metadata.get("use_cache", "-"))
-                    dynamic_shapes = str(metadata.get("dynamic_shapes", "-"))
-                    pad_token_id = str(metadata.get("pad_token_id", "-"))
-                    eos_token_id = str(metadata.get("eos_token_id", "-"))
-                    bos_token_id = str(metadata.get("bos_token_id", "-"))
-                    num_streams = str(metadata.get("NUM_STREAMS", "-"))
-                    precision = str(metadata.get("INFERENCE_PRECISION_HINT", "-"))
-                    perf_hint = str(metadata.get("PERFORMANCE_HINT", "-"))
-                    inf_num_threads = str(metadata.get("INFERENCE_NUM_THREADS", "-"))
-                    enable_ht = str(metadata.get("ENABLE_HYPER_THREADING", "-"))
-                    sched_core_type = str(metadata.get("SCHEDULING_CORE_TYPE", "-"))
-
-                    model_table = Table(show_header=False, box=None, pad_edge=False)
-
-                    model_table.add_row(Text("Model Info", style="bold underline cyan"), "")
-                    model_table.add_row("Model ID", f"[cyan]{model_id}[/cyan]")
-                    model_table.add_row("Type", f"[yellow]{model_type}[/yellow]")
-                    model_table.add_row("Status", f"[green]{status_val}[/green]" if status_val == "loaded" else f"[red]{status_val}[/red]")
-                    model_table.add_row("", "")
-
-                    # Device Info Section
-                    model_table.add_row(Text("Device Info", style="bold underline magenta"), "")
-                    model_table.add_row("Device", f"[magenta]{device}[/magenta]")
-                    model_table.add_row("Use Cache", use_cache)
-                    model_table.add_row("Dynamic Shapes", dynamic_shapes)
-                    model_table.add_row("", "")
-
-                    # Token IDs Section
-                    model_table.add_row(Text("Token IDs", style="bold underline yellow"), "")
-                    model_table.add_row("Pad Token ID", pad_token_id)
-                    model_table.add_row("EOS Token ID", eos_token_id)
-                    model_table.add_row("BOS Token ID", bos_token_id)
-                    model_table.add_row("", "")
-
-                    # Performance Settings Section
-                    model_table.add_row(Text("Performance Settings", style="bold underline green"), "")
-                    model_table.add_row("NUM_STREAMS", num_streams)
-                    model_table.add_row("INFERENCE_PRECISION_HINT", precision)
-                    model_table.add_row("PERFORMANCE_HINT", perf_hint)
-                    model_table.add_row("INFERENCE_NUM_THREADS", inf_num_threads)
-                    model_table.add_row("ENABLE_HYPER_THREADING", enable_ht)
-                    model_table.add_row("SCHEDULING_CORE_TYPE", sched_core_type)
-
-                    panel = Panel(
-                        model_table,
-                        title=f"🧩 Model: [bold]{model_id}[/bold]",
-                        border_style="blue" if status_val == "loaded" else "red"
+                # Create a table for all models
+                status_table = Table(title=f"📊 Loaded Models ({total_models})")
+                status_table.add_column("model_name", style="cyan", width=20)
+                status_table.add_column("device", style="blue", width=10)
+                status_table.add_column("model_type", style="magenta", width=15)
+                status_table.add_column("engine", style="green", width=10)
+                status_table.add_column("status", style="yellow", width=10)
+                status_table.add_column("time_loaded", style="dim", width=20)
+                
+                for model in models:
+                    model_name = model.get("model_name")
+                    device = model.get("device")
+                    model_type = model.get("model_type")
+                    engine = model.get("engine")
+                    status = model.get("status")
+                    time_loaded = model.get("time_loaded")
+                    
+                    status_table.add_row(
+                        model_name,
+                        device,
+                        model_type,
+                        engine,
+                        status,
+                        time_loaded
                     )
-                    console.print(panel)
+                
+                console.print(status_table)
                 console.print(f"\n[green]Total models loaded: {total_models}[/green]")
             
         else:
@@ -492,11 +440,11 @@ def tool(ctx):
     """- Utility scripts."""
     pass
 
-@tool.command('device-properties')
+@tool.command('device-props')
 @click.pass_context
 def device_properties(ctx):
     """
-    - Query device properties for all devices.
+    - Query OpenVINO device properties for all available devices.
     """
     
     try:
@@ -541,22 +489,22 @@ def device_detect(ctx):
         diagnostic = DeviceDiagnosticQuery()
         available_devices = diagnostic.get_available_devices()
         
-        table = Table(title="📋 Available Devices")
-        table.add_column("#", style="cyan", width=4)
+        table = Table()
+        table.add_column("Index", style="cyan", width=2)
         table.add_column("Device", style="green")
         
         if not available_devices:
-            console.print("❌ [red]No OpenVINO devices found![/red]")
+            console.print("❌ [red] Sanity test failed: No OpenVINO devices found! Maybe check your drivers?[/red]")
             ctx.exit(1)
         
         for i, device in enumerate(available_devices, 1):
             table.add_row(str(i), device)
         
         console.print(table)
-        console.print(f"\n✅ [green]OpenVINO runtime found {len(available_devices)} device(s)[/green]")
+        console.print(f"\n✅ [green] Sanity test passed: found {len(available_devices)} device(s)[/green]")
             
     except Exception as e:
-        console.print(f"❌ [red]Error during device diagnosis:[/red] {e}")
+        console.print(f"❌ [red]Sanity test failed: No OpenVINO devices found! Maybe check your drivers?[/red] {e}")
         ctx.exit(1)
 
 @cli.group()
@@ -567,12 +515,10 @@ def serve():
     pass
 
 @serve.command("start")
-
 @click.option("--host", type=str, default="0.0.0.0", show_default=True,
               help="""
               - Host to bind the server to
               """)
-
 @click.option("--openarc-port", 
               type=int, 
               default=8000, 
@@ -580,14 +526,43 @@ def serve():
               help="""
               - Port to bind the server to
               """)
-
-def start(host, openarc_port):
+@click.option("--load-models", "--lm",
+              required=False,
+              help="Load models on startup. Specify once followed by space-separated model names.")
+@click.argument('startup_models', nargs=-1, required=False)
+def start(host, openarc_port, load_models, startup_models):
     """
-    - 'start' reads --host and --openarc-port and saves them to the config file. Then it starts the server and will read
+    - 'start' takes --host and --openarc-port (defaults 0.0.0.0:8000), saves them to the config file, then starts the server.
+    
+    Examples:
+        openarc serve start
+        openarc serve start --load-models model1 model2
+        openarc serve start --lm Dolphin-X1 kokoro whisper
     """
     # Save server configuration for other CLI commands to use
     save_cli_config(host, openarc_port)
     
+    # Handle startup models
+    models_to_load = []
+    if load_models:
+        models_to_load.append(load_models)
+    if startup_models:
+        models_to_load.extend(startup_models)
+    
+    if models_to_load:
+        config = load_full_config()
+        saved_models = config.get("models", {})
+        missing = [m for m in models_to_load if m not in saved_models]
+        
+        if missing:
+            console.print("⚠️  [yellow]Warning: Models not in config (will be skipped):[/yellow]")
+            for m in missing:
+                console.print(f"   • {m}")
+            console.print("[dim]Use 'openarc list' to see saved configurations.[/dim]\n")
+        
+        os.environ["OPENARC_STARTUP_MODELS"] = ",".join(models_to_load)
+        console.print(f"📋 [blue]Models to load on startup:[/blue] {', '.join(models_to_load)}\n")
+    
     console.print(f"🚀 [green]Starting OpenArc server on {host}:{openarc_port}[/green]")
     start_server(host=host, openarc_port=openarc_port)
 
@@ -595,3 +570,5 @@ def start(host, openarc_port):
 if __name__ == "__main__":
     cli()
 
+
+