In [1]:
 # server.py
import asyncio
import csv
import gc
import os
import queue
import threading
import time
from collections import deque
from dataclasses import dataclass, field
from typing import Dict, List, Optional

import torch
from fastapi import FastAPI
from pydantic import BaseModel
from transformers import (
    AutoModelForSequenceClassification,
    AutoModelForCausalLM,
    AutoTokenizer,
)
from peft import PeftModel

import json
import glob
import joblib
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.metrics import classification_report

import llama_cpp

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
MODELS_DIR = "./models/"

# base LM for all experts (must be compatible with the adapters)
BASE_LM_NAME = "meta-llama/Llama-3.2-1B-Instruct"
ORCHESTRATOR_MODEL_PATH = MODELS_DIR

EXPERT_ADAPTERS = {
    "anxiety": os.path.join(MODELS_DIR, "anxiety"),
    "bipolar": os.path.join(MODELS_DIR, "bipolar"),
    "depression": os.path.join(MODELS_DIR, "depression"),
    "ocd": os.path.join(MODELS_DIR, "ocd"),
    "schizophrenia": os.path.join(MODELS_DIR, "schizophrenia"),
}

LABEL2ID = {
    "anxiety": 0,
    "bipolar": 1,
    "depression": 2,
    "ocd": 3,
    "schizophrenia": 4,
}
ID2LABEL = {v: k for k, v in LABEL2ID.items()}

DEVICE = torch.device("cpu")  # CPU-only as requested

# --- toggles for experiments ---

# 1) MEMORY LIMITING: max number of experts loaded at once
ENABLE_MEMORY_LIMIT = True
MAX_LOADED_EXPERTS = 2  # change to 1..5 ; if 5 == no effective limit

# 2) MICRO-BATCHING per expert (simple, small batches)
ENABLE_BATCHING = False
MAX_BATCH_SIZE = 4
MAX_BATCH_WAIT_S = 0.005   # how long a worker waits to fill batch

# 3) Logging
LOG_FILE_SERVER = "./logs/server_logs.csv"

BASE_MODEL_PATH = "./models/Llama-3.2-1B-Instruct"

In [None]:
HF_TOKEN = ""

In [4]:
class OrchestratorRouter:
    def __init__(self, model_path: str):
        labelmap_path = model_path + "label_mapping.json"
        model_path = model_path + "orchestrator.pkl"

        pipe = joblib.load(model_path)
        with open(labelmap_path, "r", encoding="utf-8") as f:
            mapping = json.load(f)
        label2id = mapping["LABEL2ID"]
        id2label = {int(k): v for k, v in mapping["ID2LABEL"].items()}

        self.model = pipe
        self.label2id = label2id
        self.id2label = id2label

    def route(self, text: str):
        probs = self.model.predict([text])
        print(f"[ORCH] Routing probs: {probs}")
        label_id = probs[0]
        label_str = self.id2label[label_id]
        return label_str

In [5]:
@dataclass
class BatchJob:
    request_id: str
    func_id: int
    prompt: str
    enqueue_time: float
    future: asyncio.Future


@dataclass
class ExpertModel:
    name: str
    adapter_path: str
    tokenizer: AutoTokenizer
    base_lm_name: str = BASE_LM_NAME

    model: Optional[AutoModelForCausalLM] = field(default=None, init=False)
    lock: threading.Lock = field(default_factory=threading.Lock, init=False)

    # for batching
    request_queue: Optional[asyncio.Queue] = field(default=None, init=False)
    worker_task: Optional[asyncio.Task] = field(default=None, init=False)

    def ensure_loaded(self) -> float:
        """
        Lazily load PEFT model. Returns load time in ms (0 if already loaded).
        Also updates global LRU for memory limiting.
        """
        global loaded_lru, experts

        if self.model is not None:
            # mark LRU touch even when already loaded
            mark_expert_used(self.name)
            return 0.0

        print(f"[MEM] Loading expert '{self.name}' from {self.adapter_path}...")
        start = time.perf_counter()
        with self.lock:
            if self.model is None:
                print(f"[MEM] Actually loading expert '{self.name}'...")
                base_model = AutoModelForCausalLM.from_pretrained(
                    BASE_MODEL_PATH,
                    torch_dtype=torch.float32,
                    device_map={"": DEVICE},
                    token=HF_TOKEN,
                )
                print(f"[MEM] Base model loaded for expert '{self.name}'")
                self.model = PeftModel.from_pretrained(
                    base_model,
                    self.adapter_path,
                ).to(DEVICE)
                
                self.model.eval()

                mark_expert_used(self.name)
        end = time.perf_counter()
        return (end - start) * 1000.0

    def unload(self):
        """Free the model to simulate memory pressure."""
        if self.model is not None:
            print(f"[MEM] Unloading expert '{self.name}'")
            self.model = None
            gc.collect()

    # -------- inference helpers --------

    def generate_batch(self, prompts: List[str], max_new_tokens: int = 128) -> List[str]:
        assert self.model is not None, "Model not loaded"

        # 1. Build chat-style inputs
        conversations = [[{"role": "user", "content": p}] for p in prompts]

        input_ids = self.tokenizer.apply_chat_template(
            conversations,
            return_tensors="pt",
            padding=True,
            truncation=True,
            add_generation_prompt=True,  # leaves it at the assistant turn
        ).to(DEVICE)

        # 2. Explicit attention mask (since pad == eos)
        attn_mask = (input_ids != self.tokenizer.pad_token_id).long().to(DEVICE)

        input_len = input_ids.shape[1]

        with torch.no_grad():
            output_ids = self.model.generate(
                input_ids=input_ids,
                attention_mask=attn_mask,
                max_new_tokens=max_new_tokens,
                do_sample=True,
                temperature=0.8,
                top_p=0.9,
                eos_token_id=self.tokenizer.eos_token_id,
                pad_token_id=self.tokenizer.pad_token_id,
            )

        # 3. Drop the prompt part → keep only new assistant tokens
        gen_only = output_ids[:, input_len:]

        texts = self.tokenizer.batch_decode(gen_only, skip_special_tokens=True)
        # optional clean-up whitespace
        texts = [t.strip() for t in texts]
        return texts



    def generate_single(self, prompt: str, max_new_tokens: int = 128) -> str:
        return self.generate_batch([prompt], max_new_tokens=max_new_tokens)[0]

# --- LRU memory book-keeping ---

loaded_lru: deque = deque()  # left = LRU, right = MRU
experts: Dict[str, ExpertModel] = {}  # filled on startup


def mark_expert_used(name: str):
    """Update LRU order and evict experts if over limit."""
    if not ENABLE_MEMORY_LIMIT:
        return

    # move name to right (MRU)
    if name in loaded_lru:
        loaded_lru.remove(name)
    loaded_lru.append(name)

    # evict until limit satisfied
    while len(loaded_lru) > MAX_LOADED_EXPERTS:
        evict_name = loaded_lru.popleft()
        if evict_name == name:
            # shouldn't happen, but guard
            continue
        experts[evict_name].unload()

In [6]:
orchestrator = OrchestratorRouter(ORCHESTRATOR_MODEL_PATH)

base_tokenizer = AutoTokenizer.from_pretrained(BASE_LM_NAME, use_auth_token=HF_TOKEN)
base_tokenizer.pad_token = base_tokenizer.special_tokens_map["eos_token"]
base_tokenizer.padding_side = "left"

experts = {
    label: ExpertModel(
        name=label,
        adapter_path=adapter_path,
        tokenizer=base_tokenizer,
    )
    for label, adapter_path in EXPERT_ADAPTERS.items()
}

# init queues + workers for batching (if enabled)
if ENABLE_BATCHING:
    loop = asyncio.get_event_loop()
    for exp in experts.values():
        exp.request_queue = asyncio.Queue()
        exp.worker_task = loop.create_task(batch_worker(exp))



In [7]:
prompt = "how to have sex with a partner?"
label = orchestrator.route(prompt)
print(f"Routed to expert '{label}'")

[ORCH] Routing probs: [3]
Routed to expert 'ocd'


In [8]:
chosen_expert = experts[label]
queue_time_ms = 0.0
batch_size = 1
load_time_ms = await asyncio.to_thread(chosen_expert.ensure_loaded)

`torch_dtype` is deprecated! Use `dtype` instead!


[MEM] Loading expert 'ocd' from ./models/ocd...
[MEM] Actually loading expert 'ocd'...
[MEM] Base model loaded for expert 'ocd'


In [9]:
output = await asyncio.to_thread(chosen_expert.generate_single, prompt)

In [10]:
output

"When a partner is experiencing compulsive or excessive urges to engage in behaviors (e.g., engaging in auto-sexual rituals, feeling pleasure from sex), the therapist should guide the individual to have sex while they are in a state of normal arousal and while the urges are not overwhelming. This means the couple would work together to manage the urges and reduce their intensity before engaging in intercourse. If the urges are too overwhelming and the urges are so intense that the therapist fears for the couple's safety, a couples therapy approach might be necessary. The therapist should emphasize the importance of mutual support and communication during this challenging time. The couple might also benefit from"

In [3]:
# client.py
import asyncio
import csv
import json
import os
import queue
import random
import threading
import time
from dataclasses import dataclass
from typing import Dict, List

import httpx

TRACE_FILE = "./client_traces/trace1.csv"
PROMPTS_FILE = "./client_traces/prompts.json"
CLIENT_LOG_FILE = "./logs/client_logs.csv"
SERVER_URL = "http://localhost:8000/infer"

In [4]:
# ============================================================
# Logging
# ============================================================

class ClientCsvLogger(threading.Thread):
    def __init__(self, log_queue: queue.Queue, filename: str, fieldnames):
        super().__init__(daemon=True)
        self.log_queue = log_queue
        self.filename = filename
        self.fieldnames = fieldnames

    def run(self):
        os.makedirs(os.path.dirname(self.filename) or ".", exist_ok=True)
        file_exists = os.path.exists(self.filename)
        with open(self.filename, "a", newline="", encoding="utf-8") as f:
            writer = csv.DictWriter(f, fieldnames=self.fieldnames)
            if not file_exists:
                writer.writeheader()

            while True:
                record = self.log_queue.get()
                if record is None:
                    break
                writer.writerow(record)
                f.flush()


client_log_queue = queue.Queue()
client_logger = ClientCsvLogger(
    client_log_queue,
    CLIENT_LOG_FILE,
    fieldnames=[
        "client_send_ts",
        "request_id",
        "func_id",
        "trace_start_time",
        "prompt",
        "response_label",
        "response_expert",
        "e2e_time_ms",
        "server_total_time_ms",
        "batch_size",
    ],
)
client_logger.start()


def log_client(record):
    client_log_queue.put(record)

In [None]:
# ============================================================
# Trace + prompts
# ============================================================

def load_prompts(path: str) -> Dict[int, List[str]]:
    with open(path, encoding="utf-8") as f:
        raw = json.load(f)
    return {int(k): v for k, v in raw.items()}


# ============================================================
# Worker per trace entry
# ============================================================

async def send_request(
    client: httpx.AsyncClient,
    t0: float,
    first_start: float,
    prompts_by_func: Dict[int, List[str]],
):
    # respect relative timing from trace
    relative_start = time.perf_counter() - first_start
    now = time.perf_counter()
    delay = relative_start - (now - t0)
    if delay > 0:
        await asyncio.sleep(delay)

    prompts = prompts_by_func.get(random.randint(0,4))
    prompt = random.choice(prompts)

    send_ts = time.time()
    t_start = time.perf_counter()

    payload = {
        "request_id": 0,
        "func_id": 0,
        "start_time": 0,
        "prompt": prompt,
    }

    resp = await client.post(SERVER_URL, json=payload, timeout=None)
    resp.raise_for_status()
    data = resp.json()

    t_end = time.perf_counter()
    e2e_ms = (t_end - t_start) * 1000.0

    timings = data.get("timings_ms", {})
    server_total = timings.get("total_time_ms", None)
    batch_size = data.get("batch_size", 1)

    log_client(
        {
            "client_send_ts": send_ts,
            "request_id": 0,
            "func_id": 0,
            "trace_start_time": 0,
            "prompt": prompt,
            "response_label": data.get("chosen_label"),
            "response_expert": data.get("chosen_expert"),
            "e2e_time_ms": e2e_ms,
            "server_total_time_ms": server_total,
            "batch_size": batch_size,
        }
    )

    print(
        f"[{0}] func={0} "
        f"label={data.get('chosen_label')} expert={data.get('chosen_expert')} "
        f"batch={batch_size} e2e={e2e_ms:.1f}ms server={server_total:.1f}ms"
    )

prompts_by_func = load_prompts(PROMPTS_FILE)


t0 = time.perf_counter()

async with httpx.AsyncClient() as client:
    tasks = [
        asyncio.create_task(
            send_request(client, t0, 0, prompts_by_func)
        )
    ]
    await asyncio.gather(*tasks)

In [24]:
# server_gguf.py
import asyncio
import csv
import gc
import os
import queue
import threading
import time
from dataclasses import dataclass, field
from typing import Dict, List, Optional

from fastapi import FastAPI
from pydantic import BaseModel
from llama_cpp import Llama
import joblib
import json

# Orchestrator dir: contains orchestrator.pkl + label_mapping.json
ORCHESTRATOR_MODEL_PATH = "./models/"

# Map each label to its GGUF model path
GGUF_MODELS = {
    "anxiety": "./models/gguf/anxiety-q8_0.gguf",
    "bipolar": "./models/gguf/bipolar-q8_0.gguf",
    "depression": "./models/gguf/depression-q8_0.gguf",
    "ocd": "./models/gguf/ocd-q8_0.gguf",
    "schizophrenia": "./models/gguf/schizophrenia-q8_0.gguf",
}

# Hard limit on how many GGUF models can be resident at once
ENABLE_MEMORY_LIMIT = True
MAX_LOADED_MODELS = 2  # you said more than 2 explodes RAM

# Batching per expert
ENABLE_BATCHING = False
MAX_BATCH_SIZE = 4
MAX_BATCH_WAIT_S = 0.05  # how long worker waits to accumulate batch

# llama.cpp params (tune for your CPU)
N_CTX = 4096
N_THREADS = 8
N_PARALLEL = 1  # llama.cpp internal parallel sequences; keep 1 for safety here

LOG_FILE_SERVER = "./logs/server_gguf_logs.csv"


# ============================================================
# Logging infra
# ============================================================

class CsvLoggerThread(threading.Thread):
    def __init__(self, log_queue: queue.Queue, filename: str, fieldnames: List[str]):
        super().__init__(daemon=True)
        self.log_queue = log_queue
        self.filename = filename
        self.fieldnames = fieldnames

    def run(self):
        os.makedirs(os.path.dirname(self.filename) or ".", exist_ok=True)
        file_exists = os.path.exists(self.filename)
        with open(self.filename, "a", newline="", encoding="utf-8") as f:
            writer = csv.DictWriter(f, fieldnames=self.fieldnames)
            if not file_exists:
                writer.writeheader()
                f.flush()
            while True:
                record = self.log_queue.get()
                if record is None:
                    break
                writer.writerow(record)
                f.flush()


server_log_queue = queue.Queue()
server_logger = CsvLoggerThread(
    server_log_queue,
    LOG_FILE_SERVER,
    fieldnames=[
        "server_receive_ts",
        "request_id",
        "func_id",
        "prompt_len",
        "chosen_label",
        "chosen_expert",
        "orchestrator_time_ms",
        "preprocess_time_ms",
        "queue_time_ms",
        "load_time_ms",
        "inference_time_ms",
        "total_time_ms",
        "batch_size",
    ],
)
server_logger.start()


def log_server(record: Dict):
    server_log_queue.put(record)


# ============================================================
# Orchestrator
# ============================================================

class OrchestratorRouter:
    def __init__(self, model_dir: str):
        labelmap_path = os.path.join(model_dir, "label_mapping.json")
        model_path = os.path.join(model_dir, "orchestrator.pkl")

        pipe = joblib.load(model_path)
        with open(labelmap_path, "r", encoding="utf-8") as f:
            mapping = json.load(f)

        self.model = pipe
        self.label2id = mapping["LABEL2ID"]
        self.id2label = {int(k): v for k, v in mapping["ID2LABEL"].items()}

    def route(self, text: str) -> str:
        pred = self.model.predict([text])
        label_id = pred[0]
        return self.id2label[label_id]


# ============================================================
# Global model state for memory limiting
# ============================================================

experts: Dict[str, "GgufExpert"] = {}  # filled on startup

# Protects: num_loaded, expert.active_count, expert.llm
global_cond = threading.Condition()
num_loaded = 0  # how many GGUF models are currently loaded (llm != None)


# ============================================================
# Expert models (GGUF via llama.cpp)
# ============================================================

@dataclass
class BatchJob:
    request_id: str
    func_id: int
    prompt: str
    enqueue_time: float
    future: asyncio.Future


@dataclass
class GgufExpert:
    name: str
    model_path: str

    llm: Optional[Llama] = field(default=None, init=False)
    active_count: int = field(default=0, init=False)
    # per-expert batching queue & worker
    request_queue: Optional[asyncio.Queue] = field(default=None, init=False)
    worker_task: Optional[asyncio.Task] = field(default=None, init=False)

    # ---------- load accounting ----------

    def inc_active(self):
        global global_cond
        with global_cond:
            self.active_count += 1

    def dec_active(self):
        global global_cond
        with global_cond:
            self.active_count = max(0, self.active_count - 1)
            global_cond.notify_all()

    # ---------- memory-limited loading ----------

    def ensure_loaded(self) -> float:
        """
        Hard memory limit:
        - At most MAX_LOADED_MODELS experts with llm != None.
        - If limit is hit:
            * Try to evict some other idle expert (active_count == 0).
            * If none idle, WAIT until someone finishes and retry.
        """
        global num_loaded, global_cond, experts, ENABLE_MEMORY_LIMIT, MAX_LOADED_MODELS

        start = time.perf_counter()

        with global_cond:
            # Fast path: already loaded
            if self.llm is not None:
                return 0.0

            while True:
                # maybe another thread loaded it while we waited
                if self.llm is not None:
                    return 0.0

                if not ENABLE_MEMORY_LIMIT:
                    num_loaded += 1
                    break

                if num_loaded < MAX_LOADED_MODELS:
                    num_loaded += 1
                    break

                # Limit reached: try to evict some other idle expert
                idle_exp: Optional[GgufExpert] = None
                for other in experts.values():
                    if other is self:
                        continue
                    if other.llm is not None and other.active_count == 0:
                        idle_exp = other
                        break

                if idle_exp is not None:
                    print(f"[MEM] Evicting idle expert '{idle_exp.name}'")
                    idle_exp.llm = None
                    num_loaded = max(0, num_loaded - 1)
                    gc.collect()
                    continue

                # All loaded experts busy: wait
                global_cond.wait()

        # We reserved a slot (num_loaded++). Load outside lock.
        try:
            print(f"[MEM] Loading GGUF for expert '{self.name}' from {self.model_path}")
            llm = Llama(
                model_path=self.model_path,
                n_ctx=N_CTX,
                n_threads=N_THREADS,
                n_batch=MAX_BATCH_SIZE * 32,  # heuristic; tune if needed
                n_gpu_layers=0,  # CPU only
                logits_all=False,
                vocab_only=False,
                seed=0,
            )
        except Exception:
            # rollback on failure
            with global_cond:
                num_loaded = max(0, num_loaded - 1)
                global_cond.notify_all()
            raise

        # Commit model
        with global_cond:
            self.llm = llm
            global_cond.notify_all()

        end = time.perf_counter()
        return (end - start) * 1000.0

    # ---------- unload (not called directly except via eviction) ----------

    def unload(self):
        global num_loaded, global_cond
        with global_cond:
            if self.llm is not None:
                print(f"[MEM] Unloading expert '{self.name}'")
                self.llm = None
                num_loaded = max(0, num_loaded - 1)
                gc.collect()
                global_cond.notify_all()

    # ---------- generate helpers (via llama.cpp) ----------

    def _generate_one(self, prompt: str, max_new_tokens: int = 128) -> str:
        assert self.llm is not None, "llm not loaded"
        # Chat-style call so you get instruct behaviour
        result = self.llm.create_chat_completion(
            messages=[
                {"role": "user", "content": prompt}
            ],
            max_tokens=max_new_tokens,
            temperature=0.8,
            top_p=0.9,
        )
        return result["choices"][0]["message"]["content"].strip()

    def generate_single(self, prompt: str, max_new_tokens: int = 128) -> str:
        # run blocking in current thread
        return self._generate_one(prompt, max_new_tokens=max_new_tokens)

    def generate_batch_sequential(self, prompts: List[str], max_new_tokens: int = 128) -> List[str]:
        # llama.cpp-python doesn't do multi-prompt chat in one call cleanly,
        # so we just run them sequentially on the same loaded llm.
        outputs = []
        for p in prompts:
            outputs.append(self._generate_one(p, max_new_tokens=max_new_tokens))
        return outputs
    

In [25]:
global orchestrator, experts, num_loaded
orchestrator = OrchestratorRouter(ORCHESTRATOR_MODEL_PATH)

experts = {
    label: GgufExpert(
        name=label,
        model_path=gguf_path,
    )
    for label, gguf_path in GGUF_MODELS.items()
}

num_loaded = 0

if ENABLE_BATCHING:
    loop = asyncio.get_event_loop()
    for exp in experts.values():
        exp.request_queue = asyncio.Queue()
        exp.worker_task = loop.create_task(batch_worker(exp))

In [26]:
t0 = time.perf_counter()
preprocess_start = t0

# orchestrator routing
orch_start = time.perf_counter()
prompt = "I have no freinds. will you be my friend?"
label = orchestrator.route(prompt)
orch_end = time.perf_counter()
preprocess_end = orch_end
chosen_expert = experts[label]
print(f"Routed to expert '{label}'")

Routed to expert 'anxiety'


In [27]:
queue_time_ms = 0.0
batch_size = 1

chosen_expert.inc_active()

In [28]:
load_time_ms = await asyncio.to_thread(chosen_expert.ensure_loaded)

llama_model_loader: loaded meta data with 28 key-value pairs and 147 tensors from ./models/gguf/anxiety-q8_0.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.type str              = model
llama_model_loader: - kv   2:                               general.name str              = Anxiety_Fp16
llama_model_loader: - kv   3:                         general.size_label str              = 1.2B
llama_model_loader: - kv   4:                          llama.block_count u32              = 16
llama_model_loader: - kv   5:                       llama.context_length u32              = 131072
llama_model_loader: - kv   6:                     llama.embedding_length u32              = 2048
llama_model_loader: - kv   7:                  llama.feed_forward_length

[MEM] Loading GGUF for expert 'anxiety' from ./models/gguf/anxiety-q8_0.gguf


llama_model_loader: - kv  22:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
llama_model_loader: - kv  23:                tokenizer.ggml.bos_token_id u32              = 128000
llama_model_loader: - kv  24:                tokenizer.ggml.eos_token_id u32              = 128009
llama_model_loader: - kv  25:               tokenizer.ggml.add_bos_token bool             = true
llama_model_loader: - kv  26:               tokenizer.ggml.add_sep_token bool             = false
llama_model_loader: - kv  27:                    tokenizer.chat_template str              = {{- bos_token }}\n{%- if custom_tools ...
llama_model_loader: - type  f32:   34 tensors
llama_model_loader: - type q8_0:  113 tensors
print_info: file format = GGUF V3 (latest)
print_info: file type   = Q8_0
print_info: file size   = 1.22 GiB (8.50 BPW) 
init_tokenizer: initializing tokenizer for type 2
load: control token: 128098 '<|reserved_special_token_90|>' is not marked as EOG
load: 

In [30]:

inf_start = time.perf_counter()
output = await asyncio.to_thread(chosen_expert.generate_single, prompt)
inf_end = time.perf_counter()


llama_perf_context_print:        load time =    1165.29 ms
llama_perf_context_print: prompt eval time =    1164.92 ms /    47 tokens (   24.79 ms per token,    40.35 tokens per second)
llama_perf_context_print:        eval time =    5943.24 ms /   127 runs   (   46.80 ms per token,    21.37 tokens per second)
llama_perf_context_print:       total time =    7342.69 ms /   174 tokens
llama_perf_context_print:    graphs reused =        122


In [31]:
chosen_expert.dec_active()

In [32]:

inf_time_ms = elapsed_ms(inf_start, inf_end)

t_end = time.perf_counter()

NameError: name 'elapsed_ms' is not defined