In [1]:
import httpx
import time

def wait_for_vllm_ready(url="http://localhost:8000/v1/models", timeout=300, poll_interval=5):
    """Block until the vLLM HTTP endpoint answers with status 200.

    The endpoint is polled with a GET request. Any unsuccessful attempt
    (connection/transport error *or* a non-200 response) is followed by a
    deadline check and a pause, so the loop can neither spin without
    sleeping nor run past ``timeout``.

    Args:
        url: Endpoint to poll.
        timeout: Maximum number of seconds to keep polling.
        poll_interval: Seconds to sleep between attempts.

    Raises:
        TimeoutError: If no 200 response is received within ``timeout``
            seconds. The last HTTP error, if any, is attached as the cause.
    """
    print("⏳ Waiting for vLLM API to be ready...")
    # A monotonic clock is immune to wall-clock adjustments while waiting.
    deadline = time.monotonic() + timeout
    last_error = None
    while True:
        try:
            response = httpx.get(url, timeout=2)
            if response.status_code == 200:
                print("✅ vLLM is ready!")
                return
            # Server is reachable but not ready yet (e.g. still loading).
            last_error = None
        except httpx.HTTPError as e:
            # Connection refused, read timeout, etc. -- the server is not up yet.
            last_error = e

        # Reached for every unsuccessful attempt, not only for exceptions.
        if time.monotonic() > deadline:
            raise TimeoutError("❌ Timed out waiting for vLLM.") from last_error
        time.sleep(poll_interval)

# Block here until the server answers, so later cells do not call an API that is still starting up.
wait_for_vllm_ready()


⏳ Waiting for vLLM API to be ready...
✅ vLLM is ready!


In [2]:
# List processes with a network socket on port 8000 (requires the `lsof` binary to be installed).
!lsof -i :8000

/bin/bash: lsof: command not found


In [3]:
from openai import OpenAI
import json
import ast
from pathlib import Path
import traceback
from tqdm.notebook import tqdm

# Initialize an OpenAI-compatible client that targets the local server on port 8000.
client = OpenAI(
    base_url='http://localhost:8000/v1',
    # Placeholder key; presumably the local server does not validate it -- confirm.
    api_key='EMPTY', 
)

In [8]:
# Define the API call structure
def openai_api_predict(model, query):
    """Send one single-turn chat request and return the reply text.

    Args:
        model: Model identifier passed through to the chat completions call.
        query: User message content.

    Returns:
        The ``content`` of the first choice's message in the response.
    """
    conversation = [
        {'role': 'system', 'content': 'You are a helpful assistant.'},
        {'role': 'user', 'content': query},
    ]
    # Non-streaming: the complete response object is returned in one piece.
    completion = client.chat.completions.create(
        model=model,
        messages=conversation,
        stream=False,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

def read_json(file_path):
    """Read a UTF-8 encoded JSON file and return the decoded Python object.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        Whatever the JSON document decodes to (dict, list, str, ...).
    """
    with open(file_path, mode='r', encoding='utf-8') as handle:
        raw_text = handle.read()
    return json.loads(raw_text)

def write_json(data, file_path, indent=4):
    """Serialize ``data`` as JSON into ``file_path`` (UTF-8, non-ASCII kept as-is).

    Missing parent directories are created before the file is opened.

    Args:
        data: JSON-serializable object to write.
        file_path: Destination file path.
        indent: Indentation passed to the JSON encoder.
    """
    target = Path(file_path)
    output_dir = target.parent
    output_dir.mkdir(parents=True, exist_ok=True)
    with target.open('w', encoding='utf-8') as handle:
        json.dump(data, handle, ensure_ascii=False, indent=indent)


In [5]:
# Model, dataset and output-path configuration
# Model identifier; it is passed as the `model` argument of each chat request.
model_name = 'Qwen/Qwen2.5-3B-Instruct'
# Short task tag used in the output file name.
test_name = 'ner'

# NOTE(review): absolute, machine-specific location -- adjust when running elsewhere.
data_path = Path('/scratch/project_2005072/keshu/cascade-camp-hands-on/data')
data_file = data_path / 'topres19th/HIPE-prep.json'
# Despite the name this is a file path, not a directory. Because model_name
# contains '/', the file ends up in a subdirectory:
# output/Qwen/Qwen2.5-3B-Instruct_ner.json
pred_dir = f'output/{model_name}_{test_name}.json'

# Load the dataset; the inference loop indexes each entry with the key 'text'.
qas = read_json(data_file)
print(f"Loaded {len(qas)} entries.")

Loaded 309 entries.


In [11]:
# Running inference

# Entity categories requested in the prompt; also the keys of every prediction dict.
ENTITY_LABELS = ("LOC", "STREET", "BUILDING")
# Small subset for a quick test run; set to None to run on all entries.
MAX_SAMPLES = 20


def _parse_ner_answer(answer):
    """Turn the raw model reply into a JSON-compatible dict of entity lists.

    Models frequently wrap the requested object in Markdown code fences or
    add a short preamble, and may emit either strict JSON or Python-literal
    syntax. This helper tolerates all of these.

    Args:
        answer: Raw text returned by the model.

    Returns:
        The parsed dict, with every label in ENTITY_LABELS present
        (missing labels are filled with an empty list).

    Raises:
        ValueError, SyntaxError, TypeError: If no usable object can be
            extracted from ``answer``.
    """
    if not isinstance(answer, str):
        raise ValueError(f"Expected text from the model, got {type(answer).__name__}")

    # Keep only the outermost {...} span: drops code fences and surrounding prose.
    first_brace, last_brace = answer.find("{"), answer.rfind("}")
    if first_brace == -1 or last_brace < first_brace:
        raise ValueError("No JSON object found in model output")
    payload = answer[first_brace:last_brace + 1]

    try:
        # Strict JSON first: handles true/false/null, which literal_eval rejects.
        parsed = json.loads(payload)
    except json.JSONDecodeError:
        # Fall back to Python-literal syntax (e.g. single-quoted strings).
        parsed = ast.literal_eval(payload)
        parsed = json.loads(json.dumps(parsed))  # make sure it is JSON-serializable

    if not isinstance(parsed, dict):
        raise ValueError(f"Expected a dict, got {type(parsed).__name__}")

    for label in ENTITY_LABELS:
        parsed.setdefault(label, [])
    return parsed


preds = []

for qa in tqdm(qas[:MAX_SAMPLES]):
    text = qa['text']

    query = f'''
    This is a named entity recognition task, which consists of two steps:
    1) First, identify all entity mentions in the text.
    2) Then classify each mention into one of the following categories:
    ["LOC", "STREET", "BUILDING"].

    Given the following text:
    {text}

    Output format: {{"LOC": [...], "STREET": [...], "BUILDING": [...]}}
    Do not provide any explanation.
    '''

    try:
        start = time.time()
        answer = openai_api_predict(model_name, query)
        duration = time.time() - start
        print(f"Model output: {answer}")
        print(f"⏱ Time: {duration:.2f}s\n")

        parsed = _parse_ner_answer(answer)

    except Exception as e:
        # Deliberate best-effort: one failed request or unparsable reply must
        # not abort the run; record an empty prediction and continue.
        print(f"⚠️ Error: {e}")
        traceback.print_exc()
        parsed = {label: [] for label in ENTITY_LABELS}

    preds.append({
        "text": text,
        "preds": parsed
    })


  0%|          | 0/20 [00:00<?, ?it/s]

Model output: {"LOC": ["AUSTRALIA", "NEW ZEALAND", "MELBOURNE", "TASMANIA", "LAUNCESTON", "HOBTART TOWN"], "STREET": [], "BUILDING": ["Morning Light", "Prinre of the Seas", "Sirocco", "Red Jacket", "White Jacket", "Star of the East", "White star", "Mermaid", "Telegraph Blue Jaci,et", "Shalimar", "Arabian"]}
⏱ Time: 1.59s

Model output: {"LOC": [], "STREET": [], "BUILDING": []}
⏱ Time: 0.30s

Model output: {"LOC": ["New York", "London", "Leeds", "Birmingham", "Liverpool"], "STREET": [], "BUILDING": []}
⏱ Time: 0.54s

Model output: {"LOC": ["Edinburgh", "Invernessshire", "Scotland"], "STREET": [], "BUILDING": ["Saughton-Hall Asylum"]}
⏱ Time: 0.56s

Model output: {"LOC": ["Kensal Green Cemetery", "London"], "STREET": ["Harrow-road"], "BUILDING": []}
⏱ Time: 0.47s

Model output: {"LOC": ["TAUNTON", "Bradford"], "STREET": ["High-street"], "BUILDING": []}
⏱ Time: 0.46s

Model output: {"LOC": ["WILTSHIRE", "SALISBURY", "Southern Counties", "Wilts", "Bath and West of England Society", "Church

In [10]:
# Save results
# Write the collected predictions to the JSON file at pred_dir; write_json
# creates any missing parent directories first.
write_json(preds, pred_dir)
print(f"✅ Saved {len(preds)} results to: {pred_dir}")

✅ Saved 20 results to: output/Qwen/Qwen2.5-3B-Instruct_ner.json


In [14]:
# Evaluate
# Run the scoring script as a shell command; it prints per-type and overall
# precision/recall/F1 and saves a *_f1_result.json file.
# NOTE(review): presumably it reads the predictions file saved earlier -- confirm in evaluator.py.
!python evaluator.py


===== Per-Type Scores =====
BUILDING   | P: 0.1951, R: 0.3478, F1: 0.25, TP: 8, FP: 33, FN: 15
LOC        | P: 0.7143, R: 0.3571, F1: 0.4762, TP: 40, FP: 16, FN: 72
STREET     | P: 0.2, R: 0.2857, F1: 0.2353, TP: 2, FP: 8, FN: 5

===== Overall Scores =====
Micro-F1:  {'precision': 0.4673, 'recall': 0.3521, 'f1': 0.4016}
Macro-F1:  {'f1': 0.3205}

✅ Results saved to output/Qwen/Qwen2.5-3B-Instruct_ner_f1_result.json
