In [1]:
import httpx
import time

def wait_for_vllm_ready(url="http://localhost:8000/v1/models", timeout=300):
    print("⏳ Waiting for vLLM API to be ready...")
    start_time = time.time()
    while True:
        try:
            response = httpx.get(url, timeout=2)
            if response.status_code == 200:
                print("✅ vLLM is ready!")
                break
        except Exception as e:
            if time.time() - start_time > timeout:
                raise TimeoutError("❌ Timed out waiting for vLLM.") from e
            time.sleep(5)

wait_for_vllm_ready()


⏳ Waiting for vLLM API to be ready...
✅ vLLM is ready!


In [4]:
!ps -u $USER | grep 8000

In [5]:
from openai import OpenAI
import json
import ast
from pathlib import Path
import traceback
from tqdm.notebook import tqdm
from evaluator import evaluate_ner

# initial VLLM
client = OpenAI(
    base_url='http://localhost:8000/v1',
    api_key='EMPTY', 
)

In [6]:
# Define api sturcture
def openai_api_predict(model, query):
    response = client.chat.completions.create(
        model=model,
        messages=[
            {'role': 'system', 'content': 'You are a helpful assistant.'},
            {'role': 'user', 'content': query}
        ],
        stream=False,
        max_tokens=512,
    )
    return response.choices[0].message.content

def read_json(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        return json.load(file)

def write_json(data, file_path, indent=4):
    Path(file_path).parent.mkdir(parents=True, exist_ok=True)
    with open(file_path, 'w', encoding='utf-8') as file:
        json.dump(data, file, ensure_ascii=False, indent=indent)


# A Minimal Demo Experiment on NER

In [7]:
# Setting model and path
model_name = 'Qwen/Qwen2.5-3B-Instruct'
test_name = 'ner'

data_path = Path('/scratch/project_2014520/hands-on-ner/cascade-camp-hands-on/data')
data_file = data_path / 'topres19th/HIPE-prep.json'
pred_path = f'output/{model_name}_{test_name}.json'

# read data
qas = read_json(data_file)
print(f"Loaded {len(qas)} entries.")

Loaded 309 entries.


In [8]:
# Running inference

preds = []

for qa in tqdm(qas[:20]):  # manage to test a small subset, then clear to run all
    text = qa['text']

    query = f'''
    This is a named entity recognition task, which consists of two steps:
    1) First, identify all entity mentions in the text.
    2) Then classify each mention into one of the following categories:
    ["LOC", "STREET", "BUILDING"].

    Given the following text:
    {text}

    Output format: {{"LOC": [...], "STREET": [...], "BUILDING": [...]}}
    Do not provide any explanation.
    '''

    try:
        start = time.time()
        answer = openai_api_predict(model_name, query)
        duration = time.time() - start
        print(f"Model output: {answer}")
        print(f"⏱ Time: {duration:.2f}s\n")

        parsed = ast.literal_eval(answer)
        parsed = json.loads(json.dumps(parsed))  # make sure it returns JSON

    except Exception as e:
        print(f"⚠️ Error: {e}")
        traceback.print_exc()
        parsed = {"LOC": [], "STREET": [], "BUILDING": []}

    preds.append({
        "text": text,
        "preds": parsed
    })


  0%|          | 0/20 [00:00<?, ?it/s]

Model output: {"LOC": ["AUSTRALIA", "NEW ZEALAND", "MELBOURNE", "TASMANIA", "LAUNCESTON", "HOBBART TOWN"], "STREET": [], "BUILDING": ["Morning Light", "Prinre of the Seas", "Sirocco", "Red Jacket", "White Jacket", "Star of the East", "White star", "Mermaid", "Telegraph Blue Jaci,et", "Shalimar", "Arabian"]}
⏱ Time: 2.75s

Model output: {"LOC": [], "STREET": [], "BUILDING": []}
⏱ Time: 0.27s

Model output: {"LOC": ["New York", "London", "Leeds", "Birmingham", "Liverpool"], "STREET": [], "BUILDING": []}
⏱ Time: 0.48s

Model output: {"LOC": ["Scotland", "Edinburgh", "Invernessshire"], "STREET": [], "BUILDING": ["Saughton-Hall Asylum", "Holme"]}
⏱ Time: 0.53s

Model output: {"LOC": ["Kensal Green Cemetery", "Thackeray's sepulture"], "STREET": ["Harrow-road"], "BUILDING": []}
⏱ Time: 0.46s

Model output: {"LOC": ["TAUNTON", "Bradford"], "STREET": ["High-street"], "BUILDING": []}
⏱ Time: 0.41s

Model output: {"LOC": ["WILTSHIRE", "SALISBURY", "Basingstoke", "Wilts", "Dorset", "Wiltshire", "B

In [9]:
# Save results
write_json(preds, pred_path)
print(f"✅ Saved {len(preds)} results to: {pred_path}")

✅ Saved 20 results to: output/Qwen/Qwen2.5-3B-Instruct_ner.json


In [10]:
# Evaluate
gt_path = "/scratch/project_2005072/keshu/cascade-camp-hands-on/data/topres19th/HIPE-prep.json"
output_path = f"output/{model_name}_{test_name}_f1_result.json"

per_type_scores, overall_scores = evaluate_ner(gt_path, pred_path)
result = {
    "per_type": per_type_scores,
    "overall": overall_scores
}

with open(output_path, 'w', encoding='utf-8') as f:
    json.dump(result, f, ensure_ascii=False, indent=2)

print(f"\n✅ Results saved to {output_path}")


===== Per-Type Scores =====
BUILDING   | P: 0.1667, R: 0.3478, F1: 0.2254, TP: 8, FP: 40, FN: 15
LOC        | P: 0.7333, R: 0.3929, F1: 0.5116, TP: 44, FP: 16, FN: 68
STREET     | P: 0.2222, R: 0.2857, F1: 0.25, TP: 2, FP: 7, FN: 5

===== Overall Scores =====
Micro-F1:  {'precision': 0.4615, 'recall': 0.3803, 'f1': 0.417}
Macro-F1:  {'f1': 0.329}

✅ Results saved to output/Qwen/Qwen2.5-3B-Instruct_ner_f1_result.json


# Playground
Try out your ideas! They could be but not limited to:
1. Try a different prompt for the task (do not change the output format if you want to use pre-set evaluation method).
2. Try different models in the same pipeline.
3. Try to apply the pipeline to your data.
4. Try to apply the pipeline to your task.
   
Things like 3 and 4 may be limited by the time to establish a valid evaluation script. But you are still encouraged to evaluate the results manually!

Let's report the most interesting part you found in your experiments.


In [12]:
# Setting model and path
model_name = 'Qwen/Qwen2.5-3B-Instruct' # You may change the model and also the one run by vllm. Refer to https://docs.vllm.ai/en/v0.9.2/models/supported_models.html#list-of-text-only-language-models
test_name = 'ner_new_prompt' # Edit the test name

data_path = Path('/scratch/project_2014520/hands-on-ner/cascade-camp-hands-on/data') # You may change the data file path
data_file = data_path / 'topres19th/HIPE-prep.json'
pred_path = f'output/{model_name}_{test_name}.json' # You could also have your output filename format

# read data
qas = read_json(data_file) # question and answers
print(f"Loaded {len(qas)} entries.")


# Running inference
preds = []

for qa in tqdm(qas[:20]):  # manage to test a small subset, then clear to run all
    text = qa['text']

    # Edit the prompt or even define another task. Here is one example given by Gemini.
    query = f"""
    Your task is to perform Named Entity Recognition.
    
    Extract entities from the text and classify them into one of three categories:
    - LOC: Cities, states, countries, or well-known areas.
    - STREET: Specific names of roads or streets.
    - BUILDING: Specific names of buildings or facilities.
    
    Provide the output in a single, valid dictionary object format with the keys "LOC", "STREET", and "BUILDING".
    If a category has no entities, its value must be an empty list [].
    Do not include any additional format words like "json". Do not provide any explanations.
    
    ---
    
    Example 1:
    Text: The famous Lihpao Land is located in Houli District, not far from San Feng Road.
    Output:
    {{
      "LOC": ["Houli District"],
      "STREET": ["San Feng Road"],
      "BUILDING": ["Lihpao Land"]
    }}
    
    ---
    
    Example 2:
    Text: Many visitors travel along Freeway 1 to get to Taichung City Hall.
    Output:
    {{
      "LOC": ["Taichung"],
      "STREET": ["Freeway 1"],
      "BUILDING": ["Taichung City Hall"]
    }}
    
    ---
    
    Text to process:
    {text}
    Output:
    """

    try:
        start = time.time()
        answer = openai_api_predict(model_name, query)
        duration = time.time() - start
        print(f"Model output: {answer}")
        print(f"⏱ Time: {duration:.2f}s\n")

        parsed = ast.literal_eval(answer)
        parsed = json.loads(json.dumps(parsed))  # make sure it returns JSON

    except Exception as e:
        print(f"⚠️ Error: {e}")
        traceback.print_exc()
        parsed = {"LOC": [], "STREET": [], "BUILDING": []}

    preds.append({
        "text": text,
        "preds": parsed
    })


# Save results
write_json(preds, pred_path)
print(f"✅ Saved {len(preds)} results to: {pred_path}")


Loaded 309 entries.


  0%|          | 0/20 [00:00<?, ?it/s]

Model output: {
  "LOC": ["Australia", "New Zealand", "Melbourne", "Launceston", "Hobart Town", "Tasmania"],
  "STREET": [],
  "BUILDING": []
}
⏱ Time: 0.63s

Model output: {
  "LOC": [],
  "STREET": [],
  "BUILDING": ["DORSET COUNTY CHRONICLE AND SOMERSETSHIRE GAZETTE"]
}
⏱ Time: 0.55s

Model output: {
  "LOC": [],
  "STREET": [],
  "BUILDING": []
}
⏱ Time: 0.33s

Model output: {
  "LOC": ["Edinburgh", "Invernessshire", "Saughton-Hall Asylum", "Scotland"],
  "STREET": [],
  "BUILDING": ["Holme", "Invernessshire", "Saughton-Hall Asylum"]
}
⏱ Time: 0.79s

Model output: {
  "LOC": ["Kensal Green Cemetery", "London", "Thackeray's early schoolfellow"],
  "STREET": ["Harrow-road"],
  "BUILDING": []
}
⏱ Time: 0.59s

Model output: {
  "LOC": ["Taunton", "Bradford", "Somerset"],
  "STREET": ["High-street"],
  "BUILDING": []
}
⏱ Time: 0.52s

Model output: {
  "LOC": ["Wiltshire", "Salisbury", "Basingstoke", "Southern Counties", "Wilts", "Dorset", "Churchel Parva", "More Critchell", "Long Creche

Traceback (most recent call last):
  File "/tmp/shuke111/29317696/ipykernel_960250/4030759423.py", line 69, in <module>
    parsed = ast.literal_eval(answer)
             ^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/lib64/python3.12/ast.py", line 66, in literal_eval
    node_or_string = parse(node_or_string.lstrip(" \t"), mode='eval')
                     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/lib64/python3.12/ast.py", line 52, in parse
    return compile(source, filename, mode, flags,
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "<unknown>", line 3
    "STREET": ["Sanitary Street", "Main Street", "High Street", "Market Street", "Church Street", "King Street", "Queen Street", "Street", "Lane", "Road", "Lane", "Street", "High Street", "Street", "Street", "Street", "Street", "Street", "Street", "Street", "Street", "Street", "Street", "Street", "Street", "Street", "Street", "Street", "Street", "Street", "Street", "Street", "Street", "Street", "Street", "Street"

Model output: {
  "LOC": ["Malmesbury", "Salisbury", "Brembill", "West Laviogton", "Devizes"],
  "STREET": ["King’s Arms Inn", "Black Horse Inn"],
  "BUILDING": ["King’s Arms Inn", "Black Horse Inn"]
}
⏱ Time: 0.85s

Model output: {
  "LOC": [],
  "STREET": [],
  "BUILDING": []
}
⏱ Time: 0.32s

Model output: {
  "LOC": [],
  "STREET": [],
  "BUILDING": ["Piceewno"]
}
⏱ Time: 0.40s

Model output: {
  "LOC": ["Blandford", "Taichung", "Blandfoid Forum", "Salisbury-street"],
  "STREET": ["Freeway 1", "Salisbury-street"],
  "BUILDING": ["Lihpao Land", "Taichung City Hall", "Mansion-Housk", "Mansion", "Close of Blandfoid Forum", "Close of valuable and excellent MEADOW or PASTURE", "Mansion-Housk", "Stable"]
}
⏱ Time: 1.42s

Model output: {
  "LOC": ["Beaminster", "Dorset", "Ulandfotd", "Sturminster Newton", "Shaston South Down"],
  "STREET": ["Turnpike Road"],
  "BUILDING": ["Green-House"]
}
⏱ Time: 0.77s

Model output: {
  "LOC": ["Castle of Winchester", "Hamshire", "Portsmouth", "Taichung"

Traceback (most recent call last):
  File "/tmp/shuke111/29317696/ipykernel_960250/4030759423.py", line 69, in <module>
    parsed = ast.literal_eval(answer)
             ^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/lib64/python3.12/ast.py", line 66, in literal_eval
    node_or_string = parse(node_or_string.lstrip(" \t"), mode='eval')
                     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/lib64/python3.12/ast.py", line 52, in parse
    return compile(source, filename, mode, flags,
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "<unknown>", line 1
    ```json
    ^
SyntaxError: invalid syntax


Model output: {
  "LOC": ["Mauritius", "his Majesty", "sugar colonies", "English government", "colonies"],
  "STREET": [],
  "BUILDING": []
}
⏱ Time: 0.58s

Model output: {
  "LOC": ["Weymouth", "Portsmouth", "Flymouth", "Flymoilhto Portsmouth", "Flymouth", "Flymoilhto Portsmouth", "Portland", "English Channel", "the island of Portland", "the English Channel", "Brunswick", "Flymouth", "Flymoilhto Portsmouth", "Flymouth", "Flymoilhto Portsmouth", "Flymouth", "Flymoilhto Portsmouth", "Flymouth", "Flymoilhto Portsmouth", "Flymouth", "Flymoilhto Portsmouth", "Flymouth", "Flymoilhto Portsmouth", "Flymouth", "Flymoilhto Portsmouth", "Flymouth", "Flymoilhto Portsmouth", "Flymouth", "Flymoilhto Portsmouth", "Flymouth", "Flymoilhto Portsmouth", "Flymouth", "Flymoilhto Portsmouth", "Flymouth", "Flymoilhto Portsmouth", "Flymouth", "Flymoilhto Portsmouth", "Flymouth", "Flymoilhto Portsmouth", "Flymouth", "Flymoilhto Portsmouth", "Flymouth", "Flymoilhto Portsmouth", "Flymouth", "Flymoilhto Portsmou

Traceback (most recent call last):
  File "/tmp/shuke111/29317696/ipykernel_960250/4030759423.py", line 69, in <module>
    parsed = ast.literal_eval(answer)
             ^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/lib64/python3.12/ast.py", line 66, in literal_eval
    node_or_string = parse(node_or_string.lstrip(" \t"), mode='eval')
                     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/lib64/python3.12/ast.py", line 52, in parse
    return compile(source, filename, mode, flags,
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "<unknown>", line 2
    "LOC": ["Weymouth", "Portsmouth", "Flymouth", "Flymoilhto Portsmouth", "Flymouth", "Flymoilhto Portsmouth", "Portland", "English Channel", "the island of Portland", "the English Channel", "Brunswick", "Flymouth", "Flymoilhto Portsmouth", "Flymouth", "Flymoilhto Portsmouth", "Flymouth", "Flymoilhto Portsmouth", "Flymouth", "Flymoilhto Portsmouth", "Flymouth", "Flymoilhto Portsmouth", "Flymouth", "Flymoilhto Po

Model output: {
  "LOC": ["Cherbourg", "London", "Esplanade", "Yarborough", "Thomas", "Pelham", "Thomas", "Poole", "Urquhart", "Hamilton"],
  "STREET": [],
  "BUILDING": []
}
⏱ Time: 0.81s

Model output: {
  "LOC": [],
  "STREET": [],
  "BUILDING": ["Mop Fair", "Taichung City Hall", "Farnhatn-row", "Country-row", "Jrondale-row", "Country Hops", "Andover fair", "Melksham market", "Reef Its.", "Coventry"]
}
⏱ Time: 1.03s

Model output: {
  "LOC": ["London", "Bristol", "Southampton", "Bath", "Guernsey", "Jersey"],
  "STREET": ["Weymouth", "Hatchets New White Horse Cellar", "Snow Hill", "Ludgate Hill", "New White Horse Cellar"],
  "BUILDING": ["Golden Lion Inn", "Royal Dorset", "Bath", "Bath", "Bath", "Bath", "Bath", "Bath", "Bath", "Bath", "Bath", "Bath", "Bath", "Bath", "Bath", "Bath", "Bath", "Bath", "Bath", "Bath", "Bath", "Bath", "Bath", "Bath", "Bath", "Bath", "Bath", "Bath", "Bath", "Bath", "Bath", "Bath", "Bath", "Bath", "Bath", "Bath", "Bath", "Bath", "Bath", "Bath", "Bath", "Bath

Traceback (most recent call last):
  File "/tmp/shuke111/29317696/ipykernel_960250/4030759423.py", line 69, in <module>
    parsed = ast.literal_eval(answer)
             ^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/lib64/python3.12/ast.py", line 66, in literal_eval
    node_or_string = parse(node_or_string.lstrip(" \t"), mode='eval')
                     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/lib64/python3.12/ast.py", line 52, in parse
    return compile(source, filename, mode, flags,
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "<unknown>", line 4
    "BUILDING": ["Golden Lion Inn", "Royal Dorset", "Bath", "Bath", "Bath", "Bath", "Bath", "Bath", "Bath", "Bath", "Bath", "Bath", "Bath", "Bath", "Bath", "Bath", "Bath", "Bath", "Bath", "Bath", "Bath", "Bath", "Bath", "Bath", "Bath", "Bath", "Bath", "Bath", "Bath", "Bath", "Bath", "Bath", "Bath", "Bath", "Bath", "Bath", "Bath", "Bath", "Bath", "Bath", "Bath", "Bath", "Bath", "Bath", "Bath", "Bath", "Bath", "Bat

✅ Saved 20 results to: output/Qwen/Qwen2.5-3B-Instruct_ner_new_prompt.json


In [None]:
# Evaluate
gt_path = "/scratch/project_2014520/hands-on-ner/cascade-camp-hands-on/data/topres19th/HIPE-prep.json"
output_path = f"output/{model_name}_{test_name}_f1_result.json" # You need to change username to your own directory

per_type_scores, overall_scores = evaluate_ner(gt_path, pred_path)
result = {
    "per_type": per_type_scores,
    "overall": overall_scores
}

with open(output_path, 'w', encoding='utf-8') as f:
    json.dump(result, f, ensure_ascii=False, indent=2)

print(f"\n✅ Results saved to {output_path}")