# Inference with Fine-tuned NER Model

This notebook demonstrates how to use the fine-tuned Qwen model for NER label generation.


In [None]:
# Imports
import json
import os

import torch
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer


In [None]:
# Load base model and LoRA weights
base_model_path = "models/Qwen2.5-0.5B-Instruct"
adapter_path = "outputs/final_model"

# Load tokenizer
# NOTE(review): the tokenizer is read from the adapter directory, not the base
# model directory -- this assumes the fine-tuning run saved tokenizer files
# alongside the adapter weights; confirm.
print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(adapter_path)

# Use fp16 only when a CUDA device is available. With device_map="auto" the
# model can land on the CPU, where half precision is slow and not implemented
# for every op in all PyTorch builds, so fall back to fp32 there.
model_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

# Load base model without quantization
print("Loading base model...")
model = AutoModelForCausalLM.from_pretrained(
    base_model_path,
    device_map="auto",
    # NOTE(review): trust_remote_code executes Python shipped with the
    # checkpoint; keep it only if this checkpoint actually requires it.
    trust_remote_code=True,
    torch_dtype=model_dtype,
)

# Load LoRA adapter on top of the base model
print("Loading LoRA adapter...")
model = PeftModel.from_pretrained(model, adapter_path)

# Set to evaluation mode
model.eval()
print("Model loaded successfully!")


In [None]:
# Inference function
def generate_ner_labels(sentence, max_length=512, max_new_tokens=256):
    """Generate NER labels for a given sentence.

    Wraps the sentence in the instruction prompt, runs the module-level
    ``model`` on it and returns only the text generated after the prompt.

    Args:
        sentence: Text to label; inserted verbatim into the "### Input:"
            section of the prompt.
        max_length: Maximum number of prompt tokens; longer prompts are
            truncated by the tokenizer.
        max_new_tokens: Upper bound on the number of tokens generated.

    Returns:
        The generated continuation as a string, with special tokens removed
        and surrounding whitespace stripped.

    Note:
        Generation samples (``do_sample=True``), so repeated calls on the same
        sentence are not guaranteed to return the same text.
    """

    # Prepare the prompt
    instruction = "Given the following sentence, identify and label each word with its named entity tag (PER for person, LOC for location, ORG for organization, MISC for miscellaneous, or O for no entity)."
    prompt = f"### Instruction:\n{instruction}\n\n### Input:\n{sentence}\n\n### Response:\n"

    # Tokenize and move every tensor to the device the model lives on
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=max_length)
    inputs = {k: v.to(model.device) for k, v in inputs.items()}
    prompt_token_count = inputs["input_ids"].shape[1]

    # Generate
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            temperature=0.1,
            do_sample=True,
            top_p=0.95,
            pad_token_id=tokenizer.eos_token_id
        )

    # The returned sequence starts with the prompt tokens, so drop them by
    # position instead of searching the decoded text for "### Response:".
    # Text matching breaks when the prompt was truncated (marker missing),
    # when the sentence itself contains the marker, or when the model emits
    # the marker again in its answer.
    generated_tokens = outputs[0][prompt_token_count:]
    return tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()


In [None]:
# Smoke test: run the model on a few hand-written sentences
test_sentences = [
    "Barack Obama was born in Hawaii.",
    "Microsoft announced a new partnership with OpenAI in San Francisco.",
    "The Eiffel Tower is located in Paris, France.",
    "John Smith works at Google headquarters in Mountain View.",
    "The United Nations conference will be held in New York next month.",
]

print("Testing on sample sentences:\n")
for sentence in test_sentences:
    # Echo the input first so it is visible while generation is running
    print(f"Input: {sentence}")
    labels = generate_ner_labels(sentence)
    # Prediction followed by an 80-character rule separating the samples
    print(f"Output: {labels}", "-" * 80, sep="\n")


In [None]:
# Evaluate on test set
# Load test data; each record is read below through its 'input' and 'output' keys
with open("outputs/data/test_instruction_data.json", "r", encoding="utf-8") as f:
    test_data = json.load(f)

# Test on a subset. Clamp to the amount of data actually available so a test
# file with fewer records cannot raise IndexError.
num_samples = min(10, len(test_data))
print(f"\nEvaluating on {num_samples} test samples:\n")

# One dict per evaluated sample: input text, reference output, model prediction
results = []
for i, sample in enumerate(test_data[:num_samples]):
    input_text = sample['input']
    expected_output = sample['output']

    # Generate prediction
    predicted_output = generate_ner_labels(input_text)

    results.append({
        'input': input_text,
        'expected': expected_output,
        'predicted': predicted_output
    })

    # Show only the first 100 characters of each output to keep the log short
    print(f"Sample {i+1}:")
    print(f"Input: {input_text}")
    print(f"Expected: {expected_output[:100]}...")
    print(f"Predicted: {predicted_output[:100]}...")
    print("-" * 80)

# Save results; create the target directory first so a fresh checkout
# without outputs/results/ does not fail on open()
results_path = "outputs/results/inference_results.json"
os.makedirs(os.path.dirname(results_path), exist_ok=True)
with open(results_path, "w", encoding="utf-8") as f:
    json.dump(results, f, indent=2, ensure_ascii=False)

print(f"\nResults saved to {results_path}")


In [None]:
# Simple evaluation metrics
def parse_output(output_str):
    """Turn a "word: tag, word: tag" string into a ``{word: tag}`` dict.

    The string is cut on ", "; chunks that do not contain ": " are ignored.
    Each remaining chunk is split at its first ": " and both halves are
    stripped of surrounding whitespace. When a word occurs more than once,
    the tag of its last occurrence is kept.
    """
    tagged_chunks = (
        chunk.split(": ", 1)
        for chunk in output_str.split(", ")
        if ": " in chunk
    )
    return {word.strip(): tag.strip() for word, tag in tagged_chunks}

# Calculate exact match accuracy
# A sample counts only when prediction and reference are identical after
# trimming surrounding whitespace.
exact_matches = sum(
    1
    for result in results
    if result['expected'].strip() == result['predicted'].strip()
)

if results:
    print(f"\nExact match accuracy: {exact_matches}/{len(results)} = {exact_matches/len(results)*100:.2f}%")
else:
    # Nothing was evaluated; avoid dividing by zero.
    print("\nExact match accuracy: no results to evaluate")

# Calculate tag-level accuracy (approximate)
# Approximate because parse_output returns a dict keyed by word: a word that
# appears several times in one sentence is counted once, and a reference word
# missing from the prediction counts as an error.
total_tags = 0
correct_tags = 0

for result in results:
    expected_pairs = parse_output(result['expected'])
    predicted_pairs = parse_output(result['predicted'])

    for word, expected_tag in expected_pairs.items():
        total_tags += 1
        if word in predicted_pairs and predicted_pairs[word] == expected_tag:
            correct_tags += 1

if total_tags > 0:
    print(f"Tag-level accuracy (approximate): {correct_tags}/{total_tags} = {correct_tags/total_tags*100:.2f}%")
