# Extracting NER Skills from LLMs

# Loading

## Imports

In [1]:
# --- Standard library ---
import gc
import logging
import os
import sys
from importlib import reload
from pathlib import Path

# --- Third-party ---
import torch
import matplotlib.pyplot as plt
import circuitsvis as cv

# Debugging configuration, applied before the project code is imported:
# INFO-level logging, CUDA_LAUNCH_BLOCKING=1 and autograd anomaly detection.
logging.basicConfig(level=logging.INFO)
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
torch.autograd.set_detect_anomaly(True)

# --- Project code ---
import llm2ner
import llm2ner.utils as utils
import llm2ner.results as results
import llm2ner.plotting as plotting
from llm2ner.models import NERmodel
from experimaestro import settings

# Resolve the experimaestro workspace named 'LLMinterp', if one is configured.
wspace = settings.get_workspace('LLMinterp')
if wspace is None:
    xp_path = None
    print("No experimaestro workspace found.")
else:
    print(f"Found experimaestro workspace: {wspace.id}")
    xp_path = wspace.path
    print(f"xp_path: {xp_path}")

# Per-user location of the NER datasets; unknown users get `data_path = None`
# and an error log asking them to set it by hand.
USER = os.environ.get("USER")
print(f"USER is {USER}")
_data_dir_by_user = {
    "morand": "/data/morand/NER",
    "victor": "/Users/victor/code/data/NER",
}
if USER in _data_dir_by_user:
    data_path = Path(_data_dir_by_user[USER])
else:
    logging.error(f"Unknown user {USER}, please set data_path manually.")
    data_path = None

# Work from the directory three levels above the llm2ner package's __file__.
_package_dir = Path(llm2ner.__file__).parent
repo_path = _package_dir.parent.parent
os.chdir(repo_path)

print(f"working dir: {os.getcwd()}")
print(f"cuda available: {torch.cuda.is_available()}")

## Load Pretrained Model

In [None]:
# Select a saved ToMMeR checkpoint from ./saved_models (relative to the current
# working directory), or use a saved model path directly:
# path = "/home/morand/experiments/llminterp/xp/llm2ner_TM/results/models/dataset_name=Pile-NER_dilate_entities=None_layer=5_llm_name=meta-llama-Llama-3.2-1B_method=cl_fn_minmaxpool_pos_weight=0_rank=64_teacher_thr_prob=0.9"
saved_models_dir = Path("./saved_models")

# Sort the matches so the chosen checkpoint is reproducible when several exist
# (glob order is otherwise arbitrary), and fail with an actionable error instead
# of a bare StopIteration when none is found.
path = next(iter(sorted(saved_models_dir.glob("ToMMeR*"))), None)
if path is None:
    raise FileNotFoundError(
        f"No checkpoint matching 'ToMMeR*' found in {saved_models_dir.resolve()}; "
        "add a saved model there or set `path` manually."
    )

print(f"Loading learner from {path}")

tommer = NERmodel.from_pretrained(path)
# Name of the backbone LLM recorded on the loaded model; reused to load the LLM.
model_name = tommer.llm_name
print(f"Ner Model {tommer}\n with {tommer.count_parameters()/1e3:.2f} K parameters")

## Load LLM
Now we can load the backbone LLM that our `ToMMeR` was trained on.
Note that since we only use hidden states from an early layer, we cut the LLM at this layer in order to save GPU memory.

In [3]:
# Load the backbone LLM, passing ToMMeR's layer as the cut point.
llm_options = dict(
    to_hookedtransformer=tommer.need_hookedtransformer,
    # dtype=torch.bfloat16,
    cut_to_layer=tommer.layer,
)
model = utils.load_llm(model_name, **llm_options)

# Put ToMMeR on the same device as the LLM's first parameter.
first_param = next(model.parameters())
device = first_param.device
tommer.to(device)

c_length = utils.get_model_max_length(model_name)
dim = utils.get_model_dim(model_name)
print(f"Model dimension is {dim}, context length is {c_length}")

# Report model size and current GPU memory usage.
n_params_billion = utils.count_parameters(model) / 1e9
print(f"{model_name} loaded on {device} as {model.__class__.__name__} with {n_params_billion:.3f} B parameters")
allocated_gb = torch.cuda.memory_allocated() / 1024**3
print(f"GPU allocated memory : {allocated_gb:.3f} GB")

# Demonstration


## Inference on any String

In [None]:
# Re-import the plotting module so edits to it are picked up without a kernel restart.
reload(plotting)
import llm2ner.plotting as plotting

# Example inputs; change the key selected below to try another one.
EXAMPLE_TEXTS = {
    "health_data_hub_fr": "__ dans un rapport publié le 12 mars, le Health Data Hub fait le point sur sa procédure de migration de Microsoft Azure vers une offre \"souveraine\". En 2024, elle raconte avoir intensifié ses échanges avec les industriels et compte mettre en oeuvre une \"solution intercalaire\" en 2025.",
    "eiffel_tower": "The Eiffel Tower is located near the Seine river, in Paris. It was built in 1889 by Gustave Eiffel.",
    "radio_france_fr": "Selon les informations de la Cellule investigation de Radio France et du journal \"Le Monde\", la Ligue des droits de l'homme (LDH) vient de transmettre un signalement doublé d'une plainte au parquet de Paris visant l'assistant vocal de la marque à la pomme. Des accusations que la marque à la pomme a toujours réfutées mais contre lesquelles elle s’apprête tout de même à ouvrir un fonds d'indemnisation de 95 millions de dollars afin d’empêcher toute nouvelle procédure à son encontre aux Etats-Unis. S’il est validé par la justice californienne, l’accord amiable prévoit que les propriétaires américains d’iPhone, iPad, Apple Watch, MacBook, iMac, HomePod, iPod touch ou AppleTV pourront être indemnisés d’une somme de 20 dollars par appareil possédé.",
    "llm_abilities": "Large language models are awesome. While trained on language modeling, they exhibit emergent abilities that make them suitable for a wide range of tasks, including Named Entity Recognition (NER). ",
    "embedding_limits": "Our work aims to bridge this gap, connecting known theoretical results in geometric algebra with modern advances in neural information retrieval. We draw upon research in communication complexity theory to provide a lower bound on the embedding dimension needed to represent a given combination of relevant documents and queries. Specifically, we show that for a given embedding dimension d there exists top-k combinations of documents that cannot be returned—no matter the query—highlighting a theoretical and fundamental limit to embedding models.",
}
text = EXAMPLE_TEXTS["embedding_limits"]

# Optional settings are left commented out so they can be toggled quickly.
demo_options = dict(
    # decoding_strategy="greedy",
    # threshold=0.7,
    return_logits=False,
    show_attn=True,
    # show_values=True,
    # verbose=True,
)
outputs = plotting.demo_inference(text, tommer, model, **demo_options)

## Evaluation on given dataset

In [5]:
# Dataset to evaluate on. Other names previously used in this cell:
#   "Ontonotes", "CoNLL2003", "WikiNeural", "WikiANN en", "CrossNER_literature",
#   "ncbi", "GENIA_NER", "CrossNER_AI"
eval_data_name = "CrossNER_politics"

decoding_strategy = "threshold"  # "threshold" or "greedy"
threshold = 0.5

# Use the per-user data folder resolved in the setup cell instead of a
# hard-coded one; keep the previous location as a fallback when it is unset.
eval_data_folder = data_path if data_path is not None else Path("/data/morand/NER")

eval_dataset = llm2ner.data.load_all_splits(
    eval_data_name,
    data_folder=eval_data_folder,
    mode="last",
    model=model,
)
logging.info(f"\n{len(eval_dataset)} samples in {eval_data_name}")
logging.info(f"Evaluating at layer {tommer.layer} of {tommer.llm_name}, Computing metrics on test set of {eval_data_name}...")

# Keep ToMMeR on the same device as the LLM rather than forcing CUDA, so the
# cell also runs on machines without a GPU.
eval_device = next(model.parameters()).device
tommer.to(eval_device)

metrics = tommer.evaluate(
    eval_dataset.get_loader(batch_size=30),
    decoding_strategy=decoding_strategy,
    threshold=threshold,
    verbose=True,
)

## Inference on dataset

In [None]:
# Re-import the plotting module so edits to it are picked up without a kernel restart.
reload(plotting)
from llm2ner import plotting

# Draw one random sample from the evaluation dataset. The index is printed so
# an interesting example can be looked up again. It is deliberately not called
# `id`, which would shadow the builtin for the rest of the notebook session.
data = eval_dataset
sample_idx = torch.randint(len(data), (1,)).item()
item = data[sample_idx]
print(sample_idx)

outputs = plotting.test_inference(
    tommer,
    model,
    item,
    # decoding_strategy="greedy",
    threshold=0.5,
    show_attn=True,
    # verbose=True,
    return_logits=False,
)