# LLM-based NER with Groq 

In [1]:
# In[Imports]
# Groq is used because it offers free-tier LLMs, in comparison to OpenAI which needs a paid account.
# Install the SDK with `%pip install groq` and create an API key at https://console.groq.com/
import os, json, time, re, math, random
from typing import List, Dict, Any, Tuple
from collections import defaultdict, Counter

import pandas as pd

# The API key is read from the environment and is never written into the notebook:
# a literal key in a cell ends up in version control, and overwriting the variable
# with a placeholder would clobber a real key that was exported before launch.
# Set it outside the notebook, e.g. `export GROQ_API_KEY=...` before starting Jupyter.
if not os.environ.get("GROQ_API_KEY"):
    print("GROQ_API_KEY is not set; export it before starting the kernel (see https://console.groq.com/).")


## Path to Dataset and Groq Model

In [2]:
# Data files
PATH_JSON  = r"sentence_chunk_Dawood_Annotated.json"
PATH_TEXT  = r"sentence_chunk_Dawood.txt"

# Project labels
LABELS     = ["OTHER", "FINDING", "ANATOMY", "LOCATION", "DESCRIPTION"]

# Choose Groq model
GROQ_MODEL = "llama-3.1-8b-instant"

# Groq client.
# `groq_client` stays None when the SDK cannot be imported or the client cannot be
# constructed, so later cells can detect that and skip the LLM call instead of crashing.
groq_client = None
try:
    from groq import Groq
    groq_client = Groq(api_key = os.getenv("GROQ_API_KEY"))
except Exception as err:
    print("Groq client not available. Install with `%pip install groq` and set GROQ_API_KEY.")
    # Show the real cause: a missing package and a missing/invalid key need different fixes.
    print(f"Reason: {type(err).__name__}: {err}")


# Quick sanity report of the configuration (the key itself is never printed).
print("Has key?", bool(os.getenv("GROQ_API_KEY")))
print("Client is None?", groq_client is None)
print("Model:", GROQ_MODEL)

Has key? True
Client is None? False
Model: llama-3.1-8b-instant


## Load dataset

In [3]:
# In[Load the Dataset]
# Both files are opened with an explicit UTF-8 encoding. Without it, open() falls back
# to the platform's locale encoding, which can mis-decode non-ASCII characters and
# shift the character offsets that the gold annotations rely on.
# NOTE(review): assumes the .txt file is UTF-8 like the JSON export - confirm.
with open(PATH_JSON, "r", encoding = "utf-8") as f:
    annotation = json.load(f)

with open(PATH_TEXT, "r", encoding = "utf-8") as f:
    rawText = f.read()

# Each item of annotation["annotations"] is unpacked as (sentence, meta);
# meta["entities"] holds that sentence's gold entities (missing key -> no entities).
n_sent       = len(annotation["annotations"])
n_ents       = sum(len(meta.get("entities", [])) for _sent, meta in annotation["annotations"])

print(f"Loaded {n_sent} sentences with {n_ents} gold entities.")

Loaded 25 sentences with 175 gold entities.


## Prompt, LLM caller, and evaluation utils

In [4]:
# System message sent with every request made by call_groq_llm.
# It fixes the label inventory and asks the model to answer with a single JSON object
# whose "entities" list holds {"start", "end", "label"} items, where start/end are
# character offsets into the input sentence (end exclusive).
# NOTE(review): the recorded run of this notebook scores close to zero on exact
# span+label match; relying on the model to produce correct character offsets is a
# plausible cause - TODO confirm, and consider asking for the span text instead.
SYSTEM_PROMPT = """
You are an information extraction model for radiology sentences.
Extract entity spans and classify each span into one of:
OTHER, FINDING, ANATOMY, LOCATION, DESCRIPTION.

Return ONLY a strict JSON object with this schema:
{
  "entities": [
    {"start": <int>, "end": <int>, "label": "<one of [OTHER,FINDING,ANATOMY,LOCATION,DESCRIPTION]>"}
  ]
}
Guidelines:
- 'start' and 'end' are character offsets over the ORIGINAL input string, using Python slicing semantics; 'end' is exclusive.
- The substring must exactly equal the characters between start and end.
- If nothing is found, return {"entities": []}.
- Do NOT include any text outside the JSON.
"""

def call_groq_llm(_sentence: str, _mdl: str = "llama-3.1-70b-versatile", max_retries: int = 4, min_backoff: float = 0.5) -> Dict[str, Any]:
    """Ask the Groq chat model for the entities of one sentence and validate the reply.

    Parameters
    ----------
    _sentence   : sentence to annotate; returned offsets index into this exact string.
    _mdl        : Groq model identifier passed to the chat-completions endpoint.
                  NOTE(review): the default differs from GROQ_MODEL - confirm it is
                  still a model id that Groq serves.
    max_retries : maximum number of attempts when a request or JSON parsing fails.
    min_backoff : base delay in seconds; the wait doubles after each failed attempt.

    Returns
    -------
    {"entities": [{"start": int, "end": int, "label": str}, ...]} holding only the
    entities that passed validation. The list is empty when no client is configured,
    when the reply contains no JSON object, or when every attempt failed.
    This function does not raise for request or parsing failures.
    """
    if groq_client is None: # If the user forgot to set the API key, skip the LLM and return no entities.
        return {"entities": []}

    _msg = [{"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": f"Sentence: {_sentence}\nReturn JSON only."},]

    last_err = None
    for attempt in range(max_retries):
        try:
            _req     = groq_client.chat.completions.create(model = _mdl, temperature = 0, messages = _msg,)
            # A reply without text is treated like a reply without JSON.
            _content = _req.choices[0].message.content or ""

            # Extract the outermost {...} so that chatter around the JSON is ignored.
            _srt     = _content.find("{")
            _end     = _content.rfind("}")

            if _srt == -1 or _end < _srt:
                return {"entities": []}
            _data    = json.loads(_content[_srt:_end + 1]) # Converts the JSON text into a Python dict

            if "entities" not in _data or not isinstance(_data["entities"], list):
                return {"entities": []}

            # LLMs sometimes produce invalid positions or labels, so every entity is
            # checked and silently dropped when it is malformed.
            _cleaned  = []
            for _ele in _data["entities"]:
                try:
                    _s, _e = int(_ele["start"]), int(_ele["end"])
                    _lab   = str(_ele["label"]).upper()
                except (KeyError, TypeError, ValueError, OverflowError):
                    continue  # missing field, non-dict item, or non-numeric offset

                # Keep only non-empty spans inside the sentence that carry a known label;
                # a zero-length span can never match a gold entity.
                if 0 <= _s < _e <= len(_sentence) and _lab in LABELS:
                    _cleaned.append({"start": _s, "end": _e, "label": _lab})
            return {"entities": _cleaned}

        except Exception as _err:
            last_err = _err
            # Exponential backoff with jitter to avoid overloading the server;
            # there is nothing left to wait for after the final attempt.
            if attempt < max_retries - 1:
                time.sleep(min_backoff * (2 ** attempt) + random.random()*0.25)
    # on persistent failure
    print("LLM call failed after retries:", last_err)
    return {"entities": []}

In [5]:
def evaluatellm(_y : List[Tuple[int,int,str]], _yHat: List[Tuple[int,int,str]]):
    """Score predicted entities against the annotated ones using exact matching.

    Both arguments are collections of (start, end, label) tuples; duplicates are
    collapsed because the comparison is done on sets. A prediction counts as a
    true positive only when the whole tuple equals a gold tuple.

    Returns a dict with the keys "tp", "fp", "fn", "precision", "recall" and "f1".
    Any ratio whose denominator is zero is reported as 0.0.
    """
    gold_spans = set(_y)
    pred_spans = set(_yHat)

    hits     = len(gold_spans.intersection(pred_spans))   # true positives
    spurious = len(pred_spans.difference(gold_spans))     # false positives
    missed   = len(gold_spans.difference(pred_spans))     # false negatives

    n_predicted = hits + spurious
    n_expected  = hits + missed

    precision = hits / n_predicted if n_predicted else 0.0
    recall    = hits / n_expected if n_expected else 0.0

    pr_sum = precision + recall
    f1     = (2 * precision * recall) / pr_sum if pr_sum else 0.0

    return {"tp":hits,"fp":spurious,"fn":missed,"precision":precision,"recall":recall,"f1":f1}

## Run LLM NER and evaluate

In [6]:
# Per-sentence rows (rounded, for display) and per-label running counts.
all_rows    = []
per_label   = defaultdict(lambda: {"tp" : 0,"fp" : 0,"fn" : 0})
# Unrounded (precision, recall, f1) per sentence: the macro average is computed from
# these so that per-sentence rounding does not leak into the aggregate.
sent_scores = []

for sent, meta in annotation["annotations"]:

    # Gold spans as (start, end, label) tuples.
    gold = [(int(s), int(e), str(lab)) for s, e, lab in meta.get("entities", [])]

    # Call the LLM to get predictions and bring them to the same tuple shape.
    pred_json = call_groq_llm(sent, _mdl = GROQ_MODEL, max_retries = 2)
    pred = [(int(ent["start"]), int(ent["end"]), str(ent["label"]))
            for ent in pred_json.get("entities", [])]

    res = evaluatellm(gold, pred)
    sent_scores.append((res["precision"], res["recall"], res["f1"]))

    all_rows.append({
        "sentence": sent,
        "gold_n": len(gold),
        "pred_n": len(pred),
        "p_exact": round(res["precision"],3),
        "r_exact": round(res["recall"],3),
        "f1_exact": round(res["f1"],3),
    })

    # Per-label counts: score each label's spans with the same exact-match helper.
    # Labels are visited in sorted order so the table rows are built reproducibly.
    for label in sorted({span[2] for span in gold + pred}):
        counts = evaluatellm([span for span in gold if span[2] == label],
                             [span for span in pred if span[2] == label])
        for key in ("tp", "fp", "fn"):
            per_label[label][key] += counts[key]

df_sent     = pd.DataFrame(all_rows)

# Macro average over sentences (every sentence weighs the same); 0.0 when there are none.
n_scored    = len(sent_scores)
macro_exact = {name: round(sum(score[i] for score in sent_scores) / n_scored, 3) if n_scored else 0.0
               for i, name in enumerate(("precision", "recall", "f1"))}

print("Macro over sentences — Exact span+label:", macro_exact)

# Compute precision, recall and F1 per label from the pooled counts and store them in a df
per_label_rows = []
for lab, m in per_label.items():
    tp, fp, fn = m["tp"], m["fp"], m["fn"]
    p          = tp/(tp+fp) if (tp+fp) else 0.0
    r          = tp/(tp+fn) if (tp+fn) else 0.0
    f          = (2*p*r)/(p+r) if (p+r) else 0.0
    per_label_rows.append({ "label": lab, "tp": tp, "fp": fp, "fn": fn,
                            "precision": round(p,3), "recall": round(r,3), "f1": round(f,3)})

# Explicit columns keep sort_values working even when no label was seen at all.
df_labels       = pd.DataFrame(per_label_rows,
                               columns = ["label", "tp", "fp", "fn", "precision", "recall", "f1"]
                               ).sort_values("f1", ascending = False)
df_labels

Macro over sentences — Exact span+label: {'precision': 0.024, 'recall': 0.016, 'f1': 0.019}


Unnamed: 0,label,tp,fp,fn,precision,recall,f1
1,DESCRIPTION,1,10,36,0.091,0.027,0.042
3,ANATOMY,1,33,35,0.029,0.028,0.029
0,OTHER,0,17,49,0.0,0.0,0.0
2,FINDING,0,37,40,0.0,0.0,0.0
4,LOCATION,0,26,13,0.0,0.0,0.0
