In [8]:
import os
import numpy as np
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

def get_model_pipeline(model_path):
    """
    Build a Hugging Face 'ner' pipeline from the checkpoint at ``model_path``.

    Args:
        model_path: Filesystem path passed to both ``from_pretrained`` calls.

    Returns:
        The object produced by ``pipeline('ner', ...)`` with
        ``aggregation_strategy="simple"``.

    Raises:
        FileNotFoundError: If ``model_path`` does not exist on disk.
        OSError: Re-raised, after printing a hint, when loading the
            tokenizer or the model fails.
    """
    # Guard clause: fail early with a clear message before touching transformers.
    path_found = os.path.exists(model_path)
    if not path_found:
        raise FileNotFoundError(f"Path {model_path} does not exist.")

    print(f"Loading model from: {model_path}")

    # Tokenizer first, then the model; both read from the same directory.
    try:
        loaded_tokenizer = AutoTokenizer.from_pretrained(model_path)
        loaded_model = AutoModelForTokenClassification.from_pretrained(model_path)
    except OSError as load_error:
        print(f"Error loading model. Ensure 'config.json' and 'pytorch_model.bin' (or safetensors) exist in {model_path}")
        raise load_error

    # Assemble the token-classification pipeline from the loaded pieces.
    ner_pipeline = pipeline(
        'ner',
        model=loaded_model,
        tokenizer=loaded_tokenizer,
        aggregation_strategy="simple",
    )
    return ner_pipeline

In [19]:
import re

def process_text(text, nlp_pipeline):
    """
    Snap raw NER predictions onto whole "atomic tokens" and merge neighbours.

    1. Run ``nlp_pipeline(text)`` to obtain the raw predictions.
    2. Split ``text`` into atomic tokens: maximal runs of characters that are
       neither whitespace nor a bracket ('[' or ']').
    3. Every atomic token overlapped by at least one prediction is selected
       in full. Its label is 'IPAddress' when both 'IPAddress' and 'DNSName'
       were predicted inside it; otherwise it is the label of the
       earliest-starting overlapping prediction. Its confidence is the mean
       score of the overlapping predictions.
    4. Consecutive selected tokens that share a label and are separated only
       by whitespace are merged into one entity. The merged confidence is the
       arithmetic mean of the confidences of all merged tokens.

    Args:
        text: The string to analyse.
        nlp_pipeline: Callable that takes ``text`` and returns an iterable of
            dicts carrying 'start', 'end', 'entity_group' and 'score'.

    Returns:
        A list of dicts in text order, each with the keys 'label', 'text',
        'start', 'end' and 'confidence' (a percentage rounded to two
        decimals). The list is empty when no token is overlapped by a
        prediction.
    """
    # Materialise once: the predictions are rescanned for every atomic token.
    raw_results = list(nlp_pipeline(text))

    # Step A: quantize predictions to atomic tokens.
    atomic_entities = []
    for token in re.finditer(r'[^\[\]\s]+', text):
        t_start, t_end = token.span()

        # Half-open interval overlap between the prediction and the token.
        overlapping = [
            r for r in raw_results
            if r['start'] < t_end and r['end'] > t_start
        ]
        if not overlapping:
            continue

        labels = {m['entity_group'] for m in overlapping}
        if 'IPAddress' in labels and 'DNSName' in labels:
            # Priority rule 1: IPAddress beats DNSName inside the same token.
            chosen_label = 'IPAddress'
        else:
            # Priority rule 2: the earliest prediction wins. min() returns the
            # first minimum, matching a stable sort on 'start'.
            chosen_label = min(overlapping, key=lambda m: m['start'])['entity_group']

        atomic_entities.append({
            "label": chosen_label,
            "text": text[t_start:t_end],
            "start": t_start,
            "end": t_end,
            "confidence": sum(float(m['score']) for m in overlapping) / len(overlapping),
        })

    # Step B: group adjacent atomic entities (e.g. "Jan" + "21" -> "Jan 21").
    groups = []
    for entity in atomic_entities:
        if groups:
            previous = groups[-1][-1]
            separator = text[previous['end']:entity['start']]
            # Atomic tokens are separated only by whitespace and/or brackets,
            # so a separator that strips to '' contains no bracket. A bracket
            # between two tokens therefore always starts a new entity.
            if entity['label'] == previous['label'] and separator.strip() == '':
                groups[-1].append(entity)
                continue
        groups.append([entity])

    # Step C: collapse each group into one entity. Averaging over the whole
    # group (instead of repeatedly halving a running value) weights every
    # merged token equally, however many tokens were merged.
    final_entities = []
    for group in groups:
        first, last = group[0], group[-1]
        mean_confidence = sum(e['confidence'] for e in group) / len(group)
        final_entities.append({
            "label": first['label'],
            "text": text[first['start']:last['end']],
            "start": first['start'],
            "end": last['end'],
            "confidence": round(mean_confidence * 100, 2),
        })

    return final_entities

# roberta-large

In [20]:
# 1. Initialize Pipeline
# Point directly to your Kaggle input directory (roberta-large checkpoint).
# get_model_pipeline raises FileNotFoundError if this path does not exist.
model_path = '/kaggle/input/hf-ner-uk-garawise-roberta-large/results/checkpoint-2772'
nlp = get_model_pipeline(model_path) 

# 2. Define your text
# Sample log line; the other model cells visible in this notebook use the
# identical string.
text = "Jan 21 17:36:40 dnsmasq[3468]: query[AAAA] de-lcs.naver.com.akadns.net from 10.143.1.78"

# 3. Get Results
# process_text returns a list of dicts with 'label', 'text', 'start', 'end'
# and 'confidence' (already scaled to a percentage and rounded).
entities = process_text(text, nlp)

# 4. View Results
# One line per entity: label, matched text, confidence in percent.
for ent in entities:
    print(f"[{ent['label']}] {ent['text']} (Conf: {ent['confidence']}%)")

Loading model from: /kaggle/input/hf-ner-uk-garawise-roberta-large/results/checkpoint-2772


Device set to use cpu


[DateTime] Jan 21 17:36:40 (Conf: 100.0%)
[Action] dnsmasq (Conf: 98.14%)
[Process] 3468 (Conf: 100.0%)
[DNSName] de-lcs.naver.com.akadns.net (Conf: 83.32%)
[IPAddress] 10.143.1.78 (Conf: 69.43%)


# xlm-roberta-large

In [21]:
# 1. Initialize Pipeline
# Point directly to your Kaggle input directory (xlm-roberta-large checkpoint).
# get_model_pipeline raises FileNotFoundError if this path does not exist.
# NOTE: this rebinds the notebook-level names model_path and nlp.
model_path = '/kaggle/input/hf-ner-uk-garawise-xlm-roberta-large/results/checkpoint-12240'
nlp = get_model_pipeline(model_path) 

# 2. Define your text
# Sample log line; the other model cells visible in this notebook use the
# identical string.
text = "Jan 21 17:36:40 dnsmasq[3468]: query[AAAA] de-lcs.naver.com.akadns.net from 10.143.1.78"

# 3. Get Results
# process_text returns a list of dicts with 'label', 'text', 'start', 'end'
# and 'confidence' (already scaled to a percentage and rounded).
entities = process_text(text, nlp)

# 4. View Results
# One line per entity: label, matched text, confidence in percent.
for ent in entities:
    print(f"[{ent['label']}] {ent['text']} (Conf: {ent['confidence']}%)")

Loading model from: /kaggle/input/hf-ner-uk-garawise-xlm-roberta-large/results/checkpoint-12240


Device set to use cpu


[DateTime] Jan 21 17:36:40 (Conf: 100.0%)
[Process] 3468 (Conf: 100.0%)
[DNSName] de-lcs.naver.com.akadns.net (Conf: 89.46%)
[IPAddress] 10.143.1.78 (Conf: 99.42%)


# roberta-base

In [22]:
# 1. Initialize Pipeline
# Point directly to your Kaggle input directory (roberta-base checkpoint).
# get_model_pipeline raises FileNotFoundError if this path does not exist.
# NOTE: this rebinds the notebook-level names model_path and nlp.
model_path = '/kaggle/input/hf-ner-uk-garawise-roberta-base/results/checkpoint-2590'
nlp = get_model_pipeline(model_path) 

# 2. Define your text
# Sample log line; the other model cells visible in this notebook use the
# identical string.
text = "Jan 21 17:36:40 dnsmasq[3468]: query[AAAA] de-lcs.naver.com.akadns.net from 10.143.1.78"

# 3. Get Results
# process_text returns a list of dicts with 'label', 'text', 'start', 'end'
# and 'confidence' (already scaled to a percentage and rounded).
entities = process_text(text, nlp)

# 4. View Results
# One line per entity: label, matched text, confidence in percent.
for ent in entities:
    print(f"[{ent['label']}] {ent['text']} (Conf: {ent['confidence']}%)")

Loading model from: /kaggle/input/hf-ner-uk-garawise-roberta-base/results/checkpoint-2590


Device set to use cpu


[DateTime] Jan 21 17:36:40 (Conf: 100.0%)
[DNSName] de-lcs.naver.com.akadns.net (Conf: 89.51%)
[IPAddress] 10.143.1.78 (Conf: 100.0%)
