In [None]:
import json
import pandas as pd
from sklearn.metrics import classification_report
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

# Load your labeled dataset (gold labels)
with open('tokenized_ner_data_3.json', 'r') as f:
    data = json.load(f)

# Convert to a DataFrame for easier handling
df = pd.DataFrame(data)



In [3]:
# Load the pre-trained NER model and tokenizer
model_name = "dslim/bert-base-NER"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)

# Create a pipeline for NER
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer)

Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [4]:
label_mapping = {
    'MAL-ORG': 'I-ORG',  # Map MAL-ORG to I-ORG
    'ORG': 'I-ORG',       # Map ORG to I-ORG
    'PER': 'I-PER',       # Map PERSON entities to I-PER
    'LOC': 'I-LOC',       # Map LOCATION entities to I-LOC
    'Event': 'I-MISC',    # Map Event-related entities to I-MISC
    'Software': 'I-MISC', # Map Software-related entities to I-MISC
    'CVE': 'I-MISC',      # Map CVE to I-MISC
    'Malware': 'I-MISC',  # Map Malware to I-MISC
    'MISC': 'I-MISC'      # General MISC category to I-MISC
}

In [5]:
def map_labels(labels, label_mapping):
    """
    Map domain-specific labels to generic labels using the provided mapping.
    """
    mapped_labels = []
    for label in labels:
        if label == "O":  # Keep "O" as is
            mapped_labels.append(label)
        else:
            # Extract the prefix (B- or I-) and the entity type
            prefix = label[:2]  # "B-" or "I-"
            entity_type = label[2:]  # The rest of the label

            # Map the entity type using the mapping
            mapped_entity = label_mapping.get(entity_type, "O")  # Default to "O" if not in mapping

            # Combine the prefix with the mapped entity type only once
            mapped_labels.append(f"{prefix}{mapped_entity.lstrip('B-').lstrip('I-')}")
    return mapped_labels

In [6]:
# Predict labels for each text in the dataset
def predict_labels(tokens):
    # Decode tokens to reconstruct the original text
    text = tokenizer.decode(tokenizer.convert_tokens_to_ids(tokens), skip_special_tokens=True)
    
    # Get model predictions using the NER pipeline
    predictions = ner_pipeline(text)
    
    # Initialize the predicted labels with "O"
    predicted_labels = ["O"] * len(tokens)
    
    for pred in predictions:
        pred_text = pred['word']
        pred_start, pred_end = pred['start'], pred['end']
        
        # Encode the predicted text to match token positions
        encoded_pred = tokenizer.encode(pred_text, add_special_tokens=False)
        
        # Match the predicted span with tokens
        for i, token_id in enumerate(tokenizer.convert_tokens_to_ids(tokens)):
            token_start = text.find(tokenizer.decode([token_id], skip_special_tokens=True), pred_start)
            token_end = token_start + len(tokenizer.decode([token_id], skip_special_tokens=True))
            
            if token_start == pred_start and token_end == pred_end:
                predicted_labels[i] = pred['entity']
    
    return predicted_labels

# Apply predictions to each row
df['predicted_labels'] = df['tokens'].apply(predict_labels)

In [7]:
# Apply the label mapping to the gold labels
df['mapped_labels'] = df['labels'].apply(lambda x: map_labels(x, label_mapping))

In [8]:
# Flatten gold labels and predictions for evaluation
all_gold = [label for labels in df['mapped_labels'] for label in labels]
all_pred = [pred for preds in df['predicted_labels'] for pred in preds]

# Generate a classification report
print(classification_report(all_gold, all_pred))

              precision    recall  f1-score   support

       B-LOC       0.03      0.03      0.03       134
      B-MISC       0.04      0.04      0.04       456
       B-ORG       0.06      0.08      0.06       653
       B-PER       0.10      0.15      0.12        80
       I-LOC       0.09      0.24      0.13       104
      I-MISC       0.23      0.03      0.05      1791
       I-ORG       0.26      0.16      0.19      1374
       I-PER       0.30      0.16      0.21       216
           O       0.86      0.92      0.89     26071

    accuracy                           0.79     30879
   macro avg       0.22      0.20      0.19     30879
weighted avg       0.76      0.79      0.77     30879



In [9]:
def inspect_predictions(df, num_examples=5):
    """
    Inspect predicted labels, gold labels, and tokens for a sample of examples.
    :param df: The dataframe containing 'tokens', 'mapped_gold_labels', and 'mapped_predictions'.
    :param num_examples: Number of examples to display.
    """
    for i in range(num_examples):
        print(f"Example {i + 1}:")
        print("-" * 40)
        print("Tokens:            ", " ".join(df.loc[i, 'tokens']))
        print("Gold Labels:       ", " ".join(df.loc[i, 'mapped_labels']))
        print("Predicted Labels:  ", " ".join(df.loc[i, 'predicted_labels']))
        print("\n")

# Inspect a few examples
inspect_predictions(df, num_examples=5)

Example 1:
----------------------------------------
Tokens:             [CLS] A new ransom ##ware - as - a - service ( Ra ##a ##S ) operation named C ##ica ##da ##33 ##01 has already listed 19 victims on its ex ##tor ##tion portal , as it quickly attacked companies worldwide . The new c ##y ##ber ##c ##rim ##e operation is named after the mysterious 2012 - 2014 online / real - world game that involved elaborate cry ##pt ##ographic puzzles and used the same logo for promotion on c ##y ##ber ##c ##rim ##e forums . However , there ' s no connection between the two , and the legitimate project has issued a statement to re ##nounce any association and con ##de ##m ##n the ransom ##ware operators ' actions . The C ##ica ##da ##33 ##01 Ra ##a ##S first began promoting the operation and recruiting affiliates on June 29 , 202 ##4 , in a forum post to the ransom ##ware and c ##y ##ber ##c ##rim ##e forum known as RAM ##P . However , B ##lee ##ping ##C ##om ##pute ##r is aware of C ##ica ##da att