In [1]:
from transformers import (
    AutoModelForTokenClassification,
    TrainingArguments,
    Trainer,
    AutoTokenizer,
    DataCollatorForTokenClassification,
)
from torch.utils.data import DataLoader
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import torch
from tqdm import tqdm
from seqeval.metrics import classification_report
from nlp_project.models import CubeBert
from nlp_project.data import json_to_Dataset, json_to_Dataset_adv, json_to_Dataset_ensemble
from nlp_project.utils import compute_metrics, inference

# BIO tag inventory for the PII token classifier. A tag's position in this list
# is its integer class id, so the order must not be changed.
# NOTE(review): presumably this order matches the ids used at fine-tuning time — confirm.
all_labels = [
    "B-STREET", "B-CITY", "I-DATE", "B-PASS",
    "I-CITY", "B-TIME", "B-EMAIL", "I-DRIVERLICENSE",
    "I-POSTCODE", "I-BOD", "B-USERNAME", "B-BOD",
    "B-COUNTRY", "B-SECADDRESS", "B-IDCARD", "I-SOCIALNUMBER",
    "I-PASSPORT", "B-IP", "O", "B-TEL",
    "B-SOCIALNUMBER", "I-TIME", "B-BUILDING", "B-PASSPORT",
    "I-TITLE", "I-SEX", "I-STREET", "B-STATE",
    "I-STATE", "B-TITLE", "B-DATE", "B-GEOCOORD",
    "I-IDCARD", "I-TEL", "B-POSTCODE", "B-DRIVERLICENSE",
    "I-GEOCOORD", "I-COUNTRY", "I-EMAIL", "I-PASS",
    "B-SEX", "I-USERNAME", "I-BUILDING", "I-IP",
    "I-SECADDRESS", "B-CARDISSUER", "I-CARDISSUER",
]

# Forward (id -> tag) and reverse (tag -> id) lookup tables.
id2label = dict(enumerate(all_labels))
label2id = {tag: idx for idx, tag in id2label.items()}
n_labels = len(all_labels)

ModuleNotFoundError: No module named 'auxiliary'

In [41]:
def compute_all_metrics(model, tokenizer, dataset, batch_size=8, device=None):
    """Evaluate a token-classification model on `dataset`.

    Prints a seqeval classification report and returns whatever the project's
    `compute_metrics` helper produces for the collected predictions and labels.

    Args:
        model: Model whose forward pass accepts `input_ids` and
            `attention_mask` and returns an object with a `.logits` attribute.
        tokenizer: Tokenizer handed to `DataCollatorForTokenClassification`
            for padding each batch.
        dataset: Dataset exposing `remove_columns`; the "source_text" and
            "tokens" columns are dropped before batching.
        batch_size: Number of examples per evaluation batch.
        device: Device to run on. Defaults to "cuda" when available and falls
            back to "cpu" otherwise, so the notebook also runs without a GPU.

    Returns:
        The value returned by `compute_metrics(predictions, labels)`.
    """
    if device is None:
        device = "cuda" if torch.cuda.is_available() else "cpu"

    model = model.to(device)
    model.eval()

    # Drop non-numeric columns so the collator only sees tensor-convertible fields.
    dataset = dataset.remove_columns(["source_text", "tokens"])

    dataloader = DataLoader(
        dataset,
        batch_size=batch_size,
        collate_fn=DataCollatorForTokenClassification(
            tokenizer, return_tensors="pt"
        ),
    )

    # Per-sequence id arrays for `compute_metrics`. These are deliberately not
    # called `all_labels`, to avoid shadowing the module-level tag list.
    predicted_ids = []
    gold_ids = []
    # Per-sequence tag strings for the seqeval report.
    predicted_tags = []
    gold_tags = []

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating"):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].numpy()

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            preds = torch.argmax(outputs.logits, dim=-1).cpu().numpy()

            predicted_ids.extend(preds)
            gold_ids.extend(labels)

            for pred_seq, label_seq in zip(preds, labels):
                # Positions labelled -100 are skipped in the text report.
                kept = [
                    (int(p), int(l))
                    for p, l in zip(pred_seq, label_seq)
                    if l != -100
                ]
                predicted_tags.append([id2label[p] for p, _ in kept])
                gold_tags.append([id2label[l] for _, l in kept])

    print(classification_report(gold_tags, predicted_tags))

    return compute_metrics(predicted_ids, gold_ids)

In [33]:
# Load the test and validation splits for DistilBERT and ALBERT, in that order.
d_test, a_test, d_val, a_val = map(
    json_to_Dataset,
    (
        "data/distilbert_test.json",
        "data/albert_test.json",
        "data/distilbert_val.json",
        "data/albert_val.json",
    ),
)

In [20]:
def _load_checkpoint(path):
    """Return the (model, tokenizer) pair stored under `path`, model loaded first."""
    return (
        AutoModelForTokenClassification.from_pretrained(path),
        AutoTokenizer.from_pretrained(path),
    )


# "old" checkpoints live under to_share/, "new" ones under models/.
old_d1_model, old_d1_tokenizer = _load_checkpoint("to_share/distilbert1")
new_d1_model, new_d1_tokenizer = _load_checkpoint("models/distilbert1")

old_d2_model, old_d2_tokenizer = _load_checkpoint("to_share/distilbert2")
new_d2_model, new_d2_tokenizer = _load_checkpoint("models/distilbert2")

old_a_model, old_a_tokenizer = _load_checkpoint("to_share/albert1")
new_a_model, new_a_tokenizer = _load_checkpoint("models/albert1")

In [None]:
# Evaluate the models/distilbert1 checkpoint on the adversarial DistilBERT test file.
# The dataset gets its own name so it does not overwrite `d_test`, which other
# cells use for the regular test split.
d_test_adv = json_to_Dataset_adv("data/distilbert_test_adv.json")
model = AutoModelForTokenClassification.from_pretrained("models/distilbert1")
tokenizer = AutoTokenizer.from_pretrained("models/distilbert1")
# compute_all_metrics takes (model, tokenizer, dataset); the tokenizer is required.
res = compute_all_metrics(model, tokenizer, d_test_adv)
res

In [None]:
# Evaluate the models/albert1 checkpoint on the adversarial ALBERT test file.
test = json_to_Dataset_adv("data/albert_test_adv.json")
model = AutoModelForTokenClassification.from_pretrained("models/albert1")
tokenizer = AutoTokenizer.from_pretrained("models/albert1")
# compute_all_metrics takes (model, tokenizer, dataset); the tokenizer is required.
res = compute_all_metrics(model, tokenizer, test)
res

In [None]:
# Score the old and new DistilBERT checkpoints on d_test, in the order
# d1-old, d1-new, d2-old, d2-new.
res1_old, res1_new, res2_old, res2_new = (
    compute_all_metrics(candidate_model, candidate_tokenizer, d_test)
    for candidate_model, candidate_tokenizer in (
        (old_d1_model, old_d1_tokenizer),
        (new_d1_model, new_d1_tokenizer),
        (old_d2_model, old_d2_tokenizer),
        (new_d2_model, new_d2_tokenizer),
    )
)

In [None]:
# Show the metrics computed for old_d1_model on d_test.
res1_old

In [None]:
# Show the metrics computed for new_d1_model on d_test.
res1_new

In [None]:
# Show the metrics computed for old_d2_model on d_test.
res2_old

In [None]:
# Show the metrics computed for new_d2_model on d_test.
res2_new

In [None]:
# Score the old and then the new ALBERT checkpoint on a_test.
resa_old, resa_new = (
    compute_all_metrics(old_a_model, old_a_tokenizer, a_test),
    compute_all_metrics(new_a_model, new_a_tokenizer, a_test),
)

In [None]:
# Show the metrics computed for old_a_model on a_test.
resa_old

In [None]:
# Show the metrics computed for new_a_model on a_test.
resa_new

In [None]:
# Score the two new DistilBERT checkpoints on the validation split d_val.
res1_new_val, res2_new_val = (
    compute_all_metrics(new_d1_model, new_d1_tokenizer, d_val),
    compute_all_metrics(new_d2_model, new_d2_tokenizer, d_val),
)

In [None]:
# Show the validation metrics computed for new_d1_model on d_val.
res1_new_val

In [None]:
# Show the validation metrics computed for new_d2_model on d_val.
res2_new_val

In [None]:
# Score the new ALBERT checkpoint on the ALBERT validation split a_val.
resa_new_val = compute_all_metrics(new_a_model, new_a_tokenizer, a_val)

In [None]:
# Show the validation metrics computed for new_a_model on a_val.
resa_new_val

In [None]:
# NOTE(review): `resac_new_val` is not assigned anywhere in the visible notebook,
# so this cell raises NameError on a fresh kernel. It may be left over from a
# removed cell or a typo for `resa_new_val` — TODO confirm and fix or delete.
resac_new_val

In [16]:
# Log-scale the "confusion_matrix" entry of `res` for plotting; the 0.01 offset
# keeps zero cells finite instead of log(0).
# NOTE(review): `res` is whatever evaluation cell ran last — confirm it is the
# result the following heatmap title refers to.
conf = np.log(res["confusion_matrix"] + 0.01)

In [None]:
# Heatmap of the log-scaled confusion matrix held in `conf`.
fig, ax = plt.subplots(figsize=(10, 7))
sns.heatmap(conf, cmap="Reds", ax=ax)
ax.set_xlabel("Predicted Labels")
ax.set_ylabel("True Labels")
ax.set_title("Confusion Matrix Distilbert Finetuned 1")
plt.show()

In [None]:
# Define a test phrase containing PII-like content.
test_phrase = "My name is Cubo, and my credit card is 4111-1111-1111-1111 issued by VISA."

# Tokenize the input by hand and wrap it in the tokenizer's own CLS/SEP ids
# (rather than hard-coded 101/102, which are only valid for one vocabulary).
inputs = tokenizer.tokenize(test_phrase)
input_ids = torch.tensor(
    [
        [tokenizer.cls_token_id]
        + tokenizer.convert_tokens_to_ids(inputs)
        + [tokenizer.sep_token_id]
    ]
)
# One mask entry per token, same (1, seq_len) shape as input_ids. The previous
# code used len(input_ids), which is the batch size (1), not the sequence length.
attention_mask = torch.ones_like(input_ids)

print(input_ids)
print(attention_mask)

In [None]:
# Run the project's `inference` helper on the hand-built inputs.
# Note: this rebinds `inputs`, which previously held the token strings produced
# by tokenizer.tokenize(test_phrase).
logits, predictions, predicted_token_class, inputs = inference(
    model, input_ids, attention_mask
)

print(predicted_token_class)

In [None]:
# Get the tokens from the tokenizer
tokens = tokenizer.convert_ids_to_tokens(input_ids[0])

# Special tokens are excluded from every visualisation below.
special_tokens = {
    tokenizer.cls_token,
    tokenizer.sep_token,
    tokenizer.pad_token,
}

# Print results
print("Token\tPrediction")
print("-" * 30)
for token, prediction in zip(tokens, predicted_token_class):
    print(f"{token}\t{prediction}")

# Visualize the results with color coding: tokens tagged B-*/I-* are printed in
# bold red, everything else is printed as-is.
colored_text = []
for token, label in zip(tokens, predicted_token_class):
    if token in special_tokens:
        continue

    # Assumes WordPiece-style "##" continuation markers, as the original cell did.
    is_continuation = token.startswith("##")
    token_display = token[2:] if is_continuation else token

    if label.startswith(("B-", "I-")):
        token_display = f"\033[1m\033[91m{token_display}\033[0m"

    # Separate words with a space; glue sub-word pieces onto the previous token.
    if colored_text and not is_continuation:
        colored_text.append(" ")
    colored_text.append(token_display)

# Join tokens to form text (this is simplified and might not be perfect for all tokenizers)
reconstructed_text = "".join(colored_text)
print("\nColored text (PII in red):")
print(reconstructed_text)

# Create a more readable visualization: collect (start, end, type) token spans.
print("\nDetected PII entities:")
entity_spans = []
current_entity = None
current_start = None
last_index = None  # index of the most recent non-special token

for i, (token, label) in enumerate(zip(tokens, predicted_token_class)):
    if token in special_tokens:
        continue

    if label.startswith("B-"):
        # Beginning of a new entity: close the one that was open, if any.
        if current_entity:
            entity_spans.append((current_start, last_index, current_entity))
        current_entity = label[2:]
        current_start = i
    elif label == "O":
        # Outside any entity: close the one that was open, if any.
        if current_entity:
            entity_spans.append((current_start, last_index, current_entity))
            current_entity = None
    # "I-" labels simply extend the currently open entity.

    last_index = i

# Add the last entity if there is one; it ends at the last non-special token,
# so a trailing separator token is not included.
if current_entity:
    entity_spans.append((current_start, last_index, current_entity))

# Span bounds are token indices, so slice the token list itself and let the
# tokenizer turn the pieces back into text.
for start, end, entity_type in entity_spans:
    entity_text = tokenizer.convert_tokens_to_string(tokens[start : end + 1])
    print(f"{entity_type}: {entity_text}")

In [8]:
# Load the "distilbert_finetuned2" checkpoint: model first, then its tokenizer.
model, tokenizer = (
    AutoModelForTokenClassification.from_pretrained("distilbert_finetuned2"),
    AutoTokenizer.from_pretrained("distilbert_finetuned2"),
)

In [None]:
# compute_all_metrics takes (model, tokenizer, dataset). `data` is not defined
# yet at this point of a top-to-bottom run, so the split is loaded explicitly.
# NOTE(review): assumes the DistilBERT test split is the intended evaluation set
# for this checkpoint — confirm.
res = compute_all_metrics(
    model, tokenizer, json_to_Dataset("data/distilbert_test.json")
)
print(res)

In [10]:
# Log-scale the "confusion_matrix" entry of `res` for plotting; the 0.01 offset
# keeps zero cells finite instead of log(0).
conf = np.log(res["confusion_matrix"] + 0.01)

In [None]:
# Heatmap of the log-scaled confusion matrix held in `conf`.
fig, ax = plt.subplots(figsize=(10, 7))
sns.heatmap(conf, cmap="Reds", ax=ax)
ax.set_xlabel("Predicted Labels")
ax.set_ylabel("True Labels")
ax.set_title("Confusion Matrix Distilbert Finetuned 2")
plt.show()

In [28]:
# Load the ALBERT test split. This rebinds `test`, which an earlier cell used
# for the adversarial ALBERT file.
test = json_to_Dataset("data/albert_test.json")

In [29]:
# Materialise every example of `test` into a plain Python list.
data = list(test)

In [30]:
# Load the "albert_finetuned" checkpoint: model first, then its tokenizer.
model, tokenizer = (
    AutoModelForTokenClassification.from_pretrained("albert_finetuned"),
    AutoTokenizer.from_pretrained("albert_finetuned"),
)

In [None]:
# `aux` is never imported in this notebook; use the compute_all_metrics defined
# above, which takes (model, tokenizer, dataset). It calls
# dataset.remove_columns, so it needs the dataset `test`, not the list `data`.
res = compute_all_metrics(model, tokenizer, test)
print(res)

In [16]:
# Log-scale the "confusion_matrix" entry of `res` for plotting; the 0.01 offset
# keeps zero cells finite instead of log(0).
conf = np.log(res["confusion_matrix"] + 0.01)

In [None]:
# Heatmap of the log-scaled confusion matrix held in `conf`.
fig, ax = plt.subplots(figsize=(10, 7))
sns.heatmap(conf, cmap="Reds", ax=ax)
ax.set_xlabel("Predicted Labels")
ax.set_ylabel("True Labels")
ax.set_title("Confusion Matrix Albert Finetuned 1")
plt.show()

In [18]:
# Load the "albert_finetuned2" checkpoint: model first, then its tokenizer.
model, tokenizer = (
    AutoModelForTokenClassification.from_pretrained("albert_finetuned2"),
    AutoTokenizer.from_pretrained("albert_finetuned2"),
)

In [None]:
# compute_all_metrics takes (model, tokenizer, dataset) and calls
# dataset.remove_columns, so it needs the dataset `test`, not the list `data`.
res = compute_all_metrics(model, tokenizer, test)
print(res)

In [20]:
# Log-scale the "confusion_matrix" entry of `res` for plotting; the 0.01 offset
# keeps zero cells finite instead of log(0).
conf = np.log(res["confusion_matrix"] + 0.01)

In [None]:
# Heatmap of the log-scaled confusion matrix held in `conf`.
fig, ax = plt.subplots(figsize=(10, 7))
sns.heatmap(conf, cmap="Reds", ax=ax)
ax.set_xlabel("Predicted Labels")
ax.set_ylabel("True Labels")
ax.set_title("Confusion Matrix Albert Finetuned 2")
plt.show()

In [8]:
# Load the two fine-tuned base models, DistilBERT first and then ALBERT.
distilbert_tuned, albert_tuned = (
    AutoModelForTokenClassification.from_pretrained("distilbert_finetuned"),
    AutoModelForTokenClassification.from_pretrained("albert_finetuned"),
)

In [None]:
# Build the ensemble from the two fine-tuned base models and restore its weights.
# The class imported at the top of the notebook is `CubeBert`; `CuboBert` is not
# defined anywhere.
# NOTE(review): if the class is really named CuboBert, fix the import instead.
model = CubeBert(distilbert_tuned=distilbert_tuned, albert_tuned=albert_tuned)
# map_location="cpu" lets the checkpoint load on machines without a GPU.
# NOTE(review): torch.load unpickles the file — only load trusted checkpoints.
state_dict = torch.load("model_state.pth", map_location="cpu")
model.load_state_dict(state_dict)
model.eval()

In [10]:
# Load the ensemble test file through the project's json_to_Dataset_ensemble helper.
dataset = json_to_Dataset_ensemble("data/ensemble_test.json")

In [11]:
# Materialise every example of `dataset` into a plain Python list.
data = list(dataset)

In [None]:
# NOTE(review): `compute_ensemble_metrics` is neither imported nor defined in the
# visible notebook, so this cell raises NameError on a fresh kernel.
# TODO: import it from wherever it lives in the project.
res = compute_ensemble_metrics(model, data)
print(res)

In [13]:
# Log-scale the "confusion_matrix" entry of `res` for plotting; the 0.01 offset
# keeps zero cells finite instead of log(0).
conf = np.log(res["confusion_matrix"] + 0.01)

In [None]:
# Heatmap of the log-scaled confusion matrix held in `conf`.
fig, ax = plt.subplots(figsize=(10, 7))
sns.heatmap(conf, cmap="Reds", ax=ax)
ax.set_xlabel("Predicted Labels")
ax.set_ylabel("True Labels")
ax.set_title("Confusion Matrix KingBERT")
plt.show()

In [None]:
# Inspect the `alpha` attribute of the ensemble model.
# NOTE(review): presumably the learned mixing weight between the two base models — confirm.
model.alpha