In [31]:
import auxiliary as aux
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer, AutoTokenizer, DataCollatorForTokenClassification
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import torch
from kingbert import KingBert

In [32]:
train = aux.json_to_Dataset("data/distilbert_train.json")

In [33]:
train

Dataset({
    features: ['input_ids', 'labels', 'source_text', 'tokens', 'attention_mask'],
    num_rows: 18244
})

In [34]:
model = AutoModelForTokenClassification.from_pretrained('to_share/distilbert1')
tokenizer = AutoTokenizer.from_pretrained('to_share/distilbert1')

  return torch.load(checkpoint_file, map_location=map_location)


In [35]:
data = train[0]

logits, prediction, predicted_token_class, inputs = aux.inference(model, torch.tensor([data['input_ids']]), torch.tensor([data['attention_mask']]))

res = aux.compute_metrics([prediction.tolist()[0]], [data['labels']])

In [None]:
res = aux.compute_all_metrics(model, train)
print(res)

In [73]:
# ------------------------------------------------------------------
# Helper: fitness for ONE mutated sentence  (uses aux.compute_metrics)
# ------------------------------------------------------------------
from copy import deepcopy
import random

from tqdm import tqdm


def fitness_score(model, datum, mutated_ids):
    _, preds, _, _ = aux.inference(
        model,
        torch.tensor([mutated_ids]),
        torch.tensor([datum["attention_mask"]])
    )
    metrics = aux.compute_metrics([preds.tolist()[0]], [datum["labels"]])
    return 1.0 - metrics["recall"]   # maximise mismatch ⟹ minimise recall


# ------------------------------------------------------------------
# Evolutionary attack on ONE sentence
# ------------------------------------------------------------------
def evolve_sentence(
    model,
    datum,
    valid_token_ids,
    pop_size=30,
    n_generations=20,
    mutation_rate=0.15,
    elite_frac=0.2,
    target_recall=0.2,
    seed=42,
):
    random.seed(seed)
    original = datum["input_ids"]
    seq_len  = len(original)

    def random_mutation(base):
        child = base.copy()
        for i, lab in enumerate(datum["labels"]):
            if lab == 18 and random.random() < mutation_rate:
                child[i] = random.choice(valid_token_ids)
        return child

    population = [random_mutation(original) for _ in range(pop_size)]

    for _ in range(n_generations):
        fitness_vals = [fitness_score(model, datum, ind) for ind in population]
        ranked = sorted(zip(population, fitness_vals), key=lambda x: x[1], reverse=True)
        best_ind, best_fit = ranked[0]
        best_rec = 1.0 - best_fit
        if best_rec <= target_recall:
            break

        n_elite = max(1, int(elite_frac * pop_size))
        elites  = [deepcopy(ind) for ind, _ in ranked[:n_elite]]

        offspring = []
        while len(offspring) < pop_size - n_elite:
            parent = random.choice(elites)
            child  = random_mutation(parent)
            if random.random() < 0.3:          # crossover
                other  = random.choice(elites)
                pt     = random.randint(1, seq_len - 2)
                child  = child[:pt] + other[pt:]
            offspring.append(child)

        population = elites + offspring

    return best_ind, best_rec

# ------------------------------------------------------------------
# Attack the whole dataset and report new metrics
# ------------------------------------------------------------------
def adversarial_dataset(
    model,
    tokenizer,
    dataset,
    pop_size=30,
    n_generations=20,
    mutation_rate=0.15,
):
    # Cache normal vocab minus specials
    special_ids = set(tokenizer.all_special_ids)
    valid_token_ids = [tid for tid in tokenizer.get_vocab().values()
                       if tid not in special_ids]

    adv_inputs = []
    adv_labels = []

    for datum in tqdm(dataset, desc="Evolving sentences"):
        best_ids, _ = evolve_sentence(
            model, datum, valid_token_ids,
            pop_size=pop_size,
            n_generations=n_generations,
            mutation_rate=mutation_rate,
        )
        adv_inputs.append(best_ids)
        adv_labels.append(datum["labels"])   # labels unchanged

    # Evaluate the whole adversarial corpus
    preds = []
    for ids, datum in tqdm(zip(adv_inputs, dataset),
                           total=len(dataset),
                           desc="Inference on adversarial set"):
        _, p, _, _ = aux.inference(
            model,
            torch.tensor([ids]),
            torch.tensor([datum["attention_mask"]])
        )
        preds.append(p.tolist()[0])

    metrics = aux.compute_metrics(preds, adv_labels)
    return metrics, preds, adv_inputs, dataset

In [74]:
import warnings
from sklearn.exceptions import UndefinedMetricWarning

# Suppress only the specific warning from seqeval
warnings.filterwarnings("ignore", category=UndefinedMetricWarning)

# 1️⃣ load everything exactly as you did
model      = AutoModelForTokenClassification.from_pretrained('to_share/distilbert1')
tokenizer  = AutoTokenizer.from_pretrained('to_share/distilbert1')
train_ds   = aux.json_to_Dataset("data/distilbert_train.json")

# 2️⃣ run the adversarial evolution
adv_metrics, adv_preds, adv_inputs, dataset = adversarial_dataset(
    model,
    tokenizer,
    train_ds.select(range(10)),    # start small for speed
    pop_size        = 10,
    n_generations   = 5,
    mutation_rate   = 0.05,
)

print("Metrics on evolved examples:")
for k, v in adv_metrics.items():
    if k != "confusion_matrix":
        print(f"{k:>12}: {v:.4f}")


  return torch.load(checkpoint_file, map_location=map_location)
Evolving sentences: 100%|██████████| 10/10 [00:16<00:00,  1.65s/it]
Inference on adversarial set: 100%|██████████| 10/10 [00:00<00:00, 50.55it/s]


Metrics on evolved examples:
   precision: 0.6909
      recall: 0.7451
          f1: 0.7170
    accuracy: 0.9716


In [75]:
# Suppose adv_inputs[0] is a list of token IDs (ints)
token_ids = adv_inputs[1]

# Convert to tokens
tokens = tokenizer.convert_ids_to_tokens(token_ids)

# Convert to string (human-readable sentence)
sentence = tokenizer.decode(token_ids, skip_special_tokens=True)

In [77]:
print(dataset[1]['source_text'])
print(dataset[1]['labels'])
print("\n\n\n")
# Suppose adv_inputs[0] is a list of token IDs (ints)
token_ids = adv_inputs[1]

# Convert to tokens
tokens = tokenizer.convert_ids_to_tokens(token_ids)

# Convert to string (human-readable sentence)
sentence = tokenizer.decode(token_ids, skip_special_tokens=True)
print(sentence)
print(adv_preds[1])

- Meeting at 2:33 PM
- N23 - Meeting at 11:29pm
- wennmann27 - Meeting at 4:45 PM

Kindly make sure to adjust your schedules accordingly. Our group messaging platform will be utilized for any updates, reminders, or changes to the agenda. This collaborative approach will ensure smooth communication and coordination throughout the admissions process.

Looking forward to our productive discussions and successful collaboration.

Warm regards,
[Your Name]
[-100, 18, 18, 18, 5, -100, -100, 21, 18, 10, -100, 18, 18, 18, 5, -100, -100, -100, 18, 10, -100, -100, -100, 18, 18, 18, 5, -100, -100, 21, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, -100, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, -100]




- witches at 2 : 33 pm - n23san meeting at 11 : 29pm - wennmann27unes meeting at 4 : 45 pm romanilean sure to adjust your schedules accordingly. our gr

In [88]:
# Adversarial input details
token_ids = adv_inputs[1]
predicted_labels = adv_preds[1]
true_labels = dataset[1]['labels']

# --- ORIGINAL INPUT ---
# Tokenize original input
original_encoding = tokenizer(dataset[1]['source_text'], truncation=True, padding="max_length", max_length=len(token_ids))
original_ids = original_encoding['input_ids']
original_tokens = tokenizer.convert_ids_to_tokens(original_ids)
original_attention = original_encoding['attention_mask']
original_labels = dataset[1]['labels']

# Get model prediction on original
_, original_preds, _, _ = aux.inference(
    model,
    torch.tensor([original_ids]),
    torch.tensor([original_attention])
)
original_preds = original_preds[0]

# Highlight for original sentence
original_highlighted = []
for token, true_label, pred_label in zip(original_tokens, original_labels, original_preds):
    display_token = token
    if true_label != -100 and true_label != 18:
        display_token = f"**{display_token}**"
    if pred_label != -100 and pred_label != 18:
        display_token = f"^^{display_token}^^"
    original_highlighted.append(display_token)

original_sentence = tokenizer.convert_tokens_to_string(original_highlighted)

# --- ADVERSARIAL INPUT ---
adv_tokens = tokenizer.convert_ids_to_tokens(token_ids)
adv_pred_labels = predicted_labels  # already adv_preds[1]
adv_true_labels = true_labels       # dataset[1]['labels']

adv_highlighted = []
for token, pred_label, true_label in zip(adv_tokens, adv_pred_labels, adv_true_labels):
    display_token = token
    if pred_label != -100 and pred_label != 18:
        display_token = f"^^{display_token}^^"
    adv_highlighted.append(display_token)

adv_sentence = tokenizer.convert_tokens_to_string(adv_highlighted)

# --- OUTPUT ---
print("🔵 Original Sentence:")
print(original_sentence)

print("\n🔴 Adversarial Sentence:")
print(adv_sentence)


🔵 Original Sentence:
[CLS] - meeting at ^^**2**^^ : ^^33^^ ^^**pm**^^ - ^^**n**^^ ^^##23^^ - meeting at ^^**11**^^ : ^^29^^ ^^##pm^^ - ^^**wen**^^ ^^##n^^ ^^##mann^^ ^^##27^^ - meeting at ^^**4**^^ : ^^45^^ ^^**pm**^^ kindly make sure to adjust your schedules accordingly. our group messaging platform will be utilized for any updates, reminders, or changes to the agenda. this collaborative approach will ensure smooth communication and coordination throughout the admissions process. looking forward to our productive discussions and successful collaboration. warm regards, [ your name ] [SEP]

🔴 Adversarial Sentence:
[CLS] - ^^witches^^ at ^^2^^ : ^^33^^ ^^pm^^ - ^^n^^ ^^##23^^ ^^##san^^ meeting at ^^11^^ : ^^29^^ ^^##pm^^ - ^^wen^^ ^^##n^^ ^^##mann^^ ^^##27^^ ^^##unes^^ meeting at ^^4^^ : ^^45^^ ^^pm^^ ^^romani^^lean sure to adjust your schedules accordingly. our group messaging platform will be utilized for any born, reminders, or changes to the agenda. recruiting collaborative approachk

In [89]:
# ---- Ground Truth PII Tokens in Original ----
original_text = dataset[1]['source_text']
original_encoding = tokenizer(original_text, truncation=True, padding="max_length", max_length=len(adv_inputs[1]))
original_tokens = tokenizer.convert_ids_to_tokens(original_encoding['input_ids'])
original_labels = dataset[1]['labels']

ground_truth_tokens = [
    token for token, label in zip(original_tokens, original_labels)
    if label != -100 and label != 18
]

print("✅ Ground Truth PII Tokens (Original):")
print(ground_truth_tokens)


# ---- Predicted PII Tokens in Adversarial ----
adv_token_ids = adv_inputs[1]
adv_pred_labels = adv_preds[1]
adv_tokens = tokenizer.convert_ids_to_tokens(adv_token_ids)

predicted_pii_tokens = [
    token for token, pred_label in zip(adv_tokens, adv_pred_labels)
    if pred_label != -100 and pred_label != 18
]

print("\n❌ Predicted PII Tokens (Adversarial):")
print(predicted_pii_tokens)


✅ Ground Truth PII Tokens (Original):
['2', 'pm', 'n', '11', 'wen', '4', 'pm']

❌ Predicted PII Tokens (Adversarial):
['witches', '2', '33', 'pm', 'n', '##23', '##san', '11', '29', '##pm', 'wen', '##n', '##mann', '##27', '##unes', '4', '45', 'pm', 'romani']


In [96]:
print(adv_preds)

[[18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 1, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 10, 10, 10, 10, 10, 18, 8, 18, 18, 18, 5, 18, 5, 21, 18, 10, 10, 41, 41, 41, 18, 18, 18, 5, 18, 10, 10, 10, 41, 41, 41, 41, 18, 18, 18, 5, 21, 21, 18, 10, 10, 10, 10, 10, 41, 41, 41, 18, 41, 18, 18, 18, 5, 18, 5, 21, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18], [18, 18, 29, 18, 5, 18, 5, 21, 18, 10, 10, 41, 18, 18, 5, 18, 5, 21, 18, 10, 10, 10, 10, 41, 18, 18, 5, 18, 5, 21, 10, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18], [18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 

In [98]:
aux.compute_metrics(adv_preds, [original_labels])


{'precision': 0.0,
 'recall': 0.0,
 'f1': 0.0,
 'accuracy': 0.5974025974025974,
 'confusion_matrix': array([[ 0,  0,  0,  3,  0,  0],
        [ 0,  0,  0,  0,  0,  0],
        [ 0,  0,  0,  2,  0,  0],
        [ 4,  1, 10, 46,  2,  7],
        [ 0,  0,  0,  2,  0,  0],
        [ 0,  0,  0,  0,  0,  0]])}

In [117]:
for i,j in zip(adv_preds[1], dataset[1]['labels']):
    if j not in [18, -100] and i == 18:
        print(f"Original: {j}, Predicted: {i}")
        print("\n")

print(aux.compute_metrics([adv_preds[1]], [dataset[1]['labels']]))





{'precision': 0.42857142857142855, 'recall': 0.6, 'f1': 0.5, 'accuracy': 0.948051948051948, 'confusion_matrix': array([[ 3,  0,  0,  0,  0,  0],
       [ 0,  2,  0,  0,  0,  0],
       [ 0,  1, 66,  0,  1,  2],
       [ 0,  0,  0,  2,  0,  0],
       [ 0,  0,  0,  0,  0,  0],
       [ 0,  0,  0,  0,  0,  0]])}
