In [1]:
import torch
import json
import numpy as np
import pandas as pd
import nltk
import random
import itertools
import collections
import datasets
from tqdm.notebook import tqdm
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer, DataCollatorForTokenClassification

In [2]:
device = torch.device("mps")

In [2]:
model_name = "distilbert-base-multilingual-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.model_max_length = 512

In [3]:
train = datasets.load_dataset("Isotonic/pii-masking-200k_distilbert-base-multilingual-cased", token="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")
train

DatasetDict({
    train: Dataset({
        features: ['masked_text', 'unmasked_text', 'privacy_mask', 'span_labels', 'bio_labels', 'tokenised_text', 'language'],
        num_rows: 209261
    })
})

In [4]:
df = train['train'].to_pandas()

In [5]:
df

Unnamed: 0,masked_text,unmasked_text,privacy_mask,span_labels,bio_labels,tokenised_text,language
0,Our [ORDINALDIRECTION_1] campus is scheduled f...,Our Southwest campus is scheduled for a remode...,"{'[ORDINALDIRECTION_1]': 'Southwest', '[IBAN_1...","[[0, 4, 'O'], [4, 13, 'ORDINALDIRECTION_1'], [...","[O, B-ORDINALDIRECTION, O, O, O, O, O, O, O, O...","[Our, Southwest, campus, is, scheduled, for, a...",en
1,"Come parte dell'arbitrato, il tuo IP [IP_1] è ...","Come parte dell'arbitrato, il tuo IP 185.48.85...","{'[IP_1]': '185.48.85.30', '[MASKEDNUMBER_1]':...","[[0, 37, 'O'], [37, 49, 'IP_1'], [49, 103, 'O'...","[O, O, O, O, O, O, O, O, O, O, O, B-IP, I-IP, ...","[Come, parte, dell, ', ar, ##bit, ##rato, ,, i...",it
2,Utilizzando il [USERAGENT_1] del tuo dispositi...,Utilizzando il Mozilla/5.0 (Windows; U; Window...,{'[USERAGENT_1]': 'Mozilla/5.0 (Windows; U; Wi...,"[[0, 15, 'O'], [15, 128, 'USERAGENT_1'], [128,...","[O, O, O, O, O, B-USERAGENT, I-USERAGENT, I-US...","[Ut, ##ili, ##zza, ##ndo, il, Mozilla, /, 5, ....",it
3,[AGE_1] [GENDER_1] offering chemistry tutoring...,"67 Two-spirit offering chemistry tutoring, loc...","{'[AGE_1]': '67', '[GENDER_1]': 'Two-spirit', ...","[[0, 2, 'AGE_1'], [2, 3, 'O'], [3, 13, 'GENDER...","[B-AGE, B-GENDER, I-GENDER, I-GENDER, O, O, O,...","[67, Two, -, spirit, offering, chemistry, tuto...",en
4,Dans les propositions de politique pour l'ense...,Dans les propositions de politique pour l'ense...,"{'[SEX_1]': 'Female', '[EYECOLOR_1]': 'Blue', ...","[[0, 183, 'O'], [183, 189, 'SEX_1'], [189, 190...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[Dans, les, proposition, ##s, de, politique, p...",fr
...,...,...,...,...,...,...,...
209256,Sono uno/a studente/a [SEX_1] che sta cercando...,Sono uno/a studente/a Female che sta cercando ...,"{'[SEX_1]': 'Female', '[MASKEDNUMBER_1]': '657...","[[0, 22, 'O'], [22, 28, 'SEX_1'], [28, 137, 'O...","[O, O, O, O, O, O, O, B-SEX, O, O, O, O, O, O,...","[Sono, uno, /, a, studente, /, a, Female, che,...",it
209257,Vorrei cambiare l'orario della mia sessione in...,Vorrei cambiare l'orario della mia sessione in...,"{'[ORDINALDIRECTION_1]': 'Northeast', '[PIN_1]...","[[0, 133, 'O'], [133, 142, 'ORDINALDIRECTION_1...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[Vor, ##rei, cambiar, ##e, l, ', ora, ##rio, d...",it
209258,"Patient âgé de [AGE_1] ans, Sexe : [SEX_1], or...","Patient âgé de 77 years old ans, Sexe : Male, ...","{'[AGE_1]': '77 years old', '[SEX_1]': 'Male',...","[[0, 15, 'O'], [15, 27, 'AGE_1'], [27, 40, 'O'...","[O, O, O, O, B-AGE, I-AGE, I-AGE, O, O, O, O, ...","[Pat, ##ient, âgé, de, 77, years, old, ans, ,,...",fr
209259,"Bonjour, je suis [FIRSTNAME_1] du club environ...","Bonjour, je suis Lacy du club environnemental ...","{'[FIRSTNAME_1]': 'Lacy', '[CREDITCARDNUMBER_1...","[[0, 17, 'O'], [17, 21, 'FIRSTNAME_1'], [21, 1...","[O, O, O, O, O, O, B-FIRSTNAME, I-FIRSTNAME, O...","[Bon, ##jou, ##r, ,, je, suis, Lac, ##y, du, c...",fr


In [9]:
tt[0]

array(['Our', 'Southwest', 'campus', 'is', 'scheduled', 'for', 'a', 're',
       '##mo', '##dell', '##ing', 'session', '.', 'We', 'aim', 'to',
       'make', 'the', 'environment', 'more', 'con', '##duc', '##ive',
       'for', 'learning', '.', 'Dona', '##tions', 'can', 'be', 'made',
       'through', 'PL', '##0', '##66', '##00', '##900', '##97', '##0',
       '##90', '##100', '##200', '##9', '##35', '##57', '##7', '##2', '.'],
      dtype=object)

In [16]:
kp = len(tokenizer(list(tt[0]), is_split_into_words=True, truncation=True, max_length=512)

In [17]:
kp

92

In [18]:
# remove more than 512
tt = df.tokenised_text.tolist()
tt_lens = [len(tokenizer(list(t), is_split_into_words=True, truncation=True, max_length=512)['input_ids']) for t in tqdm(tt)]
df[['tt_lens']] = 0
df.tt_lens = tt_lens
df = df.loc[df.tt_lens <= 510].reset_index(drop=True)

  0%|          | 0/209261 [00:00<?, ?it/s]

In [19]:
nt = df.bio_labels.tolist()
nt = list(itertools.chain.from_iterable(nt)) # merge the list of lists into one list
nt = collections.Counter(nt) # Get count of each tag
all_labels = list(nt.keys()) # get all unique tags(labels)

In [20]:
source_texts = df.unmasked_text.tolist()
target_texts = df.masked_text.tolist()
tokenized_texts = df.tokenised_text.tolist()
ner_tags = df.bio_labels.tolist()

In [21]:
len(all_labels)

112

In [22]:
source_texts[0]

'Our Southwest campus is scheduled for a remodelling session. We aim to make the environment more conducive for learning. Donations can be made through PL06600900970901002009355772.'

In [23]:
target_texts[0]

'Our [ORDINALDIRECTION_1] campus is scheduled for a remodelling session. We aim to make the environment more conducive for learning. Donations can be made through [IBAN_1].'

In [24]:
## checking if the tokens align
i = random.randint(0, len(source_texts))
x0 = tokenizer.convert_ids_to_tokens(tokenizer(source_texts[i])['input_ids'])
x0.pop(0)  # CLS is not present in the dataset
for t in zip(x0, tokenized_texts[i]):
    print(t)

('Rap', 'Rap')
('##pel', '##pel')
('pour', 'pour')
('collecte', 'collecte')
('##r', '##r')
('le', 'le')
('pai', 'pai')
('##ement', '##ement')
('de', 'de')
('la', 'la')
('pro', 'pro')
('##cha', '##cha')
('##ine', '##ine')
('sé', 'sé')
('##ance', '##ance')
('de', 'de')
('Or', 'Or')
('##ie', '##ie')
('.', '.')
('Con', 'Con')
('##fir', '##fir')
('##mez', '##mez')
('son', 'son')
('adresse', 'adresse')
('de', 'de')
('fact', 'fact')
('##urat', '##urat')
('##ion', '##ion')
('A', 'A')
('##pt', '##pt')
('.', '.')
('998', '998')
(',', ',')
('code', 'code')
('postal', 'postal')
('1825', '1825')
('##6', '##6')
('.', '.')
('Elle', 'Elle')
('pai', 'pai')
('##e', '##e')
('en', 'en')
('Bit', 'Bit')
('##co', '##co')
('##in', '##in')
(',', ',')
('en', 'en')
('##voy', '##voy')
('##ez', '##ez')
('la', 'la')
('demande', 'demande')
('à', 'à')
('3', '3')
('##SM', '##SM')
('##a', '##a')
('##M', '##M')
('##1', '##1')
('##ca', '##ca')
('##4', '##4')
('##eo', '##eo')
('##3', '##3')
('##99', '##99')
('##f', '##f')

In [25]:
# Checking if the tokens align with tags
# i = random.randint(0, len(target_texts))
for t in zip(ner_tags[i], tokenized_texts[i]):
    print(t)

('O', 'Rap')
('O', '##pel')
('O', 'pour')
('O', 'collecte')
('O', '##r')
('O', 'le')
('O', 'pai')
('O', '##ement')
('O', 'de')
('O', 'la')
('O', 'pro')
('O', '##cha')
('O', '##ine')
('O', 'sé')
('O', '##ance')
('O', 'de')
('B-FIRSTNAME', 'Or')
('I-FIRSTNAME', '##ie')
('O', '.')
('O', 'Con')
('O', '##fir')
('O', '##mez')
('O', 'son')
('O', 'adresse')
('O', 'de')
('O', 'fact')
('O', '##urat')
('O', '##ion')
('B-SECONDARYADDRESS', 'A')
('I-SECONDARYADDRESS', '##pt')
('I-SECONDARYADDRESS', '.')
('I-SECONDARYADDRESS', '998')
('O', ',')
('O', 'code')
('O', 'postal')
('B-ZIPCODE', '1825')
('I-ZIPCODE', '##6')
('O', '.')
('O', 'Elle')
('O', 'pai')
('O', '##e')
('O', 'en')
('O', 'Bit')
('O', '##co')
('O', '##in')
('O', ',')
('O', 'en')
('O', '##voy')
('O', '##ez')
('O', 'la')
('O', 'demande')
('O', 'à')
('B-BITCOINADDRESS', '3')
('I-BITCOINADDRESS', '##SM')
('I-BITCOINADDRESS', '##a')
('I-BITCOINADDRESS', '##M')
('I-BITCOINADDRESS', '##1')
('I-BITCOINADDRESS', '##ca')
('I-BITCOINADDRESS', '##4'

In [26]:
# Create label dict
label2id = dict([(value,key) for key, value in enumerate(all_labels)])
id2label = dict(map(reversed, label2id.items()))

label2id, id2label

({'O': 0,
  'B-ORDINALDIRECTION': 1,
  'B-IBAN': 2,
  'I-IBAN': 3,
  'B-IP': 4,
  'I-IP': 5,
  'B-MASKEDNUMBER': 6,
  'I-MASKEDNUMBER': 7,
  'B-USERAGENT': 8,
  'I-USERAGENT': 9,
  'B-PIN': 10,
  'I-PIN': 11,
  'B-AGE': 12,
  'B-GENDER': 13,
  'I-GENDER': 14,
  'B-ZIPCODE': 15,
  'I-ZIPCODE': 16,
  'B-SEX': 17,
  'B-EYECOLOR': 18,
  'B-HEIGHT': 19,
  'I-HEIGHT': 20,
  'B-FIRSTNAME': 21,
  'I-FIRSTNAME': 22,
  'B-SSN': 23,
  'I-SSN': 24,
  'B-DOB': 25,
  'I-DOB': 26,
  'B-USERNAME': 27,
  'I-USERNAME': 28,
  'B-PASSWORD': 29,
  'I-PASSWORD': 30,
  'B-STREET': 31,
  'I-STREET': 32,
  'B-SECONDARYADDRESS': 33,
  'I-SECONDARYADDRESS': 34,
  'B-COUNTY': 35,
  'I-COUNTY': 36,
  'B-STATE': 37,
  'I-STATE': 38,
  'B-PREFIX': 39,
  'I-PREFIX': 40,
  'B-LASTNAME': 41,
  'I-LASTNAME': 42,
  'I-AGE': 43,
  'B-CITY': 44,
  'I-CITY': 45,
  'B-URL': 46,
  'I-URL': 47,
  'B-IPV4': 48,
  'I-IPV4': 49,
  'B-MIDDLENAME': 50,
  'B-NEARBYGPSCOORDINATE': 51,
  'I-NEARBYGPSCOORDINATE': 52,
  'B-CURRENCYSYMBO

In [27]:
for j in tqdm(range(len(ner_tags))):
    tags = ner_tags[j]
    for i in range(len(tags)):
        for k,v in label2id.items():
            if tags[i] == k:
                tags[i] = v
    ner_tags[j] = tags
df.ner_tags = ner_tags

  0%|          | 0/209229 [00:00<?, ?it/s]

  df.ner_tags = ner_tags


In [28]:
ner_tags = [list(ner) for ner in ner_tags]
ner_tags[0]

[0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 2,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 0]

In [29]:
df[['source_words']] = "source_words"
source_words = [text.split(" ") for text in source_texts]
df.source_words = source_words

In [30]:
# removing rows where the len(tokenized_texts[i]) does not match len(ner_tags[i])
idx = [i for i in range(len(ner_tags)) if len(tokenized_texts[i]) != len(ner_tags[i])]
df = df.drop(index=idx).reset_index(drop=True)

In [27]:
dataset = datasets.Dataset.from_pandas(df)
dataset

Dataset({
    features: ['masked_text', 'unmasked_text', 'privacy_mask', 'span_labels', 'bio_labels', 'tokenised_text', 'language', 'tt_lens', 'source_words'],
    num_rows: 209261
})

In [35]:
dataset[0]

{'masked_text': 'Our [ORDINALDIRECTION_1] campus is scheduled for a remodelling session. We aim to make the environment more conducive for learning. Donations can be made through [IBAN_1].',
 'unmasked_text': 'Our Southwest campus is scheduled for a remodelling session. We aim to make the environment more conducive for learning. Donations can be made through PL06600900970901002009355772.',
 'privacy_mask': "{'[ORDINALDIRECTION_1]': 'Southwest', '[IBAN_1]': 'PL06600900970901002009355772'}",
 'span_labels': "[[0, 4, 'O'], [4, 13, 'ORDINALDIRECTION_1'], [13, 151, 'O'], [151, 179, 'IBAN_1'], [179, 180, 'O']]",
 'bio_labels': [0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  2,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  0],
 'tokenised_text': ['Our',
  'Southwest',
  'campus',
  'is',
  'scheduled',
  'for',
  'a',
  're',
  '##mo',
  '##dell',
  '##ing

In [32]:
def align_labels(example):
    tokenized_input = tokenizer(example["tokenised_text"], is_split_into_words=True)
    tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])
    word_ids = tokenized_input.word_ids()
    aligned_labels = [-100 if i is None else example["bio_labels"][i] for i in word_ids]
    tokenized_input['labels'] = aligned_labels
    return tokenized_input

In [33]:
al = align_labels(dataset[0])
print(len(al['input_ids']), len(al['attention_mask']), len(al['labels']))

92 92 92


In [49]:
tokenizer("hello")

{'input_ids': [101, 61694, 10133, 102], 'attention_mask': [1, 1, 1, 1]}

In [50]:
label_all_tokens = True
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer.encode_plus(examples["tokenised_text"], is_split_into_words=True, truncation=True, max_length=512)

    labels = []
    for i, label in enumerate(examples["bio_labels"]):
        word_ids = tokenized_inputs.word_ids(i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            # Special tokens have a word id that is None. We set the label to -100 so they are automatically
            # ignored in the loss function.
            if word_idx is None:
                label_ids.append(-100)
            # We set the label for the first token of each word.
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            # For the other tokens in a word, we set the label to either the current label or -100, depending on
            # the label_all_tokens flag.
            else:
                label_ids.append(label[word_idx] if label_all_tokens else -100)
            previous_word_idx = word_idx

        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [51]:
x = dataset.map(align_labels, num_proc=8, remove_columns=dataset.column_names)

Map (num_proc=8):   0%|          | 0/209261 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (1415 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (885 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (527 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (607 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1167 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for t

TimeoutError: 

In [40]:
tokenized_dataset = x.train_test_split(test_size=0.2)

In [41]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 167408
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 41853
    })
})

In [42]:
data_collator = DataCollatorForTokenClassification(tokenizer)

In [43]:
metric = datasets.load_metric("seqeval")

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (special tokens)
    true_predictions = [
        [all_labels[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [all_labels[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = metric.compute(predictions=true_predictions, references=true_labels)
    flattened_results = {
        "overall_precision": results["overall_precision"],
        "overall_recall": results["overall_recall"],
        "overall_f1": results["overall_f1"],
        "overall_accuracy": results["overall_accuracy"],
    }

    for k in results.keys():
        if (k not in flattened_results.keys()):
            flattened_results[f"{k}_f1"] = results[k]["f1"]

    return flattened_results

  metric = datasets.load_metric("seqeval")


In [44]:
model = AutoModelForTokenClassification.from_pretrained(model_name, num_labels=len(all_labels), label2id=label2id, id2label=id2label)
print(model.config)

Downloading model.safetensors:   0%|          | 0.00/542M [00:00<?, ?B/s]

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DistilBertConfig {
  "_name_or_path": "distilbert-base-multilingual-cased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "O",
    "1": "B-ORDINALDIRECTION",
    "2": "B-IBAN",
    "3": "I-IBAN",
    "4": "B-IP",
    "5": "I-IP",
    "6": "B-MASKEDNUMBER",
    "7": "I-MASKEDNUMBER",
    "8": "B-USERAGENT",
    "9": "I-USERAGENT",
    "10": "B-PIN",
    "11": "I-PIN",
    "12": "B-AGE",
    "13": "B-GENDER",
    "14": "I-GENDER",
    "15": "B-ZIPCODE",
    "16": "I-ZIPCODE",
    "17": "B-SEX",
    "18": "B-EYECOLOR",
    "19": "B-HEIGHT",
    "20": "I-HEIGHT",
    "21": "B-FIRSTNAME",
    "22": "I-FIRSTNAME",
    "23": "B-SSN",
    "24": "I-SSN",
    "25": "B-DOB",
    "26": "I-DOB",
    "27": "B-USERNAME",
    "28": "I-USERNAME",
    "29": "B-PASSWORD",
    "30": "I-PASSWORD",
    "31": "B-STREET",
    "32": "I-STREET",
    "33": "B-SECONDARYADDRESS

In [45]:
args = TrainingArguments(
    output_dir=f"{model_name}_finetuned_ai4privacy",
    num_train_epochs=1,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    overwrite_output_dir=True,
    warmup_ratio=0.2,
    weight_decay=0.01,
    save_strategy='epoch',
    evaluation_strategy="epoch",
    load_best_model_at_end=True,
    save_total_limit=1,
    lr_scheduler_type='cosine_with_restarts',
    report_to='wandb',
    push_to_hub=False,
)

trainer = Trainer(
    model,
    args,
    train_dataset=tokenized_dataset["test"],
    eval_dataset=tokenized_dataset["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [30]:
train_result = trainer.train()
test_result = trainer.evaluate(tokenized_dataset['test'])

train_metrics = train_result.metrics
test_metrics = test_result.metrics

max_train_samples = len(tokenized_dataset['train'])
max_eval_samples = len(tokenized_dataset['test'])

train_metrics["train_samples"] = min(max_train_samples, len(tokenized_dataset['train']))
trainer.log_metrics("train", train_metrics)

test_metrics["eval_samples"] = min(max_eval_samples, len(tokenized_dataset['test']))
trainer.log_metrics("eval", test_metrics)

trainer.save_metrics("train", train_metrics)
trainer.save_metrics("eval", test_metrics)

trainer.save_state()
trainer.save_model(args.output_dir)

[34m[1mwandb[0m: Currently logged in as: [33msripaadsrinivasan[0m. Use [1m`wandb login --relogin`[0m to force relogin


You're using a ElectraTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss


RuntimeError: MPS backend out of memory (MPS allocated: 5.52 GB, other allocations: 3.45 GB, max allowed: 9.07 GB). Tried to allocate 119.23 MB on private pool. Use PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0 to disable upper limit for memory allocations (may cause system failure).