In [1]:
import torch
import json
import numpy as np
import pandas as pd
import nltk
import random
import itertools
import collections
import datasets
from tqdm.notebook import tqdm
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer, DataCollatorForTokenClassification

In [None]:
device = torch.device("mps")

In [2]:
tokenizer = AutoTokenizer.from_pretrained("google/electra-large-generator")

In [3]:
# df1 = pd.read_json("data/english_balanced_10k.jsonl", lines=True)
# df2 = pd.read_json("data/PII43k_original.jsonl", lines=True)

In [4]:
# df = df1.append([df2]).reset_index(drop=True)
# df = df.rename(columns={
#     "Masked text" : "target_text",
#     "Unmasked text": "source_text",
#     "Tokenised Masked text": "tokenized_text",
#     "Tokensised Unmasked text": "ner_tags", # need it to find all present labels
# }) # why add spaces in column names?
# # df = df.drop_duplicates()
# df = df.dropna().reset_index(drop=True)

In [5]:
df = pd.read_json("../data/pii200k_english.jsonl", lines=True)
df = df.rename(columns={
    "masked_text" : "target_text",
    "unmasked_text": "source_text",
    "tokenised_unmasked_text": "tokenized_text",
    "token_entity_labels": "ner_tags", # need it to find all present labels
})
df = df.dropna().reset_index(drop=True)
df

Unnamed: 0,target_text,source_text,ner_tags,tokenized_text
0,"[PREFIX_1] [FIRSTNAME_1] [LASTNAME_1], please ...","Ms. Savion Von, please do not forget your appo...","[B-PREFIX, I-PREFIX, B-FIRSTNAME, I-FIRSTNAME,...","[ms, ., sa, ##vio, ##n, von, ,, please, do, no..."
1,Support group meeting for [GENDER_1] affected ...,Support group meeting for Transexual person af...,"[O, O, O, O, B-GENDER, I-GENDER, I-GENDER, I-G...","[support, group, meeting, for, trans, ##ex, ##..."
2,"[FIRSTNAME_1], this is a reminder about your p...","Trycia, this is a reminder about your psycho-o...","[B-FIRSTNAME, I-FIRSTNAME, O, O, O, O, O, O, O...","[try, ##cia, ,, this, is, a, reminder, about, ..."
3,"[USERNAME_1], please visit our page [URL_1] to...","Juliet.Murazik85, please visit our page https:...","[B-USERNAME, I-USERNAME, I-USERNAME, I-USERNAM...","[juliet, ., mu, ##raz, ##ik, ##85, ,, please, ..."
4,"Psycho-oncology research forum, [DATE_1] at [T...","Psycho-oncology research forum, 2/11 at 9 AM, ...","[O, O, O, O, O, O, O, B-DATE, I-DATE, I-DATE, ...","[psycho, -, on, ##cology, research, forum, ,, ..."
...,...,...,...,...
59387,Can you please provide a breakdown of the comp...,Can you please provide a breakdown of the comp...,"[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[can, you, please, provide, a, breakdown, of, ..."
59388,A transaction for user [USERNAME_1] on [DATE_1...,A transaction for user Patsy_Volkman on 23/10/...,"[O, O, O, O, B-USERNAME, I-USERNAME, I-USERNAM...","[a, transaction, for, user, patsy, _, vol, ##k..."
59389,We are curious about the current investments i...,We are curious about the current investments i...,"[O, O, O, O, O, O, O, O, B-STATE, O, O, O, O, ...","[we, are, curious, about, the, current, invest..."
59390,Can you create an update presentation about th...,Can you create an update presentation about th...,"[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[can, you, create, an, update, presentation, a..."


In [6]:
# remove more than 512
tt = df.tokenized_text.tolist()
tt_lens = [len(tokenizer(t, is_split_into_words=True, truncation=True, max_length=512)) for t in tqdm(tt)]
df[['tt_lens']] = 0
df.tt_lens = tt_lens
df = df.loc[df.tt_lens <= 510].reset_index(drop=True)

  0%|          | 0/59392 [00:00<?, ?it/s]

In [7]:
nt = df.ner_tags.tolist()
nt = list(itertools.chain.from_iterable(nt)) # merge the list of lists into one list
nt = collections.Counter(nt) # Get count of each tag
all_labels = list(nt.keys()) # get all unique tags(labels)

In [8]:
source_texts = df.source_text.tolist()
target_texts = df.target_text.tolist()
tokenized_texts = df.tokenized_text.tolist()
ner_tags = df.ner_tags.tolist()

In [9]:
len(all_labels)

122

In [10]:
source_texts[0]

'Ms. Savion Von, please do not forget your appointment on 28th November at 09. Keep our 69-616376-555417-0 and Gilbert_Dooley in mind.'

In [11]:
target_texts[0]

'[PREFIX_1] [FIRSTNAME_1] [LASTNAME_1], please do not forget your appointment on [DATE_1] at [TIME_1]. Keep our [PHONEIMEI_1] and [USERNAME_1] in mind.'

In [12]:
## checking if the tokens align
i = random.randint(0, len(source_texts))
x0 = tokenizer.convert_ids_to_tokens(tokenizer(source_texts[i])['input_ids'])
x0.pop(0)  # CLS is not present in the dataset
for t in zip(x0, tokenized_texts[i]):
    print(t)

('enroll', 'enroll')
('patient', 'patient')
('blair', 'blair')
('_', '_')
('den', 'den')
('##es', '##es')
('##ik', '##ik')
('-', '-')
('lu', 'lu')
('##bow', '##bow')
('##itz', '##itz')
('##44', '##44')
('in', 'in')
('trial', 'trial')
('id', 'id')
('580', '580')
('##14', '##14')
('##41', '##41')
('##44', '##44')
('##36', '##36')
('##75', '##75')
('##7', '##7')
('##70', '##70')
('.', '.')
('sex', 'sex')
(':', ':')
('female', 'female')
(',', ',')
('do', 'do')
('##b', '##b')
(':', ':')
('october', 'october')
('6', '6')
(',', ',')
('1952', '1952')
('.', '.')


In [13]:
# Checking if the tokens align with tags
# i = random.randint(0, len(target_texts))
for t in zip(ner_tags[i], tokenized_texts[i]):
    print(t)

('O', 'enroll')
('O', 'patient')
('B-USERNAME', 'blair')
('I-USERNAME', '_')
('I-USERNAME', 'den')
('I-USERNAME', '##es')
('I-USERNAME', '##ik')
('I-USERNAME', '-')
('I-USERNAME', 'lu')
('I-USERNAME', '##bow')
('I-USERNAME', '##itz')
('I-USERNAME', '##44')
('O', 'in')
('O', 'trial')
('O', 'id')
('B-MASKEDNUMBER', '580')
('I-MASKEDNUMBER', '##14')
('I-MASKEDNUMBER', '##41')
('I-MASKEDNUMBER', '##44')
('I-MASKEDNUMBER', '##36')
('I-MASKEDNUMBER', '##75')
('I-MASKEDNUMBER', '##7')
('I-MASKEDNUMBER', '##70')
('O', '.')
('O', 'sex')
('O', ':')
('B-SEX', 'female')
('O', ',')
('O', 'do')
('O', '##b')
('O', ':')
('B-DOB', 'october')
('I-DOB', '6')
('I-DOB', ',')
('I-DOB', '1952')
('O', '.')


In [14]:
# Create label dict
label2id = dict([(value,key) for key, value in enumerate(all_labels)])
id2label = dict(map(reversed, label2id.items()))

label2id, id2label

({'B-PREFIX': 0,
  'I-PREFIX': 1,
  'B-FIRSTNAME': 2,
  'I-FIRSTNAME': 3,
  'B-LASTNAME': 4,
  'O': 5,
  'B-DATE': 6,
  'I-DATE': 7,
  'B-TIME': 8,
  'B-PHONEIMEI': 9,
  'I-PHONEIMEI': 10,
  'B-USERNAME': 11,
  'I-USERNAME': 12,
  'B-GENDER': 13,
  'I-GENDER': 14,
  'B-CITY': 15,
  'I-CITY': 16,
  'B-STATE': 17,
  'B-URL': 18,
  'I-URL': 19,
  'B-JOBAREA': 20,
  'I-TIME': 21,
  'B-EMAIL': 22,
  'I-EMAIL': 23,
  'B-JOBTYPE': 24,
  'I-LASTNAME': 25,
  'B-COMPANYNAME': 26,
  'I-COMPANYNAME': 27,
  'B-JOBTITLE': 28,
  'I-JOBTITLE': 29,
  'B-STREET': 30,
  'I-STREET': 31,
  'B-SECONDARYADDRESS': 32,
  'I-SECONDARYADDRESS': 33,
  'B-COUNTY': 34,
  'I-COUNTY': 35,
  'B-AGE': 36,
  'I-AGE': 37,
  'B-USERAGENT': 38,
  'I-USERAGENT': 39,
  'B-ACCOUNTNAME': 40,
  'I-ACCOUNTNAME': 41,
  'B-ACCOUNTNUMBER': 42,
  'I-ACCOUNTNUMBER': 43,
  'B-CURRENCYSYMBOL': 44,
  'I-CURRENCYSYMBOL': 45,
  'B-AMOUNT': 46,
  'I-AMOUNT': 47,
  'B-CREDITCARDISSUER': 48,
  'B-CREDITCARDNUMBER': 49,
  'I-CREDITCARDNUMBER'

In [15]:
for j in tqdm(range(len(ner_tags))):
    tags = ner_tags[j]
    for i in range(len(tags)):
        for k,v in label2id.items():
            if tags[i] == k:
                tags[i] = v
    ner_tags[j] = tags
df.ner_tags = ner_tags

  0%|          | 0/59392 [00:00<?, ?it/s]

In [16]:
ner_tags[0]

[0,
 1,
 2,
 3,
 3,
 4,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 6,
 7,
 5,
 8,
 5,
 5,
 5,
 9,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 5,
 11,
 12,
 12,
 12,
 5,
 5,
 5]

In [17]:
df[['source_words']] = "source_words"
source_words = [text.split(" ") for text in source_texts]
df.source_words = source_words

In [18]:
# removing rows where the len(tokenized_texts[i]) does not match len(ner_tags[i])
idx = [i for i in range(len(ner_tags)) if len(tokenized_texts[i]) != len(ner_tags[i])]
df = df.drop(index=idx).reset_index(drop=True)

In [19]:
dataset = datasets.Dataset.from_pandas(df)
dataset

Dataset({
    features: ['target_text', 'source_text', 'ner_tags', 'tokenized_text', 'tt_lens', 'source_words'],
    num_rows: 58113
})

In [20]:
def align_labels(example):
    tokenized_input = tokenizer(example["tokenized_text"], is_split_into_words=True)
    tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])
    word_ids = tokenized_input.word_ids()
    aligned_labels = [-100 if i is None else example["ner_tags"][i] for i in word_ids]
    tokenized_input['labels'] = aligned_labels
    return tokenized_input

In [21]:
al = align_labels(dataset[0])
print(len(al['input_ids']), len(al['attention_mask']), len(al['labels']))

60 60 60


In [22]:
label_all_tokens = True
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokenized_text"], is_split_into_words=True)

    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            # Special tokens have a word id that is None. We set the label to -100 so they are automatically
            # ignored in the loss function.
            if word_idx is None:
                label_ids.append(-100)
            # We set the label for the first token of each word.
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            # For the other tokens in a word, we set the label to either the current label or -100, depending on
            # the label_all_tokens flag.
            else:
                label_ids.append(label[word_idx] if label_all_tokens else -100)
            previous_word_idx = word_idx

        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [23]:
x = dataset.map(align_labels, num_proc=8, remove_columns=dataset.column_names)

Map (num_proc=8):   0%|          | 0/58113 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (745 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (997 > 512). Running this sequence through the model will result in indexing errors


In [24]:
tokenized_dataset = x.train_test_split(test_size=0.2)

In [25]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 46490
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 11623
    })
})

In [26]:
data_collator = DataCollatorForTokenClassification(tokenizer)

In [27]:
metric = datasets.load_metric("seqeval")

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (special tokens)
    true_predictions = [
        [all_labels[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [all_labels[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = metric.compute(predictions=true_predictions, references=true_labels)
    flattened_results = {
        "overall_precision": results["overall_precision"],
        "overall_recall": results["overall_recall"],
        "overall_f1": results["overall_f1"],
        "overall_accuracy": results["overall_accuracy"],
    }

    for k in results.keys():
        if (k not in flattened_results.keys()):
            flattened_results[f"{k}_f1"] = results[k]["f1"]

    return flattened_results

  metric = datasets.load_metric("seqeval")


In [28]:
model = AutoModelForTokenClassification.from_pretrained("google/electra-large-generator", num_labels=len(all_labels), label2id=label2id, id2label=id2label)
print(model.config)

Some weights of ElectraForTokenClassification were not initialized from the model checkpoint at google/electra-large-generator and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


ElectraConfig {
  "_name_or_path": "google/electra-large-generator",
  "architectures": [
    "ElectraForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "embedding_size": 1024,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 256,
  "id2label": {
    "0": "B-PREFIX",
    "1": "I-PREFIX",
    "2": "B-FIRSTNAME",
    "3": "I-FIRSTNAME",
    "4": "B-LASTNAME",
    "5": "O",
    "6": "B-DATE",
    "7": "I-DATE",
    "8": "B-TIME",
    "9": "B-PHONEIMEI",
    "10": "I-PHONEIMEI",
    "11": "B-USERNAME",
    "12": "I-USERNAME",
    "13": "B-GENDER",
    "14": "I-GENDER",
    "15": "B-CITY",
    "16": "I-CITY",
    "17": "B-STATE",
    "18": "B-URL",
    "19": "I-URL",
    "20": "B-JOBAREA",
    "21": "I-TIME",
    "22": "B-EMAIL",
    "23": "I-EMAIL",
    "24": "B-JOBTYPE",
    "25": "I-LASTNAME",
    "26": "B-COMPANYNAME",
    "27": "I-COMPANYNAME",
    "28": "B-JOBTITLE",
    "29": "I-JOBTITLE",
    "30": "B-STREET",
    "31": "I-

In [29]:
args = TrainingArguments(
    output_dir="electra-large-generator_finetuned_ai4privacy",
    num_train_epochs=7,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    overwrite_output_dir=True,
    warmup_ratio=0.2,
    weight_decay=0.01,
    save_strategy='epoch',
    evaluation_strategy="epoch",
    load_best_model_at_end=True,
    save_total_limit=1,
    lr_scheduler_type='cosine_with_restarts',
    report_to='wandb',
    push_to_hub=False,
)

trainer = Trainer(
    model,
    args,
    train_dataset=tokenized_dataset["test"],
    eval_dataset=tokenized_dataset["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [30]:
train_result = trainer.train()
test_result = trainer.evaluate(tokenized_dataset['test'])

train_metrics = train_result.metrics
test_metrics = test_result.metrics

max_train_samples = len(tokenized_dataset['train'])
max_eval_samples = len(tokenized_dataset['test'])

train_metrics["train_samples"] = min(max_train_samples, len(tokenized_dataset['train']))
trainer.log_metrics("train", train_metrics)

test_metrics["eval_samples"] = min(max_eval_samples, len(tokenized_dataset['test']))
trainer.log_metrics("eval", test_metrics)

trainer.save_metrics("train", train_metrics)
trainer.save_metrics("eval", test_metrics)

trainer.save_state()
trainer.save_model(args.output_dir)

[34m[1mwandb[0m: Currently logged in as: [33msripaadsrinivasan[0m. Use [1m`wandb login --relogin`[0m to force relogin


You're using a ElectraTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss


RuntimeError: MPS backend out of memory (MPS allocated: 5.52 GB, other allocations: 3.45 GB, max allowed: 9.07 GB). Tried to allocate 119.23 MB on private pool. Use PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0 to disable upper limit for memory allocations (may cause system failure).