In [1]:
class cfg:
    """Hyper-parameters and paths shared by the cells of this notebook."""

    # --- model / data ---
    TRAINING_MODEL_PATH = "microsoft/deberta-v3-base"  # checkpoint given to from_pretrained
    TRAINING_MAX_LENGTH = 1024  # truncation length passed to the tokenizer
    OUTPUT_DIR = "output"  # Trainer output directory

    # --- optimisation ---
    lr = 2e-5
    weight_decay = 0.01
    epochs = 8
    scheduler = "cosine"
    warmup_ratio = 0.1
    use_fp16 = True

    # --- batching ---
    train_batch_size = 2
    eval_batch_size = 2
    accumulation_step = 2

    # --- misc ---
    seed = 42
    metric = "fbeta"

In [2]:
!pip install seqeval evaluate -q

In [3]:
import json
import argparse
from itertools import chain
from functools import partial

import os
import gc
import torch
import torch.nn as nn 
import torch.nn.functional as F
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, Trainer, TrainingArguments
from transformers import AutoModelForTokenClassification, DataCollatorForTokenClassification
import evaluate
from datasets import Dataset, features
import numpy as np
import pandas as pd



In [4]:
def _read_json(path):
    """Parse one JSON file and close its handle as soon as parsing is done."""
    # JSON is UTF-8 by specification; do not depend on the locale default.
    with open(path, encoding="utf-8") as handle:
        return json.load(handle)


# Competition training documents.
data = _read_json("/kaggle/input/pii-detection-removal-from-educational-data/train.json")

# Additional datasets that are concatenated with the competition data below.
external = _read_json("/kaggle/input/fix-punctuation-tokenization-external-dataset/pii_dataset_fixed.json")
print("external datapoints: ", len(external))

moredata = _read_json("/kaggle/input/fix-punctuation-tokenization-external-dataset/moredata_dataset_fixed.json")
print("moredata datapoints: ", len(moredata))

more_more_data = _read_json("/kaggle/input/pii-mixtral8x7b-generated-essays/mpware_mixtral8x7b_v1.1-no-i-username.json")
print("more_more_data datapoints: ", len(more_more_data))

external datapoints:  4434
moredata datapoints:  2000
more_more_data datapoints:  2692


In [5]:
# Split the competition documents by whether they contain any entity at all,
# so that the negative documents can be downsampled when the datasets are
# combined.
p = []  # positive documents: at least one token has a label other than "O"
n = []  # negative documents: every token is labelled "O"
for d in data:
    # A plain generator avoids building a NumPy array per document and is
    # well defined for an empty label list (such a document counts as negative).
    has_entity = any(label != "O" for label in d["labels"])
    (p if has_entity else n).append(d)
print("original datapoints: ", len(data))

original datapoints:  6807


In [6]:
# Final training pool: the three extra datasets, every positive competition
# document, and the first third of the negative competition documents.
data = [
    *moredata,
    *external,
    *more_more_data,
    *p,
    *n[: len(n) // 3],
]
print("combined: ", len(data))

combined:  12025


In [7]:
# Constant consumed by MyCustomTrainer.compute_loss, where it sets the value
# 5**2 * P / (1 - P) that is added to the predicted probability of the "O" class.
# NOTE(review): presumably the fraction of non-"O" tokens in the training
# data; the cell that computed it is not part of this notebook -- confirm.
P = 0.023797656310188375

In [8]:
# Alphabetically sorted list of every label that occurs in the training pool;
# a label's position in this list is its class id.
all_labels = sorted(set(chain.from_iterable(doc["labels"] for doc in data)))
label2id = dict(zip(all_labels, range(len(all_labels))))
id2label = dict(enumerate(all_labels))

# The entity labels, i.e. everything except "O".
target = [
    'B-EMAIL', 'B-ID_NUM', 'B-NAME_STUDENT', 'B-PHONE_NUM',
    'B-STREET_ADDRESS', 'B-URL_PERSONAL', 'B-USERNAME', 'I-ID_NUM',
    'I-NAME_STUDENT', 'I-PHONE_NUM', 'I-STREET_ADDRESS', 'I-URL_PERSONAL'
]

print(id2label)

{0: 'B-EMAIL', 1: 'B-ID_NUM', 2: 'B-NAME_STUDENT', 3: 'B-PHONE_NUM', 4: 'B-STREET_ADDRESS', 5: 'B-URL_PERSONAL', 6: 'B-USERNAME', 7: 'I-ID_NUM', 8: 'I-NAME_STUDENT', 9: 'I-PHONE_NUM', 10: 'I-STREET_ADDRESS', 11: 'I-URL_PERSONAL', 12: 'O'}


In [9]:
def tokenize(example, tokenizer, label2id, max_length):
    """Tokenize one document and project its word-level labels onto the tokens.

    The text is rebuilt from ``example["tokens"]`` and
    ``example["trailing_whitespace"]`` while a parallel list holding one label
    per character is built from ``example["provided_labels"]`` (the inserted
    spaces are labelled ``"O"``). Every token returned by ``tokenizer`` then
    receives the label of the character at its start offset, after skipping
    one leading whitespace character if there is one.

    Args:
        example: mapping with the keys ``tokens``, ``provided_labels`` and
            ``trailing_whitespace`` (three parallel sequences).
        tokenizer: callable invoked as ``tokenizer(text,
            return_offsets_mapping=True, max_length=..., truncation=True)``;
            its result must be a mapping that also exposes ``offset_mapping``
            and ``input_ids`` as attributes.
        label2id: mapping from label string to integer id; must contain ``"O"``.
        max_length: truncation length forwarded to ``tokenizer``.

    Returns:
        dict with every field of the tokenizer output plus ``labels`` (one
        label id per token) and ``length`` (number of input ids).
    """
    # Rebuild the text and keep exactly one label per character.
    pieces = []
    char_labels = []
    for word, word_label, has_space in zip(
        example["tokens"], example["provided_labels"], example["trailing_whitespace"]
    ):
        pieces.append(word)
        char_labels.extend([word_label] * len(word))

        if has_space:
            pieces.append(" ")
            char_labels.append("O")

    text = "".join(pieces)

    # Actual tokenization.
    tokenized = tokenizer(text, return_offsets_mapping=True, max_length=max_length, truncation=True)

    token_labels = []
    for start_idx, end_idx in tokenized.offset_mapping:
        # Tokens with a (0, 0) offset (e.g. CLS) cover no text: label them "O".
        if start_idx == 0 and end_idx == 0:
            token_labels.append(label2id["O"])
            continue

        # A token may start with whitespace; use the label of the character
        # after it. The bounds check keeps a token that covers only the final
        # whitespace character from indexing past the end of `char_labels`.
        if text[start_idx].isspace() and start_idx + 1 < len(text):
            start_idx += 1

        token_labels.append(label2id[char_labels[start_idx]])

    return {**tokenized, "labels": token_labels, "length": len(tokenized.input_ids)}

In [10]:
tokenizer = AutoTokenizer.from_pretrained(cfg.TRAINING_MODEL_PATH)

# Column-oriented view of the documents. The word-level labels are stored as
# "provided_labels" because `tokenize` adds its own token-level "labels" column.
columns = {
    "full_text": [doc["full_text"] for doc in data],
    "document": [str(doc["document"]) for doc in data],
    "tokens": [doc["tokens"] for doc in data],
    "trailing_whitespace": [doc["trailing_whitespace"] for doc in data],
    "provided_labels": [doc["labels"] for doc in data],
}
tokenize_kwargs = {
    "tokenizer": tokenizer,
    "label2id": label2id,
    "max_length": cfg.TRAINING_MAX_LENGTH,
}
ds = Dataset.from_dict(columns).map(tokenize, fn_kwargs=tokenize_kwargs, num_proc=3)

tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]



    

#0:   0%|          | 0/4009 [00:00<?, ?ex/s]

 

#1:   0%|          | 0/4008 [00:00<?, ?ex/s]

 

#2:   0%|          | 0/4008 [00:00<?, ?ex/s]

In [11]:
# Sanity check on the first document: the entity labels at word level (above
# the separator) should line up with those at token level (below it).
x = ds[0]

for word_and_label in zip(x["tokens"], x["provided_labels"]):
    if word_and_label[1] != "O":
        print(word_and_label)

print("*"*100)

token_strings = tokenizer.convert_ids_to_tokens(x["input_ids"])
for token_string, label_id in zip(token_strings, x["labels"]):
    label_name = id2label[label_id]
    if label_name != "O":
        print((token_string, label_name))

('Richard', 'B-NAME_STUDENT')
('Chang', 'I-NAME_STUDENT')
('Richard', 'B-NAME_STUDENT')
('gwilliams@yahoo.com', 'B-EMAIL')
('brandy38', 'B-USERNAME')
('Richard', 'B-NAME_STUDENT')
('GB41EJEY19489241157815', 'B-ID_NUM')
('Richard', 'B-NAME_STUDENT')
('Richard', 'B-NAME_STUDENT')
('(', 'B-PHONE_NUM')
('259)938', 'I-PHONE_NUM')
('-', 'I-PHONE_NUM')
('7784x08016', 'I-PHONE_NUM')
('Richard', 'B-NAME_STUDENT')
('Richard', 'B-NAME_STUDENT')
('https://twitter.com/john51', 'B-URL_PERSONAL')
('Richard', 'B-NAME_STUDENT')
('Richard', 'B-NAME_STUDENT')
('Richard', 'B-NAME_STUDENT')
('https://youtube.com/c/sallywalker', 'B-URL_PERSONAL')
('Richard', 'B-NAME_STUDENT')
('Richard', 'B-NAME_STUDENT')
('711', 'B-STREET_ADDRESS')
('Golden', 'I-STREET_ADDRESS')
('Overpass', 'I-STREET_ADDRESS')
(',', 'I-STREET_ADDRESS')
('West', 'I-STREET_ADDRESS')
('Andreaville', 'I-STREET_ADDRESS')
(',', 'I-STREET_ADDRESS')
('OH', 'I-STREET_ADDRESS')
('44115', 'I-STREET_ADDRESS')
('Richard', 'B-NAME_STUDENT')
('Richard',

In [12]:
from seqeval.metrics import recall_score, precision_score
from seqeval.metrics import classification_report
from seqeval.metrics import f1_score

def compute_metrics(p, all_labels):
    """Compute seqeval recall, precision and F-beta (beta = 5) for one evaluation.

    Args:
        p: pair ``(predictions, labels)``; ``predictions`` holds per-class
            scores with the class axis last (arg-maxed over axis 2) and
            ``labels`` holds the reference label ids.
        all_labels: sequence mapping a label id to its label string.

    Returns:
        dict with the keys ``recall``, ``precision`` and ``fbeta``. ``fbeta``
        is 0.0 when precision and recall are both zero.
    """
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Drop the positions whose reference label is -100 (ignored index) and
    # translate the remaining ids to label strings for seqeval.
    true_predictions = [
        [all_labels[pred_id] for (pred_id, label_id) in zip(prediction, label) if label_id != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [all_labels[label_id] for label_id in label if label_id != -100]
        for label in labels
    ]

    recall = recall_score(true_labels, true_predictions)
    precision = precision_score(true_labels, true_predictions)

    # F-beta with beta = 5. Guard the denominator: when nothing is predicted
    # correctly both scores are 0 and the plain formula would divide by zero.
    beta_sq = 5 * 5
    denominator = beta_sq * precision + recall
    if denominator > 0:
        fbeta_score = (1 + beta_sq) * recall * precision / denominator
    else:
        fbeta_score = 0.0

    results = {
        'recall': recall,
        'precision': precision,
        'fbeta': fbeta_score
    }
    return results

In [13]:
# Pretrained backbone with a token-classification head sized to the label set
# and carrying the label <-> id maps in its config.
model = AutoModelForTokenClassification.from_pretrained(
    cfg.TRAINING_MODEL_PATH,
    label2id=label2id,
    id2label=id2label,
    num_labels=len(all_labels),
    ignore_mismatched_sizes=True,
)

# Batch collator; sequence lengths are padded up to a multiple of 16.
collator = DataCollatorForTokenClassification(tokenizer, pad_to_multiple_of=16)

pytorch_model.bin:   0%|          | 0.00/371M [00:00<?, ?B/s]

Some weights of DebertaV2ForTokenClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
# No validation set is used here: this configuration trains the model that is
# used for submission, so evaluation is switched off.
args = TrainingArguments(
    output_dir=cfg.OUTPUT_DIR,
    logging_dir='./logs',
    report_to="none",
    logging_steps=500,
    save_total_limit=1,
    # Reproducibility: pass the configured seed instead of relying on the
    # library default.
    seed=cfg.seed,
    fp16=cfg.use_fp16,
    # Optimisation schedule.
    learning_rate=cfg.lr,
    lr_scheduler_type=cfg.scheduler,
    warmup_ratio=cfg.warmup_ratio,
    weight_decay=cfg.weight_decay,
    num_train_epochs=cfg.epochs,
    # Batching.
    per_device_train_batch_size=cfg.train_batch_size,
    per_device_eval_batch_size=cfg.eval_batch_size,
    gradient_accumulation_steps=cfg.accumulation_step,
    # Evaluation is disabled. NOTE(review): the two metric settings below are
    # presumably inert until an evaluation strategy is enabled -- confirm.
    evaluation_strategy="no",
    do_eval=False,
    metric_for_best_model=cfg.metric,
    greater_is_better=True,
)


In [15]:
class MyCustomTrainer(Trainer):
    """Trainer whose loss adds a constant to the predicted probability of "O".

    For a token with reference class ``y`` the loss is ``-log(p_y + b_y)``,
    where ``p`` is the softmax of the logits, ``b_y = 5**2 * P / (1 - P)`` if
    ``y`` is the "O" class and ``b_y = 0`` otherwise. Because ``p_O + b_O`` can
    exceed 1, the loss (and the logged training loss) can be negative.
    """

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Return the biased negative log-likelihood for one batch.

        Args:
            model: model called as ``model(**inputs)``; its output must expose
                ``logits`` with the class axis last.
            inputs: batch mapping that includes ``labels``.
            return_outputs: when true, return ``(loss, outputs)``.
            num_items_in_batch: accepted so that ``transformers`` releases which
                pass this argument can call the method; it is not used.

        Returns:
            The scalar loss, or ``(loss, outputs)`` if ``return_outputs`` is set.
        """
        labels = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs.get("logits")

        # Work in log space: log(softmax(x)) computed directly cannot produce
        # -inf from an underflowing probability the way softmax(x).log() can.
        log_probs = F.log_softmax(logits, dim=-1)

        # Constant added to the probability of the "O" class only.
        b_term = (5**2)*(P/(1-P))
        # Look the class id up instead of hard-coding its position in the
        # sorted label list.
        o_index = label2id["O"]

        # log(p_O + b) == logaddexp(log p_O, log b); other classes keep log p.
        adjusted = log_probs.clone()
        adjusted[..., o_index] = torch.logaddexp(
            log_probs[..., o_index], log_probs.new_tensor(b_term).log()
        )

        # nll_loss expects the class axis at position 1 and, by default,
        # skips targets equal to -100.
        loss = F.nll_loss(adjusted.transpose(1, 2), labels)

        return (loss, outputs) if return_outputs else loss

In [None]:
# Fine-tune on the whole tokenized dataset with the custom loss.
trainer = MyCustomTrainer(
    model=model,
    args=args,
    data_collator=collator,
    tokenizer=tokenizer,
    train_dataset=ds,
    compute_metrics=partial(compute_metrics, all_labels=all_labels),
)
trainer.train()

# Store the model and its tokenizer in the same directory.
trainer.save_model("deberta3base_1024")
tokenizer.save_pretrained("deberta3base_1024")

# Drop the references and release cached GPU memory.
del trainer, model
gc.collect()
torch.cuda.empty_cache()

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
500,-0.1111
1000,-0.439
1500,-0.4445
2000,-0.4457
2500,-0.4475
3000,-0.447
