In [1]:
import os
# Switch the working directory to the parent folder — presumably so that the `src`
# package imported below is importable; confirm against the repository layout.
# NOTE(review): the path is relative to the current working directory, so re-running
# this cell without restarting the kernel moves up one more level each time.
os.chdir("../")

# 🏋️ PII Model Training Notebook

## 📦 Imports

From Packages

In [2]:
from itertools import chain
from functools import partial
from transformers import (
    AutoTokenizer,
    TrainingArguments,
    AutoModelForTokenClassification,
    DataCollatorForTokenClassification,
)
import pandas as pd
from types import SimpleNamespace
import torch
import wandb
import spacy

From utility scripts

In [3]:
from src.metric import compute_metrics as eval_compute_metrics
from src.data import create_dataset
from src.utils import (
    get_reference_df_parquet, 
    parse_predictions,
    filter_errors,
    generate_htmls_concurrently,
    visualize,
    convert_for_upload,
    CustomTrainer,
    parse_args
)

## 🆕 Initialization

In [4]:
# Names of the three slices of the negative samples; a run trains on exactly one of them.
FIRST_PART, MIDDLE_PART, LAST_PART = "first", "middle", "last"

# Slice used by this run: it selects the negative-sample subset and is embedded
# in the experiment name, the wandb run name and the output directory.
PART = FIRST_PART

In [5]:
# DeBERTa-v3 variant to fine-tune; interpolated into the checkpoint path,
# the output directory and the wandb run name/notes.
MODEL_SIZE = "base"

In [6]:
# Weights & Biases project coordinates and run metadata.
WANDB_PROJECT = "Kaggle-PII"
USER_NAME = "shakleenishfar"
PROJECT_PATH = f"{USER_NAME}/{WANDB_PROJECT}"
EXPERIMENT = f"pii013_{PART}"
WANDB_NOTEBOOK_NAME = "pii-model-training.ipynb"
# wandb looks for the notebook name in the environment, not in a Python variable.
# Without this export, `wandb.init(save_code=True)` reports that it failed to detect
# the notebook name and cannot save the notebook code.
os.environ["WANDB_NOTEBOOK_NAME"] = WANDB_NOTEBOOK_NAME
WANDB_NAME=f"DeBERTA-v3-{MODEL_SIZE}-512-{PART}"
WANDB_NOTES=f"""Training using DeBERTA-v3-{MODEL_SIZE}-512 {PART} one-third negative samples. 
Included data from Valentin, Moth, NBroad, MPWare, Dileep, Newton, PJ Mathematician, and No fit just luck."""

In [7]:
# All tunable settings of the run, gathered in one mapping so that they can be
# handed to wandb as the run config. Key order matches the original declaration.
_config_values = {
    # --- experiment identity and evaluation ---
    "experiment": EXPERIMENT,
    "threshold": 0.95,  # threshold passed to the metric function during training
    "o_weight": 1.0,  # weight of the "O" entry in the class-weight vector
    # --- data sources (wandb artifacts) ---
    "stride_artifact": f"{PROJECT_PATH}/processed_data:latest",
    "raw_artifact": f"{PROJECT_PATH}/raw_data:latest",
    "external_data_1": "none",
    "external_data_2": "none",
    "external_data_3": "none",
    "external_data_4": "none",
    "external_data_5": "none",
    # --- model and sequence lengths ---
    "output_dir": f"model_dir/DeBERTA-V3-{MODEL_SIZE}-512-{PART}",
    "inference_max_length": 512,
    "training_max_length": 512,
    "training_model_path": f"microsoft/deberta-v3-{MODEL_SIZE}",
    # --- optimisation ---
    "fp16": True,
    "learning_rate": 4e-5,
    "num_train_epochs": 3,
    "per_device_train_batch_size": 24,
    "per_device_eval_batch_size": 24,
    "gradient_accumulation_steps": 2,
    # --- logging / evaluation / checkpointing ---
    "report_to": "wandb",
    "evaluation_strategy": "epoch",
    "do_eval": True,
    "save_total_limit": 1,
    "logging_steps": 10,
    # --- schedule and regularisation ---
    "lr_scheduler_type": "cosine",
    "warmup_ratio": 0.1,
    "weight_decay": 0.01,
    # --- reproducibility (used when shuffling the training frame) ---
    "random_state": 29,
}
config = SimpleNamespace(**_config_values)

In [8]:
# SECURITY: never embed the W&B API key in the notebook. `wandb.login()` without a
# key uses the WANDB_API_KEY environment variable or previously stored credentials
# (and prompts interactively if neither exists).
# NOTE(review): the key that used to be hard-coded in this cell has been exposed
# and should be revoked and replaced in the W&B account settings.
wandb.login()

# Start the training run and register every setting from `config` with it.
wandb.init(
    project=WANDB_PROJECT,
    name=WANDB_NAME,
    notes=WANDB_NOTES,
    save_code=True,
    job_type="train",
    config=config,
)

# From here on `config` refers to the wandb-managed config object of the active run.
config = wandb.config

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


[34m[1mwandb[0m: Currently logged in as: [33mshakleenishfar[0m ([33mlaplacesdemon43[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/ishfar/.netrc


## 💾 Data Preparation

### Fetching Data

Getting data from Weights and Biases

In [9]:
# Declare the processed dataset artifact as an input of this run and download it.
stride_artifact = wandb.use_artifact(config.stride_artifact)
stride_artifact_dir = stride_artifact.download()

# Read the parquet file stored inside the downloaded artifact directory.
df = pd.read_parquet(f"{stride_artifact_dir}/stride_data.parquet")

[34m[1mwandb[0m: Downloading large artifact processed_data:latest, 162.08MB. 1 files... 
[34m[1mwandb[0m:   1 of 1 files downloaded.  
Done. 0:0:0.4


### Splitting Data

Into train and evaluation splits.

In [10]:
# The `valid` column marks the rows held out for evaluation.
# Each mask is an explicit elementwise comparison, so a row whose flag is neither
# True nor False ends up in neither split — same as the direct `==` filters.
is_train_row = df["valid"].eq(False)
is_eval_row = df["valid"].eq(True)

train_df = df.loc[is_train_row].reset_index(drop=True)
eval_df = df.loc[is_eval_row].reset_index(drop=True)

print(f"Size of training dataset: {len(train_df)}")
print(f"Size of validation dataset: {len(eval_df)}")

Size of training dataset: 66986
Size of validation dataset: 3079


### Negative Sampling

Used to handle the extreme class imbalance in the data. Suggested by Valentin Warner.

* positive samples: rows whose labels contain at least one non-`O` (PII) tag

* negative samples: rows whose labels are all `O`, i.e. no annotated PII (they may still contain tokens that the model could wrongly classify as an entity)

In [11]:
# Split the training rows by whether their label sequence contains any PII tag.
#
# A boolean mask replaces the former `iterrows()` loop that rebuilt two DataFrames
# from individual row Series: the mask keeps the original column dtypes and index,
# still yields a frame with the right columns when one bucket is empty, and avoids
# materialising one Series object per row.
has_pii = (
    train_df["labels"]
    .apply(lambda labels: any(label != "O" for label in labels))
    .astype(bool)  # keeps the mask boolean even when `train_df` is empty
)

positives = train_df[has_pii]  # at least one non-"O" label
negatives = train_df[~has_pii]  # only "O" labels

print("Negative samples:", len(negatives))
print("Positive samples:", len(positives))

Negative samples: 32137
Positive samples: 34849


Take one third of the negative samples for downsampling.

In [12]:
# Cut the negatives into three consecutive, non-overlapping slices and keep the one
# that belongs to this run's PART.
n_negatives = negatives.shape[0]
first_cut = n_negatives // 3
second_cut = 2 * n_negatives // 3
part_slices = {
    FIRST_PART: slice(None, first_cut),
    MIDDLE_PART: slice(first_cut, second_cut),
    LAST_PART: slice(second_cut, None),
}
if PART not in part_slices:
    raise Exception(f"Undefined part: {PART}")
negatives = negatives.iloc[part_slices[PART]]

# Recombine with every positive row and shuffle reproducibly.
train_df = pd.concat([positives, negatives]).sample(
    frac=1, random_state=config.random_state
)
print(f"Down sampled training: {len(train_df)}")

# The intermediate frames are no longer needed.
del positives, negatives

Down sampled training: 45561


### 🪙 Data Tokenization

In [13]:
# Reference frame built from the raw-data artifact; it is passed to the metric
# function as `valid_df` later on.
reference_df = get_reference_df_parquet(config.raw_artifact)

# Label vocabulary: every distinct tag found in the full frame (train and eval rows),
# sorted so that the tag -> id assignment is deterministic.
all_labels = sorted(
    set(chain.from_iterable(doc_labels.tolist() for doc_labels in df.labels.values))
)
label2id = {label: idx for idx, label in enumerate(all_labels)}
id2label = {idx: label for label, idx in label2id.items()}
id2label

[34m[1mwandb[0m: Downloading large artifact raw_data:latest, 139.90MB. 1 files... 
[34m[1mwandb[0m:   1 of 1 files downloaded.  
Done. 0:0:0.4


{0: 'B-EMAIL',
 1: 'B-ID_NUM',
 2: 'B-NAME_STUDENT',
 3: 'B-PHONE_NUM',
 4: 'B-STREET_ADDRESS',
 5: 'B-URL_PERSONAL',
 6: 'B-USERNAME',
 7: 'I-ID_NUM',
 8: 'I-NAME_STUDENT',
 9: 'I-PHONE_NUM',
 10: 'I-STREET_ADDRESS',
 11: 'I-URL_PERSONAL',
 12: 'O'}

In [14]:
# Tokenizer that belongs to the checkpoint being fine-tuned.
tokenizer = AutoTokenizer.from_pretrained(config.training_model_path)
# Build the tokenized datasets with the project helper. The training and validation
# sets take their maximum length from separate config entries (both 512 in this run).
train_ds = create_dataset(train_df, tokenizer, config.training_max_length, label2id)
valid_ds = create_dataset(eval_df, tokenizer, config.inference_max_length, label2id)



Map (num_proc=6):   0%|          | 0/45561 [00:00<?, ? examples/s]

Map (num_proc=6):   0%|          | 0/3079 [00:00<?, ? examples/s]

## 🏋️ Training

In [15]:
# Load the pretrained checkpoint with a token-classification head that has one
# output per entry of `all_labels`; the id <-> label maps are stored in the model config.
model = AutoModelForTokenClassification.from_pretrained(
    config.training_model_path,
    num_labels=len(all_labels),
    id2label=id2label,
    label2id=label2id,
    ignore_mismatched_sizes=True
)
# Batch collator for token classification, configured with pad_to_multiple_of=16.
collator = DataCollatorForTokenClassification(tokenizer, pad_to_multiple_of=16)

Some weights of DebertaV2ForTokenClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


#### PEFT (Parameter Efficient Finetuning)

In [16]:
# import peft
# from peft import (
#     get_peft_config,
#     PeftModel,
#     PeftConfig,
#     get_peft_model,
#     LoraConfig,
#     TaskType,
# )

In [17]:
# Disabled LoRA experiment, kept for reference only; nothing in this cell runs.
# NOTE(review): this notebook trains an AutoModelForTokenClassification model, so
# `TaskType.TOKEN_CLS` is probably the intended task type rather than
# `TaskType.SEQ_CLS` — confirm against the PEFT docs before re-enabling.
# peft_config = LoraConfig(
#     r=128,  # Use larger 'r' value increase more parameters during training
#     bias='none',
#     inference_mode=False,
#     task_type=TaskType.SEQ_CLS,
#     # Only Use Output and Values Projection
#     target_modules=['query_proj', 'value_proj'],
# )

# # Load the PEFT model
# model = get_peft_model(model, peft_config)
# model.print_trainable_parameters()

In [18]:
# Training-time arguments, all read from `config` so that the wandb run records
# exactly what was used.
args = TrainingArguments(
    output_dir=config.output_dir,
    fp16=config.fp16,
    learning_rate=config.learning_rate,
    num_train_epochs=config.num_train_epochs,
    per_device_train_batch_size=config.per_device_train_batch_size,
    # Previously omitted: the configured eval batch size was declared in `config`
    # but never applied, so the per-epoch evaluation fell back to the library
    # default. The post-training inference arguments already pass this value.
    per_device_eval_batch_size=config.per_device_eval_batch_size,
    gradient_accumulation_steps=config.gradient_accumulation_steps,
    report_to=config.report_to,
    evaluation_strategy=config.evaluation_strategy,
    do_eval=config.do_eval,
    save_total_limit=config.save_total_limit,
    logging_steps=config.logging_steps,
    lr_scheduler_type=config.lr_scheduler_type,
    warmup_ratio=config.warmup_ratio,
    weight_decay=config.weight_decay,
)

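Per-class loss weights: every entity class gets weight 1.0 and the "O" class gets `config.o_weight`. Set `o_weight` below 1.0 to down-weight "O" tokens; this run uses 1.0, i.e. no down-weighting.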

In [19]:
# One weight per class, ordered by class id (`all_labels` is the list that
# `label2id` enumerates). Deriving the vector from the label list keeps its length
# and the position of the "O" weight correct even if the label set changes, instead
# of relying on a hard-coded "12 entity classes, O last" layout.
class_weights = torch.tensor(
    [config.o_weight if label == "O" else 1.0 for label in all_labels]
).to("cuda")

In [20]:
# Pre-bind everything the metric function needs apart from the evaluation output
# itself, so that the trainer can call it with a single argument.
metrics_fn = partial(
    eval_compute_metrics,
    id2label=id2label,
    valid_ds=valid_ds,
    valid_df=reference_df,
    threshold=config.threshold,
)

# Project-specific trainer that additionally receives the per-class weights.
trainer = CustomTrainer(
    model=model,
    args=args,
    train_dataset=train_ds,
    eval_dataset=valid_ds,
    data_collator=collator,
    tokenizer=tokenizer,
    compute_metrics=metrics_fn,
    class_weights=class_weights,
)

In [21]:
# Run the training loop configured in `args`; evaluation on `valid_ds` follows
# `config.evaluation_strategy` and the metrics are reported to `config.report_to`.
trainer.train()

  0%|          | 0/2847 [00:00<?, ?it/s]

{'loss': 2.7744, 'grad_norm': 25.770936965942383, 'learning_rate': 9.824561403508773e-07, 'epoch': 0.01}
{'loss': 2.6599, 'grad_norm': 25.629478454589844, 'learning_rate': 2.385964912280702e-06, 'epoch': 0.02}
{'loss': 2.0911, 'grad_norm': 23.514015197753906, 'learning_rate': 3.789473684210527e-06, 'epoch': 0.03}
{'loss': 1.2486, 'grad_norm': 15.418803215026855, 'learning_rate': 5.192982456140351e-06, 'epoch': 0.04}
{'loss': 0.5758, 'grad_norm': 4.754968166351318, 'learning_rate': 6.596491228070177e-06, 'epoch': 0.05}
{'loss': 0.239, 'grad_norm': 0.9361463785171509, 'learning_rate': 8.000000000000001e-06, 'epoch': 0.06}
{'loss': 0.1527, 'grad_norm': 0.40551507472991943, 'learning_rate': 9.403508771929825e-06, 'epoch': 0.07}
{'loss': 0.1194, 'grad_norm': 0.325920969247818, 'learning_rate': 1.080701754385965e-05, 'epoch': 0.08}
{'loss': 0.0992, 'grad_norm': 0.29373884201049805, 'learning_rate': 1.2210526315789475e-05, 'epoch': 0.09}
{'loss': 0.0924, 'grad_norm': 0.2946913540363312, 'lear

  0%|          | 0/385 [00:00<?, ?it/s]

{'eval_loss': 0.002055585151538253, 'eval_ents_p': 0.625, 'eval_ents_r': 0.94, 'eval_ents_f5': 0.9221249622698459, 'eval_ents_per_type_EMAIL_p': 0.9473684210526315, 'eval_ents_per_type_EMAIL_r': 0.782608695652174, 'eval_ents_per_type_EMAIL_f5': 0.7878787878787878, 'eval_ents_per_type_ID_NUM_p': 0.4117647058823529, 'eval_ents_per_type_ID_NUM_r': 0.9333333333333333, 'eval_ents_per_type_ID_NUM_f5': 0.8899755501222495, 'eval_ents_per_type_NAME_STUDENT_p': 0.6666666666666666, 'eval_ents_per_type_NAME_STUDENT_r': 0.9467787114845938, 'eval_ents_per_type_NAME_STUDENT_f5': 0.9317217981340119, 'eval_ents_per_type_PHONE_NUM_p': 0.7142857142857143, 'eval_ents_per_type_PHONE_NUM_r': 0.9523809523809523, 'eval_ents_per_type_PHONE_NUM_f5': 0.9403254972875226, 'eval_ents_per_type_STREET_ADDRESS_p': 0.3448275862068966, 'eval_ents_per_type_STREET_ADDRESS_r': 0.9090909090909091, 'eval_ents_per_type_STREET_ADDRESS_f5': 0.855263157894737, 'eval_ents_per_type_URL_PERSONAL_p': 0.625, 'eval_ents_per_type_URL_P

  0%|          | 0/385 [00:00<?, ?it/s]

{'eval_loss': 0.0015146860387176275, 'eval_ents_p': 0.6171875, 'eval_ents_r': 0.948, 'eval_ents_f5': 0.9288513717214351, 'eval_ents_per_type_EMAIL_p': 0.9523809523809523, 'eval_ents_per_type_EMAIL_r': 0.8695652173913043, 'eval_ents_per_type_EMAIL_f5': 0.87248322147651, 'eval_ents_per_type_ID_NUM_p': 0.4576271186440678, 'eval_ents_per_type_ID_NUM_r': 0.9, 'eval_ents_per_type_ID_NUM_f5': 0.8677379480840542, 'eval_ents_per_type_NAME_STUDENT_p': 0.6215722120658135, 'eval_ents_per_type_NAME_STUDENT_r': 0.9523809523809523, 'eval_ents_per_type_NAME_STUDENT_f5': 0.933277027027027, 'eval_ents_per_type_PHONE_NUM_p': 0.45652173913043476, 'eval_ents_per_type_PHONE_NUM_r': 1.0, 'eval_ents_per_type_PHONE_NUM_f5': 0.9562171628721541, 'eval_ents_per_type_STREET_ADDRESS_p': 0.7407407407407407, 'eval_ents_per_type_STREET_ADDRESS_r': 0.9090909090909091, 'eval_ents_per_type_STREET_ADDRESS_f5': 0.901213171577123, 'eval_ents_per_type_URL_PERSONAL_p': 0.6779661016949152, 'eval_ents_per_type_URL_PERSONAL_r': 

  0%|          | 0/385 [00:00<?, ?it/s]

{'eval_loss': 0.0014683022163808346, 'eval_ents_p': 0.6319261213720316, 'eval_ents_r': 0.958, 'eval_ents_f5': 0.9393573691356161, 'eval_ents_per_type_EMAIL_p': 0.9523809523809523, 'eval_ents_per_type_EMAIL_r': 0.8695652173913043, 'eval_ents_per_type_EMAIL_f5': 0.87248322147651, 'eval_ents_per_type_ID_NUM_p': 0.5384615384615384, 'eval_ents_per_type_ID_NUM_r': 0.9333333333333333, 'eval_ents_per_type_ID_NUM_f5': 0.9077306733167082, 'eval_ents_per_type_NAME_STUDENT_p': 0.5961871750433275, 'eval_ents_per_type_NAME_STUDENT_r': 0.9635854341736695, 'eval_ents_per_type_NAME_STUDENT_f5': 0.9412755209429594, 'eval_ents_per_type_PHONE_NUM_p': 0.84, 'eval_ents_per_type_PHONE_NUM_r': 1.0, 'eval_ents_per_type_PHONE_NUM_f5': 0.9927272727272727, 'eval_ents_per_type_STREET_ADDRESS_p': 0.9090909090909091, 'eval_ents_per_type_STREET_ADDRESS_r': 0.9090909090909091, 'eval_ents_per_type_STREET_ADDRESS_f5': 0.9090909090909092, 'eval_ents_per_type_URL_PERSONAL_p': 0.7692307692307693, 'eval_ents_per_type_URL_PE

TrainOutput(global_step=2847, training_loss=0.0443750848482691, metrics={'train_runtime': 2982.5862, 'train_samples_per_second': 45.827, 'train_steps_per_second': 0.955, 'train_loss': 0.0443750848482691, 'epoch': 3.0})

### Saving Model and Tokenizer locally

In [22]:
# Write the fine-tuned model and its tokenizer to the same directory so that the
# checkpoint can be reloaded from `config.output_dir` alone.
trainer.save_model(config.output_dir)
tokenizer.save_pretrained(config.output_dir)

('model_dir/DeBERTA-V3-base-512-first/tokenizer_config.json',
 'model_dir/DeBERTA-V3-base-512-first/special_tokens_map.json',
 'model_dir/DeBERTA-V3-base-512-first/spm.model',
 'model_dir/DeBERTA-V3-base-512-first/added_tokens.json',
 'model_dir/DeBERTA-V3-base-512-first/tokenizer.json')

## Determine Best Threshold

In [23]:
# Drop the training-time objects; the next cell re-creates these names from the
# checkpoint saved in `config.output_dir`.
del tokenizer, model, collator, args, trainer

In [24]:
# Reload the artefacts that were just written to disk, so that the predictions
# below come from the saved checkpoint.
tokenizer = AutoTokenizer.from_pretrained(config.output_dir)
model = AutoModelForTokenClassification.from_pretrained(config.output_dir)
collator = DataCollatorForTokenClassification(tokenizer, pad_to_multiple_of=16)

# Inference-only arguments: the current directory as output dir, the configured
# eval batch size, and no experiment reporting.
args = TrainingArguments(
    output_dir=".",
    per_device_eval_batch_size=config.per_device_eval_batch_size,
    report_to="none",
)
trainer = CustomTrainer(
    model=model, args=args, data_collator=collator, tokenizer=tokenizer
)

# Predictions for the whole validation set; reused by every threshold below.
preds = trainer.predict(valid_ds)

  0%|          | 0/385 [00:00<?, ?it/s]

In [25]:
print("Computing final metrics...")

# Score the same validation predictions at several decision thresholds and keep
# the overall F5 of each one under a key that embeds the threshold.
candidate_thresholds = [0.5, 0.6, 0.7, 0.8, 0.9, 0.95, 0.97]
final_metrics = {}
for candidate in candidate_thresholds:
    scores = eval_compute_metrics(
        (preds.predictions, None),
        id2label,
        valid_ds,
        reference_df,
        threshold=candidate,
    )
    final_metrics[f"final_f5_at_{candidate}"] = scores["ents_f5"]

wandb.log(final_metrics)
print(final_metrics)

Computing final metrics...
{'final_f5_at_0.5': 0.9020327067094605, 'final_f5_at_0.6': 0.9049690863292879, 'final_f5_at_0.7': 0.9176703765817961, 'final_f5_at_0.8': 0.9257455873402313, 'final_f5_at_0.9': 0.931807849674193, 'final_f5_at_0.95': 0.9393573691356161, 'final_f5_at_0.97': 0.9427518242684119}


In [26]:
# Select the metric entry with the highest F5; its threshold is the text after the
# last underscore of the key (e.g. "final_f5_at_0.97" -> 0.97).
best_metric_name = max(final_metrics, key=final_metrics.get)
best_threshold = float(best_metric_name.rpartition("_")[2])
print("best_threshold:", best_threshold)

# Record the choice on the run and decode the predictions with it.
wandb.config.best_threshold = best_threshold
preds_df = parse_predictions(
    preds.predictions,
    id2label,
    valid_ds,
    threshold=best_threshold,
)

best_threshold: 0.97


## 📊 Data Visualization

In [27]:
# Development aid: re-import `src.utils` so that edits to the file are picked up
# without restarting the kernel.
# NOTE: `importlib.reload` does not rebind names that were imported earlier with
# `from src.utils import ...`; only attribute access through the `utils` module
# object (e.g. `utils.convert_for_upload`) sees the reloaded code.
import importlib
from src import utils
importlib.reload(utils)

<module 'src.utils' from '/media/ishfar/New Volume/Studies/Projects/Kaggle/PII_Detection/src/utils.py'>

In [28]:
# Prepare data to visualize errors and log them as a Weights & Biases table
print("Visualizing errors...")
# Collapse the per-token predictions into one row per evaluation row, with each
# selected column turned into a list.
grouped_preds = preds_df.groupby("eval_row")[
    ["document", "token", "label", "token_str"]
].agg(list)
# Attach the grouped predictions to the evaluation rows; the left join keeps
# evaluation rows that have no entry in `grouped_preds`.
viz_df = pd.merge(
    eval_df.reset_index(),
    grouped_preds,
    how="left",
    left_on="index",
    right_on="eval_row",
)
# Project helper; presumably keeps only the rows with prediction errors — confirm in src.utils.
viz_df = filter_errors(viz_df, preds_df)
# Prediction-side visualisation column, built by a project helper at the selected threshold.
viz_df["pred_viz"] = generate_htmls_concurrently(
    viz_df,
    tokenizer,
    preds.predictions,
    id2label,
    valid_ds,
    threshold=best_threshold,
)
# Blank English spaCy pipeline, handed to the `visualize` helper for every row.
nlp = spacy.blank("en")
htmls = [visualize(row, nlp) for _, row in viz_df.iterrows()]
# Wrap each HTML string so that wandb renders it inside the table.
wandb_htmls = [wandb.Html(html) for html in htmls]
viz_df["gt_viz"] = wandb_htmls
# Replace missing values (e.g. from the left join) with empty strings, in place.
viz_df.fillna("", inplace=True)
# Called through the `utils` module object imported in the reload cell, not
# through the name imported from `src.utils` at the top of the notebook.
viz_df = utils.convert_for_upload(viz_df)
errors_table = wandb.Table(dataframe=viz_df)
wandb.log({"errors_table": errors_table})

print("Experiment finished, test it out on the inference notebook!")

Visualizing errors...


  0%|          | 0/199 [00:00<?, ?it/s]



Experiment finished, test it out on the inference notebook!


In [29]:
# Mark the wandb run started by `wandb.init` as finished.
wandb.finish()

VBox(children=(Label(value='0.458 MB of 14.793 MB uploaded\r'), FloatProgress(value=0.030937380490105644, max=…



0,1
eval/ents_f5,▁▄█
eval/ents_p,▅▁█
eval/ents_per_type_EMAIL_f5,▁██
eval/ents_per_type_EMAIL_p,▁██
eval/ents_per_type_EMAIL_r,▁██
eval/ents_per_type_ID_NUM_f5,▅▁█
eval/ents_per_type_ID_NUM_p,▁▄█
eval/ents_per_type_ID_NUM_r,█▁█
eval/ents_per_type_NAME_STUDENT_f5,▁▂█
eval/ents_per_type_NAME_STUDENT_p,█▄▁

0,1
eval/ents_f5,0.93936
eval/ents_p,0.63193
eval/ents_per_type_EMAIL_f5,0.87248
eval/ents_per_type_EMAIL_p,0.95238
eval/ents_per_type_EMAIL_r,0.86957
eval/ents_per_type_ID_NUM_f5,0.90773
eval/ents_per_type_ID_NUM_p,0.53846
eval/ents_per_type_ID_NUM_r,0.93333
eval/ents_per_type_NAME_STUDENT_f5,0.94128
eval/ents_per_type_NAME_STUDENT_p,0.59619
