In [1]:
import json, argparse, torch, sys, random, gc, os
import numpy as np
import pandas as pd
import functools
from itertools import chain
from functools import partial
from pathlib import Path

# Transformer 
from transformers import (AutoTokenizer, Trainer, TrainingArguments,
                          AutoModelForTokenClassification, DataCollatorForTokenClassification,
                          DebertaV2Config, DebertaV2ForTokenClassification)
from datasets import Dataset, features
from typing import Iterable, Any, Callable
from sklearn.model_selection import train_test_split

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Seed every random number generator (Python, NumPy, PyTorch) with the same value
def seed_everything(seed=42):
    """Seed the global RNGs of `random`, NumPy and PyTorch with the same value.

    Args:
        seed: Integer seed handed unchanged to each library.
    """
    # Same order as before: Python stdlib, then NumPy, then PyTorch.
    for apply_seed in (random.seed, np.random.seed, torch.manual_seed):
        apply_seed(seed)

# Notebook-wide seed; also read by split_df_by_sampling for its random_state.
SEED = 42
seed_everything(SEED)

In [3]:
import ctypes
# malloc_trim is a glibc extension; keep the notebook importable where
# libc.so.6 cannot be loaded (libc is then None and trimming is skipped).
try:
    libc = ctypes.CDLL("libc.so.6")
except OSError:
    libc = None


def clear_memory():
    """Release as much unused host and GPU memory as possible.

    Order matters: the garbage collector runs first so that objects it frees
    are already gone when the CUDA cache is emptied and the C heap is trimmed.
    """
    gc.collect()
    torch.cuda.empty_cache()
    if libc is not None and hasattr(libc, "malloc_trim"):
        libc.malloc_trim(0)

In [4]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Device: {DEVICE}")

Device: cuda


In [5]:
# 1-hot encoding 
from sklearn.preprocessing import MultiLabelBinarizer

def replace_labels(row, labels):
    """Return a new list in which every label of `row` not found in `labels` becomes 'O'.

    Args:
        row: Iterable of label strings for one document.
        labels: Container of the labels to keep.
    """
    kept = []
    for tag in row:
        if tag in labels:
            kept.append(tag)
        else:
            kept.append('O')
    return kept
    

def load_data(labels,
              train_path="pii-detection-removal-from-educational-data/train.json",
              gemma_path="extra-data/pii_dataset_Gemma.json",
              mpware_path='extra-data/mpware_mixtral8x7b_v1.1-no-i-username.json'):
    """Load the three corpora, merge them and build the label vocabulary.

    Args:
        labels: Labels to keep; every other label is replaced by 'O'
            (via `replace_labels`).
        train_path: JSON file with the competition training data.
        gemma_path: JSON file with the Gemma-generated texts.
        mpware_path: JSON file with the Mixtral8x7B-generated essays.

    Returns:
        A tuple `(df, all_labels, label2id, id2label)`: the merged frame with a
        fresh 0..n-1 index and `document` ids, the sorted list of labels that
        occur in it, and the two label/index lookup dictionaries.
    """
    # Load training data
    train_data = pd.read_json(train_path)
    print(f"kaggle train data = {len(train_data)}")
    # Texts generated by Gemma
    gemma_df = pd.read_json(gemma_path)
    print("gemma data = ", len(gemma_df))
    # PII - Mixtral8x7B generated essays; `with` closes the handle deterministically
    with open(mpware_path, encoding="utf-8") as mpware_file:
        df_mpware = pd.DataFrame(json.load(mpware_file))
    # Keep only the columns the training data has, in the same order
    df_mpware = df_mpware[train_data.columns]
    print(f"df_mpware data = {len(df_mpware)}")
    # Combine to a single df with a fresh index and renumbered document ids
    df = pd.concat([train_data, gemma_df, df_mpware], ignore_index=True)
    df['document'] = range(len(df))
    df['labels'] = df['labels'].apply(replace_labels, args=(labels,))
    # Unique labels across all documents, as plain `str`, in one linear pass
    all_labels = sorted({label for row in df['labels'] for label in row})
    print(f"all_labels = {all_labels}")
    # Create indexes for labels
    label2id = {label: index for index, label in enumerate(all_labels)}
    id2label = {index: label for index, label in enumerate(all_labels)}
    return df, all_labels, label2id, id2label
    
# Encode labels to columns
def encode_labels(df):
    """One-hot encode which entity types occur in each document and print their counts.

    Note: a `unique_labels` column is added to the frame passed in, so callers
    that want to keep their frame untouched should pass a copy.

    Args:
        df: Frame with a `labels` column holding one list of tags per row.

    Returns:
        A tuple `(df, label_classes)`: a new frame with one 0/1 column per
        entity type plus an `others` column (1 when the row has no non-'O'
        tag), and the list of those column names.
    """
    total = len(df)
    # split('-', 1) removes only the leading B-/I- prefix, so entity names that
    # themselves contain '-' stay intact; sorted() makes the order deterministic.
    df["unique_labels"] = df["labels"].apply(
        lambda labels: sorted({label.split('-', 1)[1] for label in labels if label != 'O'}))
    mlb = MultiLabelBinarizer()
    one_hot_encoded = mlb.fit_transform(df['unique_labels'])
    # Reuse df's index so the column-wise concat lines up row for row even when
    # df does not carry a default 0..n-1 index.
    one_hot_df = pd.DataFrame(one_hot_encoded, columns=mlb.classes_, index=df.index)
    df = pd.concat([df, one_hot_df], axis=1)
    # 'others' marks the rows that contain no entity at all
    df['others'] = df['unique_labels'].apply(lambda x: 1 if len(x) == 0 else 0)
    label_classes = list(mlb.classes_) + ['others']
    for col in label_classes:
        subtotal = df[col].sum()
        percent = subtotal / total * 100 if total else 0.0
        print(f'{col}: {subtotal}  ({percent:.1f}%)')
    return df, label_classes

In [6]:
# Keep only the street-address tags; load_data replaces every other label with 'O'.
df, all_labels, label2id, id2label = load_data(['B-STREET_ADDRESS','I-STREET_ADDRESS'])

kaggle train data = 6807
gemma data =  5479
df_mpware data = 2692
all_labels = ['B-STREET_ADDRESS', 'I-STREET_ADDRESS', 'O']


In [7]:
# Work on a copy: encode_labels adds a helper column to the frame it receives.
df_labels, label_classes = encode_labels(df.copy())

STREET_ADDRESS: 4031  (26.9%)
others: 10947  (73.1%)


In [8]:
def split_df_by_sampling(df, n_samples):
    """Randomly draw `n_samples` rows (seeded with SEED) and split the frame around them.

    Args:
        df: Frame with a `document` column.
        n_samples: Number of rows to draw.

    Returns:
        A tuple `(samples_df, others_df)`: the drawn rows, and `df` without the
        rows whose `document` value appears among the drawn rows.
    """
    picked = df.sample(n=n_samples, random_state=SEED)
    # Rows whose document id was drawn are removed from the remainder.
    in_sample = df['document'].isin(picked['document'])
    leftover = df.drop(index=df.loc[in_sample].index)
    return picked, leftover

def downsample_df(df, false_size, valid_true_size=50, valid_false_size=200):
    """Split the df into a downsampled training set and a small validation set.

    Documents are divided into those with at least one non-'O' label
    ("true") and those whose labels are all 'O' ("false"). Note that an
    `is_labels` column is added to the frame passed in.

    Args:
        df: Frame with `labels` and `document` columns.
        false_size: Number of all-'O' documents to put in the training set
            (capped at the number available).
        valid_true_size: Number of "true" documents held out for validation.
        valid_false_size: Number of "false" documents drawn for validation
            from those not used for training (capped at the number available).

    Returns:
        A tuple `(train_df, valid_df)`; neither index is reset.
    """
    df['is_labels'] = df['labels'].apply(lambda labels: any(label != 'O' for label in labels))
    # One or more labels are not 'O'
    true_labels = df[df['is_labels']].reset_index(drop=True)
    # All labels are 'O'
    false_labels = df[~df['is_labels']].reset_index(drop=True)
    print(f"Number of true_labels = {len(true_labels)}")
    print(f"Number of false_labels = {len(false_labels)}")
    # Train on every "true" document except the ones held out for validation
    n_samples = max(len(true_labels) - valid_true_size, 0)
    true_samples, true_others = split_df_by_sampling(true_labels, n_samples)
    print(f"true_samples = {len(true_samples)} true_others = {len(true_others)}")
    # Downsample the "false" documents; never ask for more rows than exist
    n_samples = min(false_size, len(false_labels))
    false_samples, false_others = split_df_by_sampling(false_labels, n_samples)
    # Seeded like every other sample so the validation set is reproducible
    false_others = false_others.sample(n=min(valid_false_size, len(false_others)),
                                       random_state=SEED)
    print(f"false_samples = {len(false_samples)} false_others = {len(false_others)}")
    # Training set = sampled "true" + sampled "false" documents
    train_df = pd.concat([true_samples, false_samples])
    # Validation set = held-out "true" + a sample of the unused "false" documents
    valid_df = pd.concat([true_others, false_others])
    return train_df, valid_df

In [9]:
# Split a copy of df (downsample_df adds a helper column to the frame it receives),
# keeping 6000 all-'O' documents for training.
train_df, valid_df = downsample_df(df.copy(),false_size=6000)
# downsample_df concatenates frames without renumbering, so reset both indexes here.
train_df.reset_index(drop=True, inplace=True)
valid_df.reset_index(drop=True, inplace=True)
print(f"Number of train_df = {len(train_df)}")
print(f"Number of valid_df = {len(valid_df)}")
clear_memory()

Number of true_labels = 4031
Number of false_labels = 10947
true_samples = 3981 true_others = 50
false_samples = 6000 false_others = 200
Number of train_df = 9981
Number of valid_df = 250


In [10]:
def tokenize(example, tokenizer, label2id):
    """Tokenize one document and align its word-level labels to the sub-tokens.

    The text is rebuilt from `tokens` and `trailing_whitespace`, with one label
    recorded per character ('O' for each inserted space). Each sub-token then
    takes the label of its first non-whitespace character.

    Args:
        example: Mapping with `tokens`, `provided_labels` and
            `trailing_whitespace` sequences of equal length.
        tokenizer: Callable accepting `(text, return_offsets_mapping=True,
            truncation=False)` and returning a mapping that also exposes
            `offset_mapping` and `input_ids` as attributes.
        label2id: Mapping from label string to integer id; must contain "O".

    Returns:
        A dict with everything the tokenizer returned plus `labels` (one id
        per sub-token) and `length` (number of input ids).
    """
    pieces = []
    char_labels = []  # one entry per character of the rebuilt text
    for token, label, t_ws in zip(example["tokens"],
                                  example["provided_labels"],
                                  example["trailing_whitespace"]):
        pieces.append(token)
        char_labels.extend([label] * len(token))
        # A trailing space is part of the text but never part of an entity
        if t_ws:
            pieces.append(" ")
            char_labels.append("O")

    text = "".join(pieces)
    # Tokenization without truncation
    tokenized = tokenizer(text, return_offsets_mapping=True,
                          truncation=False)
    last_char = len(text) - 1
    outside_id = label2id["O"]
    token_labels = []
    for start_idx, end_idx in tokenized.offset_mapping:
        # Entries with (0, 0) offsets (presumably special tokens) are labelled 'O'
        if start_idx == 0 and end_idx == 0:
            token_labels.append(outside_id)
            continue
        # A sub-token may begin with whitespace: use the following character's
        # label, but never step past the end of the text (a lone trailing
        # whitespace sub-token would otherwise raise IndexError).
        if text[start_idx].isspace() and start_idx < last_char:
            start_idx += 1
        token_labels.append(label2id[char_labels[start_idx]])

    return {**tokenized, "labels": token_labels, "length": len(tokenized.input_ids)}

In [11]:
from seqeval.metrics import recall_score, precision_score, f1_score

In [12]:
def compute_metrics(preds, all_labels):
    """Compute entity-level recall, precision and an F-beta score with beta = 5.

    Errors are no longer printed and swallowed: returning None would only
    hide the real cause from the caller, so exceptions propagate.

    Args:
        preds: Pair `(predictions, labels)`; `predictions` is reduced with
            argmax over axis 2, and positions whose label is -100 are ignored.
        all_labels: Sequence mapping a label id to its label string.

    Returns:
        A dict with keys 'f1' (the beta = 5 score; the key name is kept as
        is), 'recall' and 'precision'.
    """
    predictions, labels = preds
    predictions = np.argmax(predictions, axis=2)
    # Drop ignored positions (label == -100) and map ids back to label strings
    true_preds = []
    true_labels = []
    for pred, label in zip(predictions, labels):
        kept = [(p, l) for p, l in zip(pred, label) if l != -100]
        true_preds.append([all_labels[p] for p, _ in kept])
        true_labels.append([all_labels[l] for _, l in kept])
    # Compute recall and precision
    recall = recall_score(true_labels, true_preds)
    precision = precision_score(true_labels, true_preds)
    # F-beta with beta = 5 weights recall far more heavily than precision
    beta_sq = 5 * 5
    denominator = beta_sq * precision + recall
    # With precision == recall == 0 the ratio is 0/0; report 0.0 instead
    if denominator > 0:
        fbeta = (1 + beta_sq) * recall * precision / denominator
    else:
        fbeta = 0.0
    result = {'f1': fbeta,
              'recall': recall,
              'precision': precision}
    print(f"result = {result}")
    return result

In [13]:
class ModelTrainer:
    """Fine-tune a DeBERTa-v3 token-classification model on the labelled documents.

    Constructing an instance immediately loads the tokenizer and the model
    (see `load_model`). `train` tokenizes the data, runs the Hugging Face
    `Trainer` and saves the model and tokenizer to `save_path`.
    """

    def __init__(self, all_labels, label2id, id2label):
        """Store the label vocabulary and hyper-parameters, then load the model.

        Args:
            all_labels: List of label strings, indexed by label id.
            label2id: Mapping from label string to id.
            id2label: Mapping from id to label string.
        """
        self.all_labels = all_labels
        self.label2id = label2id
        self.id2label = id2label
        self.num_proc = 3  # worker processes used by Dataset.map
        self.learning_rate = 2e-5
        self.num_train_epochs = 3  # Number of epochs
        self.batch_size = 1  # Default (4) Too large batch sizes lead to OOM
        self.fp16 = torch.cuda.is_available()  # mixed precision only on GPU
        self.model_path = "microsoft/deberta-v3-small"
        self.output_dir = "outputs/"
        self.save_path = "models/deberta3small_address_model_512"
        self.load_model()

    # Load the model
    def load_model(self):
        """Create the tokenizer and a model whose classifier matches `all_labels`."""
        # Create the tokenizer
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_path)
        # Load the model config and attach the label vocabulary to it
        config = DebertaV2Config.from_pretrained(self.model_path)
        config.update({
            'num_labels': len(self.all_labels),
            'id2label': self.id2label,
            'label2id': self.label2id,
        })
        # ignore_mismatched_sizes is a from_pretrained argument, not a config
        # field, so it is passed here instead of being stored in the config.
        self.model = AutoModelForTokenClassification.from_pretrained(
            self.model_path,
            config=config,
            ignore_mismatched_sizes=True,
        )
        print("Complete loading pretrained LLM model")

    # Convert df to tokenized dataset
    def create_dataset(self, df):
        """Build a `Dataset` from `df` and tokenize every row with `tokenize`.

        Args:
            df: Frame with `full_text`, `document`, `tokens`,
                `trailing_whitespace` and `labels` columns.

        Returns:
            The tokenized dataset.
        """
        ds = Dataset.from_dict({
            "full_text": df["full_text"].tolist(),
            # Plain list of str, consistent with the other columns
            "document": df["document"].astype(str).tolist(),
            "tokens": df["tokens"].tolist(),
            "trailing_whitespace": df["trailing_whitespace"].tolist(),
            "provided_labels": df["labels"].tolist()
        })
        # Tokenize the dataset
        tokenized_ds = ds.map(tokenize,
                              fn_kwargs={"tokenizer": self.tokenizer,
                                         "label2id": self.label2id},
                              num_proc=self.num_proc)
        return tokenized_ds

    # Train the model
    def train(self, train_df, valid_df):
        """Fine-tune the model and save it together with the tokenizer.

        Evaluation and checkpointing happen once per epoch; the checkpoint
        with the highest 'f1' metric is reloaded at the end of training.

        Args:
            train_df: Frame used for training.
            valid_df: Frame used for evaluation.
        """
        # Create training dataset
        training_ds = self.create_dataset(train_df)
        # Create valid dataset
        valid_ds = self.create_dataset(valid_df)
        # Data collator
        data_collator = DataCollatorForTokenClassification(self.tokenizer, pad_to_multiple_of=16)
        # Trainer configuration
        training_args = TrainingArguments(output_dir=self.output_dir,
                                          fp16=self.fp16, # False when no GPU is available
                                          learning_rate=self.learning_rate,
                                          num_train_epochs=self.num_train_epochs, # The total number of training epochs to run.
                                          per_device_train_batch_size=self.batch_size,  # batch size per device during training
                                          per_device_eval_batch_size=self.batch_size, # batch size for evaluation
                                          gradient_accumulation_steps=2,
                                          report_to="none",
                                          evaluation_strategy="epoch", # Evaluated at the end of epochs
                                          do_eval=True,
                                          save_strategy="epoch",
                                          save_total_limit=2, # Save the best and most recent checkpoints
                                          logging_steps=20,
                                          lr_scheduler_type='cosine',
                                          load_best_model_at_end=True, # Load the best model at the end
                                          metric_for_best_model="f1",
                                          greater_is_better=True,
                                          warmup_ratio=0.1, # fraction of steps used for learning-rate warmup
                                          weight_decay=0.01, # strength of weight decay
                                         )
        # Use this instance's label list rather than a notebook-level global
        trainer = Trainer(model=self.model,
                          args=training_args,
                          train_dataset=training_ds,
                          eval_dataset=valid_ds,
                          data_collator=data_collator,
                          tokenizer=self.tokenizer,
                          compute_metrics=partial(compute_metrics, all_labels=self.all_labels),
                         )
        # Train the model
        trainer.train()
        # Save the model
        trainer.save_model(self.save_path)
        self.tokenizer.save_pretrained(self.save_path)
        print(f"Save the model to {self.save_path}")

In [None]:
# Constructing ModelTrainer loads the tokenizer and model (its __init__ calls load_model);
# train() then fine-tunes on train_df, evaluates on valid_df and saves to trainer.save_path.
trainer = ModelTrainer(all_labels, label2id, id2label)
trainer.train(train_df, valid_df)

Some weights of DebertaV2ForTokenClassification were not initialized from the model checkpoint at microsoft/deberta-v3-small and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Complete loading pretrained LLM model


Map (num_proc=3): 100%|█████████████████████████████████████████████████████| 9981/9981 [00:26<00:00, 378.02 examples/s]
Map (num_proc=3): 100%|███████████████████████████████████████████████████████| 250/250 [00:01<00:00, 141.39 examples/s]
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Epoch,Training Loss,Validation Loss,F1,Recall,Precision
0,0.0002,0.001741,0.925532,0.925532,0.925532
2,0.0001,0.001753,0.947196,0.946809,0.956989


result = {'f1': 0.925531914893617, 'recall': 0.925531914893617, 'precision': 0.925531914893617}
result = {'f1': 0.9471960704052393, 'recall': 0.9468085106382979, 'precision': 0.956989247311828}


In [None]:
# NOTE(review): looks like a leftover smoke-test cell — consider removing it.
print("Hello")