In [1]:
import argparse
import bisect
import gc
import json
import os
import random
import re
from itertools import chain
from pathlib import Path
from typing import Iterable, Any, Callable

import numpy as np
import pandas as pd
import torch
from datasets import Dataset
from scipy.special import softmax
from sklearn.model_selection import train_test_split
from spacy.lang.en import English

from transformers import (
    AutoTokenizer, TrainingArguments, Trainer,
    AutoModelForTokenClassification, LongformerConfig,
    DebertaV2ForTokenClassification, DebertaV2TokenizerFast,
    LongformerForTokenClassification, BitsAndBytesConfig,
    DataCollatorForTokenClassification, PreTrainedTokenizerBase
)

2024-04-21 20:33:42.499716: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-21 20:33:42.499818: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-21 20:33:42.627493: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
# Settings shared by the per-entity models below.
#   MAX_LENGTH - truncation limit handed to the tokenizer as `max_length`.
#   threshold  - cutoff compared against the 'O' class probability when
#                predictions are post-processed.
MAX_LENGTH, threshold = 4096, 0.99

In [3]:
def tokenize(example, tokenizer, max_length=None):
    """Rebuild a document's text from its word tokens and tokenize it.

    Args:
        example: mapping with "tokens" (list of word strings) and
            "trailing_whitespace" (parallel list of booleans).
        tokenizer: callable invoked as
            ``tokenizer(text, return_offsets_mapping=True, truncation=True,
            max_length=...)`` and returning a mapping.
        max_length: truncation limit forwarded to the tokenizer. Defaults to
            the module-level ``MAX_LENGTH`` when omitted.

    Returns:
        The tokenizer's output merged with "token_map": one entry per
        character of the rebuilt text, holding the index of the word token
        that character belongs to, or -1 for an inserted trailing space.
    """
    if max_length is None:
        max_length = MAX_LENGTH

    pieces = []
    token_map = []
    for idx, (t, ws) in enumerate(zip(example["tokens"], example["trailing_whitespace"])):
        pieces.append(t)
        token_map.extend([idx] * len(t))
        if ws:
            # The space is part of the text but belongs to no word token.
            pieces.append(" ")
            token_map.append(-1)

    # Whitespace is already included above, so the pieces are joined as-is.
    # The text IS truncated to `max_length`; `token_map` always covers the
    # full, untruncated text.
    tokenized = tokenizer(
        "".join(pieces),
        return_offsets_mapping=True,
        truncation=True,
        max_length=max_length,
    )
    return {**tokenized, "token_map": token_map}

In [4]:
# Model Inferer
class ModelInfer:
    """Run token-classification inference with one model checkpoint.

    The tokenizer and model are loaded from ``model_path`` on construction.
    ``infer_preds`` tokenizes a dataset, runs a ``Trainer`` prediction pass and
    post-processes the logits with a threshold on the 'O' class.

    NOTE(review): post-processing assumes the 'O' class is the LAST logit
    column, i.e. index ``len(all_labels) - 1`` - confirm against the model
    config's ``id2label``.
    """

    def __init__(self, all_labels, id2label,model_path):
        """Store the settings and load the tokenizer and model.

        Args:
            all_labels: label names for this model; only its length is used,
                to locate the 'O' column.
            id2label: id -> label mapping (stored, not used by this class).
            model_path: directory passed to ``from_pretrained``.
        """
        self.all_labels = all_labels
        self.id2label = id2label
        self.model_path = model_path
        self.max_length = MAX_LENGTH
        self.infer_dir = "/kaggle/working/infer" # Trainer output directory
        self.num_proc = 3 # worker processes used when tokenizing
        self.threshold = threshold # cutoff on the 'O' probability
        self.load_model()

    def load_model(self):
        """Load the tokenizer and the token-classification model from ``model_path``."""
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_path)
        self.model = AutoModelForTokenClassification.from_pretrained(self.model_path)
        print("Complete loading pretrained LLM model")

    def post_processing_preds(self, preds):
        """Turn raw logits into label ids, preferring entities over 'O'.

        Args:
            preds: float array of logits; indexed here as
                (document, position, label).

        Returns:
            Integer array of label ids with the last axis removed. Where the
            'O' probability is below ``self.threshold`` the most likely
            non-'O' label is chosen; elsewhere the plain argmax is kept.
        """
        o_index = len(self.all_labels) - 1
        # scipy's softmax subtracts the per-row maximum before exponentiating,
        # so large logits no longer overflow to inf/NaN as np.exp(preds) did.
        probs = softmax(preds, axis=-1)
        plain_argmax = preds.argmax(-1)
        best_entity = probs[:, :, :o_index].argmax(-1)
        o_probs = probs[:, :, o_index]
        return np.where(o_probs < self.threshold, best_entity, plain_argmax)

    def infer_preds(self, ds):
        """Tokenize ``ds``, predict, and return ``(label_ids, tokenized_ds)``.

        The model reference is dropped after prediction to free memory; it is
        reloaded automatically if this method is called again.
        """
        if getattr(self, "model", None) is None:
            self.load_model()

        # Same tokenization function as used for training.
        tokenized_ds = ds.map(
            tokenize,
            fn_kwargs={"tokenizer": self.tokenizer},
            num_proc=self.num_proc,
        )
        data_collator = DataCollatorForTokenClassification(self.tokenizer,
                                                           pad_to_multiple_of=512)
        # Arguments for inference only.
        args = TrainingArguments(output_dir=self.infer_dir,
                                 per_device_eval_batch_size=1,
                                 report_to="none")
        trainer = Trainer(model=self.model,
                          args=args,
                          data_collator=data_collator,
                          tokenizer=self.tokenizer)

        preds = trainer.predict(tokenized_ds).predictions

        # Drop every reference to the model before asking for memory back.
        # The attribute is kept (as None) so a later call can reload it.
        self.model = None
        del data_collator, trainer, args
        clear_memory()

        preds_final = self.post_processing_preds(preds)
        return preds_final, tokenized_ds

In [5]:
# Each specialist model is listed next to the label set it predicts, so the
# two parallel lists derived below cannot drift out of step.
_MODEL_LABEL_PAIRS = (
    ('id', ['B-ID_NUM', 'I-ID_NUM', 'O']),
    ('name', ['B-NAME_STUDENT', 'I-NAME_STUDENT', 'O']),
    ('address', ['B-STREET_ADDRESS', 'I-STREET_ADDRESS', 'O']),
    ('url', ['B-URL_PERSONAL', 'I-URL_PERSONAL', 'O']),
    ('username', ['B-USERNAME', 'O']),
    ('email', ['B-EMAIL', 'O']),
    ('phone', ['B-PHONE_NUM', 'I-PHONE_NUM', 'O']),
)

all_labels = [labels for _, labels in _MODEL_LABEL_PAIRS]
model_names = [name for name, _ in _MODEL_LABEL_PAIRS]

In [6]:
import ctypes

# Handle on glibc, used only for malloc_trim. Loading fails on platforms
# without "libc.so.6"; there the trim step is simply skipped.
try:
    libc = ctypes.CDLL("libc.so.6")
except OSError:
    libc = None


def clear_memory():
    """Release as much Python, CUDA and heap memory as possible.

    Order matters: unreachable Python objects are collected first, so their
    GPU blocks are actually free when the CUDA cache is emptied and their
    host memory is free when the C heap is trimmed.
    """
    gc.collect()
    torch.cuda.empty_cache()
    if libc is not None:
        libc.malloc_trim(0)

In [7]:
# Accumulator for all predictions; starts empty and is extended by later cells.
final_df = pd.DataFrame()

In [8]:
def remove_duplicates(df):
    """Return ``df`` sorted by (document, token) with duplicate predictions removed.

    Two rows are duplicates when their "document", "token" and "label" values
    produce the same "<document>_<token>_<label>" string; the first such row
    is kept. The input frame is not modified.

    Args:
        df: DataFrame with at least "document", "token" and "label" columns.

    Returns:
        A new DataFrame with a fresh 0..n-1 index and an added "triplet"
        column holding the key string. An empty input is returned as an
        empty copy without the "triplet" column.
    """
    if df.empty:
        return df.reset_index(drop=True)

    # sort_values returns a new frame; the result has to be kept, and working
    # on it also leaves the caller's frame untouched.
    deduped = df.sort_values(by=['document', 'token'])
    deduped['triplet'] = (
        deduped["document"].astype(str)
        + "_" + deduped["token"].astype(str)
        + "_" + deduped["label"].astype(str)
    )
    deduped = deduped.drop_duplicates(subset=["triplet"], keep='first')
    deduped = deduped.reset_index(drop=True)
    print("Remove duplicates")
    return deduped

In [9]:
def predict(all_labels, model_name,final_df):
    """Run one specialist model over the test set and append its predictions.

    Args:
        all_labels: label names for this model, forwarded to ``ModelInfer``.
        model_name: suffix of the model directory to load.
        final_df: DataFrame of predictions accumulated so far.

    Returns:
        ``final_df`` with this model's de-duplicated non-"O" predictions
        appended (columns: document, token, label, token_str, plus whatever
        ``remove_duplicates`` adds). Returned unchanged when the model
        predicts nothing.

    Raises:
        Any exception hit while decoding a prediction is re-raised after the
        offending indices have been printed.
    """
    model_path = f"/kaggle/input/longformer-models-piid/longformer_foundational_{model_name}"

    test_data = pd.read_json("/kaggle/input/pii-detection-removal-from-educational-data/test.json")

    test_ds = Dataset.from_dict({
        "full_text": test_data["full_text"].tolist(),
        "document": test_data["document"].tolist(),
        "tokens": test_data["tokens"].tolist(),
        "trailing_whitespace": test_data["trailing_whitespace"].tolist(),
    })

    print(f"Total number of test dataset {len(test_ds)}")

    # id2label comes from JSON, so its keys are strings ("0", "1", ...).
    with open(Path(model_path) / "config.json") as config_file:
        id2label = json.load(config_file)["id2label"]

    # Load the pretrained model and make the predictions
    inferer = ModelInfer(all_labels, id2label,model_path)
    preds_final, tokenized_ds = inferer.infer_preds(test_ds)

    results = []

    for p, token_map, offsets, tokens, doc in zip(preds_final,
                                                  tokenized_ds["token_map"],
                                                  tokenized_ds["offset_mapping"],
                                                  tokenized_ds["tokens"],
                                                  tokenized_ds["document"]):
        for token_pred, (start_idx, end_idx) in zip(p, offsets):
            try:
                label_pred = id2label[str(token_pred)]
                # A (0, 0) offset marks a special token with no source text.
                if start_idx + end_idx == 0:
                    continue

                # Offset starts on an inserted space: step onto the word.
                if token_map[start_idx] == -1:
                    start_idx += 1

                # ignore "\n\n"
                while start_idx < len(token_map) and tokens[token_map[start_idx]].isspace():
                    start_idx += 1

                if start_idx >= len(token_map):
                    break

                token_id = token_map[start_idx]

                # ignore "O" predictions and whitespace preds
                if label_pred != "O" and token_id != -1:
                    results.append({
                            "document": doc,
                            "token": token_id,
                            "label": label_pred,
                            "token_str": tokens[token_id]
                        })
            except Exception as e:
                print(f"Error {e}")
                print(f"token_map {len(token_map)} and {token_pred}  {start_idx} {end_idx}")
                # Re-raise so the real traceback is shown; exiting the
                # interpreter from a notebook would hide it.
                raise

    if results:
        test_df = remove_duplicates(pd.DataFrame(results))
        final_df = pd.concat([final_df,test_df])
    return final_df

In [10]:
from IPython.display import clear_output

In [11]:
# Run each specialist model in turn, threading the accumulated predictions
# through, and free memory / wipe the cell output between models.
for labels, name in zip(all_labels, model_names):
    final_df = predict(labels, name, final_df)
    clear_memory()
    clear_output(wait=True)

Total number of test dataset 10
Complete loading pretrained LLM model
   

#0:   0%|          | 0/5 [00:00<?, ?ex/s]

 

#1:   0%|          | 0/5 [00:00<?, ?ex/s]




In [12]:
test_data = pd.read_json("/kaggle/input/pii-detection-removal-from-educational-data/test.json")

# Wrap four columns of the test frame in a Dataset, as plain Python lists.
test_ds = Dataset.from_dict(
    {
        column: test_data[column].tolist()
        for column in ("full_text", "document", "tokens", "trailing_whitespace")
    }
)

import re
from spacy.lang.en import English

# Blank English pipeline; only its tokenizer is used below.
nlp = English()

def find_span(target: list[str], document: list[str]) -> list[list[int]]:
    """Find every non-overlapping occurrence of ``target`` inside ``document``.

    Args:
        target: token sequence to look for.
        document: token sequence to search.

    Returns:
        One list of consecutive document indices per occurrence, in order of
        appearance. Empty when ``target`` is empty or does not occur.
    """
    needle = list(target)
    width = len(needle)
    spans = []
    if width == 0:
        return spans

    start = 0
    last_start = len(document) - width
    while start <= last_start:
        if list(document[start:start + width]) == needle:
            spans.append(list(range(start, start + width)))
            # Resume after the match so occurrences never overlap.
            start += width
        else:
            # Try every position as a start, so a failed partial match cannot
            # hide a real match that begins inside it.
            start += 1
    return spans


# Rule-based detectors that complement the model predictions.
email_regex = re.compile(r'[\w.+-]+@[\w-]+\.[\w.-]+')
phone_num_regex = re.compile(r"(\(\d{3}\)\d{3}\-\d{4}\w*|\d{3}\.\d{3}\.\d{4})\s")
emails = []
phone_nums = []

for _data in test_ds:
    # email: a word token that is, in full, an e-mail address
    for token_idx, token in enumerate(_data["tokens"]):
        if email_regex.fullmatch(token) is not None:
            emails.append(
                {"document": _data["document"], "token": token_idx, "label": "B-EMAIL", "token_str": token}
            )

    # phone number: matched on the raw text, then mapped back to word tokens.
    # The span loop is nested inside the match loop so that EVERY match in a
    # document is emitted, not just the last one.
    for match in phone_num_regex.findall(_data["full_text"]):
        target = [t.text for t in nlp.tokenizer(match)]
        for matched_span in find_span(target, _data["tokens"]):
            for intermediate, token_idx in enumerate(matched_span):
                # First token of a span is "B-", the rest are "I-".
                prefix = "I" if intermediate else "B"
                phone_nums.append(
                    {"document": _data["document"], "token": token_idx, "label": f"{prefix}-PHONE_NUM", "token_str": _data["tokens"][token_idx]}
                )

In [13]:
# Fold the regex-based e-mail and phone hits into the running predictions,
# then order everything by document and token position.
temp = pd.concat([pd.DataFrame(hits) for hits in (emails, phone_nums)])
if not temp.empty:
    temp = remove_duplicates(temp)
    final_df = pd.concat([final_df, temp])
final_df = final_df.sort_values(by=['document', 'token'])

In [14]:
# Keep only the document, token and label columns.
final_df = final_df.loc[:, ["document", "token", "label"]]

In [15]:
# --- Settings for the DeBERTa pass below ---
DATA_DIR = '/kaggle/input/pii-detection-removal-from-educational-data/'  # folder holding test.json
MODEL_PATH = '/kaggle/input/37vp4pjt'  # checkpoint directory given to from_pretrained
INFERENCE_MAX_LENGTH = 3500  # truncation limit handed to the tokenizer
AMP = True  # forwarded as fp16= to TrainingArguments
CONF_THRESH = 0.90  # "O" probability below which an entity label is preferred
URL_THRESH = 0.1  # mean B-URL_PERSONAL probability above which a URL is flagged

In [16]:
def spacy_to_hf(data: dict, idx: int) -> slice:
    """
    Given an index of spacy token, return corresponding indices in deberta's output.
    We use this to find indice of URL tokens later.

    Args:
        data: mapping with "token_map" (per-character word-token index) and
            "offset_mapping" (per-model-token ``(start, end)`` character
            offsets).
        idx: index of the word token to locate.

    Returns:
        Slice over model-token positions. It starts at the position
        ``bisect_left`` finds for the word's first character among the offset
        ends, and extends to the first model token whose end offset is not
        smaller than the word's last character index.

    Raises:
        ValueError: if ``idx`` does not occur in ``data["token_map"]``.

    NOTE(review): the bisect runs over ALL offset ends, including any (0, 0)
    entries, and a model token ending exactly at the word's first character
    is included - confirm this boundary behaviour is intended.
    """
    char_positions = np.where(np.array(data["token_map"]) == idx)[0]
    if char_positions.size == 0:
        raise ValueError(f"token index {idx} does not occur in data['token_map']")

    # Hoisted: both bounds are constant for the whole scan below.
    first_char = char_positions.min()
    last_char = char_positions.max()

    offsets = data["offset_mapping"]
    start_idx = bisect.bisect_left([off[1] for off in offsets], first_char)
    end_idx = start_idx
    while end_idx < len(offsets) and last_char > offsets[end_idx][1]:
        end_idx += 1
    return slice(start_idx, end_idx + 1)

In [17]:
class CustomTokenizer:
    """Callable that rebuilds a document's text from word tokens and tokenizes it.

    Intended for ``Dataset.map``: each call takes one example and returns the
    tokenizer output plus a per-character ``token_map``.
    """

    # The annotation is a string so that defining the class does not require
    # the name to be resolvable at definition time.
    def __init__(self, tokenizer: "PreTrainedTokenizerBase", max_length: int) -> None:
        """
        Args:
            tokenizer: callable tokenizer; invoked with
                ``return_offsets_mapping=True`` and ``truncation=True``.
            max_length: truncation limit forwarded to the tokenizer.
        """
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __call__(self, example: dict) -> dict:
        """Tokenize one example.

        Args:
            example: mapping with "tokens" (word strings) and
                "trailing_whitespace" (parallel booleans).

        Returns:
            The tokenizer's output merged with "token_map": one entry per
            character of the rebuilt text, holding the owning word-token
            index, or -1 for an inserted trailing space.
        """
        text = []
        token_map = []

        for idx, (t, ws) in enumerate(zip(example["tokens"], example["trailing_whitespace"])):
            text.append(t)
            token_map.extend([idx] * len(t))
            if ws:
                # The space is part of the text but belongs to no word token.
                text.append(" ")
                token_map.append(-1)

        tokenized = self.tokenizer(
            "".join(text),
            return_offsets_mapping=True,
            truncation=True,
            max_length=self.max_length,
        )

        return {**tokenized, "token_map": token_map}

In [18]:
with open(str(Path(DATA_DIR) / "test.json"), "r") as f:
    data = json.load(f)

# Pivot the list of records into one list per column.
ds = Dataset.from_dict(
    {
        key: [record[key] for record in data]
        for key in ("full_text", "document", "tokens", "trailing_whitespace")
    }
)

tokenizer = DebertaV2TokenizerFast.from_pretrained(MODEL_PATH)
# Tokenize every document, using one worker per CPU reported by the OS.
ds = ds.map(
    CustomTokenizer(tokenizer=tokenizer, max_length=INFERENCE_MAX_LENGTH),
    num_proc=os.cpu_count(),
)

     

#0:   0%|          | 0/3 [00:00<?, ?ex/s]

  

#1:   0%|          | 0/3 [00:00<?, ?ex/s]

 

#2:   0%|          | 0/2 [00:00<?, ?ex/s]

#3:   0%|          | 0/2 [00:00<?, ?ex/s]

In [19]:
# Inference-only Trainer around the checkpoint: one document per batch,
# no experiment reporting, fp16 controlled by AMP.
model = DebertaV2ForTokenClassification.from_pretrained(MODEL_PATH)
collator = DataCollatorForTokenClassification(tokenizer)
args = TrainingArguments(
    output_dir=".",
    per_device_eval_batch_size=1,
    report_to="none",
    fp16=AMP,
)
trainer = Trainer(
    model=model,
    args=args,
    data_collator=collator,
    tokenizer=tokenizer,
)

  return self.fget.__get__(instance, owner)()


In [20]:
# Run the Trainer over the tokenized dataset and keep only the `.predictions`
# field of the result (used below as per-position class scores).
predictions = trainer.predict(ds).predictions

In [21]:
id2label = model.config.id2label
o_index = model.config.label2id["O"]

# Class probabilities along the label axis, and the "O" column on its own.
pred_softmax = torch.softmax(torch.from_numpy(predictions), dim=2).numpy()
o_preds = pred_softmax[:, :, o_index]

# Plain argmax over the raw scores.
preds = predictions.argmax(-1)

# Best label when "O" is ruled out: zero its probability, then argmax.
preds_without_o = pred_softmax.copy()
preds_without_o[:, :, o_index] = 0
preds_without_o = preds_without_o.argmax(-1)

# Where "O" is not confident enough, fall back to the best non-"O" label.
preds_final = np.where(o_preds < CONF_THRESH, preds_without_o, preds)

In [22]:
processed = []
pairs = set()

# Walk the documents; inside each, walk the model tokens and map every
# prediction back to the word token it starts in.
for doc_preds, token_map, offsets, tokens, doc in zip(
    preds_final, ds["token_map"], ds["offset_mapping"], ds["tokens"], ds["document"]
):
    n_chars = len(token_map)

    for token_pred, (start_idx, end_idx) in zip(doc_preds, offsets):
        label_pred = id2label[token_pred]

        # (0, 0) offsets: [CLS] token i.e. BOS
        if start_idx + end_idx == 0:
            continue

        # Offset begins on an inserted space: step onto the word itself.
        if token_map[start_idx] == -1:
            start_idx += 1

        # ignore "\n\n"
        while start_idx < n_chars and tokens[token_map[start_idx]].isspace():
            start_idx += 1

        # Ran off the end of the text: nothing left for this document.
        if start_idx >= n_chars:
            break

        token_id = token_map[start_idx]
        pair = (doc, token_id)

        # Labels skipped in this pass, and positions that are whitespace.
        is_skipped_label = label_pred in ("O", "B-EMAIL", "B-URL_PERSONAL", "B-PHONE_NUM", "I-PHONE_NUM")
        if is_skipped_label or token_id == -1:
            continue

        # Only the first prediction per (document, word token) is kept.
        if pair not in pairs:
            processed.append(
                {"document": doc, "token": token_id, "label": label_pred, "token_str": tokens[token_id]}
            )
            pairs.add(pair)

In [23]:
# Domains (or domain suffixes) that are never treated as personal URLs.
url_whitelist = [
    "wikipedia.org",
    "coursera.org",
    "google.com",
    ".gov",
]
# Entries are literal text, so each one is escaped before being joined.
# Unescaped, "." matches any character and ".gov" would whitelist any URL
# that merely contains e.g. "/gov" or "agov".
url_whitelist_regex = re.compile("|".join(re.escape(entry) for entry in url_whitelist))

# Column of the class probabilities that holds B-URL_PERSONAL (loop-invariant).
url_label_id = model.config.label2id["B-URL_PERSONAL"]

for row_idx, _data in enumerate(ds):
    # Number of model tokens this document really has. Positions beyond it in
    # pred_softmax do not belong to the document (presumably padding added
    # when predictions of different lengths are stacked - TODO confirm).
    n_model_tokens = len(_data["offset_mapping"])

    for token_idx, token in enumerate(_data["tokens"]):
        if not nlp.tokenizer.url_match(token):
            continue
        print(f"Found URL: {token}")
        if url_whitelist_regex.search(token) is not None:
            print("The above is in the whitelist")
            continue

        input_idxs = spacy_to_hf(_data, token_idx)
        # Score only positions inside the document. A URL that was truncated
        # away yields an empty range and is treated as not PII, instead of
        # averaging over nothing (NaN) or over out-of-document positions.
        stop = min(input_idxs.stop, n_model_tokens)
        probs = pred_softmax[row_idx, input_idxs.start:stop, url_label_id]

        if probs.size > 0 and probs.mean() > URL_THRESH:
            print("The above is PII")
            processed.append(
                {
                    "document": _data["document"],
                    "token": token_idx,
                    "label": "B-URL_PERSONAL",
                    "token_str": token
                }
            )
            pairs.add((_data["document"], token_idx))
        else:
            print("The above is not PII")

Found URL: https://en.wikipedia.org/wiki/Homo_economicus
The above is in the whitelist
Found URL: https://cyberleninka.ru/article/n/stremlenie-
The above is not PII
Found URL: http://www.intelros.ru/readroom/credo_new/k3-2018/36398-eticheskoe-
The above is not PII
Found URL: uchenie-asmita-v-kontekste-sovremennoy-ekonomiki.html
The above is not PII
Found URL: https://cyberleninka.ru/article/n/14398333
The above is not PII
Found URL: https://cyberleninka.ru/article/n/stremlenie-k-spravedlivomu-sotrudnichestvu-kak-
The above is not PII
Found URL: https://econweb.ucsd.edu/~jandreon/WorkingPapers/Philanthropy.pdf
The above is not PII
Found URL: muenchen.de/team/vorstandssprecher/schmidt/publikationen/papers/reciprocalt.pdf
The above is not PII


In [24]:
# Naming the columns up front keeps them present even when `processed` is
# empty, where selecting them afterwards would raise KeyError.
temp = pd.DataFrame(processed, columns=["document", "token", "label"])
# Same guard as for the regex hits: only de-duplicate when there are rows.
if not temp.empty:
    temp = remove_duplicates(temp)

Remove duplicates


In [25]:
# Append the latest predictions and restore document/token order in one step.
final_df = pd.concat([final_df, temp]).sort_values(by=['document', 'token'])

In [26]:
# One label per (document, token): keep the first row in the current order,
# number the rows, and lay the columns out in submission order.
final_df = (
    final_df
    .drop_duplicates(subset=['document', 'token'], keep='first')
    .assign(row_id=lambda frame: list(range(len(frame))))
    .loc[:, ["row_id", "document", "token", "label"]]
)
# Create submission df
final_df.to_csv("submission.csv", index=False)
final_df.head()

Unnamed: 0,row_id,document,token,label
0,0,7,9,B-NAME_STUDENT
1,1,7,10,I-NAME_STUDENT
2,2,7,482,B-NAME_STUDENT
3,3,7,483,I-NAME_STUDENT
4,4,7,741,B-NAME_STUDENT
