In [1]:
# Import necessary libraries
import json
import pandas as pd
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModelForTokenClassification, Trainer, TrainingArguments, DataCollatorForTokenClassification
from datasets import Dataset


2024-02-09 13:04:05.752327: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-02-09 13:04:05.752426: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-02-09 13:04:05.885557: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
# Define constants
INFERENCE_MAX_LENGTH = 2048
MODEL_PATH = '/kaggle/input/915-deberta3base-training/deberta3base_1024/'
COMPETITION_LABELS = ['NAME_STUDENT', 'EMAIL', 'USERNAME', 'ID_NUM', 'PHONE_NUM', 'URL_PERSONAL', 'STREET_ADDRESS']
THRESHOLD = 0.9


In [3]:
# Function to tokenize the data
def tokenize(example, tokenizer):
    text = []
    token_map = []
    
    idx = 0
    
    for t, ws in zip(example["tokens"], example["trailing_whitespace"]):
        
        text.append(t)
        token_map.extend([idx]*len(t))
        if ws:
            text.append(" ")
            token_map.append(-1)
            
        idx += 1
        
    tokenized = tokenizer("".join(text), return_offsets_mapping=True, truncation=True, max_length=INFERENCE_MAX_LENGTH)
    
    return {
        **tokenized,
        "token_map": token_map,
    }

In [4]:
# Load the data
data = json.load(open("/kaggle/input/pii-detection-removal-from-educational-data/test.json"))
# Modified dataset loading to include additional fields as per the provided description
ds = Dataset.from_dict({
    "document": [x["document"] for x in data],
    "full_text": [x["full_text"] for x in data],  # Added 'full_text' field
    "tokens": [x["tokens"] for x in data],
    "trailing_whitespace": [x["trailing_whitespace"] for x in data],
})


In [5]:
# Initialize the tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)

In [6]:
# Tokenize the dataset
ds = ds.map(tokenize, fn_kwargs={"tokenizer": tokenizer}, num_proc=2)


   

#0:   0%|          | 0/5 [00:00<?, ?ex/s]

 

#1:   0%|          | 0/5 [00:00<?, ?ex/s]

In [7]:
# Initialize a list to store predictions from different models
all_preds = []

# Loop through different model paths
for model_path in [
    '/kaggle/input/915-deberta3base-training/deberta3base_1024/',
    '/kaggle/input/piidd-deberta-model-starter-training/output/checkpoint-1362'
]:
    tokenizer = AutoTokenizer.from_pretrained(model_path)

    model = AutoModelForTokenClassification.from_pretrained(model_path)
    collator = DataCollatorForTokenClassification(tokenizer, pad_to_multiple_of=16)
    args = TrainingArguments(
        ".", 
        per_device_eval_batch_size=1, 
        report_to="none",
    )
    trainer = Trainer(
        model=model, 
        args=args, 
        data_collator=collator, 
        tokenizer=tokenizer,
    )
    # Get predictions from the model
    predictions = trainer.predict(ds).predictions
    all_preds.append(torch.softmax(torch.tensor(predictions), dim=-1).numpy())
    del model, trainer
    torch.cuda.empty_cache()
    

In [8]:
# Aggregate predictions from different models
predictions = np.mean(all_preds, axis=0)

In [9]:
from pathlib import Path

# Load configuration file to get label mappings
config = json.load(open(Path(MODEL_PATH) / "config.json"))
id2label = config["id2label"]



In [10]:
# Define the competition labels including "O"
COMPETITION_LABELS = ['NAME_STUDENT', 'EMAIL', 'USERNAME', 'ID_NUM', 'PHONE_NUM', 'URL_PERSONAL', 'STREET_ADDRESS', 'O']

# Post-process predictions
preds = np.argmax(predictions, axis=-1)
O_preds = predictions[:, :, COMPETITION_LABELS.index("O")]



In [11]:
# Set threshold for confidence
preds_final = np.where(O_preds < THRESHOLD, preds, COMPETITION_LABELS.index("O"))


In [12]:
# Initialize lists to store extracted triplets
triplets = []
document, token, label, token_str = [], [], [], []

# Extract triplets from predictions
for p, token_map, offsets, tokens, doc in zip(preds_final, ds["token_map"], ds["offset_mapping"], ds["tokens"], ds["document"]):
    for token_pred, (start_idx, end_idx) in zip(p, offsets):
        label_pred = id2label[str(token_pred)]

        if start_idx + end_idx == 0:
            continue

        if token_map[start_idx] == -1:
            start_idx += 1

        # ignore "\n\n"
        while start_idx < len(token_map) and tokens[token_map[start_idx]].isspace():
            start_idx += 1

        if start_idx >= len(token_map):
            break

        token_id = token_map[start_idx]

        # ignore "O" predictions and whitespace preds
        if label_pred != "O" and token_id != -1:
            triplet = (label_pred, token_id, tokens[token_id])

            if triplet not in triplets:
                document.append(doc)
                token.append(token_id)
                label.append(label_pred)
                token_str.append(tokens[token_id])
                triplets.append(triplet)

In [13]:
# Create DataFrame for submission
df = pd.DataFrame({
    "document": document,
    "token": token,
    "label": label,
    "token_str": token_str
})
df["row_id"] = list(range(len(df)))

In [14]:
# Display first 100 rows of DataFrame
display(df.head(100))
df[["row_id", "document", "token", "label"]].to_csv("submission.csv", index=False)


Unnamed: 0,document,token,label,token_str,row_id
0,7,9,B-NAME_STUDENT,Nathalie,0
1,7,10,I-NAME_STUDENT,Sylla,1
2,7,482,B-NAME_STUDENT,Nathalie,2
3,7,483,I-NAME_STUDENT,Sylla,3
4,7,741,B-NAME_STUDENT,Nathalie,4
5,7,742,I-NAME_STUDENT,Sylla,5
6,10,0,B-NAME_STUDENT,Diego,6
7,10,1,I-NAME_STUDENT,Estrada,7
8,10,464,B-NAME_STUDENT,Diego,8
9,10,465,I-NAME_STUDENT,Estrada,9
