## Preprocessing

In [5]:
import shutil
import os

# Stage the read-only input dataset into the writable working directory.
src, dst = "/kaggle/input/plivo-assignment/pii_ner_assignment", "/kaggle/working"

# dirs_exist_ok=True lets copytree write into a destination directory that
# already exists instead of raising.
shutil.copytree(src=src, dst=dst, dirs_exist_ok=True)

print("Copied dataset to /kaggle/working")

Copied dataset to /kaggle/working


In [6]:
import shutil

# Copy the dev and train JSONL files from the working-directory root into
# data/, replacing whatever file is already at the destination.
copy_jobs = [
    ("/kaggle/working/dev.jsonl", "/kaggle/working/data/dev.jsonl"),
    ("/kaggle/working/train.jsonl", "/kaggle/working/data/train.jsonl"),
]

for step, (src, dst) in enumerate(copy_jobs, start=1):
    shutil.copy(src, dst)   # overwrites existing
    print(f"File replaced{step}.")


File replaced1.
File replaced2.


In [8]:
import os

# Remove the root-level copies of the JSONL files; a missing file is reported
# rather than treated as an error.
for file_path in ("/kaggle/working/train.jsonl", "/kaggle/working/dev.jsonl"):
    if not os.path.exists(file_path):
        print("File not found")
        continue

    os.remove(file_path)
    print("Deleted:", file_path)

File not found
Deleted: /kaggle/working/dev.jsonl


In [16]:
import shutil

# Replace three scripts under the writable src/ directory with the versions
# stored in the read-only "myfiles" input, in the order model, predict, train.
script_overrides = (
    ("/kaggle/input/myfiles/model.py", "/kaggle/working/src/model.py"),
    ("/kaggle/input/myfiles/predict.py", "/kaggle/working/src/predict.py"),
    ("/kaggle/input/myfiles/train.py", "/kaggle/working/src/train.py"),
)

for src, dst in script_overrides:
    shutil.copy(src, dst)  # This overwrites the file
    print("File overwritten.")

File overwritten.
File overwritten.
File overwritten.


In [21]:
!zip -r project_submission.zip /kaggle/working

  adding: kaggle/working/ (stored 0%)
  adding: kaggle/working/.virtual_documents/ (stored 0%)
  adding: kaggle/working/src/ (stored 0%)
  adding: kaggle/working/src/measure_latency.py (deflated 64%)
  adding: kaggle/working/src/eval_span_f1.py (deflated 70%)
  adding: kaggle/working/src/train.py (deflated 64%)
  adding: kaggle/working/src/model.py (deflated 49%)
  adding: kaggle/working/src/dataset.py (deflated 70%)
  adding: kaggle/working/src/predict.py (deflated 69%)
  adding: kaggle/working/src/__pycache__/ (stored 0%)
  adding: kaggle/working/src/__pycache__/dataset.cpython-311.pyc (deflated 53%)
  adding: kaggle/working/src/__pycache__/model.cpython-311.pyc (deflated 35%)
  adding: kaggle/working/src/__pycache__/labels.cpython-311.pyc (deflated 40%)
  adding: kaggle/working/src/labels.py (deflated 52%)
  adding: kaggle/working/README.md (deflated 52%)
  adding: kaggle/working/out_synth/ (stored 0%)
  adding: kaggle/working/out_synth/config.json (deflated 56%)
  adding: kaggle/wo

## Generate Train and Dev Samples

In [4]:
import json, random

# -----------------------------
# Helpers
# -----------------------------
def random_name():
    """Return a random lowercase name of the form "<first> <last>".

    Both parts are drawn independently from small fixed pools, so repeated
    combinations such as "sharma sharma" are possible.
    """
    first_pool = (
        "ramesh", "suresh", "priyanka", "rohan", "mehta", "john", "sharma",
        "verma", "arjun", "neha", "amit", "rahul", "kiran", "deepak",
    )
    last_pool = (
        "kumar", "sharma", "verma", "mehta", "reddy", "iyer", "patel", "singh",
    )

    # The first part is drawn before the last part, matching the RNG call order.
    given = random.choice(first_pool)
    family = random.choice(last_pool)
    return " ".join((given, family))

def random_email(name):
    """Return a spoken-style e-mail address built from ``name``.

    Spaces are stripped from ``name`` to form the user part, and the result is
    spelled out as "<user> at <domain> dot com" with a randomly chosen domain.
    """
    provider = random.choice(["gmail", "yahoo", "outlook", "hotmail"])
    local_part = "".join(name.split(" "))
    return local_part + " at " + provider + " dot com"

def random_phone():
    """Return a string of 10 random decimal digits (leading zeros allowed)."""
    digits = [str(random.randint(0, 9)) for _ in range(10)]
    return "".join(digits)

def random_credit():
    """Return a card-like number as four space-separated 4-digit groups.

    The first group lies in 4000-5999; the remaining three lie in 1000-9999.
    """
    leading_group = random.randint(4000, 5999)
    trailing_groups = [random.randint(1000, 9999) for _ in range(3)]
    return " ".join(str(group) for group in [leading_group, *trailing_groups])

def random_city():
    """Return one lowercase city name picked at random from a fixed list."""
    cities = (
        "mumbai", "delhi", "chennai", "pune", "kolkata", "hyderabad", "bengaluru",
    )
    return random.choice(cities)

def random_date():
    """Return a random date string formatted as "DD MM 20YY".

    Day is 01-28, month is 01-12 (both zero-padded) and the year is 2023-2030.
    """
    day = random.randint(1, 28)
    month = random.randint(1, 12)
    year_suffix = random.randint(23, 30)
    return "{:02d} {:02d} 20{}".format(day, month, year_suffix)


# -----------------------------
# Generate a sample
# -----------------------------
def make_sample(uid):
    """Build one synthetic utterance with character-level entity spans.

    Args:
        uid: Identifier stored under the "id" key of the returned record.

    Returns:
        A dict with keys "id", "text" and "entities". "entities" is a list of
        {"start", "end", "label"} dicts in utterance order, where
        text[start:end] is exactly the generated value for that label.
    """
    # Draw the values in a fixed order so the RNG stream is consumed the same
    # way on every call.
    name = random_name()
    email = random_email(name)
    phone = random_phone()
    card = random_credit()
    city = random_city()
    date = random_date()

    # (carrier phrase, label, value) in the order they appear in the utterance.
    segments = [
        ("my name is ", "PERSON_NAME", name),
        ("my email is ", "EMAIL", email),
        ("my number is ", "PHONE", phone),
        ("i live in ", "CITY", city),
        ("i will travel on ", "DATE", date),
        ("my card is ", "CREDIT_CARD", card),
    ]

    pieces = []
    entities = []
    cursor = 0
    for position, (carrier, label, value) in enumerate(segments):
        if position:
            # Segments are separated by a single space.
            pieces.append(" ")
            cursor += 1
        pieces.append(carrier)
        cursor += len(carrier)

        # Record the span from the running offset instead of searching the
        # finished text: a first-occurrence search would return the wrong
        # span whenever a value also appears earlier in the utterance.
        entities.append({"start": cursor, "end": cursor + len(value), "label": label})
        pieces.append(value)
        cursor += len(value)

    text = "".join(pieces)

    return {"id": uid, "text": text, "entities": entities}


# -----------------------------
# Create the new datasets
# -----------------------------
# Seed the generator so the synthetic splits are identical on every run.
random.seed(42)

train = [make_sample(f"utt_synth_{i:04d}") for i in range(1, 801)]
# Dev ids continue right after the 800 training ids (0801-0950).
dev = [make_sample(f"utt_synth_{800+i:04d}") for i in range(1, 151)]

# -----------------------------
# SAVE NEW FILES
# Mode "w" overwrites any existing train.jsonl / dev.jsonl in the current
# directory. The files are written next to the notebook, not under data/.
# -----------------------------
for out_path, samples in (("train.jsonl", train), ("dev.jsonl", dev)):
    with open(out_path, "w", encoding="utf-8") as f:
        for item in samples:
            f.write(json.dumps(item) + "\n")
    # Report the path actually written and the real number of samples.
    print(f"Generated {out_path} ({len(samples)} samples)")


Generated data/train.jsonl (800 samples)
Generated data/dev.jsonl (150 samples)


### Updating model.py (MiniLM + Dropout)

## Train the Baseline Model

In [9]:
!pip install -r requirements.txt

Collecting seqeval (from -r requirements.txt (line 5))
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch->-r requirements.txt (line 1))
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch->-r requirements.txt (line 1))
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch->-r requirements.txt (line 1))
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch->-r requirements.txt (line 1))
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadat

In [17]:
# Run the training script on the train/dev JSONL files under data/, passing
# distilbert-base-uncased as the model name and out_synth (relative to the
# current directory) as the output directory.
# NOTE(review): the heading above this section mentions MiniLM, but
# --model_name here is distilbert-base-uncased — confirm which is intended.
!python /kaggle/working/src/train.py \
  --model_name distilbert-base-uncased \
  --train /kaggle/working/data/train.jsonl \
  --dev /kaggle/working/data/dev.jsonl \
  --out_dir out_synth


2025-11-23 14:21:50.563506: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1763907710.586139     308 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1763907710.593068     308 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'
AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'
AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'
AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'
AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'
Traceback (most recent call last):
  File "/kaggle/working/src/train.py", lin

## Prediction

In [18]:
# Run the prediction script with the model directory out_synth on the dev
# file and write the predictions to out_synth/dev_pred.json (both out_synth
# paths are relative to the current directory).
!python /kaggle/working/src/predict.py \
  --model_dir out_synth \
  --input /kaggle/working/data/dev.jsonl \
  --output out_synth/dev_pred.json


2025-11-23 14:22:18.144740: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1763907738.166537     325 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1763907738.173210     325 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'
AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'
AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'
AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'
AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'
Wrote predictions for 150 utterances to out_synth/dev_pred.json


## Evaluation

In [19]:
# Score the predictions file against the gold dev annotations with the
# span-F1 evaluation script.
!python /kaggle/working/src/eval_span_f1.py \
  --gold /kaggle/working/data/dev.jsonl \
  --pred /kaggle/working/out_synth/dev_pred.json


Per-entity metrics:
CITY            P=1.000 R=1.000 F1=1.000
CREDIT_CARD     P=1.000 R=1.000 F1=1.000
DATE            P=1.000 R=1.000 F1=1.000
EMAIL           P=1.000 R=1.000 F1=1.000
PERSON_NAME     P=1.000 R=1.000 F1=1.000
PHONE           P=1.000 R=1.000 F1=1.000

Macro-F1: 1.000

PII-only metrics: P=1.000 R=1.000 F1=1.000
Non-PII metrics: P=1.000 R=1.000 F1=1.000


## Measure Latency

In [20]:
# Run the latency script for 50 runs against the model in out_synth using the
# dev file as input.
# NOTE(review): every path here is relative, unlike the absolute paths in the
# other command cells — presumably this must run from /kaggle/working; confirm.
!python src/measure_latency.py \
  --model_dir out_synth \
  --input data/dev.jsonl \
  --runs 50


2025-11-23 14:22:45.784954: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1763907765.807827     345 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1763907765.814707     345 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'
AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'
AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'
AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'
AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'
Latency over 50 runs (batch_size=1):
  p50: 4.00 ms
  p95: 5.12 ms
