In [None]:
import pandas as pd
import ast
from datasets import Dataset
from sklearn.model_selection import train_test_split

# Load the raw dataset from "dataset.csv" (path is relative to the working directory).
df = pd.read_csv("dataset.csv")
# Preview the first rows as the cell output.
df.head()


In [None]:
def try_parse_list(cell):
    """Turn a string cell into the Python literal it spells out.

    Args:
        cell: A single DataFrame cell value.

    Returns:
        The result of ``ast.literal_eval(cell)`` when ``cell`` is a string;
        otherwise ``cell`` itself, unchanged.

    Raises:
        ValueError, SyntaxError: Propagated from ``ast.literal_eval`` when the
            string is not a valid Python literal.
    """
    # Anything that is not a string is passed through untouched.
    if not isinstance(cell, str):
        return cell
    return ast.literal_eval(cell)

# Parse both sequence columns cell by cell (presumably they are stored in the
# CSV as stringified Python lists -- confirm against the data).
for column in ("Tokenised Filled Template", "Tokens"):
    df[column] = df[column].apply(try_parse_list)
df.head()

In [None]:
# Report every row whose token sequence and label sequence differ in length.
counter = 0
for idx, tokens, tags in zip(df.index, df["Tokenised Filled Template"], df["Tokens"]):
    if len(tokens) == len(tags):
        continue
    print(f"Mismatch at index {idx}")
    counter += 1
print(f"Total mismatches found: {counter}")

In [None]:
# Keep only rows where tokens and labels match in length. `.copy()` detaches
# the result from the original frame so that adding columns to it later does
# not raise pandas' SettingWithCopyWarning.
lengths_match = df["Tokenised Filled Template"].str.len() == df["Tokens"].str.len()
df = df[lengths_match].copy()

# Sanity check: recount mismatches on the filtered frame (expected: 0).
still_mismatched = df.index[
    df["Tokenised Filled Template"].str.len() != df["Tokens"].str.len()
]
for idx in still_mismatched:
    print(f"Mismatch at index {idx}")
counter = len(still_mismatched)
print(f"Total mismatches found: {counter}")

In [None]:
# Collect every distinct tag that occurs in the label sequences.
all_tags = {tag for tags in df["Tokens"] for tag in tags}
unique_tags = sorted(all_tags)

# Sorted position <-> tag string lookups, in both directions.
label2id = {tag: idx for idx, tag in enumerate(unique_tags)}
id2label = dict(enumerate(unique_tags))

# Encode each row's tag sequence as the corresponding list of integer ids.
df["Label_ids"] = df["Tokens"].apply(
    lambda tags: list(map(label2id.__getitem__, tags))
)

# Hold out 10% of the rows for validation; the fixed seed keeps the split reproducible.
train_df, val_df = train_test_split(df, test_size=0.1, random_state=42)

# DataFrame -> Hugging Face Dataset conversion
def to_hf_dataset(dataframe):
    """Build a ``Dataset`` with ``tokens`` and ``labels`` columns from a frame.

    Args:
        dataframe: Frame providing the "Tokenised Filled Template" and
            "Label_ids" columns.

    Returns:
        The object produced by ``Dataset.from_dict`` for those two columns.
    """
    columns = {
        "tokens": dataframe["Tokenised Filled Template"].tolist(),
        "labels": dataframe["Label_ids"].tolist(),
    }
    return Dataset.from_dict(columns)

# Convert the train split first, then the validation split.
train_dataset, val_dataset = map(to_hf_dataset, (train_df, val_df))


In [None]:
from transformers import BertTokenizerFast, BertForTokenClassification
# Checkpoint used for both tokenizer and model.
# Alternative checkpoint (currently disabled): "prajjwal1/bert-tiny"
model_checkpoint = "huawei-noah/TinyBERT_General_4L_312D"
tokenizer = BertTokenizerFast.from_pretrained(model_checkpoint)

# Token-classification model with one output class per tag; the label
# mappings are passed along with the class count.
model = BertForTokenClassification.from_pretrained(
    model_checkpoint,
    label2id=label2id,
    id2label=id2label,
    num_labels=len(unique_tags),
)

In [None]:
def _align_labels_with_word_ids(word_ids, word_labels, ignore_index=-100):
    """Expand word-level labels to one label per tokenizer output position.

    Args:
        word_ids: Sequence giving, for every output position, the index of the
            source word, or ``None`` when the position has no source word.
        word_labels: Label ids indexed by word position.
        ignore_index: Value written at positions that should not carry a label.

    Returns:
        A list the same length as ``word_ids``. The first position of each word
        receives that word's label; positions with no word and positions that
        repeat the previous word index receive ``ignore_index``.
    """
    aligned = []
    previous_word_idx = None
    for word_idx in word_ids:
        if word_idx is None or word_idx == previous_word_idx:
            # No source word, or a continuation piece of the previous word.
            aligned.append(ignore_index)
        else:
            aligned.append(word_labels[word_idx])
        previous_word_idx = word_idx
    return aligned


def tokenize_and_align_labels(examples, max_length=128):
    """Tokenize a batch of pre-split sentences and align labels to the output.

    Args:
        examples: Batch mapping with "tokens" (list of word lists) and
            "labels" (list of per-word label-id lists).
        max_length: Length every sequence is truncated/padded to. Defaults to
            128, the value previously hard-coded here.

    Returns:
        The tokenizer output with an added "labels" entry holding one aligned
        label list per example.
    """
    tokenized_inputs = tokenizer(
        examples["tokens"],
        is_split_into_words=True,
        truncation=True,
        padding="max_length",   # ensure uniform tensor sizes
        max_length=max_length,
        return_attention_mask=True,
    )

    tokenized_inputs["labels"] = [
        _align_labels_with_word_ids(
            tokenized_inputs.word_ids(batch_index=i), word_labels
        )
        for i, word_labels in enumerate(examples["labels"])
    ]
    return tokenized_inputs

# Tokenize and label-align both splits in batched mode (train first, then validation).
train_dataset, val_dataset = (
    split.map(tokenize_and_align_labels, batched=True)
    for split in (train_dataset, val_dataset)
)

# Return only these three columns, formatted as torch tensors.
for split in (train_dataset, val_dataset):
    split.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

In [None]:
from transformers import TrainingArguments, Trainer
import numpy as np
from seqeval.metrics import accuracy_score, precision_score, recall_score, f1_score

def compute_metrics(predictions_output):
    """Score predictions against gold labels with seqeval.

    Args:
        predictions_output: Pair that unpacks into ``(preds, labels)``.
            ``preds`` is reduced with ``argmax`` over axis 2 (assumes a
            (batch, sequence, class) layout -- TODO confirm); ``labels`` holds
            label ids, with -100 marking positions to skip.

    Returns:
        Dict with "accuracy", "precision", "recall" and "f1", each computed by
        the matching seqeval function on the tag-string sequences.
    """
    scores, gold_ids = predictions_output
    predicted_ids = np.argmax(scores, axis=2)

    true_preds, true_labels = [], []
    for pred_seq, label_seq in zip(predicted_ids, gold_ids):
        # Drop every position whose gold label is the -100 sentinel.
        kept = [(p, l) for p, l in zip(pred_seq, label_seq) if l != -100]
        true_preds.append([id2label[p] for p, _ in kept])
        true_labels.append([id2label[l] for _, l in kept])

    scorers = {
        "accuracy": accuracy_score,
        "precision": precision_score,
        "recall": recall_score,
        "f1": f1_score,
    }
    return {name: scorer(true_labels, true_preds) for name, scorer in scorers.items()}

# NOTE(review): newer transformers releases appear to rename
# `evaluation_strategy` to `eval_strategy` -- confirm against the installed version.
training_args = TrainingArguments(
    # Output locations
    output_dir="./ner_output",
    logging_dir="./logs",
    # Evaluation / checkpoint / logging cadence
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    logging_steps=10,
    # Optimisation hyper-parameters
    num_train_epochs=5,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
)


In [None]:
# Wire the model, data splits, tokenizer and metric function into a Trainer.
# NOTE(review): newer transformers releases may deprecate the `tokenizer`
# argument -- confirm against the installed version.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)

# Run training; the returned value is shown as the cell output.
trainer.train()


In [None]:
# Write the trained model and the tokenizer to the same directory, which the
# inference pipeline below loads both from.
trainer.save_model("./final_ner_model")
tokenizer.save_pretrained("./final_ner_model")

In [None]:
from transformers import pipeline

# Build an NER pipeline from the saved model directory.
ner_pipeline = pipeline(
    "ner",
    model="./final_ner_model",
    tokenizer="./final_ner_model",
    aggregation_strategy="simple",
)

# Smoke test on a single example sentence.
text = "Dr. Marvin Rolfson and Julius Daugherty attended the arbitration."
print(ner_pipeline(text))
