In [10]:
import re
import pandas as pd

# Read your conversation text file
with open("/content/human_chat.txt", "r", encoding="utf-8") as f:
    text = f.read()

# Extract all dialogue lines after "Human 1:" or "Human 2:"
lines = re.findall(r'Human\s*\d+:\s*(.*)', text)

# Clean up (remove leading/trailing spaces, empty lines)
lines = [line.strip() for line in lines if line.strip()]

# Convert into dictionary format
data = {"text": lines}

df = pd.DataFrame(data)

In [None]:
data

In [12]:
import spacy

nlp = spacy.load("en_core_web_sm")

records = []
for text in df["text"]:
    doc = nlp(text)
    words = [token.text for token in doc]
    tags = []
    for token in doc:
        ent = token.ent_iob_
        if ent == "O":
            tags.append("O")
        else:
            tags.append(f"{ent}-{token.ent_type_}")
    records.append({"Word": words, "Tag": tags})

ner_df = pd.DataFrame(records)

In [13]:
ner_df

Unnamed: 0,Word,Tag
0,"[Hi, !]","[O, O]"
1,"[What, is, your, favorite, holiday, ?]","[O, O, O, O, O, O]"
2,"[one, where, I, get, to, meet, lots, of, diffe...","[O, O, O, O, O, O, O, O, O, O, O]"
3,"[What, was, the, most, number, of, people, you...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O]"
4,"[Hard, to, keep, a, count, ., Maybe, 25, .]","[O, O, O, O, O, O, O, B-CARDINAL, O]"
...,...,...
1489,"[Where, would, you, most, like, to, go, ,, if,...","[O, O, O, O, O, O, O, O, O, O, O, O]"
1490,"[Fly, to, the, moon, :), Haha]","[O, O, O, O, O, O]"
1491,"[Wow, ,, cool, !, I, think, space, tourism, is...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, B-D..."
1492,"[Yep, !, I, believe, there, is, a, day, for, t...","[O, O, O, O, O, O, O, O, O, O]"


In [14]:
tags = []
for tag in ner_df['Tag'].to_list():
    tags.extend(tag)
print('Entities in our data set')
set(tags)

Entities in our data set


{'B-CARDINAL',
 'B-DATE',
 'B-EVENT',
 'B-FAC',
 'B-GPE',
 'B-LANGUAGE',
 'B-LOC',
 'B-NORP',
 'B-ORDINAL',
 'B-ORG',
 'B-PERCENT',
 'B-PERSON',
 'B-PRODUCT',
 'B-QUANTITY',
 'B-TIME',
 'B-WORK_OF_ART',
 'I-CARDINAL',
 'I-DATE',
 'I-EVENT',
 'I-GPE',
 'I-LOC',
 'I-ORG',
 'I-PERCENT',
 'I-PERSON',
 'I-PRODUCT',
 'I-QUANTITY',
 'I-TIME',
 'I-WORK_OF_ART',
 'O'}

In [15]:
from transformers import AutoModelForTokenClassification

model_checkpoint = "dslim/bert-base-NER"
model = AutoModelForTokenClassification.from_pretrained(model_checkpoint)

print('Entities from the pretrained model')
model.config.id2label

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/829 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/433M [00:00<?, ?B/s]

Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Entities from the pretrained model


{0: 'O',
 1: 'B-MISC',
 2: 'I-MISC',
 3: 'B-PER',
 4: 'I-PER',
 5: 'B-ORG',
 6: 'I-ORG',
 7: 'B-LOC',
 8: 'I-LOC'}

# Named Entity Recognition and Masking for Conversation Data

In [16]:
entity_mapping = {
    # Outside
    'O': 0,

    # Person
    'B-PERSON': 3,
    'I-PERSON': 4,

    # Organization
    'B-ORG': 5,
    'I-ORG': 6,

    # Location / GPE / FAC (map to LOC)
    'B-LOC': 7,
    'I-LOC': 8,
    'B-GPE': 7,
    'I-GPE': 8,
    'B-FAC': 7,
    'I-FAC': 8,

    # Miscellaneous: events, art, products, etc.
    'B-EVENT': 1,
    'I-EVENT': 2,
    'B-WORK_OF_ART': 1,
    'I-WORK_OF_ART': 2,
    'B-PRODUCT': 1,
    'I-PRODUCT': 2,
    'B-LANGUAGE': 1,
    'I-LANGUAGE': 2,
    'B-NORP': 1,
    'I-NORP': 2,
    'B-DATE': 1,
    'I-DATE': 2,
    'B-TIME': 1,
    'I-TIME': 2,
    'B-CARDINAL': 1,
    'I-CARDINAL': 2,
    'B-ORDINAL': 1,
    'I-ORDINAL': 2,
    'B-PERCENT': 1,
    'I-PERCENT': 2,
    'B-QUANTITY': 1,
    'I-QUANTITY': 2,
}


In [19]:
import torch

class NERDataset:
    def __init__(self, df):
        # input is annotated data frame
        self.texts = df['Word'].to_list()
        self.tags = df['Tag'].to_list()

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        text = self.texts[item]
        tags = self.tags[item]

        ids = []
        target_tag =[]

        # tokenize words and define tags accordingly
        # running -> [run, ##ning]
        # tags - ['O', 'O']
        for i, s in enumerate(text):
            inputs = tokenizer.encode(s, add_special_tokens=False)
            input_len = len(inputs)
            ids.extend(inputs)
            target_tag.extend([entity_mapping[tags[i]]] * input_len)

        # truncate
        ids = ids[:MAX_LEN - 2]
        target_tag = target_tag[:MAX_LEN - 2]

        # add special tokens
        ids = [101] + ids + [102]
        target_tag = [0] + target_tag + [0]
        mask = [1] * len(ids)
        token_type_ids = [0] * len(ids)

        # construct padding
        padding_len = MAX_LEN - len(ids)
        ids = ids + ([0] * padding_len)
        mask = mask + ([0] * padding_len)
        token_type_ids = token_type_ids + ([0] * padding_len)
        target_tag = target_tag + ([0] * padding_len)

        return {'input_ids': torch.tensor(ids, dtype=torch.long),
                'attention_mask': torch.tensor(mask, dtype=torch.long),
                'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
                'labels': torch.tensor(target_tag, dtype=torch.long)
               }

In [20]:
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer
import torch

df_train, df_val = train_test_split(ner_df, test_size=0.2, random_state=42)
df_val, df_test = train_test_split(df_val, test_size=0.5, random_state=42)

model_checkpoint = "dslim/bert-base-NER"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

MAX_LEN = 128

data_train = NERDataset(df_train)
data_val = NERDataset(df_val)
data_test = NERDataset(df_test)

# initialize DataLoader used to return batches for training/validation
loader_train = torch.utils.data.DataLoader(
    data_train, batch_size=32, num_workers=4
)

loader_val = torch.utils.data.DataLoader(
    data_val, batch_size=32, num_workers=4
)

tokenizer_config.json:   0%|          | 0.00/59.0 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]



In [26]:
from transformers import AutoModelForTokenClassification, get_scheduler
from torch.optim import AdamW
from tqdm.notebook import tqdm
from sklearn.metrics import f1_score, accuracy_score
import numpy as np
import warnings

warnings.filterwarnings("ignore")

model = AutoModelForTokenClassification.from_pretrained(model_checkpoint)

# just train the linear classifier on top of BERT
param_optimizer = list(model.classifier.named_parameters())
optimizer_grouped_parameters = [{"params": [p for n, p in param_optimizer]}]
optimizer = AdamW(
    optimizer_grouped_parameters,
    lr=3e-5,
    eps=1e-12
)
## full finetuning
#optimizer = AdamW(model.parameters())

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

# add scheduler to linearly reduce the learning rate throughout the epochs.
num_epochs = 3
num_training_steps = num_epochs * len(loader_train)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps
)


progress_bar = tqdm(range(num_training_steps))
for epoch in range(num_epochs):
    model.train()
    final_loss = 0
    predictions , true_labels = [], []
    for batch in loader_train:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        true_labels.extend(batch['labels'].detach().cpu().numpy().ravel())
        predictions.extend(np.argmax(outputs[1].detach().cpu().numpy(), axis=2).ravel())

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)
        final_loss+=loss.item()

    print(f'Training loss: {final_loss/len(loader_train)}')
    print('Training F1: {}'.format(f1_score(predictions, true_labels, average='macro')))
    print(f'Training acc: {accuracy_score(predictions, true_labels)}')
    print('*'*20)

    model.eval()
    final_loss = 0
    predictions , true_labels = [], []
    for batch in loader_val:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        final_loss+=outputs.loss.item()
        true_labels.extend(batch['labels'].detach().cpu().numpy().ravel())
        predictions.extend(np.argmax(outputs[1].detach().cpu().numpy(), axis=2).ravel())
    print(f'Validation loss: {final_loss/len(loader_val)}')
    print('Vallidation F1: {}'.format(f1_score(predictions, true_labels, average='macro')))
    print(f'Validaton acc: {accuracy_score(predictions, true_labels)}')
    print('*'*20)

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--dslim--bert-base-NER/snapshots/d1a3e8f13f8c3566299d95fcfc9a8d2382a9affc/config.json
Model config BertConfig {
  "_num_labels": 9,
  "architectures": [
    "BertForTokenClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "O",
    "1": "B-MISC",
    "2": "I-MISC",
    "3": "B-PER",
    "4": "I-PER",
    "5": "B-ORG",
    "6": "I-ORG",
    "7": "B-LOC",
    "8": "I-LOC"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "B-LOC": 7,
    "B-MISC": 1,
    "B-ORG": 5,
    "B-PER": 3,
    "I-LOC": 8,
    "I-MISC": 2,
    "I-ORG": 6,
    "I-PER": 4,
    "O": 0
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_past": true,
  "pad_token

  0%|          | 0/114 [00:00<?, ?it/s]

Training loss: 0.08575797139814026
Training F1: 0.26552593204701136
Training acc: 0.9869312238493724
********************
Validation loss: 0.11709091365337372
Vallidation F1: 0.23790601493649013
Validaton acc: 0.9814911912751678
********************
Training loss: 0.08320315932168772
Training F1: 0.25918451605268333
Training acc: 0.9873627092050209
********************
Validation loss: 0.11511629596352577
Vallidation F1: 0.23801141409732274
Validaton acc: 0.981700922818792
********************
Training loss: 0.08246084743816602
Training F1: 0.27153905189002925
Training acc: 0.9873430962343096
********************
Validation loss: 0.11450864151120185
Vallidation F1: 0.23801141409732274
Validaton acc: 0.981700922818792
********************


In [35]:
import numpy as np

# test the model
test_sentence = """
Mr. Trump’s tweets began just moments after a Fox News report by Mike Tobin, a
reporter for the network, about protests in Minnesota and elsewhere.
"""
tokenized_sentence = tokenizer.encode(test_sentence)
input_ids = torch.tensor([tokenized_sentence]).cuda()
with torch.no_grad():
    output = model(input_ids)
label_indices = np.argmax(output[0].to('cpu').numpy(), axis=2)
# join bpe split tokens
tokens = tokenizer.convert_ids_to_tokens(input_ids.to('cpu').numpy()[0])
new_tokens, new_labels = [], []

for token, label_idx in zip(tokens, label_indices[0]):
    if token.startswith("##"):
        new_tokens[-1] = new_tokens[-1] + token[2:]
    else:
        new_labels.append(label_idx)
        new_tokens.append(token)

for token, label in zip(new_tokens, new_labels):
    print("{}\t{}".format(model.config.id2label[label], token))

O	[CLS]
O	Mr
O	.
B-PER	Trump
O	’
O	s
O	tweets
O	began
O	just
O	moments
O	after
O	a
B-ORG	Fox
I-ORG	News
O	report
O	by
B-PER	Mike
I-PER	Tobin
O	,
O	a
O	reporter
O	for
O	the
O	network
O	,
O	about
O	protests
O	in
B-LOC	Minnesota
O	and
O	elsewhere
O	.
O	[SEP]


## Masking subject


In [39]:
masked_tokens = []

for token, label in zip(new_tokens, new_labels):
    entity = model.config.id2label[label]  # get string label
    if entity in ["B-PER", "I-PER"]:
        # Replace PERSON tokens with a placeholder
        if len(masked_tokens) == 0 or masked_tokens[-1] != "[PERSON]":
            masked_tokens.append("[PERSON]")
        # else: skip consecutive I-PER tokens
    else:
        masked_tokens.append(token)
# Join back into a sentence
masked_sentence = " ".join(masked_tokens)
print(masked_sentence)


[CLS] Mr . [PERSON] ’ s tweets began just moments after a Fox News report by [PERSON] , a reporter for the network , about protests in Minnesota and elsewhere . [SEP]
