In [28]:
#import_libraries
import numpy as np
import pandas as pd
from transformers import BertTokenizer, AdamW, BertForTokenClassification
from torch.utils.data import TensorDataset, DataLoader, random_split
from tqdm import tqdm
import torch
from torch.optim import lr_scheduler
from sklearn.model_selection import train_test_split

In [29]:
# Define_entity_types_and_number_of_labels
entity_types = ["O", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC"]
num_labels = len(entity_types)

In [30]:
# Load pre-trained_BERT_tokenizer_and_model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForTokenClassification.from_pretrained('bert-base-uncased', num_labels=num_labels)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [31]:
# Define_training_parameters
batch_size = 32
learning_rate = 5e-5
num_epochs = 15

In [32]:
train_dataset_sample = [
    {"text": "John Works at Google in New York.", "labels": {"entities": [(0, 4, "PER"), (17, 23, "ORG"), (27, 35, "LOC")]}},
    {"text": "Apple Inc. is a technology company.", "labels": {"entities": [(0, 10, "ORG")]}},
]

In [33]:
def tokenize_and_format_data(dataset, tokenizer, entity_types):
    tokenized_data = []
    
    for sample in dataset:
        text = sample["text"]
        entities = sample["labels"]["entities"]
        
        # Tokenize_the_text
        tokens = tokenizer.tokenize(text)
        
        # Initialize_labels_for_each_token_as_'O'_(outside_of_any_entity)
        labels = ['O'] * len(tokens)
        
        # Update_labels_for_entity_spans
        for start, end, entity_type in entities:
            # Tokenize_the_prefix_to_get_correct_offset
            prefix_tokens = tokenizer.tokenize(text[:start])
            start_token = len(prefix_tokens)
            
            # Tokenize the_entity_to_get_its_length
            entity_tokens = tokenizer.tokenize(text[start:end])
            end_token = start_token + len(entity_tokens) - 1
            
            # Ensure start_token and end_token are within bounds
            if start_token < len(tokens):
                labels[start_token] = f"B-{entity_type}"
                for i in range(start_token + 1, min(end_token + 1, len(tokens))):
                    labels[i] = f"I-{entity_type}"
        
        # Convert tokens to input IDs
        input_ids = tokenizer.convert_tokens_to_ids(tokens)
        
        # Convert labels to label IDs
        label_ids = [entity_types.index(label) if label in entity_types else entity_types.index('O') for label in labels]
        
        # Padding for input IDs and labels to match the model's maximum length
        padding_length = tokenizer.model_max_length - len(input_ids)
        input_ids += [tokenizer.pad_token_id] * padding_length
        label_ids += [entity_types.index('O')] * padding_length
        
        tokenized_data.append({'input_ids': input_ids, 'labels': label_ids})
    
    # Convert lists to tensors
    input_ids_tensor = torch.tensor([item['input_ids'] for item in tokenized_data])
    label_ids_tensor = torch.tensor([item['labels'] for item in tokenized_data])
    
    # Create TensorDataset
    dataset = TensorDataset(input_ids_tensor, label_ids_tensor)
    
    return dataset

In [34]:
train_data = tokenize_and_format_data(train_dataset_sample, tokenizer, entity_types)
train_data_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)

# Fine-tune the model
optimizer = AdamW(model.parameters(), lr=learning_rate)




In [35]:
# Training loop
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for batch in tqdm(train_data_loader, desc=f"Epoch {epoch+1}/{num_epochs}"):
        batch_input_ids, batch_labels = batch
        batch_input_ids = batch_input_ids.to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))
        batch_labels = batch_labels.to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))
        
        optimizer.zero_grad()
        outputs = model(input_ids=batch_input_ids, labels=batch_labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
    
    avg_loss = total_loss / len(train_data_loader)
    print(f"Average loss for epoch {epoch+1}: {avg_loss}")

Epoch 1/15:   0%|                                         | 0/1 [00:00<?, ?it/s]We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.
Epoch 1/15: 100%|█████████████████████████████████| 1/1 [00:01<00:00,  1.66s/it]


Average loss for epoch 1: 2.348078966140747


Epoch 2/15: 100%|█████████████████████████████████| 1/1 [00:01<00:00,  1.56s/it]


Average loss for epoch 2: 0.5442062616348267


Epoch 3/15: 100%|█████████████████████████████████| 1/1 [00:01<00:00,  1.38s/it]


Average loss for epoch 3: 0.16310027241706848


Epoch 4/15: 100%|█████████████████████████████████| 1/1 [00:01<00:00,  1.48s/it]


Average loss for epoch 4: 0.13836759328842163


Epoch 5/15: 100%|█████████████████████████████████| 1/1 [00:01<00:00,  1.69s/it]


Average loss for epoch 5: 0.07499676942825317


Epoch 6/15: 100%|█████████████████████████████████| 1/1 [00:01<00:00,  1.30s/it]


Average loss for epoch 6: 0.06419733166694641


Epoch 7/15: 100%|█████████████████████████████████| 1/1 [00:01<00:00,  1.33s/it]


Average loss for epoch 7: 0.05740748718380928


Epoch 8/15: 100%|█████████████████████████████████| 1/1 [00:01<00:00,  1.33s/it]


Average loss for epoch 8: 0.05477806553244591


Epoch 9/15: 100%|█████████████████████████████████| 1/1 [00:01<00:00,  1.33s/it]


Average loss for epoch 9: 0.05383329838514328


Epoch 10/15: 100%|████████████████████████████████| 1/1 [00:01<00:00,  1.30s/it]


Average loss for epoch 10: 0.053104326128959656


Epoch 11/15: 100%|████████████████████████████████| 1/1 [00:01<00:00,  1.42s/it]


Average loss for epoch 11: 0.0510014146566391


Epoch 12/15: 100%|████████████████████████████████| 1/1 [00:01<00:00,  1.45s/it]


Average loss for epoch 12: 0.04861518740653992


Epoch 13/15: 100%|████████████████████████████████| 1/1 [00:01<00:00,  1.40s/it]


Average loss for epoch 13: 0.04677664116024971


Epoch 14/15: 100%|████████████████████████████████| 1/1 [00:01<00:00,  1.29s/it]


Average loss for epoch 14: 0.04556713253259659


Epoch 15/15: 100%|████████████████████████████████| 1/1 [00:01<00:00,  1.33s/it]

Average loss for epoch 15: 0.04445786029100418





In [41]:
text_medical="The patient was prescribed aspirin for pain relief."
#tokenizing_the_text
inputs_medical=tokenizer(text_medical, return_tensors="pt")
outputs_medical=model(**inputs_medical)
print(outputs_medical)

TokenClassifierOutput(loss=None, logits=tensor([[[ 0.9386,  0.4486, -0.0478, -0.0886, -0.7108, -0.1851,  0.2494],
         [ 0.3358, -0.5510,  0.2315, -0.3240, -0.2289, -0.5868,  0.2912],
         [ 1.3937, -0.6088, -0.0767, -0.4081, -0.5356,  0.4041, -0.2571],
         [ 0.8568, -0.5871, -0.1098, -0.3460, -0.2877,  0.3004, -0.4386],
         [ 0.8510, -0.4462,  0.1478, -0.2943, -0.5911,  0.0192, -0.2116],
         [ 1.4181,  0.3901, -0.2994, -0.0149, -0.5432,  0.5404, -0.2110],
         [ 1.5397, -0.2353, -0.5442, -0.0059, -0.3951, -0.1309, -0.9379],
         [ 1.3004, -0.2631, -0.5439, -0.1184, -0.4259,  0.6136, -0.4864],
         [ 1.7083, -0.7097, -0.0932, -0.2090,  0.2152,  0.2975, -0.2829],
         [ 1.6492, -0.6368, -0.3893, -0.7716, -0.9038, -0.0788, -0.4138],
         [ 1.1298, -0.6394,  0.3592, -0.2730, -0.5218,  0.2692, -0.0870],
         [ 0.4108, -0.0980,  0.0254, -0.0092, -0.6833, -0.6284, -0.1557],
         [ 1.2724,  0.2668,  0.1499, -0.2527, -0.6540, -0.4585, -0.3240]

In [42]:
#extracting_predicted_labels
predicted_labels_medical=outputs_medical.logits.argmax(dim=-1)
#mapping_labels_to_entity_names
entities_medical=[tokenizer.decode(token) for token in predicted_labels_medical[0]]
print("Medical Entities:", entities_medical)

Medical Entities: ['[ P A D ]', '[ P A D ]', '[ P A D ]', '[ P A D ]', '[ P A D ]', '[ P A D ]', '[ P A D ]', '[ P A D ]', '[ P A D ]', '[ P A D ]', '[ P A D ]', '[ P A D ]', '[ P A D ]']


In [43]:
text_legal="This aggrement is entered into on this 1st day of January, 2023, between Company X and Company Y."
inputs_legal=tokenizer(text_legal, return_tensors="pt")
outputs_legal_1=model(**inputs_legal)
predicted_labels_legal=outputs_legal_1.logits.argmax(dim=-1)
entities_legal=[tokenizer.decode(token) for token in predicted_labels_legal[0]]
print("Legal Entities:", entities_legal)


Legal Entities: ['[ P A D ]', '[ P A D ]', '[ P A D ]', '[ P A D ]', '[ P A D ]', '[ P A D ]', '[ P A D ]', '[ P A D ]', '[ P A D ]', '[ P A D ]', '[ P A D ]', '[ P A D ]', '[ P A D ]', '[ P A D ]', '[ P A D ]', '[ P A D ]', '[ P A D ]', '[ P A D ]', '[ P A D ]', '[ P A D ]', '[ P A D ]', '[ P A D ]', '[ P A D ]', '[ P A D ]', '[ P A D ]', '[ P A D ]']
