In [1]:
import torch
from torch.utils.data import Dataset, DataLoader, random_split
from transformers import BertTokenizerFast, BertForTokenClassification, AdamW
from sklearn.metrics import classification_report
import numpy as np
from sklearn.metrics import classification_report, f1_score, accuracy_score
import torch.nn.functional as F 
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'
print(device)

cuda


In [3]:
MAX_LEN = 174


In [4]:
labels_to_ids = {'B-CHAR': 0, 'I-LOC': 1, 'I-CHAR': 2, 'O': 3, 'B-ORG': 4, 'B-LOC': 5}


In [5]:
ids_to_labels = {0: 'B-CHAR', 1: 'I-LOC', 2: 'I-CHAR', 3: 'O', 4: 'B-ORG', 5: 'B-LOC'}

In [None]:
labels_to_ids = {'B-CHAR': 0, 'O': 1, 'I-CHAR': 2, 'B-ORG': 3, 'I-ORG': 4, 'B-LOC': 5, 'I-LOC': 6}
ids_to_labels = {0: 'B-CHAR', 1: 'O', 2: 'I-CHAR', 3: 'B-ORG', 4: 'I-ORG', 5: 'B-LOC', 6: 'I-LOC'}

In [7]:
model = BertForTokenClassification.from_pretrained('final_train', num_labels=6)
model.to(device)

BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12

In [8]:
tokenizer = BertTokenizerFast.from_pretrained("final_tokenizer")

In [9]:
sentence = "Oooohhh Frodo Aaaahh Gimli My precious Wake up Wake up Wake up sleepies We must go yes we must go at once"
words = sentence.split()
inputs = tokenizer(words,
             is_split_into_words=True,
             return_offsets_mapping=True,
             padding='max_length',
             truncation=True,
             max_length=MAX_LEN,
             return_tensors="pt")


# move to gpu
ids = inputs["input_ids"].to(device)
print('ids', ids)
mask = inputs["attention_mask"].to(device)
print('mask', mask)
# forward pass
outputs = model(ids, attention_mask=mask)
print('outputs:', outputs)

logits = outputs[0]
print("logits", logits)
active_logits = logits.view(-1, model.num_labels) # shape (batch_size * seq_len, num_labels)
print('active_logits', active_logits)
flattened_predictions = torch.argmax(active_logits, axis=1) # shape (batch_size*seq_len,) - predictions at the token level
print('flattened_predictions', flattened_predictions)
tokens = tokenizer.convert_ids_to_tokens(ids.squeeze().tolist())
print('tokens', tokens)

token_predictions = [ids_to_labels[i] for i in flattened_predictions.cpu().numpy()]
print('token_predictions', token_predictions)

wp_preds = list(zip(tokens, token_predictions)) # list of tuples. Each tuple = (wordpiece, prediction)
print('wp_preds', wp_preds)

prediction = []
for token_pred, mapping in zip(wp_preds, inputs["offset_mapping"].squeeze().tolist()):
  #only predictions on first word pieces are important
  if mapping[0] == 0 and mapping[1] != 0:
    prediction.append(token_pred[1])
  else:
    continue

print(sentence.split())
print(prediction)

ids tensor([[  101,  1051,  9541, 11631, 23644, 10424,  7716,  2080, 13360,  4430,
          2232, 21025, 19968,  2072,  2026,  9062,  5256,  2039,  5256,  2039,
          5256,  2039,  3637,  3111,  2057,  2442,  2175,  2748,  2057,  2442,
          2175,  2012,  2320,   102,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     

In [10]:


from transformers import BertTokenizer, BertForTokenClassification
import torch

# Load the fine-tuned model
model = BertForTokenClassification.from_pretrained("curve_train", num_labels=6)
model.to(device)
tokenizer = BertTokenizerFast.from_pretrained('curve_tokenizer')

# Text to predict
text = "Oooohhh Frodo Aaaahh Gimli My precious Wake up Wake up Wake up sleepies We must go yes we must go at once"

# Tokenize the input text
words = text.split()
inputs = tokenizer(words,
             is_split_into_words=True,
             return_offsets_mapping=True,
             padding='max_length',
             truncation=True,
             max_length=MAX_LEN,
             return_tensors="pt")

# Move inputs to the same device as the model
ids = inputs["input_ids"].to(device)
attention_mask = inputs["attention_mask"].to(device)

# Make predictions
with torch.no_grad():
    outputs = model(input_ids=ids, attention_mask=attention_mask)
logits = outputs[0]


active_logits = logits.view(-1, model.num_labels)

flattened_predictions = torch.argmax(active_logits, axis=1) # shape (batch_size*seq_len,) - predictions at the token level

tokens = tokenizer.convert_ids_to_tokens(ids.squeeze().tolist())
# Check if predictions were made
if predicted_labels.numel() == 0:
    print("No predictions made.")
token_predictions = [ids_to_labels[i] for i in flattened_predictions.cpu().numpy()]

wp_preds = list(zip(tokens, token_predictions)) # list of tuples. Each tuple = (wordpiece, prediction)

prediction = []
for token_pred, mapping in zip(wp_preds, inputs["offset_mapping"].squeeze().tolist()):
  #only predictions on first word pieces are important
  if mapping[0] == 0 and mapping[1] != 0:
    prediction.append(token_pred[1])
  else:
    continue

print(text.split())
print(prediction)

NameError: name 'predicted_labels' is not defined

In [11]:

MAX_LEN = 174
BATCH_SIZE = 64
EPOCHS = 5
MAX_GRAD_NORM = 5
MODEL_NAME = 'bert-base-uncased'
MODEL_PATH = 'ner_model_from_final1.pth'
class dataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_len):
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __getitem__(self, index):
        sentence = self.data.sentence[index].strip().split()
        word_labels = self.data.word_labels[index].split(",")

        encoding = self.tokenizer(sentence,
                                  is_split_into_words=True,
                                  return_offsets_mapping=True,
                                  padding='max_length',
                                  truncation=True,
                                  max_length=self.max_len)

        labels = [labels_to_ids[label] for label in word_labels]

        encoded_labels = np.ones(len(encoding["offset_mapping"]), dtype=int) * -100
        i = 0
        for idx, mapping in enumerate(encoding["offset_mapping"]):
            if mapping[0] == 0 and mapping[1] != 0:
                encoded_labels[idx] = labels[i]
                i += 1

        item = {key: torch.as_tensor(val) for key, val in encoding.items()}
        item['labels'] = torch.as_tensor(encoded_labels)

        return item

    def __len__(self):
        return self.len

def read_data(file_path):
    sentences, labels = [], []
    sentence, label = [], []
    with open(file_path, encoding="utf-8") as file:
        for line in file:
            if line.startswith("#"):
                continue
            elif line == "\n":
                if sentence:
                    sentences.append(sentence)
                    labels.append(label)
                    sentence, label = [], []
            else:
                parts = line.strip().split("\t")
                sentence.append(parts[1].lower())  # Convert the token to lowercase before appending
                label.append(clean_tag(parts[2]))
    if sentence:
        sentences.append(sentence)
        labels.append(label)
    return sentences, labels

def clean_tag(tag):
    if tag.count('-') > 1:
        prefix, entity = tag.split('-', 1)
        tag = f"{prefix}-{entity.replace('-', '')}"
    return tag



from seqeval.metrics import f1_score, accuracy_score
from seqeval.metrics import classification_report as seqeval_classification_report
import numpy as np
import torch
test_tokens, test_tags = read_data("./tagged_sentences_test.iob2")
data_test = {'sentence': [" ".join(sentence) for sentence in test_tokens],
             'word_labels': [",".join(tags) for tags in test_tags]}

df_test = pd.DataFrame(data_test)


testing_set = dataset(df_test, tokenizer, MAX_LEN)

test_params = {'batch_size': BATCH_SIZE, 'shuffle': False, 'num_workers': 0}
testing_loader = DataLoader(testing_set, **test_params)

loss_values, validation_loss_values = [], []   
model.eval()
eval_loss, eval_accuracy = 0, 0
nb_eval_steps, nb_eval_examples = 0, 0
predictions, true_labels = [], []

for idx, batch in enumerate(testing_loader):
    ids = batch['input_ids'].to(device, dtype=torch.long)
    mask = batch['attention_mask'].to(device, dtype=torch.long)
    labels = batch['labels'].to(device, dtype=torch.long)

    with torch.no_grad():
        outputs = model(ids, token_type_ids=None, attention_mask=mask, labels=labels)
    
    logits = outputs[1].detach().cpu().numpy()
    label_ids = labels.to('cpu').numpy()
    eval_loss += outputs[0].mean().item()
    
    predictions.extend([list(p) for p in np.argmax(logits, axis=2)])
    true_labels.extend(label_ids)

# Ensure tag values are correctly ordered
tag_values = {0: 'B-CHAR', 1: 'I-LOC', 2: 'I-CHAR', 3: 'O', 4: 'B-ORG', 5: 'B-LOC', -100: 'PAD'}

# Converting true labels and predictions to tag names
pred_tags = [tag_values[int(p_i)] for p, l in zip(predictions, true_labels) for p_i, l_i in zip(p, l) if int(l_i) != -100]
valid_tags = [tag_values[int(l_i)] for l in true_labels for l_i in l if int(l_i) != -100]

eval_loss = eval_loss / len(testing_loader)
validation_loss_values.append(eval_loss)
print("Validation loss: {}".format(eval_loss))
valid_tags = [valid_tags]
pred_tags = [pred_tags]
report = seqeval_classification_report(valid_tags, pred_tags)
print("Validation Accuracy: {}".format(accuracy_score(valid_tags, pred_tags)))
print("Validation F1-Score: {}".format(f1_score(valid_tags, pred_tags)))
print(report)

Validation loss: 0.06755576183398565
Validation Accuracy: 0.9915406698564593
Validation F1-Score: 0.8881453154875717
              precision    recall  f1-score   support

        CHAR       0.87      0.87      0.87       820
         LOC       0.91      0.98      0.94       216
         ORG       1.00      0.50      0.67         2

   micro avg       0.88      0.89      0.89      1038
   macro avg       0.93      0.78      0.83      1038
weighted avg       0.88      0.89      0.89      1038



In [12]:
sentence = "Oooohhh Frodo Aaaahh Gimli My precious Wake up Wake up Wake up sleepies We must go yes we must go at once"
words = sentence.split()
inputs = tokenizer(words,
             is_split_into_words=True,
             return_offsets_mapping=True,
             padding='max_length',
             truncation=True,
             max_length=MAX_LEN,
             return_tensors="pt")


# move to gpu
ids = inputs["input_ids"].to(device)
print('ids', ids)
mask = inputs["attention_mask"].to(device)
print('mask', mask)
# forward pass
outputs = model(ids, attention_mask=mask)
print('outputs:', outputs)

logits = outputs[0]
print("logits", logits)
active_logits = logits.view(-1, model.num_labels) # shape (batch_size * seq_len, num_labels)
print('active_logits', active_logits)
flattened_predictions = torch.argmax(active_logits, axis=1) # shape (batch_size*seq_len,) - predictions at the token level
print('flattened_predictions', flattened_predictions)
tokens = tokenizer.convert_ids_to_tokens(ids.squeeze().tolist())
print('tokens', tokens)

token_predictions = [ids_to_labels[i] for i in flattened_predictions.cpu().numpy()]
print('token_predictions', token_predictions)

wp_preds = list(zip(tokens, token_predictions)) # list of tuples. Each tuple = (wordpiece, prediction)
print('wp_preds', wp_preds)

prediction = []
for token_pred, mapping in zip(wp_preds, inputs["offset_mapping"].squeeze().tolist()):
  #only predictions on first word pieces are important
  if mapping[0] == 0 and mapping[1] != 0:
    prediction.append(token_pred[1])
  else:
    continue

print(sentence.split())
print(prediction)

ids tensor([[  101,  1051,  9541, 11631, 23644, 10424,  7716,  2080, 13360,  4430,
          2232, 21025, 19968,  2072,  2026,  9062,  5256,  2039,  5256,  2039,
          5256,  2039,  3637,  3111,  2057,  2442,  2175,  2748,  2057,  2442,
          2175,  2012,  2320,   102,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     