In [1]:
import torch
from torch.utils.data import Dataset, DataLoader, random_split
from transformers import BertTokenizerFast, BertForTokenClassification, AdamW
import numpy as np
from sklearn.metrics import classification_report, f1_score, accuracy_score
import torch.nn.functional as F 
import pandas as pd

In [2]:
MAX_LEN = 174
BATCH_SIZE = 64
EPOCHS = 5
MAX_GRAD_NORM = 5
MODEL_NAME = 'bert-base-uncased'
MODEL_PATH = 'ner_model_from_final1.pth'

In [3]:
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'
print(device)

cuda


In [4]:
def clean_tag(tag):
    # Ensure tags are in the correct format
    if tag.count('-') > 1:
        prefix, entity = tag.split('-', 1)
        tag = f"{prefix}-{entity.replace('-', '')}"
    return tag

In [5]:
def read_names(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        names = [name.strip().lower() for name in file.readlines()]
    return names

In [6]:
def read_data(file_path):
    sentences, labels = [], []
    sentence, label = [], []
    with open(file_path, encoding="utf-8") as file:
        for line in file:
            if line.startswith("#"):
                continue
            elif line == "\n":
                if sentence:
                    sentences.append(sentence)
                    labels.append(label)
                    sentence, label = [], []
            else:
                parts = line.strip().split("\t")
                sentence.append(parts[1].lower())  # Convert the token to lowercase before appending
                label.append(clean_tag(parts[2]))
    if sentence:
        sentences.append(sentence)
        labels.append(label)
    return sentences, labels

In [7]:
class dataset(Dataset):
  def __init__(self, dataframe, tokenizer, max_len):
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len

  def __getitem__(self, index):
        # step 1: get the sentence and word labels
        sentence = self.data.sentence[index].strip().split()
        word_labels = self.data.word_labels[index].split(",")

        # step 2: use tokenizer to encode sentence (includes padding/truncation up to max length)
        # BertTokenizerFast provides a handy "return_offsets_mapping" functionality for individual tokens
        encoding = self.tokenizer(sentence,
                             is_split_into_words=True,
                             return_offsets_mapping=True,
                             padding='max_length',
                             truncation=True,
                             max_length=self.max_len)

        # step 3: create token labels only for first word pieces of each tokenized word
        labels = [labels_to_ids[label] for label in word_labels]
        # code based on https://huggingface.co/transformers/custom_datasets.html#tok-ner
        # create an empty array of -100 of length max_length
        encoded_labels = np.ones(len(encoding["offset_mapping"]), dtype=int) * -100

        # set only labels whose first offset position is 0 and the second is not 0
        i = 0
        for idx, mapping in enumerate(encoding["offset_mapping"]):
          if mapping[0] == 0 and mapping[1] != 0:
            # overwrite label
            encoded_labels[idx] = labels[i]
            i += 1

        # step 4: turn everything into PyTorch tensors
        item = {key: torch.as_tensor(val) for key, val in encoding.items()}
        item['labels'] = torch.as_tensor(encoded_labels)

        return item

  def __len__(self):
        return self.len

In [8]:
train_tokens, train_tags = read_data("./tagged_sentences_train.iob2")

In [9]:
test_tokens, test_tags = read_data("./tagged_sentences_test.iob2")

In [10]:
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

In [11]:
data = {'sentence': [" ".join(sentence) for sentence in train_tokens],
        'word_labels': [",".join(tags) for tags in train_tags]}

df = pd.DataFrame(data)
df.head()

Unnamed: 0,sentence,word_labels
0,this book is largely concerned with hobbits an...,"O,O,O,O,O,O,B-CHAR,O,O,O,O,O,O,O,O,O,O,O,O,O,O..."
1,further information will also be found in the ...,"O,O,O,O,O,O,O,O,O,O,O,O,O,O,B-LOC,O,O,O,O,O,O,..."
2,that story was derived from the earlier chapte...,"O,O,O,O,O,O,O,O,O,O,O,O,O,O,B-CHAR,O,O,O,B-CHA..."
3,many however may wish to know more about this ...,"O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O"
4,for such readers a few notes on the more impor...,"O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,B-CHAR,O,O,O,O,O..."


In [12]:
data_test = {'sentence': [" ".join(sentence) for sentence in test_tokens],
        'word_labels': [",".join(tags) for tags in test_tags]}

df_test = pd.DataFrame(data_test)
df_test.head()

Unnamed: 0,sentence,word_labels
0,oh smeagol ive got one ive got a fish smeagol ...,"O,B-CHAR,O,O,O,O,O,O,O,B-CHAR,I-CHAR,O,O,O,O,O..."
1,because it s my birthday and i wants it,"O,O,O,O,O,O,O,O,O"
2,my precious,"O,O"
3,they cursed us murderer murderer they called us,"O,O,O,O,O,O,O,O"
4,they cursed us and drove us away,"O,O,O,O,O,O,O"


In [13]:
# Split the word_labels into individual tags and get unique tags
all_tags = [tag for tags in df['word_labels'] for tag in tags.split(",")]
unique_tags = set(all_tags)


In [14]:
# Create mappings
labels_to_ids = {k: v for v, k in enumerate(unique_tags)}
ids_to_labels = {v: k for k, v in labels_to_ids.items()}

In [15]:

# Display the mappings
print("labels_to_ids:", labels_to_ids)
print("ids_to_labels:", ids_to_labels)

labels_to_ids: {'B-LOC': 0, 'I-CHAR': 1, 'O': 2, 'I-LOC': 3, 'B-CHAR': 4, 'B-ORG': 5}
ids_to_labels: {0: 'B-LOC', 1: 'I-CHAR', 2: 'O', 3: 'I-LOC', 4: 'B-CHAR', 5: 'B-ORG'}


In [16]:
# Split the word_labels into individual tags and get unique tags
all_tags_test = [tag for tags in df_test['word_labels'] for tag in tags.split(",")]
unique_tags_test = set(all_tags_test)

# Create mappings
labels_to_ids_test = {k: v for v, k in enumerate(unique_tags_test)}
ids_to_labels_test = {v: k for k, v in labels_to_ids.items()}

In [17]:
# Display the mappings
print("labels_to_ids:", labels_to_ids_test)
print("ids_to_labels:", ids_to_labels_test)

labels_to_ids: {'B-LOC': 0, 'I-CHAR': 1, 'O': 2, 'I-LOC': 3, 'B-CHAR': 4, 'B-ORG': 5}
ids_to_labels: {0: 'B-LOC', 1: 'I-CHAR', 2: 'O', 3: 'I-LOC', 4: 'B-CHAR', 5: 'B-ORG'}


In [18]:
training_set = dataset(df, tokenizer, MAX_LEN)

In [19]:
testing_set = dataset(df_test, tokenizer, MAX_LEN)

In [20]:
# testing_set[0]

In [21]:
# for token, label in zip(tokenizer.convert_ids_to_tokens(testing_set[0]["input_ids"]), testing_set[0]["labels"]):
#   print('{0:10}  {1}'.format(token, label))

In [22]:
# for token, label in zip(tokenizer.convert_ids_to_tokens(training_set[0]["input_ids"]), training_set[0]["labels"]):
#   print('{0:10}  {1}'.format(token, label))

In [23]:
# def decode_tokens_and_labels(tokenizer, encoded_input, ids_to_labels):
#     # Decode the input_ids to tokens
#     tokens = tokenizer.convert_ids_to_tokens(encoded_input['input_ids'].tolist())
    
#     # Convert label IDs back to label names
#     label_ids = encoded_input['labels'].tolist()
#     labels = [ids_to_labels[label_id] if label_id != -100 else 'PAD' for label_id in label_ids]
    
#     # Combine tokens and labels
#     decoded = [(token, label) for token, label in zip(tokens, labels)]
    
#     return decoded

# # Example usage:
# tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
# encoded_input = training_set[0]
# decoded_output = decode_tokens_and_labels(tokenizer, encoded_input, ids_to_labels)

# for token, label in decoded_output:
#     print(f"{token}: {label}")

In [24]:
# import numpy as np
# import pandas as pd
# import torch
# from torch.utils.data import Dataset
# from transformers import BertTokenizerFast

# class CustomDataset(Dataset):
#     def __init__(self, dataframe, tokenizer, max_len):
#         self.len = len(dataframe)
#         self.data = dataframe
#         self.tokenizer = tokenizer
#         self.max_len = max_len

#     def __getitem__(self, index):
#         # step 1: get the sentence and word labels
#         sentence = self.data.sentence[index].strip().split()
#         word_labels = self.data.word_labels[index].split(",")

#         # step 2: use tokenizer to encode sentence (includes padding/truncation up to max length)
#         # BertTokenizerFast provides a handy "return_offsets_mapping" functionality for individual tokens
#         encoding = self.tokenizer(sentence,
#                                   is_split_into_words=True,
#                                   return_offsets_mapping=True,
#                                   padding='max_length',
#                                   truncation=True,
#                                   max_length=self.max_len)

#         # step 3: create token labels, handling subwords correctly
#         labels = [labels_to_ids[label] for label in word_labels]
#         encoded_labels = np.ones(len(encoding["offset_mapping"]), dtype=int) * -100

#         # set labels for subwords
#         i = 0
#         for idx, mapping in enumerate(encoding["offset_mapping"]):
#             if mapping[0] == 0 and mapping[1] != 0:
#                 encoded_labels[idx] = labels[i]
#                 i += 1
#             elif mapping[0] != 0 and mapping[1] != 0:
#                 encoded_labels[idx] = labels[i-1] if word_labels[i-1].startswith("B-") else labels[i-1]

#         # step 4: turn everything into PyTorch tensors
#         item = {key: torch.as_tensor(val) for key, val in encoding.items()}
#         item['labels'] = torch.as_tensor(encoded_labels)

#         return item

#     def __len__(self):
#         return self.len

# def read_data(file_path):
#     sentences, labels = [], []
#     sentence, label = [], []
#     with open(file_path, encoding="utf-8") as file:
#         for line in file:
#             if line.startswith("#"):
#                 continue
#             elif line == "\n":
#                 if sentence:
#                     sentences.append(sentence)
#                     labels.append(label)
#                     sentence, label = [], []
#             else:
#                 parts = line.strip().split("\t")
#                 sentence.append(parts[1].lower())  # Convert the token to lowercase before appending
#                 label.append(clean_tag(parts[2]))
#     if sentence:
#         sentences.append(sentence)
#         labels.append(label)
#     return sentences, labels

# def clean_tag(tag):
#     # Ensure tags are in the correct format
#     if tag.count('-') > 1:
#         prefix, entity = tag.split('-', 1)
#         tag = f"{prefix}-{entity.replace('-', '')}"
#     return tag

# # Example usage
# train_tokens, train_tags = read_data("./tagged_sentences_train.iob2")
# tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
# data = {'sentence': [" ".join(sentence) for sentence in train_tokens],
#         'word_labels': [",".join(tags) for tags in train_tags]}

# df = pd.DataFrame(data)

# # Split the word_labels into individual tags and get unique tags
# all_tags = [tag for tags in df['word_labels'] for tag in tags.split(",")]
# unique_tags = set(all_tags)
# print(unique_tags)
# print()
# # Create mappings
# labels_to_ids = {k: v for v, k in enumerate(unique_tags)}
# ids_to_labels = {v: k for k, v in labels_to_ids.items()}

# MAX_LEN = 128
# training_set = CustomDataset(df, tokenizer, MAX_LEN)
# decoded_output = decode_tokens_and_labels(tokenizer, training_set[1], ids_to_labels)

# for token, label in decoded_output:
#     print(f"{token}: {label}")


In [25]:
train_params = {'batch_size': BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

test_params = {'batch_size': BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

In [26]:
len(labels_to_ids)

6

In [27]:
model = BertForTokenClassification.from_pretrained('bert-base-uncased', num_labels=len(labels_to_ids))
model.to(device)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12

In [28]:
import math
print(-math.log(1/(len(labels_to_ids))))

1.791759469228055


In [29]:
inputs = training_set[7]
input_ids = inputs["input_ids"].unsqueeze(0)
attention_mask = inputs["attention_mask"].unsqueeze(0)
labels = inputs["labels"].unsqueeze(0)

input_ids = input_ids.to(device)
attention_mask = attention_mask.to(device)
labels = labels.to(device)

outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
initial_loss = outputs[0]
initial_loss

tensor(1.7145, device='cuda:0', grad_fn=<NllLossBackward0>)

In [30]:
# tr_logits = outputs[1]
# tr_logits.shape

In [31]:
optimizer = torch.optim.Adam(params=model.parameters(), lr=3e-5)

In [32]:
def train(epoch):
    tr_loss, tr_accuracy = 0, 0
    nb_tr_examples, nb_tr_steps = 0, 0
    tr_preds,tr_labels = [], []

    # Put model in training mode
    model.train()

    for idx, batch in enumerate(training_loader):
        ids = batch['input_ids'].to(device, dtype=torch.long)
        mask = batch['attention_mask'].to(device, dtype=torch.long)
        labels = batch['labels'].to(device, dtype=torch.long)

        # Forward pass
        outputs = model(input_ids=ids, attention_mask=mask, labels=labels)
        loss = outputs.loss
        tr_logits = outputs.logits

        # Accumulate the training loss
        tr_loss += loss.item()
        nb_tr_steps += 1
        nb_tr_examples += labels.size(0)

        if idx % 100 == 0:
            loss_step = tr_loss / nb_tr_steps
            print(f"Training loss per 100 training steps: {loss_step}")

        # Compute training accuracy
        flattened_targets = labels.view(-1)  # shape (batch_size * seq_len,)
        active_logits = tr_logits.view(-1, model.config.num_labels)  # shape (batch_size * seq_len, num_labels)
        flattened_predictions = torch.argmax(active_logits, axis=1)  # shape (batch_size * seq_len,)

        # Only compute accuracy at active labels
        active_accuracy = labels.view(-1) != -100  # shape (batch_size * seq_len,)
        labels = torch.masked_select(flattened_targets, active_accuracy)
        predictions = torch.masked_select(flattened_predictions, active_accuracy)

        tr_labels.extend(labels.cpu().numpy())
        tr_preds.extend(predictions.cpu().numpy())

        tmp_tr_accuracy = accuracy_score(labels.cpu().numpy(), predictions.cpu().numpy())
        tr_accuracy += tmp_tr_accuracy

        # Gradient clipping
        torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=MAX_GRAD_NORM)

        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    epoch_loss = tr_loss / nb_tr_steps
    tr_accuracy = tr_accuracy / nb_tr_steps
    print(f"Training loss epoch: {epoch_loss}")
    print(f"Training accuracy epoch: {tr_accuracy}")

for epoch in range(EPOCHS):
    print(f"Training epoch: {epoch + 1}")
    train(epoch)


Training epoch: 1
Training loss per 100 training steps: 1.788746953010559
Training loss per 100 training steps: 0.12410592319130308
Training loss per 100 training steps: 0.07657311602370508
Training loss per 100 training steps: 0.05679301631194462
Training loss epoch: 0.055074801444592085
Training accuracy epoch: 0.9876425903603387
Training epoch: 2
Training loss per 100 training steps: 0.017275525256991386
Training loss per 100 training steps: 0.011776316728867075
Training loss per 100 training steps: 0.010120325594380572
Training loss per 100 training steps: 0.0093788890068673
Training loss epoch: 0.009283915403258992
Training accuracy epoch: 0.9973869251789802
Training epoch: 3
Training loss per 100 training steps: 0.021837104111909866
Training loss per 100 training steps: 0.004803928959247942
Training loss per 100 training steps: 0.0049310315209811564


KeyboardInterrupt: 

In [None]:
# # Define the path to the saved model directory
# directory = "./model"

# # Load the tokenizer
# tokenizer = BertTokenizerFast.from_pretrained(directory)

# # Load the model
# model = BertForTokenClassification.from_pretrained(directory)
# model.to(device)

# print('Model and tokenizer loaded successfully')


In [None]:
# model.save_pretrained("bert_train")

In [None]:
# tokenizer.save_pretrained('tokenizer')

In [33]:

model.eval()
eval_loss, eval_accuracy = 0, 0
nb_eval_steps, nb_eval_examples = 0, 0
eval_preds, eval_labels = [], []

with torch.no_grad():
    for idx, batch in enumerate(testing_loader):
        ids = batch['input_ids'].to(device, dtype=torch.long)
        mask = batch['attention_mask'].to(device, dtype=torch.long)
        labels = batch['labels'].to(device, dtype=torch.long)

        outputs = model(input_ids=ids, attention_mask=mask, labels=labels)
        loss = outputs.loss
        eval_logits = outputs.logits

        eval_loss += loss.item()
        nb_eval_steps += 1
        nb_eval_examples += labels.size(0)

        flattened_targets = labels.view(-1)
        active_logits = eval_logits.view(-1, model.config.num_labels)
        flattened_predictions = torch.argmax(active_logits, axis=1)

        active_accuracy = labels.view(-1) != -100
        labels = torch.masked_select(flattened_targets, active_accuracy)
        predictions = torch.masked_select(flattened_predictions, active_accuracy)

        eval_labels.extend(labels.cpu().numpy())
        eval_preds.extend(predictions.cpu().numpy())

eval_loss = eval_loss / nb_eval_steps
eval_accuracy = accuracy_score(eval_labels, eval_preds)

print(f"Evaluation loss: {eval_loss}")
print(f"Evaluation accuracy: {eval_accuracy}")

report = classification_report(eval_labels, eval_preds, target_names=list(labels_to_ids.keys()))
print("Classification Report:")
print(report)

Evaluation loss: 0.056707674740917154
Evaluation accuracy: 0.9894736842105263
Classification Report:
              precision    recall  f1-score   support

       B-LOC       0.89      0.98      0.93       216
      I-CHAR       0.89      0.66      0.76        85
           O       1.00      0.99      1.00     25000
       I-LOC       0.00      0.00      0.00         2
      B-CHAR       0.84      0.88      0.86       820
       B-ORG       0.00      0.00      0.00         2

    accuracy                           0.99     26125
   macro avg       0.60      0.59      0.59     26125
weighted avg       0.99      0.99      0.99     26125



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
import json
config = json.load(open('curve_train/config.json'))
config['id2label'] = ids_to_labels
config['label2id'] = labels_to_ids
json.dump(config, open('curve_train/config.json', 'w'))

In [None]:
len(labels_to_ids)

In [None]:
model = BertForTokenClassification.from_pretrained('curve_train', num_labels=len(labels_to_ids))
model.to(device)

In [34]:

model.eval()
eval_loss, eval_accuracy = 0, 0
nb_eval_steps, nb_eval_examples = 0, 0
eval_preds, eval_labels = [], []

with torch.no_grad():
    for idx, batch in enumerate(testing_loader):
        ids = batch['input_ids'].to(device, dtype=torch.long)
        mask = batch['attention_mask'].to(device, dtype=torch.long)
        labels = batch['labels'].to(device, dtype=torch.long)

        outputs = model(input_ids=ids, attention_mask=mask, labels=labels)
        loss = outputs.loss
        eval_logits = outputs.logits

        eval_loss += loss.item()
        nb_eval_steps += 1
        nb_eval_examples += labels.size(0)

        flattened_targets = labels.view(-1)
        active_logits = eval_logits.view(-1, model.config.num_labels)
        flattened_predictions = torch.argmax(active_logits, axis=1)

        active_accuracy = labels.view(-1) != -100
        labels = torch.masked_select(flattened_targets, active_accuracy)
        predictions = torch.masked_select(flattened_predictions, active_accuracy)

        eval_labels.extend(labels.cpu().numpy())
        eval_preds.extend(predictions.cpu().numpy())

eval_loss = eval_loss / nb_eval_steps
eval_accuracy = accuracy_score(eval_labels, eval_preds)

print(f"Evaluation loss: {eval_loss}")
print(f"Evaluation accuracy: {eval_accuracy}")

report = classification_report(eval_labels, eval_preds, target_names=list(labels_to_ids.keys()))
print("Classification Report:")
print(report)

Evaluation loss: 0.056598789979600245
Evaluation accuracy: 0.9894736842105263
Classification Report:
              precision    recall  f1-score   support

       B-LOC       0.89      0.98      0.93       216
      I-CHAR       0.89      0.66      0.76        85
           O       1.00      0.99      1.00     25000
       I-LOC       0.00      0.00      0.00         2
      B-CHAR       0.84      0.88      0.86       820
       B-ORG       0.00      0.00      0.00         2

    accuracy                           0.99     26125
   macro avg       0.60      0.59      0.59     26125
weighted avg       0.99      0.99      0.99     26125



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [38]:
def valid(model, testing_loader):
    # put model in evaluation mode
    model.eval()

    eval_loss, eval_accuracy = 0, 0
    nb_eval_examples, nb_eval_steps = 0, 0
    eval_preds, eval_labels = [], []

    with torch.no_grad():
        for idx, batch in enumerate(testing_loader):

            ids = batch['input_ids'].to(device, dtype=torch.long)
            mask = batch['attention_mask'].to(device, dtype=torch.long)
            labels = batch['labels'].to(device, dtype=torch.long)

            # Forward pass
            outputs = model(input_ids=ids, attention_mask=mask, labels=labels)
            loss = outputs.loss
            eval_logits = outputs.logits

            eval_loss += loss.item()

            nb_eval_steps += 1
            nb_eval_examples += labels.size(0)

            if idx % 100 == 0:
                loss_step = eval_loss / nb_eval_steps
                print(f"Validation loss per 100 evaluation steps: {loss_step}")

            # Compute evaluation accuracy
            flattened_targets = labels.view(-1)  # shape (batch_size * seq_len,)
            active_logits = eval_logits.view(-1, model.config.num_labels)  # shape (batch_size * seq_len, num_labels)
            flattened_predictions = torch.argmax(active_logits, axis=1)  # shape (batch_size * seq_len,)

            # Only compute accuracy at active labels
            active_accuracy = labels.view(-1) != -100  # shape (batch_size * seq_len,)
            labels = torch.masked_select(flattened_targets, active_accuracy)
            predictions = torch.masked_select(flattened_predictions, active_accuracy)

            eval_labels.extend(labels.cpu().numpy())
            eval_preds.extend(predictions.cpu().numpy())

            tmp_eval_accuracy = accuracy_score(labels.cpu().numpy(), predictions.cpu().numpy())
            eval_accuracy += tmp_eval_accuracy

    labels = [ids_to_labels[id] for id in eval_labels]
    predictions = [ids_to_labels[id] for id in eval_preds]

    eval_loss = eval_loss / nb_eval_steps
    eval_accuracy = eval_accuracy / nb_eval_steps
    print(f"Validation Loss: {eval_loss}")
    print(f"Validation Accuracy: {eval_accuracy}")

    return labels, predictions



In [None]:
# Evaluation function
def evaluate():
    model.eval()
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0
    eval_preds, eval_labels = [], []

    with torch.no_grad():
        for idx, batch in enumerate(testing_loader):
            ids = batch['input_ids'].to(device, dtype=torch.long)
            mask = batch['attention_mask'].to(device, dtype=torch.long)
            labels = batch['labels'].to(device, dtype=torch.long)

            outputs = model(input_ids=ids, attention_mask=mask, labels=labels)
            loss = outputs.loss
            eval_logits = outputs.logits

            eval_loss += loss.item()
            nb_eval_steps += 1
            nb_eval_examples += labels.size(0)

            flattened_targets = labels.view(-1)
            active_logits = eval_logits.view(-1, model.config.num_labels)
            flattened_predictions = torch.argmax(active_logits, axis=1)

            active_accuracy = labels.view(-1) != -100
            labels = torch.masked_select(flattened_targets, active_accuracy)
            predictions = torch.masked_select(flattened_predictions, active_accuracy)

            eval_labels.extend(labels.cpu().numpy())
            eval_preds.extend(predictions.cpu().numpy())

    eval_loss = eval_loss / nb_eval_steps
    eval_accuracy = accuracy_score(eval_labels, eval_preds)

    print(f"Evaluation loss: {eval_loss}")
    print(f"Evaluation accuracy: {eval_accuracy}")

    # Get sorted list of label IDs
    sorted_labels = sorted(labels_to_ids, key=labels_to_ids.get)

    report = classification_report(eval_labels, eval_preds, labels=[labels_to_ids[label] for label in sorted_labels], target_names=sorted_labels)
    print("Classification Report:")
    print(report)
# evaluate()

In [None]:
labels, predictions = valid(model, testing_loader)


In [61]:
def valid(model, testing_loader):
    # put model in evaluation mode
    model.eval()

    eval_loss, eval_accuracy = 0, 0
    nb_eval_examples, nb_eval_steps = 0, 0
    eval_preds, eval_labels = [], []

    with torch.no_grad():
        for idx, batch in enumerate(testing_loader):
            ids = batch['input_ids'].to(device, dtype=torch.long)
            mask = batch['attention_mask'].to(device, dtype=torch.long)
            labels = batch['labels'].to(device, dtype=torch.long)

            # Forward pass
            outputs = model(input_ids=ids, attention_mask=mask, labels=labels)
            loss = outputs.loss
            eval_logits = outputs.logits

            eval_loss += loss.item()
            nb_eval_steps += 1
            nb_eval_examples += labels.size(0)

            if idx % 100 == 0:
                loss_step = eval_loss / nb_eval_steps
                print(f"Validation loss per 100 evaluation steps: {loss_step}")

            # Compute evaluation accuracy
            active_logits = eval_logits.view(-1, model.config.num_labels)  # shape (batch_size * seq_len, num_labels)
            flattened_predictions = torch.argmax(active_logits, axis=1)  # shape (batch_size * seq_len,)

            for i in range(labels.size(0)):
                label = labels[i]
                pred = flattened_predictions.view(labels.size(0), labels.size(1))[i]

                active_accuracy = label != -100  # shape (seq_len,)
                label = torch.masked_select(label, active_accuracy)
                pred = torch.masked_select(pred, active_accuracy)

                eval_labels.append([ids_to_labels[id.item()] for id in label])
                eval_preds.append([ids_to_labels[id.item()] for id in pred])

                tmp_eval_accuracy = accuracy_score(label.cpu().numpy(), pred.cpu().numpy())
                eval_accuracy += tmp_eval_accuracy

    eval_loss = eval_loss / nb_eval_steps
    eval_accuracy = accuracy_score(eval_labels, eval_preds)
    F1_score = f1_score(eval_labels, eval_preds)
    print(f"Validation Loss: {eval_loss}")
    print(f"Validation Accuracy: {eval_accuracy}")
    print(f"F1 Score: {F1_score}")
    report = classification_report(eval_labels, eval_preds)
    print(report)

    return eval_labels, eval_preds

In [62]:
# Example usage
labels, predictions = valid(model, testing_loader)


Validation loss per 100 evaluation steps: 0.0824962928891182
Validation Loss: 0.058055733434028096
Validation Accuracy: 0.9894736842105263
F1 Score: 0.8505854800936767
              precision    recall  f1-score   support

        CHAR       0.81      0.85      0.83       820
         LOC       0.88      0.97      0.92       216
         ORG       0.00      0.00      0.00         2

   micro avg       0.83      0.87      0.85      1038
   macro avg       0.56      0.61      0.59      1038
weighted avg       0.83      0.87      0.85      1038



In [55]:
from seqeval.metrics import classification_report

print(classification_report(labels, predictions))

              precision    recall  f1-score   support

        CHAR       0.81      0.85      0.83       820
         LOC       0.88      0.97      0.92       216
         ORG       0.00      0.00      0.00         2

   micro avg       0.83      0.87      0.85      1038
   macro avg       0.56      0.61      0.59      1038
weighted avg       0.83      0.87      0.85      1038



  _warn_prf(average, modifier, msg_start, len(result))


In [56]:
from seqeval.metrics import f1_score, accuracy_score
print(f1_score(labels, predictions))
print(accuracy_score(labels, predictions))

0.8505854800936767
0.9894736842105263


In [58]:
predictions

[['O', 'O', 'O'],
 ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'],
 ['O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-CHAR',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O'],
 ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'],
 ['O', 'O', 'O', 'O', 'O'],
 ['O', 'O', 'O', 'O', 'O', 'O', 'O'],
 ['O', 'O', 'O', 'O'],
 ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'],
 ['O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O'],
 ['O', 'O', 'O', 'B-CHAR'],
 ['O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-LOC',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-CHAR',
  'O',
  'O',
  'O',
 

In [None]:
tokenizer = BertTokenizerFast.from_pretrained("curve_tokenizer")


In [None]:
sentence = "Oooohhh Frodo Aaaahh Gimli My precious Wake up Wake up Wake up sleepies We must go yes we must go at once"
words = sentence.split()
inputs = tokenizer(words,
             is_split_into_words=True,
             return_offsets_mapping=True,
             padding='max_length',
             truncation=True,
             max_length=MAX_LEN,
             return_tensors="pt")


# move to gpu
ids = inputs["input_ids"].to(device)
mask = inputs["attention_mask"].to(device)
# forward pass
outputs = model(ids, attention_mask=mask)
logits = outputs[0]

active_logits = logits.view(-1, model.num_labels) # shape (batch_size * seq_len, num_labels)
flattened_predictions = torch.argmax(active_logits, axis=1) # shape (batch_size*seq_len,) - predictions at the token level

tokens = tokenizer.convert_ids_to_tokens(ids.squeeze().tolist())
token_predictions = [ids_to_labels[i] for i in flattened_predictions.cpu().numpy()]
wp_preds = list(zip(tokens, token_predictions)) # list of tuples. Each tuple = (wordpiece, prediction)

prediction = []
for token_pred, mapping in zip(wp_preds, inputs["offset_mapping"].squeeze().tolist()):
  #only predictions on first word pieces are important
  if mapping[0] == 0 and mapping[1] != 0:
    prediction.append(token_pred[1])
  else:
    continue

print(sentence.split())
print(prediction)

In [None]:
import torch

# Ensure MAX_LEN is defined
MAX_LEN = 128

sentence = "Oh Smeagol Ive got one Ive got a fish Smeagol Smeagol Pull it in Go on go on go on pull it in Arrghh Deagol Deagol Deagol Give us that Deagol my love Why"
words = sentence.split()
inputs = tokenizer(words,
             is_split_into_words=True,
             return_offsets_mapping=True,
             padding='max_length',
             truncation=True,
             max_length=MAX_LEN,
             return_tensors="pt")

# move to gpu
ids = inputs["input_ids"].to(device)
mask = inputs["attention_mask"].to(device)
# forward pass
outputs = model(ids, attention_mask=mask)
logits = outputs.logits

active_logits = logits.view(-1, model.config.num_labels) # shape (batch_size * seq_len, num_labels)
flattened_predictions = torch.argmax(active_logits, axis=1) # shape (batch_size*seq_len,) - predictions at the token level

tokens = tokenizer.convert_ids_to_tokens(ids.squeeze().tolist())
token_predictions = [ids_to_labels[i] for i in flattened_predictions.cpu().numpy()]
wp_preds = list(zip(tokens, token_predictions)) # list of tuples. Each tuple = (wordpiece, prediction)

prediction = []
for token_pred, mapping in zip(wp_preds, inputs["offset_mapping"].squeeze().tolist()):
  # only predictions on first word pieces are important
  if mapping[0] == 0 and mapping[1] != 0:
    prediction.append(token_pred[1])
  else:
    continue

print(words)
print(prediction)


In [None]:
from transformers import pipeline

In [None]:
# import os

# directory = "./model"

# if not os.path.exists(directory):
#     os.makedirs(directory)

# # save vocabulary of the tokenizer
# tokenizer.save_vocabulary(directory)
# # save the model weights and its configuration file
# model.save_pretrained(directory)
# print('All files saved')


In [None]:

# # Display the mappings
# print("labels_to_ids:", labels_to_ids)
# print("ids_to_labels:", ids_to_labels)

In [None]:
# import json
# config = json.load(open('bert_train/config.json'))
# config['id2label'] = ids_to_labels
# config['label2id'] = labels_to_ids
# json.dump(config, open('bert_train/config.json', 'w'))
# model = BertForTokenClassification.from_pretrained('bert_train', num_labels=len(labels_to_ids))
# model.to(device)

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader, Subset
from transformers import BertTokenizerFast, BertForTokenClassification
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from seqeval.metrics import f1_score, accuracy_score
from seqeval.metrics import classification_report as seqeval_classification_report




MAX_LEN = 174
BATCH_SIZE = 64
EPOCHS = 5
MAX_GRAD_NORM = 5
MODEL_NAME = 'bert-base-uncased'
MODEL_PATH = 'ner_model_from_final1.pth'
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'
print(device)

# Data Reading and Preprocessing Functions

class dataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_len):
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __getitem__(self, index):
        sentence = self.data.sentence[index].strip().split()
        word_labels = self.data.word_labels[index].split(",")

        encoding = self.tokenizer(sentence,
                                  is_split_into_words=True,
                                  return_offsets_mapping=True,
                                  padding='max_length',
                                  truncation=True,
                                  max_length=self.max_len)

        labels = [labels_to_ids[label] for label in word_labels]

        encoded_labels = np.ones(len(encoding["offset_mapping"]), dtype=int) * -100
        i = 0
        for idx, mapping in enumerate(encoding["offset_mapping"]):
            if mapping[0] == 0 and mapping[1] != 0:
                encoded_labels[idx] = labels[i]
                i += 1

        item = {key: torch.as_tensor(val) for key, val in encoding.items()}
        item['labels'] = torch.as_tensor(encoded_labels)

        return item

    def __len__(self):
        return self.len

def read_data(file_path):
    sentences, labels = [], []
    sentence, label = [], []
    with open(file_path, encoding="utf-8") as file:
        for line in file:
            if line.startswith("#"):
                continue
            elif line == "\n":
                if sentence:
                    sentences.append(sentence)
                    labels.append(label)
                    sentence, label = [], []
            else:
                parts = line.strip().split("\t")
                sentence.append(parts[1].lower())  # Convert the token to lowercase before appending
                label.append(clean_tag(parts[2]))
    if sentence:
        sentences.append(sentence)
        labels.append(label)
    return sentences, labels

def clean_tag(tag):
    if tag.count('-') > 1:
        prefix, entity = tag.split('-', 1)
        tag = f"{prefix}-{entity.replace('-', '')}"
    return tag

train_tokens, train_tags = read_data("./tagged_sentences_train.iob2")
test_tokens, test_tags = read_data("./tagged_sentences_test.iob2")
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

data = {'sentence': [" ".join(sentence) for sentence in train_tokens],
        'word_labels': [",".join(tags) for tags in train_tags]}

df = pd.DataFrame(data)

data_test = {'sentence': [" ".join(sentence) for sentence in test_tokens],
             'word_labels': [",".join(tags) for tags in test_tags]}

df_test = pd.DataFrame(data_test)

# Create mappings
all_tags = [tag for tags in df['word_labels'] for tag in tags.split(",")]
unique_tags = set(all_tags)
labels_to_ids = {k: v for v, k in enumerate(unique_tags)}
ids_to_labels = {v: k for k, v in labels_to_ids.items()}

# Display the mappings
print("labels_to_ids:", labels_to_ids)
print("ids_to_labels:", ids_to_labels)

# Create training and testing datasets
training_set = dataset(df, tokenizer, MAX_LEN)
testing_set = dataset(df_test, tokenizer, MAX_LEN)

test_params = {'batch_size': BATCH_SIZE, 'shuffle': False, 'num_workers': 0}
testing_loader = DataLoader(testing_set, **test_params)

def train_model(training_subset, model, optimizer):
    model.train()
    tr_loss = 0
    nb_tr_steps = 0

    training_loader = DataLoader(training_subset, batch_size=BATCH_SIZE, shuffle=True)
    for idx, batch in enumerate(training_loader):
        ids = batch['input_ids'].to(device, dtype=torch.long)
        mask = batch['attention_mask'].to(device, dtype=torch.long)
        labels = batch['labels'].to(device, dtype=torch.long)

        outputs = model(input_ids=ids, attention_mask=mask, labels=labels)
        loss = outputs.loss
        tr_logits = outputs.logits

        tr_loss += loss.item()
        nb_tr_steps += 1

        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=MAX_GRAD_NORM)
        optimizer.step()

    epoch_loss = tr_loss / nb_tr_steps
    return epoch_loss

def evaluate_model(model, testing_loader):
    model.eval()
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps = 0
    eval_preds, eval_labels = [], []

    with torch.no_grad():
        for idx, batch in enumerate(testing_loader):
            ids = batch['input_ids'].to(device, dtype=torch.long)
            mask = batch['attention_mask'].to(device, dtype=torch.long)
            labels = batch['labels'].to(device, dtype=torch.long)

            outputs = model(input_ids=ids, attention_mask=mask, labels=labels)
            loss = outputs.loss
            eval_logits = outputs.logits

            eval_loss += loss.item()
            nb_eval_steps += 1

            flattened_targets = labels.view(-1)
            active_logits = eval_logits.view(-1, model.config.num_labels)
            flattened_predictions = torch.argmax(active_logits, axis=1)

            active_accuracy = labels.view(-1) != -100
            labels = torch.masked_select(flattened_targets, active_accuracy)
            predictions = torch.masked_select(flattened_predictions, active_accuracy)

            eval_labels.extend(labels.cpu().numpy())
            eval_preds.extend(predictions.cpu().numpy())

    eval_loss = eval_loss / nb_eval_steps
    eval_accuracy = accuracy_score(eval_labels, eval_preds)
    report = classification_report(eval_labels, eval_preds, labels=[labels_to_ids[label] for label in sorted(labels_to_ids)], target_names=sorted(labels_to_ids))

    return eval_loss, eval_accuracy, report


from seqeval.metrics import classification_report as seqeval_classification_report
import torch
from sklearn.metrics import accuracy_score

def evaluate_model1(model, testing_loader):
    model.eval()
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps = 0
    eval_preds, eval_labels = [], []

    with torch.no_grad():
        for idx, batch in enumerate(testing_loader):
            ids = batch['input_ids'].to(device, dtype=torch.long)
            mask = batch['attention_mask'].to(device, dtype=torch.long)
            labels = batch['labels'].to(device, dtype=torch.long)

            outputs = model(input_ids=ids, attention_mask=mask, labels=labels)
            loss = outputs.loss
            eval_logits = outputs.logits

            eval_loss += loss.item()
            nb_eval_steps += 1

            flattened_targets = labels.view(-1)
            active_logits = eval_logits.view(-1, model.config.num_labels)
            flattened_predictions = torch.argmax(active_logits, axis=1)

            active_accuracy = labels.view(-1) != -100
            labels = torch.masked_select(flattened_targets, active_accuracy)
            predictions = torch.masked_select(flattened_predictions, active_accuracy)

            eval_labels.extend(labels.cpu().numpy().tolist())
            eval_preds.extend(predictions.cpu().numpy().tolist())

    eval_loss = eval_loss / nb_eval_steps
    eval_accuracy = accuracy_score(eval_labels, eval_preds)
    
    # Convert labels and predictions to a list of lists for seqeval
    eval_labels_nested = [eval_labels]
    eval_preds_nested = [eval_preds]
    
    report = seqeval_classification_report(eval_labels_nested, eval_preds_nested)

    return eval_loss, eval_accuracy, report, reports


def eval1(model, testing_loader):
    loss_values, validation_loss_values = [], [] 
    model.eval()
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0
    predictions, true_labels = [], []
    reports = []
    for idx, batch in enumerate(testing_loader):
        ids = batch['input_ids'].to(device, dtype=torch.long)
        mask = batch['attention_mask'].to(device, dtype=torch.long)
        labels = batch['labels'].to(device, dtype=torch.long)

        with torch.no_grad():
            outputs = model(ids, token_type_ids=None, attention_mask=mask, labels=labels)

        logits = outputs[1].detach().cpu().numpy()
        label_ids = labels.to('cpu').numpy()
        eval_loss += outputs[0].mean().item()

        predictions.extend([list(p) for p in np.argmax(logits, axis=2)])
        true_labels.extend(label_ids)

    # Ensure tag values are correctly ordered
    tag_values = {0: 'B-CHAR', 1: 'I-LOC', 2: 'I-CHAR', 3: 'O', 4: 'B-ORG', 5: 'B-LOC', -100: 'PAD'}

    # Converting true labels and predictions to tag names
    pred_tags = [tag_values[int(p_i)] for p, l in zip(predictions, true_labels) for p_i, l_i in zip(p, l) if int(l_i) != -100]
    valid_tags = [tag_values[int(l_i)] for l in true_labels for l_i in l if int(l_i) != -100]

    eval_loss = eval_loss / len(testing_loader)
    validation_loss_values.append(eval_loss)
    print("Validation loss: {}".format(eval_loss))
    valid_tags = [valid_tags]
    pred_tags = [pred_tags]
    report = seqeval_classification_report(valid_tags, pred_tags)
    print("Validation Accuracy: {}".format(accuracy_score(valid_tags, pred_tags)))
    eval_accuaracy = accuracy_score(valid_tags, pred_tags)
    print("Validation F1-Score: {}".format(f1_score(valid_tags, pred_tags)))
    print(report)
    return  eval_loss, eval_accuracy, report, reports
    
    
def learning_curve(training_set, testing_loader, model, optimizer, subset_sizes, epochs=EPOCHS):
    train_losses = []
    eval_losses = []
    eval_accuracies = []
    reports = []

    for subset_size in subset_sizes:
        print(f"Training with subset size: {subset_size}")
        subset_indices = list(range(subset_size))
        training_subset = Subset(training_set, subset_indices)

        for epoch in range(epochs):
            train_loss = train_model(training_subset, model, optimizer)
            print(f"Epoch {epoch + 1}/{epochs}, Train Loss: {train_loss}")

        eval_loss, eval_accuracy, eval_report = evaluate_model(model, testing_loader)
        print(f"Eval Loss: {eval_loss}, Eval Accuracy: {eval_accuracy}")
        print(eval_report)

        train_losses.append(train_loss)
        eval_losses.append(eval_loss)
        eval_accuracies.append(eval_accuracy)
        reports.append(eval_report)

    return train_losses, eval_losses, eval_accuracies, reports   







def learning_curve1(training_set, testing_loader, model, optimizer, subset_sizes, epochs=EPOCHS):
    train_losses = []
    eval_losses = []
    eval_accuracies = []
    reports = []

    for subset_size in subset_sizes:
        print(f"Training with subset size: {subset_size}")
        subset_indices = list(range(subset_size))
        training_subset = Subset(training_set, subset_indices)

        for epoch in range(epochs):
            train_loss = train_model(training_subset, model, optimizer)
            print(f"Epoch {epoch + 1}/{epochs}, Train Loss: {train_loss}")
        loss_values, validation_loss_values = [], []   
        model.eval()
        eval_loss, eval_accuracy = 0, 0
        nb_eval_steps, nb_eval_examples = 0, 0
        predictions, true_labels = [], []

        for idx, batch in enumerate(testing_loader):
            ids = batch['input_ids'].to(device, dtype=torch.long)
            mask = batch['attention_mask'].to(device, dtype=torch.long)
            labels = batch['labels'].to(device, dtype=torch.long)

            with torch.no_grad():
                outputs = model(ids, token_type_ids=None, attention_mask=mask, labels=labels)

            logits = outputs[1].detach().cpu().numpy()
            label_ids = labels.to('cpu').numpy()
            eval_loss += outputs[0].mean().item()

            predictions.extend([list(p) for p in np.argmax(logits, axis=2)])
            true_labels.extend(label_ids)
        print('hi1')
        # Ensure tag values are correctly ordered
        tag_values = {0: 'B-CHAR', 1: 'I-LOC', 2: 'I-CHAR', 3: 'O', 4: 'B-ORG', 5: 'B-LOC', -100: 'PAD'}

        # Converting true labels and predictions to tag names
        pred_tags = [tag_values[int(p_i)] for p, l in zip(predictions, true_labels) for p_i, l_i in zip(p, l) if int(l_i) != -100]
        valid_tags = [tag_values[int(l_i)] for l in true_labels for l_i in l if int(l_i) != -100]

        eval_loss = eval_loss / len(testing_loader)
        validation_loss_values.append(eval_loss)
        print("Validation loss: {}".format(eval_loss))
        print('here3')
        valid_tags = [valid_tags]
        pred_tags = [pred_tags]
        report = seqeval_classification_report(valid_tags, pred_tags)
        print("Validation Accuracy: {}".format(accuracy_score(valid_tags, pred_tags)))
        print("Validation F1-Score: {}".format(f1_score(valid_tags, pred_tags)))
        print(report)    
    

# Define subset sizes and initialize model and optimizer
subset_sizes = [1, 2, 5, 8, 10, 15, 20, 30, 40, 60, 80, 100, 140, 180, 230, 300, 500, 700, 900, 1100, 1300, 1500, 1700, 1900, 2100]
model = BertForTokenClassification.from_pretrained(MODEL_NAME, num_labels=len(labels_to_ids))
model.to(device)
optimizer = torch.optim.Adam(params=model.parameters(), lr=3e-5)

# Generate learning curve
train_losses, eval_losses, eval_accuracies, reports = learning_curve1(training_set, testing_loader, model, optimizer, subset_sizes)
print('train_losses: ', train_losses)
print('eval_losses: ', eval_losses)
print('eval_accuracies: ', eval_accuracies)
print('reports: ', reports)

# Plot learning curve
plt.figure(figsize=(12, 6))
plt.plot(subset_sizes, eval_accuracies, label='Evaluation Accuracy')
plt.plot(subset_sizes, train_losses, label='Training Loss')
plt.plot(subset_sizes, eval_losses, label='Evaluation Loss')
plt.xlabel('Training Set Size')
plt.ylabel('Performance')
plt.title('Learning Curve')
plt.legend()
plt.show()


In [None]:
# model.save_pretrained("curve_train")

In [None]:
# tokenizer.save_pretrained('curve_tokenizer')

In [None]:
plt.savefig("learning_curve.png")

In [None]:
import torch
from sklearn.metrics import accuracy_score
from seqeval.metrics import classification_report as seqeval_classification_report

def evaluate_model(model, testing_loader, label_map):
    model.eval()
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps = 0
    eval_preds, eval_labels = [], []
    predictions_iob2 = []

    with torch.no_grad():
        for idx, batch in enumerate(testing_loader):
            ids = batch['input_ids'].to(device, dtype=torch.long)
            mask = batch['attention_mask'].to(device, dtype=torch.long)
            labels = batch['labels'].to(device, dtype=torch.long)

            outputs = model(input_ids=ids, attention_mask=mask, labels=labels)
            loss = outputs.loss
            eval_logits = outputs.logits

            eval_loss += loss.item()
            nb_eval_steps += 1

            flattened_targets = labels.view(-1)
            active_logits = eval_logits.view(-1, model.config.num_labels)
            flattened_predictions = torch.argmax(active_logits, axis=1)

            active_accuracy = labels.view(-1) != -100
            labels = torch.masked_select(flattened_targets, active_accuracy)
            predictions = torch.masked_select(flattened_predictions, active_accuracy)

            eval_labels.extend(labels.cpu().numpy().tolist())
            eval_preds.extend(predictions.cpu().numpy().tolist())

            # Convert predictions and labels to IOB2 format
            for prediction, label in zip(predictions.cpu().numpy().tolist(), labels.cpu().numpy().tolist()):
                predictions_iob2.append((label_map[label], label_map[prediction]))

    eval_loss = eval_loss / nb_eval_steps
    eval_accuracy = accuracy_score(eval_labels, eval_preds)

    return eval_loss, eval_accuracy, predictions_iob2


In [None]:
def save_predictions_iob2(predictions, file_path):
    with open(file_path, 'w') as f:
        for true_label, pred_label in predictions:
            f.write(f"{true_label} {pred_label}\n")

# Example usage:
eval_loss, eval_accuracy, predictions_iob2 = evaluate_model(model, testing_loader, ids_to_labels)
save_predictions_iob2(predictions_iob2, 'predictions.iob2')


In [None]:
from seqeval.metrics import f1_score, accuracy_score
from seqeval.metrics import classification_report as seqeval_classification_report
import numpy as np
import torch

loss_values, validation_loss_values = [], []   
model.eval()
eval_loss, eval_accuracy = 0, 0
nb_eval_steps, nb_eval_examples = 0, 0
predictions, true_labels = [], []

for idx, batch in enumerate(testing_loader):
    ids = batch['input_ids'].to(device, dtype=torch.long)
    mask = batch['attention_mask'].to(device, dtype=torch.long)
    labels = batch['labels'].to(device, dtype=torch.long)

    with torch.no_grad():
        outputs = model(ids, token_type_ids=None, attention_mask=mask, labels=labels)
    
    logits = outputs[1].detach().cpu().numpy()
    label_ids = labels.to('cpu').numpy()
    eval_loss += outputs[0].mean().item()
    
    predictions.extend([list(p) for p in np.argmax(logits, axis=2)])
    true_labels.extend(label_ids)

# Ensure tag values are correctly ordered
tag_values = {0: 'B-CHAR', 1: 'I-LOC', 2: 'I-CHAR', 3: 'O', 4: 'B-ORG', 5: 'B-LOC', -100: 'PAD'}

# Converting true labels and predictions to tag names
pred_tags = [tag_values[int(p_i)] for p, l in zip(predictions, true_labels) for p_i, l_i in zip(p, l) if int(l_i) != -100]
valid_tags = [tag_values[int(l_i)] for l in true_labels for l_i in l if int(l_i) != -100]

eval_loss = eval_loss / len(testing_loader)
validation_loss_values.append(eval_loss)
print("Validation loss: {}".format(eval_loss))
valid_tags = [valid_tags]
pred_tags = [pred_tags]
report = seqeval_classification_report(valid_tags, pred_tags)
print("Validation Accuracy: {}".format(accuracy_score(valid_tags, pred_tags)))
print("Validation F1-Score: {}".format(f1_score(valid_tags, pred_tags)))
print(report)