# Assignment 3 - NLP Aris Tsilifonis - mtn2323

In [1]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


# Question 1 - Execute the code snippet below and examine accuracy and macro-average accuracy of the given test set.

In [2]:
#
# Named-entity recognition using BERT
# Dataset: https://www.kaggle.com/datasets/alaakhaled/conll003-englishversion
#

# dependencies
import torch
import torch.optim as optim
from torchtext.vocab import build_vocab_from_iterator
from transformers import BertForTokenClassification, BertTokenizerFast
from sklearn.metrics import accuracy_score, balanced_accuracy_score, classification_report
import tqdm
tqdmn = tqdm.notebook.tqdm

# hyper-parameters
EPOCHS = 3
BATCH_SIZE = 8
LR = 1e-5

# the path of the data files
base_path = './drive/My Drive/CoNLL003/'

# use GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# read the data files
def load_sentences(filepath):

    sentences = []
    tokens = []
    pos_tags = []
    chunk_tags = []
    ner_tags = []

    with open(filepath, 'r') as f:

        for line in f.readlines():

            if (line == ('-DOCSTART- -X- -X- O\n') or line == '\n'):
                if len(tokens) > 0:
                    sentences.append({'tokens': tokens, 'pos_tags': pos_tags, 'chunk_tags': chunk_tags, 'ner_tags': ner_tags})
                    tokens = []
                    pos_tags = []
                    chunk_tags = []
                    ner_tags = []
            else:
                l = line.split(' ')
                tokens.append(l[0])
                pos_tags.append(l[1])
                chunk_tags.append(l[2])
                ner_tags.append(l[3].strip('\n'))

    return sentences

print('loading data')
train_sentences = load_sentences(base_path + 'train.txt')
test_sentences = load_sentences(base_path + 'test.txt')
valid_sentences = load_sentences(base_path + 'valid.txt')

# build tagset and tag ids
tags = [sentence['ner_tags'] for sentence in train_sentences]
tagmap = build_vocab_from_iterator(tags)
tagset = set([item for sublist in tags for item in sublist])
print('Tagset size:',len(tagset))

# load BERT tokenizer
bert_version = 'bert-base-uncased'
tokenizer = BertTokenizerFast.from_pretrained(bert_version)

# map tokens and tags to token ids and label ids
def align_label(tokens, labels):

    word_ids = tokens.word_ids()
    previous_word_idx = None
    label_ids = []
    for word_idx in word_ids:
        if word_idx is None:
            label_ids.append(-100)
        elif word_idx != previous_word_idx:
            try:
                label_ids.append(tagmap[labels[word_idx]])
            except:
                label_ids.append(-100)
        else:
                label_ids.append(-100)
        previous_word_idx = word_idx

    return label_ids

def encode(sentence):
    encodings = tokenizer(sentence['tokens'], truncation=True, padding='max_length', is_split_into_words=True)
    labels = align_label(encodings, sentence['ner_tags'])
    return { 'input_ids': torch.LongTensor(encodings.input_ids), 'attention_mask': torch.LongTensor(encodings.attention_mask), 'labels': torch.LongTensor(labels) }

print('encoding data')
train_dataset = [encode(sentence) for sentence in train_sentences]
valid_dataset = [encode(sentence) for sentence in valid_sentences]
test_dataset = [encode(sentence) for sentence in test_sentences]

# initialize the model including a classification layer with num_labels classes
print('initializing the model')
model = BertForTokenClassification.from_pretrained(bert_version, num_labels=len(tagset))
model.to(device)
optimizer = optim.AdamW(params=model.parameters(), lr=LR)

# prepare batches of data
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
valid_loader = torch.utils.data.DataLoader(valid_dataset, batch_size=BATCH_SIZE)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=BATCH_SIZE)

# evaluate the performance of the model
def EvaluateModel(model, data_loader):
    model.eval()
    with torch.no_grad():
        Y_actual, Y_preds = [],[]
        for i, batch in enumerate(tqdmn(data_loader)):
            # move the batch tensors to the same device as the model
            batch = { k: v.to(device) for k, v in batch.items() }
            # send 'input_ids', 'attention_mask' and 'labels' to the model
            outputs = model(**batch)
            # iterate through the examples
            for idx, _ in enumerate(batch['labels']):
                # get the true values
                true_values_all = batch['labels'][idx]
                true_values = true_values_all[true_values_all != -100]
                # get the predicted values
                pred_values = torch.argmax(outputs[1], dim=2)[idx]
                pred_values = pred_values[true_values_all != -100]
                # update the lists of true answers and predictions
                Y_actual.append(true_values)
                Y_preds.append(pred_values)
                #find_misclassifications(batch, true_values_all, true_values, pred_values, idx, tokenizer)
        Y_actual = torch.cat(Y_actual)
        Y_preds = torch.cat(Y_preds)
    # Return list of actual labels, predicted labels
    return Y_actual.detach().cpu().numpy(), Y_preds.detach().cpu().numpy()




loading data
Tagset size: 9


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

encoding data
initializing the model


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
# train the model
print('training the model')
for epoch in tqdmn(range(EPOCHS)):
    model.train()
    print('epoch',epoch+1)
    # iterate through each batch of the train data
    for i, batch in enumerate(tqdmn(train_loader)):
        # move the batch tensors to the same device as the model
        batch = { k: v.to(device) for k, v in batch.items() }
        # send 'input_ids', 'attention_mask' and 'labels' to the model
        outputs = model(**batch)
        loss = outputs[0]
        # set the gradients to zero
        optimizer.zero_grad()
        # propagate the loss backwards
        loss.backward()
        # update the model weights
        optimizer.step()
    # calculate performence on validation set
    Y_actual, Y_preds = EvaluateModel(model,valid_loader)
    print("\nValidation Accuracy : {:.3f}".format(accuracy_score(Y_actual, Y_preds)))
    print("\nValidation Macro-Accuracy : {:.3f}".format(balanced_accuracy_score(Y_actual, Y_preds)))

print('applying the model to the test set')
# apply the trained model to the test set
Y_actual, Y_preds = EvaluateModel(model,test_loader)

print("\nTest Accuracy : {:.3f}".format(accuracy_score(Y_actual, Y_preds)))
print("\nTest Macro-Accuracy : {:.3f}".format(balanced_accuracy_score(Y_actual, Y_preds)))
print("\nClassification Report : ")
print(classification_report(Y_actual, Y_preds,labels = tagmap(tagmap.get_itos()), target_names = tagmap.get_itos(), zero_division = 0))


training the model


  0%|          | 0/3 [00:00<?, ?it/s]

epoch 1


  0%|          | 0/1756 [00:00<?, ?it/s]

  0%|          | 0/407 [00:00<?, ?it/s]


Validation Accuracy : 0.984

Validation Macro-Accuracy : 0.918
epoch 2


  0%|          | 0/1756 [00:00<?, ?it/s]

  0%|          | 0/407 [00:00<?, ?it/s]


Validation Accuracy : 0.987

Validation Macro-Accuracy : 0.940
epoch 3


  0%|          | 0/1756 [00:00<?, ?it/s]

  0%|          | 0/407 [00:00<?, ?it/s]


Validation Accuracy : 0.987

Validation Macro-Accuracy : 0.938
applying the model to the test set


  0%|          | 0/432 [00:00<?, ?it/s]


Test Accuracy : 0.978

Test Macro-Accuracy : 0.910

Classification Report : 
              precision    recall  f1-score   support

           O       1.00      0.99      0.99     38323
       B-LOC       0.93      0.93      0.93      1668
       B-PER       0.97      0.97      0.97      1617
       B-ORG       0.87      0.90      0.88      1661
       I-PER       0.99      0.99      0.99      1156
       I-ORG       0.80      0.92      0.86       835
      B-MISC       0.83      0.84      0.83       702
       I-LOC       0.87      0.89      0.88       257
      I-MISC       0.62      0.75      0.68       216

    accuracy                           0.98     46435
   macro avg       0.87      0.91      0.89     46435
weighted avg       0.98      0.98      0.98     46435



In [4]:
print(Y_actual)
print(Y_preds)

[0 0 1 ... 0 2 0]
[0 0 1 ... 0 2 0]


In [5]:
print(test_sentences)
print(len(test_sentences))

3453


In [6]:
torch.save(model, './drive/My Drive/CoNLL003/model/model.pt')

In [7]:
model = torch.load( './drive/My Drive/CoNLL003/model/model.pt')

# Question 2

Choose one sentence from the test set with at least 10 tokens in which the model
fails to find the correct tags for some tokens. Show in detail which tokens
the correct tags are assigned and which tokens are wrong. Additionally, enter as input to
final model a new sentence (eg from an online newspaper of your choice) with
at least 10 tokens in which some nominal entities appear and point to
which tokens the model predicts correctly and which incorrectly.

# Find sentences with more than 10 tokens

In [8]:
def find_long_sentences(data, min_tokens=10):
    """
    Finds sentences in the dataset that have more than a specified number of tokens.

    Args:
        data (list of dicts): The dataset where each dictionary represents a sentence with tokens and other attributes.
        min_tokens (int): Minimum number of tokens a sentence must have to be included in the results.

    Returns:
        list of dicts: A list of sentences that have more than the specified number of tokens.
    """
    long_sentences = [sentence for sentence in data if len(sentence['tokens']) > min_tokens]
    return long_sentences

# Find sentences with more than 10 tokens
long_sentences = find_long_sentences(test_sentences)

# Display the results
for sentence in long_sentences:
    print("Sentence with more than 10 tokens:")
    print(sentence['tokens'])
    print()  # Print a newline for better readability between sentences

print(len(long_sentences))

Sentence with more than 10 tokens:
['SOCCER', '-', 'JAPAN', 'GET', 'LUCKY', 'WIN', ',', 'CHINA', 'IN', 'SURPRISE', 'DEFEAT', '.']

Sentence with more than 10 tokens:
['Japan', 'began', 'the', 'defence', 'of', 'their', 'Asian', 'Cup', 'title', 'with', 'a', 'lucky', '2-1', 'win', 'against', 'Syria', 'in', 'a', 'Group', 'C', 'championship', 'match', 'on', 'Friday', '.']

Sentence with more than 10 tokens:
['But', 'China', 'saw', 'their', 'luck', 'desert', 'them', 'in', 'the', 'second', 'match', 'of', 'the', 'group', ',', 'crashing', 'to', 'a', 'surprise', '2-0', 'defeat', 'to', 'newcomers', 'Uzbekistan', '.']

Sentence with more than 10 tokens:
['China', 'controlled', 'most', 'of', 'the', 'match', 'and', 'saw', 'several', 'chances', 'missed', 'until', 'the', '78th', 'minute', 'when', 'Uzbek', 'striker', 'Igor', 'Shkvyrin', 'took', 'advantage', 'of', 'a', 'misdirected', 'defensive', 'header', 'to', 'lob', 'the', 'ball', 'over', 'the', 'advancing', 'Chinese', 'keeper', 'and', 'into', 'an', 

# Detect missclassifications for sentences with more than 10 tokens. We provide those sentences as input to the model and compare predicted and true label of the token ( using NER Tags)

In [9]:
def evaluate_sentence(model, encoded_sentence, device):
    """Evaluate a single encoded sentence using the provided NER model."""
    model.eval()  # Set the model to evaluation mode
    with torch.no_grad():
        # Prepare the inputs
        input_ids = encoded_sentence['input_ids'].unsqueeze(0).to(device)  # Add batch dimension
        attention_mask = encoded_sentence['attention_mask'].unsqueeze(0).to(device)
        labels = encoded_sentence['labels'].unsqueeze(0).to(device)

        # Get model predictions
        outputs = model(input_ids, attention_mask=attention_mask)
        predictions = outputs.logits.argmax(dim=-1).squeeze()  # Remove batch dimension for comparison

    return predictions.cpu(), labels.squeeze().cpu()


def detect_misclassifications(model, long_sentences, tokenizer, device):
    misclassified_sentences = []
    for sentence in long_sentences:
        encoded = encode(sentence)
        predictions, labels = evaluate_sentence(model, encoded, device)

        # Filter out the special tokens before comparison
        valid_indices = labels != -100  # assuming -100 used for padding/ignore
        if not torch.equal(predictions[valid_indices], labels[valid_indices]):
            # Decode tokens for readability
            tokens = [tokenizer.decode([tok]) for tok in encoded['input_ids'][valid_indices]]
            misclassified_sentences.append({
                'tokens': tokens,
                'predicted': predictions[valid_indices].tolist(),
                'true': labels[valid_indices].tolist()
            })
    return misclassified_sentences

In [10]:
#Find sentences with misclassifications
misclassified = detect_misclassifications(model, long_sentences, tokenizer, device)

In [11]:
count=0
# Display the misclassified sentences
for info in misclassified:
    count+=1
    print("Misclassified Sentence:")
    print("Tokens:", info['tokens'])
    print("Predicted Tags:", info['predicted'])
    print("True Tags:", info['true'])
    print()
print("Misclassification count of (test set) sentences with more than 10 token: ", count)

Misclassified Sentence:
Tokens: ['soccer', '-', 'japan', 'get', 'lucky', 'win', ',', 'china', 'in', 'surprise', 'defeat', '.']
Predicted Tags: [0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0]
True Tags: [0, 0, 1, 0, 0, 0, 0, 2, 0, 0, 0, 0]

Misclassified Sentence:
Tokens: ['cut', 'announced', 'his', 'retirement', 'after', 'the', '1995', 'world', 'cup', ',', 'where', 'he', 'took', 'issue', 'with', 'being', 'dropped', 'from', 'the', 'italy', 'side', 'that', 'faced', 'england', 'in', 'the', 'pool', 'stages', '.']
Predicted Tags: [2, 0, 0, 0, 0, 0, 6, 6, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0]
True Tags: [2, 0, 0, 0, 0, 0, 6, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0]

Misclassified Sentence:
Tokens: ['results', 'of', 'asian', 'cup', 'group', 'c', 'matches', 'played', 'on', 'friday', ':']
Predicted Tags: [0, 0, 6, 8, 0, 8, 0, 0, 0, 0, 0]
True Tags: [0, 0, 6, 8, 0, 0, 0, 0, 0, 0, 0]

Misclassified Sentence:
Tokens: ['dutch', 'forward', 'reggie', 'blink', 'h

# Find sentences that are highly misclassified, define the rate of misclassfication with a threshold variable

In [12]:
from tabulate import tabulate

# Function to find sentences with more than N misclassified tokens
def find_highly_misclassified_sentences(sentences, threshold):
    highly_misclassified = []
    for sentence in sentences:
        miscount = sum(1 for actual, predicted in zip(sentence['true'], sentence['predicted']) if actual != predicted)
        if miscount > threshold:
            highly_misclassified.append(sentence)
    return highly_misclassified

# Find sentences with more than 5 misclassified tokens
threshold = 4
sentences_with_many_errors = find_highly_misclassified_sentences(misclassified, threshold)
print(sentences_with_many_errors[0])
misclassified_sentence = sentences_with_many_errors[0]

# Prepare data for tabulate
data = []
for token, actual, predicted in zip(misclassified_sentence['tokens'], misclassified_sentence['true'], misclassified_sentence['predicted']):
    is_misclassified = actual != predicted
    data.append({
        "Token": token,
        "Misclassification": is_misclassified,
        "Actual": actual,
        "Predicted": predicted
    })

# Using tabulate to print the table
print(tabulate(data, headers="keys"))


{'tokens': ['ncaa', 'american', 'football', 'state', "'", 'pace', 'first', 'repeat', 'lombard', 'award', 'winner', '.'], 'predicted': [6, 8, 8, 5, 0, 0, 0, 0, 6, 8, 0, 0], 'true': [3, 0, 6, 8, 0, 2, 0, 0, 6, 8, 0, 0]}
Token     Misclassification      Actual    Predicted
--------  -------------------  --------  -----------
ncaa      True                        3            6
american  True                        0            8
football  True                        6            8
state     True                        8            5
'         False                       0            0
pace      True                        2            0
first     False                       0            0
repeat    False                       0            0
lombard   False                       6            6
award     False                       8            8
winner    False                       0            0
.         False                       0            0


# Detect misclassifications in the selected appropriate sentence ( with more than 10 tokens)

In [13]:
example_sentence = load_sentences(base_path + 'example.txt')
#print(example_sentence)

# Find sentences with more than 10 tokens
long_sentence = find_long_sentences(example_sentence)

# Display the results
for sentence in long_sentence:
    print("Sentence with more than 10 tokens:")
    print(sentence['tokens'])
    print()  # Print a newline for better readability between sentences

print(len(long_sentence))

#Find sentences with misclassifications
misclassified2 = detect_misclassifications(model, long_sentence, tokenizer, device)

count=0
# Display the misclassified sentences
for info in misclassified2:
    count+=1
    print("Misclassified Sentence:")
    print("Tokens:", info['tokens'])
    print("Predicted Tags:", info['predicted'])
    print("True Tags:", info['true'])
    print()
print("Misclassification count of (test set) sentences with more than 10 token: ", count)


Sentence with more than 10 tokens:
['The', 'rapid', 'spread', 'of', 'COVID-19', 'in', 'early', '2020', 'prompted', 'the', 'World', 'Health', 'Organization', 'to', 'declare', 'a', 'global', 'pandemic', ',', 'affecting', 'millions', 'of', 'people', 'worldwide', '.']

1
Misclassified Sentence:
Tokens: ['the', 'rapid', 'spread', 'of', 'co', 'in', 'early', '2020', 'prompted', 'the', 'world', 'health', 'organization', 'to', 'declare', 'a', 'global', 'pan', ',', 'affecting', 'millions', 'of', 'people', 'worldwide', '.']
Predicted Tags: [0, 0, 0, 0, 6, 0, 0, 0, 0, 0, 3, 5, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
True Tags: [0, 0, 0, 0, 8, 0, 0, 0, 0, 0, 5, 5, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

Misclassification count of (test set) sentences with more than 10 token:  1


In [14]:
data = []
for token, predicted, true in zip(misclassified2[0]['tokens'], misclassified2[0]['predicted'], misclassified2[0]['true']):
    misclassified = 'Yes' if predicted != true else 'No'
    data.append((token, predicted, true, misclassified))

# Preparing headers for the table
headers = ["Token", "Predicted Tag", "True Tag", "Misclassification"]

# Print in tabulated format using the 'grid' style
print(tabulate(data, headers=headers))

Token           Predicted Tag    True Tag  Misclassification
------------  ---------------  ----------  -------------------
the                         0           0  No
rapid                       0           0  No
spread                      0           0  No
of                          0           0  No
co                          6           8  Yes
in                          0           0  No
early                       0           0  No
2020                        0           0  No
prompted                    0           0  No
the                         0           0  No
world                       3           5  Yes
health                      5           5  No
organization                5           5  No
to                          0           0  No
declare                     0           0  No
a                           0           0  No
global                      0           0  No
pan                         0           0  No
,                           0           0  No


# Classification report for the selected example sentence

In [15]:
from sklearn.metrics import accuracy_score, balanced_accuracy_score, classification_report

# Assuming tagmap.get_itos() returns list of tag names like ['O', 'B-LOC', 'I-PER', ...]
tag_names = tagmap.get_itos()  # List of tag names
tag_indices = list(range(len(tag_names)))  # Indices of the tags

# Get predictions and true labels from the first element in misclassified2
example_pred = misclassified2[0]['predicted']
example_true = misclassified2[0]['true']

# Print Accuracy and Macro-Accuracy
print("\nExample's Test Accuracy : {:.3f}".format(accuracy_score(example_true, example_pred)))
print("\nExample's Test Macro-Accuracy : {:.3f}".format(balanced_accuracy_score(example_true, example_pred)))

# Print Classification Report
print("\nClassification Report : ")
print(classification_report(example_true, example_pred, labels=tag_indices, target_names=tag_names, zero_division=0))


Example's Test Accuracy : 0.920

Example's Test Macro-Accuracy : 0.556

Classification Report : 
              precision    recall  f1-score   support

           O       1.00      1.00      1.00        21
       B-LOC       0.00      0.00      0.00         0
       B-PER       0.00      0.00      0.00         0
       B-ORG       0.00      0.00      0.00         0
       I-PER       0.00      0.00      0.00         0
       I-ORG       1.00      0.67      0.80         3
      B-MISC       0.00      0.00      0.00         0
       I-LOC       0.00      0.00      0.00         0
      I-MISC       0.00      0.00      0.00         1

   micro avg       0.92      0.92      0.92        25
   macro avg       0.22      0.19      0.20        25
weighted avg       0.96      0.92      0.94        25





# Question 3

The align_label function is placed within encode_sentence to match the tagset labels with the appropriate encoded tokens. It processes each token by its word ID, assigning labels based on a predefined tag mapping. This function can handle several scnerarios, when the model needs to consider padding tokens, as well new entities or words. Also, it helps the model identify tokens that are part of the same entity. The output is an aligned mapping of label IDs to the corresponding tokens.

The align_label function ensures that instances like "well-fancied" are treated correctly by the encoder. The tokenizer will split this word into three token: "well","-","fancied". This function will assign -100 to the last two tokens of the sequence (namely  "-","fancied") . As a result, the model recognizes that these tokens are related to the first one (namely "well"). In this way, the original sequence is represented correctly and the model can understand the meaning of it. Also, this function assigns -100 to the tokens that the model should ignore in the train phase. Consistent Entity Recognition is achieved because ensuring that only the first token of a split entity word will carry the label prevents the model from mistakenly treating each part of a split word as a separate entity, which can skew recognition accuracy. Then, the model concentrates on tokens that represent actual words or meaningful parts of words. Finally, the model can learn how labels are associated with words and phrases in the text.



# Question 4: Freeze weights of pretrained - BERT, consider only classification weights at the training

## Minor modifications were implemented in the code. The cells below highlight some variations compared to the original code. Specifically, specific weights were frozen, and the optimizer was adjusted.

In [16]:
# Initialize the model3 including a classification layer with num_labels classes
model3 = BertForTokenClassification.from_pretrained(bert_version, num_labels=len(tagset))
model3.to(device)

# Freeze all the parameters in the BERT model3
for param in model3.bert.parameters():
    param.requires_grad = False

# Optionally, ensure the classifier layer is trainable
for param in model3.classifier.parameters():
    param.requires_grad = True

# Modify optimizer to only update the parameters that require gradients
optimizer2 = optim.AdamW(params=[p for p in model3.parameters() if p.requires_grad], lr=LR)


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [17]:
frozen_parameters = sum(p.numel() for p in model3.parameters() if not p.requires_grad)
trainable_parameters = sum(p.numel() for p in model3.parameters() if p.requires_grad)

print(f"Total frozen parameters: {frozen_parameters}")
print(f"Total trainable parameters: {trainable_parameters}")


Total frozen parameters: 108891648
Total trainable parameters: 6921


In [18]:
# train the model
print('training the model')
for epoch in tqdmn(range(EPOCHS)):
    model3.train()
    print('epoch',epoch+1)
    # iterate through each batch of the train data
    for i, batch in enumerate(tqdmn(train_loader)):
        # move the batch tensors to the same device as the model
        batch = { k: v.to(device) for k, v in batch.items() }
        # send 'input_ids', 'attention_mask' and 'labels' to the model
        outputs = model3(**batch)
        loss = outputs[0]
        # set the gradients to zero
        optimizer2.zero_grad()
        # propagate the loss backwards
        loss.backward()
        # update the model weights
        optimizer2.step()
    # calculate performence on validation set
    Y_actual, Y_preds = EvaluateModel(model3,valid_loader)
    print("\nValidation Accuracy : {:.3f}".format(accuracy_score(Y_actual, Y_preds)))
    print("\nValidation Macro-Accuracy : {:.3f}".format(balanced_accuracy_score(Y_actual, Y_preds)))

print('applying the model to the test set')
# apply the trained model to the test set
Y_actual, Y_preds = EvaluateModel(model3,test_loader)

print("\nTest Accuracy : {:.3f}".format(accuracy_score(Y_actual, Y_preds)))
print("\nTest Macro-Accuracy : {:.3f}".format(balanced_accuracy_score(Y_actual, Y_preds)))
print("\nClassification Report : ")
print(classification_report(Y_actual, Y_preds,labels = tagmap(tagmap.get_itos()), target_names = tagmap.get_itos(), zero_division = 0))

training the model


  0%|          | 0/3 [00:00<?, ?it/s]

epoch 1


  0%|          | 0/1756 [00:00<?, ?it/s]

  0%|          | 0/407 [00:00<?, ?it/s]


Validation Accuracy : 0.829

Validation Macro-Accuracy : 0.127
epoch 2


  0%|          | 0/1756 [00:00<?, ?it/s]

  0%|          | 0/407 [00:00<?, ?it/s]


Validation Accuracy : 0.836

Validation Macro-Accuracy : 0.141
epoch 3


  0%|          | 0/1756 [00:00<?, ?it/s]

  0%|          | 0/407 [00:00<?, ?it/s]


Validation Accuracy : 0.852

Validation Macro-Accuracy : 0.189
applying the model to the test set


  0%|          | 0/432 [00:00<?, ?it/s]


Test Accuracy : 0.846

Test Macro-Accuracy : 0.193

Classification Report : 
              precision    recall  f1-score   support

           O       0.85      0.99      0.92     38323
       B-LOC       0.74      0.28      0.41      1668
       B-PER       0.79      0.12      0.21      1617
       B-ORG       0.60      0.25      0.36      1661
       I-PER       0.85      0.08      0.14      1156
       I-ORG       0.00      0.00      0.00       835
      B-MISC       0.00      0.00      0.00       702
       I-LOC       0.00      0.00      0.00       257
      I-MISC       0.00      0.00      0.00       216

    accuracy                           0.85     46435
   macro avg       0.43      0.19      0.23     46435
weighted avg       0.80      0.85      0.80     46435



## The results significantly deteriorated compared to the initial experiment where we did not freeze the weights. Specifically, the Test-Macro accuracy has declined by approximately fivefold compared to the model discussed in question 1. Moreover, the overall test accuracy has dropped by about 10%. These outcomes underscore the importance of not freezing the pretrained BERT weights, as they are essential for maintaining strong performance in the model. This conclusion is further supported by the classification report, where all scores have decreased relative to those obtained when the model weights were not frozen.

# Question 5: Combine train and validation set to test model's accuracy

We changed the train loader and omitted validation during training. The tagset was altered since it is based on the train loader. New encodings were produced according to the new dataset

In [19]:
# Load and combine datasets
print('loading data')
train_sentences = load_sentences(base_path + 'train.txt')
test_sentences = load_sentences(base_path + 'test.txt')
valid_sentences = load_sentences(base_path + 'valid.txt')

# Combining training and validation data
combined_train_sentences = train_sentences + valid_sentences
# build tagset and tag ids
tags = [sentence['ner_tags'] for sentence in combined_train_sentences]
tagmap = build_vocab_from_iterator(tags)
tagset = set([item for sublist in tags for item in sublist])
print('Tagset size:',len(tagset))


# Encode all data
print('encoding data')
combined_train_dataset = [encode(sentence) for sentence in combined_train_sentences]
test_dataset = [encode(sentence) for sentence in test_sentences]

# Initialize the model
print('initializing the model')
model4 = BertForTokenClassification.from_pretrained(bert_version, num_labels=len(tagset))
model4.to(device)
optimizer3 = optim.AdamW(params=model4.parameters(), lr=LR)

# Prepare the new combined train loader
train_loader = torch.utils.data.DataLoader(combined_train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=BATCH_SIZE)

# Training the model using the combined dataset
print('training the model')
for epoch in tqdmn(range(EPOCHS)):
    model4.train()
    print('epoch', epoch + 1)
    for i, batch in enumerate(tqdmn(train_loader)):
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model4(**batch)
        loss = outputs[0]
        optimizer3.zero_grad()
        loss.backward()
        optimizer3.step()

print('applying the model to the test set')
Y_actual, Y_preds = EvaluateModel(model4, test_loader)

print("\nTest Accuracy : {:.3f}".format(accuracy_score(Y_actual, Y_preds)))
print("\nTest Macro-Accuracy : {:.3f}".format(balanced_accuracy_score(Y_actual, Y_preds)))
print("\nClassification Report : ")
print(classification_report(Y_actual, Y_preds, labels=tagmap(tagmap.get_itos()), target_names=tagmap.get_itos(), zero_division=0))


loading data
Tagset size: 9
encoding data
initializing the model


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


training the model


  0%|          | 0/3 [00:00<?, ?it/s]

epoch 1


  0%|          | 0/2162 [00:00<?, ?it/s]

epoch 2


  0%|          | 0/2162 [00:00<?, ?it/s]

epoch 3


  0%|          | 0/2162 [00:00<?, ?it/s]

applying the model to the test set


  0%|          | 0/432 [00:00<?, ?it/s]


Test Accuracy : 0.979

Test Macro-Accuracy : 0.905

Classification Report : 
              precision    recall  f1-score   support

           O       1.00      0.99      0.99     38323
       B-LOC       0.93      0.92      0.93      1668
       B-PER       0.98      0.97      0.97      1617
       B-ORG       0.89      0.91      0.90      1661
       I-PER       0.99      0.99      0.99      1156
       I-ORG       0.78      0.91      0.84       835
      B-MISC       0.84      0.83      0.84       702
      I-MISC       0.64      0.73      0.68       216
       I-LOC       0.81      0.89      0.85       257

    accuracy                           0.98     46435
   macro avg       0.87      0.91      0.89     46435
weighted avg       0.98      0.98      0.98     46435



## It is observed that the accuracy has improved by 0.1% compared to the initial experiment. However, the Test-Macro accuracy is 0.5% lower. Therefore, combining the validation and train sets into a single dataset does not significantly alter the model's overall accuracy. It slightly improves the test accuracy while marginally reducing the Test-Macro accuracy.

In [20]:
torch.save(model4, './drive/My Drive/CoNLL003/model/model4.pt')

In [21]:
model4 = torch.load( './drive/My Drive/CoNLL003/model/model4.pt')

In [22]:
# Combining training and validation data
combined_train_sentences = train_sentences + valid_sentences
print(len(combined_train_sentences))

17291


#  Question 6: POS_Tagging instead of NER Tagging

Changes were made to encoder and tagset code to utilize pos tags instead of ner tags. Also, new loaders were used in the training process.

In [23]:
# Build POS tagset and tag IDs
tags = [sentence['pos_tags'] for sentence in train_sentences]
tagmap = build_vocab_from_iterator(tags)
tagset = set([item for sublist in tags for item in sublist])

In [24]:
# Initialize the model with the correct number of POS labels
model5 = BertForTokenClassification.from_pretrained(bert_version, num_labels=len(tagset))
model5.to(device)
optimizer5 = optim.AdamW(params=model5.parameters(), lr=LR)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [25]:
def encode2(sentence):
    encodings = tokenizer(sentence['tokens'], truncation=True, padding='max_length', is_split_into_words=True)
    labels = align_label(encodings, sentence['pos_tags'])  # Change to POS tags
    return { 'input_ids': torch.LongTensor(encodings.input_ids), 'attention_mask': torch.LongTensor(encodings.attention_mask), 'labels': torch.LongTensor(labels) }

print('encoding data')
train_dataset2 = [encode2(sentence) for sentence in train_sentences]
valid_dataset2 = [encode2(sentence) for sentence in valid_sentences]
test_dataset2 = [encode2(sentence) for sentence in test_sentences]

# prepare batches of data
train_loader2 = torch.utils.data.DataLoader(train_dataset2, batch_size=BATCH_SIZE, shuffle=True)
valid_loader2 = torch.utils.data.DataLoader(valid_dataset2, batch_size=BATCH_SIZE)
test_loader2 = torch.utils.data.DataLoader(test_dataset2, batch_size=BATCH_SIZE)



encoding data


In [26]:
# train the model
print('training the model')
for epoch in tqdmn(range(EPOCHS)):
    model5.train()
    print('epoch',epoch+1)
    # iterate through each batch of the train data
    for i, batch in enumerate(tqdmn(train_loader2)):
        # move the batch tensors to the same device as the model
        batch = { k: v.to(device) for k, v in batch.items() }
        # send 'input_ids', 'attention_mask' and 'labels' to the model
        outputs = model5(**batch)
        loss = outputs[0]
        # set the gradients to zero
        optimizer5.zero_grad()
        # propagate the loss backwards
        loss.backward()
        # update the model weights
        optimizer5.step()
    # calculate performence on validation set
    Y_actual, Y_preds = EvaluateModel(model5,valid_loader2)
    print("\nValidation Accuracy : {:.3f}".format(accuracy_score(Y_actual, Y_preds)))
    print("\nValidation Macro-Accuracy : {:.3f}".format(balanced_accuracy_score(Y_actual, Y_preds)))

print('applying the model to the test set')
# apply the trained model to the test set
Y_actual, Y_preds = EvaluateModel(model5,test_loader2)

print("\nTest Accuracy : {:.3f}".format(accuracy_score(Y_actual, Y_preds)))
print("\nTest Macro-Accuracy : {:.3f}".format(balanced_accuracy_score(Y_actual, Y_preds)))
print("\nClassification Report : ")
print(classification_report(Y_actual, Y_preds,labels = tagmap(tagmap.get_itos()), target_names = tagmap.get_itos(), zero_division = 0))


training the model


  0%|          | 0/3 [00:00<?, ?it/s]

epoch 1


  0%|          | 0/1756 [00:00<?, ?it/s]

  0%|          | 0/407 [00:00<?, ?it/s]


Validation Accuracy : 0.935

Validation Macro-Accuracy : 0.747
epoch 2


  0%|          | 0/1756 [00:00<?, ?it/s]

  0%|          | 0/407 [00:00<?, ?it/s]


Validation Accuracy : 0.943

Validation Macro-Accuracy : 0.817
epoch 3


  0%|          | 0/1756 [00:00<?, ?it/s]

  0%|          | 0/407 [00:00<?, ?it/s]


Validation Accuracy : 0.946

Validation Macro-Accuracy : 0.823
applying the model to the test set


  0%|          | 0/432 [00:00<?, ?it/s]


Test Accuracy : 0.942

Test Macro-Accuracy : 0.861

Classification Report : 
              precision    recall  f1-score   support

         NNP       0.91      0.93      0.92      8595
          NN       0.89      0.90      0.89      4931
          CD       0.97      0.99      0.98      5962
          IN       0.99      0.99      0.99      4018
          DT       0.99      0.99      0.99      2799
          JJ       0.88      0.81      0.84      2393
         NNS       0.93      0.92      0.93      2174
         VBD       0.94      0.95      0.95      1699
           .       1.00      1.00      1.00      1630
           ,       1.00      1.00      1.00      1637
          VB       0.96      0.89      0.92       933
         VBN       0.87      0.87      0.87       866
          RB       0.90      0.85      0.87       888
          CC       1.00      0.99      1.00       765
          TO       1.00      1.00      1.00       818
         PRP       1.00      0.96      0.98       605
   

## Using POS tags instead Of NER tags results in producing different encodings to be used by the model. However, the test accuracy and test macro accuracy are significantly worse than the one of NER tags (around 1-5%)

detect_misclassifications function was changed to accept the new encodings (as seen here: encoded = encode2(sentence))

In [27]:
def detect_misclassifications2(model, long_sentences, tokenizer, device):
    misclassified_sentences = []
    for sentence in long_sentences:
        encoded = encode2(sentence)
        predictions, labels = evaluate_sentence(model, encoded, device)

        # Filter out the special tokens before comparison
        valid_indices = labels != -100  # assuming -100 used for padding/ignore
        if not torch.equal(predictions[valid_indices], labels[valid_indices]):
            # Decode tokens for readability
            tokens = [tokenizer.decode([tok]) for tok in encoded['input_ids'][valid_indices]]
            misclassified_sentences.append({
                'tokens': tokens,
                'predicted': predictions[valid_indices].tolist(),
                'true': labels[valid_indices].tolist()
            })
    return misclassified_sentences

Uses new model to detect misclassifications

In [28]:
#Find sentences with misclassifications
misclassified5 = detect_misclassifications2(model5, long_sentences, tokenizer, device)

count=0
# Display the misclassified sentences
for info in misclassified5:
    count+=1
    print("Misclassified Sentence:")
    print("Tokens:", info['tokens'])
    print("Predicted Tags:", info['predicted'])
    print("True Tags:", info['true'])
    print()
print("Misclassification count of (test set) sentences with more than 10 token: ", count)

Misclassified Sentence:
Tokens: ['soccer', '-', 'japan', 'get', 'lucky', 'win', ',', 'china', 'in', 'surprise', 'defeat', '.']
Predicted Tags: [1, 20, 0, 0, 0, 0, 9, 0, 3, 0, 0, 8]
True Tags: [1, 20, 0, 10, 0, 0, 9, 0, 3, 4, 1, 8]

Misclassified Sentence:
Tokens: ['japan', 'began', 'the', 'defence', 'of', 'their', 'asian', 'cup', 'title', 'with', 'a', 'lucky', '2', 'win', 'against', 'syria', 'in', 'a', 'group', 'c', 'championship', 'match', 'on', 'friday', '.']
Predicted Tags: [0, 7, 4, 1, 3, 23, 0, 0, 1, 3, 4, 5, 2, 24, 3, 0, 3, 4, 1, 2, 1, 1, 3, 0, 8]
True Tags: [0, 7, 4, 1, 3, 23, 5, 0, 1, 3, 4, 5, 2, 24, 3, 0, 3, 4, 0, 0, 1, 1, 3, 0, 8]

Misclassified Sentence:
Tokens: ['but', 'china', 'saw', 'their', 'luck', 'desert', 'them', 'in', 'the', 'second', 'match', 'of', 'the', 'group', ',', 'crashing', 'to', 'a', 'surprise', '2', 'defeat', 'to', 'newcomers', 'uzbekistan', '.']
Predicted Tags: [13, 0, 7, 23, 1, 10, 15, 3, 4, 5, 1, 3, 4, 1, 9, 18, 14, 4, 5, 2, 1, 14, 6, 0, 8]
True Tags: [1

This code remains largely unchanged; it selects a sentence with more than ten tokens that is significantly misclassified, based on a predefined threshold.

In [29]:
from tabulate import tabulate

# Function to find sentences with more than N misclassified tokens
def find_highly_misclassified_sentences(sentences, threshold):
    highly_misclassified = []
    for sentence in sentences:
        miscount = sum(1 for actual, predicted in zip(sentence['true'], sentence['predicted']) if actual != predicted)
        if miscount > threshold:
            highly_misclassified.append(sentence)
    return highly_misclassified

# Find sentences with more than 5 misclassified tokens
threshold = 4
sentences_with_many_errors5 = find_highly_misclassified_sentences(misclassified5, threshold)
print(sentences_with_many_errors5[0])
misclassified_sentence5 = sentences_with_many_errors5[0]

# Prepare data for tabulate
data5 = []
for token, actual, predicted in zip(misclassified_sentence5['tokens'], misclassified_sentence5['true'], misclassified_sentence5['predicted']):
    is_misclassified = actual != predicted
    data5.append({
        "Token": token,
        "Misclassification": is_misclassified,
        "Actual": actual,
        "Predicted": predicted
    })

# Using tabulate to print the table
print(tabulate(data5, headers="keys"))


{'tokens': ['league', 'duties', 'restricted', 'the', 'barbarians', "'", 'selector', 'options', 'but', 'they', 'still', 'bo', '13', 'internationals', 'including', 'england', 'full', 'tim', 'st', 'and', 'recalled', 'wing', 'tony', 'underwood', ',', 'plus', 'all', 'black', 'forwards', 'ian', 'jones', 'and', 'norm', 'hewitt', '.'], 'predicted': [1, 6, 7, 4, 26, 22, 1, 6, 13, 15, 12, 24, 2, 6, 18, 0, 1, 0, 0, 13, 7, 1, 0, 0, 9, 13, 4, 5, 6, 0, 0, 13, 0, 0, 8], 'true': [0, 6, 11, 4, 0, 22, 5, 6, 13, 15, 12, 24, 2, 6, 18, 0, 1, 0, 0, 13, 7, 1, 0, 0, 9, 13, 4, 0, 12, 0, 0, 13, 0, 0, 8]}
Token           Misclassification      Actual    Predicted
--------------  -------------------  --------  -----------
league          True                        0            1
duties          False                       6            6
restricted      True                       11            7
the             False                       4            4
barbarians      True                        0           26
'

Show misclassification of the example sentence

In [30]:
example_sentence5 = load_sentences(base_path + 'example.txt')
#print(example_sentence5)

# Find sentences with more than 10 tokens
long_sentence5 = find_long_sentences(example_sentence5)

# Display the results
for sentence in long_sentence5:
    print("Sentence with more than 10 tokens:")
    print(sentence['tokens'])
    print()  # Print a newline for better readability between sentences

print(len(long_sentence5))

#Find sentences with misclassifications
misclassified6 = detect_misclassifications2(model5, long_sentence5, tokenizer, device)

count=0
# Display the misclassified sentences
for info in misclassified6:
    count+=1
    print("Misclassified Sentence:")
    print("Tokens:", info['tokens'])
    print("Predicted Tags:", info['predicted'])
    print("True Tags:", info['true'])
    print()
print("Misclassification count of (test set) sentences with more than 10 token: ", count)

data6 = []
for token, predicted, true in zip(misclassified6[0]['tokens'], misclassified6[0]['predicted'], misclassified6[0]['true']):
    misclassified = 'Yes' if predicted != true else 'No'
    data6.append((token, predicted, true, misclassified))

# Preparing headers for the table
headers = ["Token", "Predicted Tag", "True Tag", "Misclassification"]

# Print in tabulated format using the 'grid' style
print(tabulate(data6, headers=headers))

Sentence with more than 10 tokens:
['The', 'rapid', 'spread', 'of', 'COVID-19', 'in', 'early', '2020', 'prompted', 'the', 'World', 'Health', 'Organization', 'to', 'declare', 'a', 'global', 'pandemic', ',', 'affecting', 'millions', 'of', 'people', 'worldwide', '.']

1
Misclassified Sentence:
Tokens: ['the', 'rapid', 'spread', 'of', 'co', 'in', 'early', '2020', 'prompted', 'the', 'world', 'health', 'organization', 'to', 'declare', 'a', 'global', 'pan', ',', 'affecting', 'millions', 'of', 'people', 'worldwide', '.']
Predicted Tags: [4, 5, 1, 3, 0, 3, 5, 2, 7, 4, 0, 0, 0, 14, 10, 4, 5, 1, 9, 18, 6, 3, 6, 12, 8]
True Tags: [4, 5, 1, 3, 0, 3, 5, 2, 7, 4, 0, 0, 0, 14, 10, 4, 5, 1, 9, 18, 6, 3, 6, 1, 8]

Misclassification count of (test set) sentences with more than 10 token:  1
Token           Predicted Tag    True Tag  Misclassification
------------  ---------------  ----------  -------------------
the                         4           4  No
rapid                       5           5  No
sp

This classification report pertains to the example sentence. The accuracy of the model has significantly improved when analyzing this random sentence. (5% test accuracy, 40% macro-accuracy)

In [31]:
from sklearn.metrics import accuracy_score, balanced_accuracy_score, classification_report

# Assuming tagmap.get_itos() returns list of tag names like ['O', 'B-LOC', 'I-PER', ...]
tag_names = tagmap.get_itos()  # List of tag names
tag_indices = list(range(len(tag_names)))  # Indices of the tags

# Get predictions and true labels from the first element in misclassified6
example_pred = misclassified6[0]['predicted']
example_true = misclassified6[0]['true']

# Print Accuracy and Macro-Accuracy
print("\nExample's Test Accuracy : {:.3f}".format(accuracy_score(example_true, example_pred)))
print("\nExample's Test Macro-Accuracy : {:.3f}".format(balanced_accuracy_score(example_true, example_pred)))

# Print Classification Report
print("\nClassification Report : ")
print(classification_report(example_true, example_pred, labels=tag_indices, target_names=tag_names, zero_division=0))


Example's Test Accuracy : 0.960

Example's Test Macro-Accuracy : 0.974

Classification Report : 
              precision    recall  f1-score   support

         NNP       1.00      1.00      1.00         4
          NN       1.00      0.67      0.80         3
          CD       1.00      1.00      1.00         1
          IN       1.00      1.00      1.00         3
          DT       1.00      1.00      1.00         3
          JJ       1.00      1.00      1.00         3
         NNS       1.00      1.00      1.00         2
         VBD       1.00      1.00      1.00         1
           .       1.00      1.00      1.00         1
           ,       1.00      1.00      1.00         1
          VB       1.00      1.00      1.00         1
         VBN       0.00      0.00      0.00         0
          RB       0.00      0.00      0.00         0
          CC       0.00      0.00      0.00         0
          TO       1.00      1.00      1.00         1
         PRP       0.00      0.00    



In [32]:
torch.save(model5, './drive/My Drive/CoNLL003/model/model5.pt')
model5lod = torch.load( './drive/My Drive/CoNLL003/model/model5.pt')

# Question 7: use Chunk tags instead of NER Tags

The code was changed similarly as before but we used chunk tags instead of POS.

In [33]:
# Build chunk tagset and tag IDs
tags = [sentence['chunk_tags'] for sentence in train_sentences]
tagmap = build_vocab_from_iterator(tags)
tagset = set([item for sublist in tags for item in sublist])

# Initialize the model with the correct number of chunk labels
model6 = BertForTokenClassification.from_pretrained(bert_version, num_labels=len(tagset))
model6.to(device)
optimizer6 = optim.AdamW(params=model6.parameters(), lr=LR)


def encode3(sentence):
    encodings = tokenizer(sentence['tokens'], truncation=True, padding='max_length', is_split_into_words=True)
    labels = align_label(encodings, sentence['chunk_tags'])  # Change to chunk tags
    return { 'input_ids': torch.LongTensor(encodings.input_ids), 'attention_mask': torch.LongTensor(encodings.attention_mask), 'labels': torch.LongTensor(labels) }

print('encoding data')
train_dataset3 = [encode3(sentence) for sentence in train_sentences]
valid_dataset3 = [encode3(sentence) for sentence in valid_sentences]
test_dataset3 = [encode3(sentence) for sentence in test_sentences]

# prepare batches of data
train_loader3 = torch.utils.data.DataLoader(train_dataset3, batch_size=BATCH_SIZE, shuffle=True)
valid_loader3 = torch.utils.data.DataLoader(valid_dataset3, batch_size=BATCH_SIZE)
test_loader3 = torch.utils.data.DataLoader(test_dataset3, batch_size=BATCH_SIZE)

# train the model
print('training the model')
for epoch in tqdmn(range(EPOCHS)):
    model6.train()
    print('epoch',epoch+1)
    # iterate through each batch of the train data
    for i, batch in enumerate(tqdmn(train_loader3)):
        # move the batch tensors to the same device as the model
        batch = { k: v.to(device) for k, v in batch.items() }
        # send 'input_ids', 'attention_mask' and 'labels' to the model
        outputs = model6(**batch)
        loss = outputs[0]
        # set the gradients to zero
        optimizer6.zero_grad()
        # propagate the loss backwards
        loss.backward()
        # update the model weights
        optimizer6.step()
    # calculate performence on validation set
    Y_actual, Y_preds = EvaluateModel(model6,valid_loader3)
    print("\nValidation Accuracy : {:.3f}".format(accuracy_score(Y_actual, Y_preds)))
    print("\nValidation Macro-Accuracy : {:.3f}".format(balanced_accuracy_score(Y_actual, Y_preds)))

print('applying the model to the test set')
# apply the trained model to the test set
Y_actual, Y_preds = EvaluateModel(model6,test_loader3)

print("\nTest Accuracy : {:.3f}".format(accuracy_score(Y_actual, Y_preds)))
print("\nTest Macro-Accuracy : {:.3f}".format(balanced_accuracy_score(Y_actual, Y_preds)))
print("\nClassification Report : ")
print(classification_report(Y_actual, Y_preds,labels = tagmap(tagmap.get_itos()), target_names = tagmap.get_itos(), zero_division = 0))


def detect_misclassifications3(model, long_sentences, tokenizer, device):
    misclassified_sentences = []
    for sentence in long_sentences:
        encoded = encode3(sentence)
        predictions, labels = evaluate_sentence(model, encoded, device)

        # Filter out the special tokens before comparison
        valid_indices = labels != -100  # assuming -100 used for padding/ignore
        if not torch.equal(predictions[valid_indices], labels[valid_indices]):
            # Decode tokens for readability
            tokens = [tokenizer.decode([tok]) for tok in encoded['input_ids'][valid_indices]]
            misclassified_sentences.append({
                'tokens': tokens,
                'predicted': predictions[valid_indices].tolist(),
                'true': labels[valid_indices].tolist()
            })
    return misclassified_sentences

from tabulate import tabulate


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


encoding data
training the model


  0%|          | 0/3 [00:00<?, ?it/s]

epoch 1


  0%|          | 0/1756 [00:00<?, ?it/s]

  0%|          | 0/407 [00:00<?, ?it/s]


Validation Accuracy : 0.949

Validation Macro-Accuracy : 0.497
epoch 2


  0%|          | 0/1756 [00:00<?, ?it/s]

  0%|          | 0/407 [00:00<?, ?it/s]


Validation Accuracy : 0.954

Validation Macro-Accuracy : 0.581
epoch 3


  0%|          | 0/1756 [00:00<?, ?it/s]

  0%|          | 0/407 [00:00<?, ?it/s]


Validation Accuracy : 0.957

Validation Macro-Accuracy : 0.625
applying the model to the test set


  0%|          | 0/432 [00:00<?, ?it/s]


Test Accuracy : 0.953

Test Macro-Accuracy : 0.602

Classification Report : 
              precision    recall  f1-score   support

        I-NP       0.95      0.97      0.96     16177
        B-NP       0.96      0.95      0.95     12985
           O       0.98      0.98      0.98      6210
        B-PP       0.96      0.98      0.97      3979
        B-VP       0.94      0.92      0.93      3767
        I-VP       0.93      0.93      0.93      1913
      B-ADVP       0.82      0.73      0.77       559
      B-SBAR       0.88      0.83      0.85       296
      B-ADJP       0.75      0.66      0.70       276
       B-PRT       0.73      0.70      0.72       110
      I-ADJP       0.60      0.60      0.60        55
      I-ADVP       0.69      0.33      0.45        33
        I-PP       0.90      0.60      0.72        15
      B-INTJ       0.00      0.00      0.00        13
     I-CONJP       0.50      0.29      0.36         7
       B-LST       1.00      0.21      0.34        29
   

NER model has significantly higher test accuracy (0.978 vs. 0.953) and macro-accuracy (0.910 vs. 0.602) compared to Chunk model.
The weighted average and micro-average scores in NER model are also considerably higher, indicating more consistent performance across all entity classes.

Entity Recognition:

Chunk model performs well on common classes (like I-NP and B-NP) but struggles with less frequent tags.
NER model demonstrates robust performance in recognizing and accurately classifying a wide range of entities, both frequent and infrequent.

Consistency Across Classes:

Chunk model's macro average metrics show it performs unevenly across different classes, probably due to overfitting on common classes or inappropriate training for rarer classes.
NER model maintains high precision and recall across all classes, indicating a more properly trained model.


Conclusion:
NER model is clearly superior to Chunk Model

## Detect Misclassifications (same code as before except for encoder and model)

In [34]:
#Find sentences with misclassifications
misclassified7 = detect_misclassifications3(model6, long_sentences, tokenizer, device)

count=0
# Display the misclassified sentences
for info in misclassified7:
    count+=1
    print("Misclassified Sentence:")
    print("Tokens:", info['tokens'])
    print("Predicted Tags:", info['predicted'])
    print("True Tags:", info['true'])
    print()
print("Misclassification count of (test set) sentences with more than 10 token: ", count)


Misclassified Sentence:
Tokens: ['soccer', '-', 'japan', 'get', 'lucky', 'win', ',', 'china', 'in', 'surprise', 'defeat', '.']
Predicted Tags: [1, 2, 1, 0, 0, 0, 2, 1, 3, 1, 0, 2]
True Tags: [1, 2, 1, 4, 1, 0, 2, 1, 3, 1, 0, 2]

Misclassified Sentence:
Tokens: ['japan', 'began', 'the', 'defence', 'of', 'their', 'asian', 'cup', 'title', 'with', 'a', 'lucky', '2', 'win', 'against', 'syria', 'in', 'a', 'group', 'c', 'championship', 'match', 'on', 'friday', '.']
Predicted Tags: [1, 4, 1, 0, 3, 1, 0, 0, 0, 3, 1, 0, 0, 0, 3, 1, 3, 1, 0, 0, 0, 0, 3, 1, 2]
True Tags: [1, 4, 1, 0, 3, 1, 0, 0, 0, 3, 1, 0, 0, 4, 3, 1, 3, 1, 0, 0, 0, 0, 3, 1, 2]

Misclassified Sentence:
Tokens: ['china', 'controlled', 'most', 'of', 'the', 'match', 'and', 'saw', 'several', 'chances', 'missed', 'until', 'the', '78', 'minute', 'when', 'u', 'striker', 'igor', 'sh', 'took', 'advantage', 'of', 'a', 'mis', 'defensive', 'header', 'to', 'lo', 'the', 'ball', 'over', 'the', 'advancing', 'chinese', 'keeper', 'and', 'into', 'a

In [35]:
# Function to find sentences with more than N misclassified tokens
def find_highly_misclassified_sentences(sentences, threshold):
    highly_misclassified = []
    for sentence in sentences:
        miscount = sum(1 for actual, predicted in zip(sentence['true'], sentence['predicted']) if actual != predicted)
        if miscount > threshold:
            highly_misclassified.append(sentence)
    return highly_misclassified

# Find sentences with more than 6 misclassified tokens
threshold = 4
sentences_with_many_errors7 = find_highly_misclassified_sentences(misclassified7, threshold)
print(sentences_with_many_errors7[0])
misclassified_sentence7 = sentences_with_many_errors7[0]

# Prepare data for tabulate
data7 = []
for token, actual, predicted in zip(misclassified_sentence7['tokens'], misclassified_sentence7['true'], misclassified_sentence7['predicted']):
    is_misclassified = actual != predicted
    data7.append({
        "Token": token,
        "Misclassification": is_misclassified,
        "Actual": actual,
        "Predicted": predicted
    })

# Using tabulate to print the table
print(tabulate(data7, headers="keys"))




{'tokens': ['the', 'syrian', 'scored', 'early', 'and', 'then', 'played', 'defensive', 'and', 'adopted', 'long', 'balls', 'which', 'made', 'it', 'hard', 'for', 'us', '.', "'"], 'predicted': [1, 0, 4, 6, 0, 4, 5, 1, 2, 4, 1, 0, 1, 4, 1, 8, 3, 1, 2, 2], 'true': [1, 0, 4, 1, 2, 4, 5, 6, 2, 4, 5, 5, 1, 4, 1, 8, 3, 1, 2, 2]}
Token      Misclassification      Actual    Predicted
---------  -------------------  --------  -----------
the        False                       1            1
syrian     False                       0            0
scored     False                       4            4
early      True                        1            6
and        True                        2            0
then       False                       4            4
played     False                       5            5
defensive  True                        6            1
and        False                       2            2
adopted    False                       4            4
long       True                

In [36]:
example_sentence7 = load_sentences(base_path + 'example.txt')
#print(example_sentence7)

# Find sentences with more than 10 tokens
long_sentence7 = find_long_sentences(example_sentence7)

# Display the results
for sentence in long_sentence7:
    print("Sentence with more than 10 tokens:")
    print(sentence['tokens'])
    print()  # Print a newline for better readability between sentences

print(len(long_sentence7))

#Find sentences with misclassifications
misclassified8 = detect_misclassifications3(model6, long_sentence7, tokenizer, device)

count=0
# Display the misclassified sentences
for info in misclassified8:
    count+=1
    print("Misclassified Sentence:")
    print("Tokens:", info['tokens'])
    print("Predicted Tags:", info['predicted'])
    print("True Tags:", info['true'])
    print()
print("Misclassification count of (test set) sentences with more than 10 token: ", count)

data8 = []
for token, predicted, true in zip(misclassified8[0]['tokens'], misclassified8[0]['predicted'], misclassified8[0]['true']):
    misclassified = 'Yes' if predicted != true else 'No'
    data8.append((token, predicted, true, misclassified))

# Preparing headers for the table
headers = ["Token", "Predicted Tag", "True Tag", "Misclassification"]

# Print in tabulated format using the 'grid' style
print(tabulate(data8, headers=headers))

Sentence with more than 10 tokens:
['The', 'rapid', 'spread', 'of', 'COVID-19', 'in', 'early', '2020', 'prompted', 'the', 'World', 'Health', 'Organization', 'to', 'declare', 'a', 'global', 'pandemic', ',', 'affecting', 'millions', 'of', 'people', 'worldwide', '.']

1
Misclassified Sentence:
Tokens: ['the', 'rapid', 'spread', 'of', 'co', 'in', 'early', '2020', 'prompted', 'the', 'world', 'health', 'organization', 'to', 'declare', 'a', 'global', 'pan', ',', 'affecting', 'millions', 'of', 'people', 'worldwide', '.']
Predicted Tags: [1, 0, 0, 3, 1, 3, 1, 0, 4, 1, 0, 0, 0, 4, 5, 1, 0, 0, 2, 4, 1, 3, 1, 6, 2]
True Tags: [1, 0, 0, 12, 1, 3, 1, 0, 4, 1, 0, 0, 0, 4, 5, 1, 0, 0, 2, 4, 1, 12, 0, 6, 2]

Misclassification count of (test set) sentences with more than 10 token:  1
Token           Predicted Tag    True Tag  Misclassification
------------  ---------------  ----------  -------------------
the                         1           1  No
rapid                       0           0  No
spread 

In [37]:
from sklearn.metrics import accuracy_score, balanced_accuracy_score, classification_report

# Assuming tagmap.get_itos() returns list of tag names like ['O', 'B-LOC', 'I-PER', ...]
tag_names = tagmap.get_itos()  # List of tag names
tag_indices = list(range(len(tag_names)))  # Indices of the tags

# Get predictions and true labels from the first element in misclassified8
example_pred = misclassified8[0]['predicted']
example_true = misclassified8[0]['true']

# Print Accuracy and Macro-Accuracy
print("\nExample's Test Accuracy : {:.3f}".format(accuracy_score(example_true, example_pred)))
print("\nExample's Test Macro-Accuracy : {:.3f}".format(balanced_accuracy_score(example_true, example_pred)))

# Print Classification Report
print("\nClassification Report : ")
print(classification_report(example_true, example_pred, labels=tag_indices, target_names=tag_names, zero_division=0))


Example's Test Accuracy : 0.880

Example's Test Macro-Accuracy : 0.861

Classification Report : 
              precision    recall  f1-score   support

        I-NP       1.00      0.89      0.94         9
        B-NP       0.86      1.00      0.92         6
           O       1.00      1.00      1.00         2
        B-PP       0.33      1.00      0.50         1
        B-VP       1.00      1.00      1.00         3
        I-VP       1.00      1.00      1.00         1
      B-ADVP       1.00      1.00      1.00         1
      B-SBAR       0.00      0.00      0.00         0
      B-ADJP       0.00      0.00      0.00         0
       B-PRT       0.00      0.00      0.00         0
      I-ADJP       0.00      0.00      0.00         0
      I-ADVP       0.00      0.00      0.00         0
        I-PP       0.00      0.00      0.00         2
      B-INTJ       0.00      0.00      0.00         0
     I-CONJP       0.00      0.00      0.00         0
       B-LST       0.00      0.00    

The accuracy of the model using example sentence as input is not as good as the one of POS tagging (around 10% worse) but it is still high. B-PP tag has very low precision and f1-score, showcasing the model's inefficiency regarding that tag.

In [38]:
torch.save(model6, './drive/My Drive/CoNLL003/model/model6.pt')
model6lod = torch.load( './drive/My Drive/CoNLL003/model/model6.pt')

## Question 8: roberta-base tokenizer

New model and tokenizer was used. The functions are the same as before but we had to adjust the tagset, the loaders and the model definition as you will see below.

In [39]:
#
# Named-entity recognition using RoBERTa
# Dataset: https://www.kaggle.com/datasets/alaakhaled/conll003-englishversion
#

# dependencies
import torch
import torch.optim as optim
from torchtext.vocab import build_vocab_from_iterator
from transformers import RobertaForTokenClassification, RobertaTokenizerFast
from sklearn.metrics import accuracy_score, balanced_accuracy_score, classification_report
import tqdm
tqdmn = tqdm.notebook.tqdm

# hyper-parameters
EPOCHS = 3
BATCH_SIZE = 8
LR = 1e-5

# the path of the data files
base_path = './drive/My Drive/CoNLL003/'

# use GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# read the data files
def load_sentences(filepath):
    sentences = []
    tokens = []
    pos_tags = []
    chunk_tags = []
    ner_tags = []

    with open(filepath, 'r') as f:
        for line in f.readlines():
            if (line == ('-DOCSTART- -X- -X- O\n') or line == '\n'):
                if len(tokens) > 0:
                    sentences.append({'tokens': tokens, 'pos_tags': pos_tags, 'chunk_tags': chunk_tags, 'ner_tags': ner_tags})
                    tokens = []
                    pos_tags = []
                    chunk_tags = []
                    ner_tags = []
            else:
                l = line.split(' ')
                tokens.append(l[0])
                pos_tags.append(l[1])
                chunk_tags.append(l[2])
                ner_tags.append(l[3].strip('\n'))
    return sentences

print('loading data')
train_sentences = load_sentences(base_path + 'train.txt')
test_sentences = load_sentences(base_path + 'test.txt')
valid_sentences = load_sentences(base_path + 'valid.txt')

# build tagset and tag ids
tags = [sentence['ner_tags'] for sentence in train_sentences]
tagmap = build_vocab_from_iterator(tags)
tagset = set([item for sublist in tags for item in sublist])
print('Tagset size:', len(tagset))

# load RoBERTa tokenizer
roberta_version = 'roberta-base'
tokenizer = RobertaTokenizerFast.from_pretrained(roberta_version, add_prefix_space=True)

# map tokens and tags to token ids and label ids
def align_label(tokens, labels):
    word_ids = tokens.word_ids()
    previous_word_idx = None
    label_ids = []
    for word_idx in word_ids:
        if word_idx is None:
            label_ids.append(-100)
        elif word_idx != previous_word_idx:
            try:
                label_ids.append(tagmap[labels[word_idx]])
            except:
                label_ids.append(-100)
        else:
            label_ids.append(-100)
        previous_word_idx = word_idx
    return label_ids

def encode(sentence):
    encodings = tokenizer(sentence['tokens'], truncation=True, padding='max_length', is_split_into_words=True)
    labels = align_label(encodings, sentence['ner_tags'])
    return {'input_ids': torch.LongTensor(encodings.input_ids), 'attention_mask': torch.LongTensor(encodings.attention_mask), 'labels': torch.LongTensor(labels)}

print('encoding data')
train_dataset = [encode(sentence) for sentence in train_sentences]
valid_dataset = [encode(sentence) for sentence in valid_sentences]
test_dataset = [encode(sentence) for sentence in test_sentences]

# initialize the model including a classification layer with num_labels classes
print('initializing the model')
model = RobertaForTokenClassification.from_pretrained(roberta_version, num_labels=len(tagset))
model.to(device)
optimizer = optim.AdamW(params=model.parameters(), lr=LR)

# prepare batches of data
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
valid_loader = torch.utils.data.DataLoader(valid_dataset, batch_size=BATCH_SIZE)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=BATCH_SIZE)

# evaluate the performance of the model
def EvaluateModel(model, data_loader):
    model.eval()
    with torch.no_grad():
        Y_actual, Y_preds = [], []
        for i, batch in enumerate(tqdmn(data_loader)):
            # move the batch tensors to the same device as the model
            batch = {k: v.to(device) for k, v in batch.items()}
            # send 'input_ids', 'attention_mask' and 'labels' to the model
            outputs = model(**batch)
            # iterate through the examples
            for idx, _ in enumerate(batch['labels']):
                # get the true values
                true_values_all = batch['labels'][idx]
                true_values = true_values_all[true_values_all != -100]
                # get the predicted values
                pred_values = torch.argmax(outputs[1], dim=2)[idx]
                pred_values = pred_values[true_values_all != -100]
                # update the lists of true answers and predictions
                Y_actual.append(true_values)
                Y_preds.append(pred_values)
        Y_actual = torch.cat(Y_actual)
        Y_preds = torch.cat(Y_preds)
    # Return list of actual labels, predicted labels
    return Y_actual.detach().cpu().numpy(), Y_preds.detach().cpu().numpy()

# train the model
print('training the model')
for epoch in tqdmn(range(EPOCHS)):
    model.train()
    print('epoch', epoch + 1)
    # iterate through each batch of the train data
    for i, batch in enumerate(tqdmn(train_loader)):
        # move the batch tensors to the same device as the model
        batch = {k: v.to(device) for k, v in batch.items()}
        # send 'input_ids', 'attention_mask' and 'labels' to the model
        outputs = model(**batch)
        loss = outputs[0]
        # set the gradients to zero
        optimizer.zero_grad()
        # propagate the loss backwards
        loss.backward()
        # update the model weights
        optimizer.step()
    # calculate performance on validation set
    Y_actual, Y_preds = EvaluateModel(model, valid_loader)
    print("\nValidation Accuracy : {:.3f}".format(accuracy_score(Y_actual, Y_preds)))
    print("\nValidation Macro-Accuracy : {:.3f}".format(balanced_accuracy_score(Y_actual, Y_preds)))

print('applying the model to the test set')
# apply the trained model to the test set
Y_actual, Y_preds = EvaluateModel(model, test_loader)

print("\nTest Accuracy : {:.3f}".format(accuracy_score(Y_actual, Y_preds)))
print("\nTest Macro-Accuracy : {:.3f}".format(balanced_accuracy_score(Y_actual, Y_preds)))
print("\nClassification Report : ")
print(classification_report(Y_actual, Y_preds, labels=range(len(tagmap.get_itos())), target_names=tagmap.get_itos(), zero_division=0))


loading data
Tagset size: 9


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]



config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

encoding data
initializing the model


model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


training the model


  0%|          | 0/3 [00:00<?, ?it/s]

epoch 1


  0%|          | 0/1756 [00:00<?, ?it/s]

  0%|          | 0/407 [00:00<?, ?it/s]


Validation Accuracy : 0.989

Validation Macro-Accuracy : 0.930
epoch 2


  0%|          | 0/1756 [00:00<?, ?it/s]

  0%|          | 0/407 [00:00<?, ?it/s]


Validation Accuracy : 0.992

Validation Macro-Accuracy : 0.949
epoch 3


  0%|          | 0/1756 [00:00<?, ?it/s]

  0%|          | 0/407 [00:00<?, ?it/s]


Validation Accuracy : 0.991

Validation Macro-Accuracy : 0.957
applying the model to the test set


  0%|          | 0/432 [00:00<?, ?it/s]


Test Accuracy : 0.982

Test Macro-Accuracy : 0.916

Classification Report : 
              precision    recall  f1-score   support

           O       1.00      0.99      1.00     38323
       B-LOC       0.96      0.93      0.94      1668
       B-PER       0.98      0.95      0.96      1617
       B-ORG       0.88      0.95      0.91      1661
       I-PER       0.98      0.98      0.98      1156
       I-ORG       0.86      0.93      0.89       835
      B-MISC       0.82      0.85      0.83       702
       I-LOC       0.86      0.89      0.88       257
      I-MISC       0.60      0.77      0.68       216

    accuracy                           0.98     46435
   macro avg       0.88      0.92      0.90     46435
weighted avg       0.98      0.98      0.98     46435



## Test Accuracy and Macro-Accuracy
RoBERTa:
Test Accuracy: 0.982
Test Macro-Accuracy: 0.916
BERT:
Test Accuracy: 0.978
Test Macro-Accuracy: 0.910
RoBERTa shows a slight improvement in both overall test accuracy and macro-accuracy, suggesting it handles the general classification and the balance across different classes slightly better than BERT.

Additionally, RoBERTa's higher macro-accuracy means it performs more consistently across various types of entities, not just the common ones but also the rarer types.

The predictions of B-LOC and B-ORG tags are more accurate for reborta model as we see in the classification reports.

The trend remains the same regarding less frequent tags such as B-MISC but there are some occasions where precision scores of original BERT model are higher tahn roberta, such as I-MISC Ones(0.62 and 0.60 respectively)

Since roberta is more extensively trained model, the results seem logical.

Review

All the questions were answered but the execution times were very high due to the demand for GPU. (L4 gpu was used for this assignment mainly, also T4 for some tests.) Also BERT models required high RAM usage. The best model of this assignement was roberta-base.

## Question 9 - chat gpt 3.5 prompts (answering Question 2)

Zero shot prompting

```

Prompt:
Suppose that we have a sentence with Tokens: ['He', 'guided', 'Ireland', 'to', 'two', 'successive', 'World', 'Cup', 'finals', 'tournaments', 'and', 'to', 'the', '1988', 'European', 'championship', 'finals', 'in', 'Germany', ',', 'after', 'the', 'Irish', 'beat', 'a', 'well-fancied', 'England', 'team', '1-0', 'in', 'their', 'group', 'qualifier', '.']. Label these tokens with POS tags, NER Tags and Chunk tags according to CoNLL 2003 dataset format. The result should be in this format for each row : token POS Tag Chunk Tag NER Tag

Chat-gpt 3.5 result:

Prompt: Aligned

Chat-gpt 3.5 result:

Token       POS Tag   Chunk Tag   NER Tag
-----------------------------------------
He          PRP       B-NP        O
guided      VBD       B-VP        O
Ireland     NNP       B-NP        B-LOC
to          TO        B-PP        O
two         CD        B-NP        O
successive  JJ        I-NP        O
World       NNP       B-NP        B-ORG
Cup         NNP       I-NP        I-ORG
finals      NNS       I-NP        O
tournaments NNS       I-NP        O
and         CC        O           O
to          TO        B-PP        O
the         DT        B-NP        O
1988        CD        I-NP        B-MISC
European    NNP       I-NP        I-MISC
championshipNN        I-NP        I-MISC
finals      NNS       I-NP        I-MISC
in          IN        B-PP        O
Germany     NNP       B-NP        B-LOC
,           ,         O           O
after       IN        B-PP        O
the         DT        B-NP        O
Irish       JJ        I-NP        B-MISC
beat        VBD       B-VP        O
a           DT        B-NP        O
well-fanciedJJ        I-NP        O
England     NNP       I-NP        B-LOC
team        NN        I-NP        O
1-0         CD        B-NP        O
in          IN        B-PP        O
their       PRP$      B-NP        O
group       NN        I-NP        O
qualifier   NN        I-NP        O
.           .         O           O



Prompt:

if the text below represents the correct tags, compare tags for each token( comparison should be in one row, markdown  code needed to copy results) He PRP B-NP O
guided VBD B-VP O
Ireland NNP B-NP B-LOC
to TO B-VP O
two CD B-NP O
successive JJ I-NP O
World NNP I-NP B-MISC
Cup NNP I-NP I-MISC
finals NNS I-NP O
tournaments NNS I-NP O
and CC O O
to TO B-PP O
the DT B-NP O
1988 CD I-NP O
European JJ I-NP B-MISC
championship NN I-NP O
finals NNS I-NP O
in IN B-PP O
Germany NNP B-NP B-LOC
, , O O
after IN B-SBAR O
the DT B-NP O
Irish JJ I-NP B-MISC
beat NN I-NP O
a DT B-NP O
well-fancied JJ I-NP O
England NNP I-NP B-LOC
team NN I-NP O
1-0 CD I-NP O
in IN B-PP O
their PRP$ B-NP O
group NN I-NP O
qualifier NN I-NP O
. . O O


chat gpt 3.5 result:


| Token      | Provided POS Tag | Correct POS Tag | Provided Chunk Tag | Correct Chunk Tag | Provided NER Tag | Correct NER Tag |
|------------|------------------|-----------------|--------------------|-------------------|------------------|-----------------|
| He         | PRP              | PRP             | B-NP               | B-NP              | O                | O               |
| guided     | VBD              | VBD             | B-VP               | B-VP              | O                | O               |
| Ireland    | NNP              | NNP             | B-NP               | B-NP              | B-LOC            | B-LOC           |
| to         | TO               | TO              | B-VP               | B-PP              | O                | O               |
| two        | CD               | CD              | B-NP               | B-NP              | O                | O               |
| successive | JJ               | JJ              | I-NP               | I-NP              | O                | O               |
| World      | NNP              | NNP             | I-NP               | B-NP              | B-MISC           | B-ORG           |
| Cup        | NNP              | NNP             | I-NP               | I-NP              | I-MISC           | I-ORG           |
| finals     | NNS              | NNS             | I-NP               | I-NP              | O                | O               |
| tournaments| NNS              | NNS             | I-NP               | I-NP              | O                | O               |
| and        | CC               | CC              | O                  | O                 | O                | O               |
| to         | TO               | TO              | B-PP               | B-PP              | O                | O               |
| the        | DT               | DT              | B-NP               | B-NP              | O                | O               |
| 1988       | CD               | CD              | I-NP               | I-NP              | B-MISC           | B-MISC          |
| European   | NNP              | NNP             | I-NP               | I-NP              | I-MISC           | I-MISC          |
| championship| NN              | NN              | I-NP               | I-NP              | I-MISC           | I-MISC          |
| finals     | NNS              | NNS             | I-NP               | I-NP              | I-MISC           | I-MISC          |
| in         | IN               | IN              | B-PP               | B-PP              | O                | O               |
| Germany    | NNP              | NNP             | B-NP               | B-NP              | B-LOC            | B-LOC           |
| ,          | ,                | ,               | O                  | O                 | O                | O               |
| after      | IN               | IN              | B-SBAR             | B-PP              | O                | O               |
| the        | DT               | DT              | B-NP               | B-NP              | O                | O               |
| Irish      | JJ               | JJ              | I-NP               | I-NP              | B-MISC           | B-MISC          |
| beat       | NN               | VBD             | I-NP               | B-VP              | O                | O               |
| a          | DT               | DT              | B-NP               | B-NP              | O                | O               |
| well-fancied| JJ              | JJ              | I-NP               | I-NP              | O                | O               |
| England    | NNP              | NNP             | I-NP               | I-NP              | B-LOC            | B-LOC           |
| team       | NN               | NN              | I-NP               | I-NP              | O                | O               |
| 1-0        | CD               | CD              | I-NP               | B-NP              | O                | O               |
| in         | IN               | IN              | B-PP               | B-PP              | O                | O               |
| their      | PRP$             | PRP$            | B-NP               | B-NP              | O                | O               |
| group      | NN               | NN              | I-NP               | I-NP              | O                | O               |
| qualifier  | NN               | NN              | I-NP               | I-NP              | O                | O               |
| .          | .                | .               | O                  | O                 | O                | O               |



chat gpt-prompt:

Need missclassfications for each column(1st POS, 2nd Chuck, 3rd NER)

Result:

Here are the misclassifications for each column:

POS Tags:

'beat' is incorrectly labeled as NN (should be VBD)
'1-0' is incorrectly labeled as CD (should be NN)
Chunk Tags:

'after' is incorrectly labeled as B-SBAR (should be B-PP)
NER Tags:

'World' is incorrectly labeled as B-MISC (should be B-ORG)
'Cup' is incorrectly labeled as I-MISC (should be I-ORG)
'Irish' is incorrectly labeled as B-MISC (should be B-LOC)
Let me know if you need further clarification!



```

## Question 9 - Gemini 1.0 prompts (answering Question 2)

Zero shot prompting

```
Same prompts as before for Gemini-1.0

Token	POS Tag	Chunk Tag	NER Tag
He	PRP	B-NP	O
guided	VBD	B-VP	O
Ireland	NNP	B-NP	B-LOC
to	TO	B-PP	O
two	CD	B-NP	O
successive	JJ	I-NP	O
World	NNP	B-NP	O
Cup	NNP	I-NP	O
finals	NNPS	I-NP	O
tournaments	NNPS	I-NP	O
and	CC	O	O
to	TO	B-PP	O
the	DT	B-NP	O
1988	CD	B-NP	O
European	JJ	I-NP	O
championship	NN	I-NP	O
finals	NNPS	I-NP	O
in	IN	B-PP	O
Germany	NNP	B-NP	B-LOC
,	,	O	O
after	IN	B-PP	O
the	DT	B-NP	O
Irish	JJ	I-NP	B-MISC
beat	VBD	B-VP	O
a	DT	B-NP	O
well-fancied	JJ	I-NP	O
England	NNP	B-NP	B-LOC
team	NN	I-NP	O
1-0	CD	B-NP	O
in	IN	B-PP	O
their	PRP$	B-NP	O
group	NN	I-NP	O
qualifier	NN	I-NP	O
.	.	O	O



```
```
Token,Your POS,Your Chunk,Your NER,Corrected POS,Corrected Chunk,Corrected NER
He,PRP,B-NP,O,PRP,B-NP,O
guided,VBD,B-VP,O,VBD,B-VP,O
Ireland,NNP,B-NP,B-LOC,NNP,B-NP,B-LOC
to,TO,B-VP,O,TO,B-PP,O
two,CD,B-NP,O,CD,B-NP,O
successive,JJ,I-NP,O,JJ,I-NP,O
World,NNP,I-NP,B-MISC,NNP,B-NP,O
Cup,NNP,I-NP,I-MISC,NNP,I-NP,O
finals,NNS,I-NP,O,NNPS,I-NP,O
tournaments,NNS,I-NP,O,NNPS,I-NP,O
and,CC,O,O,CC,O,O
to,TO,B-PP,O,TO,B-PP,O
the,DT,B-NP,O,DT,B-NP,O
1988,CD,I-NP,O,CD,B-NP,O
European,JJ,I-NP,B-MISC,JJ,I-NP,B-MISC
championship,NN,I-NP,O,NN,I-NP,O
finals,NNS,I-NP,O,NNPS,I-NP,O
in,IN,B-PP,O,IN,B-PP,O
Germany,NNP,B-NP,B-LOC,NNP,B-NP,B-LOC
",",",",O,O,",",O,O
after,IN,B-SBAR,O,IN,B-PP,O
the,DT,B-NP,O,DT,B-NP,O
Irish,JJ,I-NP,B-MISC,JJ,I-NP,B-DEMONYM
beat,NN,I-NP,O,VBD,B-VP,O
a,DT,B-NP,O,DT,B-NP,O
well-fancied,JJ,I-NP,O,JJ,I-NP,O
England,NNP,I-NP,B-LOC,NNP,B-NP,B-LOC
team,NN,I-NP,O,NN,I-NP,O
1-0,CD,I-NP,O,CD,B-NP,O
in,IN,B-PP,O,IN,B-PP,O
their,PRP$,B-NP,O,PRP$,B-NP,O
group,NN,I-NP,O,NN,I-NP,O
qualifier,NN,I-NP,O,NN,I-NP,O
.,.,O,O,.,O,O


Gemini prompt: Need missclassfications for each column(1st POS, 2nd Chuck, 3rd NER)

Result:

Column	Your Tag	Corrected Tag	Explanation
POS	beat (NN)	VBD	Verb (past tense) vs Noun
Chunk	World (I-NP), Cup (I-NP)	World (B-NP), Cup (B-NP)	Separate NPs forming a named entity
Chunk	European (I-NP)	European (I-NP)	Correct Chunk tag, but missed NER classification
NER	World (B-MISC), Cup (I-MISC), European (B-MISC)	World (O), Cup (O), European (B-MISC)	Named entity "World Cup", European part of "European Championship" (not MISC)
NER	Irish (B-MISC)	Irish (B-DEMONYM)	Nationality vs Miscellaneous entity
Chunk	after (B-SBAR)	after (B-PP)	Preposition introducing PP vs Beginning of subordinate clause
```


## Question 10 - chat gpt prompts (answering Question 2)

Few shot prompting. We basically provide the correct example at the start. Many examples can be provided, particualarly targetting misclassifications to further eliminate misclassifications.

```

Shot

He (PRP B-NP O)
guided (VBD B-VP O)
Ireland (NNP B-NP B-LOC)
to (TO B-VP O)
two (CD B-NP O)
successive (JJ I-NP O)
World (NNP I-NP B-MISC)
Cup (NNP I-NP I-MISC)
finals (NNS I-NP O)
tournaments (NNS I-NP O)
and (CC O O)
to (TO B-PP O)
the (DT B-NP O)
1988 (CD I-NP O)
European (JJ I-NP B-MISC)
championship (NN I-NP O)
finals (NNS I-NP O)
in (IN B-PP O)
Germany (NNP B-NP B-LOC)
, (, O O)
after (IN B-SBAR O)
the (DT B-NP O)
Irish (JJ I-NP B-MISC)
beat (NN I-NP O)
a (DT B-NP O)
well-fancied (JJ I-NP O)
England (NNP I-NP B-LOC)
team (NN I-NP O)
1-0 (CD I-NP O)
in (IN B-PP O)
their (PRP$ B-NP O)
group (NN I-NP O)
qualifier (NN I-NP O)
. (. O O)


```


```
Shot 2

label this sentence with pos, chuck and ner tags ['He', 'guided', 'Ireland', 'to', 'two', 'successive', 'World', 'Cup', 'finals', 'tournaments', 'and', 'to', 'the', '1988', 'European', 'championship', 'finals', 'in', 'Germany', ',', 'after', 'the', 'Irish', 'beat', 'a', 'well-fancied', 'England', 'team', '1-0', 'in', 'their', 'group', 'qualifier', '.']

i need the output in this format 1995 CD I-NP O


He PRP B-NP O
guided VBD B-VP O
Ireland NNP B-NP B-LOC
to TO B-VP O
two CD B-NP O
successive JJ I-NP O
World NNP I-NP B-MISC
Cup NNP I-NP I-MISC
finals NNS I-NP O
tournaments NNS I-NP O
and CC O O
to TO B-PP O
the DT B-NP O
1988 CD I-NP O
European JJ I-NP B-MISC
championship NN I-NP O
finals NNS I-NP O
in IN B-PP O
Germany NNP B-NP B-LOC
, , O O
after IN B-PP O
the DT B-NP O
Irish JJ I-NP B-MISC
beat NN I-NP O
a DT B-NP O
well-fancied JJ I-NP O
England NNP I-NP B-LOC
team NN I-NP O
1-0 CD I-NP O
in IN B-PP O
their PRP$ B-NP O
group NN I-NP O
qualifier NN I-NP O
. . O O



Need missclassfications for each column(1st POS, 2nd Chuck, 3rd NER)


result:


Here are the misclassifications for each column:

POS misclassifications:

"1988" is tagged as CD (Cardinal Number) instead of a JJ (Adjective).
"1-0" is tagged as CD (Cardinal Number) instead of a NN (Noun).
Chunk misclassifications:

None observed.
NER misclassifications:

"1988" is tagged as I-NP (Inside Named Entity Phrase) instead of O (Outside any Named Entity Phrase).
"1-0" is tagged as I-NP (Inside Named Entity Phrase) instead of O (Outside any Named Entity Phrase).
These misclassifications are based on the traditional understanding of the English language grammar and named entity recognition rules.

```

Result: Two less misclassifications.

## Gemini 1.0

```

Shot(Correctly annotated example)

He (PRP B-NP O)
guided (VBD B-VP O)
Ireland (NNP B-NP B-LOC)
to (TO B-VP O)
two (CD B-NP O)
successive (JJ I-NP O)
World (NNP I-NP B-MISC)
Cup (NNP I-NP I-MISC)
finals (NNS I-NP O)
tournaments (NNS I-NP O)
and (CC O O)
to (TO B-PP O)
the (DT B-NP O)
1988 (CD I-NP O)
European (JJ I-NP B-MISC)
championship (NN I-NP O)
finals (NNS I-NP O)
in (IN B-PP O)
Germany (NNP B-NP B-LOC)
, (, O O)
after (IN B-SBAR O)
the (DT B-NP O)
Irish (JJ I-NP B-MISC)
beat (NN I-NP O)
a (DT B-NP O)
well-fancied (JJ I-NP O)
England (NNP I-NP B-LOC)
team (NN I-NP O)
1-0 (CD I-NP O)
in (IN B-PP O)
their (PRP$ B-NP O)
group (NN I-NP O)
qualifier (NN I-NP O)
. (. O O)
expand_more

```

```
prompt: label this sentence with pos, chuck and ner tags ['He', 'guided', 'Ireland', 'to', 'two', 'successive', 'World', 'Cup', 'finals', 'tournaments', 'and', 'to', 'the', '1988', 'European', 'championship', 'finals', 'in', 'Germany', ',', 'after', 'the', 'Irish', 'beat', 'a', 'well-fancied', 'England', 'team', '1-0', 'in', 'their', 'group', 'qualifier', '.']

Output:


i need the output in this format 1995 CD I-NP O, Irish NNP I-NP B-ORG

output:
```


```
Token	POS	Chunk	NER
He	PRP	B-NP	O
guided	VBD	B-VP	O
Ireland	NNP	B-LOC	B-LOC
to	TO	B-PP	O
two	CD	B-NP	O
successive	JJ	I-NP	O
World	NNP	B-MISC	B-MISC
Cup	NNP	I-MISC	B-MISC
finals	NNS	I-NP	O
tournaments	NNS	I-NP	O
and	CC	O	O
to	TO	B-PP	O
the	DT	B-NP	O
1988	CD	I-NP	O
European	JJ	B-MISC	B-MISC
championship	NN	I-MISC	B-MISC
finals	NNS	I-NP	O
in	IN	B-PP	O
Germany	NNP	B-LOC	B-LOC
,	,	O	O
after	IN	B-SBAR	O
the	DT	B-NP	B-MISC
Irish	JJ	I-NP	B-MISC
beat	NN	I-NP	O
a	DT	B-NP	O
well-fancied	JJ	I-NP	O
England	NNP	I-NP	B-LOC
team	NN	I-NP	O
1-0	CD	I-NP	O
in	IN	B-PP	O
their	PRP$	B-NP	O
group	NN	I-NP	O
qualifier	NN	I-NP	O
.	.	O	O
```

```

prompt: if the text below represents the correct tags, compare tags for each token( comparison should be in one row, markdown  code needed to copy results) He PRP B-NP O
guided VBD B-VP O
Ireland NNP B-NP B-LOC
to TO B-VP O
two CD B-NP O
successive JJ I-NP O
World NNP I-NP B-MISC
Cup NNP I-NP I-MISC
finals NNS I-NP O
tournaments NNS I-NP O
and CC O O
to TO B-PP O
the DT B-NP O
1988 CD I-NP O
European JJ I-NP B-MISC
championship NN I-NP O
finals NNS I-NP O
in IN B-PP O
Germany NNP B-NP B-LOC
, , O O
after IN B-SBAR O
the DT B-NP O
Irish JJ I-NP B-MISC
beat NN I-NP O
a DT B-NP O
well-fancied JJ I-NP O
England NNP I-NP B-LOC
team NN I-NP O
1-0 CD I-NP O
in IN B-PP O
their PRP$ B-NP O
group NN I-NP O
qualifier NN I-NP O
. . O O

output:

Token	POS	Chunking	NER
He	PRP	B-NP	O
guided	VBD	B-VP	O
Ireland	NNP	B-NP	B-LOC
to	TO	B-VP	O
two	CD	B-NP	O
successive	JJ	I-NP	O
World	NNP	I-NP	B-MISC
Cup	NNP	I-NP	I-MISC
finals	NNS	I-NP	O
tournaments	NNS	I-NP	O
and	CC	O	O
to	TO	B-PP	O
the	DT	B-NP	O
1988	CD	I-NP	O
European	JJ	I-NP	B-MISC
championship	NN	I-NP	I-MISC
finals	NNS	I-NP	O
in	IN	B-PP	O
Germany	NNP	B-NP	B-LOC
,	,	O	O
after	IN	B-SBAR	O
the	DT	B-NP	O
Irish	JJ	I-NP	B-MISC
beat	NN	I-NP	O (Potential missclassification)
a	DT	B-NP	O
well-fancied	JJ	I-NP	O
England	NNP	I-NP	B-LOC
team	NN	I-NP	O
1-0	CD	I-NP	O
in	IN	B-PP	O
their	PRP$	B-NP	O
group	NN	I-NP	O
qualifier	NN	I-NP	O
.	.	O	O
```

```
Gemini prompt: Need missclassfications for each column(1st POS, 2nd Chuck, 3rd NER)

Result:
Column	Current Tag	Possible Improvement	Reason
POS	-	No issues found	-
Chunking	-	No issues found	-
NER	Ireland (B-LOC)	Ireland (B-ORG)	Refers to national team in sports context

```

Result: Five less misclassifications, only one totally