In [91]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizerFast, BertForTokenClassification
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from seqeval.metrics import f1_score, accuracy_score
from seqeval.metrics import classification_report as seqeval_classification_report    
from collections import defaultdict


MAX_LEN = 174
BATCH_SIZE = 64
EPOCHS = 7
MAX_GRAD_NORM = 5
MODEL_NAME = 'bert-base-uncased'
from torch import cuda


# Data Reading and Preprocessing Functions


# Check if CUDA is available
if torch.cuda.is_available():
    # Get the number of GPUs
    num_gpus = torch.cuda.device_count()
    print(f"Number of GPUs available: {num_gpus}")
    
    # Get the name and other details of each GPU
    for i in range(num_gpus):
        print(f"GPU {i}: {torch.cuda.get_device_name(i)}")
        print(f"  Memory Allocated: {torch.cuda.memory_allocated(i) / 1024**3:.2f} GB")
        print(f"  Memory Cached: {torch.cuda.memory_reserved(i) / 1024**3:.2f} GB")
else:
    print("CUDA is not available. Running on CPU.")

class dataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_len):
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __getitem__(self, index):
        sentence = self.data.sentence[index].strip().split()
        word_labels = self.data.word_labels[index].split(",")

        encoding = self.tokenizer(sentence,
                                  is_split_into_words=True,
                                  return_offsets_mapping=True,
                                  padding='max_length',
                                  truncation=True,
                                  max_length=self.max_len)

        labels = [labels_to_ids[label] for label in word_labels]

        encoded_labels = np.ones(len(encoding["offset_mapping"]), dtype=int) * -100
        i = 0
        for idx, mapping in enumerate(encoding["offset_mapping"]):
            if mapping[0] == 0 and mapping[1] != 0:
                encoded_labels[idx] = labels[i]
                i += 1

        item = {key: torch.as_tensor(val) for key, val in encoding.items()}
        item['labels'] = torch.as_tensor(encoded_labels)

        return item

    def __len__(self):
        return self.len

def read_data(file_path):
    sentences, labels = [], []
    sentence, label = [], []
    with open(file_path, encoding="utf-8") as file:
        for line in file:
            if line.startswith("#"):
                continue
            elif line == "\n":
                if sentence:
                    sentences.append(sentence)
                    labels.append(label)
                    sentence, label = [], []
            else:
                parts = line.strip().split("\t")
                sentence.append(parts[1].lower())  # Convert the token to lowercase before appending
                label.append(clean_tag(parts[2]))
    if sentence:
        sentences.append(sentence)
        labels.append(label)
    return sentences, labels

def clean_tag(tag):
    if tag.count('-') > 1:
        prefix, entity = tag.split('-', 1)
        tag = f"{prefix}-{entity.replace('-', '')}"
    return tag

def train_model(training_set, model, optimizer):
    model.train()
    tr_loss = 0
    nb_tr_steps = 0

    training_loader = DataLoader(training_set, batch_size=BATCH_SIZE, shuffle=True)
    for idx, batch in enumerate(training_loader):
        ids = batch['input_ids'].to(device, dtype=torch.long)
        mask = batch['attention_mask'].to(device, dtype=torch.long)
        labels = batch['labels'].to(device, dtype=torch.long)

        outputs = model(input_ids=ids, attention_mask=mask, labels=labels)
        loss = outputs.loss
        tr_logits = outputs.logits

        tr_loss += loss.item()
        nb_tr_steps += 1

        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=MAX_GRAD_NORM)
        optimizer.step()

    epoch_loss = tr_loss / nb_tr_steps
    return epoch_loss

train_tokens, train_tags = read_data("./tagged_sentences_train.iob2")
test_tokens, test_tags = read_data("./tagged_sentences_test.iob2")
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

data = {'sentence': [" ".join(sentence) for sentence in train_tokens],
        'word_labels': [",".join(tags) for tags in train_tags]}

df = pd.DataFrame(data)

data_test = {'sentence': [" ".join(sentence) for sentence in test_tokens],
             'word_labels': [",".join(tags) for tags in test_tags]}

df_test = pd.DataFrame(data_test)

# Initialize a dictionary to hold the counts
tag_counts = defaultdict(int)

# Iterate through each list in test_tags and count the occurrences of each tag
for sentence in test_tags:
    for tag in sentence:
        tag_counts[tag] += 1

# Convert the defaultdict to a regular dictionary for easier printing
tag_counts = dict(tag_counts)

# Print the counts for each tag
for tag, count in tag_counts.items():
    print(f"{tag}: {count}")

# Create mappings
all_tags = [tag for tags in df['word_labels'] for tag in tags.split(",")]
unique_tags = set(all_tags)
labels_to_ids = {k: v for v, k in enumerate(unique_tags)}
ids_to_labels = {v: k for k, v in labels_to_ids.items()}

# Display the mappings
print("labels_to_ids:", labels_to_ids)
print("ids_to_labels:", ids_to_labels)

# Create training and testing datasets
training_set = dataset(df, tokenizer, MAX_LEN)
testing_set = dataset(df_test, tokenizer, MAX_LEN)

test_params = {'batch_size': BATCH_SIZE, 'shuffle': False, 'num_workers': 0}
testing_loader = DataLoader(testing_set, **test_params)

# Function to count tag occurrences
def count_tags(tags_list):
    tag_counts = defaultdict(int)
    for sentence in tags_list:
        for tag in sentence:
            tag_counts[tag] += 1
    return tag_counts

# Count initial tag occurrences in test_tags
initial_tag_counts = count_tags(test_tags)
print("Initial tag counts in test_tags:", dict(initial_tag_counts))

def valid(model, testing_loader):
    # put model in evaluation mode
    model.eval()

    eval_loss, eval_accuracy = 0, 0
    nb_eval_examples, nb_eval_steps = 0, 0
    eval_preds, eval_labels = [], []

    with torch.no_grad():
        for idx, batch in enumerate(testing_loader):
            ids = batch['input_ids'].to(device, dtype=torch.long)
            mask = batch['attention_mask'].to(device, dtype=torch.long)
            labels = batch['labels'].to(device, dtype=torch.long)

            # Forward pass
            outputs = model(input_ids=ids, attention_mask=mask, labels=labels)
            loss = outputs.loss
            eval_logits = outputs.logits

            eval_loss += loss.item()
            nb_eval_steps += 1
            nb_eval_examples += labels.size(0)

            if idx % 100 == 0:
                loss_step = eval_loss / nb_eval_steps
                print(f"Validation loss per 100 evaluation steps: {loss_step}")

            # Compute evaluation accuracy
            active_logits = eval_logits.view(-1, model.config.num_labels)  # shape (batch_size * seq_len, num_labels)
            flattened_predictions = torch.argmax(active_logits, axis=1)  # shape (batch_size * seq_len,)

            for i in range(labels.size(0)):
                label = labels[i]
                pred = flattened_predictions.view(labels.size(0), labels.size(1))[i]

                active_accuracy = label != -100  # shape (seq_len,)
                label = torch.masked_select(label, active_accuracy)
                pred = torch.masked_select(pred, active_accuracy)

                eval_labels.append([ids_to_labels[id.item()] for id in label])
                eval_preds.append([ids_to_labels[id.item()] for id in pred])

                tmp_eval_accuracy = accuracy_score(label.cpu().numpy(), pred.cpu().numpy())
                eval_accuracy += tmp_eval_accuracy

    eval_loss = eval_loss / nb_eval_steps
    eval_accuracy = accuracy_score(eval_labels, eval_preds)
    F1_score = f1_score(eval_labels, eval_preds)
    print(f"Validation Loss: {eval_loss}")
    print(f"Validation Accuracy: {eval_accuracy}")
    print(f"F1 Score: {F1_score}")
    report = seqeval_classification_report(eval_labels, eval_preds, output_dict=True)
    print(report)
    
    return eval_loss, eval_accuracy, F1_score, report


# Train and evaluate the model on the entire dataset
model = BertForTokenClassification.from_pretrained(MODEL_NAME, num_labels=len(labels_to_ids))
model.to(device)
optimizer = torch.optim.Adam(params=model.parameters(), lr=3e-5)

# Training the model
for epoch in range(EPOCHS):
    train_loss = train_model(training_set, model, optimizer)
    print(f"Epoch {epoch + 1}/{EPOCHS}, Train Loss: {train_loss}")

# Evaluating the model
eval_loss, eval_accuracy, f1_score, eval_report = valid(model, testing_loader)
print(f"Eval Loss: {eval_loss}, Eval Accuracy: {eval_accuracy}")
print(eval_report)


# Display the evaluation metrics in a DataFrame
metrics = {
    "eval_loss": eval_loss,
    "accuracy": eval_accuracy,
    "f1_score": f1_score,
    "report": eval_report
}
metrics_df = pd.DataFrame([metrics])
print(metrics_df)

# Flatten the classification report for easier viewing
flat_reports = []
for label, scores in eval_report.items():
    flat_reports.append({
        "label": label,
        "precision": scores["precision"],
        "recall": scores["recall"],
        "f1-score": scores["f1-score"],
        "support": scores["support"]
    })

reports_df = pd.DataFrame(flat_reports)
print(reports_df)


Number of GPUs available: 1
GPU 0: NVIDIA A100 80GB PCIe
  Memory Allocated: 1.67 GB
  Memory Cached: 11.97 GB
O: 25000
B-CHAR: 820
I-CHAR: 85
B-LOC: 216
B-ORG: 2
I-LOC: 2
labels_to_ids: {'B-CHAR': 0, 'I-CHAR': 1, 'O': 2, 'B-ORG': 3, 'B-LOC': 4, 'I-LOC': 5}
ids_to_labels: {0: 'B-CHAR', 1: 'I-CHAR', 2: 'O', 3: 'B-ORG', 4: 'B-LOC', 5: 'I-LOC'}
Initial tag counts in test_tags: {'O': 25000, 'B-CHAR': 820, 'I-CHAR': 85, 'B-LOC': 216, 'B-ORG': 2, 'I-LOC': 2}


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/7, Train Loss: 0.054613624334596336
Epoch 2/7, Train Loss: 0.008279851809808404
Epoch 3/7, Train Loss: 0.004345378296814598
Epoch 4/7, Train Loss: 0.003000743028331121
Epoch 5/7, Train Loss: 0.0021651852104789134
Epoch 6/7, Train Loss: 0.0016578336803918811
Epoch 7/7, Train Loss: 0.0013517654544650131
Validation loss per 100 evaluation steps: 0.16483165323734283
Validation Loss: 0.06220581477714909
Validation Accuracy: 0.9900095693779905
F1 Score: 0.8716981132075472
{'CHAR': {'precision': 0.8446026097271648, 'recall': 0.8682926829268293, 'f1-score': 0.8562838244137101, 'support': 820}, 'LOC': {'precision': 0.8865546218487395, 'recall': 0.9768518518518519, 'f1-score': 0.9295154185022027, 'support': 216}, 'ORG': {'precision': 1.0, 'recall': 0.5, 'f1-score': 0.6666666666666666, 'support': 2}, 'micro avg': {'precision': 0.8539741219963032, 'recall': 0.8901734104046243, 'f1-score': 0.8716981132075472, 'support': 1038}, 'macro avg': {'precision': 0.9103857438586348, 'recall': 0.78171

In [59]:
# Count initial tag occurrences in test_tags
train_tag_counts = count_tags(train_tags)
print("Initial tag counts in test_tags:", dict(train_tag_counts))

Initial tag counts in test_tags: {'O': 337411, 'B-CHAR': 8230, 'B-LOC': 1679, 'I-CHAR': 213, 'B-ORG': 6, 'I-LOC': 10}


In [60]:
train_tag_counts

defaultdict(int,
            {'O': 337411,
             'B-CHAR': 8230,
             'B-LOC': 1679,
             'I-CHAR': 213,
             'B-ORG': 6,
             'I-LOC': 10})

In [3]:
model.save_pretrained("final_train")
tokenizer.save_pretrained('final_tokenizer')

('final_tokenizer/tokenizer_config.json',
 'final_tokenizer/special_tokens_map.json',
 'final_tokenizer/vocab.txt',
 'final_tokenizer/added_tokens.json',
 'final_tokenizer/tokenizer.json')

In [4]:
import json
config = json.load(open('bert_train/config.json'))
config['id2label'] = ids_to_labels
config['label2id'] = labels_to_ids
json.dump(config, open('bert_train/config.json', 'w'))

Model metrics per epoch

In [61]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizerFast, BertForTokenClassification
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from seqeval.metrics import f1_score as seq_f1_score, accuracy_score
from seqeval.metrics import classification_report as seqeval_classification_report    
from collections import defaultdict

MAX_LEN = 174
BATCH_SIZE = 64
EPOCHS = 10
MAX_GRAD_NORM = 5
MODEL_NAME = 'bert-base-uncased'
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'
print(device)

# Assuming replace_per_tags, read_data, and dataset are defined as in previous steps

def valid(model, testing_loader):
    # put model in evaluation mode
    model.eval()

    eval_loss, eval_accuracy = 0, 0
    nb_eval_examples, nb_eval_steps = 0, 0
    eval_preds, eval_labels = [], []

    with torch.no_grad():
        for idx, batch in enumerate(testing_loader):
            ids = batch['input_ids'].to(device, dtype=torch.long)
            mask = batch['attention_mask'].to(device, dtype=torch.long)
            labels = batch['labels'].to(device, dtype=torch.long)

            # Forward pass
            outputs = model(input_ids=ids, attention_mask=mask, labels=labels)
            loss = outputs.loss
            eval_logits = outputs.logits

            eval_loss += loss.item()
            nb_eval_steps += 1
            nb_eval_examples += labels.size(0)

            if idx % 100 == 0:
                loss_step = eval_loss / nb_eval_steps
                print(f"Validation loss per 100 evaluation steps: {loss_step}")

            # Compute evaluation accuracy
            active_logits = eval_logits.view(-1, model.config.num_labels)  # shape (batch_size * seq_len, num_labels)
            flattened_predictions = torch.argmax(active_logits, axis=1)  # shape (batch_size * seq_len,)

            for i in range(labels.size(0)):
                label = labels[i]
                pred = flattened_predictions.view(labels.size(0), labels.size(1))[i]

                active_accuracy = label != -100  # shape (seq_len,)
                label = torch.masked_select(label, active_accuracy)
                pred = torch.masked_select(pred, active_accuracy)

                eval_labels.append([ids_to_labels[id.item()] for id in label])
                eval_preds.append([ids_to_labels[id.item()] for id in pred])

                tmp_eval_accuracy = accuracy_score(label.cpu().numpy(), pred.cpu().numpy())
                eval_accuracy += tmp_eval_accuracy

    eval_loss = eval_loss / nb_eval_steps
    eval_accuracy = accuracy_score(eval_labels, eval_preds)
    F1 = seq_f1_score(eval_labels, eval_preds)
    print(f"Validation Loss: {eval_loss}")
    print(f"Validation Accuracy: {eval_accuracy}")
    print(f"F1 Score: {F1}")
    report = seqeval_classification_report(eval_labels, eval_preds, output_dict=True)
    print(report)
    
    return eval_loss, eval_accuracy, F1, report


# Lists to store metrics for each epoch
all_metrics = []
all_reports = []

# Training and evaluation for each epoch
for epoch in range(EPOCHS):
    train_loss = train_model(training_set, model, optimizer)
    print(f"Epoch {epoch + 1}/{EPOCHS}, Train Loss: {train_loss}")
    
    eval_loss, eval_accuracy, F1, eval_report = valid(model, testing_loader)
    print(f"Epoch {epoch + 1}/{EPOCHS}, Eval Loss: {eval_loss}, Eval Accuracy: {eval_accuracy}")
    
    # Store the metrics for this epoch
    metrics = {
        "epoch": epoch + 1,
        "train_loss": train_loss,
        "eval_loss": eval_loss,
        "accuracy": eval_accuracy,
        "f1_score": F1
    }
    all_metrics.append(metrics)
    
    # Flatten the classification report for this epoch
    flat_report = []
    for label, scores in eval_report.items():
        flat_report.append({
            "epoch": epoch + 1,
            "label": label,
            "precision": scores["precision"],
            "recall": scores["recall"],
            "f1-score": scores["f1-score"],
            "support": scores["support"]
        })
    all_reports.extend(flat_report)

# Convert the lists of metrics and reports to DataFrames
metrics_df = pd.DataFrame(all_metrics)
reports_df = pd.DataFrame(all_reports)

# Display the DataFrames
print(metrics_df)
print(reports_df)


cuda
Epoch 1/10, Train Loss: 0.05366666917170095
Validation loss per 100 evaluation steps: 0.10730549693107605
Validation Loss: 0.05126193371704883
Validation Accuracy: 0.988133971291866
F1 Score: 0.8108360579168612


  _warn_prf(average, modifier, msg_start, len(result))


{'CHAR': {'precision': 0.7551487414187643, 'recall': 0.8048780487804879, 'f1-score': 0.7792207792207793, 'support': 820}, 'LOC': {'precision': 0.9082969432314411, 'recall': 0.9629629629629629, 'f1-score': 0.9348314606741573, 'support': 216}, 'ORG': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 2}, 'micro avg': {'precision': 0.7869446962828649, 'recall': 0.8362235067437379, 'f1-score': 0.8108360579168612, 'support': 1038}, 'macro avg': {'precision': 0.5544818948834017, 'recall': 0.5892803372478169, 'f1-score': 0.5713507466316455, 'support': 1038}, 'weighted avg': {'precision': 0.7855627241824451, 'recall': 0.8362235067437379, 'f1-score': 0.8101008039177813, 'support': 1038}}
Epoch 1/10, Eval Loss: 0.05126193371704883, Eval Accuracy: 0.988133971291866
Epoch 2/10, Train Loss: 0.008632887777305878
Validation loss per 100 evaluation steps: 0.11779356002807617
Validation Loss: 0.04939408013597131
Validation Accuracy: 0.9895885167464115
F1 Score: 0.850984067478913
{'CHAR': {'p

In [62]:
metrics_df.to_csv('data/model-metrics-epoch.csv', index=False)

In [63]:
reports_df.to_csv('data/model-reports-epoch.csv', index=False)

Trying to do EWT Data

In [86]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizerFast, BertForTokenClassification
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from seqeval.metrics import f1_score, accuracy_score
from seqeval.metrics import classification_report as seqeval_classification_report    
from collections import defaultdict


MAX_LEN = 174
BATCH_SIZE = 64
EPOCHS = 7
MAX_GRAD_NORM = 5
MODEL_NAME = 'bert-base-uncased'
from torch import cuda


# Data Reading and Preprocessing Functions


# Check if CUDA is available
if torch.cuda.is_available():
    # Get the number of GPUs
    num_gpus = torch.cuda.device_count()
    print(f"Number of GPUs available: {num_gpus}")
    
    # Get the name and other details of each GPU
    for i in range(num_gpus):
        print(f"GPU {i}: {torch.cuda.get_device_name(i)}")
        print(f"  Memory Allocated: {torch.cuda.memory_allocated(i) / 1024**3:.2f} GB")
        print(f"  Memory Cached: {torch.cuda.memory_reserved(i) / 1024**3:.2f} GB")
else:
    print("CUDA is not available. Running on CPU.")

class dataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_len):
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __getitem__(self, index):
        sentence = self.data.sentence[index].strip().split()
        word_labels = self.data.word_labels[index].split(",")

        encoding = self.tokenizer(sentence,
                                  is_split_into_words=True,
                                  return_offsets_mapping=True,
                                  padding='max_length',
                                  truncation=True,
                                  max_length=self.max_len)

        labels = [labels_to_ids[label] for label in word_labels]

        encoded_labels = np.ones(len(encoding["offset_mapping"]), dtype=int) * -100
        i = 0
        for idx, mapping in enumerate(encoding["offset_mapping"]):
            if mapping[0] == 0 and mapping[1] != 0:
                encoded_labels[idx] = labels[i]
                i += 1

        item = {key: torch.as_tensor(val) for key, val in encoding.items()}
        item['labels'] = torch.as_tensor(encoded_labels)

        return item

    def __len__(self):
        return self.len

def read_data(file_path):
    sentences, labels = [], []
    sentence, label = [], []
    with open(file_path, encoding="utf-8") as file:
        for line in file:
            if line.startswith("#"):
                continue
            elif line == "\n":
                if sentence:
                    sentences.append(sentence)
                    labels.append(label)
                    sentence, label = [], []
            else:
                parts = line.strip().split("\t")
                sentence.append(parts[1].lower())  # Convert the token to lowercase before appending
                label.append(clean_tag(parts[2]))
    if sentence:
        sentences.append(sentence)
        labels.append(label)
    return sentences, labels

def clean_tag(tag):
    if tag.count('-') > 1:
        prefix, entity = tag.split('-', 1)
        tag = f"{prefix}-{entity.replace('-', '')}"
    return tag

def train_model(training_set, model, optimizer):
    model.train()
    tr_loss = 0
    nb_tr_steps = 0

    training_loader = DataLoader(training_set, batch_size=BATCH_SIZE, shuffle=True)
    for idx, batch in enumerate(training_loader):
        ids = batch['input_ids'].to(device, dtype=torch.long)
        mask = batch['attention_mask'].to(device, dtype=torch.long)
        labels = batch['labels'].to(device, dtype=torch.long)

        outputs = model(input_ids=ids, attention_mask=mask, labels=labels)
        loss = outputs.loss
        tr_logits = outputs.logits

        tr_loss += loss.item()
        nb_tr_steps += 1

        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=MAX_GRAD_NORM)
        optimizer.step()

    epoch_loss = tr_loss / nb_tr_steps
    return epoch_loss

train_tokens, train_tags = read_data("./en_ewt-ud-train.iob2")
test_tokens, test_tags = read_data("./tagged_sentences_test.iob2")
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
def replace_per_tags(tags_list):
    updated_tags_list = []
    for tags in tags_list:
        updated_tags = []
        for tag in tags:
            if tag == "B-PER":
                updated_tags.append("B-CHAR")
            elif tag == "I-PER":
                updated_tags.append("I-CHAR")
            else:
                updated_tags.append(tag)
        updated_tags_list.append(updated_tags)
    return updated_tags_list

train_tags = replace_per_tags(train_tags)

data = {'sentence': [" ".join(sentence) for sentence in train_tokens],
        'word_labels': [",".join(tags) for tags in train_tags]}

df = pd.DataFrame(data)

data_test = {'sentence': [" ".join(sentence) for sentence in test_tokens],
             'word_labels': [",".join(tags) for tags in test_tags]}

df_test = pd.DataFrame(data_test)



Number of GPUs available: 1
GPU 0: NVIDIA A100 80GB PCIe
  Memory Allocated: 4.12 GB
  Memory Cached: 11.97 GB


In [87]:
# Function to count tag occurrences
def count_tags(tags_list):
    tag_counts = defaultdict(int)
    for sentence in tags_list:
        for tag in sentence:
            tag_counts[tag] += 1
    return tag_counts

In [88]:
# Initialize a dictionary to hold the counts
tag_counts_train = defaultdict(int)

# Iterate through each list in test_tags and count the occurrences of each tag
for sentence in train_tags:
    for tag in sentence:
        tag_counts_train[tag] += 1

# Convert the defaultdict to a regular dictionary for easier printing
tag_counts_train = dict(tag_counts_train)

# Print the counts for each tag
print('tag_counts_train: ')
for tag, count in tag_counts_train.items():
    print(f"{tag}: {count}")

# Create mappings
all_tags = [tag for tags in df['word_labels'] for tag in tags.split(",")]
unique_tags = set(all_tags)
labels_to_ids = {k: v for v, k in enumerate(unique_tags)}
ids_to_labels = {v: k for k, v in labels_to_ids.items()}

# Display the mappings
print("labels_to_ids:", labels_to_ids)
print("ids_to_labels:", ids_to_labels)

tag_counts_train: 
O: 194219
B-LOC: 2712
I-LOC: 877
B-CHAR: 2874
B-ORG: 1436
I-ORG: 1167
I-CHAR: 1294
labels_to_ids: {'B-CHAR': 0, 'O': 1, 'I-CHAR': 2, 'B-ORG': 3, 'I-ORG': 4, 'B-LOC': 5, 'I-LOC': 6}
ids_to_labels: {0: 'B-CHAR', 1: 'O', 2: 'I-CHAR', 3: 'B-ORG', 4: 'I-ORG', 5: 'B-LOC', 6: 'I-LOC'}


In [67]:
# Create training and testing datasets
training_set = dataset(df, tokenizer, MAX_LEN)
testing_set = dataset(df_test, tokenizer, MAX_LEN)

test_params = {'batch_size': BATCH_SIZE, 'shuffle': False, 'num_workers': 0}
testing_loader = DataLoader(testing_set, **test_params)

initial_train_tag_counts = count_tags(train_tags)
print("Initial tag counts in train_tags:", dict(initial_train_tag_counts))


# Count initial tag occurrences in test_tags
initial_tag_counts = count_tags(test_tags)
print("Initial tag counts in test_tags:", dict(initial_tag_counts))

Initial tag counts in test_tags: {'O': 194219, 'B-LOC': 2712, 'I-LOC': 877, 'B-CHAR': 2874, 'B-ORG': 1436, 'I-ORG': 1167, 'I-CHAR': 1294}
Initial tag counts in test_tags: {'O': 25000, 'B-CHAR': 820, 'I-CHAR': 85, 'B-LOC': 216, 'B-ORG': 2, 'I-LOC': 2}


In [89]:
def valid(model, testing_loader):
    # put model in evaluation mode
    model.eval()

    eval_loss, eval_accuracy = 0, 0
    nb_eval_examples, nb_eval_steps = 0, 0
    eval_preds, eval_labels = [], []

    with torch.no_grad():
        for idx, batch in enumerate(testing_loader):
            ids = batch['input_ids'].to(device, dtype=torch.long)
            mask = batch['attention_mask'].to(device, dtype=torch.long)
            labels = batch['labels'].to(device, dtype=torch.long)

            # Forward pass
            outputs = model(input_ids=ids, attention_mask=mask, labels=labels)
            loss = outputs.loss
            eval_logits = outputs.logits

            eval_loss += loss.item()
            nb_eval_steps += 1
            nb_eval_examples += labels.size(0)

            if idx % 100 == 0:
                loss_step = eval_loss / nb_eval_steps
                print(f"Validation loss per 100 evaluation steps: {loss_step}")

            # Compute evaluation accuracy
            active_logits = eval_logits.view(-1, model.config.num_labels)  # shape (batch_size * seq_len, num_labels)
            flattened_predictions = torch.argmax(active_logits, axis=1)  # shape (batch_size * seq_len,)

            for i in range(labels.size(0)):
                label = labels[i]
                pred = flattened_predictions.view(labels.size(0), labels.size(1))[i]

                active_accuracy = label != -100  # shape (seq_len,)
                label = torch.masked_select(label, active_accuracy)
                pred = torch.masked_select(pred, active_accuracy)

                eval_labels.append([ids_to_labels[id.item()] for id in label])
                eval_preds.append([ids_to_labels[id.item()] for id in pred])

                tmp_eval_accuracy = accuracy_score(label.cpu().numpy(), pred.cpu().numpy())
                eval_accuracy += tmp_eval_accuracy

    eval_loss = eval_loss / nb_eval_steps
    eval_accuracy = accuracy_score(eval_labels, eval_preds)
    F1_score = f1_score(eval_labels, eval_preds)
    print(f"Validation Loss: {eval_loss}")
    print(f"Validation Accuracy: {eval_accuracy}")
    print(f"F1 Score: {F1_score}")
    report = seqeval_classification_report(eval_labels, eval_preds, output_dict=True)
    print(report)
    
    return eval_loss, eval_accuracy, F1_score, report


# Train and evaluate the model on the entire dataset
model = BertForTokenClassification.from_pretrained(MODEL_NAME, num_labels=len(labels_to_ids))
model.to(device)
optimizer = torch.optim.Adam(params=model.parameters(), lr=3e-5)

# Training the model
for epoch in range(EPOCHS):
    train_loss = train_model(training_set, model, optimizer)
    print(f"Epoch {epoch + 1}/{EPOCHS}, Train Loss: {train_loss}")

# Evaluating the model
eval_loss, eval_accuracy, f1_score, eval_report = valid(model, testing_loader)
print(f"Eval Loss: {eval_loss}, Eval Accuracy: {eval_accuracy}")
print(eval_report)


# Display the evaluation metrics in a DataFrame
metrics = {
    "eval_loss": eval_loss,
    "accuracy": eval_accuracy,
    "f1_score": f1_score,
    "report": eval_report
}
metrics_df = pd.DataFrame([metrics])
print(metrics_df)

# Flatten the classification report for easier viewing
flat_reports = []
for label, scores in eval_report.items():
    flat_reports.append({
        "label": label,
        "precision": scores["precision"],
        "recall": scores["recall"],
        "f1-score": scores["f1-score"],
        "support": scores["support"]
    })

reports_df = pd.DataFrame(flat_reports)
print(reports_df)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/7, Train Loss: 0.056734724169757195
Epoch 2/7, Train Loss: 0.008274268891156657
Epoch 3/7, Train Loss: 0.004442892926945321
Epoch 4/7, Train Loss: 0.00292872416117276
Epoch 5/7, Train Loss: 0.0021876643148026945
Epoch 6/7, Train Loss: 0.00184585696249788
Epoch 7/7, Train Loss: 0.0011599590029763235
Validation loss per 100 evaluation steps: 0.1601811796426773
Validation Loss: 0.061447188542741865
Validation Accuracy: 0.9904306220095693
F1 Score: 0.8768796992481203
{'CHAR': {'precision': 0.8325635103926097, 'recall': 0.8792682926829268, 'f1-score': 0.8552787663107948, 'support': 820}, 'LOC': {'precision': 0.9459459459459459, 'recall': 0.9722222222222222, 'f1-score': 0.9589041095890412, 'support': 216}, 'ORG': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 2}, 'micro avg': {'precision': 0.8559633027522936, 'recall': 0.8988439306358381, 'f1-score': 0.8768796992481203, 'support': 1038}, 'macro avg': {'precision': 0.9261698187795185, 'recall': 0.9504968383017163, 'f1-s

In [50]:
metrics_df.to_csv('data/EWT-training_metrics.csv', index=False)

In [52]:
reports_df.to_csv('data/EWT-training-report.csv', index=False)

In [69]:
model.save_pretrained("EWT-baseline")
tokenizer.save_pretrained('EWT-tokenizer')

('EWT-tokenizer/tokenizer_config.json',
 'EWT-tokenizer/special_tokens_map.json',
 'EWT-tokenizer/vocab.txt',
 'EWT-tokenizer/added_tokens.json',
 'EWT-tokenizer/tokenizer.json')

In [72]:
import json
config = json.load(open('EWT-baseline/config.json'))
config['id2label'] = ids_to_labels
config['label2id'] = labels_to_ids
json.dump(config, open('EWT-baseline/config.json', 'w'))

'BertForTokenClassification'

EWT: ##Metrics per epoch


In [54]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizerFast, BertForTokenClassification
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from seqeval.metrics import f1_score as seq_f1_score, accuracy_score
from seqeval.metrics import classification_report as seqeval_classification_report    
from collections import defaultdict

MAX_LEN = 174
BATCH_SIZE = 64
EPOCHS = 10
MAX_GRAD_NORM = 5
MODEL_NAME = 'bert-base-uncased'
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'
print(device)

# Assuming replace_per_tags, read_data, and dataset are defined as in previous steps

def valid(model, testing_loader):
    # put model in evaluation mode
    model.eval()

    eval_loss, eval_accuracy = 0, 0
    nb_eval_examples, nb_eval_steps = 0, 0
    eval_preds, eval_labels = [], []

    with torch.no_grad():
        for idx, batch in enumerate(testing_loader):
            ids = batch['input_ids'].to(device, dtype=torch.long)
            mask = batch['attention_mask'].to(device, dtype=torch.long)
            labels = batch['labels'].to(device, dtype=torch.long)

            # Forward pass
            outputs = model(input_ids=ids, attention_mask=mask, labels=labels)
            loss = outputs.loss
            eval_logits = outputs.logits

            eval_loss += loss.item()
            nb_eval_steps += 1
            nb_eval_examples += labels.size(0)

            if idx % 100 == 0:
                loss_step = eval_loss / nb_eval_steps
                print(f"Validation loss per 100 evaluation steps: {loss_step}")

            # Compute evaluation accuracy
            active_logits = eval_logits.view(-1, model.config.num_labels)  # shape (batch_size * seq_len, num_labels)
            flattened_predictions = torch.argmax(active_logits, axis=1)  # shape (batch_size * seq_len,)

            for i in range(labels.size(0)):
                label = labels[i]
                pred = flattened_predictions.view(labels.size(0), labels.size(1))[i]

                active_accuracy = label != -100  # shape (seq_len,)
                label = torch.masked_select(label, active_accuracy)
                pred = torch.masked_select(pred, active_accuracy)

                eval_labels.append([ids_to_labels[id.item()] for id in label])
                eval_preds.append([ids_to_labels[id.item()] for id in pred])

                tmp_eval_accuracy = accuracy_score(label.cpu().numpy(), pred.cpu().numpy())
                eval_accuracy += tmp_eval_accuracy

    eval_loss = eval_loss / nb_eval_steps
    eval_accuracy = accuracy_score(eval_labels, eval_preds)
    F1 = seq_f1_score(eval_labels, eval_preds)
    print(f"Validation Loss: {eval_loss}")
    print(f"Validation Accuracy: {eval_accuracy}")
    print(f"F1 Score: {F1}")
    report = seqeval_classification_report(eval_labels, eval_preds, output_dict=True)
    print(report)
    
    return eval_loss, eval_accuracy, F1, report


# Lists to store metrics for each epoch
all_metrics = []
all_reports = []

# Training and evaluation for each epoch
for epoch in range(EPOCHS):
    train_loss = train_model(training_set, model, optimizer)
    print(f"Epoch {epoch + 1}/{EPOCHS}, Train Loss: {train_loss}")
    
    eval_loss, eval_accuracy, F1, eval_report = valid(model, testing_loader)
    print(f"Epoch {epoch + 1}/{EPOCHS}, Eval Loss: {eval_loss}, Eval Accuracy: {eval_accuracy}")
    
    # Store the metrics for this epoch
    metrics = {
        "epoch": epoch + 1,
        "train_loss": train_loss,
        "eval_loss": eval_loss,
        "accuracy": eval_accuracy,
        "f1_score": F1
    }
    all_metrics.append(metrics)
    
    # Flatten the classification report for this epoch
    flat_report = []
    for label, scores in eval_report.items():
        flat_report.append({
            "epoch": epoch + 1,
            "label": label,
            "precision": scores["precision"],
            "recall": scores["recall"],
            "f1-score": scores["f1-score"],
            "support": scores["support"]
        })
    all_reports.extend(flat_report)

# Convert the lists of metrics and reports to DataFrames
metrics_df = pd.DataFrame(all_metrics)
reports_df = pd.DataFrame(all_reports)

# Display the DataFrames
print(metrics_df)
print(reports_df)


cuda
Epoch 1/10, Train Loss: 0.003665855062925922
Validation loss per 100 evaluation steps: 0.39667797088623047
Validation Loss: 0.27037916084130603
Validation Accuracy: 0.9535693779904306
F1 Score: 0.3712433706540955
{'CHAR': {'precision': 0.4827586206896552, 'recall': 0.23902439024390243, 'f1-score': 0.31973898858075034, 'support': 820}, 'LOC': {'precision': 0.5021097046413502, 'recall': 0.5509259259259259, 'f1-score': 0.5253863134657836, 'support': 216}, 'ORG': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 2}, 'micro avg': {'precision': 0.4779969650986343, 'recall': 0.30346820809248554, 'f1-score': 0.3712433706540955, 'support': 1038}, 'macro avg': {'precision': 0.32828944177700176, 'recall': 0.26331677205660947, 'f1-score': 0.2817084340155113, 'support': 1038}, 'weighted avg': {'precision': 0.4858552650944595, 'recall': 0.30346820809248554, 'f1-score': 0.3619165841472298, 'support': 1038}}
Epoch 1/10, Eval Loss: 0.27037916084130603, Eval Accuracy: 0.9535693779904306

In [56]:
metrics_df.to_csv('data/EWT_epochs_metrics.csv', index=False)

In [57]:
reports_df.to_csv('data/EWT_epochs_reports.csv', index=False)

Train EWT+LOTR

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizerFast, BertForTokenClassification
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from seqeval.metrics import f1_score, accuracy_score
from seqeval.metrics import classification_report as seqeval_classification_report    
from collections import defaultdict


MAX_LEN = 174
BATCH_SIZE = 64
EPOCHS = 7
MAX_GRAD_NORM = 5
MODEL_NAME = 'bert-base-uncased'
from torch import cuda


# Data Reading and Preprocessing Functions


# Check if CUDA is available
if torch.cuda.is_available():
    # Get the number of GPUs
    num_gpus = torch.cuda.device_count()
    print(f"Number of GPUs available: {num_gpus}")
    
    # Get the name and other details of each GPU
    for i in range(num_gpus):
        print(f"GPU {i}: {torch.cuda.get_device_name(i)}")
        print(f"  Memory Allocated: {torch.cuda.memory_allocated(i) / 1024**3:.2f} GB")
        print(f"  Memory Cached: {torch.cuda.memory_reserved(i) / 1024**3:.2f} GB")
else:
    print("CUDA is not available. Running on CPU.")

class dataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_len):
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __getitem__(self, index):
        sentence = self.data.sentence[index].strip().split()
        word_labels = self.data.word_labels[index].split(",")

        encoding = self.tokenizer(sentence,
                                  is_split_into_words=True,
                                  return_offsets_mapping=True,
                                  padding='max_length',
                                  truncation=True,
                                  max_length=self.max_len)

        labels = [labels_to_ids[label] for label in word_labels]

        encoded_labels = np.ones(len(encoding["offset_mapping"]), dtype=int) * -100
        i = 0
        for idx, mapping in enumerate(encoding["offset_mapping"]):
            if mapping[0] == 0 and mapping[1] != 0:
                encoded_labels[idx] = labels[i]
                i += 1

        item = {key: torch.as_tensor(val) for key, val in encoding.items()}
        item['labels'] = torch.as_tensor(encoded_labels)

        return item

    def __len__(self):
        return self.len

def read_data(file_path):
    sentences, labels = [], []
    sentence, label = [], []
    with open(file_path, encoding="utf-8") as file:
        for line in file:
            if line.startswith("#"):
                continue
            elif line == "\n":
                if sentence:
                    sentences.append(sentence)
                    labels.append(label)
                    sentence, label = [], []
            else:
                parts = line.strip().split("\t")
                sentence.append(parts[1].lower())  # Convert the token to lowercase before appending
                label.append(clean_tag(parts[2]))
    if sentence:
        sentences.append(sentence)
        labels.append(label)
    return sentences, labels

def clean_tag(tag):
    if tag.count('-') > 1:
        prefix, entity = tag.split('-', 1)
        tag = f"{prefix}-{entity.replace('-', '')}"
    return tag

def train_model(training_set, model, optimizer):
    model.train()
    tr_loss = 0
    nb_tr_steps = 0

    training_loader = DataLoader(training_set, batch_size=BATCH_SIZE, shuffle=True)
    for idx, batch in enumerate(training_loader):
        ids = batch['input_ids'].to(device, dtype=torch.long)
        mask = batch['attention_mask'].to(device, dtype=torch.long)
        labels = batch['labels'].to(device, dtype=torch.long)

        outputs = model(input_ids=ids, attention_mask=mask, labels=labels)
        loss = outputs.loss
        tr_logits = outputs.logits

        tr_loss += loss.item()
        nb_tr_steps += 1

        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=MAX_GRAD_NORM)
        optimizer.step()

    epoch_loss = tr_loss / nb_tr_steps
    return epoch_loss

train_tokens, train_tags = read_data("./tagged_sentences_train.iob2")
test_tokens, test_tags = read_data("./tagged_sentences_test.iob2")

data = {'sentence': [" ".join(sentence) for sentence in train_tokens],
        'word_labels': [",".join(tags) for tags in train_tags]}

df = pd.DataFrame(data)

data_test = {'sentence': [" ".join(sentence) for sentence in test_tokens],
             'word_labels': [",".join(tags) for tags in test_tags]}

df_test = pd.DataFrame(data_test)

# Initialize a dictionary to hold the counts
tag_counts = defaultdict(int)

# Iterate through each list in test_tags and count the occurrences of each tag
for sentence in test_tags:
    for tag in sentence:
        tag_counts[tag] += 1

# Convert the defaultdict to a regular dictionary for easier printing
tag_counts = dict(tag_counts)

# Print the counts for each tag
for tag, count in tag_counts.items():
    print(f"{tag}: {count}")

labels_to_ids = {'B-CHAR': 0, 'O': 1, 'I-CHAR': 2, 'B-ORG': 3, 'I-ORG': 4, 'B-LOC': 5, 'I-LOC': 6}
ids_to_labels = {0: 'B-CHAR', 1: 'O', 2: 'I-CHAR', 3: 'B-ORG', 4: 'I-ORG', 5: 'B-LOC', 6: 'I-LOC'}

# Create training and testing datasets
training_set = dataset(df, tokenizer, MAX_LEN)
testing_set = dataset(df_test, tokenizer, MAX_LEN)

test_params = {'batch_size': BATCH_SIZE, 'shuffle': False, 'num_workers': 0}
testing_loader = DataLoader(testing_set, **test_params)

# Function to count tag occurrences
def count_tags(tags_list):
    tag_counts = defaultdict(int)
    for sentence in tags_list:
        for tag in sentence:
            tag_counts[tag] += 1
    return tag_counts

# Count initial tag occurrences in test_tags
initial_tag_counts = count_tags(test_tags)
print("Initial tag counts in test_tags:", dict(initial_tag_counts))

def valid(model, testing_loader):
    # put model in evaluation mode
    model.eval()

    eval_loss, eval_accuracy = 0, 0
    nb_eval_examples, nb_eval_steps = 0, 0
    eval_preds, eval_labels = [], []

    with torch.no_grad():
        for idx, batch in enumerate(testing_loader):
            ids = batch['input_ids'].to(device, dtype=torch.long)
            mask = batch['attention_mask'].to(device, dtype=torch.long)
            labels = batch['labels'].to(device, dtype=torch.long)

            # Forward pass
            outputs = model(input_ids=ids, attention_mask=mask, labels=labels)
            loss = outputs.loss
            eval_logits = outputs.logits

            eval_loss += loss.item()
            nb_eval_steps += 1
            nb_eval_examples += labels.size(0)

            if idx % 100 == 0:
                loss_step = eval_loss / nb_eval_steps
                print(f"Validation loss per 100 evaluation steps: {loss_step}")

            # Compute evaluation accuracy
            active_logits = eval_logits.view(-1, model.config.num_labels)  # shape (batch_size * seq_len, num_labels)
            flattened_predictions = torch.argmax(active_logits, axis=1)  # shape (batch_size * seq_len,)

            for i in range(labels.size(0)):
                label = labels[i]
                pred = flattened_predictions.view(labels.size(0), labels.size(1))[i]

                active_accuracy = label != -100  # shape (seq_len,)
                label = torch.masked_select(label, active_accuracy)
                pred = torch.masked_select(pred, active_accuracy)

                eval_labels.append([ids_to_labels[id.item()] for id in label])
                eval_preds.append([ids_to_labels[id.item()] for id in pred])

                tmp_eval_accuracy = accuracy_score(label.cpu().numpy(), pred.cpu().numpy())
                eval_accuracy += tmp_eval_accuracy

    eval_loss = eval_loss / nb_eval_steps
    eval_accuracy = accuracy_score(eval_labels, eval_preds)
    F1_score = f1_score(eval_labels, eval_preds)
    print(f"Validation Loss: {eval_loss}")
    print(f"Validation Accuracy: {eval_accuracy}")
    print(f"F1 Score: {F1_score}")
    report = seqeval_classification_report(eval_labels, eval_preds, output_dict=True)
    print(report)
    
    return eval_loss, eval_accuracy, F1_score, report


# Train and evaluate the model on the entire dataset

model.to(device)
optimizer = torch.optim.Adam(params=model.parameters(), lr=3e-5)

# Training the model
for epoch in range(EPOCHS):
    train_loss = train_model(training_set, model, optimizer)
    print(f"Epoch {epoch + 1}/{EPOCHS}, Train Loss: {train_loss}")

# Evaluating the model
eval_loss, eval_accuracy, f1_score, eval_report = valid(model, testing_loader)
print(f"Eval Loss: {eval_loss}, Eval Accuracy: {eval_accuracy}")
print(eval_report)


# Display the evaluation metrics in a DataFrame
metrics = {
    "eval_loss": eval_loss,
    "accuracy": eval_accuracy,
    "f1_score": f1_score,
    "report": eval_report
}
metrics_df = pd.DataFrame([metrics])
print(metrics_df)

# Flatten the classification report for easier viewing
flat_reports = []
for label, scores in eval_report.items():
    flat_reports.append({
        "label": label,
        "precision": scores["precision"],
        "recall": scores["recall"],
        "f1-score": scores["f1-score"],
        "support": scores["support"]
    })

reports_df = pd.DataFrame(flat_reports)
print(reports_df)


Number of GPUs available: 1
GPU 0: NVIDIA A100 80GB PCIe
  Memory Allocated: 4.13 GB
  Memory Cached: 11.97 GB
O: 25000
B-CHAR: 820
I-CHAR: 85
B-LOC: 216
B-ORG: 2
I-LOC: 2
Initial tag counts in test_tags: {'O': 25000, 'B-CHAR': 820, 'I-CHAR': 85, 'B-LOC': 216, 'B-ORG': 2, 'I-LOC': 2}
Epoch 1/7, Train Loss: 0.0011966206406399417
Epoch 2/7, Train Loss: 0.0009113156999200716
Epoch 3/7, Train Loss: 0.000998297705215229
Epoch 4/7, Train Loss: 0.0005944750726566028


In [85]:
metrics_df.to_csv('data/EWT_LOTR_metrics.csv', index=False)

In [84]:
reports_df.to_csv('data/EWT_LOTR_reports.csv', index=False)

In [79]:
model.save_pretrained("EWT-LOTR-baseline")
tokenizer.save_pretrained('EWT-LOTR-tokenizer')
import json
config = json.load(open('EWT-LOTR-baseline/config.json'))
config['id2label'] = ids_to_labels
config['label2id'] = labels_to_ids
json.dump(config, open('EWT-LOTR-baseline/config.json', 'w'))