# Import Libraries

In [1]:
# library
import os
#from pprint import pprint as pp

import torch
from torch import nn
from transformers import LongformerTokenizer, AutoTokenizer
from torch.utils.data import Dataset, DataLoader

from torch.optim import AdamW
from torch.nn import CrossEntropyLoss

import random

from collections import defaultdict

In [2]:
seed = 43

random.seed(seed)

In [3]:
first_dataset_doc_path = "./dataset/First_Phase_Release(Correction)/First_Phase_Text_Dataset/"
second_dataset_doc_path = "./dataset/Second_Phase_Dataset/Second_Phase_Text_Dataset/"
label_path = ["./dataset/First_Phase_Release(Correction)/answer.txt", "./dataset/Second_Phase_Dataset/answer.txt"]
val_dataset_doc_parh = "./dataset/validation_dataset/Validation_Release/"
val_label_path = "./dataset/validation_dataset/answer.txt"

first_dataset_path = [first_dataset_doc_path + file_path for file_path in os.listdir(first_dataset_doc_path)]
second_dataset_path = [second_dataset_doc_path + file_path for file_path in os.listdir(second_dataset_doc_path)]
train_path = first_dataset_path + second_dataset_path
val_path = [val_dataset_doc_parh + file_path for file_path in os.listdir(val_dataset_doc_parh)]

#check number of data-path
print(len(first_dataset_path)) #1120
print(len(second_dataset_path)) #614
print()
print(len(train_path)) #1734
print(len(val_path)) #560

1120
614

1734
560


In [4]:

def create_label_dict(label_path):
    label_dict = {}  # y
    with open(label_path, "r", encoding="utf-8-sig") as f:
        file_text = f.read().strip()  

    # (id, label, start, end, query) or (id, label, start, end, query, time_org, timefix)
    for line in file_text.split("\n"):
        sample = line.split("\t")  
        sample[2], sample[3] = int(sample[2]), int(sample[3])

        if sample[0] not in label_dict:
            label_dict[sample[0]] = [sample[1:]]
        else:
            label_dict[sample[0]].append(sample[1:])

    return label_dict

train_label_dict = create_label_dict(label_path[0])
second_dataset_label_dict = create_label_dict(label_path[1])
train_label_dict.update(second_dataset_label_dict)
val_label_dict = create_label_dict(val_label_path)


def load_medical_records(paths):
    medical_record_dict = {}
    for data_path in paths:

        if os.path.isfile(data_path):
            file_id = data_path.split("/")[-1].split(".txt")[0]
            with open(data_path, "r", encoding="utf-8") as f:
                file_text = f.read()
                medical_record_dict[file_id] = file_text
    return medical_record_dict

train_medical_record_dict = load_medical_records(train_path)
val_medical_record_dict = load_medical_records(val_path)

In [5]:
all_medical_record_dict = {**train_medical_record_dict, **val_medical_record_dict}
all_label_dict = {**train_label_dict, **val_label_dict}

In [6]:

def check_labels(text, labels, record_id, tag=False):
    for i, label in enumerate(labels):  
        extracted_text = text[label[1]:label[2]]
        if extracted_text != label[3]:
            print(f"Error in ID {record_id}, Line {i}: {label[0]}, position: {label[1]}-{label[2]}, "
                  f"label: '{label[3]}', extracted: '{extracted_text}'")
        elif tag:
            print(f"Correct in ID {record_id}, Line {i}: {label[0]}, position: {label[1]}-{label[2]}, extracted: '{extracted_text}'")

def check_all_labels(medical_records, label_dict, tag=False):
    for record_id, text in medical_records.items():
        if record_id in label_dict:
            labels = label_dict[record_id]
            check_labels(text, labels, record_id, tag)
        else:
            print(f"ID: {record_id} has no label")

         

In [7]:
# check training data
check_all_labels(all_medical_record_dict, all_label_dict)   

Error in ID 1139, Line 16: HOSPITAL, position: 2702-2722, label: 'PLANTAGENET HOSPITAL', extracted: 'PLANTAGENE3/9 JENNIE'
Error in ID 1481, Line 21: DEPARTMENT, position: 2390-2403, label: 'SEALS Central', extracted: 'SEAKALBARRI H'
Error in ID file21297, Line 20: ORGANIZATION, position: 6045-6064, label: 'KB Home Los Angeles', extracted: 'KB Home	Los Angeles'


In [8]:
# check 1139, PLANTAGENET 3/9 JENNIE COX CLOSE Pathology ?
print(all_medical_record_dict['1139'][2702:2722])
print(all_label_dict['1139'][16])

# replace it
all_label_dict['1139'][16][3]=all_medical_record_dict['1139'][2702:2722]

PLANTAGENE3/9 JENNIE
['HOSPITAL', 2702, 2722, 'PLANTAGENET HOSPITAL']


In [9]:
# check 1481, there is no DEPARTMENT
print(all_medical_record_dict['1481'][2390:2403])
print(all_label_dict['1481'][21])

# remove it 
all_label_dict['1481'].pop(21)

SEAKALBARRI H
['DEPARTMENT', 2390, 2403, 'SEALS Central']


['DEPARTMENT', 2390, 2403, 'SEALS Central']

In [10]:
# check file21297, index 6047 is '\t'
all_medical_record_dict['file21297'][6045:6064]

# replace it
all_medical_record_dict['file21297'] = val_medical_record_dict['file21297'][:6047] + ' ' + val_medical_record_dict['file21297'][6048:]

In [11]:
all_keys = list(all_medical_record_dict.keys())
random.shuffle(all_keys)

In [12]:
train_size = int(0.8 * len(all_keys))
val_size = len(all_keys) - train_size

train_keys = all_keys[:train_size]
val_keys = all_keys[train_size:]

train_medical_record_dict = {key: all_medical_record_dict[key] for key in train_keys}
train_label_dict = {key: all_label_dict[key] for key in train_keys}

val_medical_record_dict = {key: all_medical_record_dict[key] for key in val_keys}
val_label_dict = {key: all_label_dict[key] for key in val_keys}

print("New Train Set Size:", len(train_medical_record_dict))
print("New Validation Set Size:", len(val_medical_record_dict))

New Train Set Size: 1835
New Validation Set Size: 459


In [13]:
# fix it
labels_type_table={'OTHER': 0, 'PATIENT': 1, 'DOCTOR': 2, 'CITY': 3, 'ROOM': 4, 'STREET': 5, 'MEDICALRECORD': 6, 'DEPARTMENT': 7, 'LOCATION-OTHER': 8, 'COUNTRY': 9, 'IDNUM': 10, 'STATE': 11, 'AGE': 12, 'SET': 13, 'HOSPITAL': 14, 'DATE': 15, 'ZIP': 16, 'URL': 17, 'DURATION': 18, 'ORGANIZATION': 19, 'TIME': 20, 'PHONE': 21}
print(labels_type_table)

{'OTHER': 0, 'PATIENT': 1, 'DOCTOR': 2, 'CITY': 3, 'ROOM': 4, 'STREET': 5, 'MEDICALRECORD': 6, 'DEPARTMENT': 7, 'LOCATION-OTHER': 8, 'COUNTRY': 9, 'IDNUM': 10, 'STATE': 11, 'AGE': 12, 'SET': 13, 'HOSPITAL': 14, 'DATE': 15, 'ZIP': 16, 'URL': 17, 'DURATION': 18, 'ORGANIZATION': 19, 'TIME': 20, 'PHONE': 21}


In [14]:
from transformers import LongformerModel
from torchcrf import CRF

class MyLongformerModel(nn.Module):
    def __init__(self, num_labels):
        super(MyLongformerModel, self).__init__()

        self.longformer = LongformerModel.from_pretrained('allenai/longformer-base-4096')
        self.dropout = nn.Dropout(p=0.1)
        self.classifier = nn.Linear(768, num_labels)
        self.crf = CRF(num_labels, batch_first=True)

    def forward(self, input_ids, attention_mask, labels=None):
        outputs = self.longformer(input_ids=input_ids, attention_mask=attention_mask, return_dict=True)
        sequence_output = self.dropout(outputs.last_hidden_state)
        logits = self.classifier(sequence_output)

        if labels is not None:
            loss = -self.crf(logits, labels, mask=attention_mask.byte())
            return loss
        else:
            return self.crf.decode(logits, mask=attention_mask.byte())

model = MyLongformerModel(num_labels=22)


In [15]:
model_name = "allenai/longformer-base-4096"
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

# Decode Model Result

In [16]:
def decode_model_result(model_predict_list, offsets_mapping, labels_type_table):
    """
    Decode the model predictions into a list of labeled segments.

    Parameters:
    - model_predict_list (list): List of predicted labels from the model.
    - offsets_mapping (list): List of offset mappings for the predicted labels.
    - labels_type_table (dict): Dictionary mapping label IDs to label names.

    Returns:
    - list: List of labeled segments, where each segment is represented as (label, start, end).
    """

    id_to_label = {id: label for label, id in labels_type_table.items()}
    predict_y = []
    pre_label_id = 0

    for position_id, label_id in enumerate(model_predict_list):
        if label_id != 0:
            if pre_label_id != label_id:
                start = int(offsets_mapping[position_id][0])
            end = int(offsets_mapping[position_id][1])

        if pre_label_id != label_id and pre_label_id != 0:
            predict_y.append([id_to_label[pre_label_id], start, end])
        pre_label_id = label_id

    if pre_label_id != 0:
        predict_y.append([id_to_label[pre_label_id], start, end])

    return predict_y


def merge_overlapping_predictions(predictions):
    """
    Merge overlapping labeled segments in a list.

    Parameters:
    - predictions (list): List of labeled segments, where each segment is represented as (label, start, end).

    Returns:
    - list: List of merged labeled segments after resolving overlaps.
    """
    if not predictions:
        return []

    sorted_predictions = sorted(predictions, key=lambda x: x[1])

    merged_predictions = [sorted_predictions[0]]
    for current in sorted_predictions[1:]:
        last = merged_predictions[-1]
        if current[0] == last[0] and current[1] <= last[2]:
            merged_predictions[-1] = (last[0], last[1], max(last[2], current[2]))
        else:
            merged_predictions.append(current)

    return merged_predictions


def predict_text_segments(model, tokenizer, text, max_length, overlap, device):
    """
    Predict labeled segments in a given text using the model.

    Parameters:
    - model: The trained model for prediction.
    - tokenizer: Tokenizer for processing the input text.
    - text (str): The input text to be processed.
    - max_length (int): Maximum length of text segments for prediction.
    - overlap (int): Overlapping length between consecutive text segments.
    - device: Device to run the model on.

    Returns:
    - list: List of predicted labeled segments, where each segment is represented as (label, start, end).
    """
    all_predictions = []
    offset = 0

    for i in range(0, len(text), max_length - overlap):
        segment = text[i:i+max_length]
        encodings = tokenizer(segment, padding=True, truncation=True, return_tensors="pt", return_offsets_mapping=True)
        encodings["input_ids"] = encodings["input_ids"].to(device)
        encodings["attention_mask"] = encodings["attention_mask"].to(device)

        with torch.no_grad():
            outputs = model(encodings["input_ids"], encodings["attention_mask"])
            model_predict_list = outputs[0]  
            predictions = decode_model_result(model_predict_list, encodings["offset_mapping"][0], labels_type_table)

        adjusted_predictions = [(label, start+offset, end+offset) for label, start, end in predictions]
        all_predictions.extend(adjusted_predictions)
        offset = i + max_length - overlap

    return all_predictions


# Post-Processing

In [17]:
def post_processing(label_name, start, end, text_segment):
    """
    Perform post-processing on labeled segments to refine label information.

    Parameters:
    - label_name (str): The predicted label for the segment.
    - start (int): Start position of the labeled segment.
    - end (int): End position of the labeled segment.
    - text_segment (str): The actual text content of the segment.

    Returns:
    - tuple: Processed label, refined start position, refined end position, and updated text content.
    """
    processed_label = label_name.strip()

    if processed_label.endswith('-') or processed_label.endswith('"') or processed_label.endswith("'"):
        processed_label = processed_label[:-1]
        end -= 1

    if processed_label == 'DATE' and text_segment.isdigit() and len(text_segment) > 8:
        end = start + 8
        text_segment = text_segment[:8]

    if processed_label == 'STATE':
        if text_segment.endswith('TAS'):
            text_segment = 'TAS'
            start = end - 3
        elif (len(text_segment) >= 3):
            if text_segment[0].isupper() and text_segment[1].isupper() and text_segment[2].islower():
                if len(text_segment) == 3:
                    text_segment = text_segment[:2]
                    end -= 1
                else:
                    text_segment = text_segment[1:]
                    start += 1

    if processed_label == 'CITY':
        if any(text_segment.endswith(suffix) for suffix in ['ONT', 'LET', 'NET', 'LAT']):
            end -= 1
        elif any(text_segment.endswith(suffix) for suffix in ['RAS', 'CHS', 'LES']):
            end -= 1

    return processed_label, start, end, text_segment


In [18]:
def merge_continuous_time_labels(predictions):
    """
    Merge continuous time labels that are adjacent in predictions.

    Parameters:
    - predictions (list of tuples): List of predictions with each tuple containing label name, start position,
      end position, and predicted text content.

    Returns:
    - list of tuples: Merged predictions where continuous time labels are combined into a single prediction.
    """
    merged_predictions = []
    prev_label = None

    for label_name, start, end, predict_str in predictions:
        if label_name == 'TIME' and prev_label and prev_label['label_name'] == 'TIME':
            if prev_label['end'] + 1 == start:
                prev_label['predict_str'] += ' ' + predict_str
                prev_label['end'] = end
                continue

        if prev_label:
            merged_predictions.append((prev_label['label_name'], prev_label['start'], prev_label['end'], prev_label['predict_str']))

        prev_label = {'label_name': label_name, 'start': start, 'end': end, 'predict_str': predict_str}

    if prev_label:
        merged_predictions.append((prev_label['label_name'], prev_label['start'], prev_label['end'], prev_label['predict_str']))

    return merged_predictions


In [19]:
def predict_for_single_sample(model, tokenizer, sample_id, val_medical_record_dict, device, max_length=4096, overlap=512):
    """
    Predict labels for a single medical record sample.

    Parameters:
    - model (torch.nn.Module): The trained model for making predictions.
    - tokenizer: The tokenizer used for encoding the input text.
    - sample_id (str): Identifier for the medical record sample.
    - val_medical_record_dict (dict): Dictionary containing medical record samples with sample_id as keys and text as values.
    - device: Device (e.g., 'cuda' or 'cpu') on which the model should run.
    - max_length (int): Maximum length for each text segment during prediction.
    - overlap (int): Overlap size between consecutive text segments during prediction.

    Returns:
    - str: String containing the predicted labels in the required format for the given medical record sample.
    """
    output_string = ""
    sample_text = val_medical_record_dict[sample_id]
    predictions = predict_text_segments(model, tokenizer, sample_text, max_length, overlap, device)
    final_predictions = merge_overlapping_predictions(predictions)

    extended_predictions = [(label_name, start, end, sample_text[start:end]) for label_name, start, end in final_predictions]

    merged_predictions = merge_continuous_time_labels(extended_predictions)

    for label_name, start, end, predict_str in merged_predictions:
        label_name, start, end, predict_str = post_processing(label_name, start, end, predict_str)
        sample_result_str = f"{sample_id}\t{label_name}\t{start}\t{end}\t{predict_str}\n"
        output_string += sample_result_str

    return output_string


In [20]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model = model.to(device)

# Predict Data

In [21]:
def predict_for_entire_dataset(model, tokenizer, val_medical_record_dict, device, max_length=4096, overlap=512):
    """
    Predict labels for an entire dataset of medical record samples.

    Parameters:
    - model (torch.nn.Module): The trained model for making predictions.
    - tokenizer: The tokenizer used for encoding the input text.
    - val_medical_record_dict (dict): Dictionary containing medical record samples with sample_id as keys and text as values.
    - device: Device (e.g., 'cuda' or 'cpu') on which the model should run.
    - max_length (int): Maximum length for each text segment during prediction.
    - overlap (int): Overlap size between consecutive text segments during prediction.

    Returns:
    - str: String containing the predicted labels for the entire dataset in the required format.
    """
    output_string = ""
    for sample_id, sample_text in val_medical_record_dict.items():
        predictions = predict_text_segments(model, tokenizer, sample_text, max_length, overlap, device)
        final_predictions = merge_overlapping_predictions(predictions)

        extended_predictions = [(label_name, start, end, sample_text[start:end]) for label_name, start, end in final_predictions]

        merged_predictions = merge_continuous_time_labels(extended_predictions)

        for label_name, start, end, predict_str in merged_predictions:
            label_name, start, end, predict_str = post_processing(label_name, start, end, predict_str)
            sample_result_str = f"{sample_id}\t{label_name}\t{start}\t{end}\t{predict_str}\n"
            output_string += sample_result_str

    return output_string


# Compare Predicted with Ground Truth

In [22]:
def compare_ner(ground_truth, predictions, category):
    """
    Compare predicted Named Entity Recognition (NER) results with ground truth for a specific category.

    Parameters:
    - ground_truth (dict): Ground truth labeled data with document IDs as keys and associated labels.
    - predictions (dict): Predicted labeled data with document IDs as keys and associated labels.
    - category (str): Specific NER category to evaluate.

    Prints:
    - Outputs differences between ground truth and predictions for the specified category.
    - Calculates and prints Precision, Recall, and F1-Score for the specified category.
    """
    def extract_entities(label_dict, category):
        """
        Extract entities of a specific category from a label dictionary.

        Parameters:
        - label_dict (dict): Dictionary containing labels with document IDs as keys and associated labels.
        - category (str): Specific NER category to extract.

        Returns:
        - dict: Dictionary with entities and their corresponding labels.
        """
        entities = {}
        for doc_id, labels in label_dict.items():
            for label in labels:
                if label[0] == category:
                    entities[(doc_id, tuple(label[1:3]))] = label[3]
        return entities

    gt_entities = extract_entities(ground_truth, category)
    pred_entities = extract_entities(predictions, category)

    # Calculate True Positives (TP), False Positives (FP), and False Negatives (FN)
    TP = len([e for e in pred_entities if e in gt_entities and pred_entities[e] == gt_entities[e]])
    FP = len([e for e in pred_entities if e not in gt_entities or pred_entities[e] != gt_entities[e]])
    FN = len([e for e in gt_entities if e not in pred_entities])

    # Print differences
    print(f"Differences in '{category}':")
    for e in pred_entities:
        if e not in gt_entities or pred_entities[e] != gt_entities[e]:
            print(f"Predicted but incorrect or not in ground truth: {e}, Prediction: '{pred_entities[e]}'")

    for e in gt_entities:
        if e not in pred_entities:
            print(f"Missing in predictions: {e}, Ground Truth: '{gt_entities[e]}'")

    # Calculate Precision, Recall, F1
    precision = TP / (TP + FP) if TP + FP > 0 else 0
    recall = TP / (TP + FN) if TP + FN > 0 else 0
    f1_score = 2 * (precision * recall) / (precision + recall) if precision + recall > 0 else 0

    print(f"\nPrecision: {precision:.4f}, Recall: {recall:.4f}, F1-Score: {f1_score:.4f}")


# Calculate macro scores from predicted data

In [23]:
def calculate_macro_scores(ground_truth, predictions):
    """
    Calculate Macro Precision, Recall, and F1-Score across different categories for Named Entity Recognition (NER).

    Parameters:
    - ground_truth (dict): Ground truth labeled data with document IDs as keys and associated labels.
    - predictions (dict): Predicted labeled data with document IDs as keys and associated labels.

    Prints:
    - Macro Precision, Recall, and F1-Score.

    Note:
    - Assumes the labeled data format with (doc_id, (start, end), label, text) for each entity.

    Example:
    ```python
    calculate_macro_scores(ground_truth, predictions)
    ```

    The function calculates the Macro Precision, Recall, and F1-Score across different categories for NER.

    """
    def extract_entities(label_dict):
        """
        Extract entities from a label dictionary.

        Parameters:
        - label_dict (dict): Dictionary containing labels with document IDs as keys and associated labels.

        Returns:
        - dict: Dictionary with entities and their corresponding labels.
        """
        entities = {}
        for doc_id, labels in label_dict.items():
            for label in labels:
                key = (doc_id, tuple(label[1:3]), label[0])
                entities[key] = label[3]
        return entities

    gt_entities = extract_entities(ground_truth)
    pred_entities = extract_entities(predictions)

    # Organize entities by category
    categories = set([key[2] for key in gt_entities.keys()] + [key[2] for key in pred_entities.keys()])

    total_precision, total_recall, total_f1 = 0, 0, 0
    for category in categories:
        # Filter entities by category
        gt_cat = {k: v for k, v in gt_entities.items() if k[2] == category}
        pred_cat = {k: v for k, v in pred_entities.items() if k[2] == category}

        # Calculate True Positives (TP), False Positives (FP), and False Negatives (FN)
        TP = len([e for e in pred_cat if e in gt_cat and pred_cat[e] == gt_cat[e]])
        FP = len([e for e in pred_cat if e not in gt_cat or pred_cat[e] != gt_cat[e]])
        FN = len([e for e in gt_cat if e not in pred_cat])

        # Calculate Precision, Recall, F1 for this category
        precision = TP / (TP + FP) if TP + FP > 0 else 0
        recall = TP / (TP + FN) if TP + FN > 0 else 0
        f1_score = 2 * (precision * recall) / (precision + recall) if precision + recall > 0 else 0

        total_precision += precision
        total_recall += recall
        total_f1 += f1_score

    # Calculate Macro Precision, Recall, F1
    num_categories = len(categories)
    macro_precision = total_precision / num_categories
    macro_recall = total_recall / num_categories
    macro_f1 = total_f1 / num_categories

    print(f"Macro Precision: {macro_precision:.4f}, Macro Recall: {macro_recall:.4f}, Macro F1-Score: {macro_f1:.4f}")


In [24]:
import os
import re

def get_model_files(model_dir, model_name, seed):
    """
    Retrieve a list of model files from a directory based on the model name and seed.

    Parameters:
    - model_dir (str): Directory path containing the model files.
    - model_name (str): Name of the model to search for in the file names.
    - seed (int): Seed value used during training.

    Returns:
    - list: A sorted list of model files matching the specified criteria.

    Example:
    ```python
    model_dir = 'model/'
    model_name = 'longformer-crf'
    seed = 42
    model_files = get_model_files(model_dir, model_name, seed)
    print(model_files)
    ```

    The function searches for model files in the specified directory based on the provided model name and seed.
    It returns a sorted list of model files, assuming that the epoch information is located in the third part of the file name.

    """
    pattern = re.compile(rf"{model_name}_{seed}_\d+_\d+\.\d+")
    model_files = []

    for file in os.listdir(model_dir):
        if pattern.match(file):
            model_files.append(file)

    model_files.sort(key=lambda x: int(x.split('_')[2]))  # Assuming epoch is always in the third part of the file name
    return model_files

model_dir = 'model/'
model_name = 'longformer-crf'
seed = 42
model_files = get_model_files(model_dir, model_name, seed)
print(model_files)


['longformer-crf_43_11_0.9824877389237668', 'longformer-crf_43_12_0.9778169732490528', 'longformer-crf_43_13_0.9842817215780284', 'longformer-crf_43_14_0.9834499750541934', 'longformer-crf_43_15_0.9790109505011605', 'longformer-crf_43_16_0.9846767433239177', 'longformer-crf_43_17_0.981072594189133', 'longformer-crf_43_18_0.9845563964585489', 'longformer-crf_43_19_0.9840764474056046']


# Predict all data from all models

In [27]:

for model_path in model_files:
#    model_path = './model/longformer-crf_42_13_0.9825048099176974'
    model.load_state_dict(torch.load('./model/' + model_path, map_location=torch.device('cuda')))

    output_string = predict_for_entire_dataset(model, tokenizer, val_medical_record_dict, device)

    file_text = output_string
    predict = {}
    for line in file_text.split("\n"):
        if line.strip():  
            sample = line.split("\t")

            if len(sample) >= 4:
                sample[2], sample[3] = int(sample[2]), int(sample[3])

                if sample[0] not in predict:
                    predict[sample[0]] = [sample[1:]]
                else:
                    predict[sample[0]].append(sample[1:])
    ans = calculate_macro_scores(val_label_dict, predict)
    print(ans, model_path)


  score = torch.where(mask[i].unsqueeze(1), next_score, score)


Macro Precision: 0.9161, Macro Recall: 0.8772, Macro F1-Score: 0.8903
None longformer-crf_43_11_0.9824877389237668


In [36]:
compare_ner(val_label_dict, predict, 'CITY')

Differences in 'CITY':
Predicted but incorrect or not in ground truth: ('1836', (89, 92)), Prediction: 'ACT'
Predicted but incorrect or not in ground truth: ('1394', (101, 109)), Prediction: 'Victoria'
Predicted but incorrect or not in ground truth: ('1489', (88, 90)), Prediction: 'QL'
Predicted but incorrect or not in ground truth: ('1412', (77, 88)), Prediction: 'BONDI NORTH'
Missing in predictions: ('1836', (77, 89)), Ground Truth: 'THE ENTRANCE'
Missing in predictions: ('1394', (92, 101)), Ground Truth: 'SINGLETON'
Missing in predictions: ('1489', (79, 88)), Ground Truth: 'THIRLMERE'

Precision: 0.9840, Recall: 0.9880, F1-Score: 0.9860
