1. Load the untagged data
2. Tag it with our LSTM
3. Compare the predicted tags with the original tags and measure accuracy

In [11]:
test_sentences = [
    "I visited New York last summer.",
    "The Eiffel Tower is located in Paris.",
    "Mount Everest is the world's tallest mountain.",
]

# Manually annotated labels for the test data in IOB2 format.
# There is exactly one label per whitespace-separated token (punctuation stays
# attached to its word), because the sentences are tokenized with `str.split()`.
test_labels = [
    # I  visited  New  York  last  summer.
    ["O", "O", "B-LOC", "I-LOC", "O", "O"],
    # The  Eiffel  Tower  is  located  in  Paris.
    ["O", "B-LOC", "I-LOC", "O", "O", "O", "B-LOC"],
    # Mount  Everest  is  the  world's  tallest  mountain.
    ["B-LOC", "I-LOC", "O", "O", "O", "O", "O"],
]

# Guard against sentences and labels drifting out of sync when either is edited.
assert len(test_sentences) == len(test_labels), "one label list per sentence required"
assert all(
    len(sentence.split()) == len(labels)
    for sentence, labels in zip(test_sentences, test_labels)
), "each sentence needs exactly one label per whitespace token"


In [8]:
def preprocess_data(sentences, vocab, max_len):
    """Convert raw sentences into fixed-length lists of vocabulary indices.

    Each sentence is split on whitespace, every token is lowercased and looked
    up in ``vocab`` (falling back to ``vocab['<UNK>']`` when absent), and the
    result is right-padded with ``vocab['<PAD>']``. Sentences with more than
    ``max_len`` tokens are truncated, so every returned list has exactly
    ``max_len`` entries.

    Args:
        sentences: Iterable of sentence strings.
        vocab: Mapping from lowercased token to index; must provide the
            ``'<UNK>'`` and ``'<PAD>'`` entries.
        max_len: Non-negative target length of every output sequence.

    Returns:
        A list with one list of ``max_len`` indices per input sentence.

    Raises:
        ValueError: If ``max_len`` is negative.
        KeyError: If ``vocab`` is a dict lacking a special entry that is needed.
    """
    if max_len < 0:
        raise ValueError(f"max_len must be non-negative, got {max_len}")

    padded_sentences = []
    for sentence in sentences:
        # Basic whitespace tokenization; cut over-long sentences so the output
        # is never ragged (a negative pad count would silently add nothing).
        tokens = sentence.split()[:max_len]
        token_ids = [vocab.get(token.lower(), vocab['<UNK>']) for token in tokens]
        token_ids += [vocab['<PAD>']] * (max_len - len(token_ids))
        padded_sentences.append(token_ids)
    return padded_sentences

# Assumes `vocab` is a dictionary mapping tokens to indices that also contains
# the special entries `<UNK>` (unknown token) and `<PAD>` (padding).
# Also assumes `max_len` is the maximum sentence length in your dataset.


In [3]:
def labels_to_indices(labels, label_vocab):
    """Map every label string to its index in ``label_vocab``.

    Args:
        labels: Iterable of per-sentence label sequences.
        label_vocab: Mapping from label string to integer index.

    Returns:
        A list of lists mirroring ``labels``; any label that is missing from
        ``label_vocab`` is mapped to index 0.
    """
    indexed_labels = []
    for sentence_labels in labels:
        # Unknown labels fall back to index 0.
        row = [label_vocab.get(tag, 0) for tag in sentence_labels]
        indexed_labels.append(row)
    return indexed_labels

# Assuming `label_vocab` is a dictionary mapping NER labels to indices


In [4]:
def predict(model, test_data):
    """Run ``model`` over each sentence and return the highest-scoring tag indices.

    Args:
        model: Callable model exposing ``eval()``. Its output is reduced with
            ``argmax(dim=2)`` -- presumably scores shaped
            (batch, seq_len, num_tags); confirm against the model definition.
        test_data: Iterable of sentences, each a sequence of token indices.

    Returns:
        A list with one tensor per sentence. Each tensor keeps the leading
        batch dimension of size 1 added before the forward pass.
    """
    model.eval()  # Switch to evaluation mode before inference
    with torch.no_grad():  # Inference only, so gradient tracking is disabled
        return [
            # Add a batch dimension, score the sentence, keep the best tag per position.
            model(torch.tensor(token_ids).unsqueeze(0)).argmax(dim=2)
            for token_ids in test_data
        ]



In [5]:
def _flatten_tags(values):
    """Flatten a tensor/array or nested sequence of tag indices into a flat list."""
    if hasattr(values, "tolist"):
        # Tensors and arrays expose tolist(); it yields nested lists or a scalar.
        values = values.tolist()
    if isinstance(values, (list, tuple)):
        flat = []
        for value in values:
            flat.extend(_flatten_tags(value))
        return flat
    return [values]


def evaluate(predictions, true_labels):
    """Compute token-level accuracy of predicted tags against reference tags.

    Predictions are matched to labels sentence by sentence. Each sentence's
    predictions may be a tensor of any shape (for example with a leading batch
    dimension), a list of scalar tensors, or a list of plain ints. Only the
    first ``len(labels)`` predictions of a sentence are scored, so predictions
    for trailing padding positions are ignored.

    Args:
        predictions: Iterable with one entry of predicted tag indices per sentence.
        true_labels: List of per-sentence lists of reference tag indices.

    Returns:
        The fraction of reference labels that were predicted correctly, or
        ``0.0`` when there are no reference labels at all. A label with no
        corresponding prediction counts as incorrect.
    """
    total = sum(len(sentence_labels) for sentence_labels in true_labels)
    if total == 0:
        # Nothing to score; avoid dividing by zero.
        return 0.0

    correct = 0
    for sentence_preds, sentence_labels in zip(predictions, true_labels):
        # zip() stops at the shorter sequence, which drops padded positions.
        for pred, true in zip(_flatten_tags(sentence_preds), sentence_labels):
            if pred == true:
                correct += 1
    return correct / total

# Assuming `predictions` is the output from `predict` function
# `true_labels` is the list of label indices corresponding to the test data


In [10]:
# Load the vocab and label vocab here.
# NOTE: `{...}` is only a placeholder. As written it is a Python *set* holding
# the single `Ellipsis` object, not a dict, so the `vocab.get(...)` and
# `vocab['<UNK>']` lookups in `preprocess_data` cannot work until the real
# token -> index and label -> index dictionaries are assigned here.
vocab = {...}
label_vocab = {...}

# Preprocess the test data.
# `test_sentences` and `test_labels` must already hold real data when this cell
# runs; a placeholder `...` entry in `test_sentences` fails on `.split()`.
max_len = 50  # or whatever max length you have set for your training data
preprocessed_test_data = preprocess_data(test_sentences, vocab, max_len)
test_label_indices = labels_to_indices(test_labels, label_vocab)


# Placeholder: `...` is the Ellipsis object, not a model. Assign the trained
# LSTM model here before running prediction.
model = ...  # trained LSTM model

#


AttributeError: 'ellipsis' object has no attribute 'split'