# A Jupyter Notebook with Type Annotations for Named Entity Recognition (NER)

This notebook demonstrates how to implement type annotations for all code related to Named Entity Recognition (NER). Type annotations improve code readability, maintainability, and debugging by explicitly specifying the expected types of variables, function parameters, and return values.

## 1. Import Required Libraries with Type Annotations

Import the libraries used throughout the notebook, including the `typing` names (`List`, `Tuple`, `Dict`, `Any`, `Optional`) that the annotations in later cells rely on.

In [None]:
# Import Required Libraries with Type Annotations
from typing import List, Tuple, Dict, Any, Optional
import nltk
from nltk.corpus import conll2002
import spacy
import matplotlib.pyplot as plt
import string
from nltk.tag import CRFTagger
from itertools import combinations
import time
import pandas as pd
import seaborn as sns

## 2. Load and Prepare Data with Type Annotations

Include type annotations for variables and functions used to load and prepare the conll2002 dataset.

In [None]:
# Load and Prepare Data with Type Annotations
def load_data() -> Tuple[List[List[Tuple[str, str, str]]], List[List[Tuple[str, str, str]]]]:
    """
    Read the Spanish NER sentences from the conll2002 corpus.

    Returns:
        A ``(train, test)`` pair: the result of ``conll2002.iob_sents`` for
        the ``esp.train`` file followed by the result for ``esp.testb``.
    """
    train_file, test_file = 'esp.train', 'esp.testb'
    return conll2002.iob_sents(train_file), conll2002.iob_sents(test_file)

# Load both corpus splits once; train_data and test_data are reused by the
# cells further down.
train_data, test_data = load_data()

# Report the number of sentences in each split.
print(f"Training set: {len(train_data)} sentences")
print(f"Test set: {len(test_data)} sentences")

## 3. Process Data Using Custom Classes with Type Annotations

Add type annotations to the custom data processing class and its methods.

In [None]:
# Process Data Using Custom Classes with Type Annotations
class DataProcessor:
    """Column-wise view over one sentence of (word, POS tag, BIO tag) triples."""

    def __init__(self, sentence: List[Tuple[str, str, str]]) -> None:
        # The sentence is stored by reference; no copy is made.
        self.sentence = sentence

    def get_words(self) -> List[str]:
        """Return the first element of every triple, in sentence order."""
        words: List[str] = []
        for token, _pos, _bio in self.sentence:
            words.append(token)
        return words

    def get_pos_tags(self) -> List[str]:
        """Return the second element of every triple, in sentence order."""
        pos_tags: List[str] = []
        for _token, pos_tag, _bio in self.sentence:
            pos_tags.append(pos_tag)
        return pos_tags

    def get_bio_tags(self) -> List[str]:
        """Return the third element of every triple, in sentence order."""
        bio_tags: List[str] = []
        for _token, _pos, bio_tag in self.sentence:
            bio_tags.append(bio_tag)
        return bio_tags

# Example usage: wrap the first training sentence and print each of its three
# columns (words, POS tags, BIO tags) separately.
processor = DataProcessor(train_data[0])
print("Words:", processor.get_words())
print("POS Tags:", processor.get_pos_tags())
print("BIO Tags:", processor.get_bio_tags())

## 4. Feature Engineering for NER with Type Annotations

Implement type annotations for the `OptimizedFeatFunc` class and its methods.

In [None]:
# Feature Engineering for NER with Type Annotations
class OptimizedFeatFunc:
    """
    Configurable per-token feature extractor for ``(word, pos, lemma)`` tokens.

    An instance is called as ``feat_func(tokens, idx)`` and returns a feature
    dictionary for the token at position ``idx``. Each constructor flag turns
    one feature group on or off:

    * ``use_basic``: ``word`` and ``length``.
    * ``use_context_words``: ``prev_word`` / ``next_word`` (each only when
      that neighbour exists).
    * ``use_context_pos_tags``: ``pos`` of the current token plus
      ``prev_pos`` / ``next_pos`` (each only when that neighbour exists).
    * ``use_specific_characteristics``: surface-shape flags ``is_title``,
      ``is_upper``, ``is_digit``, ``has_digit`` and ``is_punct``.
    * ``use_lemmas``: ``lemma``.
    """

    def __init__(
        self,
        use_basic: bool = True,
        use_context_words: bool = True,
        use_context_pos_tags: bool = True,
        use_specific_characteristics: bool = True,
        use_lemmas: bool = True
    ) -> None:
        self.use_basic = use_basic
        self.use_context_words = use_context_words
        self.use_context_pos_tags = use_context_pos_tags
        self.use_specific_characteristics = use_specific_characteristics
        self.use_lemmas = use_lemmas

    def __call__(self, tokens: List[Tuple[str, str, str]], idx: int) -> Dict[str, Any]:
        """
        Build the feature dictionary for ``tokens[idx]``.

        Args:
            tokens: The sentence as a sequence of ``(word, pos, lemma)`` triples.
            idx: Position of the token to describe.

        Returns:
            A dictionary of the enabled features. It is empty when ``idx`` is
            outside ``0 <= idx < len(tokens)`` or when every flag is off.
        """
        features: Dict[str, Any] = {}
        # Negative indices are rejected rather than wrapped around.
        if idx < 0 or idx >= len(tokens):
            return features

        word, pos, lemma = tokens[idx]
        has_prev = idx > 0
        has_next = idx + 1 < len(tokens)

        if self.use_basic:
            features["word"] = word
            features["length"] = len(word)
        if self.use_context_words:
            if has_prev:
                features["prev_word"] = tokens[idx - 1][0]
            if has_next:
                features["next_word"] = tokens[idx + 1][0]
        if self.use_context_pos_tags:
            features["pos"] = pos
            if has_prev:
                features["prev_pos"] = tokens[idx - 1][1]
            if has_next:
                features["next_pos"] = tokens[idx + 1][1]
        if self.use_specific_characteristics:
            features["is_title"] = word.istitle()
            features["is_upper"] = word.isupper()
            features["is_digit"] = word.isdigit()
            features["has_digit"] = any(ch.isdigit() for ch in word)
            # Non-empty token with no letter or digit, e.g. "." or "¿".
            features["is_punct"] = bool(word) and not any(ch.isalnum() for ch in word)
        if self.use_lemmas:
            features["lemma"] = lemma
        return features

## 5. Prepare Data for CRF Model with Type Annotations

Add type annotations to the `prepare_data_for_crf` function and its parameters.

In [None]:
# Prepare Data for CRF Model with Type Annotations
def prepare_data_for_crf(
    conll_data: List[List[Tuple[str, str, str]]],
    include_lemmas: bool = True
) -> List[List[Tuple[Tuple[str, str, str], str]]]:
    """
    Convert conll sentences into ``((word, pos, lemma), tag)`` pairs for CRF tagging.

    No lemmatizer is run here: the lemma slot holds the lower-cased word.

    Args:
        conll_data: Sentences as lists of ``(word, pos, tag)`` triples.
        include_lemmas: When True (default), fill the lemma slot with
            ``word.lower()``. When False, fill it with an empty string so the
            token keeps its three-element shape.

    Returns:
        One list per input sentence, each holding ``((word, pos, lemma), tag)``
        pairs in the original token order.
    """
    return [
        [
            ((word, pos, word.lower() if include_lemmas else ""), tag)
            for word, pos, tag in sentence
        ]
        for sentence in conll_data
    ]

# Convert the training split into ((word, pos, lemma), tag) pairs; the result
# is reused by the training and analysis cells below.
processed_train_data = prepare_data_for_crf(train_data)
# Show the first converted sentence as a quick sanity check.
print("Processed training data example:", processed_train_data[0])

## 6. Train a CRF Model for NER with Type Annotations

Include type annotations for the CRF model training process and related functions.

In [None]:
# Train a CRF Model for NER with Type Annotations
def train_crf_model(
    train_data: List[List[Tuple[Tuple[str, str, str], str]]],
    model_path: str
) -> CRFTagger:
    """
    Train a CRF model for NER and save it to ``model_path``.

    Args:
        train_data: Sentences as lists of ``((word, pos, lemma), tag)`` pairs.
        model_path: The path to save the trained model.

    Returns:
        The trained CRFTagger instance.
    """
    # NLTK's built-in feature function treats every token as a plain string,
    # so it cannot process (word, pos, lemma) triples. Supply the extractor
    # that unpacks them instead.
    crf_tagger = CRFTagger(feature_func=OptimizedFeatFunc())
    crf_tagger.train(train_data, model_path)
    return crf_tagger

# Train the model on the processed training split and keep the tagger for
# later use.
# NOTE(review): processed_train_data tokens are (word, pos, lemma) tuples —
# confirm the tagger's feature function accepts that token shape.
model_path = "ner_model.crf.tagger"
trained_crf_model = train_crf_model(processed_train_data, model_path)
print("CRF model trained and saved to:", model_path)

## 7. Experiment with Different Tag Encoding Schemes with Type Annotations

Add type annotations to functions for converting between tagging schemes (BIO, IO, BIOES). Only the BIO → IO conversion is implemented in this cell.

In [None]:
# Experiment with Different Tag Encoding Schemes with Type Annotations
def bio_to_io(tagged_sentence: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
    """
    Rewrite a BIO-tagged sentence in the IO scheme.

    Every tag beginning with ``B-`` has that prefix replaced by ``I-``; all
    other tags are passed through untouched.

    Args:
        tagged_sentence: ``(word, tag)`` pairs using BIO tags.

    Returns:
        A new list of ``(word, tag)`` pairs using IO tags.
    """
    return [
        (token, ("I-" + label[2:]) if label.startswith("B-") else label)
        for token, label in tagged_sentence
    ]

## 8. Entity-Level Evaluation with Type Annotations

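Implement type annotations for entity extraction, evaluation, and corpus-level evaluation functions. This cell contains only the entity-extraction helper, which turns a tag sequence into entity spans.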

In [None]:
# Entity-Level Evaluation with Type Annotations
def extract_entities(tags: List[str]) -> List[Tuple[str, int, int]]:
    """
    Extract entity spans from a sequence of BIO tags.

    A span opens on every ``B-X`` tag. It also opens on an ``I-X`` tag whose
    type differs from the currently open entity, or that appears while no
    entity is open, so malformed sequences such as ``B-PER I-LOC`` or
    ``O I-ORG`` still produce correctly typed spans. A span closes on ``O``,
    when another span opens, or at the end of the sequence. Tags that are
    neither ``O`` nor prefixed with ``B-`` / ``I-`` leave the current state
    unchanged.

    Args:
        tags: List of BIO tags.

    Returns:
        List of tuples (entity_type, start_idx, end_idx), with inclusive
        indices, in order of appearance.
    """
    entities: List[Tuple[str, int, int]] = []
    entity_type: Optional[str] = None
    start_idx: int = 0  # only meaningful while entity_type is not None

    for i, tag in enumerate(tags):
        if tag.startswith("B-"):
            opens_entity = True
        elif tag.startswith("I-"):
            # A type change (or an I- tag with nothing open) starts a new span.
            opens_entity = tag[2:] != entity_type
        else:
            opens_entity = False

        if opens_entity or tag == "O":
            if entity_type is not None:
                entities.append((entity_type, start_idx, i - 1))
            entity_type = tag[2:] if opens_entity else None
            start_idx = i

    if entity_type is not None:
        entities.append((entity_type, start_idx, len(tags) - 1))
    return entities

## 9. Feature Combination Analysis with Type Annotations

Add type annotations to functions for evaluating feature combinations and selecting the best configuration.

In [None]:
# Feature Combination Analysis with Type Annotations
def evaluate_feature_combination(
    config: Dict[str, bool],
    train_data: List[List[Tuple[Tuple[str, str, str], str]]],
    test_data: List[List[Tuple[Tuple[str, str, str], str]]]
) -> Dict[str, float]:
    """
    Train a CRF with the given feature groups and score it on ``test_data``.

    Scoring is entity-level: a predicted entity counts as correct only when
    its type, start index and end index all match a gold entity in the same
    sentence.

    Args:
        config: Feature switches under the keys ``"Basic"``,
            ``"Context_Words"``, ``"Context_POS"``, ``"Specific"`` and
            ``"Lemmas"``.
        train_data: The training data formatted for CRF tagging.
        test_data: The test data formatted for CRF tagging.

    Returns:
        A dictionary with ``"precision"``, ``"recall"`` and ``"f1"``. A metric
        is 0.0 when its denominator is zero.

    Raises:
        KeyError: If ``config`` lacks one of the expected keys.
    """
    feat_func = OptimizedFeatFunc(
        use_basic=config["Basic"],
        use_context_words=config["Context_Words"],
        use_context_pos_tags=config["Context_POS"],
        use_specific_characteristics=config["Specific"],
        use_lemmas=config["Lemmas"]
    )
    crf_tagger = CRFTagger(feature_func=feat_func)
    # The model file is written to the working directory and overwritten on
    # every call.
    crf_tagger.train(train_data, "temp_model.crf.tagger")

    # Strip the gold tags so the tagger only sees the tokens.
    token_sequences = [[token for token, _ in sentence] for sentence in test_data]
    predicted_sentences = crf_tagger.tag_sents(token_sequences)

    correct = 0
    predicted_total = 0
    gold_total = 0
    for gold_sentence, predicted_sentence in zip(test_data, predicted_sentences):
        gold_entities = set(extract_entities([tag for _, tag in gold_sentence]))
        predicted_entities = set(
            extract_entities([tag for _, tag in predicted_sentence])
        )
        correct += len(gold_entities & predicted_entities)
        predicted_total += len(predicted_entities)
        gold_total += len(gold_entities)

    precision = correct / predicted_total if predicted_total else 0.0
    recall = correct / gold_total if gold_total else 0.0
    if precision + recall:
        f1 = 2 * precision * recall / (precision + recall)
    else:
        f1 = 0.0
    return {"precision": precision, "recall": recall, "f1": f1}

## 10. Full Analysis with Optimized Features with Type Annotations

Include type annotations for the complete analysis process using the optimal feature configuration.

In [None]:
# Full Analysis with Optimized Features with Type Annotations
def run_full_analysis(
    train_data: List[List[Tuple[Tuple[str, str, str], str]]],
    test_data: List[List[Tuple[Tuple[str, str, str], str]]]
) -> None:
    """
    Evaluate the configuration with every feature group enabled and print the result.

    Args:
        train_data: The training data formatted for CRF tagging.
        test_data: The test data formatted for CRF tagging.
    """
    feature_groups = ("Basic", "Context_Words", "Context_POS", "Specific", "Lemmas")
    # Every feature group is switched on in the configuration used here.
    optimal_config: Dict[str, bool] = dict.fromkeys(feature_groups, True)
    print(
        "Optimal Configuration Metrics:",
        evaluate_feature_combination(optimal_config, train_data, test_data),
    )

# Run the full analysis on the already-processed training split; the test
# split is converted to the same format inline.
run_full_analysis(processed_train_data, prepare_data_for_crf(test_data))