# Named Entity Recognition (NER) Example

This notebook demonstrates how to use the data processing classes and NER functionality implemented in the project.

In [21]:
import nltk
from nltk.corpus import conll2002
import spacy
import matplotlib.pyplot as plt
import string
from nltk.tag import CRFTagger

from Pruebas_Hui import data, test_data

## 1. Load and Prepare Data

First, we'll load data from the conll2002 corpus for Spanish NER.

In [22]:
# Load the Spanish NER data
train = conll2002.iob_sents('esp.train')
test = conll2002.iob_sents('esp.testb')

print(f"Training set: {len(train)} sentences")
print(f"Test set: {len(test)} sentences")

# Show an example sentence
print("\nExample sentence:")
print(train[0])

Training set: 8323 sentences
Test set: 1517 sentences

Example sentence:
[('Melbourne', 'NP', 'B-LOC'), ('(', 'Fpa', 'O'), ('Australia', 'NP', 'B-LOC'), (')', 'Fpt', 'O'), (',', 'Fc', 'O'), ('25', 'Z', 'O'), ('may', 'NC', 'O'), ('(', 'Fpa', 'O'), ('EFE', 'NC', 'B-ORG'), (')', 'Fpt', 'O'), ('.', 'Fp', 'O')]


## 2. Process Data Using Custom Classes

Let's use our custom `data` class to process a sample sentence.

In [23]:
# Process a sample sentence using our data class
sample_sentence = train[:5]
sentence_processor = data(sample_sentence)

# Get lemmas for the sentence
lemmas = sentence_processor(language="es")

# Display the original words, POS tags, and BIO tags
print("Words:", sentence_processor.get_word())
print("POS tags:", sentence_processor.get_pos())
print("BIO tags:", sentence_processor.get_bio())
print("Lemmas:", lemmas)

Words: ['Melbourne', '(', 'Australia', ')', ',', '25', 'may', '(', 'EFE', ')', '.', '-', 'El', 'Abogado', 'General', 'del', 'Estado', ',', 'Daryl', 'Williams', ',', 'subrayó', 'hoy', 'la', 'necesidad', 'de', 'tomar', 'medidas', 'para', 'proteger', 'al', 'sistema', 'judicial', 'australiano', 'frente', 'a', 'una', 'página', 'de', 'internet', 'que', 'imposibilita', 'el', 'cumplimiento', 'de', 'los', 'principios', 'básicos', 'de', 'la', 'Ley', '.', 'La', 'petición', 'del', 'Abogado', 'General', 'tiene', 'lugar', 'después', 'de', 'que', 'un', 'juez', 'del', 'Tribunal', 'Supremo', 'del', 'estado', 'de', 'Victoria', '(', 'Australia', ')', 'se', 'viera', 'forzado', 'a', 'disolver', 'un', 'jurado', 'popular', 'y', 'suspender', 'el', 'proceso', 'ante', 'el', 'argumento', 'de', 'la', 'defensa', 'de', 'que', 'las', 'personas', 'que', 'lo', 'componían', 'podían', 'haber', 'obtenido', 'información', 'sobre', 'el', 'acusado', 'a', 'través', 'de', 'la', 'página', 'CrimeNet', '.', 'Esta', 'página', 'we

## 3. Feature Engineering for NER

Let's implement an optimized feature function class for NER that includes:
- Basic word features
- Contextual features
- POS tag features
- Lemma features

In [24]:
class OptimizedFeatureFunction:
    def __init__(self, use_basic=True, use_context=True, use_pos=True, use_lemmas=False, use_specific=True):
        self.use_basic = use_basic
        self.use_context = use_context
        self.use_pos = use_pos
        self.use_lemmas = False  # Always set to False regardless of the input parameter
        self.use_specific = use_specific
        
        # Location-related suffixes
        self.loc_suffixes = {'ía', 'cia', 'dor', 'dal', 'guay', 'cha', 'nia', 'oz'}
        
        # Common organization precedents
        self.org_precedents = {
            "presidente de la",
            "el presidente del",
            "portavoz del",
            "general de la",
            "director general de",
        }
        
        # Cache for performance
        self.cache = {}
        
    def __call__(self, tokens, idx):
        # Check if tokens contain strings or tuples
        is_string_tokens = False
        if tokens and isinstance(tokens[0], str):
            is_string_tokens = True
        
        # Create a cache key for this token
        if is_string_tokens:
            sentence_key = tuple(tokens)
        else:
            # For tuples, create keys from the first element (word) only
            sentence_key = tuple([(t[0]) for t in tokens])
            
        cache_key = (sentence_key, idx)
        
        # Check if we've already computed features for this token
        if cache_key in self.cache:
            return self.cache[cache_key]
        
        # Initialize feature dictionary
        features = {}
        
        # Check bounds
        if idx >= len(tokens) or idx < 0:
            self.cache[cache_key] = features
            return features
        
        # Handle different token formats (string vs tuple)
        if is_string_tokens:
            word = tokens[idx]
            pos = None  # No POS tag available for string tokens
            lemma = None  # No lemma available for string tokens
        else:
            # Current token info from tuple
            word = tokens[idx][0]
            pos = tokens[idx][1] if len(tokens[idx]) > 1 else None
            lemma = tokens[idx][2] if len(tokens[idx]) > 2 else None
        
        # 1. Basic word features
        if self.use_basic:
            features["word"] = word.lower()
            features["length"] = len(word)
            
            if word and word[0].isupper():
                features["capitalized"] = True
                
            if word.isupper() and len(word) > 1:
                features["all_caps"] = True
                
            if any(c.isdigit() for c in word):
                features["has_digit"] = True
                
            if any(c in string.punctuation for c in word):
                features["has_punct"] = True
                
            if len(word) > 1:
                features["prefix"] = word[:2]
                features["suffix"] = word[-2:]
        
        # 2. Context features - handle both string and tuple cases
        if self.use_context:
            if idx > 0:
                if is_string_tokens:
                    features["prev_word"] = tokens[idx-1].lower()
                else:
                    features["prev_word"] = tokens[idx-1][0].lower()
                
            if idx < len(tokens) - 1:
                if is_string_tokens:
                    features["next_word"] = tokens[idx+1].lower()
                else:
                    features["next_word"] = tokens[idx+1][0].lower()
        
        # 3. POS tag features - only if we have POS info
        if self.use_pos and pos is not None:
            features["pos"] = pos
            
            if idx > 0 and not is_string_tokens:
                features["prev_pos"] = tokens[idx-1][1]
                
            if idx < len(tokens) - 1 and not is_string_tokens:
                features["next_pos"] = tokens[idx+1][1]
        
        # 4. Lemma features - always skipped since we set self.use_lemmas = False
        # The code below will never execute
        if self.use_lemmas and lemma is not None:
            features["lemma"] = lemma
        
        # 5. Specific NER features
        if self.use_specific:
            # Check for location suffixes
            if len(word) > 2:
                for suffix in self.loc_suffixes:
                    if word.lower().endswith(suffix):
                        features["loc_suffix"] = True
                        break
            
            # Check for organization precedents - adapt for both token types
            if idx >= 3:
                if is_string_tokens:
                    word1 = tokens[idx-3]
                    word2 = tokens[idx-2]
                    word3 = tokens[idx-1]
                else:
                    word1 = tokens[idx-3][0]
                    word2 = tokens[idx-2][0]
                    word3 = tokens[idx-1][0]

                trigram = f"{word1} {word2} {word3}".lower()
                if any(precedent in trigram for precedent in self.org_precedents):
                    features["org_precedent"] = True
        
        # Cache and return features
        self.cache[cache_key] = features
        return features

## 4. Prepare Data for CRF Model

Now let's prepare our data for the CRF model, including lemmatization.

In [25]:
# Load SpaCy model for Spanish
nlp = spacy.load("es_core_news_sm")

def prepare_data_for_crf(conll_data, include_lemmas=True):
    """Process conll data into format for CRF tagging with optional lemmatization"""
    processed_data = []
    
    for sentence in conll_data:
        # Process entire sentence for better lemmatization context
        if include_lemmas:
            text = " ".join(word for word, _, _ in sentence)
            doc = nlp(text)
            
            # Create processed sentence
            processed_sentence = []
            for i, (word, pos, tag) in enumerate(sentence):
                if i < len(doc):
                    lemma = doc[i].lemma_
                    processed_sentence.append(((word, pos, lemma), tag))
                else:
                    # Fallback in case of token mismatch
                    processed_sentence.append(((word, pos, word.lower()), tag))
        else:
            processed_sentence = [((word, pos), tag) for word, pos, tag in sentence]
            
        processed_data.append(processed_sentence)
    
    return processed_data

# For demonstration, process just a small sample
sample_data = train
processed_sample = prepare_data_for_crf(sample_data, include_lemmas=True)

# Show the first processed sentence
print("Original:")
print(sample_data[0])
print("\nProcessed (with lemmas):")
print(processed_sample[0])

Original:
[('Melbourne', 'NP', 'B-LOC'), ('(', 'Fpa', 'O'), ('Australia', 'NP', 'B-LOC'), (')', 'Fpt', 'O'), (',', 'Fc', 'O'), ('25', 'Z', 'O'), ('may', 'NC', 'O'), ('(', 'Fpa', 'O'), ('EFE', 'NC', 'B-ORG'), (')', 'Fpt', 'O'), ('.', 'Fp', 'O')]

Processed (with lemmas):
[(('Melbourne', 'NP', 'Melbourne'), 'B-LOC'), (('(', 'Fpa', '('), 'O'), (('Australia', 'NP', 'Australia'), 'B-LOC'), ((')', 'Fpt', ')'), 'O'), ((',', 'Fc', ','), 'O'), (('25', 'Z', '25'), 'O'), (('may', 'NC', 'may'), 'O'), (('(', 'Fpa', '('), 'O'), (('EFE', 'NC', 'EFE'), 'B-ORG'), ((')', 'Fpt', ')'), 'O'), (('.', 'Fp', '.'), 'O')]


## 5. Train a CRF Model for NER

Let's train a small CRF model using our feature function and prepared data.

In [26]:
# Create our feature function
feature_func = OptimizedFeatureFunction(
    use_basic=True,
    use_context=True, 
    use_pos=True,
    use_lemmas=False,  # Set to False explicitly (though the class will ignore this anyway)
    use_specific=True
)

# Initialize CRF tagger with our feature function
crf_tagger = CRFTagger(feature_func=feature_func)

# For demonstration, train on a small subset
small_train = processed_sample
crf_tagger.train(small_train, 'example_model.crf.tagger')

print("CRF model trained!")

CRF model trained!


## 6. Experiment with Different Tag Encoding Schemes

The BIO scheme is the most common, but let's implement functions to convert to other schemes.

In [27]:
def bio_to_io(tagged_sent):
    """Convert BIO tagging to IO tagging"""
    io_sent = []
    for word, pos, tag in tagged_sent:
        if tag == "O":
            io_sent.append((word, pos, tag))
        else:
            # Replace B- with I- for any entity tag
            entity_type = tag[2:]
            io_sent.append((word, pos, f"I-{entity_type}"))
    return io_sent

def bio_to_bioes(sent):
    """Convert BIO tagging to BIOES tagging"""
    new_sent = []
    n = len(sent)
    i = 0
    
    while i < n:
        word, pos, tag = sent[i]
        
        if tag == "O":
            new_sent.append((word, pos, tag))
            i += 1
        elif tag.startswith("B-"):
            entity_type = tag[2:]
            
            # Check if it's a singleton entity (no following I- tags)
            if i + 1 == n or not sent[i+1][2].startswith(f"I-{entity_type}"):
                new_sent.append((word, pos, f"S-{entity_type}"))
                i += 1
            else:
                # It's the beginning of a multi-token entity
                new_sent.append((word, pos, f"B-{entity_type}"))
                i += 1
                
                # Process all the intermediate I- tags
                while i < n and sent[i][2] == f"I-{entity_type}":
                    # Check if this is the last I- tag
                    if i + 1 == n or sent[i+1][2] != f"I-{entity_type}":
                        new_sent.append((sent[i][0], sent[i][1], f"E-{entity_type}"))
                    else:
                        new_sent.append((sent[i][0], sent[i][1], f"I-{entity_type}"))
                    i += 1
        else:
            # Handle unexpected tags (like I- without preceding B-)
            new_sent.append((word, pos, tag))
            i += 1
            
    return new_sent

# Example of converting tags
sample_sent = train[5]  # Get a sentence that hopefully has some entities
print("Original (BIO):")
print(sample_sent)

io_sent = bio_to_io(sample_sent)
print("\nIO Scheme:")
print(io_sent)

bioes_sent = bio_to_bioes(sample_sent)
print("\nBIOES Scheme:")
print(bioes_sent)

Original (BIO):
[('Por', 'SP', 'O'), ('su', 'DP', 'O'), ('parte', 'NC', 'O'), (',', 'Fc', 'O'), ('el', 'DA', 'O'), ('Abogado', 'NC', 'B-PER'), ('General', 'AQ', 'I-PER'), ('de', 'SP', 'O'), ('Victoria', 'NC', 'B-LOC'), (',', 'Fc', 'O'), ('Rob', 'NC', 'B-PER'), ('Hulls', 'AQ', 'I-PER'), (',', 'Fc', 'O'), ('indicó', 'VMI', 'O'), ('que', 'CS', 'O'), ('no', 'RN', 'O'), ('hay', 'VAI', 'O'), ('nadie', 'PI', 'O'), ('que', 'PR', 'O'), ('controle', 'VMS', 'O'), ('que', 'CS', 'O'), ('las', 'DA', 'O'), ('informaciones', 'NC', 'O'), ('contenidas', 'AQ', 'O'), ('en', 'SP', 'O'), ('CrimeNet', 'NC', 'B-MISC'), ('son', 'VSI', 'O'), ('veraces', 'AQ', 'O'), ('.', 'Fp', 'O')]

IO Scheme:
[('Por', 'SP', 'O'), ('su', 'DP', 'O'), ('parte', 'NC', 'O'), (',', 'Fc', 'O'), ('el', 'DA', 'O'), ('Abogado', 'NC', 'I-PER'), ('General', 'AQ', 'I-PER'), ('de', 'SP', 'O'), ('Victoria', 'NC', 'I-LOC'), (',', 'Fc', 'O'), ('Rob', 'NC', 'I-PER'), ('Hulls', 'AQ', 'I-PER'), (',', 'Fc', 'O'), ('indicó', 'VMI', 'O'), ('que', '

## 7. Entity-Level Evaluation

Instead of just token-level accuracy, let's implement entity-level evaluation.

In [28]:
def extract_entities(tags):
    """
    Extract entity spans from a sequence of BIO tags.
    
    Args:
        tags: List of BIO tags (e.g., 'B-PER', 'I-PER', 'O')
        
    Returns:
        List of tuples (entity_type, start_idx, end_idx)
    """
    entities = []
    entity_type = None
    start_idx = None
    
    for i, tag in enumerate(tags):
        # Handle the case where tag might be a tuple
        if isinstance(tag, tuple):
            tag = tag[1]  # Extract the actual tag if it's a tuple (word, tag)
            
        if tag.startswith('B-'):
            # If we were tracking an entity, add it to the list
            if entity_type is not None:
                entities.append((entity_type, start_idx, i - 1))
            # Start a new entity
            entity_type = tag[2:]  # Remove 'B-' prefix
            start_idx = i
        elif tag.startswith('I-'):
            # Continue with the current entity
            curr_type = tag[2:]  # Remove 'I-' prefix
            # This handles inconsistent I- tags that don't match the current entity
            if entity_type is None or curr_type != entity_type:
                # Close any open entity and ignore this tag (it's an error in tagging)
                if entity_type is not None:
                    entities.append((entity_type, start_idx, i - 1))
                entity_type = None
                start_idx = None
        else:  # 'O' tag
            # If we were tracking an entity, add it to the list
            if entity_type is not None:
                entities.append((entity_type, start_idx, i - 1))
                entity_type = None
                start_idx = None
    
    # Don't forget the last entity if the sequence ends with an entity
    if entity_type is not None:
        entities.append((entity_type, start_idx, len(tags) - 1))
    
    return entities


def evaluate_entities(gold_entities, pred_entities):
    """
    Calculate precision, recall, and F1 score for entity recognition.
    
    Args:
        gold_entities: List of gold standard entity tuples (type, start, end)
        pred_entities: List of predicted entity tuples (type, start, end)
        
    Returns:
        Dictionary with precision, recall, and F1 scores
    """
    # Convert to sets for easier comparison
    gold_set = set(gold_entities)
    pred_set = set(pred_entities)
    
    # Calculate correct predictions (intersection)
    correct = len(gold_set.intersection(pred_set))
    
    # Calculate precision, recall, and F1
    precision = correct / len(pred_set) if pred_set else 0.0
    recall = correct / len(gold_set) if gold_set else 1.0  # Perfect recall if no gold entities
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0.0
    
    return {
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'gold_count': len(gold_set),
        'pred_count': len(pred_set),
        'correct': correct
    }

def evaluate_ner_corpus(gold_data, predicted_data):
    """
    Evaluate NER performance at entity level across an entire corpus.
    
    Args:
        gold_data: List of sentences where each sentence is a list of (word, gold_tag) tuples
        predicted_data: List of sentences where each sentence is a list of (word, pred_tag) tuples
        
    Returns:
        Dictionary with overall precision, recall, and F1 scores
    """
    total_correct = 0
    total_gold = 0
    total_pred = 0
    
    for gold_sent, pred_sent in zip(gold_data, predicted_data):
        # Extract just the tags
        gold_tags = [tag for _, tag in gold_sent]
        pred_tags = [tag for _, tag in pred_sent]
        
        # Extract entities
        gold_entities = extract_entities(gold_tags)
        pred_entities = extract_entities(pred_tags)
        
        # Evaluate this sentence
        results = evaluate_entities(gold_entities, pred_entities)
        
        # Accumulate counts
        total_correct += results['correct']
        total_gold += results['gold_count']
        total_pred += results['pred_count']
    
    # Calculate overall metrics
    precision = total_correct / total_pred if total_pred > 0 else 0.0
    recall = total_correct / total_gold if total_gold > 0 else 1.0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0.0
    accuracy = total_correct / total_gold
    
    return {
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'accuracy': accuracy  # Using F1 as the "accuracy" metric for entity-level evaluation
    }


In [29]:

# Extend CRFTagger to support entity-level evaluation
def entity_level_accuracy(tagger, test_data):
    """
    Calculate entity-level evaluation metrics for a CRFTagger.
    
    Args:
        tagger: Trained CRFTagger model
        test_data: List of sentences where each sentence is a list of (word, pos, tag) tuples
        
    Returns:
        Dictionary with precision, recall, F1, and accuracy scores
    """
    # Convert test data to the format expected by the evaluation function
    formatted_test_data = [[(word, label) for (word, pos, label) in sent] for sent in test_data]
    
    # Get predictions
    predicted_data = []
    for sentence in test_data:  # Use original test_data to extract words
        words = [word for word, _, _ in sentence]
        tags = tagger.tag(words)
        predicted_data.append(list(zip(words, tags)))
    
    # Evaluate
    results = evaluate_ner_corpus(formatted_test_data, predicted_data)
    
    return results

# Replace the token-level accuracy evaluation with entity-level evaluation
entity_metrics = entity_level_accuracy(crf_tagger, test)  # Changed from trained_tagger to crf_tagger

print("Entity-Level Evaluation:")
print(f"Precision: {entity_metrics['precision']:.4f}")
print(f"Recall: {entity_metrics['recall']:.4f}")
print(f"F1 Score: {entity_metrics['f1']:.4f}")

# For comparison, also show the token-level accuracy
# First prepare test data in the format expected by the CRF tagger
test_data = [[(word, label) for (word, pos, label) in sent] for sent in test]
token_accuracy = crf_tagger.accuracy(test_data)  # Changed from trained_tagger to crf_tagger
print(f"\nToken-Level Accuracy: {token_accuracy:.4f}")

Entity-Level Evaluation:
Precision: 0.6171
Recall: 0.5450
F1 Score: 0.5788

Token-Level Accuracy: 0.9115

Token-Level Accuracy: 0.9115


## 8. Conclusion

This notebook has demonstrated:

1. How to use the `data` class for processing NER data
2. How to implement and customize a feature function
3. How to prepare data for CRF tagging
4. How to convert between different tagging schemes (BIO, IO, BIOES)
5. How to perform entity-level evaluation

These techniques can be applied to improve NER performance for Spanish and other languages.

## 9. Feature Combination Analysis

Let's evaluate different combinations of features to find the optimal configuration using entity-level evaluation metrics instead of token-level accuracy.

In [30]:
from itertools import combinations
import time
import pandas as pd
import seaborn as sns

def evaluate_feature_combinations():
    # Define feature groups to test
    feature_groups = {
        "Basic": True,        # word, length, etc.
        "Context_Words": True,  # prev_word, next_word
        "Context_POS": True,    # POS tags of surrounding words
        "Specific": True,      # location_suffix, organization_precedent, etc.
        "Lemmas": True,        # Use lemmatization features
    }
    
    # Generate all possible combinations of feature groups
    all_configs = []
    feature_names = list(feature_groups.keys())
    
    for r in range(1, len(feature_names) + 1):
        for combo in combinations(feature_names, r):
            config = {name: (name in combo) for name in feature_names}
            all_configs.append(config)
    
    # Create a dataframe to store results
    results = []
    
    # Evaluate each configuration
    for i, config in enumerate(all_configs):
        print(f"Testing configuration {i+1}/{len(all_configs)}: {config}")
        start_time = time.time()
        
        # Create and train model with this feature configuration
        tagger = train_crf_with_config(train, config)  # Using a subset for demonstration
        
        # Calculate entity-level metrics
        entity_metrics = entity_level_accuracy(tagger, test)
        
        # Calculate elapsed time
        elapsed = time.time() - start_time
        
        # Store results
        results.append({
            'Configuration': str(config),
            'Basic': config['Basic'],
            'Context_Words': config['Context_Words'],
            'Context_POS': config['Context_POS'],
            'Specific': config['Specific'],
            'Lemmas': config['Lemmas'],
            'Precision': entity_metrics['precision'],
            'Recall': entity_metrics['recall'],
            'F1': entity_metrics['f1'],
            'Time_Seconds': elapsed
        })
        
        print(f"  F1 Score: {entity_metrics['f1']:.4f}, Time: {elapsed:.2f} seconds")
    
    # Convert results to DataFrame
    results_df = pd.DataFrame(results)
    
    # Sort by F1 score
    results_df = results_df.sort_values('F1', ascending=False)
    
    # Display best configurations
    print("\n--- Top 5 Feature Configurations ---")
    print(results_df.head(5))
    
    # Create plots to visualize results
    plot_feature_importance(results_df)
    
    return results_df

def train_crf_with_config(training_data, config):
    # Create feature function with the specified configuration
    feat_func = OptimizedFeatureFunction(
        use_basic=config['Basic'], 
        use_context=config['Context_Words'], 
        use_pos=config['Context_POS'], 
        use_specific=config['Specific'],
        use_lemmas=config['Lemmas']
    )
    
    # Process data for CRF training
    processed_data = []
    
    
    # Process with our data class to get lemmas
    sentence_processor = data(training_data)
    words = sentence_processor.get_word()
    pos_tags = sentence_processor.get_pos()
    bio_tags = sentence_processor.get_bio()
    lemmas = sentence_processor(language="es")
    
    # Format the data for CRF training
    processed_sentence = []
    for i, (word, pos, bio) in enumerate(zip(words, pos_tags, bio_tags)):
        if i < len(lemmas):  # Ensure we have a lemma for this word
            processed_sentence.append(((word, pos, lemmas[i]), bio))
        else:
            # Fallback in case of token mismatch
            processed_sentence.append(((word, pos, word.lower()), bio))
            
    processed_data.append(processed_sentence)
    
    # Create and train the CRF tagger
    ct = CRFTagger(feature_func=feat_func)
    ct.train(processed_data, 'temp_model.crf.tagger')  # Temporary file for evaluation
    
    return ct

def plot_feature_importance(results_df):
    # Create figure with multiple subplots
    plt.figure(figsize=(12, 10))
    
    # Plot 1: F1 score distribution
    plt.subplot(2, 2, 1)
    plt.hist(results_df['F1'], bins=15, color='skyblue', edgecolor='black')
    plt.xlabel('F1 Score')
    plt.ylabel('Frequency')
    plt.title('Distribution of F1 Scores')
    
    # Plot 2: Effect of each feature on F1 score
    plt.subplot(2, 2, 2)
    feature_cols = ['Basic', 'Context_Words', 'Context_POS', 'Specific', 'Lemmas']
    
    # Calculate mean F1 for each feature when it's present vs absent
    feature_effects = {}
    for feat in feature_cols:
        present_mean = results_df[results_df[feat] == True]['F1'].mean()
        absent_mean = results_df[results_df[feat] == False]['F1'].mean()
        feature_effects[feat] = present_mean - absent_mean
    
    # Plot the effects
    plt.bar(feature_effects.keys(), feature_effects.values(), color='lightgreen')
    plt.axhline(y=0, color='r', linestyle='-', alpha=0.3)
    plt.ylabel('Mean F1 Score Difference')
    plt.title('Effect of Each Feature on F1 Score')
    plt.xticks(rotation=45)
    
    # Plot 3: F1 vs number of features enabled
    plt.subplot(2, 2, 3)
    results_df['Features_Enabled'] = results_df[feature_cols].sum(axis=1)
    
    # Group by number of features and calculate mean F1
    grouped = results_df.groupby('Features_Enabled')['F1'].mean().reset_index()
    
    plt.plot(grouped['Features_Enabled'], grouped['F1'], marker='o', linestyle='-')
    plt.xlabel('Number of Features Enabled')
    plt.ylabel('Mean F1 Score')
    plt.title('F1 Score vs Feature Count')
    
    # Plot 4: Training time comparison
    plt.subplot(2, 2, 4)
    plt.scatter(results_df['Time_Seconds'], results_df['F1'], alpha=0.6)
    plt.xlabel('Time (seconds)')
    plt.ylabel('F1 Score')
    plt.title('F1 Score vs Training Time')
    
    plt.tight_layout()
    plt.show()
    
    # Additional plot: feature combinations heatmap
    plt.figure(figsize=(10, 8))
    
    # Create dummy binary columns for each feature combination
    for i, feat1 in enumerate(feature_cols):
        for j, feat2 in enumerate(feature_cols[i+1:], i+1):
            results_df[f"{feat1}_{feat2}"] = results_df[feat1] & results_df[feat2]
    
    # Calculate mean F1 for each feature combination
    combo_effects = {}
    for combo in [f"{feat1}_{feat2}" for i, feat1 in enumerate(feature_cols) 
                  for j, feat2 in enumerate(feature_cols[i+1:], i+1)]:
        if combo in results_df.columns:
            combo_effects[combo] = results_df[results_df[combo]]['F1'].mean()
    
    # Create heatmap data
    heatmap_data = pd.DataFrame(index=feature_cols, columns=feature_cols, data=0.0)
    for i, feat1 in enumerate(feature_cols):
        heatmap_data.loc[feat1, feat1] = results_df[results_df[feat1]]['F1'].mean()
        for feat2 in feature_cols[i+1:]:
            combo = f"{feat1}_{feat2}"
            if combo in combo_effects:
                heatmap_data.loc[feat1, feat2] = combo_effects[combo]
                heatmap_data.loc[feat2, feat1] = combo_effects[combo]
    
    # Plot heatmap
    sns.heatmap(heatmap_data, annot=True, cmap="YlGnBu", fmt=".4f")
    plt.title('Feature Combination Effectiveness (Mean F1 Score)')
    plt.tight_layout()
    plt.show()

# Uncomment to run the evaluation (it may take a while)
results_df = evaluate_feature_combinations()

Testing configuration 1/31: {'Basic': True, 'Context_Words': False, 'Context_POS': False, 'Specific': False, 'Lemmas': False}


KeyboardInterrupt: 

## 10. Full Analysis with Optimized Features

Now that we've identified the best feature combinations, we can train a model with the optimal configuration and evaluate it on the full test set.

In [None]:
# Example of running a complete analysis with the optimal feature configuration
def run_optimal_configuration():
    # Create feature function with optimal settings (example - replace with your findings)
    optimal_feat_func = OptimizedFeatureFunction(
        use_basic=True,
        use_context=True, 
        use_pos=True,
        use_lemmas=True,  # Including lemmatization
        use_specific=True
    )
    
    print("Processing training data...")
    # Process the training data
    processed_train = []
    for i, sentence in enumerate(train):
        if i % 500 == 0:  # Progress indicator
            print(f"Processing sentence {i}/{len(train)}...")
        
        # Use our data class to process the sentence
        sentence_processor = data(sentence)
        words = sentence_processor.get_word()
        pos_tags = sentence_processor.get_pos()
        bio_tags = sentence_processor.get_bio()
        lemmas = sentence_processor(language="es")  # Get lemmas
        
        # Format for CRF training
        processed_sentence = []
        for i, (word, pos, bio) in enumerate(zip(words, pos_tags, bio_tags)):
            if i < len(lemmas):
                processed_sentence.append(((word, pos, lemmas[i]), bio))
            else:
                processed_sentence.append(((word, pos, word.lower()), bio))
                
        processed_train.append(processed_sentence)
    
    print("Training CRF model...")
    # Train the model
    optimal_tagger = CRFTagger(feature_func=optimal_feat_func)
    optimal_tagger.train(processed_train, 'optimal_model.crf.tagger')
    
    print("Processing test data...")
    # Process the test data
    processed_test = []
    for sentence in test:
        # Use our data class to process the sentence
        sentence_processor = data(sentence)
        words = sentence_processor.get_word()
        pos_tags = sentence_processor.get_pos()
        bio_tags = sentence_processor.get_bio()
        lemmas = sentence_processor(language="es")  # Get lemmas
        
        # Format for evaluation
        processed_sentence = []
        for i, (word, pos, bio) in enumerate(zip(words, pos_tags, bio_tags)):
            if i < len(lemmas):
                processed_sentence.append(((word, pos, lemmas[i]), bio))
            else:
                processed_sentence.append(((word, pos, word.lower()), bio))
                
        processed_test.append(processed_sentence)
    
    # Evaluate using entity-level metrics
    entity_results = entity_level_accuracy(optimal_tagger, test)
    
    print("\n=== Entity-Level Evaluation Results ===")
    print(f"Precision: {entity_results['precision']:.4f}")
    print(f"Recall: {entity_results['recall']:.4f}")
    print(f"F1 Score: {entity_results['f1']:.4f}")
    
    return optimal_tagger, entity_results

# Uncomment to run the complete analysis (it will take a significant amount of time)
# optimal_tagger, results = run_optimal_configuration()

## Conclusion

In this notebook, we've demonstrated:

1. How to use the custom `data` class to process NER data and extract lemmas
2. Implementing a feature function class with various customizable feature sets
3. Training and evaluating CRF models for NER with different feature combinations
4. Converting between different tagging schemes (BIO, IO, BIOES)
5. Entity-level evaluation for more accurate performance measurement
6. Systematic analysis of feature importance for NER performance

The entity-level evaluation provides a more meaningful assessment of NER performance than token-level accuracy, as it ensures that complete entities are correctly identified rather than just individual tokens.