In [1]:
# New notebook file for transformer-based NER

import sys
import os
import pandas as pd
import numpy as np
import torch
from tqdm.auto import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
import random

# Add project root to path (same as your current notebook)
sys.path.insert(0, '..')
from src.experiment_utils.helper_classes import span, repository
from src.d02_corpus_statistics.corpus import Corpus
from definitions import ROOT_DIR

# Transformer-specific imports
from transformers import (
    AutoModelForTokenClassification, 
    AutoTokenizer,
    TrainingArguments, 
    Trainer,
    DataCollatorForTokenClassification
)
from datasets import Dataset
from seqeval.metrics import classification_report, f1_score, precision_score, recall_score

# Set random seeds for reproducibility
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(42)

  from .autonotebook import tqdm as notebook_tqdm


2.1 Data Loading and Preprocessing

In [2]:
# Load the preprocessed dataframe (same as your current notebook)
dataframe_dir = os.path.join(ROOT_DIR, 'data/preprocessed_dataframe.pkl')
stat_df = pd.read_pickle(dataframe_dir)

# Create corpus object
corpus = Corpus(stat_df)

# Create a repository to match all documents
all_docs = repository()

# Extract Actor spans
actor_spans = corpus.get_span_list(
    conditional_rep=all_docs, 
    annotators='Curation', 
    item='feature', 
    value='Actor'
)

# Define the tags we're focusing on (same as your current notebook)
sufficient_tags = [
    'Addressee_default',
    'Addressee_sector',
    'Authority_default',
    'Authority_monitoring',
    'Addressee_monitored',
    'Authority_legislative',
    'Addressee_resource'
]

# Extract spans for each tag type (similar to your current approach)
all_spans = []

for tag in sufficient_tags:
    # Get spans for this specific tag
    tag_spans = corpus.get_span_list(
        conditional_rep=all_docs,
        annotators='Curation',
        item='tag',
        value=tag
    )
    
    # Extract relevant information and add context
    for span in tag_spans:
        # Get the document text
        doc_text = corpus.df.loc[span.rep.index_name, 'Text']
        
        # Calculate context window boundaries - use larger context for transformers
        start_ctx = max(0, span.start - 100)
        end_ctx = min(len(doc_text), span.stop + 100)
        
        # Extract context
        context = doc_text[start_ctx:end_ctx]
        
        all_spans.append({
            'text': span.text,
            'tag': span.tag,
            'document': span.rep.index_name,
            'context': context,
            'start': span.start - start_ctx,  # Adjust start position for the context
            'stop': span.stop - start_ctx     # Adjust stop position for the context
        })

# Convert to DataFrame
spans_df = pd.DataFrame(all_spans)

# Add high-level category (similar to your approach)
spans_df['high_level_category'] = spans_df['tag'].apply(lambda x: x.split('_')[0])

2.2 Preparing Data for Transformer-based NER

In [3]:
# Split into training and test sets with stratification
from sklearn.model_selection import train_test_split

train_df, test_df = train_test_split(
    spans_df, 
    test_size=0.2, 
    random_state=42, 
    stratify=spans_df['tag']
)

print(f"Training set: {len(train_df)} samples")
print(f"Test set: {len(test_df)} samples")

# Define your NER label set
label_list = ["O", "B-AUTHORITY", "I-AUTHORITY", "B-ADDRESSEE", "I-ADDRESSEE", "B-SECTOR", "I-SECTOR"]
id2label = {i: label for i, label in enumerate(label_list)}
label2id = {label: i for i, label in id2label.items()}

def convert_to_transformer_ner_format(df):
    """Convert dataframe to the format expected by HuggingFace transformers for NER"""
    examples = []
    
    for _, row in df.iterrows():
        context = row['context']
        span_text = row['text']
        start_pos = row['start']
        end_pos = row['stop']
        
        # Determine entity label
        if row['high_level_category'] == 'Authority':
            entity_type = "AUTHORITY"
        elif row['tag'] == 'Addressee_sector':
            entity_type = "SECTOR"
        else:
            entity_type = "ADDRESSEE"
            
        # Create IOB tags for each token
        tokens = []
        ner_tags = []
        
        # Tokenize the context
        for i, char in enumerate(context):
            # Very simple character-level tokenization for demonstration
            # In practice, you'd use the model's tokenizer
            tokens.append(char)
            
            # Assign NER tags
            if i == start_pos:
                ner_tags.append(f"B-{entity_type}")
            elif start_pos < i < end_pos:
                ner_tags.append(f"I-{entity_type}")
            else:
                ner_tags.append("O")
        
        examples.append({
            "tokens": tokens, 
            "ner_tags": ner_tags,
            "text": context
        })
    
    return examples

# Convert data for transformers
train_examples = convert_to_transformer_ner_format(train_df)
test_examples = convert_to_transformer_ner_format(test_df)

print(f"Created {len(train_examples)} training examples")
print(f"Created {len(test_examples)} test examples")

# Display a sample example
sample = train_examples[0]
print("\nSample example:")
entity_spans = []
current_entity = None
for i, (token, tag) in enumerate(zip(sample["tokens"][:50], sample["ner_tags"][:50])):
    if tag.startswith("B-"):
        if current_entity:
            entity_spans.append(current_entity)
        current_entity = {"start": i, "text": token, "tag": tag[2:]}
    elif tag.startswith("I-") and current_entity:
        current_entity["text"] += token
    elif current_entity:
        entity_spans.append(current_entity)
        current_entity = None

print(f"Text: '{''.join(sample['tokens'][:50])}...'")
print("Entities:")
for entity in entity_spans:
    print(f"  - '{entity['text']}' ({entity['tag']})")

Training set: 4673 samples
Test set: 1169 samples
Created 4673 training examples
Created 1169 test examples

Sample example:
Text: 'rated undertaking;
(b)
to monitor communications...'
Entities:


2.3 Tokenizer and Dataset Preparation

In [4]:
# Use a pre-trained transformer model
model_checkpoint = "distilbert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

def tokenize_and_align_labels(examples):
    """Tokenize examples and align labels with wordpiece tokens"""
    tokenized_inputs = tokenizer(
        examples["tokens"], 
        is_split_into_words=True,
        padding="max_length",
        truncation=True,
        max_length=512,
        return_tensors="pt"
    )
    
    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            # Special tokens have a word id that is None
            if word_idx is None:
                label_ids.append(-100)
            # For the first token of a word, we keep the label
            elif word_idx != previous_word_idx:
                label_ids.append(label2id[label[word_idx]])
            # For other tokens of the same word, add the same label
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)
    
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

# Convert lists to Dataset objects
train_dataset = Dataset.from_list(train_examples)
test_dataset = Dataset.from_list(test_examples)

# Apply tokenization and label alignment
tokenized_train_dataset = train_dataset.map(tokenize_and_align_labels, batched=True)
tokenized_test_dataset = test_dataset.map(tokenize_and_align_labels, batched=True)

Map: 100%|██████████| 4673/4673 [00:01<00:00, 2503.54 examples/s]
Map: 100%|██████████| 1169/1169 [00:00<00:00, 2537.87 examples/s]


2.4 Model Training

In [5]:
# Fine-tune a pre-trained transformer model
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    id2label=id2label,
    label2id=label2id,
)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./transformer_ner_model",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    push_to_hub=False,
    save_strategy="epoch",
)

# Initialize Trainer
data_collator = DataCollatorForTokenClassification(tokenizer)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_test_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# Train the model
print("Training the transformer model...")
trainer.train()

# Save the model
trainer.save_model("./transformer_ner_model/final")

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


TypeError: __init__() got an unexpected keyword argument 'evaluation_strategy'

2.5 Model Evaluation

In [None]:
# Function to convert model predictions to NER tags
def align_predictions(predictions, label_ids):
    preds = np.argmax(predictions, axis=2)
    batch_size, seq_len = preds.shape
    
    true_labels = [[] for _ in range(batch_size)]
    pred_labels = [[] for _ in range(batch_size)]
    
    for batch_idx in range(batch_size):
        for seq_idx in range(seq_len):
            if label_ids[batch_idx, seq_idx] != -100:
                true_labels[batch_idx].append(id2label[label_ids[batch_idx][seq_idx]])
                pred_labels[batch_idx].append(id2label[preds[batch_idx][seq_idx]])
                
    return pred_labels, true_labels

# Make predictions on test set
predictions, label_ids, _ = trainer.predict(tokenized_test_dataset)
pred_labels, true_labels = align_predictions(predictions, label_ids)

# Calculate metrics
print("Transformer NER Model Evaluation Results:")
print(classification_report(true_labels, pred_labels))

# Calculate metrics by entity type
results_by_entity = {
    'AUTHORITY': {'tp': 0, 'fp': 0, 'fn': 0},
    'ADDRESSEE': {'tp': 0, 'fp': 0, 'fn': 0},
    'SECTOR': {'tp': 0, 'fp': 0, 'fn': 0}
}

# Process predictions to extract entity-level metrics
for true_seq, pred_seq in zip(true_labels, pred_labels):
    # Extract entities from the sequence
    true_entities = []
    pred_entities = []
    
    # Extract true entities
    current_entity = None
    for i, label in enumerate(true_seq):
        if label.startswith("B-"):
            if current_entity:
                true_entities.append(current_entity)
            entity_type = label[2:]  # Remove "B-"
            current_entity = {"start": i, "end": i+1, "type": entity_type}
        elif label.startswith("I-") and current_entity:
            current_entity["end"] = i+1
        elif current_entity:
            true_entities.append(current_entity)
            current_entity = None
    if current_entity:
        true_entities.append(current_entity)
    
    # Extract predicted entities
    current_entity = None
    for i, label in enumerate(pred_seq):
        if label.startswith("B-"):
            if current_entity:
                pred_entities.append(current_entity)
            entity_type = label[2:]  # Remove "B-"
            current_entity = {"start": i, "end": i+1, "type": entity_type}
        elif label.startswith("I-") and current_entity:
            current_entity["end"] = i+1
        elif current_entity:
            pred_entities.append(current_entity)
            current_entity = None
    if current_entity:
        pred_entities.append(current_entity)
    
    # Count TP, FP, FN
    for p_entity in pred_entities:
        matched = False
        for t_entity in true_entities:
            if (p_entity["start"] == t_entity["start"] and 
                p_entity["end"] == t_entity["end"] and 
                p_entity["type"] == t_entity["type"]):
                results_by_entity[p_entity["type"]]['tp'] += 1
                matched = True
                break
        if not matched:
            results_by_entity[p_entity["type"]]['fp'] += 1
    
    for t_entity in true_entities:
        if not any(p_entity["start"] == t_entity["start"] and 
                   p_entity["end"] == t_entity["end"] and 
                   p_entity["type"] == t_entity["type"] for p_entity in pred_entities):
            results_by_entity[t_entity["type"]]['fn'] += 1

# Calculate precision, recall, and F1 score for each entity type
transformer_metrics = {}
for entity_type, counts in results_by_entity.items():
    tp = counts['tp']
    fp = counts['fp']
    fn = counts['fn']
    
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
    
    transformer_metrics[entity_type] = {
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'tp': tp,
        'fp': fp,
        'fn': fn
    }

# Display metrics by entity type
print("\nTransformer NER Model Evaluation Results by Entity Type:")
for entity_type, metrics in transformer_metrics.items():
    print(f"\nEntity Type: {entity_type}")
    print(f"Precision: {metrics['precision']:.4f}")
    print(f"Recall: {metrics['recall']:.4f}")
    print(f"F1 Score: {metrics['f1']:.4f}")
    print(f"True Positives: {metrics['tp']}")
    print(f"False Positives: {metrics['fp']}")
    print(f"False Negatives: {metrics['fn']}")

2.6 Inference Pipeline

In [None]:
def analyze_policy_text_with_transformer(text):
    """Extract entities from text using the transformer-based NER model"""
    # Tokenize the text
    tokens = list(text)  # Simple character tokenization
    inputs = tokenizer(tokens, is_split_into_words=True, return_tensors="pt", padding=True)
    
    # Get predictions
    with torch.no_grad():
        outputs = model(**inputs)
    
    # Process predictions
    predictions = outputs.logits.argmax(dim=2).squeeze().tolist()
    word_ids = inputs.word_ids()
    
    # Convert predictions to entities
    entities = []
    current_entity = None
    
    for idx, pred_id in enumerate(predictions):
        if word_ids[idx] is None:
            continue
            
        token = tokens[word_ids[idx]]
        pred_label = id2label.get(pred_id, "O")
        
        if pred_label.startswith("B-"):
            if current_entity:
                entities.append(current_entity)
            entity_type = pred_label[2:]
            current_entity = {"start": word_ids[idx], "text": token, "type": entity_type}
        elif pred_label.startswith("I-") and current_entity:
            current_entity["text"] += token
        elif current_entity:
            entities.append(current_entity)
            current_entity = None
    
    if current_entity:
        entities.append(current_entity)
        
    return {
        'text': text,
        'entities': [
            {
                'text': entity['text'],
                'label': entity['type'],
                'start': entity['start'],
                'end': entity['start'] + len(entity['text'])
            } for entity in entities
        ]
    }

# Test the inference pipeline on a sample
sample_text = test_df['context'].iloc[0]
results = analyze_policy_text_with_transformer(sample_text)

print("Sample inference results:")
print(f"Text: '{sample_text[:100]}...'")
print("\nEntities found:")
for entity in results['entities']:
    print(f"  - '{entity['text']}' ({entity['label']})")

2.7 Comparison with spaCy NER

In [None]:
# Load your previously trained spaCy NER model
import spacy
try:
    spacy_ner = spacy.load("./ner_data/model/model-best")
    print("Successfully loaded trained spaCy NER model")
except Exception as e:
    print(f"Error loading spaCy model: {e}")
    print("Using base spaCy model as a fallback")
    spacy_ner = spacy.load("en_core_web_sm")
    
    # Add custom entity types
    if 'ner' not in spacy_ner.pipe_names:
        ner = spacy_ner.add_pipe('ner')
    else:
        ner = spacy_ner.get_pipe('ner')
        
    # Add custom entity labels
    for label in ['AUTHORITY', 'ADDRESSEE', 'SECTOR']:
        if label not in ner.labels:
            ner.add_label(label)

# Define a function to run and compare both models on the same text
def compare_ner_models(text):
    """Compare spaCy and transformer NER models on the same text"""
    # Process with spaCy NER
    spacy_doc = spacy_ner(text)
    spacy_entities = [
        {
            'text': ent.text,
            'label': ent.label_,
            'start': ent.start_char,
            'end': ent.end_char
        } for ent in spacy_doc.ents
    ]
    
    # Process with transformer NER
    transformer_results = analyze_policy_text_with_transformer(text)
    transformer_entities = transformer_results['entities']
    
    return {
        'text': text,
        'spacy_entities': spacy_entities,
        'transformer_entities': transformer_entities
    }

# Run comparison on a few test examples
comparison_results = []
for i in range(min(3, len(test_df))):
    text = test_df['context'].iloc[i]
    result = compare_ner_models(text)
    comparison_results.append(result)

# Display comparison results
print("\nModel Comparison Results:")
for i, result in enumerate(comparison_results):
    print(f"\nExample {i+1}:")
    print(f"Text: '{result['text'][:100]}...'")
    
    print("\nspaCy NER entities:")
    for entity in result['spacy_entities']:
        print(f"  - '{entity['text']}' ({entity['label']})")
    
    print("\nTransformer NER entities:")
    for entity in result['transformer_entities']:
        print(f"  - '{entity['text']}' ({entity['label']})")

2.8 Performance Comparison

In [None]:
# Compare the performance metrics of both models
def plot_comparison(spacy_metrics, transformer_metrics):
    """Create comparative visualizations of model performance"""
    # Prepare data for plotting
    entities = list(spacy_metrics.keys())
    metrics = ['precision', 'recall', 'f1']
    
    # Create figure with subplots
    fig, axes = plt.subplots(1, 3, figsize=(18, 6))
    
    for i, metric in enumerate(metrics):
        # Extract metric values for each entity type
        spacy_values = [spacy_metrics[entity][metric] for entity in entities]
        transformer_values = [transformer_metrics[entity][metric] for entity in entities]
        
        # Set up bar positions
        x = np.arange(len(entities))
        width = 0.35
        
        # Create bars
        axes[i].bar(x - width/2, spacy_values, width, label='spaCy NER')
        axes[i].bar(x + width/2, transformer_values, width, label='Transformer NER')
        
        # Add labels and styling
        axes[i].set_xlabel('Entity Type')
        axes[i].set_ylabel(metric.capitalize())
        axes[i].set_title(f'{metric.capitalize()} Comparison')
        axes[i].set_xticks(x)
        axes[i].set_xticklabels(entities)
        axes[i].grid(True, linestyle='--', alpha=0.6)
        
        # Add value labels on bars
        for j, v in enumerate(spacy_values):
            axes[i].text(j - width/2, v + 0.02, f'{v:.2f}', ha='center')
        for j, v in enumerate(transformer_values):
            axes[i].text(j + width/2, v + 0.02, f'{v:.2f}', ha='center')
    
    axes[0].legend(loc='upper right')
    plt.tight_layout()
    plt.show()

# Extract spaCy metrics from your previous execution
try:
    # Load previously calculated spaCy metrics
    # Note: In a real implementation, you would want to reference your actual metrics
    spacy_metrics = {
        'AUTHORITY': {
            'precision': 0.5608,
            'recall': 0.6331,
            'f1': 0.5947,
            'tp': 226,
            'fp': 177,
            'fn': 131
        },
        'ADDRESSEE': {
            'precision': 0.5913,
            'recall': 0.6044,
            'f1': 0.5978,
            'tp': 327,
            'fp': 226,
            'fn': 214
        },
        'SECTOR': {
            'precision': 0.3802,
            'recall': 0.5387,
            'f1': 0.4458,
            'tp': 146,
            'fp': 238,
            'fn': 125
        }
    }
    
    # Compare model performance
    plot_comparison(spacy_metrics, transformer_metrics)
    
    # Print overall comparison summary
    print("\nModel Comparison Summary:")
    print("Average metrics across all entity types:")
    
    spacy_avg = {
        'precision': sum(m['precision'] for m in spacy_metrics.values()) / len(spacy_metrics),
        'recall': sum(m['recall'] for m in spacy_metrics.values()) / len(spacy_metrics),
        'f1': sum(m['f1'] for m in spacy_metrics.values()) / len(spacy_metrics),
    }
    
    transformer_avg = {
        'precision': sum(m['precision'] for m in transformer_metrics.values()) / len(transformer_metrics),
        'recall': sum(m['recall'] for m in transformer_metrics.values()) / len(transformer_metrics),
        'f1': sum(m['f1'] for m in transformer_metrics.values()) / len(transformer_metrics),
    }
    
    print(f"spaCy NER - Avg Precision: {spacy_avg['precision']:.4f}, Avg Recall: {spacy_avg['recall']:.4f}, Avg F1: {spacy_avg['f1']:.4f}")
    print(f"Transformer NER - Avg Precision: {transformer_avg['precision']:.4f}, Avg Recall: {transformer_avg['recall']:.4f}, Avg F1: {transformer_avg['f1']:.4f}")
    
    # Improvement percentage
    f1_improvement = (transformer_avg['f1'] - spacy_avg['f1']) / spacy_avg['f1'] * 100
    print(f"\nF1 Score improvement: {f1_improvement:.1f}%")
    
except Exception as e:
    print(f"Error in performance comparison: {e}")
    print("Please ensure you have both spaCy and transformer metrics available.")