In [None]:
# Install dependencies (run this in a notebook cell)
!pip install transformers datasets seqeval torch
!pip install datasets
!pip install seqeval --upgrade

In [None]:
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForTokenClassification, TokenClassificationPipeline
from seqeval.metrics import f1_score, precision_score, recall_score
import matplotlib.pyplot as plt
import random
import os

# Choose the inference device: CUDA device 0 when a GPU is visible, otherwise -1 (CPU).
has_cuda = torch.cuda.is_available()
device = 0 if has_cuda else -1
print(f"Using {'GPU' if device == 0 else 'CPU'}")

# Fetch the newswire corpus and keep only its "train" split.
dataset = load_dataset("dell-research-harvard/newswire")["train"]

# Display name -> Hugging Face model id for each tagger being compared.
models_info = {
    "Custom NER": "dell-research-harvard/historical_newspaper_ner",
    "RoBERTa-Large": "Jean-Baptiste/roberta-large-ner-english",
}

# Build one token-classification pipeline per model up front so the
# evaluation loop can reuse them instead of reloading weights.
torch_device = "cuda" if device == 0 else "cpu"
pipelines = {}
for name, model_name in models_info.items():
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForTokenClassification.from_pretrained(model_name)
    model.to(torch_device)
    pipelines[name] = TokenClassificationPipeline(
        model=model,
        tokenizer=tokenizer,
        device=device,
        aggregation_strategy="simple",
    )

# Function to get predictions
def get_predictions(pipeline, test_data):
    """Run an NER pipeline over rows and project its entities onto the words.

    For every row the words are joined with single spaces into one sentence,
    the sentence is passed to ``pipeline``, and each returned entity is mapped
    back onto the word list as BIO tags: ``B-<tag>`` on the first word the
    entity covers and ``I-<tag>`` on every following covered word.

    Alignment uses the entity's ``start``/``end`` character offsets when the
    pipeline supplies them, so multi-word entities are tagged on all of their
    words. Entities without offsets fall back to a case-insensitive text match
    against the first still-untagged word. A word that already carries a tag
    is never overwritten.

    Args:
        pipeline: Callable that takes a sentence string and returns an
            iterable of entity dicts. Each dict provides the tag under
            ``entity_group`` (or ``entity``), and either ``start``/``end``
            character offsets or the entity text under ``word``.
        test_data: Iterable of rows with ``ner_words`` (sequence of strings)
            and ``ner_labels`` (gold tag sequence, passed through unchanged).

    Returns:
        A tuple ``(true_labels, pred_labels)`` of two parallel lists holding
        one tag sequence per row.
    """
    # NOTE(review): gold labels are assumed to use the same tag vocabulary as
    # the model output (e.g. "B-PER") -- confirm against the dataset schema.
    true_labels, pred_labels = [], []
    for row in test_data:
        words = row["ner_words"]
        gold = row["ner_labels"]
        sentence = " ".join(words)

        # Character span of every word inside `sentence`. Words are separated
        # by exactly one space, so the spans follow from the word lengths.
        word_spans = []
        cursor = 0
        for w in words:
            word_spans.append((cursor, cursor + len(w)))
            cursor += len(w) + 1

        pred_seq = ["O"] * len(words)
        for ent in pipeline(sentence):
            tag = ent.get("entity_group", ent.get("entity"))
            if not tag:
                # Nothing to label with; skip instead of failing on "B-" + None.
                continue

            start, end = ent.get("start"), ent.get("end")
            if start is not None and end is not None:
                # Every non-empty word whose span overlaps the entity span.
                covered = [
                    idx
                    for idx, (w_start, w_end) in enumerate(word_spans)
                    if w_start < w_end and w_start < end and w_end > start
                ]
            else:
                # No offsets available: match on text. Strip surrounding
                # whitespace first, and ignore empty text because "" is a
                # substring of every word.
                text = (ent.get("word") or "").strip().lower()
                covered = []
                if text:
                    for idx, w in enumerate(words):
                        if pred_seq[idx] == "O" and text in w.lower():
                            covered = [idx]
                            break

            prefix = "B-"
            for idx in covered:
                if pred_seq[idx] != "O":
                    continue
                pred_seq[idx] = prefix + tag
                prefix = "I-"

        true_labels.append(gold)
        pred_labels.append(pred_seq)
    return true_labels, pred_labels


In [None]:
# Number of sampled rows used for each evaluation run
corpus_sizes = [1_000, 10_000, 25_000, 50_000, 100_000]

# Line colours for the plots, indexed by model position
custom_palette = [
    "#F4A300",
    "#4BE3AC",
    "#9B5DE5",
    "#FF6B6B",
    "#033268",
    "#E777C2",
    "#6BA3D1",
    "#CFA15A",
    "#C0D2C1",
    "#F1C6C6",
    "#F5D4A1",
    "#E2B97C",
    "#EDE9E0",
]

# metric name -> model name -> list of scores (one entry appended per corpus size)
metrics = {
    metric_name: {model_label: [] for model_label in models_info}
    for metric_name in ("F1", "Precision", "Recall")
}

# Fixed seed so the same permutation of row indices is drawn on every run
random.seed(42)
shuffled_indices = [*range(len(dataset))]
random.shuffle(shuffled_indices)

# Score every model on the last 20% of each shuffled corpus prefix.
for size in corpus_sizes:
    subset = dataset.select(shuffled_indices[:size])

    # Derive the split from the rows actually selected. When `size` exceeds the
    # dataset length the slice above is shorter than `size`, and a range ending
    # at `size` would index past the end of `subset`.
    available = len(subset)
    if available < size:
        print(f"Requested {size} samples but only {available} are available; using {available}.")
    split = int(0.8 * available)
    test_subset = subset.select(range(split, available))  # test 20%

    for model_name, pipe in pipelines.items():
        print(f"Evaluating {model_name} on {size} samples...")
        true, pred = get_predictions(pipe, test_subset)

        # One score per (metric, model) is appended for every corpus size, so
        # each score list stays aligned with `corpus_sizes` for plotting.
        metrics["F1"][model_name].append(f1_score(true, pred))
        metrics["Precision"][model_name].append(precision_score(true, pred))
        metrics["Recall"][model_name].append(recall_score(true, pred))

        # Release cached GPU memory before the next model runs.
        torch.cuda.empty_cache()

# Create directory for plots
os.makedirs("ner_model_plots", exist_ok=True)

# Plot and save one figure per metric, with one line per model.
for metric in ["F1", "Precision", "Recall"]:
    fig, ax = plt.subplots(figsize=(5, 6))
    for j, (model_name, scores) in enumerate(metrics[metric].items()):
        ax.plot(
            corpus_sizes,
            scores,
            label=model_name,
            # Wrap around so adding more models than colours cannot raise IndexError.
            color=custom_palette[j % len(custom_palette)],
            marker='o',
            linewidth=2,
        )

    ax.set_title(f"{metric} Score vs. Corpus Size")
    ax.set_xlabel("Corpus Size")
    ax.set_ylabel(f"{metric} Score")
    ax.set_ylim(0, 1)
    ax.legend()
    ax.grid(True)
    fig.tight_layout()

    plot_path = f"ner_model_plots/{metric.lower()}_score_vs_corpus_size.png"
    fig.savefig(plot_path)
    print(f"Saved: {plot_path}")
    plt.show()
    # Release the figure once it has been saved and shown so figures do not
    # accumulate across loop iterations.
    plt.close(fig)