In [None]:
# Install dependencies (run this in a notebook cell, not needed in code below)
!pip install transformers datasets seqeval torch
!pip install datasets

In [None]:
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForTokenClassification, TokenClassificationPipeline
from seqeval.metrics import classification_report, precision_score, recall_score, f1_score
import random

# Pick the pipeline device index: 0 (first CUDA GPU) when CUDA is available,
# otherwise -1 (CPU).
if torch.cuda.is_available():
    device = 0
else:
    device = -1
print(f"Device set to use {'cuda:0' if device == 0 else 'cpu'}")

# Load the newswire corpus; only its "train" split is used below.
dataset = load_dataset("dell-research-harvard/newswire")
full_data = dataset["train"]

# Draw a reproducible random subset of rows. The requested size is capped at the
# dataset length because random.sample raises ValueError when asked for more
# items than the population contains.
N_SAMPLES = 10000
random.seed(42)  # fixed seed so the same rows are drawn on every run
sample_indices = random.sample(range(len(full_data)), min(N_SAMPLES, len(full_data)))
small_corpus = full_data.select(sample_indices)

# Shuffle the subset and split it 80/20 into train and test portions.
random.seed(42)
indices = list(range(len(small_corpus)))
random.shuffle(indices)
split_idx = int(0.8 * len(indices))
train_data = small_corpus.select(indices[:split_idx])
test_data = small_corpus.select(indices[split_idx:])

# Optional: evaluate on an even smaller slice for a quick smoke test
# (the corpus was already reduced to at most N_SAMPLES rows above).
#test_data = test_data.select(range(100))

# CoNLL-style BIO label inventory: "O" first, then a B-/I- pair for each entity
# type in the order PER, ORG, LOC, MISC.
label_names = ["O"] + [
    f"{prefix}-{entity_type}"
    for entity_type in ("PER", "ORG", "LOC", "MISC")
    for prefix in ("B", "I")
]

# Define prediction function
def _word_char_spans(words):
    """Return the (start, end) character span of each word inside " ".join(words)."""
    spans = []
    cursor = 0
    for w in words:
        spans.append((cursor, cursor + len(w)))
        cursor += len(w) + 1  # +1 skips the single space inserted by the join
    return spans


def _tag_entity(pred_seq, words, spans, ent):
    """Write the BIO tags for one predicted entity into pred_seq (modified in place)."""
    entity = ent.get("entity_group", ent.get("entity"))
    start, end = ent.get("start"), ent.get("end")

    if start is None or end is None:
        # No character offsets in the prediction: fall back to tagging the first
        # still-untagged word that contains the predicted text.
        surface = ent["word"].lower()
        for idx, w in enumerate(words):
            if surface in w.lower() and pred_seq[idx] == "O":
                pred_seq[idx] = "B-" + entity
                break
        return

    # Tag every word overlapping [start, end): the first one opens the entity
    # with "B-", the following ones continue it with "I-". Words already tagged
    # by an earlier entity keep their tag.
    prefix = "B-"
    for idx, (w_start, w_end) in enumerate(spans):
        if w_start >= end:
            break  # spans are in increasing order, nothing further can overlap
        if w_end <= start or pred_seq[idx] != "O":
            continue
        pred_seq[idx] = prefix + entity
        prefix = "I-"


def get_predictions(model_name, test_data, pipe=None):
    """Run a token-classification model over test_data and collect BIO tag sequences.

    Each row's words are joined with single spaces and passed to the pipeline.
    Predicted entities are mapped back onto the words through their character
    offsets, so a multi-word entity produces one "B-" tag followed by "I-" tags.
    Predictions without offsets fall back to matching on the predicted text.

    Args:
        model_name: Model identifier passed to ``from_pretrained`` for both the
            tokenizer and the model. Ignored when ``pipe`` is supplied.
        test_data: Iterable of rows exposing ``row["ner_words"]`` (list of word
            strings) and ``row["ner_labels"]`` (gold labels, presumably BIO
            strings of the same length -- verify against the dataset).
        pipe: Optional callable taking a sentence string and returning a list of
            entity dicts (``entity_group``/``entity``, ``word`` and optionally
            ``start``/``end``). When omitted, a ``TokenClassificationPipeline``
            with ``aggregation_strategy="simple"`` is built on the module-level
            ``device``.

    Returns:
        A ``(true_labels, pred_labels)`` tuple of parallel lists holding one
        label sequence per row.
    """
    if pipe is None:
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForTokenClassification.from_pretrained(model_name)
        pipe = TokenClassificationPipeline(model=model, tokenizer=tokenizer, aggregation_strategy="simple", device=device)

    true_labels, pred_labels = [], []

    for row in test_data:
        words = row["ner_words"]

        # Every word starts as "O"; predictions overwrite individual positions.
        pred_seq = ["O"] * len(words)

        if words:  # a row without words has nothing to send to the model
            spans = _word_char_spans(words)
            for ent in pipe(" ".join(words)):
                _tag_entity(pred_seq, words, spans, ent)

        true_labels.append(row["ner_labels"])
        pred_labels.append(pred_seq)

    return true_labels, pred_labels


# Define evaluation function
def evaluate_model(name, true_labels, pred_labels):
    """Print the precision, recall, F1 score and full classification report for one model.

    Args:
        name: Heading shown above the printed metrics.
        true_labels: Gold label sequences, one list per example.
        pred_labels: Predicted label sequences, parallel to ``true_labels``.
    """
    print(f"\n=== {name} ===")

    headline_metrics = (
        ("Precision:", precision_score),
        ("Recall:", recall_score),
        ("F1 Score:", f1_score),
    )
    for caption, metric_fn in headline_metrics:
        print(caption, metric_fn(true_labels, pred_labels))

    report_text = classification_report(true_labels, pred_labels)
    print("\nDetailed Report:\n", report_text)

# Checkpoints for the two-model head-to-head comparison
model_custom = "dell-research-harvard/historical_newspaper_ner"
model_roberta = "Jean-Baptiste/roberta-large-ner-english"

# Historical newspaper NER model: predict on the test split, then report
true_custom, pred_custom = get_predictions(model_custom, test_data)
evaluate_model("Custom Historical Newspaper NER", true_custom, pred_custom)

# RoBERTa-Large NER model: same procedure on the same test split
true_roberta, pred_roberta = get_predictions(model_roberta, test_data)
evaluate_model("RoBERTa-Large (CoNLL03)", true_roberta, pred_roberta)


In [None]:
# Print the text representation of full_data (the "train" split loaded earlier)
# as a quick look at what was loaded.
print(full_data)

In [None]:
# Display name -> model identifier for every model in the comparison.
# The identifiers are handed to get_predictions; the display names become the
# row/legend labels of the result tables and plots. "CoNLL03" is spelled the
# same way as in the earlier two-model evaluation.
models_to_compare = {
    "Custom Historical Newspaper NER": "dell-research-harvard/historical_newspaper_ner",
    "BERT-Large (CoNLL03)": "dbmdz/bert-large-cased-finetuned-conll03-english",
    "DistilBERT (CoNLL03)": "elastic/distilbert-base-uncased-finetuned-conll03-english",
    "RoBERTa-Large (CoNLL03)": "Jean-Baptiste/roberta-large-ner-english"
}


In [None]:
# Per-model metric stores, keyed by the model's display name:
#   results          -> overall precision / recall / F1
#   detailed_results -> full classification report as a dict
results = {}
detailed_results = {}

for model_desc, model_name in models_to_compare.items():
    print(f"Testing {model_desc} ...")
    true_labels, pred_labels = get_predictions(model_name, test_data)

    # save results: the three overall scores, computed in a fixed order
    precision, recall, f1 = (
        score_fn(true_labels, pred_labels)
        for score_fn in (precision_score, recall_score, f1_score)
    )
    results[model_desc] = dict(
        zip(("Precision", "Recall", "F1 Score"), (precision, recall, f1))
    )

    # keep the classification report in dict form under detailed_results
    detailed_results[model_desc] = report_dict = classification_report(
        true_labels, pred_labels, output_dict=True
    )

    print(f"Finished {model_desc}.")

In [None]:
import pandas as pd

# Turn the per-model scores into a models-by-metrics table, ranked by F1 (best first).
results_df = (
    pd.DataFrame(results)
    .T
    .sort_values(by="F1 Score", ascending=False)
)
print("\nModel Evaluation Summary")
print(results_df)


In [None]:
import matplotlib.pyplot as plt

# Shared colour palette for the charts (one hex colour per series).
colors_palette = ['#274472', '#C0D2C1', '#CFA15A', '#E777C2']

# Rebuild the models-by-metrics table, ranked by F1 (best first).
results_df = pd.DataFrame(results).T.sort_values(by="F1 Score", ascending=False)

# Grouped bar chart: one group per model, one bar per metric.
fig, ax = plt.subplots(figsize=(10, 6))
x = range(len(results_df))
bar_width = 0.25

# Precision sits one bar-width left of the tick, recall on it, F1 one to the right.
for slot, metric in enumerate(('Precision', 'Recall', 'F1 Score')):
    shift = (slot - 1) * bar_width
    ax.bar(
        [i + shift for i in x],
        results_df[metric],
        width=bar_width,
        label=metric,
        color=colors_palette[slot],
    )

ax.set_xticks(x)
ax.set_xticklabels(results_df.index, rotation=45, ha="right")
ax.set(ylabel="Score", title="Comparison of Global Metrics between Models")
ax.legend()
plt.tight_layout()
plt.show()

In [None]:
# F1 by category: build a label-by-model table of F1 scores and plot it.

# Every report key except the aggregate rows counts as a label.
all_labels = sorted({
    label
    for detail in detailed_results.values()
    for label in detail
    if label not in ('micro avg', 'macro avg', 'weighted avg', 'accuracy')
})

# model -> {label -> F1}; a label missing from a model's report scores 0.
data = {
    model_name: {
        label: detail.get(label, {}).get('f1-score', 0)
        for label in all_labels
    }
    for model_name, detail in detailed_results.items()
}

df_labels = pd.DataFrame(data)

# Grouped bars: one group per label, one bar (and colour) per model.
model_colors = colors_palette[:len(df_labels.columns)]
label_ax = df_labels.plot(kind="bar", figsize=(12, 8), color=model_colors)
label_ax.set_title("F1 Score by Category (Label) between Models")
label_ax.set_ylabel("F1 Score")
label_ax.set_xlabel("Label")
label_ax.legend(title="Model", bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()

In [None]:
# Install the nbstripout tool, then run it on the named notebook file.
# NOTE(review): presumably this strips stored cell outputs before the notebook
# is committed, and the file name is presumably this notebook's own -- confirm.
!pip install nbstripout
!nbstripout Custom_NER_model_evaluation_Expanded.ipynb