In [None]:
# Install dependencies (run this in a notebook cell, not needed in code below)
!pip install transformers datasets seqeval torch
!pip install datasets

In [None]:
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForTokenClassification, TokenClassificationPipeline
from seqeval.metrics import classification_report, precision_score, recall_score, f1_score
import random

# Set device: 0 selects the first CUDA GPU, -1 selects the CPU. The value is
# passed to the token-classification pipeline as its `device` argument.
device = 0 if torch.cuda.is_available() else -1
print(f"Device set to use {'cuda:0' if device == 0 else 'cpu'}")

# Load dataset
dataset = load_dataset("dell-research-harvard/newswire")
full_data = dataset["train"]

# Sampling configuration (tune here rather than editing the code below)
SEED = 42             # seed for both the sampling and the shuffle
SAMPLE_SIZE = 10000   # number of rows drawn from the full train split
TRAIN_FRACTION = 0.8  # share of the sampled rows assigned to train_data

# Select up to SAMPLE_SIZE random rows from the dataset. min() keeps
# random.sample from raising ValueError when the split has fewer rows.
random.seed(SEED)  # Set seed for reproducibility
indices = random.sample(range(len(full_data)), min(SAMPLE_SIZE, len(full_data)))
small_corpus = full_data.select(indices)  # Select the rows based on these indices

# Shuffle and split the sampled rows into train / test
random.seed(SEED)
indices = list(range(len(small_corpus)))
random.shuffle(indices)
split_idx = int(TRAIN_FRACTION * len(indices))
train_data = small_corpus.select(indices[:split_idx])
test_data = small_corpus.select(indices[split_idx:])

# Optional: evaluate on an even smaller subset while iterating, e.g.
#   test_data = test_data.select(range(100))

# Define standard CoNLL-style label names
label_names = [
    "O", "B-PER", "I-PER", "B-ORG", "I-ORG",
    "B-LOC", "I-LOC", "B-MISC", "I-MISC"
]

# Define prediction function
def _word_char_spans(words):
    """Return the (start, end) character span of each word in " ".join(words)."""
    spans = []
    cursor = 0
    for w in words:
        spans.append((cursor, cursor + len(w)))
        cursor += len(w) + 1  # +1 skips the single joining space
    return spans


def _entities_to_bio(words, entities):
    """Project entity predictions onto `words` as one BIO tag per word.

    Entities carrying `start`/`end` character offsets are aligned by span
    overlap: the first covered word gets "B-<label>", later ones "I-<label>".
    Entities without offsets fall back to tagging the first untagged word
    that contains the entity text. A word tagged by an earlier entity is
    never overwritten.
    """
    tags = ["O"] * len(words)
    spans = _word_char_spans(words)

    for ent in entities:
        label = ent.get("entity_group", ent.get("entity"))
        if not label:
            continue
        # An un-aggregated "entity" value may already carry a B-/I- prefix;
        # strip it so the tag does not become e.g. "B-B-PER".
        if label[:2] in ("B-", "I-"):
            label = label[2:]

        start, end = ent.get("start"), ent.get("end")
        if start is None or end is None:
            # No offsets available: best-effort text match (legacy behaviour).
            text = str(ent.get("word", "")).lower()
            for idx, w in enumerate(words):
                if text and text in w.lower() and tags[idx] == "O":
                    tags[idx] = "B-" + label
                    break
            continue

        prefix = "B-"
        for idx, (w_start, w_end) in enumerate(spans):
            if w_start >= end:
                break  # spans are ascending, nothing further can overlap
            if w_end <= start or w_end == w_start or tags[idx] != "O":
                continue  # before the entity, empty word, or already tagged
            tags[idx] = prefix + label
            prefix = "I-"

    return tags


def get_predictions(model_name, test_data, pipe=None):
    """Run an NER model over `test_data` and return word-level label sequences.

    Args:
        model_name: Hugging Face model id, used to load the tokenizer and
            model when `pipe` is not supplied.
        test_data: iterable of rows exposing "ner_words" (list of word
            strings) and "ner_labels" (gold label strings).
        pipe: optional callable taking the space-joined sentence and
            returning a list of entity dicts (e.g. an already constructed
            token-classification pipeline). When None, a
            TokenClassificationPipeline with aggregation_strategy="simple"
            is built on the global `device`.

    Returns:
        (true_labels, pred_labels): two lists with one label sequence per row.
    """
    if pipe is None:
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForTokenClassification.from_pretrained(model_name)
        pipe = TokenClassificationPipeline(model=model, tokenizer=tokenizer, aggregation_strategy="simple", device=device)

    true_labels, pred_labels = [], []

    for row in test_data:
        words = row["ner_words"]
        # Words are joined with single spaces so that character offsets in the
        # pipeline output can be mapped back to word indices.
        sentence = " ".join(words)

        pred_labels.append(_entities_to_bio(words, pipe(sentence)))
        true_labels.append(row["ner_labels"])  # Already string labels

    return true_labels, pred_labels


# Define evaluation function
def evaluate_model(name, true_labels, pred_labels):
    """Print precision, recall, F1 and a detailed report for one model.

    Uses the metric functions imported from seqeval.metrics at the top of
    this cell.

    Args:
        name: heading printed above the scores.
        true_labels: gold label sequences, one list per document.
        pred_labels: predicted label sequences, aligned with `true_labels`.
    """
    print(f"\n=== {name} ===")

    # Each score is computed and printed in turn, in the order listed here.
    scorers = (
        ("Precision:", precision_score),
        ("Recall:", recall_score),
        ("F1 Score:", f1_score),
    )
    for caption, scorer in scorers:
        print(caption, scorer(true_labels, pred_labels))

    report = classification_report(true_labels, pred_labels)
    print("\nDetailed Report:\n", report)

# Historical-newspaper NER model: predict on the held-out split, then score
model_custom = "dell-research-harvard/historical_newspaper_ner"
true_custom, pred_custom = get_predictions(
    model_name=model_custom,
    test_data=test_data,
)
evaluate_model(
    name="Custom Historical Newspaper NER",
    true_labels=true_custom,
    pred_labels=pred_custom,
)

# RoBERTa-Large NER model: same held-out split, same scoring
model_roberta = "Jean-Baptiste/roberta-large-ner-english"
true_roberta, pred_roberta = get_predictions(
    model_name=model_roberta,
    test_data=test_data,
)
evaluate_model(
    name="RoBERTa-Large (CoNLL03)",
    true_labels=true_roberta,
    pred_labels=pred_roberta,
)


In [None]:
# Print the full train split (full_data = dataset["train"]) to inspect it.
print(full_data)

In [None]:
# Upgrade seqeval to the latest release.
# NOTE(review): seqeval is already installed by the first cell and imported
# earlier in the notebook, so an upgrade performed here presumably only takes
# effect after a kernel restart — consider moving it into the install cell.
!pip install seqeval --upgrade

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
from sklearn.metrics import precision_score, recall_score, f1_score


def flatten_labels(true_labels, pred_labels):
    """Concatenate per-document label sequences into two flat lists.

    Args:
        true_labels: iterable of gold label sequences.
        pred_labels: iterable of predicted label sequences.

    Returns:
        (flat_true, flat_pred): every label of every sequence, in order.
    """
    flat_true = []
    for gold_seq in true_labels:
        flat_true.extend(gold_seq)

    flat_pred = []
    for pred_seq in pred_labels:
        flat_pred.extend(pred_seq)

    return flat_true, flat_pred

def evaluate_model(name, true_labels, pred_labels):
    """Print macro-averaged token-level scores plus a detailed report.

    This redefinition replaces the earlier evaluate_model. The three scores
    come from the sklearn.metrics functions imported in this cell and are
    computed on flattened labels; classification_report is still the one
    imported from seqeval.metrics and receives the nested sequences.

    Args:
        name: heading printed above the scores.
        true_labels: gold label sequences, one list per document.
        pred_labels: predicted label sequences, aligned with `true_labels`.
    """
    print(f"\n=== {name} ===")

    # scikit-learn metrics expect flat label lists, not nested sequences
    flat_true, flat_pred = flatten_labels(true_labels, pred_labels)

    # NOTE(review): zero_division=1 presumably scores a label with no
    # predicted/true samples as 1.0, which raises the macro average —
    # confirm this is intended.
    macro_kwargs = {"average": "macro", "zero_division": 1}
    scores = {
        "Precision": precision_score(flat_true, flat_pred, **macro_kwargs),
        "Recall": recall_score(flat_true, flat_pred, **macro_kwargs),
        "F1 Score": f1_score(flat_true, flat_pred, **macro_kwargs),
    }

    for caption, value in scores.items():
        print(f"{caption}: {value}")

    report = classification_report(true_labels, pred_labels, zero_division=1)
    print("\nDetailed Report:\n", report)

# Re-score both models with the evaluate_model defined in this cell.
# The gold/predicted sequences computed by the earlier evaluation cell are
# reused: calling get_predictions again would reload both models and repeat
# inference over the same test_data only to produce the same sequences.
model_custom = "dell-research-harvard/historical_newspaper_ner"
evaluate_model("Custom Historical Newspaper NER", true_custom, pred_custom)

model_roberta = "Jean-Baptiste/roberta-large-ner-english"
evaluate_model("RoBERTa-Large (CoNLL03)", true_roberta, pred_roberta)

In [None]:
# Materialise the sampled corpus as a DataFrame for topic-based analysis
df = pd.DataFrame(data=small_corpus)

# Number of articles per 'ca_topic' value, most frequent first
topic_counts = df.loc[:, 'ca_topic'].value_counts()

In [None]:
# Horizontal bar chart: one bar per topic, length = number of articles
fig, ax = plt.subplots(figsize=(10, 6))

sns.barplot(x=topic_counts.values, y=topic_counts.index, palette='cubehelix', ax=ax)
ax.set_title('Distribution of Articles by Topic')
ax.set_xlabel('Number of Articles')
ax.set_ylabel('Topic')
fig.tight_layout()
plt.show()

# Accumulate label counts per topic: {topic: Counter({label: count})}
entity_counts_by_topic = {}

for _, row in df.iterrows():
    # setdefault creates the topic's Counter on first sight, then the row's
    # label counts are added to it
    per_topic = entity_counts_by_topic.setdefault(row['ca_topic'], Counter())
    per_topic.update(Counter(row['ner_labels']))

# One row per topic, one column per label; labels never seen for a topic
# become 0, and all counts are cast to integers
entity_counts_df = (
    pd.DataFrame.from_dict(entity_counts_by_topic, orient='index')
    .fillna(0)
    .astype(int)
)
print(entity_counts_df)

# Stacked horizontal bars: one bar per topic, one segment per label
ax = entity_counts_df.plot.barh(stacked=True, figsize=(12, 8), colormap='Set3')

ax.set_title('Entity Distribution by Topic')
ax.set_xlabel('Count of Entities')
ax.set_ylabel('Topic')
# Anchor the legend outside the axes so it does not cover the bars
ax.legend(title='Entity Types', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()

# Per-topic precision, recall and F1 for the historical-newspaper model.
# Predictions exist only for test_data, so topics are read from test_data
# (df covers all of small_corpus and cannot be aligned with the predictions).
topic_true, topic_pred = true_custom, pred_custom
test_topics = list(test_data["ca_topic"])
if not (len(test_topics) == len(topic_true) == len(topic_pred)):
    raise ValueError(
        "test_data, gold labels and predictions must contain the same number of documents"
    )

# Group the flattened gold / predicted labels by topic
labels_by_topic = {}
for topic, gold_seq, pred_seq in zip(test_topics, topic_true, topic_pred):
    gold_flat, pred_flat = labels_by_topic.setdefault(topic, ([], []))
    gold_flat.extend(gold_seq)
    pred_flat.extend(pred_seq)

# Store precision, recall, and F1 score for each topic
precision_by_topic = {}
recall_by_topic = {}
f1_by_topic = {}

# precision_score / recall_score / f1_score are the sklearn.metrics versions
# imported earlier in the notebook; they take flat label lists.
for topic, (gold_flat, pred_flat) in labels_by_topic.items():
    precision_by_topic[topic] = precision_score(gold_flat, pred_flat, average='macro', zero_division=1)
    recall_by_topic[topic] = recall_score(gold_flat, pred_flat, average='macro', zero_division=1)
    f1_by_topic[topic] = f1_score(gold_flat, pred_flat, average='macro', zero_division=1)

metrics_df = pd.DataFrame({
    'Precision': precision_by_topic,
    'Recall': recall_by_topic,
    'F1 Score': f1_by_topic
})

# Horizontal bars: topics run along the y-axis, scores along the x-axis
metrics_df.plot(kind='barh', figsize=(12, 6), colormap='Set1')
plt.title('Model Performance by Topic')
plt.xlabel('Score')
plt.ylabel('Topic')
plt.tight_layout()
plt.show()

In [None]:
def plot_entity_distribution(true_labels):
    """Draw a bar chart of how often each label occurs in `true_labels`.

    Args:
        true_labels: iterable of label sequences; every label of every
            sequence is counted.
    """
    # Counting sequence by sequence yields the same totals (and the same
    # first-seen ordering) as counting one flattened list.
    label_counts = Counter()
    for seq in true_labels:
        label_counts.update(seq)

    fig, ax = plt.subplots(figsize=(6, 4))
    sns.barplot(
        x=list(label_counts.keys()),
        y=list(label_counts.values()),
        color='#CFA15A',
        ax=ax,
    )
    ax.set_title("Entity Type Distribution")
    ax.set_xlabel("Entity Type")
    ax.set_ylabel("Count")
    plt.xticks(rotation=45)
    fig.tight_layout()
    plt.show()

# Label distribution for the sequences returned alongside the historical
# newspaper model's predictions.
# NOTE(review): true_custom and true_roberta are both the gold labels that
# get_predictions collected from the same test_data, so the two charts are
# expected to be identical — if the goal is to compare the models, pass
# pred_custom / pred_roberta instead.
plot_entity_distribution(true_labels=true_custom)

# Label distribution for the sequences returned alongside the RoBERTa
# model's predictions
plot_entity_distribution(true_labels=true_roberta)