In [None]:
import csv
import nltk
import pandas as pd
import numpy as np
from nltk import word_tokenize, pos_tag
from typing import List
import re
from sklearn.metrics import f1_score, precision_recall_fscore_support, accuracy_score
from seqeval.metrics import classification_report, f1_score as seqeval_f1_score
import ast

# Fetch the NLTK data packages needed for tokenizing and POS tagging
for nltk_resource in ('punkt', 'averaged_perceptron_tagger'):
    nltk.download(nltk_resource)

# Label names as used in BERT-Style.ipynb; a tag id is a position in this list
label_names = ['O', 'B-actor', 'I-actor', 'B-usecase', 'I-usecase', '[PAD]']
pad_token_label_id = label_names.index('[PAD]')  # evaluates to 5

# Sample heuristic-based tagging function
def iob_tag_sentence(sentence: str) -> (List[str], List[int]):
    """
    Tokenize ``sentence`` and assign one heuristic IOB tag id per token.

    The sentence is split with ``word_tokenize`` and tagged with ``pos_tag``;
    each POS tag is then mapped to a tag id with a small state machine:

    * NNP / NNPS           -> 1 (B-actor), or 2 (I-actor) when an actor span is open
    * VB/VBP/VBZ/VBD/VBN   -> 3 (B-usecase) when no use-case span is open
    * NN/VB/VBP/VBG/VBD/VBN -> 4 (I-usecase) when a use-case span is open
    * anything else        -> 0 (O), which also closes any open span

    Returns:
    - the token list and the list of tag ids, both of the same length
    """
    proper_noun_tags = ('NNP', 'NNPS')
    usecase_opening_tags = ('VB', 'VBP', 'VBZ', 'VBD', 'VBN')
    usecase_inside_tags = ('NN', 'VB', 'VBP', 'VBG', 'VBD', 'VBN')

    tokens = word_tokenize(sentence)

    # At most one span is open at any time: 'actor', 'usecase', or None.
    open_span = None
    iob_tags = []

    for _word, pos in pos_tag(tokens):
        if pos in proper_noun_tags:
            # A proper noun starts an actor span or continues the open one
            tag_id = 2 if open_span == 'actor' else 1
            open_span = 'actor'
        elif pos in usecase_opening_tags and open_span != 'usecase':
            tag_id = 3
            open_span = 'usecase'
        elif pos in usecase_inside_tags and open_span == 'usecase':
            tag_id = 4
        else:
            # Note: VBZ inside an open use-case span lands here and closes it
            tag_id = 0
            open_span = None
        iob_tags.append(tag_id)

    return tokens, iob_tags

# Main processing function
def process_sentences(sentences: List[str], output_csv='output.csv'):
    """
    Tag every sentence with ``iob_tag_sentence`` and write the results to a CSV.

    The file at ``output_csv`` is overwritten. It has a header row and the
    columns "tokens" and "IOB_tag"; each cell holds the string form of the
    corresponding Python list. A confirmation line is printed when done.
    """
    with open(output_csv, mode='w', newline='', encoding='utf-8') as csv_handle:
        dict_writer = csv.DictWriter(csv_handle, fieldnames=["tokens", "IOB_tag"])
        dict_writer.writeheader()

        # Rows are produced lazily, one per sentence, in input order
        dict_writer.writerows(
            {"tokens": token_list, "IOB_tag": tag_ids}
            for token_list, tag_ids in map(iob_tag_sentence, sentences)
        )

    print(f"Inference complete. Output written to {output_csv}")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\RAYMOND\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\RAYMOND\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


Inference complete. Output written to output.csv


# Reading and Processing Test Data from BERT-Style Dataset

This section loads the test dataset used in BERT-Style.ipynb and processes it with our rule-based system.

In [None]:
# Read the test dataset (path is relative to the current working directory)
test_dataset = pd.read_csv('../corpus-raymond/usecase-test-hf.csv')

# Quick look at what was loaded
print("First few rows of the test dataset:")
print(test_dataset.head())
print(f"\nTotal examples in test dataset: {len(test_dataset)}")

# Cells may arrive as the string form of a list; the first 'tokens' cell decides
# whether both list columns are parsed back into Python lists.
sample_tokens = test_dataset['tokens'].iloc[0]
if not isinstance(sample_tokens, str):
    print("\nTokens and IOB_tags are already in list format.")
else:
    print("\nConverting string representations to Python lists...")
    for list_column in ('tokens', 'IOB_tag'):
        test_dataset[list_column] = test_dataset[list_column].apply(ast.literal_eval)

# Display a single example row
print("\nExample tokens and their IOB tags from the test dataset:")
example_idx = 0
print(f"Tokens: {test_dataset['tokens'].iloc[example_idx]}")
print(f"True IOB tags: {test_dataset['IOB_tag'].iloc[example_idx]}")

In [None]:
# Process the test data using our rule-based tagger
def process_test_data(test_df):
    """
    Run the rule-based tagger over every row of ``test_df``.

    For each row the 'tokens' list is joined with single spaces, re-tagged with
    ``iob_tag_sentence``, and the predicted tag ids are cut or padded with 0
    so that there is exactly one prediction per original token. Tag ids
    (from 'IOB_tag' and from the tagger) are translated through ``label_names``.

    Returns a 5-tuple:
    - DataFrame with columns 'tokens', 'true_tags', 'pred_tags' (tag ids)
    - flat list of true label names over all tokens
    - flat list of predicted label names over all tokens
    - list of per-sentence true label-name lists
    - list of per-sentence predicted label-name lists
    """
    records = []
    flat_true, flat_pred = [], []
    per_sentence_true, per_sentence_pred = [], []

    for _, example in test_df.iterrows():
        tokens = example['tokens']
        gold_ids = example['IOB_tag']

        # The tagger works on raw text, so rebuild a sentence by space-joining
        _, predicted_ids = iob_tag_sentence(" ".join(tokens))

        # Re-tokenizing can yield a different token count; align by length only
        surplus = len(predicted_ids) - len(tokens)
        if surplus > 0:
            predicted_ids = predicted_ids[:len(tokens)]
        elif surplus < 0:
            predicted_ids.extend([0] * (-surplus))

        records.append({
            'tokens': tokens,
            'true_tags': gold_ids,
            'pred_tags': predicted_ids
        })

        # Translate ids to label names once, then feed both views
        gold_names = [label_names[tag_id] for tag_id in gold_ids]
        predicted_names = [label_names[tag_id] for tag_id in predicted_ids]

        flat_true.extend(gold_names)
        flat_pred.extend(predicted_names)
        per_sentence_true.append(gold_names)
        per_sentence_pred.append(predicted_names)

    return pd.DataFrame(records), flat_true, flat_pred, per_sentence_true, per_sentence_pred

# Tag the whole test set; the flat lists and per-sentence lists are both kept
(results_df,
 token_true, token_pred,
 seq_true, seq_pred) = process_test_data(test_dataset)

# Sanity-check the first few rows
print("Preview of our rule-based tagger results:")
print(results_df.head())

# Persist the per-sentence predictions (row index is not written)
output_path = "rule_based_results.csv"
results_df.to_csv(output_path, index=False)
print(f"Results saved to {output_path}")

# Evaluation of Rule-Based Tagger

This section evaluates our rule-based tagger using the same metrics as in BERT-Style.ipynb.

In [None]:
# Token-level evaluation: micro-averaged F1 over the flattened tag lists
token_f1 = f1_score(token_true, token_pred, average='micro')
print(f"Token-level F1 score: {token_f1:.4f}")

# Per-class precision / recall / F1, restricted to (and ordered by) these labels
target_classes = ["B-actor", "I-actor", "B-usecase", "I-usecase", "O"]
precision, recall, f1, _ = precision_recall_fscore_support(
    token_true, token_pred, labels=target_classes, zero_division=0
)

# Share of tokens whose predicted label equals the true label
accuracy = accuracy_score(token_true, token_pred)

# One row per class, then a final row repeating the accuracy in each metric column
metrics_df = pd.DataFrame(
    dict(zip(
        ["Class", "Precision", "Recall", "F1-Score"],
        [target_classes, precision, recall, f1],
    ))
)
metrics_df.loc[len(metrics_df)] = ["Overall Accuracy"] + [accuracy] * 3

print("\nToken-level classification report:")
print(metrics_df)

# Start the metrics report (overwrites any previous file)
metrics_path = "rule_based_metrics.txt"
with open(metrics_path, "w") as metrics_file:
    metrics_file.write(f"Token-level F1 score: {token_f1:.4f}\n\n")
    metrics_file.write(metrics_df.to_string(index=False) + "\n")

# Sequence-level evaluation with seqeval on the per-sentence label lists
print("\nSequence-level evaluation using seqeval:")
seq_f1 = seqeval_f1_score(seq_true, seq_pred)
print(f"Seqeval F1 score: {seq_f1:.4f}")

print("\nSeqeval classification report:")
seqeval_report = classification_report(seq_true, seq_pred, digits=4)
print(seqeval_report)

# Append the seqeval results to the report started above
with open(metrics_path, "a") as metrics_file:
    metrics_file.write(f"\nSeqeval F1 score: {seq_f1:.4f}\n\n")
    metrics_file.write("Seqeval classification report:\n")
    metrics_file.write(seqeval_report)

print(f"Metrics saved to {metrics_path}")

In [None]:
# Compare with BERT-Style results 
# You can load the saved results from BERT-Style.ipynb for direct comparison

def load_bert_results(result_path):
    """
    Best-effort loader for a saved BERT-Style results CSV.

    Returns the DataFrame read from ``result_path``. If anything goes wrong
    while loading, the error is printed and ``None`` is returned instead of
    raising, so the comparison step can be skipped.
    """
    try:
        loaded = pd.read_csv(result_path)
        print(f"Loaded BERT-Style results from {result_path}")
    except Exception as load_error:
        # Deliberately broad: a missing or unreadable file must not stop the run
        print(f"Could not load BERT-Style results: {load_error}")
        loaded = None
    return loaded

# Location of the saved BERT-Style predictions (adjust as needed)
bert_result_path = "BERT-Style-result/microsoft/deberta-v3-base-4-epoch-8bs-new/test-result-bert.csv"
bert_results = load_bert_results(bert_result_path)

if bert_results is not None:
    # Same micro-averaged F1 as the rule-based score, on the 'True'/'Pred' columns
    bert_f1 = f1_score(bert_results['True'], bert_results['Pred'], average='micro')

    # Build the three report lines once; reuse them for stdout and the file
    comparison_lines = [
        f"BERT-Style F1 score: {bert_f1:.4f}",
        f"Rule-based F1 score: {token_f1:.4f}",
        f"Difference: {bert_f1 - token_f1:.4f}",
    ]
    print("\n" + "\n".join(comparison_lines))

    with open("model_comparison.txt", "w") as comparison_file:
        comparison_file.writelines(line + "\n" for line in comparison_lines)

    print("Comparison saved to model_comparison.txt")

# Error Analysis

This section analyzes where our rule-based system made errors compared to the ground truth.

In [None]:
# Function to analyze errors
def analyze_errors(results_df):
    """
    Collect every example whose predicted tags differ from its true tags.

    Parameters:
    - results_df: DataFrame with 'tokens', 'true_tags' and 'pred_tags' columns.
      The three cells of a row are walked in parallel, and each tag value is
      used as an index into the module-level ``label_names``.

    Returns:
    - DataFrame with one row per erroneous example and the columns
      'example_id' (the row's index label in ``results_df``),
      'tokens_with_errors' (list of "token (true -> predicted)" strings, with
      mismatching tokens wrapped in ``**``) and 'sentence' (space-joined
      tokens). The three columns are present even when no example has an
      error, so callers can always select them.
    """
    output_columns = ['example_id', 'tokens_with_errors', 'sentence']
    error_examples = []

    for idx, row in results_df.iterrows():
        tokens = row['tokens']
        true_tags = row['true_tags']
        pred_tags = row['pred_tags']

        # zip stops at the shorter sequence, so only aligned positions are compared
        if not any(t != p for t, p in zip(true_tags, pred_tags)):
            continue

        # Describe every token of the example, highlighting the wrong ones
        token_tag_pairs = []
        for token, true_tag, pred_tag in zip(tokens, true_tags, pred_tags):
            tag_info = f"{token} ({label_names[true_tag]} -> {label_names[pred_tag]})"
            if true_tag != pred_tag:
                tag_info = f"**{tag_info}**"  # Mark errors with bold
            token_tag_pairs.append(tag_info)

        error_examples.append({
            'example_id': idx,
            'tokens_with_errors': token_tag_pairs,
            'sentence': " ".join(tokens)
        })

    # Explicit columns keep the schema stable when error_examples is empty;
    # without them an empty list yields a frame with no columns at all.
    return pd.DataFrame(error_examples, columns=output_columns)

# Find every example with at least one mis-tagged token
error_df = analyze_errors(results_df)

# Write a summary line plus the first 10 erroneous examples to a text report
with open("error_analysis.txt", "w") as report_file:
    error_summary = f"Total examples with errors: {len(error_df)} out of {len(results_df)} ({len(error_df)/len(results_df)*100:.2f}%)"
    report_file.write(error_summary + "\n\n")

    for _, error_row in error_df.head(10).iterrows():
        report_lines = [
            f"Example {error_row['example_id']}:",
            f"Sentence: {error_row['sentence']}",
            "Tokens with errors (true -> predicted):",
        ]
        # One indented line per token of the example
        report_lines.extend(f"  {tag_info}" for tag_info in error_row['tokens_with_errors'])
        report_file.write("\n".join(report_lines) + "\n\n")

print("Error analysis saved to error_analysis.txt")
print(error_summary)

In [None]:
# Example usage: tag a few hand-written sentences, save them, and echo the tags
if __name__ == "__main__":
    example_sentences = [
        "The doctor schedules an appointment for the patient.",
        "The system notifies the user of new messages.",
        "Admin updates the records weekly."
    ]
    process_sentences(example_sentences, "example_output.csv")
    print("\nExample predictions on sample sentences:")

    for sentence in example_sentences:
        tokens, tags = iob_tag_sentence(sentence)
        # Translate tag ids into their label names for display
        tag_names = [label_names[tag] for tag in tags]
        # The trailing empty string yields the blank separator line
        print(
            f"Sentence: {sentence}",
            f"Tokens: {tokens}",
            f"Tags: {tag_names}",
            "",
            sep="\n",
        )