In [18]:
from transformers import AutoTokenizer
from transformers import AutoModelForTokenClassification
import torch
import numpy as np
import os
import json

## Load the label_list from assignment 2

In [19]:
# Read one SRL label per line; a label's position in the list is the class index
# that perform_srl uses to look it up.
with open('../assignment_2/label_list.txt', 'r') as label_file:
    label_list = list(map(str.strip, label_file))

In [20]:
# Sanity check: show the loaded labels in index order (index -> label is what
# perform_srl uses to decode the model's predicted class ids).
print(label_list)

['ARG0', 'ARG1', 'ARG1-DSP', 'ARG2', 'ARG3', 'ARG4', 'ARG5', 'ARGA', 'ARGM-ADJ', 'ARGM-ADV', 'ARGM-CAU', 'ARGM-COM', 'ARGM-CXN', 'ARGM-DIR', 'ARGM-DIS', 'ARGM-EXT', 'ARGM-GOL', 'ARGM-LOC', 'ARGM-LVB', 'ARGM-MNR', 'ARGM-MOD', 'ARGM-NEG', 'ARGM-PRD', 'ARGM-PRP', 'ARGM-PRR', 'ARGM-REC', 'ARGM-TMP', 'C-ARG0', 'C-ARG1', 'C-ARG1-DSP', 'C-ARG2', 'C-ARG3', 'C-ARG4', 'C-ARGM-ADV', 'C-ARGM-COM', 'C-ARGM-CXN', 'C-ARGM-DIR', 'C-ARGM-EXT', 'C-ARGM-GOL', 'C-ARGM-LOC', 'C-ARGM-MNR', 'C-ARGM-PRP', 'C-ARGM-PRR', 'C-ARGM-TMP', 'C-V', 'O', 'R-ARG0', 'R-ARG1', 'R-ARG2', 'R-ARG3', 'R-ARG4', 'R-ARGM-ADV', 'R-ARGM-CAU', 'R-ARGM-COM', 'R-ARGM-DIR', 'R-ARGM-GOL', 'R-ARGM-LOC', 'R-ARGM-MNR', 'R-ARGM-TMP', 'V']


## Inference Function for Model 2

In [21]:
def perform_srl(sentence, predicate_mask, model, tokenizer, label_list):
    """
    Returns SRL roles for all tokens, maintaining original sentence indices.

    The input is encoded as a sequence pair: the pre-tokenized sentence first,
    followed by the predicate word on its own as the second sequence. Only
    tokens of the first sequence are decoded into roles, and each word takes
    the label predicted for its first sub-token.

    Args:
        sentence: List[str] - Tokenized words
        predicate_mask: List[int] - Binary mask with exactly one 1
        model: Fine-tuned SRL model (token-classification head)
        tokenizer: Model's tokenizer; must be a fast tokenizer, since
            `word_ids` / `sequence_ids` are read from its output
        label_list: List[str] - All SRL labels, indexed by class id

    Returns:
        List[str] - Roles for each token (including 'O' and predicate)
        Example: ['O', 'ARG0', 'V', 'O', 'ARG1', 'ARGM-LOC', 'O']
        Words that end up without any token (e.g. cut off by truncation)
        keep the default 'O'.

    Raises:
        ValueError: If `predicate_mask` does not sum to exactly one.
    """
    # Validate single predicate
    if sum(predicate_mask) != 1:
        raise ValueError("Must specify exactly one predicate")

    # Prepare input
    predicate_idx = predicate_mask.index(1)
    inputs = tokenizer(
        sentence,
        [sentence[predicate_idx]],
        is_split_into_words=True,
        return_tensors="pt",
        truncation=True
    )

    # Keep the inputs on the same device as the model so inference also works
    # when the model has been moved to an accelerator.
    device = getattr(model, "device", None)
    if device is not None:
        inputs = inputs.to(device)

    # Get predictions
    with torch.no_grad():
        outputs = model(**inputs)
    predictions = np.argmax(outputs.logits.cpu().numpy(), axis=2)[0]

    # Initialize with 'O' as default
    roles = ['O'] * len(sentence)
    word_ids = inputs.word_ids(0)
    sequence_ids = inputs.sequence_ids(0)
    current_word = None

    for token_idx, (word_id, sequence_id) in enumerate(zip(word_ids, sequence_ids)):
        # Word ids restart at 0 for the second sequence (the predicate copy).
        # Without the sequence check, that token would overwrite roles[0] with
        # the prediction made on the predicate copy instead of on word 0.
        if sequence_id != 0 or word_id is None or word_id == current_word:
            continue

        # First sub-token of a new word in the sentence: its label is the word's role.
        current_word = word_id
        roles[word_id] = label_list[predictions[token_idx]]

    return roles

## Load the model and tokenizer for our transformer-based model

In [22]:
# Both the fine-tuned weights and the tokenizer files are read from the same
# directory produced in assignment 2.
model_path = "../assignment_2/model"
model = AutoModelForTokenClassification.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)

## Function to go through the JSON challenge set and test the model on it (also stores the predicted roles in a JSON results file)

In [35]:
def test_srl_capability_and_save(json_file_path, output_directory="srl_results"):
    """
    Loads a JSON challenge set file, calculates the failure rate for the SRL model,
    adds predicted roles to the data, and saves the results to a new JSON file.

    Args:
        json_file_path (str): The path to the JSON file for a specific capability.
        output_directory (str): The directory where the result JSON files will be saved.

    Returns:
        dict: A dictionary containing the capability name, total test cases,
              total failures, and failure rate, or None if an error occurred.
    """
    os.makedirs(output_directory, exist_ok=True)

    try:
        with open(json_file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except FileNotFoundError:
        print(f"Error: File not found at {json_file_path}")
        return None
    except json.JSONDecodeError:
        print(f"Error: Could not decode JSON from {json_file_path}")
        return None

    capability_info = data[0]
    capability_name = capability_info["capability"]
    test_cases = capability_info["test_cases"]

    type_1_failure_per_alternation_pair = 0
    type_2_failure_per_alternation_pair = 0
    failures_per_alternation_pair = 0
    processed_test_cases = [] # To store test cases with predictions

    print(f"\n--- Testing Capability: {capability_name} ---")

    for i, test_instance in enumerate(test_cases):
        print(f"\n  Instance {i + 1}:")

        current_instance_failed = False
        processed_instance = test_instance.copy() # Create a copy to add predictions

        # Test the first alternation form
        sentence1_key = list(test_instance.keys())[0]
        predicate_mask1_key = list(test_instance.keys())[1]
        expected_roles1_key = list(test_instance.keys())[2]

        sentence1 = test_instance[sentence1_key]
        predicate_mask1 = test_instance[predicate_mask1_key]
        expected_roles1 = test_instance[expected_roles1_key]

        predicted_roles1 = list(perform_srl(sentence1, predicate_mask1, model, tokenizer, label_list))
        processed_instance[f"{sentence1_key.replace('sentence', 'predicted_roles')}"] = predicted_roles1 # Add predicted roles

        print(f"    Form 1: {' '.join(sentence1)}")
        print(f"      Predicate Mask: {predicate_mask1}")
        print(f"      Expected Roles: {expected_roles1}")
        print(f"      Predicted Roles: {predicted_roles1}")

        if predicted_roles1 != expected_roles1:
            current_instance_failed = True
            type_1_failure_per_alternation_pair += 1
            print("      -> FAILURE for Form 1")
        else:
            print("      -> PASS for Form 1")

        # Test the second alternation form
        sentence2_key = list(test_instance.keys())[3]
        predicate_mask2_key = list(test_instance.keys())[4]
        expected_roles2_key = list(test_instance.keys())[5]

        sentence2 = test_instance[sentence2_key]
        predicate_mask2 = test_instance[predicate_mask2_key]
        expected_roles2 = test_instance[expected_roles2_key]

        predicted_roles2 = list(perform_srl(sentence2, predicate_mask2, model, tokenizer, label_list))
        processed_instance[f"{sentence2_key.replace('sentence', 'predicted_roles')}"] = predicted_roles2 # Add predicted roles

        print(f"    Form 2: {' '.join(sentence2)}")
        print(f"      Predicate Mask: {predicate_mask2}")
        print(f"      Expected Roles: {expected_roles2}")
        print(f"      Predicted Roles: {predicted_roles2}")

        if predicted_roles2 != expected_roles2:
            current_instance_failed = True
            type_2_failure_per_alternation_pair += 1
            print("      -> FAILURE for Form 2")
        else:
            print("      -> PASS for Form 2")

        if current_instance_failed:
            failures_per_alternation_pair += 1
            processed_instance["failure"] = "true"
        else:
            processed_instance["failure"] = "false"
        
        processed_test_cases.append(processed_instance)

    total_alternation_pairs = len(test_cases)
    type_1_failure_rate = (type_1_failure_per_alternation_pair / total_alternation_pairs) * 100 if total_alternation_pairs > 0 else 0
    type_2_failure_rate = (type_2_failure_per_alternation_pair / total_alternation_pairs) * 100 if total_alternation_pairs > 0 else 0
    failure_rate = (failures_per_alternation_pair / total_alternation_pairs) * 100 if total_alternation_pairs > 0 else 0

    # Prepare the output data structure
    output_data = [
        {
            "id": capability_info["id"],
            "capability": capability_name,
            "description": capability_info["description"],
            "total_alternation_pairs": total_alternation_pairs,
            "failures_in_alternation_pairs": failures_per_alternation_pair,
            "type_1_failures": type_1_failure_per_alternation_pair,
            "type_1_failure_rate": f"{type_1_failure_rate:.2f}%",
            "type_2_failures": type_2_failure_per_alternation_pair,
            "type_2_failure_rate": f"{type_2_failure_rate:.2f}%",
            "failure_rate_percent": f"{failure_rate:.2f}%",
            "test_cases": processed_test_cases
        }
    ]

    # Define the output file path
    output_file_name = os.path.basename(json_file_path).replace(".json", "_results.json")
    output_file_path = os.path.join(output_directory, output_file_name)

    # Save the results to a new JSON file
    with open(output_file_path, 'w', encoding='utf-8') as outfile:
        json.dump(output_data, outfile, indent=2, ensure_ascii=False)
    print(f"\nResults saved to: {output_file_path}")

    return {
        "capability": capability_name,
        "total_alternation_pairs": total_alternation_pairs,
        "failures_in_alternation_pairs": failures_per_alternation_pair,
        "failure_rate_percent": f"{failure_rate:.2f}%"
    }

In [36]:
# Challenge-set files to evaluate, one per alternation capability.
json_files = [
    "active_passive_voice_alternation.json",
    "dative_alternation.json",
    "locative_alternation.json",
    "causative_inchoative_alternation.json",
    "temporal_placement_alternation.json"
]

output_dir = "second_srl_model_results" # Directory to save new JSONs
summary_results = []

print(f"Saving results to directory: {output_dir}")
for json_file in json_files:
    result = test_srl_capability_and_save(json_file, output_dir)
    if not result:
        # The file could not be loaded; the function has already printed why.
        continue
    summary_results.append(result)

Saving results to directory: second_srl_model_results

--- Testing Capability: Active/Passive Voice Alternation ---

  Instance 1:
    Form 1: The dog chased the ball .
      Predicate Mask: [0, 0, 1, 0, 0, 0]
      Expected Roles: ['O', 'ARG0', 'V', 'O', 'ARG1', 'O']
      Predicted Roles: ['O', 'ARG0', 'V', 'O', 'ARG1', 'O']
      -> PASS for Form 1
    Form 2: The ball was chased by the dog .
      Predicate Mask: [0, 0, 0, 1, 0, 0, 0, 0]
      Expected Roles: ['O', 'ARG1', 'O', 'V', 'O', 'O', 'ARG0', 'O']
      Predicted Roles: ['O', 'ARG1', 'O', 'V', 'O', 'O', 'ARG0', 'O']
      -> PASS for Form 2

  Instance 2:
    Form 1: Anna loves Benjamin .
      Predicate Mask: [0, 1, 0, 0]
      Expected Roles: ['ARG0', 'V', 'ARG1', 'O']
      Predicted Roles: ['ARG0', 'V', 'ARG1', 'O']
      -> PASS for Form 1
    Form 2: Benjamin is loved by Anna .
      Predicate Mask: [0, 0, 1, 0, 0, 0]
      Expected Roles: ['ARG1', 'O', 'V', 'O', 'ARG0', 'O']
      Predicted Roles: ['ARG1', 'O', 'V', 

In [37]:
# Print the per-capability summary collected by the evaluation loop.
print("\n--- Overall Summary of All Capability Tests ---")
for capability_summary in summary_results:
    print(
        f"Capability: {capability_summary['capability']}",
        f"  Total Alternation Pairs: {capability_summary['total_alternation_pairs']}",
        f"  Failures in Alternation Pairs: {capability_summary['failures_in_alternation_pairs']}",
        f"  Failure Rate: {capability_summary['failure_rate_percent']}",
        "-" * 30,
        sep="\n",
    )


--- Overall Summary of All Capability Tests ---
Capability: Active/Passive Voice Alternation
  Total Alternation Pairs: 10
  Failures in Alternation Pairs: 1
  Failure Rate: 10.00%
------------------------------
Capability: Dative Alternation (Ditransitive Verbs)
  Total Alternation Pairs: 10
  Failures in Alternation Pairs: 7
  Failure Rate: 70.00%
------------------------------
Capability: Locative Alternation
  Total Alternation Pairs: 10
  Failures in Alternation Pairs: 10
  Failure Rate: 100.00%
------------------------------
Capability: Causative/Inchoative Alternation
  Total Alternation Pairs: 10
  Failures in Alternation Pairs: 0
  Failure Rate: 0.00%
------------------------------
Capability: Temporal Adverbial Fronting/Backing (ARGM-TMP)
  Total Alternation Pairs: 10
  Failures in Alternation Pairs: 1
  Failure Rate: 10.00%
------------------------------


In [9]:

# Single-sentence smoke test: the mask marks "chased" (index 2) as the predicate.
sentence = "The dog chased the ball in the park .".split()
predicate_mask = [0, 0, 1, 0, 0, 0, 0, 0, 0]

arguments = perform_srl(
    sentence,
    predicate_mask,
    model,
    tokenizer,
    label_list,
)
print("Argument roles:", arguments)

Argument roles: ['O', 'ARG0', 'V', 'O', 'ARG1', 'O', 'O', 'ARGM-LOC', 'O']
