In [6]:
import json
import os
import re
from collections import defaultdict

import nltk
import pandas as pd
from nltk import pos_tag, word_tokenize

# Fetch the NLTK data packages this script relies on. The download only has
# to succeed once per machine; later runs just report "already up-to-date".
for _nltk_resource in ("punkt", "averaged_perceptron_tagger"):
    nltk.download(_nltk_resource)

# === Configuration ===
# Variation labels whose answers are analysed. Any variation key found in the
# input data that is not listed here (e.g. the swapped ones) is skipped by the
# extraction loop.
valid_variations = [
    "Original",
    "Name 1 Female, Name 2 Male",
    "Name 1 Male, Name 2 Female",
    "Name 1 Female Younger, Name 2 Male Older",
    "Name 1 Male Younger, Name 2 Female Older"
]

# Maps the model name written to the "Model" column of the output to the JSON
# file holding that model's answers. The paths are relative, so the files are
# looked up in the current working directory.
files = {
    "GPT-4o": "gpt-4o_answers_RUN.json",
    "LLaMA 3.1": "llama3.1_answers_RUN.json",
    "Mistral": "mistral-small3.1_answers_RUN.json",
    "Phi-4": "phi4_answers_RUN.json",
    "Qwen2.5": "qwen2.5_32b_answers_RUN.json"
}

# === Helper Functions ===
def extract_adjectives(text):
    """Return the lower-cased adjectives found in *text*, in order of appearance.

    The text is tokenized and part-of-speech tagged with NLTK. Tokens tagged
    ``JJ``, ``JJR`` or ``JJS`` (the Penn Treebank adjective tags: base,
    comparative and superlative) are kept; duplicates are not removed.
    """
    adjective_tags = ("JJ", "JJR", "JJS")
    adjectives = []
    for token, pos in pos_tag(word_tokenize(text)):
        if pos in adjective_tags:
            adjectives.append(token.lower())
    return adjectives

def split_response(response, name1, name2):
    """Heuristically split *response* into the text about each character.

    Every whole-word, case-insensitive mention of ``name1`` or ``name2`` starts
    a segment that runs up to the next mention of either name (or the end of
    the response). Each segment is attributed to the name that opens it, and
    the segments of one name are joined with a single space. The names
    themselves and any text before the first mention are not included.

    If the two names cannot both be located (one is missing from the response,
    a name is empty, or both names are the same), the response is simply cut
    in half: first half for ``name1``, second half for ``name2``.

    The returned text always keeps the original casing of ``response`` so that
    downstream POS tagging sees the same tokens in both code paths.

    Args:
        response: The full answer text.
        name1: Name of the first character.
        name2: Name of the second character.

    Returns:
        A ``(name1_text, name2_text)`` tuple of strings.
    """
    first = name1.strip()
    second = name2.strip()

    if first and second and first.lower() != second.lower():
        # Try the longer name first so that a name which is a prefix of the
        # other one (e.g. "Ann" / "Ann Marie") cannot shadow it.
        alternatives = sorted(
            (("n1", first), ("n2", second)),
            key=lambda item: len(item[1]),
            reverse=True,
        )
        # The look-arounds restrict matches to whole words, so "Sam" is not
        # found inside "same"; "Sam's" still counts as a mention.
        pattern = re.compile(
            r"(?<!\w)(?:"
            + "|".join(
                f"(?P<{group}>{re.escape(name)})" for group, name in alternatives
            )
            + r")(?!\w)",
            re.IGNORECASE,
        )
        mentions = list(pattern.finditer(response))

        if {m.lastgroup for m in mentions} == {"n1", "n2"}:
            segments = {"n1": [], "n2": []}
            for current, following in zip(mentions, mentions[1:] + [None]):
                stop = following.start() if following is not None else len(response)
                segments[current.lastgroup].append(response[current.end():stop])
            return " ".join(segments["n1"]), " ".join(segments["n2"])

    # Fallback: no reliable name anchors, so use the first/second half.
    midpoint = len(response) // 2
    return response[:midpoint], response[midpoint:]

# === Main Extraction ===
# Builds one row per (model, scenario, variation, question, character) holding
# the adjectives found in the part of the answer attributed to that character.
# Each input file is read as: scenario id -> variation label -> question key
# -> answer text.
results = []

for model_name, filename in files.items():
    print(f"Processing {model_name}...")

    with open(filename, "r", encoding="utf-8") as f:
        data = json.load(f)

    for scenario_id, variations in data.items():
        # The character names depend only on the scenario id, so derive them
        # once per scenario. The unpacking raises ValueError unless the id
        # splits into exactly two parts; generic labels are used in that case.
        try:
            name1, name2 = scenario_id.split(" and ")
        except ValueError:
            name1, name2 = "Name 1", "Name 2"

        for variation, qa_dict in variations.items():
            if variation not in valid_variations:
                continue

            # Scan the answers to questions 1-5; questions missing from a
            # variation are skipped.
            for q_key in ["Question_1", "Question_2", "Question_3", "Question_4", "Question_5"]:
                if q_key not in qa_dict:
                    continue

                answer = qa_dict[q_key]

                # Split response by name heuristics
                part1, part2 = split_response(answer, name1, name2)

                # One row per character, name1 first, with the adjectives
                # extracted from that character's part of the answer.
                for character, part in ((name1, part1), (name2, part2)):
                    results.append({
                        "Model": model_name,
                        "Scenario": scenario_id,
                        "Variation": variation,
                        "Question": q_key,
                        "Character": character,
                        "Adjectives": extract_adjectives(part)
                    })

# === Save to CSV ===
df = pd.DataFrame(results)
df.to_csv("character_adjectives_by_variation.csv", index=False)
print("✅ Adjective extraction complete. Results saved to character_adjectives_by_variation.csv.")



[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\SGhanbariHaez\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\SGhanbariHaez\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


Processing GPT-4o...
Processing LLaMA 3.1...
Processing Mistral...
Processing Phi-4...
Processing Qwen2.5...
✅ Adjective extraction complete. Results saved to character_adjectives_by_variation.csv.
