In [3]:
%pip install torch --index-url https://download.pytorch.org/whl/cu121  # adjust cu version
%pip install sentence-transformers cross-encoder


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Looking in indexes: https://download.pytorch.org/whl/cu121
Note: you may need to restart the kernel to use updated packages.


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[31mERROR: Could not find a version that satisfies the requirement cross-encoder (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for cross-encoder[0m[31m
[0mNote: you may need to restart the kernel to use updated packages.


In [3]:
import json
import torch
from tqdm.auto import tqdm
import numpy as np
from sentence_transformers import SentenceTransformer, CrossEncoder

# --- Device setup ---
# Use the GPU when PyTorch can see one; the models created below are placed on this device.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# --- Load data (adjust filename as needed) ---
# JSON text is UTF-8 by specification. Passing the encoding explicitly keeps the load
# independent of the platform's default locale encoding (which is not UTF-8 everywhere).
with open('MMAR-meta.json', 'r', encoding='utf-8') as f:
    # The tagging loop iterates this object and reads each item with .get(...).
    data = json.load(f)

# --- Define skills ---
# Skill taxonomy used for tagging, split into two groups keyed "perceptual" and
# "reasoning". Each skill is a dict with three string fields:
#   "name"        - label written to the output and used as the key of the score maps
#   "description" - one-sentence definition of the skill
#   "example"     - a sample question for the skill
# All three fields are interpolated into the NLI hypothesis strings, and
# name + description form the text that is embedded, so editing any wording here
# changes the model inputs and therefore the resulting scores.
SKILLS = {
    "perceptual": [
        {"name": "Speaker Identification", "description": "Recognizing, counting, or detecting speakers or speech events are present in audio.", "example": "How many speakers are there in the audio?"},
        {"name": "Speaker Demographics", "description": "Identifying characteristics of speakers like age, sex, or background.", "example": "How old is the second speaker?"},
        {"name": "Language Identification", "description": "Determining the language, dialect, or accent spoken.", "example": "Based how they talk, where are the interviewees from?"},
        {"name": "Lexical and Phrase-Level Recognition", "description": "Identifying words and short phrases accurately from speech.", "example": "What is the term the speaker uses to describe themselves?"},
        {"name": "Prosody Detection", "description": "Recognizing rhythm, stress, intonation, and emphasis in speech.", "example": "What word sounds important in the answer to the question?"},
        {"name": "Paralinguistic/Emotion Recognition", "description": "Detecting emotions or non-verbal cues from voice.", "example": "How confidently does the interviewee answer the question?"},
        {"name": "Speech Activity, Turn-Taking and Overlap Detection", "description": "Identifying who speaks when, how turns are exchanged, and detecting overlapping speech.", "example": "Who decides to end the conversation?"},
        {"name": "Audio Quality, Artifacts & Channel Characteristics", "description": "Recognizing sound quality issues, background noise, or distortions to speech.", "example": "Based on how they talk, what illness does the speaker have?"}
    ],
    "reasoning": [
        {"name": "Social Role and Relationship Inference", "description": "Inferring relationships among speakers.", "example": "What is the nickname given to the friend of the first speaker"},
        {"name": "Speaker Intent, Pragmatics and Causal Reasoning", "description": "Understanding why something was said, implied meanings, and cause-effect relationships.", "example": "Why does the first speaker slowly repeat everything the second speaker says"},
        {"name": "Quantitative Reasoning (Counting/Arithmetic Comparison)", "description": "Using numbers, counting, and basic math to understand spoken information.", "example": "How many times does the speaker use foul language?"},
        {"name": "Temporal and Ordering Reasoning", "description": "Understanding sequence, timing, and chronological order of events described.", "example": "Does the man speak before the woman in this clip?"},
        {"name": "Logical/Consistency Reasoning", "description": "Recognizing logical sequences/inconsistencies within spoken content.", "example": "How did the speaker misinterpret the directions given by the GPS?"},
        {"name": "Cross-frontier Entity Linking", "description": "Connecting spoken references to external entities or concepts beyond the immediate context.", "example": "What does the first speaker do that implies they are the CEO of the company?"},
        {"name": "Ground Truth and World Knowledge Integration", "description": "Using general knowledge to interpret and verify spoken content.", "example": "What is the capital of the country mentioned by the speaker?"},
        {"name": "Contextual/Causal Scenario Reasoning", "description": "Understanding situations or events described, including cause-and-effect relationships within a scenario.", "example": "What would happen to the second speaker if the things the first speaker says happen"},
        {"name": "Semantic Abstraction and Summarization", "description": "Identifying main ideas, themes, or concise summaries from spoken content.", "example": "What is the meaning of the poem read by the speaker?"},
        {"name": "Comparative and Preference-Based Judgments", "description": "Evaluating and comparing spoken information, identifying preferences or rankings.", "example": "Which of the phrases mentioned by the speaker is the shortest"}
    ]
}

# --- Initialize models ---
# Checkpoint identifiers are named up front so they are easy to spot and swap.
NLI_CHECKPOINT = 'cross-encoder/nli-deberta-base'
EMBED_CHECKPOINT = "sentence-transformers/all-MiniLM-L6-v2"

# Cross-encoder used to score (premise, hypothesis) pairs.
nli_model = CrossEncoder(NLI_CHECKPOINT, device=device)
# Sentence encoder used to embed skill texts and human labels.
embed_model = SentenceTransformer(EMBED_CHECKPOINT, device=device)

# --- Embedding prep for human label mapping ---
def skill_repr(skill):
    """Return the text used to embed a skill: ``"<name>: <description>"``.

    Args:
        skill: mapping with at least the keys ``"name"`` and ``"description"``.

    Returns:
        The two fields joined by ``": "``.
    """
    return "{}: {}".format(skill["name"], skill["description"])

# Names, embedding texts and unit-normalised embeddings for each skill group.
# The three lists of a group share the same order, so an index into the embeddings
# is also an index into the names.
perc_names = [entry["name"] for entry in SKILLS["perceptual"]]
reas_names = [entry["name"] for entry in SKILLS["reasoning"]]
perc_skill_texts = list(map(skill_repr, SKILLS["perceptual"]))
reas_skill_texts = list(map(skill_repr, SKILLS["reasoning"]))
perc_embeddings = embed_model.encode(perc_skill_texts, normalize_embeddings=True)
reas_embeddings = embed_model.encode(reas_skill_texts, normalize_embeddings=True)

def map_human_labels_to_defined(human_labels, category):
    """Map free-text human labels onto the closest defined skill names.

    Each label is embedded with ``embed_model`` and matched to the skill whose
    pre-computed embedding has the largest dot product with it.

    Args:
        human_labels: iterable of label strings; ``None`` or empty yields an empty set.
        category: ``"perceptual"`` selects the perceptual skills; any other value
            selects the reasoning skills.

    Returns:
        A set with the best-matching skill name for every input label.
    """
    use_perceptual = category == "perceptual"
    candidate_embeddings = perc_embeddings if use_perceptual else reas_embeddings
    candidate_names = perc_names if use_perceptual else reas_names

    def closest_name(label):
        # Both sides are normalised at encode time, so the dot product ranks by cosine similarity.
        label_vec = embed_model.encode(label, normalize_embeddings=True)
        return candidate_names[int(np.argmax(np.dot(candidate_embeddings, label_vec)))]

    return {closest_name(label) for label in (human_labels or [])}

# --- NLI batching ---
def _entailment_column(model, default=1):
    """Return the output column labelled "entailment" in the model config, else ``default``."""
    config = getattr(model, "config", None)
    if config is None:
        # Some wrappers expose the config only on the wrapped transformer.
        config = getattr(getattr(model, "model", None), "config", None)
    id2label = getattr(config, "id2label", None) or {}
    for idx, label in id2label.items():
        if str(label).lower() == "entailment":
            return int(idx)
    return default


def batch_entailment_scores(premises, hypotheses, batch_size=32, entailment_index=None, model=None):
    """Return the entailment probability for each (premise, hypothesis) pair.

    The previous implementation read column 2 of the softmax output. For the
    ``cross-encoder/nli-deberta-base`` checkpoint the documented label order is
    ``['contradiction', 'entailment', 'neutral']``, so column 2 is the *neutral*
    probability. The entailment column is now resolved from the model's
    ``id2label`` mapping, falling back to 1.
    NOTE(review): thresholds tuned against the old column should be re-validated.

    Args:
        premises: sequence of premise strings.
        hypotheses: sequence of hypothesis strings, same length as ``premises``.
        batch_size: number of pairs passed to the model per ``predict`` call.
        entailment_index: output column holding the entailment class. ``None``
            (default) looks it up in the model config; pass ``2`` to reproduce the
            legacy behaviour.
        model: object with ``predict(pairs, batch_size=...)`` returning one row of
            class logits per pair. Defaults to the module-level ``nli_model``.

    Returns:
        A list of Python floats, one probability per pair, in input order.
        Empty input yields an empty list without touching the model.

    Raises:
        AssertionError: if ``premises`` and ``hypotheses`` differ in length.
    """
    assert len(premises) == len(hypotheses)
    pairs = list(zip(premises, hypotheses))
    if not pairs:
        return []
    if model is None:
        model = nli_model
    if entailment_index is None:
        entailment_index = _entailment_column(model)

    entailment_probs = []
    with torch.no_grad():
        for start in range(0, len(pairs), batch_size):
            batch = pairs[start: start + batch_size]
            logits = model.predict(batch, batch_size=len(batch))
            # Softmax over the class axis turns the logits into probabilities.
            probs = torch.softmax(torch.tensor(logits), dim=-1)
            entailment_probs.extend(probs[:, entailment_index].tolist())
    return entailment_probs

def best_skill_from_cat_label(cat_label, skills_list, category_name):
    """Pick the skill that scores highest against a dataset category label.

    The label is turned into a premise sentence and scored with
    ``batch_entailment_scores`` against one hypothesis per skill.

    Args:
        cat_label: category / sub-category label text; a falsy value short-circuits.
        skills_list: list of skill dicts with ``"name"``, ``"description"`` and
            ``"example"`` keys.
        category_name: ``"perceptual"`` words the hypotheses as perceptual skills;
            any other value words them as reasoning skills. The value is also
            inserted verbatim into the premise.

    Returns:
        The name of the top-scoring skill, or ``None`` when ``cat_label`` is falsy.
    """
    if not cat_label:
        return None

    is_perceptual = category_name == "perceptual"
    candidate_names = [skill["name"] for skill in skills_list]
    candidate_hypotheses = [
        (
            f"This requires perceptual skill: {skill['name']}. {skill['description']} Example: {skill['example']}"
            if is_perceptual
            else f"This requires reasoning skill: {skill['name']}. {skill['description']} Example: {skill['example']}"
        )
        for skill in skills_list
    ]

    # The same premise is paired with every hypothesis.
    label_premise = f"The labeled {category_name} category is '{cat_label}'."
    repeated_premises = [label_premise] * len(candidate_hypotheses)
    pair_scores = batch_entailment_scores(repeated_premises, candidate_hypotheses, batch_size=16)
    return candidate_names[int(np.argmax(pair_scores))]

# --- Safe string helper ---
def safe_str(x):
    """Coerce a possibly-missing value to a string.

    Args:
        x: any value.

    Returns:
        ``""`` for ``None`` or a float NaN, ``x`` itself when it is already a
        string, and ``str(x)`` for everything else.
    """
    is_missing = x is None or (isinstance(x, float) and np.isnan(x))
    if is_missing:
        return ""
    return x if isinstance(x, str) else str(x)

# --- Thresholds ---
# Minimum score a skill needs to be tagged directly; the reasoning cut-off is stricter.
# NOTE(review): these values depend on the score distribution produced by
# batch_entailment_scores - re-validate them whenever that scoring changes.
perc_thresh = 0.95
reas_thresh = 0.99

# --- Loop-invariant inputs ---
# The hypothesis texts depend only on SKILLS, not on the entry, so build them once
# instead of once per entry.
perc_hypotheses = [
    f"This requires perceptual skill: {skill['name']}. {skill['description']} Example: {skill['example']}"
    for skill in SKILLS["perceptual"]
]
reas_hypotheses = [
    f"This requires reasoning skill: {skill['name']}. {skill['description']} Example: {skill['example']}"
    for skill in SKILLS["reasoning"]
]

# Sub-category labels repeat across entries; remember the skill inferred for each
# (category, label) pair so the NLI model is queried once per distinct label.
_cat_skill_cache = {}


def _skill_for_cat_label(cat_label, category_name):
    """Memoised best_skill_from_cat_label over SKILLS[category_name]."""
    key = (category_name, cat_label)
    if key not in _cat_skill_cache:
        _cat_skill_cache[key] = best_skill_from_cat_label(
            cat_label, SKILLS[category_name], category_name
        )
    return _cat_skill_cache[key]


# --- Process entries ---
tagged_data = []

for entry in tqdm(data, desc="Tagging MMAR entries"):
    qid = entry.get("id")
    question = safe_str(entry.get("question", ""))
    answer = safe_str(entry.get("answer", ""))
    # Question and answer are scored together as a single premise.
    qa_text = f"{question} {answer}".strip()

    # No explicit human skill annotations in this format
    human_perc = set()
    human_reas = set()

    # NLI scoring against defined skill hypotheses (same premise for every hypothesis)
    perc_scores_list = batch_entailment_scores(
        [qa_text] * len(perc_hypotheses), perc_hypotheses, batch_size=16
    )
    reas_scores_list = batch_entailment_scores(
        [qa_text] * len(reas_hypotheses), reas_hypotheses, batch_size=16
    )

    perc_scores = list(zip(perc_names, perc_scores_list))
    reas_scores = list(zip(reas_names, reas_scores_list))

    # Skills whose score clears the cut-off are tagged directly.
    perceptual_skills = {name for name, score in perc_scores if score > perc_thresh}
    reasoning_skills = {name for name, score in reas_scores if score > reas_thresh}

    # include (empty) human-mapped
    perceptual_skills |= human_perc
    reasoning_skills |= human_reas

    # derive cat_p / cat_r from category / sub-category per MMAR spec
    cat_p = None
    cat_r = None
    best_from_cat_p = None
    best_from_cat_r = None

    layer = entry.get("category", "")
    subcat = entry.get("sub-category") or entry.get("sub_category")  # defensive
    if layer == "Perception Layer":
        cat_p = safe_str(subcat)
        inferred = _skill_for_cat_label(cat_p, "perceptual")
        if inferred:
            perceptual_skills.add(inferred)
            best_from_cat_p = inferred
    elif layer in ("Cultural Layer", "Semantic Layer"):
        cat_r = safe_str(subcat)
        inferred = _skill_for_cat_label(cat_r, "reasoning")
        if inferred:
            reasoning_skills.add(inferred)
            best_from_cat_r = inferred

    # Fallback to ensure at least one per category
    if not perceptual_skills:
        best_perc = max(perc_scores, key=lambda x: x[1])[0]
        perceptual_skills.add(best_perc)
    if not reasoning_skills:
        best_reas = max(reas_scores, key=lambda x: x[1])[0]
        reasoning_skills.add(best_reas)

    tagged_data.append({
        "id": qid,
        "question_answer": qa_text,
        "perceptual_skills": sorted(perceptual_skills),
        "reasoning_skills": sorted(reasoning_skills),
        "human_mapped_perceptual": sorted(human_perc),
        "human_mapped_reasoning": sorted(human_reas),
        "cat_p": cat_p,
        "cat_r": cat_r,
        "best_from_cat_p": best_from_cat_p,
        "best_from_cat_r": best_from_cat_r,
        "perceptual_scores": dict(perc_scores),
        "reasoning_scores": dict(reas_scores),
    })

# --- Save output ---
# Write every tagged record to one JSON file; indent=2 keeps it human-readable.
with open('tagged_mmar_output.json', 'w') as out_file:
    json.dump(tagged_data, fp=out_file, indent=2)

print("MMAR skill tagging complete and results saved.")


Using device: cuda


Tagging MMAR entries:   0%|          | 0/1000 [00:00<?, ?it/s]

MMAR skill tagging complete and results saved.
