In [1]:
%pip -q install transformers torch pandas numpy


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3.11 -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [2]:
import numpy as np

In [3]:
import torch
from transformers import pipeline
import pandas as pd

print("CUDA available:", torch.cuda.is_available(), "| device count:", torch.cuda.device_count())

  from .autonotebook import tqdm as notebook_tqdm


CUDA available: False | device count: 0


In [4]:
PERCEPTUAL = {
    "Speaker Identification": "Recognizing and counting how many speakers or speech events are present in audio.",
    "Speaker Demographics": "Identifying characteristics of speakers like age, sex, or background",
    "Language Identification": "Determining the language, dialect, or accent spoken.",
    "Lexical and Phrase-Level Recognition": "Identifying words and short phrases accurately from speech.",
    "Prosody Detection": "Recognizing rhythm, stress, intonation, and emphasis in speech.",
    "Paralinguistic/Emotion Recognition": "Detecting emotions or non-verbal cues (eg: laughter, anger) from voice.",
    "Speech Activity, Turn-Taking and Overlap Detection": "Identifying who speaks when, how turns are exchanged, and detecting overlapping speech.",
    "Audio Quality, Artifacts & Channel Characteristics": "Recognizing sound quality issues, background noise, distortions, or technical audio problems."
}

REASONING = {
    "Social Role and Relationship Inference": "Inferring relationships (ie family, professional roles) among speakers.",
    "Speaker Intent, Pragmatics and Causal Reasoning": "Understanding why something was said, implied meanings, and cause-effect relationships.",
    "Quantitative Reasoning (Counting/Arithmetic Comparison)": "Using numbers, counting, and basic math to understand spoken information.",
    "Temporal and Ordering Reasoning": "Understanding sequence, timing, and chronological order of events described.",
    "Logical/Consistency Reasoning": "Recognizing logical sequences/inconsistencies within spoken content.",
    "Cross-frontier Entity Linking": "Connecting spoken references to external entities or concepts beyond the immediate context.",
    "Ground Truth and World Knowledge Integration": "Using general knowledge to interpret and verify spoken content.",
    "Contextual/Causal Scenario Reasoning": "Understanding situations or events described, including cause-and-effect relationships within a scenario.",
    "Semantic Abstraction and Summarization": "Identifying main ideas, themes, or concise summaries from spoken content.",
    "Comparative and Preference-Based Judgments": "Evaluating and comparing spoken information, identifying preferences or rankings."
}


In [5]:
device_id = 0 #if torch.cuda.is_available()
zshot = pipeline(
    "zero-shot-classification",
    model="facebook/bart-large-mnli",
    device=device_id
)
print("Using device:", device_id)

Device set to use mps:0


Using device: 0


In [6]:
def pick_dynamic_top(scores: list, max_keep: int = 5):
    """
    Given a descending list of scores (0..1), return how many to keep.
    Uses a knee/drop heuristic: first large drop above mean+std of diffs.
    """
    if len(scores) == 1:
        return 1
    diffs = np.array(scores[:-1]) - np.array(scores[1:])
    thr = diffs.mean() + diffs.std()          # dynamic, data-driven
    idx = np.where(diffs >= thr)[0]
    keep_n = (idx[0] + 1) if len(idx) > 0 else min(len(scores), max_keep)
    return max(1, keep_n)

def tag_with_set(question: str, label_dict: dict) -> pd.DataFrame:
    labels = list(label_dict.keys())
    res = zshot(
        question,
        candidate_labels=labels,
        multi_label=True,
        hypothesis_template="This requires {}."
    )
    # Sort already provided sorted; ensure DataFrame
    df = pd.DataFrame({"label": res["labels"], "score": res["scores"]})
    keep_n = pick_dynamic_top(df["score"].tolist())
    return df.head(keep_n).reset_index(drop=True)

def tag_perceptual(question: str) -> pd.DataFrame:
    return tag_with_set(question, PERCEPTUAL)

def tag_reasoning(question: str) -> pd.DataFrame:
    return tag_with_set(question, REASONING)


In [7]:
per_q = "How many distinct speakers are in the clip?"
rea_q = "Why did the coach bench the player at the end?"

print("Perceptual:", per_q)
display(tag_perceptual(per_q))

print("\nReasoning:", rea_q)
display(tag_reasoning(rea_q))


Perceptual: How many distinct speakers are in the clip?


Unnamed: 0,label,score
0,Speaker Identification,0.988764
1,Language Identification,0.909924
2,"Audio Quality, Artifacts & Channel Characteris...",0.889872
3,Prosody Detection,0.862415
4,Speaker Demographics,0.797803
5,Lexical and Phrase-Level Recognition,0.759536
6,Paralinguistic/Emotion Recognition,0.676452



Reasoning: Why did the coach bench the player at the end?


Unnamed: 0,label,score
0,Logical/Consistency Reasoning,0.998278
1,Contextual/Causal Scenario Reasoning,0.993717
2,Temporal and Ordering Reasoning,0.963775
3,"Speaker Intent, Pragmatics and Causal Reasoning",0.930191
4,Comparative and Preference-Based Judgments,0.900925
5,Social Role and Relationship Inference,0.728583
6,Cross-frontier Entity Linking,0.718939
7,Ground Truth and World Knowledge Integration,0.684365
8,Quantitative Reasoning (Counting/Arithmetic Co...,0.52237


In [8]:
def batch_tag(questions, tagger):
    rows = []
    for q in questions:
        df = tagger(q)
        for _, r in df.iterrows():
            rows.append({"question": q, "skill": r.label, "score": r.score})
    return pd.DataFrame(rows)

questions = [
    "Who laughed right after the question—was it Speaker A or B?",
    "What accent does the second speaker have?",
    "How many examples support the idea that extremes are not good?"
]

display(batch_tag(questions, tag_perceptual))
display(batch_tag(questions, tag_reasoning))


Unnamed: 0,question,skill,score
0,Who laughed right after the question—was it Sp...,Speaker Identification,0.996473
1,Who laughed right after the question—was it Sp...,Paralinguistic/Emotion Recognition,0.980186
2,Who laughed right after the question—was it Sp...,Language Identification,0.979326
3,Who laughed right after the question—was it Sp...,Prosody Detection,0.963374
4,Who laughed right after the question—was it Sp...,Lexical and Phrase-Level Recognition,0.926024
5,Who laughed right after the question—was it Sp...,"Speech Activity, Turn-Taking and Overlap Detec...",0.90021
6,Who laughed right after the question—was it Sp...,"Audio Quality, Artifacts & Channel Characteris...",0.865489
7,What accent does the second speaker have?,Speaker Identification,0.995584
8,What accent does the second speaker have?,Language Identification,0.991945
9,What accent does the second speaker have?,Prosody Detection,0.980641


Unnamed: 0,question,skill,score
0,Who laughed right after the question—was it Sp...,Contextual/Causal Scenario Reasoning,0.987703
1,Who laughed right after the question—was it Sp...,Logical/Consistency Reasoning,0.98441
2,Who laughed right after the question—was it Sp...,Comparative and Preference-Based Judgments,0.953439
3,Who laughed right after the question—was it Sp...,Temporal and Ordering Reasoning,0.931267
4,Who laughed right after the question—was it Sp...,Social Role and Relationship Inference,0.922288
5,Who laughed right after the question—was it Sp...,"Speaker Intent, Pragmatics and Causal Reasoning",0.875131
6,Who laughed right after the question—was it Sp...,Cross-frontier Entity Linking,0.829973
7,What accent does the second speaker have?,Comparative and Preference-Based Judgments,0.96883
8,What accent does the second speaker have?,Contextual/Causal Scenario Reasoning,0.894529
9,What accent does the second speaker have?,Logical/Consistency Reasoning,0.866948
