In [1]:
import json
import random
from dataclasses import dataclass
from typing import List, Dict, Callable, Optional

import numpy as np

# For NLI zero-shot
try:
    from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
except ImportError:
    AutoModelForSequenceClassification = None
    AutoTokenizer = None
    pipeline = None

# For calibration
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import brier_score_loss


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
{
  "SKILLS": [
    {
      "perceptual": [
        {
          "name": "Speaker Identification",
          "description": "Recognizing, counting, or detecting speakers or speech events are present in audio.",
          "example": "How many speakers are there in the audio??"
        },
        {
          "name": "Speaker Demographics",
          "description": "Identifying characteristics of speakers like age, sex, or background.",
          "example": "How old is the second speaker?"
        },
        {
          "name": "Language Identification",
          "description": "Determining the language, dialect, or accent spoken.",
          "example": "Based how they talk, where are the interviewees from?"
        },
        {
          "name": "Lexical and Phrase-Level Recognition",
          "description": "Identifying words and short phrases accurately from speech.",
          "example": "What is the term the speaker uses to describe themselves?"
        },
        {
          "name": "Prosody Detection",
          "description": "Recognizing rhythm, stress, intonation, and emphasis in speech.",
          "example": "What word sounds important in the answer to the question?"
        },
        {
          "name": "Paralinguistic/Emotion Recognition",
          "description": "Detecting emotions or non-verbal cues from voice.",
          "example": "How confidently does the interviewee answer the question?"
        },
        {
          "name": "Speech Activity, Turn-Taking and Overlap Detection",
          "description": "Identifying who speaks when, how turns are exchanged, and detecting overlapping speech.",
          "example": "Who decides to end the conversation?"
        },
        {
          "name": "Audio Quality, Artifacts & Channel Characteristics",
          "description": "Recognizing sound quality issues, background noise, or distortions to speech.",
          "example": "Based on how they talk, what illness does the speaker have?"
        }
      ],
      "reasoning": [
        {
          "name": "Social Role and Relationship Inference",
          "description": "Inferring relationships among speakers.",
          "example": "What is the nickname given to the friend of the first speaker"
        },
        {
          "name": "Speaker Intent, Pragmatics and Causal Reasoning",
          "description": "Understanding why something was said, implied meanings, and cause-effect relationships.",
          "example": "Why does the first speaker slowly repeat everything the second speaker says"
        },
        {
          "name": "Quantitative Reasoning (Counting/Arithmetic Comparison)",
          "description": "Using numbers, counting, and basic math to understand spoken information.",
          "example": "How many times does the speaker use foul language?"
        },
        {
          "name": "Temporal and Ordering Reasoning",
          "description": "Understanding sequence, timing, and chronological order of events described.",
          "example": "Does the man speak before the woman in this clip?"
        },
        {
          "name": "Logical/Consistency Reasoning",
          "description": "Recognizing logical sequences/inconsistencies within spoken content.",
          "example": "How did the speaker misinterpret the directions given by the GPS?"
        },
        {
          "name": "Cross-frontier Entity Linking",
          "description": "Connecting spoken references to external entities or concepts beyond the immediate context.",
          "example": "What does the first speaker do that implies they are the CEO of the company?"
        },
        {
          "name": "Ground Truth and World Knowledge Integration",
          "description": "Using general knowledge to interpret and verify spoken content.",
          "example": "What is the capital of the country mentioned by the speaker?"
        },
        {
          "name": "Contextual/Causal Scenario Reasoning",
          "description": "Understanding situations or events described, including cause-and-effect relationships within a scenario.",
          "example": "What would happen to the second speaker if the things the first speaker says happen"
        },
        {
          "name": "Semantic Abstraction and Summarization",
          "description": "Identifying main ideas, themes, or concise summaries from spoken content.",
          "example": "What is the meaning of the poem read by the speaker?"
        },
        {
          "name": "Comparative and Preference-Based Judgments",
          "description": "Evaluating and comparing spoken information, identifying preferences or rankings.",
          "example": "Which of the phrases mentioned by the speaker is the shortest"
        }
      ]
    }
  ]
}

{'SKILLS': [{'perceptual': [{'name': 'Speaker Identification',
     'description': 'Recognizing, counting, or detecting speakers or speech events are present in audio.',
     'example': 'How many speakers are there in the audio??'},
    {'name': 'Speaker Demographics',
     'description': 'Identifying characteristics of speakers like age, sex, or background.',
     'example': 'How old is the second speaker?'},
    {'name': 'Language Identification',
     'description': 'Determining the language, dialect, or accent spoken.',
     'example': 'Based how they talk, where are the interviewees from?'},
    {'name': 'Lexical and Phrase-Level Recognition',
     'description': 'Identifying words and short phrases accurately from speech.',
     'example': 'What is the term the speaker uses to describe themselves?'},
    {'name': 'Prosody Detection',
     'description': 'Recognizing rhythm, stress, intonation, and emphasis in speech.',
     'example': 'What word sounds important in the answer to 

In [3]:
class NLITagger:
    """Use an MNLI model to score (question, skill) pairs.

    The hypothesis template: "The question requires: {skill.description}"
    """
    def __init__(self, model_name: str = "microsoft/deberta-v3-large-mnli", device: int = -1):
        if pipeline is None:
            raise ImportError("transformers not installed. Please pip install transformers.")
        self.pipe = pipeline("text-classification", model=model_name, tokenizer=model_name, device=device, return_all_scores=True)
        # Map label names to a consistent order
        self.label_map = {"ENTAILMENT": "yes", "CONTRADICTION": "no", "NEUTRAL": "maybe"}
    
    def score(self, question: str, skill_desc: str) -> float:
        hyp = f"The question requires: {skill_desc}"
        outputs = self.pipe({"text": question, "text_pair": hyp})
        probs = {o['label'].lower(): o['score'] for o in outputs}
        entail = probs.get("yes") or probs.get("ENTAILMENT") or 0.0
        return float(entail)
    
    def tag(self, question: str, skills: List[Dict], threshold: float = 0.5):
        results = []
        for s in skills:
            p = self.score(question, s["description"])
            tag = "YES" if p >= threshold else "NO"
            results.append({"name": s["name"], "tag": tag, "confidence": p})
        return {"skills": results}


In [5]:
LLM_PROMPT_TEMPLATE = (
    "You are a classifier. Follow the JSON schema exactly.\n\n"
    "Question:\n{question}\n\n"
    # "Question-ID:\n{ID}\n\n"
    "Skills:\n{skills_list}\n\n"
    "Instruction:\nFor each skill, answer with:\n"
    '- tag: "YES" or "NO"\n'
    "- confidence: float between 0 and 1\n"
    "- brief_rationale: <= 20 words\n\n"
    "Return ONLY valid JSON of the form:\n"
    "{{\n"
    '  "skills": [\n'
    '    {{"name": "...", "tag": "YES|NO", "confidence": 0.0-1.0, "brief_rationale": "..."}}\n'
    "  ]\n"
    "}}\n"
)

def build_skills_list(skills: List[Dict]) -> str:
    lines = []
    for i, s in enumerate(skills, 1):
        lines.append(f"{i}. {s['name']} — {s['description']}")
    return "\n".join(lines)

def call_llm_zero_shot(question: str, skills: List[Dict]) -> Dict:
    prompt = LLM_PROMPT_TEMPLATE.format(question=question, skills_list=build_skills_list(skills))

    return {
        "skills": [
            {"name": s["name"], "tag": random.choice(["YES", "NO"]), "confidence": round(random.uniform(0.4, 0.9), 2), "brief_rationale": "placeholder"}
            for s in skills
        ]
    }


In [6]:
def self_consistency_vote(question: str, skills: List[Dict], infer_fn: Callable[[str, List[Dict]], Dict], k: int = 5):
    """Run k stochastic passes and aggregate majority vote and mean probabilities.

    infer_fn should return dict with structure {"skills":[{"name":..,"tag":..,"confidence":..},...]}

    """
    votes = {s['name']: [] for s in skills}
    confs = {s['name']: [] for s in skills}
    for _ in range(k):
        out = infer_fn(question, skills)
        for item in out['skills']:
            votes[item['name']].append(item['tag'])
            confs[item['name']].append(item.get('confidence', 0.5))
    results = []
    for s in skills:
        name = s['name']
        yes_count = votes[name].count('YES')
        no_count = votes[name].count('NO')
        tag = 'YES' if yes_count >= no_count else 'NO'
        conf = float(np.mean(confs[name]))
        results.append({"name": name, "tag": tag, "confidence": conf, "votes": votes[name]})
    return {"skills": results}


In [7]:
class TemperatureScaler:
    """Simple temperature scaling for binary probabilities."""
    def __init__(self):
        self.temperature_ = 1.0

    def fit(self, probs: np.ndarray, labels: np.ndarray):
        X = np.log(probs / (1 - probs)).reshape(-1, 1)
        lr = LogisticRegression(solver='lbfgs')
        lr.fit(X, labels)
        # temperature = 1 / coef
        self.temperature_ = 1.0 / lr.coef_[0][0]

    def transform(self, probs: np.ndarray) -> np.ndarray:
        logits = np.log(probs / (1 - probs))
        scaled = 1 / (1 + np.exp(-logits / self.temperature_))
        return scaled


In [8]:
def sanity_check(results: Dict) -> Dict:
    """Example consistency rule: if 'Speaker Identification' is NO, then 'Speech Activity...' shouldn't be YES.

    Add arbitrary domain rules here.
    """
    rmap = {r['name']: r for r in results['skills']}
    if rmap['Speaker Identification']['tag'] == 'NO' and rmap['Speech Activity, Turn-Taking and Overlap Detection']['tag'] == 'YES':
        # downgrade to NO with low confidence
        rmap['Speech Activity, Turn-Taking and Overlap Detection']['tag'] = 'NO'
        rmap['Speech Activity, Turn-Taking and Overlap Detection']['confidence'] = min(
            rmap['Speech Activity, Turn-Taking and Overlap Detection']['confidence'], 0.49
        )
    return {"skills": list(rmap.values())}


In [9]:
QUESTION = "In the recording, who speaks after the host interrupts, and how many different guests are there overall?"

raw_results = call_llm_zero_shot(QUESTION, SKILLS)


def approved_skill_tags(
    question: str,
    skills=SKILLS,
    infer_fn=call_llm_zero_shot,   
    k: int = 5,                    
    threshold: float = 0.5         
):
    """
    Returns a list of approved skill dicts (tag == 'YES') with name & confidence only.
    Votes, rationales, etc. are stripped out.
    """
    # Aggregate with self-consistency
    agg = self_consistency_vote(question, skills, infer_fn, k=k)


    final = sanity_check(agg)

    approved = [
        {"name": s["name"], "confidence": s.get("confidence", 0.0)}
        for s in final["skills"]
        if s["tag"] == "YES"
    ]
    return approved
    
QUESTION = "In the recording, who speaks after the host interrupts, and how many different guests are there overall?"
approved = approved_skill_tags(QUESTION)
print(approved)

agg_results = self_consistency_vote(QUESTION, SKILLS, call_llm_zero_shot, k=5)


final_results = sanity_check(agg_results)

print(json.dumps(final_results, indent=2))


NameError: name 'SKILLS' is not defined

In [None]:
# === Minimal cell: run tagging and return ONLY approved skills ===

def approved_skill_tags(
    question: str,
    skills=SKILLS,
    infer_fn=call_llm_zero_shot,   # swap with NLI or other backend if desired
    k: int = 5,                    # self-consistency passes
    threshold: float = 0.5         # (used only if your infer_fn returns raw probs)
):
    """
    Returns a list of approved skill dicts (tag == 'YES') with name & confidence only.
    Votes, rationales, etc. are stripped out.
    """
    # 1) Aggregate with self-consistency
    agg = self_consistency_vote(question, skills, infer_fn, k=k)

    # 2) Sanity / logic checks
    final = sanity_check(agg)

    # 3) Keep only YES-tagged skills
    approved = [
        {"name": s["name"], "confidence": s.get("confidence", 0.0)}
        for s in final["skills"]
        if s["tag"] == "YES"
    ]
    return approved

# ---- Example usage ----
QUESTION = "Why does the man in this say wait? answer: To open the door"
approved = approved_skill_tags(QUESTION)
print(approved)


[{'name': 'Speaker Identification', 'confidence': 0.6199999999999999}, {'name': 'Language Identification', 'confidence': 0.6040000000000001}, {'name': 'Lexical and Phrase-Level Recognition', 'confidence': 0.5940000000000001}, {'name': 'Prosody Detection', 'confidence': 0.63}, {'name': 'Paralinguistic/Emotion Recognition', 'confidence': 0.688}, {'name': 'Audio Quality, Artifacts & Channel Characteristics', 'confidence': 0.542}, {'name': 'Speaker Intent, Pragmatics and Causal Reasoning', 'confidence': 0.6799999999999999}, {'name': 'Temporal and Ordering Reasoning', 'confidence': 0.7}, {'name': 'Logical/Consistency Reasoning', 'confidence': 0.712}, {'name': 'Cross-frontier Entity Linking', 'confidence': 0.688}]


In [None]:
# Cell: Separate tagging for perceptual and reasoning skills

def approved_perceptual_tags(
    question: str,
    perceptual_skills=SKILLS[:8],
    infer_fn=call_llm_zero_shot,
    k: int = 5
):
    """
    Returns a list of approved perceptual skill dicts (tag == 'YES')
    with name & confidence only.
    """
    # Aggregate with self-consistency
    agg = self_consistency_vote(question, perceptual_skills, infer_fn, k=k)
    # Sanity check (domain rules for perceptual tasks)
    final = sanity_check(agg)
    # Filter YES tags
    return [
        {"name": s["name"], "confidence": s.get("confidence", 0.0)}
        for s in final["skills"]
        if s["tag"] == "YES"
    ]

def approved_reasoning_tags(
    question: str,
    reasoning_skills=SKILLS[8:],
    infer_fn=call_llm_zero_shot,
    k: int = 5
):
    """
    Returns a list of approved reasoning skill dicts (tag == 'YES')
    with name & confidence only.
    Independent of perceptual tagging.
    """
    # Aggregate with self-consistency
    agg = self_consistency_vote(question, reasoning_skills, infer_fn, k=k)
    # No shared sanity rules applied here to keep independence
    final = agg
    # Filter YES tags
    return [
        {"name": s["name"], "confidence": s.get("confidence", 0.0)}
        for s in final["skills"]
        if s["tag"] == "YES"
    ]

# ---- Example usage ----
QUESTION = "In the recording, who speaks after the host interrupts, and how many different guests are there overall?"

perceptual_results = approved_perceptual_tags(QUESTION)
reasoning_results   = approved_reasoning_tags(QUESTION)

print("Approved Perceptual Skills:\n", perceptual_results)
print("Approved Reasoning Skills:\n",   reasoning_results)


Approved Perceptual Skills:
 [{'name': 'Language Identification', 'confidence': 0.62}, {'name': 'Paralinguistic/Emotion Recognition', 'confidence': 0.58}]
Approved Reasoning Skills:
 [{'name': 'Quantitative Reasoning (Counting/Arithmetic Comparison)', 'confidence': 0.772}, {'name': 'Logical/Consistency Reasoning', 'confidence': 0.612}]


In [None]:
# Utility: export results to JSON file
def save_results(results: Dict, path: str = "results.json"):
    with open(path, 'w') as f:
        json.dump(results, f, indent=2)
    return path

# Example (commented):
save_results(final_results, "example_results.json")


'example_results.json'