# Features

## Input Data Structure

```json
{
    "metadata": {...}
    "transcription": {
      'full text as string', 
      "segments": [
        {
          "start": 0.0,
          "end": 30.0,
          "text": "...",
          "words": [],
          "speaker": "SPEAKER_00 / SPEAKER_01",
          "speaker_confidence": 0.736
        }
      }
    ]
}
```

Segments happen every 30 seconds. Only classifies one speaker per segment?

In [1]:
import pandas as pd
import numpy as np
import re
import spacy
import os
import json
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from transformers import pipeline
from collections import Counter
from itertools import combinations
import math
import nltk
from nltk.corpus import stopwords

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from datasets import Dataset, DatasetDict

from transformers import (RobertaTokenizerFast, 
                          RobertaForSequenceClassification, 
                          TrainingArguments, 
                          Trainer)
from sklearn.metrics import accuracy_score, f1_score


from convokit import PolitenessStrategies

nltk.download('stopwords')
nlp = spacy.load("en_core_web_sm")

W1207 20:15:28.046000 10540 site-packages\torch\distributed\elastic\multiprocessing\redirects.py:29] NOTE: Redirects are currently not supported in Windows or MacOs.





  import pkg_resources
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hofin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [20]:
file_name = "./diarizations/the_tucker_carlson_show/whisper_small/transcript_per_show/gina_carano.json"
output_name = "new_format_test3.json"

### Total Questions in episode
Regex better than model

In [21]:
#https://huggingface.co/shahrukhx01/question-vs-statement-classifier
model_name = "shahrukhx01/question-vs-statement-classifier"
classifier = pipeline("text-classification", model=model_name)

# Function to split full text into sentences
def split_into_sentences(text):
    doc = nlp(text)
    sentences = [sent.text.strip() for sent in doc.sents]
    return sentences

# Function to classify if a sentence is a question using the model
def is_question_model(sentence):
    result = classifier(sentence)
    label = result[0]['label']
    score = result[0]['score']
    has_qmark = bool(re.search(r'\?\s*$', sentence))
    return (label == "LABEL_1" and score > 0.8) or has_qmark

# Function to classify if a sentence is a question using regex only
def is_question(sentence):
    pattern = r"\?\s*$" 
    return bool(re.search(pattern, sentence))

Device set to use cpu


In [22]:
with open(file_name, "r", encoding="utf-8") as f:
    data = json.load(f)

sentences = split_into_sentences(data["transcription"]["full_text"])

# Total amount of sentences
print(f"Total sentences: {len(sentences)}")

# # Total amount of questions
total_questions = []
total_questions_model = []

for sentence in sentences:
    if is_question(sentence):
        total_questions.append(sentence)
        
for sentence in sentences:
    if is_question_model(sentence):
        total_questions_model.append(sentence)
        
missing_in_regex = [q for q in total_questions_model if q not in total_questions]
print(missing_in_regex)

print(f"Total questions: {len(total_questions)}")

# Ratio 
question_ratio = len(total_questions) / len(sentences) if sentences else 0
print(f"Question ratio: {question_ratio:.2%}")

# Questions per minute
total_duration = data["metadata"]["duration_seconds"]
total_minutes = total_duration / 60
questions_per_minute = len(total_questions) / total_minutes if total_minutes else 0
print(f"Questions per minute: {questions_per_minute:.2f}")


Total sentences: 578
['Do it with everything!', 'yes immediately may they all rot']
Total questions: 39
Question ratio: 6.75%
Questions per minute: 0.89


### Questions per speaker

30 seconds slices only assign one speaker, if other speaker asks question in same segment it would be wrong assigned. 

In [9]:
def normalize(text):
    text = text.lower()
    text = re.sub(r"[^a-z0-9\s]", "", text)
    text = re.sub(r"\s+", " ", text)  # collapse multiple spaces
    return text.strip()


# Sometimes sentences are cut off and split into 2 segments, so I check the previous and next segment
# for the most words of that question in a segment and then just take the label of the speaker for that segment
def get_best_segment_match(question, segments):
    q_words = set(normalize(question).split())

    best_score = 0
    best_seg = None

    for seg in segments:
        seg_words = set(normalize(seg["text"]).split())
        score = len(q_words & seg_words)

        if score > best_score:
            best_score = score
            best_seg = seg

    return best_seg, best_score


question_speaker_dict = {}

for i, question in enumerate(total_questions):
    seg, score = get_best_segment_match(question, data["transcription"]["segments"])

    question_speaker_dict[i] = {
        "question": question,
        "start": seg["start"],
        "end": seg["end"],
        "speaker": seg["speaker"],
        "score": score,
    }


questions_per_speaker_count = {}

for info in question_speaker_dict.values():
    speaker = info["speaker"]
    questions_per_speaker_count[speaker] = (
        questions_per_speaker_count.get(speaker, 0) + 1
    )

print(questions_per_speaker_count)

questions_by_speaker = {}

for idx, entry in question_speaker_dict.items():
    speaker = entry["speaker"]
    question = entry["question"]

    if speaker not in questions_by_speaker:
        questions_by_speaker[speaker] = []

    questions_by_speaker[speaker].append(question)

print(questions_by_speaker)

{'SPEAKER_10': 9, 'SPEAKER_00': 29, 'SPEAKER_07': 1}
{'SPEAKER_10': ["If it's so great, why isn't it helping Black people?", 'Was it really fair?', 'So what do you think happened?', 'What happened next to Gina Chrono?', 'So what happened now?', 'Like what are the twists that your life has taken since?', "Yes, and why'd you go there?", 'What was your expectation?', 'So how?'], 'SPEAKER_00': ['What would be the difference?', "I mean, you know, Disney didn't, I mean, I didn't cave to Disney, you know what I mean?", "So what'd you do after that?", "I think you're just like a certain type of brain that you go forward, right?", 'and then So when you when you were fired from Disney for reasons that no person should ever be fired When you were fired from Disney for reasons that no person should ever be fired, resisting a Vax mandate or not putting your totally insane pronoun, in an email, did you consider suing them then?', "It's not simply that they fired you right?", "And I started actually 

Format for those features (new json, add to existing)? Per episode, expand on overall data (podcast, all podcasts)?

### Turn Taking Patterns

#### Turn Count per Speaker

In [10]:
# Detect how often the speaker changes after segments
# So Segment 1 has Speker 0, Segment 2 has speaker 1 -> One turn
# Then segment 3 and 4 also have Spekaer 1, but segment 5 has speaker 1 -> Second Turn
# Metrics: Count how often this changes (int)
# Did Speaker ask question in the last segment? (bool)

segments = data["transcription"]["segments"]

total_turns = 0
turn_id = 0
turn_object = {}

current_speaker = segments[0]["speaker"]
turn_start_time = segments[0]["start"]
turn_end_time = segments[0]["end"]
turn_text = segments[0]["text"] 

def check_for_question(text):
    sentences = split_into_sentences(text)
    for sentence in sentences:
        if is_question_model(sentence):
            return True
    return False


for i in range(1, len(segments)):
    seg = segments[i]

    if seg["speaker"] == current_speaker:
        # Extend the current turn
        turn_end_time = seg["end"]
        turn_text += " " + seg["text"] 

    else:
        # Speaker changed → close the previous turn
        question_asked = check_for_question(turn_text)
        total_turns += 1

        turn_object[turn_id] = {
            "turn": f"{current_speaker} to {seg['speaker']}",
            "start_time": turn_start_time,
            "end_time": turn_end_time,
            "turn_time_in_seconds": turn_end_time - turn_start_time,
            "question_asked": question_asked,
        }

        # Start a new turn
        turn_id += 1
        current_speaker = seg["speaker"]
        turn_start_time = seg["start"]
        turn_end_time = seg["end"]
        turn_text = seg["text"]

# Close last turn after loop ends
turn_object[turn_id] = {
    "turn": f"{current_speaker} to END",
    "start_time": turn_start_time,
    "end_time": turn_end_time,
    "turn_time_in_seconds": turn_end_time - turn_start_time,
    "question_asked": check_for_question(segments[-1]["text"]),
}

# Plus 1 because we count the first sentence as a turn aswell
print(total_turns + 1)

# for turn_id, turn_info in turn_object.items():
#     print(turn_id, turn_info)
    
print(turn_object)

# Total turns per speaker
turns_per_speaker = {}

for turn_id, turn_info in turn_object.items():
    turn_str = turn_info["turn"]
    speaker_before = turn_str.split(" to ")[0]

    if speaker_before not in turns_per_speaker:
        turns_per_speaker[speaker_before] = 0

    turns_per_speaker[speaker_before] += 1

print(turns_per_speaker)

15
{0: {'turn': 'SPEAKER_06 to SPEAKER_10', 'start_time': 0.0, 'end_time': 30.0, 'turn_time_in_seconds': 30.0, 'question_asked': False}, 1: {'turn': 'SPEAKER_10 to SPEAKER_01', 'start_time': 30.0, 'end_time': 90.0, 'turn_time_in_seconds': 60.0, 'question_asked': False}, 2: {'turn': 'SPEAKER_01 to SPEAKER_10', 'start_time': 90.0, 'end_time': 120.0, 'turn_time_in_seconds': 30.0, 'question_asked': False}, 3: {'turn': 'SPEAKER_10 to SPEAKER_00', 'start_time': 120.0, 'end_time': 210.0, 'turn_time_in_seconds': 90.0, 'question_asked': True}, 4: {'turn': 'SPEAKER_00 to SPEAKER_07', 'start_time': 210.0, 'end_time': 690.0, 'turn_time_in_seconds': 480.0, 'question_asked': True}, 5: {'turn': 'SPEAKER_07 to SPEAKER_05', 'start_time': 690.0, 'end_time': 720.0, 'turn_time_in_seconds': 30.0, 'question_asked': False}, 6: {'turn': 'SPEAKER_05 to SPEAKER_00', 'start_time': 720.0, 'end_time': 780.0, 'turn_time_in_seconds': 60.0, 'question_asked': False}, 7: {'turn': 'SPEAKER_00 to SPEAKER_10', 'start_time

#### Switch Rate and Dominance

In [11]:
#Average switch rate of the conversation
average_switch_time =  total_minutes / total_turns
print(average_switch_time)

#Speaking time per speaker
speaking_time = {}

for seg in segments:
    speaker = seg["speaker"]
    duration = seg["end"] - seg["start"]
    
    if speaker not in speaking_time:
        speaking_time[speaker] = 0.0
        
    speaking_time[speaker] += duration

print(speaking_time)

#Average turn time per Speaker
average_turn_time = {}

for speaker in speaking_time:
    avg_time = speaking_time[speaker] / turns_per_speaker[speaker]
    average_turn_time[speaker] = avg_time

print(average_turn_time)

3.128660714285714
{'SPEAKER_06': 30.0, 'SPEAKER_10': 240.0, 'SPEAKER_01': 30.0, 'SPEAKER_00': 2100.0, 'SPEAKER_07': 60.0, 'SPEAKER_05': 60.0, 'SPEAKER_04': 30.0, 'SPEAKER_UNKNOWN': 78.07999999999993}
{'SPEAKER_06': 30.0, 'SPEAKER_10': 60.0, 'SPEAKER_01': 30.0, 'SPEAKER_00': 525.0, 'SPEAKER_07': 30.0, 'SPEAKER_05': 60.0, 'SPEAKER_04': 30.0, 'SPEAKER_UNKNOWN': 78.07999999999993}


In [12]:
items = sorted(speaking_time.items(), key=lambda x: x[1], reverse=True)
first = items[0]
second = items[1]

print(first)
print(second)

dominance_index = (first[1] - second[1]) / total_duration
print(dominance_index)

('SPEAKER_00', 2100.0)
('SPEAKER_10', 240.0)
0.7077423589509433


#### Vocab Diversity

In [13]:
# Link every sentence to a speaker

sentences_dict = {}
print(data)


for i, sentence in enumerate(sentences, start=0):
    seg, score = get_best_segment_match(sentence, data["transcription"]["segments"])

    sentences_dict[i] = {
        "sentence": sentence,
        "start": seg["start"],
        "end": seg["end"],
        "speaker": seg["speaker"]
    }
    
sentences_per_speaker = {}

for idx, entry in sentences_dict.items(): 
    speaker = entry["speaker"]
    sentence = entry["sentence"]

    if speaker not in sentences_per_speaker:
        sentences_per_speaker[speaker] = []

    sentences_per_speaker[speaker].append(sentence)

print(sentences_per_speaker)

speakers_sorted = sorted(
    sentences_per_speaker.items(),
    key=lambda x: len(x[1]),
    reverse=True
)
top_two = speakers_sorted[:2]
sentences_actual_speakers = { speaker: sents for speaker, sents in top_two }

print(sentences_actual_speakers)

{'metadata': {'original_file': 'outputs/downloads/the_tucker_carlson_show/gina_carano.mp3', 'filename': 'gina_carano.mp3', 'model_used': 'small', 'processing_time_seconds': 564.18, 'timestamp': '2025-11-11T10:11:46.894471', 'file_size_mb': 100.31, 'duration_seconds': 2628.075, 'diarization_timestamp': '2025-11-17T05:51:36.622944', 'diarization_device': 'cuda:0', 'worker_rank': 3}, 'transcription': {'full_text': 'Life is long and weird and the longer it is, the weirder it gets. You may have noticed that. But even by that unchanging standard, Gina Chrono has had a pretty remarkable life packed into a relatively short amount of time. So in 2006, she began as a professional mixed martial arts fighter. In a few years she was starring in big Hollywood films like Fast and Furious. Then in 2019, not that long ago, she got one of the biggest roles for her career. She was on a Disney show called The Mandalorian. In case you didn\'t see it, here she is. Stay back, dropper. Easy. Drop your weapon.

In [14]:
stop_words = set(stopwords.words("english"))

def get_unique_words(text):
    # lowercase and extract alphabetic tokens
    words = re.findall(r"\b[a-zA-Z']+\b", text.lower())
    return set(words)

def get_unique_words_without_stopwords(text):
    words = re.findall(r"\b[a-zA-Z']+\b", text.lower())
    filtered = [w for w in words if w not in stop_words]
    return set(filtered)


# Extract full text of speakers
full_text_per_speaker = {
    speaker: " ".join(sentences) for speaker, sentences in sentences_actual_speakers.items()
}

# compute unique words per speaker (version to work with)
unique_words_per_speaker_raw = {
    speaker: get_unique_words(text) for speaker, text in full_text_per_speaker.items()
}

unique_words_per_speaker_raw_no_stopwords = {
    speaker: get_unique_words_without_stopwords(text) for speaker, text in full_text_per_speaker.items()
}

# make sure it is possible to store in json
unique_words_per_speaker = {
    speaker: list(words) for speaker, words in unique_words_per_speaker_raw.items()
}

unique_words_per_speaker_no_stopwords = {
    speaker: list(words) for speaker, words in unique_words_per_speaker_raw_no_stopwords.items()
}

# Count
unique_words_per_speaker_count = {
    speaker: len(words) for speaker, words in unique_words_per_speaker.items()
}

unique_words_per_speaker_count_no_stopwords = {
    speaker: len(words) for speaker, words in unique_words_per_speaker_no_stopwords.items()
}

# Store results for all pairs
all_words = list(unique_words_per_speaker_raw.values())
shared_all = set.intersection(*all_words)
total_all = set.union(*all_words)

# Same for stopwords
all_words_no_stop = list(unique_words_per_speaker_raw_no_stopwords.values())
shared_all_stop = set.intersection(*all_words_no_stop)
total_all_stop = set.union(*all_words_no_stop)

# Compute global overlap (JACCARD)
words_overlap_percentage_symmetric = len(shared_all) / len(total_all) * 100
words_overlap_percentage_symmetric = round(words_overlap_percentage_symmetric, 2)

# Compute global overlap (JACCARD) Stopwords
words_overlap_percentage_symmetric_stop = len(shared_all_stop) / len(total_all_stop) * 100
words_overlap_percentage_symmetric_stop = round(words_overlap_percentage_symmetric_stop, 2)

# Check by speaker
words_overlap_by_speaker = {}

for speaker, words in unique_words_per_speaker_raw.items():
    others = set().union(
        *[w for spk, w in unique_words_per_speaker_raw.items() if spk != speaker]
    )
    shared = words & others
    overlap_pct = len(shared) / len(words) * 100 if words else 0.0
    words_overlap_by_speaker[speaker] = round(overlap_pct, 2)

print(words_overlap_percentage_symmetric)
print(words_overlap_by_speaker)


# Check by speaker Stopwrods
words_overlap_by_speaker_stop = {}

for speaker, words in unique_words_per_speaker_raw_no_stopwords.items():
    others = set().union(
        *[
            w for spk, w in unique_words_per_speaker_raw_no_stopwords.items()
            if spk != speaker
        ]
    )
    shared = words & others
    overlap_pct = len(shared) / len(words) * 100 if words else 0.0
    words_overlap_by_speaker_stop[speaker] = round(overlap_pct, 2)
    

print(words_overlap_percentage_symmetric_stop)
print(words_overlap_by_speaker_stop)

16.68
{'SPEAKER_00': 17.68, 'SPEAKER_10': 74.71}
11.02
{'SPEAKER_00': 11.75, 'SPEAKER_10': 64.0}


In [15]:
stop_words = set(stopwords.words("english"))

def count_words(text):
    words = re.findall(r"\b[a-zA-Z']+\b", text.lower())
    return len(words)

def count_words_without_stopwords(text):
    words = re.findall(r"\b[a-zA-Z']+\b", text.lower())
    filtered = [w for w in words if w not in stop_words]
    return len(filtered)


total_words_per_speaker = {
    speaker: count_words(text) for speaker, text in full_text_per_speaker.items()
}

total_words_per_speaker_without_stopwords = {
    speaker: count_words_without_stopwords(text) for speaker, text in full_text_per_speaker.items()
}

print(total_words_per_speaker)
print(unique_words_per_speaker_count)

print(total_words_per_speaker_without_stopwords)
print(unique_words_per_speaker_count_no_stopwords)

# RTTR
rttr_per_speaker = {}

for speaker in total_words_per_speaker:
    total = total_words_per_speaker[speaker]
    unique = unique_words_per_speaker_count[speaker]

    if total == 0:
        rttr_per_speaker[speaker] = 0.0
    else:
        rttr_per_speaker[speaker] = unique / math.sqrt(total)


print(rttr_per_speaker)

# No stopwords RTTR
rttr_per_speaker_no_stopwords = {}

for speaker in total_words_per_speaker_without_stopwords:
    total = total_words_per_speaker_without_stopwords[speaker]
    unique = unique_words_per_speaker_count_no_stopwords[speaker]

    if total == 0:
        rttr_per_speaker_no_stopwords[speaker] = 0.0
    else:
        rttr_per_speaker_no_stopwords[speaker] = unique / math.sqrt(total)

print(rttr_per_speaker_no_stopwords)

#LTTR
lttr_per_speaker = {}

for speaker in total_words_per_speaker:
    total = total_words_per_speaker[speaker]
    unique = unique_words_per_speaker_count[speaker]

    if total == 0 or unique == 0:
        lttr_per_speaker[speaker] = 0.0
    else:
        lttr_per_speaker[speaker] = math.log(unique) / math.log(total)

print(lttr_per_speaker)

# No Stopwords LTTR
lttr_per_speaker_no_stopwords = {}

for speaker in total_words_per_speaker_without_stopwords:
    total = total_words_per_speaker_without_stopwords[speaker]
    unique = unique_words_per_speaker_count_no_stopwords[speaker]

    if total == 0 or unique == 0:
        lttr_per_speaker_no_stopwords[speaker] = 0.0
    else:
        lttr_per_speaker_no_stopwords[speaker] = math.log(unique) / math.log(total)


print(lttr_per_speaker_no_stopwords)

{'SPEAKER_00': 6135, 'SPEAKER_10': 643}
{'SPEAKER_00': 1086, 'SPEAKER_10': 257}
{'SPEAKER_00': 2652, 'SPEAKER_10': 287}
{'SPEAKER_00': 953, 'SPEAKER_10': 175}
{'SPEAKER_00': 13.865085170927225, 'SPEAKER_10': 10.135090636417521}
{'SPEAKER_00': 18.505726560593207, 'SPEAKER_10': 10.329923381766717}
{'SPEAKER_00': 0.8014726622709872, 'SPEAKER_10': 0.8581738147763891}
{'SPEAKER_00': 0.8701705640224989, 'SPEAKER_10': 0.9125898407351549}


## Politeness Feature

In [None]:
questions = [
    "Hello, could you please help me?",
    "Why did you decide to quit?",
    "I was wondering if you could explain that again?",
    "Tell me about your early days.",
]
ps = PolitenessStrategies()

results = []

for q in questions:
    utt = ps.transform_utterance(q, spacy_nlp=nlp)
    results.append({"text": q, "strategies": utt.meta["politeness_strategies"]})

for r in results:
    print(r)

WEIGHTS = {
    "Gratitude": 0.87,
    "Deference": 0.78,
    "Greeting": 0.43,
    "Positive_lexicon": 0.12,
    "Negative_lexicon": -0.13,
    "Apologizing": 0.36,
    "Please": 0.49,
    "Please_start": -0.30,
    "Indirect_(btw)": 0.63,
    "Direct_question": -0.27,
    "Direct_start": -0.43,
    "Counterfactual_modal": 0.47,
    "Indicative_modal": 0.09,
    "1st_person_start": 0.12,
    "1st_person_pl.": 0.08,
    "1st_person": 0.08,
    "2nd_person": 0.05,
    "2nd_person_start": -0.30,
    "Hedges": 0.14,
    "Factuality": -0.38,
}

FEATURE_MAP = {
    "==Please==": "Please",
    "==Please_start==": "Please_start",
    "==Indirect_(btw)==": "Indirect_(btw)",
    "==Hedges==": "Hedges",
    "==Factuality==": "Factuality",
    "==Deference==": "Deference",
    "==Gratitude==": "Gratitude",
    "==Apologizing==": "Apologizing",
    "==1st_person_pl.==": "1st_person_pl.",
    "==1st_person==": "1st_person",
    "==1st_person_start==": "1st_person_start",
    "==2nd_person==": "2nd_person",
    "==2nd_person_start==": "2nd_person_start",
    "==Indirect_(greeting)==": "Greeting",
    "==Direct_question==": "Direct_question",
    "==Direct_start==": "Direct_start",
    "==SUBJUNCTIVE==": "Counterfactual_modal",
    "==INDICATIVE==": "Indicative_modal",
}


def compute_politeness_score(strategy_dict):
    score = 0.0
    for feat, val in strategy_dict.items():
        if val == 0:
            continue

        # find matching strategy
        matched = False
        for substring, strat in FEATURE_MAP.items():
            if substring in feat:
                score += WEIGHTS[strat] * val
                matched = True
                break

        # ignore unmatched features (HASHEDGE, sentiment, etc.)
        if not matched:
            pass

    return score


for r in results:
    score = compute_politeness_score(r["strategies"])
    print(f"Text: {r['text']}")
    print(f"Politeness score: {score:.3f}")
    print()

{'text': 'Hello, could you please help me?', 'strategies': {'feature_politeness_==Please==': 1, 'feature_politeness_==Please_start==': 0, 'feature_politeness_==HASHEDGE==': 0, 'feature_politeness_==Indirect_(btw)==': 0, 'feature_politeness_==Hedges==': 0, 'feature_politeness_==Factuality==': 0, 'feature_politeness_==Deference==': 0, 'feature_politeness_==Gratitude==': 0, 'feature_politeness_==Apologizing==': 0, 'feature_politeness_==1st_person_pl.==': 0, 'feature_politeness_==1st_person==': 0, 'feature_politeness_==1st_person_start==': 0, 'feature_politeness_==2nd_person==': 1, 'feature_politeness_==2nd_person_start==': 0, 'feature_politeness_==Indirect_(greeting)==': 1, 'feature_politeness_==Direct_question==': 0, 'feature_politeness_==Direct_start==': 0, 'feature_politeness_==HASPOSITIVE==': 0, 'feature_politeness_==HASNEGATIVE==': 0, 'feature_politeness_==SUBJUNCTIVE==': 1, 'feature_politeness_==INDICATIVE==': 0}}
{'text': 'Why did you decide to quit?', 'strategies': {'feature_polit

## Type of Question
4 Types based on this [Paper](https://aclanthology.org/2024.lrec-main.1516/):
- DELIBERATIVE: These questions encourage participants in the conversation to share their perspectives on the topic broached in the question.
- RHETORICAL: Typically, rhetorical questions don’t seek an answer or already imply one. Their primary function is to underscore the speaker’s viewpoint.
- ISQ (Information-Seeking-Question): Designed to obtain factual details from participants.
- OTHER: A category for questions that don’t fit the criteria of the above NISQs labels.
- {0: 'Deliberative', 1: 'ISQ', 2: 'OTHERS', 3: 'Rhetorical'}

They show results for roBERTa model without context (best performance), BigBird (with 2 different scopes of sentences before and after the target sentence)  
  
They used gpt-3.5-turbo with a very long prompt in the structure
- Introduction to task
- Detailed description of each label
- present the task and formulate the question

They claim that the performance might be worse because they did not include speaker information. Including this might boost the performance.  

copyright?

In [2]:
# Load NISQ dataset
df_nisq = pd.read_csv("./NISQ_dataset/final_train.csv", sep=";")


# As said in the paper, providing the speaker information might improve performance
df_nisq = df_nisq[[
    "index",
    "question",
    "question_speaker",
    "ctx_after1_speaker",
    "label"
]]


df_nisq.head()

Unnamed: 0,index,question,question_speaker,ctx_after1_speaker,label
0,1,"How did you keep it a secret, that sort of thing?",UNIDENTIFIED MALE,UNIDENTIFIED MALE,Deliberative
1,2,And I talked to the local newspaper editor her...,DAN LOTHIAN,DAN LOTHIAN,OTHERS
2,3,You know what I mean?,PRESS,PRESS,Rhetorical
3,4,And your work has shown that were as polarized...,FRANK SESNO,FRANK SESNO,Rhetorical
4,5,Do journalists focus too much on the first lad...,"KURTZ, HOST (voice-over)","KURTZ, HOST (voice-over)",Deliberative


In [3]:
# Expects only one input text, so we build our input here

def build_text(row):
    # Speaker pseudo tokens
    spk_q = f"<SPK_Q:{str(row['question_speaker']).replace(' ', '_')}>"
    spk_after = f"<SPK_AFTER:{str(row['ctx_after1_speaker']).replace(' ', '_')}>"

    # Input sequence
    return f"{spk_q} {spk_after} {row['question']}"

df_nisq["text"] = df_nisq.apply(build_text, axis=1)

df_nisq.head()

Unnamed: 0,index,question,question_speaker,ctx_after1_speaker,label,text
0,1,"How did you keep it a secret, that sort of thing?",UNIDENTIFIED MALE,UNIDENTIFIED MALE,Deliberative,<SPK_Q:UNIDENTIFIED_MALE> <SPK_AFTER:UNIDENTIF...
1,2,And I talked to the local newspaper editor her...,DAN LOTHIAN,DAN LOTHIAN,OTHERS,<SPK_Q:DAN_LOTHIAN> <SPK_AFTER:DAN_LOTHIAN> An...
2,3,You know what I mean?,PRESS,PRESS,Rhetorical,<SPK_Q:PRESS> <SPK_AFTER:PRESS> You know what ...
3,4,And your work has shown that were as polarized...,FRANK SESNO,FRANK SESNO,Rhetorical,<SPK_Q:FRANK_SESNO> <SPK_AFTER:FRANK_SESNO> An...
4,5,Do journalists focus too much on the first lad...,"KURTZ, HOST (voice-over)","KURTZ, HOST (voice-over)",Deliberative,"<SPK_Q:KURTZ,_HOST_(voice-over)> <SPK_AFTER:KU..."


In [4]:
label_encoder = LabelEncoder()
df_nisq["label_id"] = label_encoder.fit_transform(df_nisq["label"])
num_labels = len(label_encoder.classes_)

print("Label mapping:", dict(enumerate(label_encoder.classes_)))

Label mapping: {0: 'Deliberative', 1: 'ISQ', 2: 'OTHERS', 3: 'Rhetorical'}


In [5]:
# 80/10/10 split

train_df, temp_df = train_test_split(
    df_nisq,
    test_size=0.20,
    random_state=42,
    stratify=df_nisq["label_id"]
)
val_df, test_df = train_test_split(
    temp_df,
    test_size=0.50,
    random_state=42,
    stratify=temp_df["label_id"]
)

dataset = DatasetDict({
    "train": Dataset.from_pandas(train_df),
    "validation": Dataset.from_pandas(val_df),
    "test": Dataset.from_pandas(test_df),
})

In [6]:
# Load model and tokenizer

model_name = "roberta-base"
tokenizer = RobertaTokenizerFast.from_pretrained(model_name)

# Collect unique speakers
special_tokens = {"additional_special_tokens": []}

for spk in df_nisq["question_speaker"].astype(str).unique():
    special_tokens["additional_special_tokens"].append(f"<SPK_Q:{spk.replace(' ', '_')}>")

for spk in df_nisq["ctx_after1_speaker"].astype(str).unique():
    special_tokens["additional_special_tokens"].append(f"<SPK_AFTER:{spk.replace(' ', '_')}>")

# Add tokens
tokenizer.add_special_tokens(special_tokens)

483

In [7]:
# Tokenize dataset

def tokenize(batch):
    return tokenizer(
        batch["text"],
        padding="max_length",
        truncation=True,
        max_length=128  # shorter is fine because input is small
    )

tokenized_dataset = dataset.map(tokenize, batched=True)

# Clean columns
tokenized_dataset = tokenized_dataset.remove_columns(
    [c for c in df_nisq.columns if c not in ["text", "label_id"]]
)
tokenized_dataset = tokenized_dataset.rename_column("label_id", "labels")
tokenized_dataset.set_format("torch")

Map:   0%|          | 0/1206 [00:00<?, ? examples/s]

Map:   0%|          | 0/151 [00:00<?, ? examples/s]

Map:   0%|          | 0/151 [00:00<?, ? examples/s]

In [9]:
# model = RobertaForSequenceClassification.from_pretrained(
#     model_name,
#     num_labels=num_labels
# )

# model.resize_token_embeddings(len(tokenizer))

from transformers import RobertaTokenizer

tokenizer = RobertaTokenizer.from_pretrained("./roberta_minimal_final/")
model = RobertaForSequenceClassification.from_pretrained("./roberta_minimal_final/")

In [None]:
import evaluate

accuracy = evaluate.load("accuracy")
f1 = evaluate.load("f1")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=1)
    return {
        "accuracy": accuracy.compute(predictions=preds, references=labels)["accuracy"],
        "f1": f1.compute(predictions=preds, references=labels, average="weighted")["f1"],
    }

In [11]:
training_args = TrainingArguments(
    output_dir="./results_eval",
    per_device_eval_batch_size=32,
)

trainer = Trainer(
    model=model,
    args=training_args,
    eval_dataset=tokenized_dataset["test"],
    compute_metrics=compute_metrics
)

results = trainer.evaluate()
print(results)




{'eval_loss': 1.1857951879501343, 'eval_model_preparation_time': 0.0022, 'eval_accuracy': 0.5298013245033113, 'eval_f1': 0.3669619563659299, 'eval_runtime': 11.8248, 'eval_samples_per_second': 12.77, 'eval_steps_per_second': 0.423}


## Direct/Indirect Address

In [24]:
def detect_direct_address(text):
    doc = nlp(text)
    for token in doc:
        if token.lower_ in ["you", "your", "yours", "yourself"]:
            return 1
    return 0

def detect_imperative(text):
    doc = nlp(text)
    root = [token for token in doc if token.dep_ == "ROOT"]
    if root and root[0].tag_ == "VB":  # imperative verb
        return 1
    return 0


def detect_vocative(text):
    return 1 if re.match(r"^[A-Za-z]+,", text.strip()) else 0

def direct_address_score(text):
    return max(
        detect_direct_address(text),
        detect_imperative(text),
        detect_vocative(text)
    )
    
for s in sentences:
    da_score = direct_address_score(s)
    print(f"Text: {s}")
    print(f"Direct Address Score: {da_score}")
    print()

Text: Life is long and weird and the longer it is, the weirder it gets.
Direct Address Score: 0

Text: You may have noticed that.
Direct Address Score: 1

Text: But even by that unchanging standard, Gina Chrono has had a pretty remarkable life packed into a relatively short amount of time.
Direct Address Score: 0

Text: So in 2006, she began as a professional mixed martial arts fighter.
Direct Address Score: 0

Text: In a few years she was starring in big Hollywood films like Fast and Furious.
Direct Address Score: 0

Text: Then in 2019, not that long ago, she got one of the biggest roles for her career.
Direct Address Score: 0

Text: She was on a Disney show called The Mandalorian.
Direct Address Score: 0

Text: In case you didn't see it, here she is.
Direct Address Score: 1

Text: Stay back, dropper.
Direct Address Score: 1

Text: Easy.
Direct Address Score: 0

Text: Drop your weapon.
Direct Address Score: 1

Text: You're gonna wish you never left Alderaan.
Direct Address Score: 1

T

## Store All Metrics As Object
Storing all the features in the features object with the format:

```json
"features": {
    "speaking_time_per_speaker": speaking time per speaker in seconds (List, float),
    "dominance_index": Comparing the talking time between both speakers, the higher the value the more one speaker dominates (float)
    "questions": {
        "questions_asked": List of all questions in the episode (Array),
        "questions_total": Amount of total questions asked (int),
        "questions_ratio": Ratio of questions asked compared to total amount of sentences (float),
        "questions_per_minute": Total questions / total time (float),
        "questions_by_speaker": List of all questions in the episode per speaker (array),
        "questions_per_speaker_count": {
            "SPEAKER00": Amount of questions from speaker 1 (int),
            "SPEAKER01": Amount of questions from speaker 2 (int) 
        },
    }, 
    "turn_patterns": {
        "total_turns": Amount of total turns of the conversation (int),
        "total_turns_by_speaker": Amount of total turns by speaker (int),
        "average_switch_time": Average switch time of the speakers in total in minutes (float),
        "average_turn_time_per_speaker": Average time of speaker until next turn in seconds (float),
        "turn_details": {
            "0": { 
                "turn_format": From which speaker to which (SPEAKER_00 to SPEAKER_01 / SPEAKER_01 to SPEAKER_00) (string),
                "turn_time_in_segments": How long the speaker was talking before the turn in segments(int),
                "turn_time_in_seconds": How long the speaker was talking before the turn in seconds (int),
                "question_asked": Was a question asked before the turn? (bool)
            },
            "1": {},
            ...,
        }
    },
    "vocab_diversity": {
        "unique_words_per_speaker": List of unique words each speaker has used during the conversation (string List),
        "unique_words_per_speaker_count": Amount of unique words per speaker (int),
        "words_overlap_symmetric" Percentage of words overlapping between both speakers (float), (How big is the shared vocabulary relative to the whole conversation?)
        "words_overlap_by_speaker": comparing speaker01's unique words to speaker02's unique words and vice versa (List, float percentage) (How much of my vocab is shared by others?),
        "rttr": Root Type-Token Ratio, ratio of unique words to total words len(unique_words) / sqrt(total_words) (float),
        "lttr": Log scaled version, log(len(unique_words)) / log(total_words) (float),
        "words_overlap_symmetric_no_stopwords": Same but no stopwords (float),
        "words_overlap_by_speaker": Same but no stopwords (List, float percentage),
        "rttr_no_stopwords": Same but no stopwords (float),
        "lttr_no_stopwords" Same but no stopwords (float),
        "unique_words_per_speaker_no_stopwords" Same but no stopwords (string List),
        "unique_words_per_speaker_count_no_stopwords" Same but no stopwords (int)
    }
}
```

In [16]:
features = {
    "speaking_time_per_speaker": speaking_time,
    "dominance_index": dominance_index,
    "questions": {
        "questions_asked": total_questions,
        "questions_total": len(total_questions),
        "questions_ratio": question_ratio,
        "questions_per_minute": questions_per_minute,
        "questions_by_speaker": questions_by_speaker,
        "questions_per_speaker_count": questions_per_speaker_count
    },
    "turn_patterns": {
        "total_turns": total_turns,
        "total_turns_by_speaker": turns_per_speaker,
        "average_switch_time": average_switch_time,
        "average_turn_time_per_speaker": average_turn_time,
        "turn_details": turn_object
    },
    "vocab_diversity": {
        "unique_words_per_speaker": unique_words_per_speaker,
        "unique_words_per_speaker:no_stopwords": unique_words_per_speaker_no_stopwords,
        "unique_words_per_speaker_count": unique_words_per_speaker_count,        
        "unique_words_per_speaker_count_no_stopwords": unique_words_per_speaker_count_no_stopwords,
        "words_overlap_symmetric": words_overlap_percentage_symmetric,
        "words_overlap_symmetric_no_stopwords": words_overlap_percentage_symmetric_stop,
        "words_overlap_by_speaker": words_overlap_by_speaker,
        "words_overlap_by_speaker": words_overlap_by_speaker_stop,
        "rttr_per_speaker": rttr_per_speaker,
        "rttr_no_stopwords": rttr_per_speaker_no_stopwords,
        "lttr_per_speaker": lttr_per_speaker,
        "lttr_no_stopwords": lttr_per_speaker_no_stopwords
        
    }
}

with open(output_name, "w", encoding="utf-8") as f:
    json.dump(features, f, indent=4, ensure_ascii=False)