In [None]:
# cosmus_eval_major_label.py
from datasets import load_dataset
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd
from transformers import pipeline

# -------------------------------------------------------------------- #
# 1.  Load the COSMUS dataset (Telegram RU/UA posts)                   #
# -------------------------------------------------------------------- #
# Pull the "train" split and discard every post whose gold label is "mixed",
# leaving only the three-way (negative / neutral / positive) rows.
ds = load_dataset("YShynkarov/COSMUS", split="train").filter(
    lambda row: row["annotator_sentiment"] != "mixed"
)

# Keep just the text, the gold label and the language tag.
columns_to_keep = ["document_content", "annotator_sentiment", "language"]
df = ds.to_pandas()[columns_to_keep]


Filter:   0%|          | 0/12224 [00:00<?, ? examples/s]

In [44]:
df

Unnamed: 0,document_content,annotator_sentiment,language
0,⚡️Українська делегація відправилася на перемов...,neutral,ua
1,"Вибухи на Одещині, попередньо — ППО.",neutral,ua
2,"А что делать тем ,кто лишился своего жилья ,по...",negative,ru
3,Тогда учись быстро бегать. Для меня вопрос сло...,negative,ru
4,Добрий день,neutral,ua
...,...,...,...
11611,"У меня три окна и двери выбило , даже и не дум...",negative,ru
11612,"Краще ""повинна бути зручнішою, ніж Uber чи Boo...",negative,ua
11613,"Питання, цей сертифікат можна вже використовув...",neutral,ua
11614,На Вугледарському напрямку загинув Рома Іванен...,negative,ua


In [31]:
df['document_content'].iloc[5]

'Бажаю удачі тим, хто цього потребує.'

'Я розумію. Але ви хоч уявляєте, скільки часу на це піде? І не буде там великої суми, бо рахуватимуть тільки вартість "коробки". Опис майна "до" ніхто не робив.'

In [45]:

# Gold labels → integers.
# The conversion is idempotent: values that already are one of the numeric
# codes pass through untouched, so re-running this cell does not wipe the
# column. (A plain `.map(label2id)` turns already-converted values into NaN.)
label2id = {"negative": -1, "neutral": 0, "positive": 1}
_label_codes = set(label2id.values())


def _encode_sentiment(value):
    """Text label → code; an existing code is kept; anything else → NaN."""
    if isinstance(value, str):
        return label2id.get(value, float("nan"))
    return value if value in _label_codes else float("nan")


df["annotator_sentiment"] = df["annotator_sentiment"].map(_encode_sentiment)


In [33]:
# Drop every row that has a missing value in any column, then display the frame.
df = df.dropna()
df

Unnamed: 0,document_content,annotator_sentiment,language
0,⚡️Українська делегація відправилася на перемов...,0.0,ua
1,"Вибухи на Одещині, попередньо — ППО.",0.0,ua
2,"А что делать тем ,кто лишился своего жилья ,по...",-1.0,ru
3,Тогда учись быстро бегать. Для меня вопрос сло...,-1.0,ru
4,Добрий день,0.0,ua
...,...,...,...
12218,"У меня три окна и двери выбило , даже и не дум...",-1.0,ru
12219,"Краще ""повинна бути зручнішою, ніж Uber чи Boo...",-1.0,ua
12221,"Питання, цей сертифікат можна вже використовув...",0.0,ua
12222,На Вугледарському напрямку загинув Рома Іванен...,-1.0,ua


In [None]:
# -------------------------------------------------------------------- #
# 2.  Sentiment model wrapper with major_label()                       #
# -------------------------------------------------------------------- #
class SentimentAnalyzer:
    """
    Multilingual twitter-XLM-RoBERTa sentiment wrapper.

    Provides polarity_scores() (one score per class) *and* major_label()
    (the highest-scoring class as text and as an integer code).
    """

    # Short score key → (full label, integer code) as returned by major_label().
    _LABELS = {
        "neg": ("negative", -1),
        "neu": ("neutral", 0),
        "pos": ("positive", 1),
    }

    def __init__(self):
        """Build the underlying Hugging Face pipeline (loads the model)."""
        mdl = "cardiffnlp/twitter-xlm-roberta-base-sentiment"       # 3-way (neg/neu/pos)
        self._pipe = pipeline(
            "sentiment-analysis",
            model=mdl, tokenizer=mdl,
            top_k=None,                                             # returns all three scores
            # Cut over-long posts to the model's maximum input length instead
            # of failing on them (same setting as UkrSentimentAnalyzer).
            truncation=True,
        )

    # ---------- already supplied ----------
    def polarity_scores(self, text: str):
        """
        Score `text` and return {"neg": ..., "neu": ..., "pos": ...}.

        A class the pipeline did not report gets the score 0.
        """
        res = self._pipe(text)              # list[list[dict(label,score)]]
        # Normalise label casing so "Negative" and "negative" hit the same key;
        # a case mismatch would otherwise make every lookup fall back to 0.
        scores = {str(d["label"]).lower(): d["score"] for d in res[0]}
        return {"neg": scores.get("negative", 0),
                "neu": scores.get("neutral",  0),
                "pos": scores.get("positive", 0),
                }

    # ---------- new method ----------
    def major_label(self, text: str):
        """
        Returns (text_label, int_label) where int_label ∈ {−1,0,1}.

        Ties are resolved in the order neg, neu, pos (first maximum wins).
        """
        sc = self.polarity_scores(text)
        best = max(("neg", "neu", "pos"), key=sc.get)
        return self._LABELS[best]

analyzer = SentimentAnalyzer()

Device set to use mps:0


In [None]:
from transformers import RobertaConfig, RobertaForSequenceClassification, RobertaTokenizer, pipeline
from huggingface_hub import hf_hub_download
from safetensors.torch import load_file

class UkrSentimentAnalyzer:
    """
    Ukrainian sentiment analysis model based on YShynkarov/ukr-roberta-cosmus-sentiment.
    Provides polarity_scores() and major_label().
    """
    # Raw pipeline labels → class names.
    map_labels = {
                'LABEL_0': 'mixed',
                'LABEL_1': 'negative',
                'LABEL_2': 'neutral',
                'LABEL_3': 'positive',
            }
    # Numeric code per class name. "mixed" has no code of its own: it is
    # folded into 0.0, the same value as "neutral".
    int_label_map = {
        "negative": -1.0,
        "neutral": 0.0,
        "positive": 1.0,
        "mixed": 0.0,
    }

    def __init__(self):
        """Download the fine-tuned weights and build a CPU text-classification pipeline."""
        repo_id = "YShynkarov/ukr-roberta-cosmus-sentiment"
        safetensor = hf_hub_download(repo_id=repo_id,
                                     filename="ukrroberta_cosmus_sentiment.safetensors")

        # Architecture and tokenizer come from the base model; only the
        # weights come from the fine-tuned checkpoint.
        config = RobertaConfig.from_pretrained("youscan/ukr-roberta-base", num_labels=4)
        tokenizer = RobertaTokenizer.from_pretrained("youscan/ukr-roberta-base")

        model = RobertaForSequenceClassification(config)
        state_dict = load_file(safetensor)
        model.load_state_dict(state_dict)
        model.eval()
        self._pipe = pipeline(
            "text-classification",
            model=model,
            tokenizer=tokenizer,
            device=-1,
            top_k=None,              # all class scores (replaces deprecated return_all_scores=True)
            truncation=True,
        )

    def polarity_scores(self, text: str) -> dict:
        """
        Returns dict:
          {
            "negative": float_score,
            "neutral":  float_score,
            "positive": float_score,
            "mixed":    float_score
          }
        Key order follows the order in which the pipeline reports the classes.
        """
        # pipeline returns list[list[{"label":..., "score":...}, ...]]
        results = self._pipe(text)
        return {
            self.map_labels[item["label"]]: item["score"]
            for item in results[0]
        }

    def major_label(self, text: str) -> tuple[str, float]:
        """
        Returns (text_label, int_label) for the highest-scoring class.

        int_label is -1.0 (negative), 0.0 (neutral) or 1.0 (positive).
        A "mixed" prediction keeps its text label but is coded 0.0, i.e. it is
        numerically indistinguishable from "neutral".
        """
        scores = self.polarity_scores(text)
        best = max(scores, key=scores.get)
        return best, self.int_label_map[best]

# NOTE: this rebinds the name `analyzer`; cells that call
# analyzer.major_label() after this point use the Ukrainian model.
analyzer = UkrSentimentAnalyzer()
print(analyzer.polarity_scores("Привіт! Все просто чудово"))
# → illustrative shape only, e.g. {'negative': 0.01, 'neutral': 0.05, 'positive': 0.90, 'mixed': 0.04}

print(analyzer.major_label("Привіт! Все просто чудово"))
# → e.g. ('positive', 1.0)

Device set to use cpu


{'mixed': 0.011259634047746658, 'negative': 0.0018209181725978851, 'neutral': 0.006787061225622892, 'positive': 0.9801324009895325}
('positive', 1.0)




In [54]:
# -------------------------------------------------------------------- #
# 3.  Inference → sentiment_pred column                                #
# -------------------------------------------------------------------- #
from tqdm import tqdm      # needed for tqdm.pandas(), which adds .progress_apply to pandas objects
tqdm.pandas()

# One model call per post; only the numeric code from major_label() is stored.
df["sentiment_pred_1epoch"] = df["document_content"].progress_apply(
    lambda txt: analyzer.major_label(txt)[1]     # keep numeric only
)

100%|██████████| 11616/11616 [12:04<00:00, 16.04it/s]


In [55]:
# Keep only rows that have both a gold label and a model prediction.
df=df.dropna(subset=["annotator_sentiment", 'sentiment_pred_1epoch'])
df



Unnamed: 0,document_content,annotator_sentiment,language,sentiment_pred_1epoch
0,⚡️Українська делегація відправилася на перемов...,0,ua,1.0
1,"Вибухи на Одещині, попередньо — ППО.",0,ua,0.0
2,"А что делать тем ,кто лишился своего жилья ,по...",-1,ru,-1.0
3,Тогда учись быстро бегать. Для меня вопрос сло...,-1,ru,-1.0
4,Добрий день,0,ua,1.0
...,...,...,...,...
11611,"У меня три окна и двери выбило , даже и не дум...",-1,ru,-1.0
11612,"Краще ""повинна бути зручнішою, ніж Uber чи Boo...",-1,ua,1.0
11613,"Питання, цей сертифікат можна вже використовув...",0,ua,0.0
11614,На Вугледарському напрямку загинув Рома Іванен...,-1,ua,1.0


In [56]:
df["sentiment_pred_1epoch"].value_counts()

sentiment_pred_1epoch
 0.0    5764
-1.0    3345
 1.0    2507
Name: count, dtype: int64

In [None]:
# -------------------------------------------------------------------- #
# 4.  Evaluation                                                       #
# -------------------------------------------------------------------- #
acc = accuracy_score(df["annotator_sentiment"], df["sentiment_pred_1epoch"])
print(f"Accuracy: {acc:.3%}")

# `labels=` pins which code each entry of `target_names` describes. Without it
# scikit-learn pairs the names with whatever sorted label values occur in the
# data, so a missing or unexpected class would shift the names or raise.
print(classification_report(
    df["annotator_sentiment"], df["sentiment_pred_1epoch"],
    labels=[-1, 0, 1],
    target_names=["negative (−1)", "neutral (0)", "positive (+1)"]))


Accuracy: 76.799%
               precision    recall  f1-score   support

negative (−1)       0.90      0.66      0.76      4541
  neutral (0)       0.71      0.87      0.78      4702
positive (+1)       0.73      0.77      0.75      2373

     accuracy                           0.77     11616
    macro avg       0.78      0.77      0.76     11616
 weighted avg       0.79      0.77      0.77     11616



# Ukrainian → English translation → sentiment with VADER (the default sentiment scorer in OpenWillis)

In [None]:
# uk2en_like_space.py
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Hugging Face model id used for translation (the prompt below asks for English output).
MODEL_NAME = "Yehor/kulyk-uk-en"
# Use a CUDA GPU when one is available, otherwise the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
# NOTE(review): bfloat16 is requested on CPU as well — confirm it is supported
# and fast enough on the target machine.
torch_dtype = torch.bfloat16

# Optional model revision to pin; None requests no specific revision.
REVISION = None

# Tokenizer and model are loaded from the same id and revision.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, revision=REVISION)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map=device,
    torch_dtype=torch_dtype,
    revision=REVISION,
)
# Inference only: switch the module to evaluation mode.
model.eval()

def translate_like_space(text: str) -> str:
    """
    Translate `text` to English with the module-level `model` / `tokenizer`.

    The text is wrapped in a single-turn chat prompt and decoded greedily.
    Only the newly generated tokens are returned, stripped of surrounding
    whitespace.

    Empty, whitespace-only or non-string input returns "" without calling the
    model, so there is never a prompt with an empty body.
    """
    if not isinstance(text, str) or not text.strip():
        return ""

    prompt = "Translate the text to English:\n" + text
    input_ids = tokenizer.apply_chat_template(
        [{"role": "user", "content": prompt}],
        add_generation_prompt=True,
        return_tensors="pt",
        tokenize=True,
    ).to(model.device)

    with torch.inference_mode():
        output = model.generate(
            input_ids,
            max_new_tokens=2048,
            do_sample=False,              # greedy
            repetition_penalty=1.05,
        )

    # Drop the prompt tokens; keep only what the model appended.
    gen = output[:, input_ids.shape[1]:]
    return tokenizer.batch_decode(gen, skip_special_tokens=True)[0].strip()


In [None]:

uk = "Над Україною збито ракету та 7 із 8 «Шахедів»"
print(translate_like_space(uk))

In [None]:
from tqdm.auto import tqdm
tqdm.pandas()

# Translate every post, one translate_like_space() call per row.
df['translated_text'] = df['document_content'].progress_apply(translate_like_space)

In [None]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

analyzer = SentimentIntensityAnalyzer()

def vader_label_by_max(text: str) -> float:
    """
    Label `text` by the largest of the 'neg' / 'neu' / 'pos' scores returned
    by the module-level `analyzer.polarity_scores`.

    Returns -1.0 (neg), 0.0 (neu) or 1.0 (pos). Ties go to the first maximum
    in the order neg, neu, pos. No other key of the score dict is consulted.
    """
    codes = {'neg': -1.0, 'neu': 0.0, 'pos': 1.0}
    scores = analyzer.polarity_scores(text)
    # Iterating the dict keeps the neg → neu → pos order used for tie-breaking.
    top = max(codes, key=lambda k: scores[k])
    return codes[top]

In [None]:

df['vader_analysis'] = df['translated_text'].apply(vader_label_by_max)


In [None]:
df['vader_analysis'].value_counts()

In [None]:
df["annotator_sentiment"].value_counts()

In [None]:
# -------------------------------------------------------------------- #
# 4.  Evaluation                                                       #
# -------------------------------------------------------------------- #
acc = accuracy_score(df["annotator_sentiment"], df["vader_analysis"])
print(f"Accuracy: {acc:.3%}")

# `labels=` fixes the code ↔ name pairing; without it the names are matched to
# whichever sorted label values happen to occur, which breaks as soon as one
# class is never predicted or an unexpected value appears.
print(classification_report(
    df["annotator_sentiment"], df["vader_analysis"],
    labels=[-1, 0, 1],
    target_names=["negative (−1)", "neutral (0)", "positive (+1)"]))


# Num Syllables 

In [None]:
!pip3 install spacy pyphen praat-parselmouth pingouin pandas numpy tqdm jsonlines pyctcdecode

In [None]:
!gdown --fuzzy "https://drive.google.com/file/d/1j9d91QqE7_WnOnmEmidtOG55tpmxQUeJ/view"

In [None]:
!unzip /content/dataset.zip

In [None]:
import jsonlines
from glob import glob
from tqdm import tqdm
import pandas as pd
from torchaudio import info as audiofile_info

tqdm.pandas(desc="Audio data processing")

# Merge every JSONL line into one path → transcript mapping. Keeping only the
# last line would lose entries whenever the file has more than one line; with
# a single-line file the result is unchanged.
labels = {}
with jsonlines.open("/content/labels.jsonl", 'r') as reader:
    for line in reader:
        labels.update(line)

all_audio_files = glob("toronto_*/*.wav", recursive=True)

toronto_dataset = pd.DataFrame({
    "path": all_audio_files
})

# Label keys have the form "dataset/<folder>/<file>", built from the last two
# components of each audio path.
toronto_dataset["transcript"] = toronto_dataset["path"].progress_apply(
    lambda x: labels["dataset/" + "/".join(x.split("/")[-2:])]
)

toronto_dataset["transcript_len"] = toronto_dataset["transcript"].progress_apply(len)

def get_audio_dur_sec(path):
    """Duration of the audio file at `path` in seconds (frame count / sample rate)."""
    meta = audiofile_info(path)
    frames, rate = meta.num_frames, meta.sample_rate
    return frames / rate

toronto_dataset["audio_dur_sec"] = toronto_dataset["path"].progress_apply(get_audio_dur_sec)
import re

def extract_numbers(path):
    """
    Return (first, last) integer found in `path`, used as a numeric sort key.

    Raises IndexError when `path` contains no digits.
    """
    digit_runs = [int(m.group()) for m in re.finditer(r"\d+", path)]
    return digit_runs[0], digit_runs[-1]

# Order rows numerically by the (first, last) number in each path rather than
# lexicographically, renumber the index, and persist the metadata table.
toronto_dataset = toronto_dataset.sort_values(by="path", key=lambda col: col.map(extract_numbers)).reset_index(drop=True)
toronto_dataset.to_csv("meta_toronto.csv", index=False)

Ukrainian syllable counting & evaluation pipeline
- syll_spacy         : spaCy-uk component (rule-based over Pyphen + patches)
- syll_pyphen        : Pyphen hyphenation baseline
- syll_nltk          : NLTK SyllableTokenizer with custom sonority hierarchy
- syll_praat_like    : "pure Python" Parselmouth (intensity peaks + voicing)
- syll_praat_original: original Praat script SyllableNucleiv3.praat via Parselmouth
- Metrics            : MAE vs references + ICC (absolute agreement)

In [None]:
import os, re, math, io, warnings
from dataclasses import dataclass
from typing import Optional

import numpy as np
import pandas as pd
from tqdm import tqdm

# ---------------- Text libs ----------------
import spacy
from spacy.language import Language
from spacy.tokens import Token
import pyphen

# ---------------- Audio (Praat) ------------
import parselmouth
from parselmouth.praat import call

# ---------------- Stats --------------------
import pingouin as pg

# ---------------- NLTK ---------------------
import nltk
from nltk.tokenize import SyllableTokenizer

warnings.filterwarnings("ignore", category=UserWarning)

# =========================
# Config
# =========================
# Directory that relative audio paths are resolved against.
AUDIO_ROOT = "/content"
# Praat script executed by count_syllables_praat_original().
PRAAT_SCRIPT_PATH = "SyllableNucleiv3.praat"

# Defaults for count_syllables_praat_like():
# pitch floor / minimum pitch passed to the Praat analyses, in Hz
MIN_PITCH_HZ = 75.0
# minimum time between two kept intensity peaks, in seconds
MIN_SYLLABLE_SEP_SEC = 0.10
# a peak must exceed the local minimum by at least this many dB
PEAK_PROMINENCE_DB = 2.0
# analysis time step, in seconds
INTENSITY_STEP = 0.01
# Ukrainian vowel letters in both cases.
VOWELS_UK = set("аеєиіїоуюяАЕЄИІЇОУЮЯ")

@dataclass
class Paths:
    """Filesystem locations consumed by run_all()."""
    # Directory prepended to relative audio paths; None or "" leaves them as given.
    audio_root: Optional[str] = None
    # Praat script to run; when None or "" run_all() falls back to PRAAT_SCRIPT_PATH.
    praat_script: Optional[str] = None

# =========================
# helpers
# =========================
def _to_text(x) -> str:
    return x if isinstance(x, str) else ""

def resolve_audio_path(p: str, audio_root: Optional[str]) -> str:
    """
    Absolute paths are returned as they are; relative ones are joined onto
    `audio_root` (a missing or empty root leaves the relative path as given).
    """
    if os.path.isabs(p):
        return p
    base_dir = audio_root if audio_root else ""
    return os.path.join(base_dir, p)

def compute_spm(num_syll: float, dur_sec: float) -> float:
    """
    Syllables per minute: `num_syll` divided by the duration in minutes.

    Returns NaN when the duration is missing, zero or negative.
    """
    usable_duration = bool(dur_sec) and not dur_sec <= 0
    if usable_duration:
        minutes = dur_sec / 60.0
        return num_syll / minutes
    return np.nan

# =========================
# 1) Pyphen-only (текст)
# =========================
# Pyphen hyphenation dictionary for Ukrainian, shared by the text-based counters.
_dic = pyphen.Pyphen(lang="uk_UA")
# A "word" is a run of Ukrainian letters, apostrophe variants and hyphens.
# NOTE(review): a run made only of hyphens/apostrophes also matches this pattern.
_word_re = re.compile(r"[А-ЩЬЮЯІЇЄҐа-щьюяіїєґʼ'’-]+", re.U)

def count_syll_pyphen(text: str) -> int:
    """
    Count syllables in `text` from Pyphen hyphenation points.

    Each word matched by `_word_re` is split on hyphens; a part contributes
    (number of hyphenation points + 1) syllables. Non-string or empty input
    gives 0.

    Parts without a single letter — a free-standing dash, or the empty piece
    left by a leading/trailing hyphen — contribute nothing. They used to be
    counted as one syllable each.
    """
    text = _to_text(text)
    if not text:
        return 0
    total = 0
    for word in _word_re.findall(text):
        # Normalise apostrophe variants before hyphenating.
        word = word.replace("’", "'").replace("ʼ", "'")
        for part in word.split("-"):
            if not any(ch.isalpha() for ch in part):
                continue
            hyphenated = _dic.inserted(part)
            total += (hyphenated.count("-") + 1) if hyphenated else 1
    return total

# =========================
# 2) spaCy-uk component
# =========================
def _syllables_word_uk(word: str) -> int:
    w = word.replace("’", "'").replace("ʼ", "'")
    if not any(ch in VOWELS_UK for ch in w):
        return 1 if re.search(r"[рРлЛ]", w) else 1
    total = 0
    for p in w.split("-"):
        ins = _dic.inserted(p)
        cnt = (ins.count("-") + 1) if ins else 1
        if re.search(r"(йо|ЙО|ьо|ЬО)", p):
            cnt = max(1, cnt)
        total += cnt
    return total

# Register the custom token attribute `num_syllables` (default 0). The guard
# skips registration when it already exists, e.g. when this cell is re-run.
if not Token.has_extension("num_syllables"):
    Token.set_extension("num_syllables", default=0)

@Language.component("uk_syllable_counter")
def uk_syllable_counter(doc):
    """
    spaCy pipeline component: store a syllable count on every token.

    Tokens that are alphabetic or fully match `_word_re` get the value from
    `_syllables_word_uk`; all other tokens get 0. Returns the same `doc`.
    """
    for token in doc:
        is_word = token.is_alpha or _word_re.fullmatch(token.text)
        token._.num_syllables = _syllables_word_uk(token.text) if is_word else 0
    return doc

def _build_nlp_uk():
    try:
        nlp = spacy.load("uk_core_news_sm")
    except Exception:
        nlp = spacy.blank("uk")
    if "uk_syllable_counter" not in nlp.pipe_names:
        nlp.add_pipe("uk_syllable_counter", last=True)
    return nlp

nlp_uk = _build_nlp_uk()

def count_syll_spacy(text: str) -> int:
    """
    Total of the per-token `num_syllables` values after running `text`
    through `nlp_uk`. Non-string or empty input gives 0.
    """
    cleaned = _to_text(text)
    if cleaned:
        return sum(tok._.num_syllables for tok in nlp_uk(cleaned))
    return 0

# =========================
# 3) NLTK SSP (укр.)
# =========================
# Sonority hierarchy for NLTK's SyllableTokenizer, most sonorous level first.
# Every letter that can occur in a word must be listed: NLTK assigns any
# unlisted character the sonority of a vowel (it only emits a UserWarning),
# which would add spurious syllables.
UKR_SONORITY = [
    "аеєиіїоуюя",   # vowels
    "йв",           # glides / approximants
    "рл",           # liquids
    "мн",           # nasals
    "жзшщсхгф",     # fricatives (г ≈ [ɦ])
    "бпдткґчць"     # stops/affricates; the soft sign "ь" is not a sound of its
                    # own, so it sits on the lowest level and never forms a nucleus
]
SSP_UK = SyllableTokenizer(lang="uk", sonority_hierarchy=UKR_SONORITY)
# Fold apostrophe and dash variants into plain ' and -.
_APOS_DASH_MAP = str.maketrans({"’": "'", "ʼ": "'", "–": "-", "—": "-"})
# A word: letters, optionally one apostrophe-joined part, then any number of
# hyphen-joined parts.
WORD_RE_UK = re.compile(
    r"[А-ЩЬЮЯІЇЄҐа-щьюяіїєґ]+(?:'[А-ЩЬЮЯІЇЄҐа-щьюяіїєґ]+)?(?:-[А-ЩЬЮЯІЇЄҐа-щьюяіїєґ]+)*"
)

def count_syll_nltk(text: str) -> int:
    """
    Count syllables with the sonority-sequencing tokenizer `SSP_UK`.

    Apostrophe/dash variants are normalised, words are extracted with
    `WORD_RE_UK`, split on hyphens, stripped of apostrophes, and each
    non-empty part contributes as many syllables as the tokenizer returns.
    Non-string or empty input gives 0.
    """
    text = _to_text(text)
    if not text:
        return 0
    # Lower-case first: NLTK's tokenizer checks vowels against the lower-case
    # vowel level of the hierarchy, so capitalised vowels would not be
    # recognised as syllable nuclei.
    text = text.translate(_APOS_DASH_MAP).lower()
    total = 0
    for word in WORD_RE_UK.findall(text):
        for part in word.split("-"):
            stem = part.replace("'", "")
            if stem:
                total += len(SSP_UK.tokenize(stem))
    return total

# =========================
# 4A) «Praat-like» nuclei using Parselmouth (clear Python)
# =========================
def count_syllables_praat_like(
    audio_path: str,
    min_pitch_hz: float = MIN_PITCH_HZ,
    time_step_sec: float = INTENSITY_STEP,
    min_separation_sec: float = MIN_SYLLABLE_SEP_SEC,
    prominence_db: float = PEAK_PROMINENCE_DB
) -> int:
    """
    Estimate the number of syllables in an audio file from intensity peaks.

    A frame is a candidate when its intensity is strictly higher than both
    neighbours, exceeds the minimum within ±3 frames by at least
    `prominence_db`, and the pitch track has a usable (non-zero, non-NaN)
    value at that time. Candidates closer than `min_separation_sec` to the
    last kept peak are merged into it, keeping the louder of the two.

    Args:
        audio_path: file loaded with parselmouth.Sound.
        min_pitch_hz: minimum pitch for the intensity analysis and pitch floor
            for the pitch analysis.
        time_step_sec: time step of both analyses.
        min_separation_sec: minimum spacing between two kept peaks.
        prominence_db: required height of a peak above its local minimum.

    Returns:
        Number of kept peaks; 0 when there are fewer than 3 intensity frames
        or no candidate qualifies.
    """
    snd = parselmouth.Sound(audio_path)
    intensity = snd.to_intensity(minimum_pitch=min_pitch_hz, time_step=time_step_sec)
    pitch = snd.to_pitch(time_step=time_step_sec, pitch_floor=min_pitch_hz)

    times = intensity.xs()
    vals = np.asarray(intensity.values).flatten()
    n = len(vals)
    if n < 3:
        return 0

    # Half-width (in frames) of the neighbourhood used for the local minimum;
    # the loop skips the first and last `window` frames.
    window = 3
    cand = []
    for i in range(window, n - window):
        v = vals[i]
        # strict local maximum
        if vals[i-1] < v > vals[i+1]:
            local_min = np.min(vals[i - window:i + window + 1])
            if (v - local_min) >= prominence_db:
                t = times[i]
                f0 = pitch.get_value_at_time(t)
                # keep the peak only where a pitch value is available
                if f0 and not math.isnan(f0):
                    cand.append((t, v))

    if not cand:
        return 0

    # Process candidates in time order.
    cand.sort()
    kept = []
    for t, vv in cand:
        if not kept or (t - kept[-1][0]) >= min_separation_sec:
            kept.append((t, vv))
        else:
            # Too close to the previous kept peak: keep whichever is louder.
            if vv > kept[-1][1]:
                kept[-1] = (t, vv)
    return len(kept)

# =========================
# 4B) Original script Syllable Nuclei v3 (Praat)
# =========================
def count_syllables_praat_original(
    audio_path: str,
    praat_script_path: str,
    *,
    detect_filled_pauses: bool = False,
    language: str = "English",
    silence_db: float = -25.0,
    min_dip_db: float = 2.0,
    min_pause_s: float = 0.4,
) -> int:
    """
    Run the Praat script at `praat_script_path` on one audio file and return
    the syllable count it reports.

    The script is executed through Parselmouth with the loaded sound and the
    keyword arguments below passed positionally as script arguments. The last
    returned object is treated as a Praat Table and its "nsyll" column is read
    from the first row. If no "nsyll" column exists, a "voicedcount" column is
    used instead.

    Args:
        audio_path: audio file to analyse.
        praat_script_path: path of the script file (SyllableNuclei v3).
        detect_filled_pauses, language, silence_db, min_dip_db, min_pause_s:
            forwarded to the script unchanged.

    Returns:
        The syllable count as an int.

    Raises:
        KeyError: neither "nsyll" nor "voicedcount" is among the table columns.
    """
    sound = parselmouth.Sound(audio_path)
    res = parselmouth.praat.run_file(
        sound, praat_script_path,
        '', 'None',
        float(silence_db),
        float(min_dip_db),
        float(min_pause_s),
        bool(detect_filled_pauses),
        str(language),
        1.0,                 # Filled_Pause_threshold (default)
        'Table', 'OverWriteData', False
    )
    # run_file may return one object or several; the table is taken to be the last.
    table = res[-1] if isinstance(res, (list, tuple)) else res

    # Method 1: TSV -> pandas, strip() on column names
    # Any failure here is deliberately ignored so that Method 2 can still run.
    try:
        tsv = call(table, "List", False)
        df = pd.read_csv(io.StringIO(tsv), sep="\t")
        df.columns = df.columns.str.strip()
        if "nsyll" in df.columns:
            return int(df.loc[0, "nsyll"])
    except Exception:
        pass

    # Method 2: get the index of the column whose name after strip().lower() == "nsyll"
    # (Praat column indices are 1-based.)
    ncol = call(table, "Get number of columns")
    target_idx = None
    for i in range(1, ncol + 1):
        lbl = call(table, "Get column label", i)
        if str(lbl).strip().lower() == "nsyll":
            target_idx = i
            break
    if target_idx is None:
        # fallback: sometimes there's a "voicedcount"
        for i in range(1, ncol + 1):
            lbl = call(table, "Get column label", i)
            if str(lbl).strip().lower() == "voicedcount":
                target_idx = i
                break
    if target_idx is None:
        # Report the columns that were actually found to ease debugging.
        labels = [call(table, "Get column label", i) for i in range(1, ncol + 1)]
        raise KeyError(f"nsyll column not found. Columns: {labels}")

    # Read the value from row 1 and round it to the nearest integer.
    val = call(table, "Get value", 1, target_idx)
    return int(round(float(val)))

# =========================
# 5) Basic Runner
# =========================
def run_all(df: pd.DataFrame, paths: Paths) -> pd.DataFrame:
    """
    Compute every syllable count for each row of `df` and return a copy.

    Expects columns: path, transcript, audio_dur_sec.
    Adds:
      syll_pyphen, syll_spacy, syll_nltk,
      syll_praat_like, syll_praat_original,
      spm_spacy
    plus the helper column `_abs_path` (audio path resolved against
    `paths.audio_root`).

    Audio files that do not exist yield NaN for both audio methods. Errors
    raised by the original Praat script also yield NaN for that row; they are
    counted and reported once through a RuntimeWarning, so a missing or broken
    script cannot silently turn the whole column into NaN.

    Raises:
        ValueError: a required column is missing.
    """
    req = {"path", "transcript", "audio_dur_sec"}
    missing = req - set(df.columns)
    if missing:
        raise ValueError(f"DataFrame lacks required columns: {missing}")

    df = df.copy()
    df["transcript"] = df["transcript"].apply(_to_text)
    df["_abs_path"] = df["path"].apply(lambda p: resolve_audio_path(p, paths.audio_root))

    # text methods
    tqdm.pandas(desc="syll_pyphen")
    df["syll_pyphen"] = df["transcript"].progress_apply(count_syll_pyphen)

    tqdm.pandas(desc="syll_spacy")
    df["syll_spacy"] = df["transcript"].progress_apply(count_syll_spacy)

    tqdm.pandas(desc="syll_nltk")
    df["syll_nltk"] = df["transcript"].progress_apply(count_syll_nltk)

    # audio methods
    tqdm.pandas(desc="syll_praat_like")
    df["syll_praat_like"] = df["_abs_path"].progress_apply(
        lambda p: np.nan if not os.path.exists(p) else count_syllables_praat_like(p)
    )

    tqdm.pandas(desc="syll_praat_original")
    praat_script = paths.praat_script or PRAAT_SCRIPT_PATH
    praat_failures = []  # (audio path, error) for every file the script failed on

    def _safe_praat_orig(pth: str) -> float:
        """Best effort: NaN for a missing file or a failing script run."""
        try:
            if not os.path.exists(pth):
                return np.nan
            return float(count_syllables_praat_original(
                pth, praat_script,
                detect_filled_pauses=False, language="English"
            ))
        except Exception as exc:
            praat_failures.append((pth, repr(exc)))
            return np.nan
    df["syll_praat_original"] = df["_abs_path"].progress_apply(_safe_praat_orig)

    if praat_failures:
        first_path, first_err = praat_failures[0]
        # RuntimeWarning, so a filter that hides UserWarning does not hide it.
        warnings.warn(
            f"syll_praat_original: {len(praat_failures)} of {len(df)} files failed "
            f"and were set to NaN (first: {first_path}: {first_err})",
            RuntimeWarning,
        )

    # SPM for the main text method (spaCy)
    df["spm_spacy"] = df.apply(lambda r: compute_spm(r["syll_spacy"], r["audio_dur_sec"]), axis=1)

    return df

# =========================
# 6) Metrics
# =========================
def evaluate(df: pd.DataFrame) -> dict:
    """
    Agreement metrics between the syllable-counting methods.

    Returns a dict with:
      - one "MAE_<a>_vs_<b>" entry per method pair (mean absolute difference,
        rows with a missing value in either column are ignored);
      - "ICC2_(absolute_agreement)": the ICC2 value over all five methods;
      - "ICC_table": the full table returned by pingouin.

    The ICC is computed only on rows where all five methods produced a value:
    pingouin requires every target to be rated by every rater, and removing
    missing scores after melting would leave some targets with fewer raters.
    """
    out = {}
    # MAE between methods
    pairs = [
        ("MAE_spaCy_vs_PraatLike",  "syll_spacy", "syll_praat_like"),
        ("MAE_spaCy_vs_PraatOrig",  "syll_spacy", "syll_praat_original"),
        ("MAE_spaCy_vs_Pyphen",     "syll_spacy", "syll_pyphen"),
        ("MAE_spaCy_vs_NLTK",       "syll_spacy", "syll_nltk"),
        ("MAE_NLTK_vs_PraatOrig",   "syll_nltk",  "syll_praat_original"),
        ("MAE_Pyphen_vs_PraatOrig", "syll_pyphen","syll_praat_original"),
        ("MAE_PraatLike_vs_PraatOrig","syll_praat_like","syll_praat_original"),
    ]
    for name, a, b in pairs:
        out[name] = float(np.nanmean(np.abs(df[a] - df[b])))

    # ICC on 5 "raters"
    rater_cols = ["syll_spacy", "syll_pyphen", "syll_nltk", "syll_praat_like", "syll_praat_original"]
    # Listwise deletion in wide form keeps the design balanced.
    complete = df[["path"] + rater_cols].dropna()
    long = complete.melt(id_vars="path", var_name="rater", value_name="score")
    icc_table = pg.intraclass_corr(data=long, targets="path", raters="rater", ratings="score")
    # ICC2 ~ two-way random, absolute agreement, single rater
    icc2 = icc_table.loc[icc_table["Type"] == "ICC2", "ICC"].values[0]
    out["ICC2_(absolute_agreement)"] = float(icc2)
    out["ICC_table"] = icc_table
    return out

In [None]:
# Reload the saved metadata and drop rows with any missing value.
df = pd.read_csv("meta_toronto.csv").dropna()
df.head()

In [None]:
results = run_all(df, Paths(audio_root=AUDIO_ROOT, praat_script=PRAAT_SCRIPT_PATH))

In [None]:
metrics = evaluate(results)

In [None]:
print({k: v for k, v in metrics.items() if k != "ICC_table"})

### Find errors

In [None]:
import numpy as np
import pandas as pd

df = results.copy()

praat_counts = df["syll_praat_original"]

# 1) Basic errors: absolute difference of each text method to the Praat script
for tag, column in (("spaCy", "syll_spacy"), ("pyphen", "syll_pyphen"), ("nltk", "syll_nltk")):
    df[f"err_abs_{tag}_praat"] = (df[column] - praat_counts).abs()

# 2) Normalization (denominator floored at 1 to avoid dividing by zero)
df["err_rel_spaCy_praat"] = df["err_abs_spaCy_praat"] / praat_counts.clip(lower=1)

# 3) SPM for audio
audio_minutes = df["audio_dur_sec"] / 60.0
df["spm_praat"] = praat_counts / audio_minutes

# 4) ΔSPM
df["d_spm"] = (df["spm_spacy"] - df["spm_praat"]).abs()

# 5) Tail by absolute error (top 100)
tail_abs = df.sort_values("err_abs_spaCy_praat", ascending=False).head(100)

# 6) Tail by ΔSPM (top 100)
tail_spm = df.sort_values("d_spm", ascending=False).head(100)

# Columns written to both tail reports.
cols = ["path", "transcript", "audio_dur_sec",
        "syll_spacy", "syll_praat_original", "err_abs_spaCy_praat", "spm_spacy", "spm_praat", "d_spm"]
for frame, filename in ((tail_abs, "tail_abs_top100.csv"), (tail_spm, "tail_spm_top100.csv")):
    frame[cols].to_csv(filename, index=False)

In [None]:
tail_abs.transcript.iloc[-1]

In [None]:
tail_abs

In [None]:
import numpy as np
import pandas as pd
from pingouin import intraclass_corr
from scipy.stats import pearsonr, spearmanr

df = results.copy()

# ---- utils ----
def pair_report(a, b, name_a, name_b, data=None):
    """
    Agreement statistics between two count columns.

    Args:
        a, b: column names to compare.
        name_a, name_b: display names used in the "pair" entry.
        data: frame to read from; defaults to the module-level `df`.

    Rows where either column is missing are excluded. Returns a dict with the
    number of paired rows, MAE, bias (mean of a − b), Pearson and Spearman
    correlations, and the Bland–Altman mean difference with its 95 % limits of
    agreement (mean ± 1.96·SD of the differences). With fewer than two paired
    rows the correlations are NaN instead of raising.
    """
    frame = df if data is None else data
    s1, s2 = frame[a], frame[b]
    mask = ~(s1.isna() | s2.isna())
    s1, s2 = s1[mask], s2[mask]
    n = int(mask.sum())

    d = s1 - s2
    mae = float(np.mean(np.abs(d)))
    # The bias and the Bland–Altman mean difference are the same quantity.
    md = float(np.mean(d))
    sd = float(np.std(d, ddof=1))
    loa = (md - 1.96*sd, md + 1.96*sd)

    if n >= 2:
        r, _ = pearsonr(s1, s2)
        rho, _ = spearmanr(s1, s2)
    else:
        r = rho = float("nan")

    return {
        "pair": f"{name_a} vs {name_b}",
        "n": n,
        "MAE": mae, "Bias": md,
        "Pearson_r": float(r), "Spearman_rho": float(rho),
        "BA_mean_diff": md, "BA_LOA_low": loa[0], "BA_LOA_high": loa[1],
    }

# ---- Audio inside ----
# Agreement between the two audio-based counters.
audio_rep = pair_report("syll_praat_like", "syll_praat_original", "PraatLike", "PraatOriginal")

# ---- Text methods: every pairwise comparison ----
# Each entry: (column a, column b, display name a, display name b).
text_pairs = [
    ("syll_spacy","syll_pyphen","spaCy","Pyphen"),
    ("syll_spacy","syll_nltk","spaCy","NLTK"),
    ("syll_pyphen","syll_nltk","Pyphen","NLTK"),
]
text_rep = [pair_report(*p) for p in text_pairs]

# ---- (optional) ICC within classes ----
def icc_for(cols, label, data=None):
    """
    ICC2 across the rating columns `cols`.

    Args:
        cols: names of the columns treated as raters.
        label: name stored under "group" in the result.
        data: frame to read from; defaults to the module-level `df`.

    Only rows with a value in every column of `cols` are used: the ICC needs
    each target rated by every rater, and dropping missing scores after
    melting would leave some targets with fewer raters.

    Returns {"group": label, "ICC2": value}.
    """
    frame = df if data is None else data
    complete = frame[["path", *cols]].dropna()
    long = complete.melt(id_vars="path", var_name="rater", value_name="score")
    icct = intraclass_corr(long, targets="path", raters="rater", ratings="score")
    icc2 = float(icct.loc[icct["Type"]=="ICC2","ICC"].iloc[0])
    return {"group": label, "ICC2": icc2}

# ICC2 within each family of methods (audio-based, then text-based).
icc_audio = icc_for(["syll_praat_like","syll_praat_original"], "audio")
icc_text  = icc_for(["syll_spacy","syll_pyphen","syll_nltk"], "text")

# The audio report is printed on one line; the text reports one item per line.
print("AUDIO inside-class:", audio_rep, icc_audio)
print("TEXT inside-class:", *text_rep, icc_text, sep="\n")