In [None]:
# cosmus_eval_major_label.py
from datasets import load_dataset
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd
from transformers import pipeline

# -------------------------------------------------------------------- #
# 1.  Load the COSMUS dataset (Telegram RU/UA posts)                   #
# -------------------------------------------------------------------- #
# Load the "train" split, drop every post whose gold label is "mixed",
# then keep only the text, gold-label and language columns as a DataFrame.
ds = load_dataset("YShynkarov/COSMUS", split="train") 
ds = ds.filter(lambda x: x["annotator_sentiment"] != "mixed")
df = ds.to_pandas()[["document_content", "annotator_sentiment", "language"]]


Filter:   0%|          | 0/12224 [00:00<?, ? examples/s]

In [44]:
df

Unnamed: 0,document_content,annotator_sentiment,language
0,⚡️Українська делегація відправилася на перемов...,neutral,ua
1,"Вибухи на Одещині, попередньо — ППО.",neutral,ua
2,"А что делать тем ,кто лишился своего жилья ,по...",negative,ru
3,Тогда учись быстро бегать. Для меня вопрос сло...,negative,ru
4,Добрий день,neutral,ua
...,...,...,...
11611,"У меня три окна и двери выбило , даже и не дум...",negative,ru
11612,"Краще ""повинна бути зручнішою, ніж Uber чи Boo...",negative,ua
11613,"Питання, цей сертифікат можна вже використовув...",neutral,ua
11614,На Вугледарському напрямку загинув Рома Іванен...,negative,ua


In [31]:
df['document_content'].iloc[5]

'Бажаю удачі тим, хто цього потребує.'

'Я розумію. Але ви хоч уявляєте, скільки часу на це піде? І не буде там великої суми, бо рахуватимуть тільки вартість "коробки". Опис майна "до" ніхто не робив.'

In [45]:

# Gold labels → integers.
# The mapping is guarded so the cell is idempotent: calling .map(label2id) on a
# column that is already numeric matches no key, turns every value into NaN,
# and a later dropna() would then silently empty the frame.
label2id = {"negative": -1, "neutral": 0, "positive": 1}
if not pd.api.types.is_numeric_dtype(df["annotator_sentiment"]):
    df["annotator_sentiment"] = df["annotator_sentiment"].map(label2id)


In [33]:
df = df.dropna()
df

Unnamed: 0,document_content,annotator_sentiment,language
0,⚡️Українська делегація відправилася на перемов...,0.0,ua
1,"Вибухи на Одещині, попередньо — ППО.",0.0,ua
2,"А что делать тем ,кто лишился своего жилья ,по...",-1.0,ru
3,Тогда учись быстро бегать. Для меня вопрос сло...,-1.0,ru
4,Добрий день,0.0,ua
...,...,...,...
12218,"У меня три окна и двери выбило , даже и не дум...",-1.0,ru
12219,"Краще ""повинна бути зручнішою, ніж Uber чи Boo...",-1.0,ua
12221,"Питання, цей сертифікат можна вже використовув...",0.0,ua
12222,На Вугледарському напрямку загинув Рома Іванен...,-1.0,ua


In [None]:
# -------------------------------------------------------------------- #
# 2.  Sentiment model wrapper with major_label()                       #
# -------------------------------------------------------------------- #
class SentimentAnalyzer:
    """
    Multilingual twitter-XLM-RoBERTa sentiment wrapper.

    Provides polarity_scores() (one score per class) *and* major_label()
    (the highest-scoring class as text and as an integer in {-1, 0, 1}).
    """

    # Short score key -> (text label, integer label) returned by major_label().
    _KEY_TO_LABEL = {
        "neg": ("negative", -1),
        "neu": ("neutral", 0),
        "pos": ("positive", 1),
    }

    def __init__(self):
        mdl = "cardiffnlp/twitter-xlm-roberta-base-sentiment"       # 3-way (neg/neu/pos)
        self._pipe = pipeline(
            "sentiment-analysis",
            model=mdl, tokenizer=mdl,
            top_k=None,         # returns all three scores
            truncation=True,    # cut over-long posts instead of exceeding the model's input limit
        )

    # ---------- already supplied ----------
    def polarity_scores(self, text: str):
        """
        Score `text` and return {"neg": float, "neu": float, "pos": float}.

        A class that is missing from the pipeline output is reported as 0.
        """
        res = self._pipe(text)
        # A single input normally comes back as list[list[dict]]; accept a flat
        # list[dict] as well so the wrapper does not depend on that nesting.
        rows = res[0] if res and isinstance(res[0], (list, tuple)) else res
        # Lower-case the label names: with capitalised labels ("Positive") every
        # lookup below would miss, all scores would be 0 and major_label()
        # would always answer "negative".
        scores = {str(d["label"]).lower(): d["score"] for d in rows}
        # compound = abs(scores.get("positive", 0) - scores.get("negative", 0))
        return {"neg": scores.get("negative", 0),
                "neu": scores.get("neutral",  0),
                "pos": scores.get("positive", 0),
                # "compound": compound,
                }

    # ---------- new method ----------
    def major_label(self, text: str):
        """
        Returns (text_label, int_label) where int_label ∈ {−1,0,1}.

        Ties are resolved in the order neg, neu, pos (max() keeps the first
        maximal key).
        """
        sc = self.polarity_scores(text)
        best = max(("neg", "neu", "pos"), key=sc.get)
        return self._KEY_TO_LABEL[best]

analyzer = SentimentAnalyzer()

Device set to use mps:0


In [None]:
from transformers import RobertaConfig, RobertaForSequenceClassification, RobertaTokenizer, pipeline
from huggingface_hub import hf_hub_download
from safetensors.torch import load_file

class UkrSentimentAnalyzer:
    """
    Ukrainian sentiment analysis model based on YShynkarov/ukr-roberta-cosmus-sentiment.
    Provides polarity_scores() and major_label().
    """
    # Generic class ids produced by a 4-label RobertaConfig -> readable names.
    map_labels = {
                'LABEL_0': 'mixed',
                'LABEL_1': 'negative',
                'LABEL_2': 'neutral',
                'LABEL_3': 'positive',
            }
    # Readable name -> numeric label. "mixed" shares 0.0 with "neutral", so
    # major_label() only ever yields three distinct numeric values.
    int_label_map = {
        "negative": -1.0,
        "neutral": 0.0,
        "positive": 1.0,
        "mixed": 0.0
    }

    def __init__(self):
        repo_id = "YShynkarov/ukr-roberta-cosmus-sentiment"
        safetensor = hf_hub_download(repo_id=repo_id,
                                     filename="ukrroberta_cosmus_sentiment.safetensors")

        # Architecture and tokenizer come from the base checkpoint; the
        # fine-tuned weights are loaded from the downloaded safetensors file.
        config = RobertaConfig.from_pretrained("youscan/ukr-roberta-base", num_labels=4)
        tokenizer = RobertaTokenizer.from_pretrained("youscan/ukr-roberta-base")

        model = RobertaForSequenceClassification(config)
        state_dict = load_file(safetensor)
        model.load_state_dict(state_dict)
        model.eval()
        self._pipe = pipeline(
            "text-classification",
            model=model,
            tokenizer=tokenizer,
            device=-1,
            top_k=None,          # all class scores (replaces deprecated return_all_scores=True)
            truncation=True,
        )

    def polarity_scores(self, text: str) -> dict:
        """
        Returns dict (keys in map_labels order):
          {
            "mixed":    float_score,
            "negative": float_score,
            "neutral":  float_score,
            "positive": float_score
          }
        """
        results = self._pipe(text)
        # A single input normally comes back as list[list[dict]]; accept a flat
        # list[dict] too so the wrapper does not depend on that nesting.
        rows = results[0] if results and isinstance(results[0], (list, tuple)) else results
        scores = {self.map_labels[item["label"]]: item["score"] for item in rows}
        # top_k=None may return the classes sorted by score; restore a fixed key order.
        return {name: scores[name] for name in self.map_labels.values() if name in scores}

    def major_label(self, text: str) -> tuple[str, float]:
        """
        Returns (text_label, numeric_label) for the highest-scoring class.

        numeric_label is -1.0 for negative, 0.0 for neutral, 1.0 for positive
        and 0.0 for mixed (see int_label_map).
        """
        scores = self.polarity_scores(text)
        best = max(scores, key=scores.get)
        return best, self.int_label_map[best]

analyzer = UkrSentimentAnalyzer()
print(analyzer.polarity_scores("Привіт! Все просто чудово"))
# → {'negative': 0.01, 'neutral': 0.05, 'positive': 0.90, 'mixed': 0.04}

print(analyzer.major_label("Привіт! Все просто чудово"))
# → ('positive', 1)

Device set to use cpu


{'mixed': 0.011259634047746658, 'negative': 0.0018209181725978851, 'neutral': 0.006787061225622892, 'positive': 0.9801324009895325}
('positive', 1.0)




In [54]:
# -------------------------------------------------------------------- #
# 3.  Inference → sentiment_pred column                                #
# -------------------------------------------------------------------- #
from tqdm import tqdm      # tqdm.pandas() below registers .progress_apply on pandas objects
tqdm.pandas()

# Run the classifier on every post and store only the numeric label.
df["sentiment_pred_1epoch"] = df["document_content"].progress_apply(
    lambda txt: analyzer.major_label(txt)[1]     # keep numeric only
)

100%|██████████| 11616/11616 [12:04<00:00, 16.04it/s]


In [55]:
df=df.dropna(subset=["annotator_sentiment", 'sentiment_pred_1epoch'])
df



Unnamed: 0,document_content,annotator_sentiment,language,sentiment_pred_1epoch
0,⚡️Українська делегація відправилася на перемов...,0,ua,1.0
1,"Вибухи на Одещині, попередньо — ППО.",0,ua,0.0
2,"А что делать тем ,кто лишился своего жилья ,по...",-1,ru,-1.0
3,Тогда учись быстро бегать. Для меня вопрос сло...,-1,ru,-1.0
4,Добрий день,0,ua,1.0
...,...,...,...,...
11611,"У меня три окна и двери выбило , даже и не дум...",-1,ru,-1.0
11612,"Краще ""повинна бути зручнішою, ніж Uber чи Boo...",-1,ua,1.0
11613,"Питання, цей сертифікат можна вже використовув...",0,ua,0.0
11614,На Вугледарському напрямку загинув Рома Іванен...,-1,ua,1.0


In [56]:
df["sentiment_pred_1epoch"].value_counts()

sentiment_pred_1epoch
 0.0    5764
-1.0    3345
 1.0    2507
Name: count, dtype: int64

In [None]:
# -------------------------------------------------------------------- #
# 4.  Evaluation                                                       #
# -------------------------------------------------------------------- #
acc = accuracy_score(df["annotator_sentiment"], df["sentiment_pred_1epoch"])
print(f"Accuracy: {acc:.3%}")

# Pin the label order explicitly: target_names are matched to classes by
# position, so without `labels` the report depends on which classes happen to
# occur and raises when fewer than three are present.
print(classification_report(
      df["annotator_sentiment"], df["sentiment_pred_1epoch"],
      labels=[-1, 0, 1],
      target_names=["negative (−1)", "neutral (0)", "positive (+1)"]))


Accuracy: 76.799%
               precision    recall  f1-score   support

negative (−1)       0.90      0.66      0.76      4541
  neutral (0)       0.71      0.87      0.78      4702
positive (+1)       0.73      0.77      0.75      2373

     accuracy                           0.77     11616
    macro avg       0.78      0.77      0.76     11616
 weighted avg       0.79      0.77      0.77     11616



# Ukrainian → English translation → sentiment with VADER (the default sentiment analyzer in OpenWillis)

In [None]:
# uk2en_like_space.py
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Checkpoint used for Ukrainian -> English translation.
MODEL_NAME = "Yehor/kulyk-uk-en"
# Prefer the GPU when one is visible, otherwise run on CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
# bfloat16 is requested regardless of the selected device.
torch_dtype = torch.bfloat16

# No specific revision is pinned; None is passed through to from_pretrained.
REVISION = None

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, revision=REVISION)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map=device,
    torch_dtype=torch_dtype,
    revision=REVISION,
)
# Switch to inference mode (disables dropout and similar training-only layers).
model.eval()

def translate_like_space(text: str) -> str:
    """
    Translate `text` to English with the module-level `model` and `tokenizer`.

    The text is wrapped in a single-turn chat prompt and decoded greedily
    (at most 2048 new tokens, repetition penalty 1.05). Only the newly
    generated tokens are decoded and returned, stripped of surrounding
    whitespace.

    Non-string values (for example NaN or None cells when the function is
    applied over a DataFrame column) and blank strings return "" without
    calling the model; previously a NaN cell raised TypeError on the string
    concatenation below.
    """
    if not isinstance(text, str) or not text.strip():
        return ""

    prompt = "Translate the text to English:\n" + text
    input_ids = tokenizer.apply_chat_template(
        [{"role": "user", "content": prompt}],
        add_generation_prompt=True,
        return_tensors="pt",
        tokenize=True,
    ).to(model.device)

    with torch.inference_mode():
        output = model.generate(
            input_ids,
            max_new_tokens=2048,
            do_sample=False,              # greedy
            repetition_penalty=1.05,
        )

    # Drop the prompt tokens; keep only what the model generated.
    gen = output[:, input_ids.shape[1]:]
    return tokenizer.batch_decode(gen, skip_special_tokens=True)[0].strip()


In [None]:

uk = "Над Україною збито ракету та 7 із 8 «Шахедів»"
print(translate_like_space(uk))

In [None]:
from tqdm.auto import tqdm
tqdm.pandas()

df['translated_text'] = df['document_content'].progress_apply(translate_like_space)

In [None]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

analyzer = SentimentIntensityAnalyzer()

def vader_label_by_max(text: str) -> float:
    """
    Label `text` with the largest of VADER's 'neg' / 'neu' / 'pos' scores.

    Returns -1.0 (negative), 0.0 (neutral) or 1.0 (positive). Ties resolve in
    the order neg, neu, pos because max() keeps the first maximal key.

    NOTE(review): the 'compound' score is ignored here. VADER's documentation
    describes thresholding 'compound' as the usual way to classify a text —
    confirm that arg-max over the three scores is what is intended.
    """
    scores = analyzer.polarity_scores(text)
    top = max(('neg', 'neu', 'pos'), key=lambda k: scores[k])
    return {'neg': -1.0, 'neu': 0.0, 'pos': 1.0}[top]

In [None]:

df['vader_analysis'] = df['translated_text'].apply(vader_label_by_max)


In [None]:
df['vader_analysis'].value_counts()

In [None]:
df["annotator_sentiment"].value_counts()

In [None]:
# -------------------------------------------------------------------- #
# 4.  Evaluation                                                       #
# -------------------------------------------------------------------- #
acc = accuracy_score(df["annotator_sentiment"], df["vader_analysis"])
print(f"Accuracy: {acc:.3%}")

# Pin the label order explicitly: target_names are matched to classes by
# position, so without `labels` the report depends on which classes happen to
# occur and raises when fewer than three are present.
print(classification_report(
      df["annotator_sentiment"], df["vader_analysis"],
      labels=[-1, 0, 1],
      target_names=["negative (−1)", "neutral (0)", "positive (+1)"]))


# Number of syllables

In [None]:
!pip3 install spacy pyphen praat-parselmouth pingouin pandas numpy tqdm jsonlines pyctcdecode

In [None]:
!gdown --fuzzy "https://drive.google.com/file/d/1j9d91QqE7_WnOnmEmidtOG55tpmxQUeJ/view"

In [None]:
!unzip /content/dataset.zip

In [None]:
import jsonlines
from glob import glob
from tqdm import tqdm
import pandas as pd
from torchaudio import info as audiofile_info

tqdm.pandas(desc="Audio data processing")

# Read the label file record by record. `labels` is overwritten on every
# iteration, so after the loop it holds only the LAST record of the file.
# NOTE(review): presumably labels.jsonl is a single JSON object mapping
# "dataset/<dir>/<file>.wav" to its transcript — confirm; with several
# records, all but the last are discarded here.
with jsonlines.open("/content/labels.jsonl", 'r') as reader:
    for line in reader:
        labels = line

# NOTE(review): the pattern contains no "**", so recursive=True should have no
# effect; only files directly inside toronto_* directories are matched.
all_audio_files = glob("toronto_*/*.wav", recursive=True)

toronto_dataset = pd.DataFrame({
    "path": all_audio_files
})

# Look up each transcript by "dataset/<parent dir>/<file name>".
# The path is split on "/", so this assumes POSIX-style separators.
toronto_dataset["transcript"] = toronto_dataset["path"].progress_apply(
    lambda x: labels["dataset/" + "/".join(x.split("/")[-2:])]
)

# Transcript length in characters.
toronto_dataset["transcript_len"] = toronto_dataset["transcript"].progress_apply(len)

def get_audio_dur_sec(path):
    """Return the duration in seconds of the audio file at `path` (frame count / sample rate)."""
    meta = audiofile_info(path)
    n_frames, rate = meta.num_frames, meta.sample_rate
    return n_frames / rate

toronto_dataset["audio_dur_sec"] = toronto_dataset["path"].progress_apply(get_audio_dur_sec)
import re

def extract_numbers(path):
    """
    Return (first, last) integer found in `path`, used as a numeric sort key.

    Raises IndexError when the path contains no digits.
    """
    digit_runs = [int(match.group()) for match in re.finditer(r"\d+", path)]
    return digit_runs[0], digit_runs[-1]

toronto_dataset = toronto_dataset.sort_values(by="path", key=lambda col: col.map(extract_numbers)).reset_index(drop=True)
toronto_dataset.to_csv("meta_toronto.csv", index=False)

Ukrainian syllable counting & evaluation pipeline
- syll_spacy         : spaCy-uk component (rule-based over Pyphen + patches)
- syll_pyphen        : Pyphen hyphenation baseline
- syll_nltk          : NLTK SyllableTokenizer with custom sonority hierarchy
- syll_praat_like    : "pure Python" Parselmouth (intensity peaks + voicing)
- syll_praat_original: original Praat script SyllableNucleiv3.praat via Parselmouth
- Metrics            : MAE vs references + ICC (absolute agreement)

In [None]:
import os, re, math, io, warnings
from dataclasses import dataclass
from typing import Optional

import numpy as np
import pandas as pd
from tqdm import tqdm

# ---------------- Text libs ----------------
import spacy
from spacy.language import Language
from spacy.tokens import Token
import pyphen

# ---------------- Audio (Praat) ------------
import parselmouth
from parselmouth.praat import call

# ---------------- Stats --------------------
import pingouin as pg

# ---------------- NLTK ---------------------
import nltk
from nltk.tokenize import SyllableTokenizer

warnings.filterwarnings("ignore", category=UserWarning)

# =========================
# Config
# =========================
AUDIO_ROOT = "/content"
PRAAT_SCRIPT_PATH = "SyllableNucleiv3.praat"

MIN_PITCH_HZ = 75.0
MIN_SYLLABLE_SEP_SEC = 0.10
PEAK_PROMINENCE_DB = 2.0
INTENSITY_STEP = 0.01
VOWELS_UK = set("аеєиіїоуюяАЕЄИІЇОУЮЯ")

@dataclass
class Paths:
    # Filesystem locations handed to run_all(); both fields are optional.
    # Base directory that run_all() joins onto relative audio paths.
    audio_root: Optional[str] = None
    # Praat script to run; run_all() falls back to PRAAT_SCRIPT_PATH when unset.
    praat_script: Optional[str] = None

# =========================
# helpers
# =========================
def _to_text(x) -> str:
    return x if isinstance(x, str) else ""

def resolve_audio_path(p: str, audio_root: Optional[str]) -> str:
    """
    Turn a dataset path into the path used to open the audio file.

    Absolute paths are returned as-is; relative paths are joined onto
    `audio_root` (an empty or missing root leaves the path unchanged).
    """
    if os.path.isabs(p):
        return p
    base_dir = audio_root if audio_root else ""
    return os.path.join(base_dir, p)

def compute_spm(num_syll: float, dur_sec: float) -> float:
    """
    Syllables per minute: `num_syll` divided by the duration in minutes.

    Returns NaN when the duration is missing, zero or negative.
    """
    if dur_sec and not (dur_sec <= 0):
        minutes = dur_sec / 60.0
        return num_syll / minutes
    return np.nan

# =========================
# 1) Pyphen-only (текст)
# =========================
_dic = pyphen.Pyphen(lang="uk_UA")
_word_re = re.compile(r"[А-ЩЬЮЯІЇЄҐа-щьюяіїєґʼ'’-]+", re.U)

def count_syll_pyphen(text: str) -> int:
    """
    Count syllables in `text` with Pyphen hyphenation.

    Every word matched by `_word_re` is split on hyphens, and each fragment
    contributes (number of hyphenation points + 1). Non-string or empty input
    yields 0.
    """
    text = _to_text(text)
    if not text:
        return 0
    total = 0
    for w in _word_re.findall(text):
        # Normalise typographic apostrophes to the ASCII one.
        w = w.replace("’", "'").replace("ʼ", "'")
        for p in w.split("-"):
            # _word_re also matches bare hyphens and apostrophes, and splitting
            # on "-" yields empty fragments for leading/trailing/standalone
            # hyphens. Such fragments contain no letters, so they must not be
            # counted as a syllable (a lone "-" used to add 2).
            if not p.strip("'"):
                continue
            ins = _dic.inserted(p)
            total += (ins.count("-") + 1) if ins else 1
    return total

# =========================
# 2) spaCy-uk component
# =========================
def _syllables_word_uk(word: str) -> int:
    """
    Syllable count for a single token.

    - Tokens without any letter (a bare hyphen or apostrophe) count as 0.
    - Tokens with letters but no Ukrainian vowel count as 1.
    - Otherwise the token is split on hyphens and each non-empty fragment
      contributes (Pyphen hyphenation points + 1).
    """
    w = word.replace("’", "'").replace("ʼ", "'")
    # Punctuation-only tokens reach this function through _word_re.fullmatch
    # in the spaCy component; they carry no syllable.
    if not any(ch.isalpha() for ch in w):
        return 0
    # The original returned 1 here whether or not the word contained р/л
    # ("1 if ... else 1"), so the check is dropped and the value kept.
    if not any(ch in VOWELS_UK for ch in w):
        return 1
    total = 0
    for p in w.split("-"):
        # Skip empty fragments produced by leading/trailing hyphens.
        if not p.strip("'"):
            continue
        ins = _dic.inserted(p)
        # The former "йо/ьо" adjustment was max(1, cnt), a no-op since cnt >= 1.
        total += (ins.count("-") + 1) if ins else 1
    return total

if not Token.has_extension("num_syllables"):
    Token.set_extension("num_syllables", default=0)

@Language.component("uk_syllable_counter")
def uk_syllable_counter(doc):
    """
    spaCy pipeline component: fill `token._.num_syllables` for every token.

    Alphabetic tokens and tokens fully matched by `_word_re` get the count
    from `_syllables_word_uk`; every other token gets 0.
    """
    for tok in doc:
        is_word = tok.is_alpha or _word_re.fullmatch(tok.text)
        tok._.num_syllables = _syllables_word_uk(tok.text) if is_word else 0
    return doc

def _build_nlp_uk():
    """
    Build the spaCy pipeline used for syllable counting.

    Tries the "uk_core_news_sm" model first and falls back to a blank "uk"
    pipeline on any error, then makes sure the "uk_syllable_counter"
    component is the last step.
    """
    component = "uk_syllable_counter"
    try:
        pipeline_uk = spacy.load("uk_core_news_sm")
    except Exception:
        pipeline_uk = spacy.blank("uk")
    if component in pipeline_uk.pipe_names:
        return pipeline_uk
    pipeline_uk.add_pipe(component, last=True)
    return pipeline_uk

nlp_uk = _build_nlp_uk()

def count_syll_spacy(text: str) -> int:
    """Sum `num_syllables` over all tokens of `text`; non-string or empty input gives 0."""
    cleaned = _to_text(text)
    if cleaned:
        return sum(tok._.num_syllables for tok in nlp_uk(cleaned))
    return 0

# =========================
# 3) NLTK SSP (укр.)
# =========================
UKR_SONORITY = [
    "аеєиіїоуюя",   # vowels
    "йв",           # glides / approximants
    "рл",           # liquids
    "мн",           # nasals
    "жзшщсхгф",     # fricatives (г ≈ [ɦ])
    "бпдткґчц"      # stops/affricates
]
SSP_UK = SyllableTokenizer(lang="uk", sonority_hierarchy=UKR_SONORITY)
_APOS_DASH_MAP = str.maketrans({"’": "'", "ʼ": "'", "–": "-", "—": "-"})
WORD_RE_UK = re.compile(
    r"[А-ЩЬЮЯІЇЄҐа-щьюяіїєґ]+(?:'[А-ЩЬЮЯІЇЄҐа-щьюяіїєґ]+)?(?:-[А-ЩЬЮЯІЇЄҐа-щьюяіїєґ]+)*"
)

def count_syll_nltk(text: str) -> int:
    """
    Count syllables in `text` with NLTK's sonority-sequencing tokenizer.

    Apostrophes and dashes are normalised, words are extracted with
    `WORD_RE_UK`, split on hyphens, and every fragment is syllabified with
    `SSP_UK`. Non-string or empty input yields 0.
    """
    text = _to_text(text)
    if not text:
        return 0
    text = text.translate(_APOS_DASH_MAP)
    total = 0
    for w in WORD_RE_UK.findall(text):
        for part in w.split("-"):
            # Remove the apostrophe and the soft sign before syllabifying.
            # "ь" is matched by WORD_RE_UK but is absent from UKR_SONORITY;
            # NLTK treats characters missing from the hierarchy as vowels
            # (emitting a UserWarning, which this notebook silences), so a
            # word such as "батько" would gain a spurious syllable.
            p = part.replace("'", "").replace("ь", "").replace("Ь", "")
            if p:
                total += len(SSP_UK.tokenize(p))
    return total

# =========================
# 4A) «Praat-like» nuclei using Parselmouth (clear Python)
# =========================
def count_syllables_praat_like(
    audio_path: str,
    min_pitch_hz: float = MIN_PITCH_HZ,
    time_step_sec: float = INTENSITY_STEP,
    min_separation_sec: float = MIN_SYLLABLE_SEP_SEC,
    prominence_db: float = PEAK_PROMINENCE_DB
) -> int:
    """
    Estimate the number of syllable nuclei in an audio file from its intensity contour.

    A frame is a candidate nucleus when it is a strict local maximum of the
    intensity contour, rises at least `prominence_db` above the minimum of the
    surrounding +/-3 frames, and the pitch track has a value at that time.
    Candidates closer than `min_separation_sec` to the previously kept one are
    merged, keeping the higher-intensity peak.

    Args:
      audio_path: path of the audio file to analyse.
      min_pitch_hz: passed as the minimum pitch for the intensity analysis and
        as the pitch floor for the pitch analysis.
      time_step_sec: time step for both the intensity and the pitch analysis.
      min_separation_sec: minimum time between two kept peaks.
      prominence_db: minimum rise of a peak above its local minimum.

    Returns:
      Number of kept peaks; 0 when the contour has fewer than 3 frames or no
      candidate survives.
    """
    snd = parselmouth.Sound(audio_path)
    intensity = snd.to_intensity(minimum_pitch=min_pitch_hz, time_step=time_step_sec)
    pitch = snd.to_pitch(time_step=time_step_sec, pitch_floor=min_pitch_hz)

    times = intensity.xs()
    vals = np.asarray(intensity.values).flatten()
    n = len(vals)
    if n < 3:
        return 0

    # Half-width (in frames) of the neighbourhood used for the prominence check.
    # The loop skips the first and last `window` frames, so contours shorter
    # than 2*window + 1 frames produce no candidates.
    window = 3
    cand = []
    for i in range(window, n - window):
        v = vals[i]
        # Strict local maximum relative to both direct neighbours.
        if vals[i-1] < v > vals[i+1]:
            local_min = np.min(vals[i - window:i + window + 1])
            if (v - local_min) >= prominence_db:
                t = times[i]
                f0 = pitch.get_value_at_time(t)
                # Keep the peak only when a pitch value exists at that time
                # (value is truthy and not NaN).
                if f0 and not math.isnan(f0):
                    cand.append((t, v))

    if not cand:
        return 0

    # Walk the candidates in time order and enforce the minimum separation.
    cand.sort()
    kept = []
    for t, vv in cand:
        if not kept or (t - kept[-1][0]) >= min_separation_sec:
            kept.append((t, vv))
        else:
            # Too close to the previous kept peak: keep whichever is louder.
            # Replacing the entry also moves the reference time forward.
            if vv > kept[-1][1]:
                kept[-1] = (t, vv)
    return len(kept)

# =========================
# 4B) Original script Syllable Nuclei v3 (Praat)
# =========================
def count_syllables_praat_original(
    audio_path: str,
    praat_script_path: str,
    *,
    detect_filled_pauses: bool = False,
    language: str = "English",
    silence_db: float = -25.0,
    min_dip_db: float = 2.0,
    min_pause_s: float = 0.4,
) -> int:
    """
    Call SyllableNucleiv3.praat and return the syllable count (nsyll).

    Requires the v3 script file. The script is asked for a Table result, from
    which the "nsyll" column is read; if no such column exists, a
    "voicedcount" column is used instead.

    Args:
      audio_path: audio file loaded as the Sound passed to the script.
      praat_script_path: path of the Praat script to run.
      detect_filled_pauses, language, silence_db, min_dip_db, min_pause_s:
        forwarded positionally to the script.
        NOTE(review): the positional order must match the script's form — confirm
        against the script version in use.

    Returns:
      The value of the selected column in the first table row, as int.

    Raises:
      KeyError: if neither an "nsyll" nor a "voicedcount" column is found.
    """
    sound = parselmouth.Sound(audio_path)
    res = parselmouth.praat.run_file(
        sound, praat_script_path,
        '', 'None',
        float(silence_db),
        float(min_dip_db),
        float(min_pause_s),
        bool(detect_filled_pauses),
        str(language),
        1.0,                 # Filled_Pause_threshold (default)
        'Table', 'OverWriteData', False
    )
    # run_file may return several objects; the last one is taken as the table.
    table = res[-1] if isinstance(res, (list, tuple)) else res

    # Method 1: TSV -> pandas, strip() on column names
    # Any failure here is swallowed on purpose so Method 2 can take over.
    try:
        tsv = call(table, "List", False)
        df = pd.read_csv(io.StringIO(tsv), sep="\t")
        df.columns = df.columns.str.strip()
        if "nsyll" in df.columns:
            return int(df.loc[0, "nsyll"])
    except Exception:
        pass

    # Method 2: get the index of the column whose name after strip().lower() == "nsyll"
    # Column indices passed to these Praat commands start at 1.
    ncol = call(table, "Get number of columns")
    target_idx = None
    for i in range(1, ncol + 1):
        lbl = call(table, "Get column label", i)
        if str(lbl).strip().lower() == "nsyll":
            target_idx = i
            break
    if target_idx is None:
        # fallback: sometimes there's a "voicedcount"
        # NOTE(review): the caller cannot tell which column was used — confirm
        # that "voicedcount" is an acceptable substitute for "nsyll".
        for i in range(1, ncol + 1):
            lbl = call(table, "Get column label", i)
            if str(lbl).strip().lower() == "voicedcount":
                target_idx = i
                break
    if target_idx is None:
        labels = [call(table, "Get column label", i) for i in range(1, ncol + 1)]
        raise KeyError(f"nsyll column not found. Columns: {labels}")

    val = call(table, "Get value", 1, target_idx)
    return int(round(float(val)))

# =========================
# 5) Basic Runner
# =========================
def run_all(df: pd.DataFrame, paths: Paths) -> pd.DataFrame:
    """
    Run every syllable counter over the dataset and return an annotated copy.

    Expects in df: path, transcript, audio_dur_sec
    Adds:
      syll_pyphen, syll_spacy, syll_nltk,
      syll_praat_like, syll_praat_original,
      spm_spacy
    (plus the helper column _abs_path holding the resolved audio path).

    Raises:
      ValueError: if any required column is missing.
    """
    req = {"path", "transcript", "audio_dur_sec"}
    missing = req - set(df.columns)
    if missing:
        raise ValueError(f"DataFrame lacks required columns: {missing}")

    # Work on a copy so the caller's frame is left untouched.
    df = df.copy()
    df["transcript"] = df["transcript"].apply(_to_text)
    df["_abs_path"] = df["path"].apply(lambda p: resolve_audio_path(p, paths.audio_root))

    # text methods
    # tqdm.pandas(desc=...) is called again before each progress_apply so that
    # every progress bar carries its own label.
    tqdm.pandas(desc="syll_pyphen")
    df["syll_pyphen"] = df["transcript"].progress_apply(count_syll_pyphen)

    tqdm.pandas(desc="syll_spacy")
    df["syll_spacy"] = df["transcript"].progress_apply(count_syll_spacy)

    tqdm.pandas(desc="syll_nltk")
    df["syll_nltk"] = df["transcript"].progress_apply(count_syll_nltk)

    # audio methods
    # Missing files yield NaN. Unlike the Praat-script call below, this call is
    # not wrapped in try/except, so an analysis error propagates to the caller.
    tqdm.pandas(desc="syll_praat_like")
    df["syll_praat_like"] = df["_abs_path"].progress_apply(
        lambda p: np.nan if not os.path.exists(p) else count_syllables_praat_like(p)
    )

    tqdm.pandas(desc="syll_praat_original")
    def _safe_praat_orig(pth: str) -> float:
        # Best-effort wrapper: a missing file or any script failure becomes NaN.
        try:
            if not os.path.exists(pth):
                return np.nan
            return float(count_syllables_praat_original(
                pth, paths.praat_script or PRAAT_SCRIPT_PATH,
                detect_filled_pauses=False, language="English"
            ))
        except Exception:
            return np.nan
    df["syll_praat_original"] = df["_abs_path"].progress_apply(_safe_praat_orig)

    # SPM for the main text method (spaCy)
    df["spm_spacy"] = df.apply(lambda r: compute_spm(r["syll_spacy"], r["audio_dur_sec"]), axis=1)

    return df

# =========================
# 6) Metrics
# =========================
def evaluate(df: pd.DataFrame) -> dict:
    """
    Agreement metrics between the syllable-count columns produced by run_all().

    Returns a dict with:
      - one "MAE_<a>_vs_<b>" entry per method pair (mean absolute difference,
        ignoring rows where either value is NaN);
      - "ICC2_(absolute_agreement)": the ICC2 value over all five methods;
      - "ICC_table": the full pingouin ICC table.
    """
    out = {}
    # MAE between methods
    pairs = [
        ("MAE_spaCy_vs_PraatLike",  "syll_spacy", "syll_praat_like"),
        ("MAE_spaCy_vs_PraatOrig",  "syll_spacy", "syll_praat_original"),
        ("MAE_spaCy_vs_Pyphen",     "syll_spacy", "syll_pyphen"),
        ("MAE_spaCy_vs_NLTK",       "syll_spacy", "syll_nltk"),
        ("MAE_NLTK_vs_PraatOrig",   "syll_nltk",  "syll_praat_original"),
        ("MAE_Pyphen_vs_PraatOrig", "syll_pyphen","syll_praat_original"),
        ("MAE_PraatLike_vs_PraatOrig","syll_praat_like","syll_praat_original"),
    ]
    for name, a, b in pairs:
        out[name] = float(np.nanmean(np.abs(df[a] - df[b])))

    # ICC on 5 "raters"
    rater_cols = ["syll_spacy", "syll_pyphen", "syll_nltk", "syll_praat_like", "syll_praat_original"]
    # ICC needs every target rated by every rater. The audio methods return NaN
    # for missing files or failed analyses, so incomplete recordings are dropped
    # in WIDE form; dropping NaN rows after melting would leave those targets
    # with fewer ratings, which pingouin rejects as unbalanced data.
    wide = df[["path"] + rater_cols].dropna()
    long = wide.melt(id_vars="path", var_name="rater", value_name="score")
    icc_table = pg.intraclass_corr(data=long, targets="path", raters="rater", ratings="score")
    # ICC2 ~ two-way random, absolute agreement, single rater
    icc2 = icc_table.loc[icc_table["Type"] == "ICC2", "ICC"].values[0]
    out["ICC2_(absolute_agreement)"] = float(icc2)
    out["ICC_table"] = icc_table
    return out

In [None]:
df = pd.read_csv("meta_toronto.csv")
df.dropna(inplace=True)
df.head()

In [None]:
results = run_all(df, Paths(audio_root=AUDIO_ROOT, praat_script=PRAAT_SCRIPT_PATH))

In [None]:
metrics = evaluate(results)

In [None]:
print({k: v for k, v in metrics.items() if k != "ICC_table"})

### Find errors

In [None]:
import numpy as np
import pandas as pd

# Work on a copy so `results` keeps only the columns produced by run_all().
df = results.copy()

# 1) Basic errors: absolute difference of each text method vs. the Praat script count
df["err_abs_spaCy_praat"] = (df["syll_spacy"] - df["syll_praat_original"]).abs()
df["err_abs_pyphen_praat"] = (df["syll_pyphen"] - df["syll_praat_original"]).abs()
df["err_abs_nltk_praat"] = (df["syll_nltk"] - df["syll_praat_original"]).abs()

# 2) Normalization: relative error; the denominator is clipped to >= 1 to avoid dividing by zero
df["err_rel_spaCy_praat"] = df["err_abs_spaCy_praat"] / df["syll_praat_original"].clip(lower=1)

# 3) SPM for audio
# NOTE: unlike compute_spm(), this division has no guard for a zero or
# negative duration.
df["spm_praat"] = df["syll_praat_original"] / (df["audio_dur_sec"] / 60.0)

# 4) ΔSPM: absolute difference between text-based and audio-based syllables per minute
df["d_spm"] = (df["spm_spacy"] - df["spm_praat"]).abs()

# 5) Tail by absolute error (top 100)
tail_abs = df.sort_values("err_abs_spaCy_praat", ascending=False).head(100)

# 6) Tail by ΔSPM (top 100)
tail_spm = df.sort_values("d_spm", ascending=False).head(100)

# Columns exported for manual inspection of the worst cases.
cols = ["path", "transcript", "audio_dur_sec",
        "syll_spacy", "syll_praat_original", "err_abs_spaCy_praat", "spm_spacy", "spm_praat", "d_spm"]
tail_abs[cols].to_csv("tail_abs_top100.csv", index=False)
tail_spm[cols].to_csv("tail_spm_top100.csv", index=False)

In [None]:
tail_abs.transcript.iloc[-1]

In [None]:
tail_abs

In [None]:
import numpy as np
import pandas as pd
from pingouin import intraclass_corr
from scipy.stats import pearsonr, spearmanr

df = results.copy()

# ---- utils ----
def pair_report(a, b, name_a, name_b):
    """
    Agreement statistics between columns `a` and `b` of the global `df`.

    Rows where either column is NaN are excluded. Returns a dict with the
    pair name, the number of rows used, MAE, bias (mean of a - b), Pearson
    and Spearman correlations, and the Bland–Altman mean difference with its
    95% limits of agreement (mean ± 1.96 · sample standard deviation).
    "Bias" and "BA_mean_diff" are the same quantity; both keys are kept.
    """
    s1, s2 = df[a], df[b]
    mask = ~(s1.isna() | s2.isna())
    s1, s2 = s1[mask], s2[mask]

    # The per-row difference drives both the error and the Bland–Altman terms.
    d = s1 - s2
    mae = float(np.mean(np.abs(d)))
    md = float(np.mean(d))
    r, _ = pearsonr(s1, s2)
    rho, _ = spearmanr(s1, s2)

    # Bland–Altman limits of agreement
    sd = float(np.std(d, ddof=1))
    loa = (md - 1.96*sd, md + 1.96*sd)
    return {
        "pair": f"{name_a} vs {name_b}",
        "n": int(mask.sum()),
        "MAE": mae, "Bias": md,
        "Pearson_r": float(r), "Spearman_rho": float(rho),
        "BA_mean_diff": md, "BA_LOA_low": loa[0], "BA_LOA_high": loa[1],
    }

# ---- Audio inside ----
audio_rep = pair_report("syll_praat_like", "syll_praat_original", "PraatLike", "PraatOriginal")

# ---- Text-inside (all pairs) ----
text_pairs = [
    ("syll_spacy","syll_pyphen","spaCy","Pyphen"),
    ("syll_spacy","syll_nltk","spaCy","NLTK"),
    ("syll_pyphen","syll_nltk","Pyphen","NLTK"),
]
text_rep = [pair_report(*p) for p in text_pairs]

# ---- (optional) ICC within classes ----
def icc_for(cols, label):
    """
    ICC2 across the rater columns `cols` of the global `df`, tagged with `label`.

    Returns {"group": label, "ICC2": value}.
    """
    # Drop incomplete recordings in WIDE form so every remaining target has a
    # rating from every rater; dropping NaN rows after melting would leave
    # unbalanced targets, which pingouin rejects.
    wide = df[["path"] + cols].dropna()
    long = wide.melt(id_vars="path", var_name="rater", value_name="score")
    icct = intraclass_corr(long, targets="path", raters="rater", ratings="score")
    icc2 = float(icct.loc[icct["Type"]=="ICC2","ICC"].iloc[0])
    return {"group": label, "ICC2": icc2}

icc_audio = icc_for(["syll_praat_like","syll_praat_original"], "audio")
icc_text  = icc_for(["syll_spacy","syll_pyphen","syll_nltk"], "text")

print("AUDIO inside-class:", audio_rep, icc_audio)
print("TEXT inside-class:", *text_rep, icc_text, sep="\n")

# Translate DAIC-WOZ transcripts (English → Ukrainian)

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Checkpoint used for English -> Ukrainian translation.
# NOTE: this cell rebinds MODEL_NAME, tokenizer and model defined by the
# earlier uk -> en cell.
MODEL_NAME = "Yehor/kulyk-en-uk"
# Prefer the GPU when one is visible, otherwise run on CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
# bfloat16 is requested regardless of the selected device.
torch_dtype = torch.bfloat16

# No specific revision is pinned; None is passed through to from_pretrained.
REVISION = None

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, revision=REVISION)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map=device,
    torch_dtype=torch_dtype,
    revision=REVISION,
)
# Switch to inference mode (disables dropout and similar training-only layers).
model.eval()

def translate_like_space(text: str) -> str:
    """
    Translate `text` to Ukrainian with the module-level `model` and `tokenizer`.

    The text is wrapped in a single-turn chat prompt and decoded greedily
    (at most 2048 new tokens, repetition penalty 1.05). Only the newly
    generated tokens are decoded and returned, stripped of surrounding
    whitespace.

    Non-string values (for example NaN or None cells when the function is
    applied over a DataFrame column) and blank strings return "" without
    calling the model; previously a NaN cell raised TypeError on the string
    concatenation below.
    """
    if not isinstance(text, str) or not text.strip():
        return ""

    prompt = "Translate the text to Ukrainian:\n" + text
    input_ids = tokenizer.apply_chat_template(
        [{"role": "user", "content": prompt}],
        add_generation_prompt=True,
        return_tensors="pt",
        tokenize=True,
    ).to(model.device)

    with torch.inference_mode():
        output = model.generate(
            input_ids,
            max_new_tokens=2048,
            do_sample=False,              # greedy
            repetition_penalty=1.05,
        )

    # Drop the prompt tokens; keep only what the model generated.
    gen = output[:, input_ids.shape[1]:]
    return tokenizer.batch_decode(gen, skip_special_tokens=True)[0].strip()


In [None]:
import pandas as pd

df = pd.read_csv("/content/dcapswoz_all_transcripts.csv")

In [None]:
from tqdm.auto import tqdm
tqdm.pandas()

df['Text_ukr'] = df['Text'].progress_apply(translate_like_space)

In [None]:
df.to_csv("dcapwoz_all_plus_ukr.csv")

# Calculating perplexity, tangentiality, coherence

In [1]:
import pandas as pd
df_woz = pd.read_csv("/Users/pelmeshek1706/Desktop/projects/airest_notebooks/data/dcapwoz_all_plus_ukr.csv")
df_woz

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,Start_Time,End_Time,Text,Confidence,File_number,Text_ukr
0,0,0,14.3,15.1,so I'm going to,0.934210,300,Тому я збираюся
1,1,1,20.3,21.1,interview in Spanish,0.608470,300,Інтерв'ю іспанською мовою
2,2,2,23.9,24.3,okay,0.690606,300,добре
3,3,3,62.1,62.7,good,0.951897,300,хороший
4,4,4,68.8,69.8,Atlanta Georgia,0.987629,300,Атланта Джорджія
...,...,...,...,...,...,...,...,...
26078,26078,26078,1171.6,1185.3,what I'm most proud of I can say that that th...,0.927357,718,"Що я найбільше пишаюся, можу сказати, що це ро..."
26079,26079,26079,1186.6,1238.6,I like when my kids now my oldest kid you kno...,0.969410,718,"Мені подобається, коли мої діти зараз найстарш..."
26080,26080,26080,1248.2,1248.9,you're welcome,0.982563,718,Вітаємо
26081,26081,26081,1252.8,1253.3,goodbye,0.875275,718,Прощання


In [None]:
import os
# setdefault() leaves a value that is already present in the environment untouched.
os.environ.setdefault(
    "PYTORCH_CUDA_ALLOC_CONF",
    "expandable_segments:True,max_split_size_mb:128"
)  # enables expandable segments and limits large block fragmentation
# See the official PyTorch CUDA notes on expandable_segments. Must be set **BEFORE** importing torch.
# https://docs.pytorch.org/docs/stable/notes/cuda.html

# ============================================================
# DiscourseMetrics — version with CPU embedding and memory-friendly SDPA
# ============================================================

import re, string, math
from typing import List, Tuple, Optional, Dict
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForCausalLM

class DiscourseMetrics:
    """Perplexity, tangentiality and word-coherence metrics for short texts.

    Two models are loaded on construction:

    * a ``SentenceTransformer`` embedder (``emb_model_id``) placed on ``emb_device``;
    * a causal language model (``ppl_model_id``) placed on ``device`` and used
      for perplexity.

    Args:
        language: Stored on the instance; not consulted by the methods of this class.
        device: Device for the causal LM. ``None`` picks "cuda" when available, else "cpu".
        emb_device: Device for the sentence embedder.
        max_len_sent: Stored on the instance; not consulted by the methods of this class.
        emb_model_id: Model id passed to ``SentenceTransformer``.
        ppl_model_id: Model id passed to ``AutoTokenizer`` / ``AutoModelForCausalLM``.
        ppl_max_tokens: Upper bound on the number of tokens scored per text.
        dtype_auto: When true, use bfloat16 on CUDA devices and float32 elsewhere.
        use_tf32: When true (and on CUDA), enable TF32 matmuls.
        use_flash_attn2: Request FlashAttention-2 (CUDA only); falls back to SDPA.
        compile_model: Try to wrap the LM with ``torch.compile``.
        emb_batch_size: Batch size for ``_encode_texts``.
        ppl_windows_bs: Number of context windows scored per forward pass in
            ``_ppl_k_batched``.
    """

    def __init__(
        self,
        language: str = "en",
        device: Optional[str] = None,      # device for LM (usually "cuda")
        emb_device: str = "cpu",           # embedder ONLY on CPU, as requested
        max_len_sent: int = 256,
        emb_model_id: str = "google/embeddinggemma-300m",
        ppl_model_id: str = "google/gemma-3-270m",
        ppl_max_tokens: int = 2048,
        dtype_auto: bool = True,
        use_tf32: bool = True,
        use_flash_attn2: bool = False,     # defaults to SDPA; enable FA2 if available
        compile_model: bool = False,
        emb_batch_size: int = 128,         # defaults lowered to avoid memory spikes
        ppl_windows_bs: int = 256          # defaults lowered to avoid memory spikes
    ):
        self.language = language
        self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")
        self.emb_device = emb_device
        self.max_len_sent = int(max_len_sent)
        self.ppl_max_tokens = int(ppl_max_tokens)
        self.emb_batch_size = int(emb_batch_size)
        self.ppl_windows_bs = int(ppl_windows_bs)

        # Treat "cuda", "cuda:0", ... alike; a plain equality test would silently
        # skip the CUDA-only settings for indexed device strings.
        on_cuda = str(self.device).startswith("cuda")

        # ---- matrix multiply accelerators (Ampere/Ada/Hopper) ----
        if on_cuda and use_tf32:
            try:
                torch.backends.cuda.matmul.allow_tf32 = True
                torch.set_float32_matmul_precision("high")
            except Exception:
                # Best effort: TF32 is an optimisation, not a requirement.
                pass

        # === EMBEDDINGS ===
        # SentenceTransformer accepts an explicit device string ("cpu", "cuda:0", ...).
        self.emb = SentenceTransformer(emb_model_id, device=self.emb_device)
        self.emb_dim = self.emb.get_sentence_embedding_dimension()

        # === Tokenizer ===
        self.tok = AutoTokenizer.from_pretrained(ppl_model_id)
        if self.tok.pad_token is None and self.tok.eos_token is not None:
            self.tok.pad_token = self.tok.eos_token
        self.tok.padding_side = "left"  # the windowed batches below are left-padded too

        # === dtype ===
        dtype = torch.bfloat16 if (dtype_auto and on_cuda) else torch.float32
        self._dtype = dtype

        # === Load causal LM with SDPA (or FA2, if available) ===
        self.causal = self._load_causal_resilient(
            ppl_model_id,
            dtype=dtype,
            want_flash_attn2=(on_cuda and use_flash_attn2)
        ).to(self.device).eval()
        self.causal.config.use_cache = False
        self.model_max_length = getattr(self.causal.config, "max_position_embeddings", 32768)

        if compile_model:
            try:
                self.causal = torch.compile(self.causal, mode="reduce-overhead", fullgraph=False)
            except Exception:
                # Best effort: keep the eager model if compilation is unavailable.
                pass

        # legacy API
        self.measures: Dict[str, object] = {
            "english_langs": {"en", "english", "EN", "ENG"},
            "supported_langs_bert": {
                "en", "english", "ru", "uk", "ua", "de", "fr", "es", "it", "pt",
                "nl", "pl", "sv", "tr", "ar", "zh", "ja", "ko"
            },
            "words_texts": "words_texts",
        }

    # --------- resilient loaders (dtype + SDPA/FA2) ---------
    @staticmethod
    def _from_pretrained_with_dtype(model_id: str, dtype, **kwargs):
        """Load the causal LM, trying the ``dtype=`` keyword first and
        ``torch_dtype=`` when the first call raises ``TypeError``."""
        try:
            return AutoModelForCausalLM.from_pretrained(model_id, dtype=dtype, **kwargs)
        except TypeError:
            return AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=dtype, **kwargs)

    def _load_causal_resilient(self, model_id: str, dtype, want_flash_attn2: bool):
        """Load the causal LM with the requested attention backend.

        Falls back to SDPA when FlashAttention-2 cannot be used, and to the
        library default when ``attn_implementation`` is not accepted at all.
        Any other loading error is re-raised.
        """
        attn = "flash_attention_2" if want_flash_attn2 else "sdpa"
        try:
            return self._from_pretrained_with_dtype(model_id, dtype=dtype, attn_implementation=attn)
        except (ImportError, OSError, RuntimeError) as e:
            if "flash_attn" in str(e) or "flash_attn_2_cuda" in str(e) or "FlashAttention" in str(e):
                # fallback to SDPA
                return self._from_pretrained_with_dtype(model_id, dtype=dtype, attn_implementation="sdpa")
            raise
        except TypeError as e:
            if "attn_implementation" in str(e):
                return self._from_pretrained_with_dtype(model_id, dtype=dtype)
            raise

    # ---------------------- utils ----------------------
    @staticmethod
    def _clean_text(text: str) -> str:
        """Strip ASCII punctuation and collapse runs of whitespace."""
        clean = text.translate(str.maketrans('', '', string.punctuation))
        clean = re.sub(r'\s+', ' ', clean).strip()
        return clean

    def _encode_texts(self, texts: List[str]) -> np.ndarray:
        """Return L2-normalised float32 embeddings, shape ``(len(texts), emb_dim)``."""
        if not texts:
            return np.zeros((0, self.emb_dim), dtype=np.float32)
        vecs = self.emb.encode(
            texts,
            batch_size=self.emb_batch_size,
            convert_to_numpy=True,
            normalize_embeddings=True,
            show_progress_bar=False,
        )
        return vecs.astype(np.float32)

    def _tokenize(self, text: str):
        """Tokenise one text; returns ``(input_ids, attention_mask)`` on ``self.device``."""
        enc = self.tok(text, return_tensors="pt", add_special_tokens=True)
        ids = enc["input_ids"].to(self.device)
        attn = enc["attention_mask"].to(self.device)
        return ids, attn

    def _truncate(self, ids: torch.Tensor, attn: torch.Tensor):
        """Clip both tensors to ``min(model_max_length, ppl_max_tokens)`` tokens."""
        max_len = min(self.model_max_length, self.ppl_max_tokens)
        if ids.size(1) > max_len:
            ids = ids[:, :max_len]
            attn = attn[:, :max_len]
        return ids, attn

    # ---------------------- PPL (one-pass) ----------------------
    @torch.inference_mode()
    def _perplexity_full(self, text: str) -> float:
        """Perplexity of ``text`` from a single forward pass over the whole sequence.

        Returns NaN when fewer than two tokens are available.
        """
        ids, attn = self._tokenize(text)
        ids, attn = self._truncate(ids, attn)
        if ids.size(1) < 2:
            return float("nan")
        logits = self.causal(input_ids=ids, attention_mask=attn).logits
        # Do the softmax and the reduction in float32: with a bf16 model the
        # log-sum-exp and the sum over tokens lose precision otherwise.
        shift_logits = logits[:, :-1, :].float()
        shift_labels = ids[:, 1:]
        shift_mask = attn[:, 1:].to(dtype=shift_logits.dtype)
        logp = F.log_softmax(shift_logits, dim=-1)
        nll = -logp.gather(dim=-1, index=shift_labels.unsqueeze(-1)).squeeze(-1)
        mean_nll = (nll * shift_mask).sum() / shift_mask.sum().clamp_min(1.0)
        return float(torch.exp(mean_nll).item())

    # ---------------------- PPL (windowed, batched windows) ----------------------
    @torch.inference_mode()
    def _batch_next_logprobs(self, batch_ids: torch.Tensor, lens: torch.Tensor, next_ids: torch.Tensor) -> torch.Tensor:
        """Log-probability of ``next_ids[b]`` given the context in row ``b``.

        ``batch_ids`` is LEFT-padded: row ``b`` holds its ``lens[b]`` real tokens
        in the last ``lens[b]`` columns.
        """
        n_rows, width = batch_ids.shape
        # Build the mask from the known lengths rather than from the pad id, so a
        # pad id that also occurs inside real text cannot mask real tokens.
        positions = torch.arange(width, device=batch_ids.device).unsqueeze(0)
        attn_mask = (positions >= (width - lens).unsqueeze(1)).to(batch_ids.dtype)
        out = self.causal(input_ids=batch_ids, attention_mask=attn_mask)
        # With left padding the last real token of every row sits in the final
        # column; indexing with `lens - 1` would read a pad position instead.
        last_logits = out.logits[:, -1, :].float()
        logp = F.log_softmax(last_logits, dim=-1)
        rows = torch.arange(n_rows, device=batch_ids.device)
        return logp[rows, next_ids]

    @torch.inference_mode()
    def _ppl_k_batched(self, ids: torch.Tensor, k: int) -> float:
        """Perplexity where each token is predicted from at most ``k`` preceding tokens.

        ``ids`` has shape ``(1, T)``; only the first ``ppl_max_tokens`` tokens are
        used. Returns NaN when fewer than two tokens are available.
        """
        ids = ids[:, :min(ids.size(1), self.ppl_max_tokens)]
        T = ids.size(1)
        if T < 2:
            return float("nan")
        k_eff = min(k, self.model_max_length - 1)
        pad = self.tok.pad_token_id
        if pad is None:
            # Padded positions are masked out, so any valid id works as filler.
            pad = 0
        dev = ids.device
        token_row = ids[0]
        chunks = []
        for start_t in range(1, T, self.ppl_windows_bs):
            end_t = min(T, start_t + self.ppl_windows_bs)
            lens = [min(k_eff, t) for t in range(start_t, end_t)]
            maxL = max(lens)
            batch = torch.full((len(lens), maxL), pad, dtype=torch.long, device=dev)
            for b, (t, L) in enumerate(zip(range(start_t, end_t), lens)):
                batch[b, maxL - L:] = token_row[t - L:t]
            lens_t = torch.tensor(lens, device=dev)
            # Targets are a plain slice of the input; no per-token host sync needed.
            next_t = token_row[start_t:end_t]
            chunks.append(self._batch_next_logprobs(batch, lens_t, next_t))
        logps = torch.cat(chunks)
        return float(math.exp(-logps.mean().item()))

    def calculate_perplexity(self, text: str, windows: Optional[List[int]] = None, mode: str = "full") -> Tuple[float, float, float, float]:
        """Return perplexities for context sizes ``(256, 2, 5, 7)``.

        In ``mode="full"`` the 256 slot is the one-pass full-sequence perplexity
        and the other requested windows use ``_ppl_k_batched``; in any other mode
        every requested window uses ``_ppl_k_batched``. Windows that were not
        requested come back as NaN.
        """
        if windows is None:
            windows = [256, 2, 5, 7]
        ids, attn = self._tokenize(text)
        ids, attn = self._truncate(ids, attn)

        results = {}
        if mode == "full":
            results[256] = self._perplexity_full(text)
            for k in [w for w in windows if w != 256]:
                results[k] = self._ppl_k_batched(ids, k)
        else:
            for k in windows:
                results[k] = self._ppl_k_batched(ids, k)

        return (results.get(256, float("nan")),
                results.get(2, float("nan")),
                results.get(5, float("nan")),
                results.get(7, float("nan")))

    # ---------------------- Tangentiality (batched) ----------------------
    @staticmethod
    def _split_phrases(text: str) -> List[str]:
        """Split on whitespace that follows ``. ! ?`` or a newline, and on newlines."""
        parts = re.split(r'(?<=[\.\!\?\n])\s+|\n+', text)
        return [p.strip() for p in parts if p and p.strip()]

    @staticmethod
    def _split_words(text: str) -> List[str]:
        """Split on whitespace, dropping empty tokens."""
        words = re.sub(r'\s+', ' ', text.strip()).split(' ')
        return [w for w in words if w]

    def compute_turn_tangentiality_list(self, texts: List[str]) -> List[float]:
        """One similarity score per text in ``texts``.

        The first entry is the mean similarity between consecutive phrases inside
        ``texts[0]`` (NaN with fewer than two phrases). Entry ``i >= 1`` is the
        similarity between the last phrase of ``texts[i-1]`` and the first phrase
        of ``texts[i]``; it is NaN when either text has no phrase, because a
        similarity against the embedding of an empty string is not meaningful.
        """
        if not texts:
            return []
        s0 = self.compute_tangentiality_metric(texts[0])

        last_phr, first_phr = [], []
        for i in range(1, len(texts)):
            prev_phr = self._split_phrases(texts[i-1])
            curr_phr = self._split_phrases(texts[i])
            last_phr.append(prev_phr[-1] if prev_phr else "")
            first_phr.append(curr_phr[0] if curr_phr else "")

        pair_embs = self._encode_texts(last_phr + first_phr)
        A, B = pair_embs[:len(last_phr)], pair_embs[len(last_phr):]
        sims = (A * B).sum(axis=1)
        has_both = np.array([bool(p) and bool(q) for p, q in zip(last_phr, first_phr)], dtype=bool)
        sims = np.where(has_both, sims, np.nan)
        return [s0] + sims.tolist()

    # ---------------------- Word coherence ----------------------
    def get_word_embeddings(self, word_list: List[str], *_, **__) -> np.ndarray:
        """Return un-normalised float32 embeddings for each word (extra args ignored)."""
        if len(word_list) == 0:
            return np.zeros((0, self.emb_dim), dtype=np.float32)
        vecs = self.emb.encode(
            word_list,
            batch_size=128,              # conservative
            convert_to_numpy=True,
            normalize_embeddings=False,
            show_progress_bar=False,
        )
        return vecs.astype(np.float32)

    def get_word_coherence_utterance(self, row: dict, measures: dict = None, **kwargs):
        """Word-level coherence series for the word list stored in ``row``.

        Returns ``(word_coh, word_coh_5, word_coh_10, variability)``, each aligned
        with the word list and NaN-padded where a window does not fit:

        * ``word_coh[j]``: cosine similarity of word ``j`` with word ``j-1``;
        * ``word_coh_5[j]`` / ``word_coh_10[j]``: mean similarity of word ``j``
          with the words at ``j-2..j+2`` / ``j-5..j+5`` (the word itself included);
        * ``variability[k][j]`` for ``k`` in 2..10: similarity of word ``j`` with
          word ``j+k``.
        """
        measures = measures or {"words_texts": "words_texts"}
        words_texts = row.get(measures['words_texts'], [])
        n_words = len(words_texts)
        if n_words == 0:
            return [], [], [], {k: [] for k in range(2, 11)}

        word_embeddings = self.get_word_embeddings(words_texts)
        norms = np.linalg.norm(word_embeddings, axis=1, keepdims=True).clip(min=1e-9)
        we = word_embeddings / norms
        sim = we @ we.T

        if n_words > 1:
            word_coh = [np.nan] + [float(sim[j, j-1]) for j in range(1, n_words)]
        else:
            word_coh = [np.nan]*n_words

        if n_words > 5:
            arr = [float(np.mean(sim[j-2:j+3, j])) for j in range(2, n_words-2)]
            word_coh_5 = [np.nan]*2 + arr + [np.nan]*2
        else:
            word_coh_5 = [np.nan]*n_words

        if n_words > 10:
            arr = [float(np.mean(sim[j-5:j+6, j])) for j in range(5, n_words-5)]
            word_coh_10 = [np.nan]*5 + arr + [np.nan]*5
        else:
            word_coh_10 = [np.nan]*n_words

        variability = {}
        for k in range(2, 11):
            if n_words > k:
                variability[k] = [float(sim[j, j+k]) for j in range(n_words-k)] + [np.nan]*k
            else:
                variability[k] = [np.nan]*n_words

        return word_coh, word_coh_5, word_coh_10, variability

    # ---------------------- Compatible wrappers ----------------------
    def compute_perplexity_metric(self, text: str) -> float:
        """Full-sequence perplexity of ``text``."""
        return float(self._perplexity_full(text))

    def compute_tangentiality_metric(self, text: str) -> float:
        """Mean similarity between consecutive phrases of ``text`` (NaN if < 2 phrases)."""
        phrases = self._split_phrases(text)
        if len(phrases) >= 2:
            pe = self._encode_texts(phrases)
            return float(np.mean([np.dot(pe[j-1], pe[j]) for j in range(1, len(pe))]))
        return float("nan")

    def compute_coherence_metric(self, text: str) -> float:
        """Mean adjacent-word similarity of ``text`` (NaN with fewer than two words)."""
        words = self._split_words(text)
        row_like = {self.measures["words_texts"]: words}
        word_coh, _, _, _ = self.get_word_coherence_utterance(row=row_like, measures=self.measures)
        values = np.asarray(word_coh, dtype=float)
        # Guard the all-NaN case explicitly so np.nanmean does not emit a
        # "Mean of empty slice" warning for single-word texts.
        if values.size == 0 or np.isnan(values).all():
            return float("nan")
        return float(np.nanmean(values))

    def compute_dataset(self, texts: List[str]) -> pd.DataFrame:
        """Score every text; returns columns ``text, perplexity, tangeniality, coherence``.

        A metric that raises for a given text is recorded as NaN so that the
        numeric columns stay float-typed and NaN-aware aggregations keep working.
        """
        # Local import: the import block next to this class does not bring tqdm
        # into scope, so the method must not depend on a later notebook cell.
        from tqdm import tqdm

        tangen_list = self.compute_turn_tangentiality_list(texts)
        rows = []
        for t, tang in tqdm(zip(texts, tangen_list), total=len(texts)):
            try:
                perplexity = self.compute_perplexity_metric(t)
            except Exception:
                perplexity = float("nan")
            try:
                coherence = self.compute_coherence_metric(t)
            except Exception:
                coherence = float("nan")
            rows.append({
                "text": t,
                "perplexity": perplexity,
                "tangeniality": tang,
                "coherence": coherence,
            })
        return pd.DataFrame(rows, columns=["text", "perplexity", "tangeniality", "coherence"])

In [None]:
# Pick the LM device: Apple-silicon MPS first (the original target of this
# notebook), then CUDA, then CPU — so the cell also runs on machines without MPS.
if torch.backends.mps.is_available():
    lm_device = "mps"
elif torch.cuda.is_available():
    lm_device = "cuda"
else:
    lm_device = "cpu"

dm = DiscourseMetrics(
    language="uk",
    emb_model_id="google/embeddinggemma-300m",
    ppl_model_id="google/gemma-3-270m",
    ppl_max_tokens=2048,
    device=lm_device,
    emb_device="cpu",
    use_flash_attn2=False
)

In [None]:
import os
import gc
import numpy as np
import pandas as pd
from typing import Optional
from tqdm import tqdm
import torch

def build_file_level_metrics(
    df: pd.DataFrame,
    meter: "DiscourseMetrics",
    file_col: str = "File_number",
    text_col: Optional[str] = None,
    checkpoint_every: int = 1,
    # Directory under which the per-checkpoint subfolders "5", "10", ... are created
    checkpoint_base_dir: str = "parts",
    # Checkpoint file base name (without extension)
    checkpoint_name: str = "metrics_partial",
) -> pd.DataFrame:
    """
    Aggregate utterance-level metrics into one row per file.

    For every distinct value of `file_col` (in sorted order) the rows of that
    file are passed, in index order, to `meter.compute_dataset`, and the NaN-aware
    means of its `perplexity`, `tangeniality` and `coherence` columns are stored.

    Every `checkpoint_every` groups:
      - saves an intermediate result to <checkpoint_base_dir>/<iters_done>/
      - clears memory (gc + torch.{mps|cuda}.empty_cache())

    Args:
        df: Input frame with one utterance per row.
        meter: Object exposing `compute_dataset(texts) -> DataFrame`.
        file_col: Column that identifies the file / session.
        text_col: Column with the text; auto-detected ('Text' or 'text') when None.
        checkpoint_every: Checkpoint period in groups; 0/None disables periodic
            checkpoints (a final one is still written).
        checkpoint_base_dir: Root directory for checkpoints.
        checkpoint_name: Base name of the checkpoint files.

    Returns:
        DataFrame with columns `file_number`, `mean_perplexity`,
        `mean_tangeniality`, `mean_coherence`, sorted by `file_number`.

    Raises:
        ValueError: if `file_col` is missing or no text column can be found.
    """
    if file_col not in df.columns:
        raise ValueError(f"Column `{file_col}` not found in df")

    # Auto-detect text column
    if text_col is None:
        if "Text" in df.columns:
            text_col = "Text"
        elif "text" in df.columns:
            text_col = "text"
        else:
            raise ValueError("Text column not found. Provide `text_col` or add 'Text'/'text' to df.")

    out_columns = ["file_number", "mean_perplexity", "mean_tangeniality", "mean_coherence"]

    # Fix the processing order by the sorted unique file ids
    unique_files = sorted(df[file_col].dropna().unique().tolist())

    results = []
    iters_done = 0
    last_saved_at = 0  # value of iters_done at the most recent checkpoint

    def _nan_mean(column) -> float:
        # Coerce first: a column holding only None is object-typed and would
        # make np.nanmean raise; an all-NaN column yields NaN without a warning.
        values = pd.to_numeric(column, errors="coerce").to_numpy(dtype=float)
        if values.size == 0 or np.isnan(values).all():
            return float("nan")
        return float(np.nanmean(values))

    # Helper: save checkpoint
    def _save_checkpoint():
        nonlocal last_saved_at
        if iters_done == 0 or iters_done == last_saved_at:
            # Nothing processed yet, or this state is already on disk.
            return
        out_df_partial = (
            pd.DataFrame(results, columns=out_columns)
            .sort_values("file_number")
            .reset_index(drop=True)
        )

        ckpt_dir = os.path.join(checkpoint_base_dir, str(iters_done))
        os.makedirs(ckpt_dir, exist_ok=True)
        # Save as both parquet (fast & type-safe) and CSV (just in case)
        pq_path = os.path.join(ckpt_dir, f"{checkpoint_name}.parquet")
        csv_path = os.path.join(ckpt_dir, f"{checkpoint_name}.csv")
        try:
            out_df_partial.to_parquet(pq_path, index=False)
        except Exception:
            # Deliberately best effort (e.g. pyarrow/fastparquet missing):
            # the CSV written below is the guaranteed copy.
            pass
        out_df_partial.to_csv(csv_path, index=False)
        last_saved_at = iters_done

    def _best_effort(fn):
        # Cache cleanup must never abort the computation.
        try:
            fn()
        except Exception:
            pass

    # Helper: free Python garbage and the accelerator caches
    def _flush_mem():
        _best_effort(gc.collect)
        mps_backend = getattr(torch.backends, "mps", None)
        if mps_backend is not None and mps_backend.is_available():
            _best_effort(lambda: torch.mps.synchronize())
            _best_effort(lambda: torch.mps.empty_cache())
        if torch.cuda.is_available():
            _best_effort(lambda: torch.cuda.synchronize())
            _best_effort(lambda: torch.cuda.empty_cache())

    # Main loop
    for file_number in tqdm(unique_files, total=len(unique_files)):
        g = df[df[file_col] == file_number]
        g_sorted = g.sort_index()
        texts = g_sorted[text_col].astype(str).tolist()

        # Compute metrics at the utterance level
        per_text = meter.compute_dataset(texts)

        results.append({
            "file_number": int(file_number),
            "mean_perplexity": _nan_mean(per_text["perplexity"]),
            "mean_tangeniality": _nan_mean(per_text["tangeniality"]),
            "mean_coherence": _nan_mean(per_text["coherence"]),
        })
        iters_done += 1

        # Every N iterations — checkpoint + memory cleanup
        if checkpoint_every and (iters_done % checkpoint_every == 0):
            _save_checkpoint()
            # Drop the references here, in the scope that owns them; deleting
            # them inside a helper would only unbind the helper's own locals.
            del per_text, texts, g_sorted, g
            _flush_mem()

    # Final DataFrame (explicit columns keep sort_values working on empty input)
    out_df = (
        pd.DataFrame(results, columns=out_columns)
        .sort_values("file_number")
        .reset_index(drop=True)
    )

    # Final checkpoint + memory cleanup, to clear everything after heavy work
    _save_checkpoint()
    _flush_mem()

    return out_df

In [None]:
# English run. It gets its own checkpoint directory so that its partial results
# are not overwritten by another run using the default "parts" location.
file_level_df = build_file_level_metrics(
    df_woz,
    dm,
    file_col="File_number",
    text_col="Text",
    checkpoint_base_dir="parts/eng",
)
file_level_df.to_csv("dcwoz_eng_new_gemma.csv", index=False)

In [None]:
# Ukrainian run. It gets its own checkpoint directory so that it does not
# overwrite the checkpoints of another run using the default "parts" location.
file_level_df = build_file_level_metrics(
    df_woz,
    dm,
    file_col="File_number",
    text_col="Text_ukr",
    checkpoint_base_dir="parts/ukr",
)
file_level_df.to_csv("dcwoz_ukr_new_gemma.csv", index=False)

# Run tests: EN vs. UK agreement of the session-level metrics

In [None]:
# Load the two CSVs, inspect, and run EN-vs-UK evaluation at the session level.
import pandas as pd
import numpy as np
from scipy.stats import pearsonr, spearmanr, ttest_rel, wilcoxon

# Session-level metric tables for the English and the Ukrainian transcripts.
ENG_PATH, UKR_PATH = "dcwoz_eng_new_gemma.csv", "dcwoz_ukr_new_gemma.csv"

df_en, df_uk = (pd.read_csv(csv_path) for csv_path in (ENG_PATH, UKR_PATH))

# --- Helpers ---
def _dropna_pair(a: np.ndarray, b: np.ndarray):
    mask = (~np.isnan(a)) & (~np.isnan(b))
    return a[mask], b[mask]

def safe_pearson(a, b):
    """Pearson r over the NaN-free pairs of `a` and `b`.

    Returns `(r, n)` where `n` is the number of usable pairs; `r` is NaN when
    fewer than three pairs remain.
    """
    pair = _dropna_pair(np.asarray(a, float), np.asarray(b, float))
    n_valid = len(pair[0])
    if n_valid >= 3:
        return float(pearsonr(*pair)[0]), n_valid
    return np.nan, n_valid

def safe_spearman(a, b):
    """Spearman rho over the NaN-free pairs of `a` and `b`.

    Returns `(rho, n)` where `n` is the number of usable pairs; `rho` is NaN
    when fewer than three pairs remain.
    """
    pair = _dropna_pair(np.asarray(a, float), np.asarray(b, float))
    n_valid = len(pair[0])
    if n_valid >= 3:
        return float(spearmanr(*pair)[0]), n_valid
    return np.nan, n_valid

def mae_rmse(a, b):
    """Mean absolute error and root-mean-square error over the NaN-free pairs.

    Returns `(mae, rmse, n)`; both errors are NaN when no pair is usable.
    """
    x, y = _dropna_pair(np.asarray(a, float), np.asarray(b, float))
    n_valid = len(x)
    if n_valid == 0:
        return np.nan, np.nan, 0
    diff = x - y
    return float(np.mean(np.abs(diff))), float(np.sqrt(np.mean(diff ** 2))), n_valid

def rel_mean_diff_percent(a, b):
    """Mean per-pair relative difference of `b` versus `a`, in percent.

    For every pair the value `(b - a) / |a| * 100` is computed and the results
    are averaged. Pairs are skipped when either value is NaN (consistent with
    the other helpers, which all drop NaN pairs) or when `|a| <= 1e-9`
    (to avoid dividing by ~0). Returns NaN when no pair is usable.
    """
    a = np.asarray(a, float)
    b = np.asarray(b, float)
    usable = (~np.isnan(a)) & (~np.isnan(b)) & (np.abs(a) > 1e-9)
    if not usable.any():
        return np.nan
    return float(np.mean((b[usable] - a[usable]) / np.abs(a[usable]) * 100.0))

def paired_tests(a, b):
    """Paired t-test and Wilcoxon signed-rank p-values over the NaN-free pairs.

    Returns a dict with keys `"t_p"` and `"wilcoxon_p"`. Both are NaN when fewer
    than three pairs remain; a test that raises yields NaN for its own entry.
    When all paired differences are (numerically) zero the Wilcoxon p-value is
    reported as 1.0 without calling the test.
    """
    x, y = _dropna_pair(np.asarray(a, float), np.asarray(b, float))
    p_values = {"t_p": np.nan, "wilcoxon_p": np.nan}
    if len(x) < 3:
        return p_values

    try:
        p_values["t_p"] = float(ttest_rel(x, y, nan_policy="omit").pvalue)
    except Exception:
        p_values["t_p"] = np.nan

    try:
        if np.allclose(x - y, 0):
            p_values["wilcoxon_p"] = 1.0
        else:
            signed_rank = wilcoxon(x, y, zero_method="wilcox", alternative="two-sided")
            p_values["wilcoxon_p"] = float(signed_rank.pvalue)
    except Exception:
        p_values["wilcoxon_p"] = np.nan

    return p_values

# ICC(2,1): two-way random, absolute agreement (McGraw & Wong 1996)
def icc2_1(wide: pd.DataFrame) -> float:
    """Intraclass correlation ICC(2,1) for a two-column (two-rater) table.

    Rows containing NaN are dropped first. Returns NaN unless at least three
    complete rows and exactly two columns remain.
    """
    scores = wide.dropna().to_numpy(float)
    n, k = scores.shape
    if n < 3 or k != 2:
        return np.nan

    grand_mean = scores.mean()
    per_target = scores.mean(axis=1, keepdims=True)   # one mean per row
    per_rater = scores.mean(axis=0, keepdims=True)    # one mean per column

    # Two-way ANOVA sums of squares
    ss_targets = (k * ((per_target - grand_mean) ** 2)).sum()
    ss_raters = (n * ((per_rater - grand_mean) ** 2)).sum()
    ss_residual = ((scores - grand_mean) ** 2).sum() - ss_targets - ss_raters

    # Mean squares
    ms_targets = ss_targets / (n - 1)
    ms_raters = ss_raters / (k - 1)
    ms_residual = ss_residual / ((n - 1) * (k - 1))

    numerator = ms_targets - ms_residual
    denominator = ms_targets + (k - 1) * ms_residual + (k * (ms_raters - ms_residual) / n)
    return float(numerator / denominator)

# --- Harmonize and merge ---
# Expected columns: ['file_number','mean_perplexity','mean_tangeniality','mean_coherence'].
# Column names are lower-cased first, then known aliases are mapped onto them.
df_en.columns = [c.lower() for c in df_en.columns]
df_uk.columns = [c.lower() for c in df_uk.columns]

# Only real aliases are listed; names that already match need no entry.
rename_map = {
    "filenumber": "file_number",
    "file": "file_number",
    "mean_tangentiality": "mean_tangeniality",
}
df_en = df_en.rename(columns=rename_map)
df_uk = df_uk.rename(columns=rename_map)

# Keep only expected cols
keep = ["file_number", "mean_perplexity", "mean_tangeniality", "mean_coherence"]
missing_en = [c for c in keep if c not in df_en.columns]
missing_uk = [c for c in keep if c not in df_uk.columns]

summary = {
    "missing_columns_en": missing_en,
    "missing_columns_uk": missing_uk,
    "n_rows_en": len(df_en),
    "n_rows_uk": len(df_uk),
}

# Fail with a readable message instead of a bare pandas KeyError further down.
if missing_en or missing_uk:
    raise KeyError(
        f"Expected metric columns are missing — EN: {missing_en}, UK: {missing_uk}"
    )

# Merge: the inner join keeps only sessions present in both tables.
merged = df_en[keep].merge(df_uk[keep], on="file_number", how="inner", suffixes=("_en", "_uk"))
summary["n_rows_merged"] = len(merged)

# --- Evaluate for each metric on session means ---
rows = []
for metric in ["perplexity", "tangeniality", "coherence"]:
    a = pd.to_numeric(merged[f"mean_{metric}_en"], errors="coerce").to_numpy(float)
    b = pd.to_numeric(merged[f"mean_{metric}_uk"], errors="coerce").to_numpy(float)

    r, n_r = safe_pearson(a, b)
    rho, n_rho = safe_spearman(a, b)
    mae, rmse, n_e = mae_rmse(a, b)

    # ICC on a wide dataframe with two columns
    wide = pd.DataFrame({"EN": a, "UK": b})
    icc = icc2_1(wide)

    bias_pct = rel_mean_diff_percent(a, b)
    tests = paired_tests(a, b)

    rows.append({
        "metric": metric,
        "N": int(min(n_r, n_rho, n_e)),
        "Pearson_r": r,
        "Spearman_rho": rho,
        "MAE": mae,
        "RMSE": rmse,
        "ICC2_1": icc,
        "mean_diff_% (UK_vs_EN)": bias_pct,
        "t_p": tests["t_p"],
        "wilcoxon_p": tests["wilcoxon_p"],
    })

eval_df = pd.DataFrame(rows).sort_values("metric").reset_index(drop=True)
eval_df


Unnamed: 0,metric,N,Pearson_r,Spearman_rho,MAE,RMSE,ICC2_1,mean_diff_% (UK_vs_EN),t_p,wilcoxon_p
0,coherence,275,0.242285,0.202955,0.02097768,0.02118642,0.005617,-2.219679,5.625587e-236,7.488097e-47
1,perplexity,275,0.060095,0.262961,90596920.0,193752500.0,0.058709,398.242392,0.1732284,2.671626e-07
2,tangeniality,275,0.973861,0.972289,0.01273736,0.01548681,0.953386,-1.36896,1.11884e-34,1.5844260000000002e-28
