5/14 (Tue) | Experiment

# Preliminary Analyses of Annotation

## 1. Introduction

This notebook conducts preliminary analyses.
The goal of the current analyses is to fill in the following table.

| Task | WER | N disfluency (Manual / Automatic) | N MCP (Manual / Automatic) | N ECP (Manual / Automatic) |
| - | - | - | - | - |
| Arg_Oly |  |  |  |  |
| Cartoon |  |  |  |  |
| RtSwithoutRAA |  |  |  |  |
| RtSwithRAA |  |  |  |  |
| Monologue |  |  |  |  |
| WoZ_Interview |  |  |  |  |
| ALL |  |  |  |  |

Before starting the analyses, the following code block loads the required packages and defines global variables.

In [1]:
from typing import List, Tuple, Dict, Generator, Optional
from pathlib import Path

import numpy as np
import pandas as pd
from jiwer import wer

from utils.mfr import logit_2_rating

# Root directory of the experiment data; each task has its own sub-directory.
# NOTE(review): machine-specific absolute path — adjust when running elsewhere.
DATA_DIR = Path("/home/matsuura/Development/app/feature_extraction_api/experiment/data")

# Task names grouped by speech type.
MONOLOGUE_TASK = ["Arg_Oly", "Cartoon", "RtSwithoutRAA", "RtSwithRAA"]
DIALOGUE_TASK = ["WoZ_Interview"]

# Filler tokens (lower-case, exact-match). The original literal listed "er"
# twice; a set keeps only one copy, so the duplicate is dropped here.
FILLER = {
    "uh", "ah", "um", "mm", "hmm", "oh", "mm-hmm", "er", "mhm", "uh-huh",
    "erm", "huh", "uhu", "mmhmm", "uhhuh",
}

---

## 2. Define Functions

This section defines functions for the preliminary analyses.
The following code block defines two functions: one generates the CSV file paths of the manual and automatic annotation results, and the other loads them.

In [2]:
def annotation_result_csv_path_generator(
        task: str,
        asr_service: str = "rev",
        rating_filter: Optional[List[int]] = None
) -> Generator[Tuple[Path, Path], None, None]:
    """Yield ``(manual_csv_path, automatic_csv_path)`` pairs for one task.

    Manual annotation files are discovered by globbing ``*_manu.csv`` in
    ``DATA_DIR/<task>/10_SCTK_Inputs``; the automatic path is derived from the
    same file name and the selected ASR service. The automatic path is NOT
    checked for existence here.

    Parameters
    ----------
    task:
        Task name, used as a directory name under ``DATA_DIR``.
    asr_service:
        One of ``"rev"``, ``"whisper"`` or ``"google"``.
    rating_filter:
        When given, only files whose name starts with a uid whose rating
        (as returned by ``logit_2_rating``) is in this list are yielded.
        ``None`` disables the filtering.

    Raises
    ------
    ValueError
        If ``asr_service`` is not supported. Because this is a generator,
        the error surfaces on the first iteration, not at call time.
    """
    # Fail loudly instead of leaving the automatic path unbound (or silently
    # reusing the previous file's path) when the service name is mistyped.
    supported_services = ("rev", "whisper", "google")
    if asr_service not in supported_services:
        raise ValueError(
            f"Unsupported asr_service {asr_service!r}; expected one of {supported_services}"
        )

    load_dir = DATA_DIR / f"{task}/10_SCTK_Inputs"

    def default_auto_csv_path(filename: str) -> Path:
        # Rev output sits next to the manual files; the other services have
        # their own directory under the task.
        if asr_service == "rev":
            return load_dir / f"{filename}_auto.csv"
        if asr_service == "whisper":
            return DATA_DIR / f"{task}/14_ASR_Whisper/{filename}_auto.csv"
        return DATA_DIR / f"{task}/13_ASR_Google/{filename}_auto.csv"

    if rating_filter is None:
        for manu_csv_path in load_dir.glob("*_manu.csv"):
            filename = manu_csv_path.stem.removesuffix("_manu")

            if asr_service == "google" and task not in MONOLOGUE_TASK:
                # For non-monologue tasks the Google pair is keyed by the first
                # three characters of the file name, and the manual side is
                # taken from the 01_Manual_TextGrid directory instead.
                # NOTE(review): the rating-filtered branch below does not apply
                # this special case — confirm whether that is intended.
                short_name = filename[:3]
                yield (
                    DATA_DIR / f"{task}/01_Manual_TextGrid/{short_name}.csv",
                    DATA_DIR / f"{task}/13_ASR_Google/{short_name}.csv",
                )
            else:
                yield manu_csv_path, default_auto_csv_path(filename)
        return

    rating_dir = DATA_DIR / f"{task}/12_PF_Rating"
    uid_list = pd.read_csv(rating_dir / "pf_rating.csv")["uid"].to_numpy()

    df_logit = pd.read_csv(rating_dir / "logit.csv", index_col=0)
    rating_list = logit_2_rating(df_logit["theta"], rating_dir / "threshold.csv")

    # Keep the uids whose rating is one of the requested values.
    # assumes logit.csv rows are in the same order as pf_rating.csv — TODO confirm
    selected = np.isin(np.asarray(rating_list), rating_filter)

    for uid in uid_list[selected]:
        if task == "WoZ_Interview":
            # File names of this task start with a zero-padded 3-digit uid.
            uid = str(int(uid)).zfill(3)

        for manu_csv_path in load_dir.glob(f"{uid}*_manu.csv"):
            filename = manu_csv_path.stem.removesuffix("_manu")
            yield manu_csv_path, default_auto_csv_path(filename)

def load_dataset(
        asr_service: str ="rev",
        rating_filter_monologue: Optional[List[int]] =None,
        rating_filter_dialogue: Optional[List[int]] =None,
) -> Dict[str, Dict[str, List[Dict[str, pd.DataFrame]]]]:
    """Load the manual and automatic annotation tables of every task.

    Returns a nested dict ``{"monologue" | "dialogue": {task: [record, ...]}}``
    where each record is ``{"manual": DataFrame, "automatic": DataFrame}``.
    When the automatic CSV does not exist, an empty DataFrame with a single
    ``"text"`` column is used in its place.

    ``rating_filter_monologue`` / ``rating_filter_dialogue`` are forwarded as
    the ``rating_filter`` of ``annotation_result_csv_path_generator`` for the
    respective task group.
    """
    # Dialogue CSVs are read so that only empty / single-space cells count as
    # missing values (pandas' default NA strings are disabled).
    dialogue_csv_options = {"na_values": ["", " "], "keep_default_na": False}

    group_specs = (
        ("monologue", MONOLOGUE_TASK, rating_filter_monologue, {}),
        ("dialogue", DIALOGUE_TASK, rating_filter_dialogue, dialogue_csv_options),
    )

    dataset = {}
    for group_name, task_names, rating_filter, csv_options in group_specs:
        per_task = {}

        for task_name in task_names:
            path_pairs = annotation_result_csv_path_generator(
                task_name,
                asr_service=asr_service,
                rating_filter=rating_filter,
            )

            records = []
            for manu_csv_path, auto_csv_path in path_pairs:
                manual = pd.read_csv(manu_csv_path, **csv_options)

                if auto_csv_path.exists():
                    automatic = pd.read_csv(auto_csv_path, **csv_options)
                else:
                    automatic = pd.DataFrame([], columns=["text"])

                records.append({"manual": manual, "automatic": automatic})

            per_task[task_name] = records

        dataset[group_name] = per_task

    return dataset

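The following code block defines the functions used to calculate WER: a helper that converts the Google ASR dialogue tables into token-level tables, and the WER calculation itself.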

In [3]:
def convert_google_dialog(df_manu: pd.DataFrame, df_auto: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Convert utterance-level dialogue tables into token-level tables.

    Only rows whose ``speaker`` is ``"user"`` and whose ``topic`` (manual) or
    ``phase`` (automatic) is neither ``"intro"`` nor ``"closing"`` are kept.
    The remaining utterances (``transcript`` column for the manual table,
    ``text`` column for the automatic one) are joined and split on spaces.

    Returns
    -------
    (manual, automatic):
        Two DataFrames with one row per token and the columns ``"text"`` and
        ``"hoge"`` (a copy of ``"text"``, kept for backward compatibility).
        Either may be empty when no utterance survives the filtering.
    """
    def tokenize(utterances: pd.Series) -> List[str]:
        # Missing cells are skipped so that the join cannot fail on NaN.
        joined = " ".join(utterances.dropna().astype(str))
        # Dropping empty pieces collapses repeated spaces and strips leading /
        # trailing ones in a single pass; it also handles the empty string.
        return [token for token in joined.split(" ") if token]

    excluded_sections = ["intro", "closing"]
    keep_manu = (df_manu["speaker"] == "user") & ~df_manu["topic"].isin(excluded_sections)
    keep_auto = (df_auto["speaker"] == "user") & ~df_auto["phase"].isin(excluded_sections)

    token_manu = tokenize(df_manu.loc[keep_manu, "transcript"])
    # The original stripped the trailing space of the manual text twice and
    # never that of the automatic text, which left a spurious empty token.
    token_auto = tokenize(df_auto.loc[keep_auto, "text"])

    df_manu = pd.DataFrame({"text": token_manu, "hoge": token_manu})
    df_auto = pd.DataFrame({"text": token_auto, "hoge": token_auto})

    return df_manu, df_auto

def calculate_wer(annotation_results: List[Dict[str, pd.DataFrame]], remove_filer: bool =False, google_dialog: bool =False) -> float:
    """Compute the WER over a list of manual/automatic annotation pairs.

    For every pair, rows whose ``text`` ends with ``">"`` are dropped, fillers
    listed in ``FILLER`` are optionally removed, and the remaining tokens are
    joined into one reference (manual) and one hypothesis (automatic) string.
    Pairs for which either string is empty are skipped. The collected strings
    are passed to ``wer``.

    Parameters
    ----------
    annotation_results:
        Records with the keys ``"manual"`` and ``"automatic"``.
    remove_filer:
        If true, drop tokens that exactly match an entry of ``FILLER``.
    google_dialog:
        If true, first convert each pair with ``convert_google_dialog``.
    """
    def to_text(df: pd.DataFrame) -> str:
        # Drop rows ending with ">" first, then (optionally) the fillers.
        is_tag = df["text"].astype(str).str.endswith(">")
        words = df[~is_tag]
        if remove_filer:
            words = words[~words["text"].isin(FILLER)]
        return " ".join(words["text"].astype(str))

    ref, hyp = [], []

    for record in annotation_results:
        df_manu, df_auto = record["manual"], record["automatic"]

        if google_dialog:
            df_manu, df_auto = convert_google_dialog(df_manu, df_auto)

        text_manu = to_text(df_manu)
        text_auto = to_text(df_auto)

        # A pair without any reference or hypothesis text is skipped.
        if not text_manu or not text_auto:
            continue

        ref.append(text_manu)
        hyp.append(text_auto)

    return wer(ref, hyp)

---

## 3. Preliminary Analyses (Whisper)

This section conducts the preliminary analyses.
The following code block loads the entire dataset.

In [4]:
# Load all tasks with the Whisper ASR output; no rating filter is applied.
dataset = load_dataset(asr_service="whisper")

### 3.1. WER

The following code block calculates the WER of the monologue tasks.

In [5]:
# Report the WER (fillers removed) of each monologue task and pool the
# per-task records for the overall monologue WER.
monologue_data = []
for monologue_task in MONOLOGUE_TASK:
    annotation_results = dataset["monologue"][monologue_task]
    res = calculate_wer(annotation_results, remove_filer=True)
    print(f"WER of {monologue_task} = {res}")
    monologue_data.extend(annotation_results)

# WER over the records of all monologue tasks together.
res = calculate_wer(monologue_data, remove_filer=True)
print(f"WER of monologue task = {res}")

WER of Arg_Oly = 0.19619777063530056
WER of Cartoon = 0.19581162963750398
WER of RtSwithoutRAA = 0.23387767838568754
WER of RtSwithRAA = 0.2385330632174952
WER of monologue task = 0.21750375438011013


The following code block calculates the WER of the dialogue task.

In [6]:
# Report the WER (fillers removed) of each dialogue task and pool the
# per-task records.
dialogue_data = []
for dialogue_task in DIALOGUE_TASK:
    annotation_results = dataset["dialogue"][dialogue_task]
    res = calculate_wer(annotation_results, remove_filer=True)
    print(f"WER of {dialogue_task} = {res}")
    dialogue_data.extend(annotation_results)

WER of WoZ_Interview = 0.17853543836076588


The following code block calculates the WER across all tasks.

In [7]:
# Combine the monologue and dialogue records and compute one WER over them.
all_task_data = [*monologue_data, *dialogue_data]

res = calculate_wer(all_task_data, remove_filer=True)
print(f"WER of all tasks = {res}")

WER of all tasks = 0.20457079152731328


---

## 5. Additional Analyses

This section conducts the same analyses for each PF group.

### 5.1. Beginners

The following code block loads beginners' speech.

In [8]:
# Beginner group: ratings 0-2 for the monologue tasks and 0-1 for the dialogue task.
beginner_dataset = load_dataset(asr_service="whisper", rating_filter_monologue=[0, 1, 2], rating_filter_dialogue=[0, 1])

The following code block calculates WER of beginners' speech.

In [9]:
# Monologue tasks: per-task WER (fillers removed), then the pooled WER.
monologue_data = []
for monologue_task in MONOLOGUE_TASK:
    annotation_results = beginner_dataset["monologue"][monologue_task]
    res = calculate_wer(annotation_results, remove_filer=True)
    print(f"WER of {monologue_task} = {res}")
    monologue_data.extend(annotation_results)

res = calculate_wer(monologue_data, remove_filer=True)
print(f"WER of monologue task = {res}")

# Dialogue tasks: per-task WER.
dialogue_data = []
for dialogue_task in DIALOGUE_TASK:
    annotation_results = beginner_dataset["dialogue"][dialogue_task]
    res = calculate_wer(annotation_results, remove_filer=True)
    print(f"WER of {dialogue_task} = {res}")
    dialogue_data.extend(annotation_results)

# All tasks together.
all_task_data = [*monologue_data, *dialogue_data]

res = calculate_wer(all_task_data, remove_filer=True)
print(f"WER of all tasks = {res}")

WER of Arg_Oly = 0.279711508309815
WER of Cartoon = 0.28061224489795916
WER of RtSwithoutRAA = 0.30720235178833905
WER of RtSwithRAA = 0.27638190954773867
WER of monologue task = 0.28927903038944836
WER of WoZ_Interview = 0.2614270629054156
WER of all tasks = 0.2789355742296919


### 5.2. Intermediate

The following code block loads intermediate group's speech.

In [10]:
# Intermediate group: ratings 3-5 for the monologue tasks and 2-3 for the dialogue task.
# NOTE(review): the variable name is misspelled ("intemediate"); it is kept
# because the following cell refers to it by this name.
intemediate_dataset = load_dataset(asr_service="whisper", rating_filter_monologue=[3, 4, 5], rating_filter_dialogue=[2, 3])

The following code block calculates WER of intermediate learners' speech.

In [11]:
# Monologue tasks: per-task WER (fillers removed), then the pooled WER.
monologue_data = []
for monologue_task in MONOLOGUE_TASK:
    annotation_results = intemediate_dataset["monologue"][monologue_task]
    res = calculate_wer(annotation_results, remove_filer=True)
    print(f"WER of {monologue_task} = {res}")
    monologue_data.extend(annotation_results)

res = calculate_wer(monologue_data, remove_filer=True)
print(f"WER of monologue task = {res}")

# Dialogue tasks: per-task WER.
dialogue_data = []
for dialogue_task in DIALOGUE_TASK:
    annotation_results = intemediate_dataset["dialogue"][dialogue_task]
    res = calculate_wer(annotation_results, remove_filer=True)
    print(f"WER of {dialogue_task} = {res}")
    dialogue_data.extend(annotation_results)

# All tasks together.
all_task_data = [*monologue_data, *dialogue_data]

res = calculate_wer(all_task_data, remove_filer=True)
print(f"WER of all tasks = {res}")

WER of Arg_Oly = 0.18277082199237887
WER of Cartoon = 0.20087855770110735
WER of RtSwithoutRAA = 0.23568212752591353
WER of RtSwithRAA = 0.2473844076949038
WER of monologue task = 0.21979434447300772
WER of WoZ_Interview = 0.17749749050757213
WER of all tasks = 0.20431052387723475


### 5.3. Advanced 

The following code block loads advanced learners' speech.

In [12]:
# Advanced group: ratings 6-8 for the monologue tasks and 4-5 for the dialogue task.
advanced_dataset = load_dataset(asr_service="whisper", rating_filter_monologue=[6, 7, 8], rating_filter_dialogue=[4, 5])

The following code block calculates WER of advanced learners' speech.

In [13]:
# Monologue tasks: per-task WER (fillers removed), then the pooled WER.
monologue_data = []
for monologue_task in MONOLOGUE_TASK:
    annotation_results = advanced_dataset["monologue"][monologue_task]
    res = calculate_wer(annotation_results, remove_filer=True)
    print(f"WER of {monologue_task} = {res}")
    monologue_data.extend(annotation_results)

res = calculate_wer(monologue_data, remove_filer=True)
print(f"WER of monologue task = {res}")

# Dialogue tasks: per-task WER.
dialogue_data = []
for dialogue_task in DIALOGUE_TASK:
    annotation_results = advanced_dataset["dialogue"][dialogue_task]
    res = calculate_wer(annotation_results, remove_filer=True)
    print(f"WER of {dialogue_task} = {res}")
    dialogue_data.extend(annotation_results)

# All tasks together.
all_task_data = [*monologue_data, *dialogue_data]

res = calculate_wer(all_task_data, remove_filer=True)
print(f"WER of all tasks = {res}")

WER of Arg_Oly = 0.15516397454723446
WER of Cartoon = 0.15842753500927956
WER of RtSwithoutRAA = 0.1773011617515639
WER of RtSwithRAA = 0.20521353300055464
WER of monologue task = 0.17485844792310987
WER of WoZ_Interview = 0.09349725008087997
WER of all tasks = 0.15636604286922312
