# Building Dialog Structure from Two‑Channel Speech Data

This notebook walks you through taking the word‑level transcripts and laughter probabilities you generated earlier to build a structured dialogue table for personality prediction.

> **Before you begin:**  
> Run all previous preprocessing notebooks before proceeding.

---


In [1]:
import warnings
warnings.filterwarnings("ignore")

# from pathlib import Path
import librosa
import numpy as np
import os
import pandas as pd

import sys
sys.path.append("../sho_util/pyfiles/")
from basic import get_bool_base_on_conditions

sys.path.append('./../../../laughter-detection/')
import laugh_segmenter

sys.path.append('../pyfiles/')
from dialog import save_audio, EnglishTextNormalizer, update_information, concatenate_close_voice, delete_fully_overlap, get_timeshifted_for_overlap, get_sentence_start, get_fully_overlap

# Scratch WAV file used when the individual channels are written out and re-loaded.
tempfile = "temp.wav"

# Raise the pandas display limits used when the tables are printed in the notebook.
for option_name, option_value in (
    ('display.max_rows', 500),
    ('display.max_columns', 504),
    ('display.max_colwidth', None),
):
    pd.set_option(option_name, option_value)

# Text normalizer applied to every transcribed word after loading.
normalizer = EnglishTextNormalizer()

---

In this example, we will preprocess the transcriptions and laughter probabilities and construct a dialog table, which will be used for personality prediction. Please adjust the following variables:

- `audiopath`: A string containing the path to the two‑channel speech file.  
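- `feature_dir`: A string specifying the directory that holds the preprocessed results for this recording. The transcripts (`whisper/`) and laughter probabilities (`laughs/`) produced by the previous notebooks are read from this directory, and the dialog tables built here are written to `results/` inside it.  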
- `sr`: An integer specifying the sampling rate used in silent section detection.  

We also provide several editable parameters for laughter detection, silent-section removal, and dialog construction. The default values were chosen based on our experiments and the literature, so we recommend keeping them as is—but feel free to adjust if needed.

---

In [2]:
###########################################
########## Adjustable Parameters ##########
###########################################

# Path to the two-channel speech file to process.
audiopath = "../audio/sample.wav"
# Directory holding the preprocessed features of this recording. The
# "whisper/" and "laughs/" inputs are read from it and the output table is
# written under "results/" (see the path construction further below).
feature_dir = "../audio/features/sample/"
# Sampling rate passed to librosa.load and used to convert seconds <-> samples.
sr = 16000 

#################################################
##### Editable Parameters (Not recommended) #####
#################################################

### Silent section deletion
# Words whose trimmed duration [s] is below this value are dropped.
minimum_length = 0.01
# Passed to librosa.effects.trim as top_db / frame_length / hop_length.
top_db = 15
trim_window = 256
trim_stride = 128

### Laughter Analysis

# Passed to laugh_segmenter.get_laughter_instances as `threshold` and
# `min_length`. The defaults are recommended, but they can be changed if needed.
threshold = 0.5 
min_length = 0.2

# When False, no laughter tokens are inserted into the word streams.
include_laughter = True
# When True, inserted laughter tokens get 0.01 s stamps anchored to the
# neighbouring word boundaries instead of the detected laugh start/end.
always_word_timestamp = True
# When True, laughs that end before the next word starts get no "[Laugh]" token.
exclude_short_laughs = False

### Dialog Generation
# [s] gap threshold for concatenating consecutive transcriptions
# (used to connect words into a sentence).
concatenation_threshold1 = 0.7
# turn-taking
# [s] duration threshold passed to delete_fully_overlap for fully-overlapped utterances.
thres_duration_fo = np.inf
# [s] overlap allowed between different speakers; passed to
# get_timeshifted_for_overlap, which shifts the following utterances by the overlap.
thres_overlap_timeshift = concatenation_threshold1
# [s] gap threshold for the second round of concatenation.
concatenation_threshold2 = concatenation_threshold1
# [s] the minimum silent length in a single conversation (passed to get_sentence_start).
thres_sentence_length = 3.0
# backchannel
# Passed to update_information as `margin_fully_overlap`.
fully_margin_interjection = 0.0
# [s] fully-overlapped utterances no longer than this become backchannel candidates.
backchannel_threshold = np.inf
# controlling interjection
# Passed to update_information as `margin_overlap`.
margin_interjection = concatenation_threshold1
# NOTE(review): not referenced anywhere else in this cell.
minimum_duration_ci = 0.0

# Words whose transcription appears in one of these lists are labelled with the
# list's key and removed from the interjection candidates.
backchannel_dict = {
    "emotive": [
        "wow", "ouch", "yay", "ew", "ooh", "phew", "ugh", "aha", "eek", "really", "sorry", "oh really", 
        "[Laugh]", "oh",
    ],
    "cognitive": [
        "okay", "mm hmm", "true", "ok", "mm hmm, exactly", "exactly", "certainly", "absolutely",
        "oh i see", "mm", "hmm", "ah", "mm hmm exactly", "yeah", "right", "true okay", "well", "um", 
        "let's see", "oh yeah", "yes", "sure", "absolutely yeah", "yep", "that's right", "uh huh",
        "oh okay", "you think", "yeah uh huh", "uh huh yeah", "mm hmm yeah", "yeah mm hmm", "that's true"
    ],
}
# Words labelled "not controlling"; they are removed from the interjection
# candidates as well. (The spelling of the name is kept because it is used below.)
nocontolling_list =  []

################################################################################
################################################################################
################################################################################

print("#####################")
print("### Load all data ###")
print("#####################")

# Resolve the per-recording files: the audio name minus its last four
# characters, with ".npy" appended, under the matching feature sub-directory.
recording_name = os.path.basename(audiopath[:-4])
resultpath = feature_dir + "whisper/" + recording_name + ".npy"
laughpath = feature_dir + "laughs/" + recording_name + ".npy"
tablepath = laughpath.replace("laughs", "results")

# audio[0] and audio[1] hold the individual channels (each one written to the
# scratch file and loaded back); audio[2] is the file loaded with librosa's
# default mono setting.
stereo, _ = librosa.load(audiopath, sr=sr, mono=False)
audio = []
for channel in range(2):
    save_audio(tempfile, stereo[channel], sr)
    channel_wave = librosa.load(tempfile, sr=sr)[0]
    audio.append(channel_wave)
audio.append(librosa.load(audiopath, sr=sr)[0])

# The transcript file stores exactly two entries, one per channel.
first_channel, second_channel = np.load(resultpath, allow_pickle=True)
results = [first_channel, second_channel]

# Normalize the text of every word in place.
for channel_result in results:
    for segment in channel_result["segments"]:
        for word in segment["words"]:
            word["word"] = normalizer(word["word"])

# The top-left entry of the laughter array is read as the frame rate and the
# remaining columns of each row as that channel's probabilities.
laugh_table = np.load(laughpath)
fps = laugh_table[0][0]
probs = laugh_table[:, 1:]
laughs = [
    np.round(
        np.array(
            laugh_segmenter.get_laughter_instances(
                probs[channel], threshold=threshold, min_length=float(min_length), fps=fps
            )
        ),
        2,
    )
    for channel in range(2)
]

print("##########################")
print("### delete silent sections")
print("##########################")

# Flatten the words of all segments into one {start time: word entry} mapping
# per channel (a later word with the same start replaces the earlier one).
texts = [
    {
        word["start"]: word
        for segment in results[channel]["segments"]
        for word in segment["words"]
    }
    for channel in range(2)
]

# Tighten every word's time span to the non-silent part of its audio.
trimmed_texts = []
for channel in range(2):
    trimmed_words = {}
    for word in texts[channel].values():
        word_start = word["start"]
        word_end = word["end"]
        samples = audio[channel][int(sr*word_start):int(sr*word_end)]
        # The second return value is the [start, end] sample interval that was kept.
        _, spans = librosa.effects.trim(
            samples, top_db=top_db, frame_length=trim_window, hop_length=trim_stride
        )
        trimmed_start, trimmed_end = word_start + spans/sr
        # Drop words that become shorter than the minimum length.
        if trimmed_end-trimmed_start<minimum_length:
            continue
        trimmed_words[trimmed_start] = {
            "word": word["word"],
            "start": trimmed_start,
            "end": trimmed_end,
            "probability": word["probability"],
        }
    trimmed_texts.append(trimmed_words)
texts = trimmed_texts

# Per-channel arrays of the word start and end times, in dictionary order.
starts = [np.array(list(texts[channel].keys())) for channel in range(2)]
ends = [
    np.array([entry["end"] for entry in texts[channel].values()])
    for channel in range(2)
]

print("#############################")
print("##### Laughter Analysis #####")
print("#############################")

# Insert laughter tokens ("[Laugh]", "[StartLaugh]", "[EndLaugh]") into the
# per-channel word dictionaries `texts`, which are keyed by start time.
# NOTE(review): because the tokens are stored under their start time, a token
# whose start equals an existing key replaces that entry.
if include_laughter:
    ### Initialization
    # Split the detected laughs of each channel into start and end arrays.
    startlaughs, endlaughs = [], []
    for i in range(2):
        if len(laughs[i])==0:
            startlaughs += [np.array([])]
            endlaughs += [np.array([])]
            continue
        startlaughs += [np.array(laughs[i])[:,0]]
        endlaughs += [np.array(laughs[i])[:,1]]

        # Keep only the laughs that start before the last word starts.
        # NOTE(review): starts[i][-1] raises IndexError when the channel has
        # laughs but no words.
        bl = startlaughs[i]<starts[i][-1]
        startlaughs[i] = startlaughs[i][bl]
        endlaughs[i] = endlaughs[i][bl]

        # Clip laugh ends to the end of the last word.
        bl = endlaughs[i]>ends[i][-1]
        endlaughs[i][bl] = ends[i][-1]

    for cl in range(2):
        if len(laughs[cl])==0:
            continue

        ### Determine Short Laughs without Overlap to Words
        # For every laugh, find the first word that starts at or after the
        # laugh start (negative differences are masked with inf before argmin).
        a = starts[cl].reshape(1, -1)-startlaughs[cl].reshape(-1, 1)
        a[a<0] = np.inf
        indices = np.argmin(a, axis=1)

        # A laugh is "short" when that word starts after the laugh has ended.
        shortlaugh = starts[cl][indices]>endlaughs[cl]
        for idx in np.arange(len(shortlaugh))[shortlaugh]:
            start = startlaughs[cl][idx]
            if always_word_timestamp:
                end = start + 0.01
            else:
                end = endlaughs[cl][idx]
            if not(exclude_short_laughs):
                texts[cl][start] = {"word": '[Laugh]', "start": start, "end": end, "probability": 0.8, }

        ### Update
        # Remove the short laughs from the pending list.
        bl = (1-shortlaugh).astype(bool)
        startlaughs[cl] = startlaughs[cl][bl]
        endlaughs[cl] = endlaughs[cl][bl]

        # `shift` counts how many words past the first following word are
        # examined; it grows by one per iteration until every laugh is placed.
        shift = 0
        while True:
            ### When the end of laugh is within the spoken word
            a = starts[cl].reshape(1, -1)-startlaughs[cl].reshape(-1, 1)
            a[a<0] = np.inf
            indices = np.argmin(a, axis=1)
            # The word at offset `shift` ends at or after the laugh end.
            getlaughs = ends[cl][indices+shift]>=endlaughs[cl]
            # print(getlaughs)
            for idx in np.arange(len(getlaughs))[getlaughs]:
                # [StartLaugh] ends where the first following word starts.
                if always_word_timestamp:
                    start = starts[cl][indices][idx]-0.01
                else:
                    start = startlaughs[cl][idx]
                end = starts[cl][indices][idx]
                texts[cl][start] = {"word": '[StartLaugh]', "start": start, "end": end, "probability": 0.8, }

                # [EndLaugh] is a zero-length token at the end of the word at offset `shift`.
                start = ends[cl][indices+shift][idx]
                end = start
                texts[cl][start] = {"word": '[EndLaugh]', "start": start, "end": end, "probability": 0.8, }

            ### Update
            bl = (1-getlaughs).astype(bool)
            startlaughs[cl] = startlaughs[cl][bl]
            endlaughs[cl] = endlaughs[cl][bl]
            if len(startlaughs[cl])<=0:
                break


            ### When the end of laugh is in the silent section
            a = starts[cl].reshape(1, -1)-startlaughs[cl].reshape(-1, 1)
            a[a<0] = np.inf
            indices = np.argmin(a, axis=1)
            # The word after offset `shift` starts only after the laugh has ended.
            getlaughs = starts[cl][indices+shift+1]>endlaughs[cl]
            for idx in np.arange(len(getlaughs))[getlaughs]:
                # Here [StartLaugh] always uses the word-based stamp,
                # regardless of always_word_timestamp.
                # start = startlaughs[cl][idx]
                start = starts[cl][indices][idx]-0.01
                end = starts[cl][indices][idx]
                texts[cl][start] = {"word": '[StartLaugh]', "start": start, "end": end, "probability": 0.8, }

                # [EndLaugh] starts at the end of the word at offset `shift`.
                start = ends[cl][indices+shift][idx]
                if always_word_timestamp:
                    end = ends[cl][indices+shift][idx]+0.01
                else:
                    end = endlaughs[cl][idx]
                texts[cl][start] = {"word": '[EndLaugh]', "start": start, "end": end, "probability": 0.8, }

            ### Update
            bl = (1-getlaughs).astype(bool)
            startlaughs[cl] = startlaughs[cl][bl]
            endlaughs[cl] = endlaughs[cl][bl]
            if len(startlaughs[cl])<=0:
                break

            shift += 1

    # Re-sort each channel by start time now that tokens were added, and
    # collect the sorted start times.
    newstart = []
    for cl in range(2):
        texts[cl] = dict(sorted(texts[cl].items()))
        newstart += [np.array(list(texts[cl].keys()))]
else:
    # Without laughter tokens the word start times are used unchanged.
    newstart = starts

print("###########################################################")
print("### Preprocess the word-level time stamp into sentences ###")
print("###########################################################")

# Merge the two per-channel word streams (channel 0 -> speaker "A",
# channel 1 -> speaker "B") into one list ordered by start time.
# The loop runs until BOTH streams are exhausted: stopping as soon as one
# stream ends would silently drop every remaining word of the other speaker.
arrays = []
i, j = 0, 0
count_a, count_b = len(newstart[0]), len(newstart[1])
while i < count_a or j < count_b:
    # Take from A when B is exhausted, or when A's next word starts no later
    # than B's next word (ties go to A).
    take_a = j >= count_b or (i < count_a and newstart[0][i] <= newstart[1][j])
    if take_a:
        word = texts[0][newstart[0][i]]
        arrays.append([word["word"], word["start"], word["end"], "A"])
        i += 1
    else:
        word = texts[1][newstart[1][j]]
        arrays.append([word["word"], word["start"], word["end"], "B"])
        j += 1

# Word-level table with the columns in the order used by the dialog helpers.
data = pd.DataFrame(arrays, columns=["transcription", "start", "end", "speaker"])
data = data[["start", "end", "speaker", "transcription"]]

# rawdata keeps the word-level rows; original_data holds the rows after close
# words have been concatenated and is the starting point of the later analyses.
# concatenation_threshold1: From previous literature, language generation takes 0.6 [s]
data = update_information(data, [""])
rawdata = data.copy()
data = concatenate_close_voice(data, concatenation_threshold1, if_consecutive=False)
data = update_information(data, [""])
original_data = data.copy()

print("#########################################")
print("### Get Dialog with Shifted Timestamp ###")
print("#########################################")

# Build `data1`, the turn-taking view of the dialog, from a copy of
# `original_data`. Every step below calls a helper from the project's
# `dialog` module; the order of the calls is significant.

# Initialize Dialog
data1 = original_data.copy()

# Delete the fully-overlapped utterance
data1 = delete_fully_overlap(data1, thres_duration_fo)
data1 = update_information(data1, [""])

# Time Shifting based on Overlaps
# NOTE(review): this repeats the update_information call made just above;
# it looks redundant - confirm that the helper is idempotent before removing it.
data1 = update_information(data1, [""])
data1 = get_timeshifted_for_overlap(data1, thres_overlap_timeshift)
data1 = update_information(data1)

# Column names of the time-shifted variants. Only `btkey` and `fokey` are
# used below; the others are defined but not referenced in this cell.
startkey = "start-timeshift"
endkey = "end-timeshift"
btkey = "duration-before-talking-timeshift"
overlapkey = "Overlap-timeshift"
fokey = "Fully-Overlap-timeshift"

# Concatenation of Close Voices from the same speaker
data1 = concatenate_close_voice(data1, concatenation_threshold2, btkey, if_consecutive=True)
data1 = update_information(data1)

# Delete the fully-overlapped utterance and Concatenate again based on the deletion
data1 = delete_fully_overlap(data1, thres_duration_fo, fokey=fokey)
data1 = update_information(data1)
data1 = concatenate_close_voice(data1, concatenation_threshold2, btkey, if_consecutive=True) 
data1 = update_information(data1)

# Get Sentence Segmentation
data1 = get_sentence_start(data1, thres_sentence_length)

print("######################################")
print("### Get Candidates of Backchannels ###")
print("######################################")

# Build `data2`, the backchannel view of the dialog, from a copy of
# `original_data`.

# Initialize Dialog
data2 = original_data.copy()

data2 = concatenate_close_voice(data2, concatenation_threshold2, if_consecutive=False)
data2 = update_information(data2, [""], margin_fully_overlap=fully_margin_interjection)
# A row is a backchannel candidate when its "Fully-Overlap" value is True and
# its duration does not exceed backchannel_threshold.
data2["BC-Candidates"] = False
bl = get_bool_base_on_conditions(data2, {"Fully-Overlap": [True]})*(data2["duration"]<=backchannel_threshold)
data2.loc[bl, "BC-Candidates"]  = True

print("#######################################")
print("### Get Candidates of Interjections ###")
print("#######################################")

# Word-level rows of each speaker, used later to split utterances back into words.
rawdata_dir = {
    speaker_label: rawdata[get_bool_base_on_conditions(rawdata, {"speaker": [speaker_label]})]
    for speaker_label in ("A", "B")
}

# Start again from the concatenated dialog, with overlaps computed using the
# interjection margin.
data3 = update_information(original_data.copy(), [""], margin_overlap=margin_interjection)

### Delete the fully-overlapped ones since it may not be the controlling interjection.
fully_overlapped = data3[get_bool_base_on_conditions(data3, {"Fully-Overlap": [True]})]
for position in range(len(fully_overlapped)):
    contained = fully_overlapped.iloc[position]
    idx = contained.name
    # Clear the overlap list of the fully-overlapped row itself ...
    data3.loc[idx, "Overlap"] = ""
    # ... and remove its index from the overlap list of the first row it overlaps with.
    partner = int(contained["Overlap"].split("-")[0])
    partner_row = data3.loc[partner]
    remaining = [ol for ol in partner_row["Overlap"].split("-") if ol != str(idx)]
    data3.loc[partner, "Overlap"] = "-".join(remaining)

# Refresh the fully-overlap information and keep only the rows that still
# overlap with another row.
data3 = get_fully_overlap(data3)
without_overlap = get_bool_base_on_conditions(data3, {"Overlap":[""]})
data3 = data3[(1-without_overlap).astype(bool)]

# Build `dfci`, the word-level table of controlling-interjection candidates,
# from the overlapping utterances left in `data3`. `dfci` is None when there
# are no candidates.
if len(data3)>0:

    ### Split all sentences into words and Delete the common backchannels 
    df_list = []
    for i in range(len(data3)):
        array = data3.iloc[i]
        speaker = array["speaker"]
        # Positions of the speaker's raw words whose start / end are closest
        # to the utterance boundaries; the slice between them is the utterance.
        start = np.argmin(np.abs(rawdata_dir[speaker]["start"]-array["start"]))
        end = np.argmin(np.abs(rawdata_dir[speaker]["end"]-array["end"]))
        df_list += [rawdata_dir[speaker].iloc[start:end+1]]
    dfci = pd.concat(df_list, axis=0)

    # Assign exactly one label per word: the backchannel category, otherwise
    # "not controlling", otherwise "". A word present in both backchannel_dict
    # and nocontolling_list previously received two labels, which made the
    # label list longer than the table and broke the column assignment below.
    bc_labels = []
    for text in dfci["transcription"]:
        label = ""
        for key in backchannel_dict:
            if text in backchannel_dict[key]:
                label = key
                break
        if label == "" and text in nocontolling_list:
            label = "not controlling"
        bc_labels.append(label)
    dfci["BC-Labels"] = bc_labels
    # Keep only the unlabelled words (neither backchannel nor "not controlling").
    dfci = dfci[get_bool_base_on_conditions(dfci, {"BC-Labels": [""]})]
    dfci = dfci.sort_values("start")
    dfci = dfci.reset_index(drop=True)

    if len(dfci)==0:
        dfci = None
    else:
        # Rebuild utterances from the remaining words and keep the ones that
        # still overlap with another utterance.
        dfci = dfci[["start", "end", "speaker", "transcription"]]
        dfci = update_information(dfci, [""], margin_overlap=margin_interjection)
        dfci = concatenate_close_voice(dfci, concatenation_threshold1, if_consecutive=False)
        dfci = update_information(dfci, [""], margin_overlap=margin_interjection)
        dfci = dfci[(1-get_bool_base_on_conditions(dfci, {"Overlap":[""]})).astype(bool)]

        # Only get the interjections and delete the ones getting interjected
        # For every overlapping pair the row with the larger index is kept.
        # `fo` is a working copy whose "Overlap" entries are consumed as the
        # pairs are processed, so a pair is not visited again from its other row.
        # NOTE(review): ci_list can contain the same index more than once, in
        # which case dfci.loc[ci_list] repeats that row - confirm this is intended.
        ci_list = []
        fo = dfci.copy()
        for i in range(len(fo)):
            array = fo.iloc[i]
            if array["Overlap"]=="":
                continue
            idx = array.name
            fo.loc[idx, "Overlap"] = ""
            for delol in array["Overlap"].split("-"):
                delol = int(delol)
                arrayol = dfci.loc[delol]
                newol = []
                for ol in arrayol["Overlap"].split("-"):
                    if ol!=str(idx):
                        newol += [ol]
                newol = "-".join(newol)
                fo.loc[delol, "Overlap"] = newol
                ci_list += [max(delol, idx)]
        dfci = dfci.loc[ci_list]
        dfci = dfci[dfci["Fully-Overlap"]==False]
else:
    dfci = None

# Save all tables of this recording into a single file at `tablepath`.
# rawdata focuses on the word-level annotation
# data1 focuses on the turn-taking behaviors
# data2 focuses on the backchannels behaviors
# data3 and dfci focus on the interjection behaviors (dfci is None when no
# interjection candidate was found)
os.makedirs(os.path.dirname(tablepath), exist_ok=True)
data_dict = {'rawdata': rawdata, 'data1': data1, 'data2': data2, 'data3': data3, 'dfci': dfci}
# The dict is stored as a pickled object array, so it has to be read back
# with np.load(tablepath, allow_pickle=True).item().
np.save(tablepath, data_dict)

#####################
### Load all data ###
#####################
##########################
### delete silent sections
##########################
#############################
##### Laughter Analysis #####
#############################
###########################################################
### Preprocess the word-level time stamp into sentences ###
###########################################################
#########################################
### Get Dialog with Shifted Timestamp ###
#########################################
######################################
### Get Candidates of Backchannels ###
######################################
#######################################
### Get Candidates of Interjections ###
#######################################
