# Preprocessing Two‑Channel Speech Data

This notebook shows you how to preprocess a two‑channel speech dialogue file and produce:

1. **Word‑level transcriptions** via [OpenAI Whisper](https://github.com/openai/whisper)
2. **Laughter probability scores** for each word via our [Laughter Detector](https://github.com/jrgillick/laughter-detection)

> **Before you begin:**  
> Install both Whisper and the laughter detector by following the instructions in our [README](https://github.com/shinshoji01/Personality-Prediction-for-Conversation-Agents/tree/main/implementation).


In [2]:
# Imports and shared setup for the whole notebook.
import warnings
# Suppresses every warning for the rest of the kernel session.
warnings.filterwarnings("ignore")

import torch
import librosa
import numpy as np
import glob
import os
import whisper
import pandas as pd

import sys
# NOTE: every sys.path entry below is relative, so it is resolved against the
# kernel's working directory; the imports that follow depend on these appends
# having run first, so the statement order in this cell matters.
sys.path.append("../sho_util/pyfiles/")

# Presumably provided by ../sho_util/pyfiles/ -- confirm against the repo layout.
from sound import play_audio
from basic import get_bool_base_on_conditions

from tqdm import tqdm
# Presumably a local checkout of the laughter-detection repository, expected
# three directories above the working directory -- confirm against the README.
sys.path.append('./../../../laughter-detection/')
sys.path.append('./../../../laughter-detection/utils/')
# `configs.CONFIG_MAP` is read in the laughter-detection cell.
import configs
# import torch_utils

sys.path.append('../pyfiles/')
# `GetLaughs` wraps the laughter detector and `save_audio` writes one channel to
# disk; both are used by the processing cells below.
from dialog import GetLaughs, save_audio

# Scratch WAV path that the processing cells overwrite for each channel.
# NOTE: this plain string shares its name with the stdlib `tempfile` module.
tempfile = "temp.wav"

---
Edit the two variables below to point to your data and output folder:

- `audiopath`: A string containing the path to your two‑channel audio file (e.g. WAV with separate speaker channels).
- `feature_dir`: A string specifying the path of the directory where all preprocessed outputs will be saved.

---

In [3]:
###########################################
########## Adjustable Parameters ##########
###########################################

audiopath = "../audio/sample.wav"
feature_dir = "../audio/features/sample/"

###########################################
###########################################
###########################################

# Run on the GPU when one is available; both models in this notebook use `device`.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device {device}")

print("##########################################")
print("########## Whisper Transciption ##########")
print("##########################################")

##### Model Preparation #####

whisper_size = "turbo"
# os.path.join keeps this correct whether or not `feature_dir` ends with a
# separator; the trailing "" preserves the trailing separator on `whisper_dir`.
whisper_dir = os.path.join(feature_dir, "whisper", "")
model = whisper.load_model(whisper_size, device=device)

##### Get Transcriptions #####
# Strip the extension with splitext rather than slicing off four characters,
# so extensions of any length (".flac", ".mp3", ...) yield the right file stem.
audio_stem = os.path.splitext(os.path.basename(audiopath))[0]
savepath = os.path.join(whisper_dir, audio_stem + ".npy")
os.makedirs(os.path.dirname(savepath), exist_ok=True)

# sr=None keeps the file's native sampling rate; mono=False keeps the channels
# separate, giving an array indexed as a[channel].
a, fs = librosa.load(audiopath, sr=None, mono=False)
if a.ndim != 2 or a.shape[0] < 2:
    # A mono file comes back 1-D, so a[0] would be a single sample, not a channel.
    raise ValueError(
        f"Expected an audio file with at least two channels, "
        f"but {audiopath!r} loaded with shape {a.shape}."
    )

# Transcribe each speaker channel on its own: write the channel to the scratch
# WAV file and hand Whisper that file path.
transcripts = []
for channel_audio in a[:2]:
    save_audio(tempfile, channel_audio, fs)
    transcripts.append(
        whisper.transcribe(
            model,
            tempfile,
            temperature=0.0,
            word_timestamps=True,
            condition_on_previous_text=False,
        )
    )
result1, result2 = transcripts

# The two result objects are stored as a length-2 object array, which NumPy
# pickles; read it back with np.load(savepath, allow_pickle=True).
np.save(savepath, np.array([result1, result2], dtype=object))

Using device cuda
##########################################
########## Whisper Transciption ##########
##########################################


In [4]:
print("########################################")
print("########## Laughter Detection ##########")
print("########################################")

##### Model Preparation #####
# os.path.join keeps this correct whether or not `feature_dir` ends with a
# separator; the trailing "" preserves the trailing separator on `laughter_dir`.
laughter_dir = os.path.join(feature_dir, "laughs", "")
repo_dir = "../../../laughter-detection/"
model_path = repo_dir + "checkpoints/in_use/resnet_with_augmentation"
# Keep the lookup key and the resolved config under separate names instead of
# rebinding `config` from a string to a config object.
config_name = "resnet_with_augmentation"
config = configs.CONFIG_MAP[config_name]
sample_rate = 8000  # This is the default value used in the laughter detection.
laugh_detector = GetLaughs(config, sample_rate, device, model_path)

##### Get Laughter #####
# Strip the extension with splitext rather than slicing off four characters,
# so extensions of any length (".flac", ".mp3", ...) yield the right file stem.
audio_stem = os.path.splitext(os.path.basename(audiopath))[0]
savepath = os.path.join(laughter_dir, audio_stem + ".npy")
os.makedirs(os.path.dirname(savepath), exist_ok=True)

# Reload the audio resampled to the detector's sampling rate, channels kept separate.
a, _ = librosa.load(audiopath, sr=sample_rate, mono=False)
if a.ndim != 2 or a.shape[0] < 2:
    # A mono file comes back 1-D, so a[0] would be a single sample, not a channel.
    raise ValueError(
        f"Expected an audio file with at least two channels, "
        f"but {audiopath!r} loaded with shape {a.shape}."
    )

laughs = []
for channel_audio in a[:2]:
    # The detector is given a file path, so each channel goes through the scratch WAV.
    save_audio(tempfile, channel_audio, sample_rate)
    probs, fps = laugh_detector.get(tempfile)
    # Stored layout per channel: [fps, probs[0], probs[1], ...].
    output = np.concatenate([[fps], probs])
    laughs.append(output)
np.save(savepath, laughs)

########################################
########## Laughter Detection ##########
########################################
training with dropout=0.0
Loading checkpoint at: ../../../laughter-detection/checkpoints/in_use/resnet_with_augmentation/best.pth.tar
Loading checkpoint at step:  60600


In [9]:
# Sanity check: report the shape of each channel's vector in `laughs`
# (the fps value followed by that channel's probabilities).
for channel_index, channel_scores in enumerate(laughs):
    shape = channel_scores.shape
    print(f"laughs[{channel_index}].shape: {shape}")

laughs[0].shape: (5119,)
laughs[1].shape: (5119,)


In [10]:
# Peek at the first ten samples of every channel in `a`, i.e. whichever audio
# array was loaded most recently in this kernel session.
leading_samples = a[:, :10]
print(leading_samples)

[[-0.00024414 -0.00024414  0.00048828  0.00048828  0.00048828 -0.00048828
  -0.00048828  0.00048828  0.00048828 -0.00048828]
 [-0.00048828  0.00048828  0.00341797  0.00146484 -0.00048828  0.00097656
   0.00170898  0.00170898  0.00097656  0.00170898]]
