<a href="https://colab.research.google.com/github/PrswpkWirom/FinalProject_temp/blob/main/DD_data_preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install pydub



In [2]:
import os
import pandas as pd
import wave
import random
from IPython.display import Audio, display
from io import BytesIO
import numpy as np
from transformers import RobertaTokenizer, RobertaModel
import torch
import pickle
from google.colab import files

In [3]:
database_dir = '/content/drive/MyDrive/DAIC_WOZ'

# Dataset Loading

In [4]:
file_path = "/content/drive/MyDrive/DAIC_WOZ/303_P/303_TRANSCRIPT.csv"
df = pd.read_csv(file_path, usecols=[0], header=None)
df_split_test = df[0].str.split('\t', expand=True)

In [5]:
df_split_test.head()

Unnamed: 0,0,1,2,3
0,start_time,stop_time,speaker,value
1,26.276,48.696,Ellie,hi i'm ellie thanks for coming in today i was ...
2,49.256,50.406,Ellie,how are you doing today
3,50.686,51.836,Participant,okay how 'bout yourself
4,52.576,54.136,Ellie,i'm great thanks


In [6]:
#create the dictionary containing the whole dataset, the query is participant id and the value is transcript dataframe, .wav and etc.
#to access the value: data_dictionary['XXX_P']['column']
#column={'transcript_df','audio_data'}

data_dictionary = {}
for folder_name in os.listdir(database_dir):
    folder_path = os.path.join(database_dir, folder_name)

    if os.path.isdir(folder_path):
        data_dictionary[folder_name] = {}
        for file_name in os.listdir(folder_path):
            file_path = os.path.join(folder_path, file_name)

            if file_name.endswith("_TRANSCRIPT.csv"):
                df = pd.read_csv(file_path, usecols=[0], header=None)
                df_split = df[0].str.split('\t', expand=True)
                data_dictionary[folder_name]["transcript_df"] = df_split
                print(f" {folder_name}_df completed")

            elif file_name.endswith(".wav"):
                with wave.open(file_path, "rb") as audio:
                    audio_data = audio.readframes(audio.getnframes())
                print(f" {folder_name}_wav completed")
                data_dictionary[folder_name]["audio_data"] = audio_data

 362_P_wav completed
 362_P_df completed
 357_P_wav completed
 357_P_df completed
 344_P_wav completed
 344_P_df completed
 340_P_wav completed
 340_P_df completed
 338_P_wav completed
 338_P_df completed
 321_P_wav completed
 321_P_df completed
 320_P_wav completed
 320_P_df completed
 319_P_wav completed
 319_P_df completed
 315_P_wav completed
 315_P_df completed
 303_P_wav completed
 303_P_df completed


In [7]:
df_test = data_dictionary["315_P"]["transcript_df"]
df_test.head()

Unnamed: 0,0,1,2,3
0,start_time,stop_time,speaker,value
1,55.305,58.305,Ellie,hi i'm ellie thanks for coming in today
2,56.86,57.14,Participant,alright
3,58.98,62.39,Ellie,i was created to talk to people in a safe and ...
4,63.24,69.605,Ellie,i'm not a therapist but i'm here to learn abou...


# Data Preprocessing

In [8]:
#removing Ellie from the dataframe
for id, data in data_dictionary.items():
    transcript_df = data["transcript_df"]
    transcript_df.columns = ["start_time", "stop_time", "speaker", "value"]
    filtered_df = transcript_df[transcript_df["speaker"] != "Ellie"]
    data_dictionary[id]["transcript_df"] = filtered_df

In [9]:
df_test = data_dictionary["315_P"]["transcript_df"]
df_test.head()

Unnamed: 0,start_time,stop_time,speaker,value
0,start_time,stop_time,speaker,value
2,56.86,57.14,Participant,alright
7,77.36,77.79,Participant,yes
10,82.4,83.09,Participant,okay and you
13,88.735,89.885,Participant,inglewood california


In [10]:
def play_audio_for_id(id):
    # Extract the base id without the "_P" suffix
    base_id = id.split('_')[0]  # This will give "315" from "315_P"

    # Construct the path to the .wav file within the folder
    audio_file_path = os.path.join(database_dir, id, f"{base_id}_AUDIO.wav")

    # Check if the audio file exists
    if os.path.exists(audio_file_path):
        # Display and play the audio
        print(f"Playing audio for id: {id}")
        display(Audio(audio_file_path, autoplay=True))
    else:
        print(f"Audio file not found for id {id}: {audio_file_path}")

# Example: Play audio for a specific id
#play_audio_for_id("315_P")  # Replace with the desired id

In [11]:
from pydub import AudioSegment
session_id = "315_P"
base_id = session_id.split('_')[0]  # Extract base id (e.g., "315")

# Construct the path to the .wav file within the folder
audio_file_path = f"{database_dir}/{session_id}/{base_id}_AUDIO.wav"

# Check if the audio file exists
if not os.path.exists(audio_file_path):
    print(f"Audio file not found for session {session_id}: {audio_file_path}")
else:
    # Load the full audio file
    full_audio = AudioSegment.from_wav(audio_file_path)

    # Assume data_dictionary contains the DataFrame for 315_P
    transcript_df = data_dictionary[session_id]["transcript_df"]
    transcript_df.columns = ["start_time", "stop_time", "speaker", "value"]

    # Initialize list for storing aligned data
    aligned_data = []

    # Iterate over each row in the transcript DataFrame
    for _, row in transcript_df.iterrows():
        try:
            # Convert start and stop times to floats
            start_time = float(row["start_time"])
            stop_time = float(row["stop_time"])
            sentence = row["value"]

            # Convert start and stop times to milliseconds
            start_ms = int(start_time * 1000)
            stop_ms = int(stop_time * 1000)

            # Extract the audio segment for this sentence
            audio_segment = full_audio[start_ms:stop_ms]

            # Append the structured data for this sentence
            aligned_data.append({
                "start_time": start_time,
                "stop_time": stop_time,
                "sentence": sentence,
                "audio_segment": audio_segment
            })

        except ValueError:
            # Skip the row if there is a conversion error
            print(f"Skipping row due to conversion error: {row}")
            continue

    # Store the aligned data in the dictionary
    data_dictionary[session_id]["aligned_data"] = aligned_data
    print(f"{session_id} processing completed")

    # Function to check random samples
    def check_random_samples(num_samples=5):
        aligned_data = data_dictionary[session_id]["aligned_data"]
        num_samples = min(num_samples, len(aligned_data))

        # Randomly select samples
        random_samples = random.sample(aligned_data, num_samples)

        # Display each random sample
        for sample in random_samples:
            sentence = sample["sentence"]
            audio_segment = sample["audio_segment"]

            # Display the sentence
            print(f"Sentence: {sentence}")

            # Convert the audio segment to a playable format and play it
            audio_bytes = BytesIO()
            audio_segment.export(audio_bytes, format="wav")
            audio_bytes.seek(0)  # Move cursor to the beginning

            # Play the audio
            display(Audio(audio_bytes.read(), rate=audio_segment.frame_rate))

    # Check a few random samples to verify
    check_random_samples()

Skipping row due to conversion error: start_time    start_time
stop_time      stop_time
speaker          speaker
value              value
Name: 0, dtype: object
315_P processing completed
Sentence: i'm actually living it um


Sentence: angry upset you know when you're in a argument that's what you feel so


Sentence: uh school went to school grew up together i've been knowing all my friends for pretty much all my life 


Sentence: that


Sentence: and uh that started a future in in athletics and that's what i wanted to do so i accomplished my goals so at the time it was a great a great thing and it still is a great thing it's a great accomplishment in my life  


In [12]:
for id, data in data_dictionary.items():
    # Extract the base id without the "_P" suffix
    base_id = id.split('_')[0]

    # Construct the path to the .wav file within the folder
    audio_file_path = f"{database_dir}/{id}/{base_id}_AUDIO.wav"

    # Check if the audio file exists
    if not os.path.exists(audio_file_path):
        print(f"Audio file not found for id {id}: {audio_file_path}")
        continue  # Skip this entry if the audio file is missing

    # Load the full audio file
    full_audio = AudioSegment.from_wav(audio_file_path)

    # Get the transcript DataFrame and ensure it has the correct columns
    transcript_df = data["transcript_df"]
    transcript_df.columns = ["start_time", "stop_time", "speaker", "value"]

    # Initialize list for storing aligned data
    aligned_data = []

    # Iterate over each row in the transcript DataFrame
    for _, row in transcript_df.iterrows():
        try:
            # Convert start and stop times to floats
            start_time = float(row["start_time"])
            stop_time = float(row["stop_time"])
            sentence = row["value"]

            # Convert start and stop times to milliseconds
            start_ms = int(start_time * 1000)
            stop_ms = int(stop_time * 1000)

            # Extract the audio segment for this sentence
            audio_segment = full_audio[start_ms:stop_ms]

            # Append the structured data for this sentence
            aligned_data.append({
                "start_time": start_time,
                "stop_time": stop_time,
                "sentence": sentence,
                "audio_segment": audio_segment
            })

        except ValueError:
            # Skip the row if there is a conversion error
            print(f"Skipping row due to conversion error: {row}")
            continue

    # Store the aligned data in the dictionary
    data["aligned_data"] = aligned_data
    print(f"{id} processing completed")

Skipping row due to conversion error: start_time    start_time
stop_time      stop_time
speaker          speaker
value              value
Name: 0, dtype: object
362_P processing completed
Skipping row due to conversion error: start_time    start_time
stop_time      stop_time
speaker          speaker
value              value
Name: 0, dtype: object
357_P processing completed
Skipping row due to conversion error: start_time    start_time
stop_time      stop_time
speaker          speaker
value              value
Name: 0, dtype: object
344_P processing completed
Skipping row due to conversion error: start_time    start_time
stop_time      stop_time
speaker          speaker
value              value
Name: 0, dtype: object
340_P processing completed
Skipping row due to conversion error: start_time    start_time
stop_time      stop_time
speaker          speaker
value              value
Name: 0, dtype: object
338_P processing completed
Skipping row due to conversion error: start_time    start_ti

In [13]:
print(data_dictionary["315_P"]["aligned_data"])

[{'start_time': 56.86, 'stop_time': 57.14, 'sentence': 'alright', 'audio_segment': <pydub.audio_segment.AudioSegment object at 0x7b0438a95270>}, {'start_time': 77.36, 'stop_time': 77.79, 'sentence': 'yes', 'audio_segment': <pydub.audio_segment.AudioSegment object at 0x7b042f3293f0>}, {'start_time': 82.4, 'stop_time': 83.09, 'sentence': 'okay and you', 'audio_segment': <pydub.audio_segment.AudioSegment object at 0x7b0438a95e70>}, {'start_time': 88.735, 'stop_time': 89.885, 'sentence': 'inglewood california', 'audio_segment': <pydub.audio_segment.AudioSegment object at 0x7b042f329300>}, {'start_time': 93.675, 'stop_time': 94.045, 'sentence': 'yes', 'audio_segment': <pydub.audio_segment.AudioSegment object at 0x7b0438a975b0>}, {'start_time': 95.425, 'stop_time': 95.755, 'sentence': 'no', 'audio_segment': <pydub.audio_segment.AudioSegment object at 0x7b042f329360>}, {'start_time': 99.12, 'stop_time': 99.77, 'sentence': "uh it's okay", 'audio_segment': <pydub.audio_segment.AudioSegment obje

In [14]:
def check_random_samples(session_id, num_samples=5):
    aligned_data = data_dictionary[session_id]["aligned_data"]
    num_samples = min(num_samples, len(aligned_data))
    random_samples = random.sample(aligned_data, num_samples)

    for sample in random_samples:
        sentence = sample["sentence"]
        audio_segment = sample["audio_segment"]
        print(f"Sentence: {sentence}")
        audio_bytes = BytesIO()
        audio_segment.export(audio_bytes, format="wav")
        audio_bytes.seek(0)  # Move cursor to the beginning

        display(Audio(audio_bytes.read(), rate=audio_segment.frame_rate))

In [15]:
check_random_samples("362_P")
#time stamp บางอันมีปัญหา ex. 321,362

Sentence: yes 


Sentence: i'm very good at controlling my temper 


Sentence: use the tools that have been given to me in therapy um


Sentence: everything that being a parent has made me a better person it's made me a better everything a better employee um i've


Sentence: a productive member of society <s> of society without therapy 


In [16]:
#delete the things from memory
for id, data in data_dictionary.items():
    if "transcript_df" in data:
        del data["transcript_df"]
    if "audio_data" in data:
        del data["audio_data"]
    print(f"{id} deleted")

362_P deleted
357_P deleted
344_P deleted
340_P deleted
338_P deleted
321_P deleted
320_P deleted
319_P deleted
315_P deleted
303_P deleted


# Embedding generation

In [17]:
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
model = RobertaModel.from_pretrained("roberta-base")

def get_text_embedding(sentence):
    inputs = tokenizer(sentence, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)
    word_embeddings = outputs.last_hidden_state.squeeze(0)  # Shape: (num_words, embedding_dim)
    return word_embeddings  #this is word level!! = matrix of embedding representing each word


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [18]:
def get_sentence_level_text_embedding(sentence):
    inputs = tokenizer(sentence, return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():
        outputs = model(**inputs)
    sentence_embedding = outputs.last_hidden_state[:, 0, :]  # Shape: (1, embedding_dim)
    return sentence_embedding.squeeze(0)  # Remove batch dimension for easier use

In [19]:
from transformers import Wav2Vec2FeatureExtractor, WavLMModel
import torch
import numpy as np

# Use Wav2Vec2FeatureExtractor instead of Wav2Vec2Processor
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("microsoft/wavlm-base")
wavlm_model = WavLMModel.from_pretrained("microsoft/wavlm-base")

def get_audio_embedding(audio_segment):
    # Convert AudioSegment to numpy array and ensure it has int16 format
    audio_data = np.array(audio_segment.get_array_of_samples()).astype(np.int16)

    # Convert audio data to a tensor and specify the dtype as float32
    audio_data_tensor = torch.tensor(audio_data, dtype=torch.float32)

    # Preprocess audio for WavLM using the feature extractor
    inputs = feature_extractor(
        audio_data_tensor,
        sampling_rate=audio_segment.frame_rate,
        return_tensors="pt",
        padding=True
    )

    with torch.no_grad():
        outputs = wavlm_model(**inputs)

    # Get audio embeddings (using the last hidden state)
    audio_embeddings = outputs.last_hidden_state.squeeze(0)  # Shape: (num_frames, embedding_dim)

    # Aggregate the embeddings over time to get a single vector
    audio_embedding_vector = audio_embeddings.mean(dim=0)  # Shape: (embedding_dim,)

    return audio_embedding_vector



In [20]:
for id, data in data_dictionary.items():
    aligned_data = data["aligned_data"]

    for entry in aligned_data:
        text_embedding = get_sentence_level_text_embedding(entry["sentence"])
        audio_embedding = get_audio_embedding(entry["audio_segment"])
        entry["text_embedding"] = text_embedding
        entry["audio_embedding"] = audio_embedding
    print(f"{id} completed")
    #cpu 3 mins/session, L4 15 s/session




362_P completed
357_P completed
344_P completed
340_P completed
338_P completed
321_P completed
320_P completed
319_P completed
315_P completed
303_P completed


In [21]:
def pad_sequence(embeddings, max_len):
    padded_embeddings = []
    for emb in embeddings:
        if emb.size(0)<max_len:
            padding = torch.zeros(max_len-emb.size(0), emb.size(1))
            emb = torch.cat([emb, padding], dim=0)
        padded_embeddings.append(emb)
    return torch.stack(padded_embeddings)

In [22]:
# Initialize a dictionary to store the sequence of embeddings for each session
sequences_dict = {}

for session_id, session_data in data_dictionary.items():
    # Prepare lists to gather text and audio embeddings for each timestamp within the session
    text_embeddings = []
    audio_embeddings = []

    # Collect embeddings in the order of their appearance in the session
    for entry in session_data["aligned_data"]:
        text_embeddings.append(entry["text_embedding"])
        audio_embeddings.append(entry["audio_embedding"])

    # Find the maximum length in the session to pad all embeddings to the same length
    max_len_text = max(emb.size(0) for emb in text_embeddings)
    max_len_audio = max(emb.size(0) for emb in audio_embeddings)

    # Pad the embeddings and stack them into tensors (sequence_length, max_embedding_dim)
    text_sequence = pad_sequence(text_embeddings, max_len_text)
    audio_sequence = pad_sequence(audio_embeddings, max_len_audio)

    # Store the tensors in the dictionary, with one tensor per modality for each session
    sequences_dict[session_id] = {
        "text_sequence": text_sequence,
        "audio_sequence": audio_sequence
    }

In [24]:
print(sequences_dict["315_P"]["text_sequence"].shape)
print(sequences_dict["315_P"]["audio_sequence"].shape)

torch.Size([135, 768])
torch.Size([135, 768])


In [23]:
pickle_filename = "data_dictionary.pkl"

with open(pickle_filename, "wb") as f:
    pickle.dump(data_dictionary, f)
files.download(pickle_filename)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>