In [2]:
'''
Go through the df_iemocap.csv file, for every row, use roberta ,hubert and librosa to extract features, store in .pkl file.:
Use sliding windows, create sequence data, contain current sentence and history (past two sentence)
in order to deal with history, use the index column.
    if current index >=3, then HISTORY_LENGTH = 2, use past two sentences .
    if current index <3, then history num = (index-1), 
    index=1, means current sentence is the first sentence, HISTORY_LENGTH=0, 
    index =2, means current sentence is the second sentence, HISTORY_LENGTH=1

'''
# --- Imoort libraries ---
import os
import soundfile as sf
import pandas as pd
import torch
from transformers import RobertaTokenizer, HubertModel, HubertConfig
import numpy as np
from tqdm import tqdm
import pickle

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# --- Configuration ---
IEMOCAP_DIR = "../../dataset/IEMOCAP_full_release"  
SESSIONS = ["Session1", "Session2", "Session3", "Session4", "Session5"]
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#OUTPUT_CSV_DIR = "data/preprocess/"
OUTPUT_PKL_DIR = "../../data"
OUTPUT_PKL_FILENAME = "iemocap_preprocessed_data.pkl"
TRANSCRIPT_CSV = "../../data/df_iemocap.csv"  
HISTORY_LENGTH = 3  # Number of previous utterances to consider

In [4]:
# --- Models and Tokenizers ---
roberta_tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
hubert_config = HubertConfig.from_pretrained("facebook/hubert-base-ls960")
hubert_model = HubertModel.from_pretrained("facebook/hubert-base-ls960").to(DEVICE)
hubert_model.eval()

HubertModel(
  (feature_extractor): HubertFeatureEncoder(
    (conv_layers): ModuleList(
      (0): HubertGroupNormConvLayer(
        (conv): Conv1d(1, 512, kernel_size=(10,), stride=(5,), bias=False)
        (activation): GELUActivation()
        (layer_norm): GroupNorm(512, 512, eps=1e-05, affine=True)
      )
      (1-4): 4 x HubertNoLayerNormConvLayer(
        (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=False)
        (activation): GELUActivation()
      )
      (5-6): 2 x HubertNoLayerNormConvLayer(
        (conv): Conv1d(512, 512, kernel_size=(2,), stride=(2,), bias=False)
        (activation): GELUActivation()
      )
    )
  )
  (feature_projection): HubertFeatureProjection(
    (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
    (projection): Linear(in_features=512, out_features=768, bias=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): HubertEncoder(
    (pos_conv_embed): HubertPositionalConvEmbedding(
      (conv): Para

In [5]:
# use Hubert to extract audio feature
def extract_audio_features(audio_path):
    """Extract HuBERT features from audio."""
    audio, sr = sf.read(audio_path)
    audio_tensor = torch.tensor(audio).unsqueeze(0).to(DEVICE).float()
    with torch.no_grad():
        outputs = hubert_model(audio_tensor)
    return outputs.last_hidden_state.squeeze(0).cpu().numpy()

#  get transcript for audio file
def get_transcription(audio_file_path, df_iemocap):
    """Get transcription from the provided CSV DataFrame."""
    audio_filename = os.path.basename(audio_file_path)
    print (audio_filename)
    if audio_filename in df_iemocap['wav_file'].values:
        return df_iemocap.loc[df_iemocap['wav_file'] == audio_filename, 'script'].iloc[0]
    else:
        print(f"Warning: Transcription not found for {audio_filename}")
        return "Transcription not found."  # Or handle missing transcript as needed

In [6]:
def load_preprocessed_data(pkl_dir):
    """Load preprocessed data from .pkl file."""
    pkl_path = os.path.join(pkl_dir, OUTPUT_PKL_FILENAME )
    if os.path.exists(pkl_path):
        with open(pkl_path, 'rb') as f:
            data = pickle.load(f)
        return data
    else:
        return []

In [10]:
def create_sequence_pairs(data):
    sequence_data = []
    print("Create sequence data...")   
    for i in range(len(data) ):
        current_data = data[i]   
        print(current_data["index"])
        history = []

        # first sentence no history, second sentence only has one history, third or later sentence has 2 history
        if current_data["index"]<3:
            history_length = current_data["index"]-1
        else:
            history_length = HISTORY_LENGTH      

        for j in range(max(0, i - history_length), i):
            history.append({
                "input_ids": data[j]["input_ids"],
                "attention_mask": data[j]["attention_mask"],
                "hubert_features": data[j]["hubert_features"],               
                "label": data[j]["label"],              
                "next_label": data[j]["next_label"]
            })
        
        print("hubert_features in sequence data",current_data["hubert_features"])
        sequence_data.append({
            "history": history,
            "current_hubert": current_data["hubert_features"],         
            "current_text": current_data["transcription"],
            "current_label": current_data["label"],
            "current_next_label": current_data["next_label"],
        })

    return sequence_data

 

In [11]:
# test on 1 rows
def test_preprocess_iemocap(df_iemocap, output_dir):
    """Preprocess data using audio filenames from the transcript CSV."""
    os.makedirs(output_dir, exist_ok=True)
    data = []
    
    row = df_iemocap.iloc[0]
    audio_filename = row['wav_file']
    audio_subpath = audio_filename[:audio_filename.rfind("_")]
    audio_path = IEMOCAP_DIR +"/Session" + audio_filename[4]+ "/sentences/wav/"+ audio_subpath + "/"+ audio_filename+".wav"
    #audio_path = os.path.join(IEMOCAP_DIR, "Session" + audio_filename[3], "sentences", "wav", audio_filename)  # Reconstruct audio path.
        
    if os.path.exists(audio_path):       
       
        transcription = get_transcription(audio_filename, df_iemocap)
        hubert_features  = extract_audio_features(audio_path)     

        text_tokens = roberta_tokenizer(transcription, padding=True, truncation=True, return_tensors="pt")
        text_tokens = {key: value.squeeze(0).cpu().numpy() for key, value in text_tokens.items()}       
        label = row['label'] 
        index = row['index']
        next_label = row['next_label'] 
        print("label",label)

        data.append({
            "audio_file": audio_filename,
            "hubert_features": hubert_features,           
            "input_ids": text_tokens["input_ids"],
            "attention_mask": text_tokens["attention_mask"],
            "transcription": transcription,
            "audio_path": audio_path,
            "label": label,    
            "next_label": next_label,   
            "index": index       
        })            
        
        
    
    sequence_data = create_sequence_pairs(data)
    
    pkl_path = os.path.join(output_dir, "iemocap_preprocessed_data.pkl")
    with open(pkl_path, 'wb') as f:
        pickle.dump(sequence_data, f)
    print(f"Data preprocessed and saved to {pkl_path}")
    return


df_iemocap = pd.read_csv(TRANSCRIPT_CSV)  # Load the transcript CSV
#df_iemocap.tail()
test_preprocess_iemocap(df_iemocap, OUTPUT_PKL_DIR)
loaded_data = load_preprocessed_data(OUTPUT_PKL_DIR)

if loaded_data:
    example_data = loaded_data[0]
    print("Example Loaded Data:")
    print(f"History Length: {len(example_data['history'])}")
    print(f"Current Hubert Shape: {example_data['current_hubert'].shape}")
    print(f"Current Text Shape: {example_data['current_text']}")
    print(f"Current Label: {example_data['current_label']}")
    print(f"Current Next Label: {example_data['current_next_label']}")
else:
    print("No preprocessed data loaded.")


Ses01F_impro01_F000
label neu
Create sequence data...
1
hubert_features in sequence data [[ 0.11579464  0.11709942  0.28083935 ... -0.2782844  -0.06476727
   0.21543494]
 [ 0.16728824  0.15846454  0.28208348 ... -0.3612778  -0.09611124
   0.04330719]
 [ 0.18510109  0.12920734  0.27065846 ... -0.31461972 -0.0056014
   0.06006904]
 ...
 [-0.00815474  0.4321724   0.2144017  ... -0.04938321  0.08605168
   0.13589515]
 [-0.02400946  0.4112573   0.21331745 ... -0.10015789  0.09740229
   0.09691934]
 [ 0.0906674   0.31126025  0.20738426 ... -0.15112437  0.04656437
   0.08059184]]
Data preprocessed and saved to ../../data\iemocap_preprocessed_data.pkl
Example Loaded Data:
History Length: 0
Current Hubert Shape: (97, 768)
Current Text Shape: Excuse me.
Current Label: neu
Current Next Label: fru


In [12]:
# preprocess all file
def preprocess_iemocap(df_iemocap, output_dir):
    """Preprocess data using audio filenames from the transcript CSV."""
    os.makedirs(output_dir, exist_ok=True)
    data = []  
  
    for _, row in tqdm(df_iemocap.iterrows()):
        # get audio filename and audio file full path
        audio_filename = row['wav_file']
        audio_subpath = audio_filename[:audio_filename.rfind("_")]
        audio_path = IEMOCAP_DIR +"/Session" + audio_filename[4]+ "/sentences/wav/"+ audio_subpath + "/"+ audio_filename+".wav"
        #audio_path = os.path.join(IEMOCAP_DIR, "Session" + audio_filename[3], "sentences", "wav", audio_filename)  # Reconstruct audio path.
  
        if os.path.exists(audio_path):

            print("audio_path:",audio_path)
            transcription = get_transcription(audio_filename, df_iemocap)
            hubert_features  = extract_audio_features(audio_path)  #TODO: change to WavLM            

            text_tokens = roberta_tokenizer(transcription, padding=True, truncation=True, return_tensors="pt")
            text_tokens = {key: value.squeeze(0).cpu().numpy() for key, value in text_tokens.items()}            
            label = row['label'] 
            index = row['index']
            next_label = row['next_label'] 
            #print("label",label)

            data.append({
                "audio_file": audio_filename,
                "hubert_features": hubert_features,               
                "input_ids": text_tokens["input_ids"],
                "attention_mask": text_tokens["attention_mask"],
                "transcription": transcription,
                "audio_path": audio_path,
                "label": label,
                "next_label": next_label,   
                "index": index       
            })            
        else:
            print(f"Warning: Audio file not found at {audio_path}")

    sequence_data = create_sequence_pairs(data)

    pkl_path = os.path.join(output_dir, "preprocessed_data.pkl")
    with open(pkl_path, 'wb') as f:
        pickle.dump(sequence_data, f)
    print(f"Data preprocessed and saved to {pkl_path}")

In [13]:
df_iemocap = pd.read_csv(TRANSCRIPT_CSV)
preprocess_iemocap(df_iemocap, OUTPUT_PKL_DIR)
loaded_data = load_preprocessed_data(OUTPUT_PKL_DIR)

if loaded_data:
    example_data = loaded_data[0]
    print("Example Loaded Data:")
    print(f"History Length: {len(example_data['history'])}")
    print(f"Current Hubert Shape: {example_data['current_hubert'].shape}")
    print(f"Current Text Shape: {example_data['current_text']}")
    print(f"Current Label: {example_data['current_label']}")
    print(f"Current Next Label: {example_data['current_next_label']}")
else:
    print("No preprocessed data loaded.")

0it [00:00, ?it/s]

audio_path: ../../dataset/IEMOCAP_full_release/Session1/sentences/wav/Ses01F_impro01/Ses01F_impro01_F000.wav
Ses01F_impro01_F000


1it [00:00,  3.26it/s]

audio_path: ../../dataset/IEMOCAP_full_release/Session1/sentences/wav/Ses01F_impro01/Ses01F_impro01_M000.wav
Ses01F_impro01_M000


2it [00:00,  2.66it/s]

audio_path: ../../dataset/IEMOCAP_full_release/Session1/sentences/wav/Ses01F_impro01/Ses01F_impro01_F001.wav
Ses01F_impro01_F001


3it [00:01,  2.94it/s]

audio_path: ../../dataset/IEMOCAP_full_release/Session1/sentences/wav/Ses01F_impro01/Ses01F_impro01_M001.wav
Ses01F_impro01_M001


4it [00:01,  2.18it/s]

audio_path: ../../dataset/IEMOCAP_full_release/Session1/sentences/wav/Ses01F_impro01/Ses01F_impro01_F002.wav
Ses01F_impro01_F002


5it [00:02,  2.11it/s]

audio_path: ../../dataset/IEMOCAP_full_release/Session1/sentences/wav/Ses01F_impro01/Ses01F_impro01_M002.wav
Ses01F_impro01_M002


6it [00:02,  1.89it/s]

audio_path: ../../dataset/IEMOCAP_full_release/Session1/sentences/wav/Ses01F_impro01/Ses01F_impro01_F003.wav
Ses01F_impro01_F003


7it [00:03,  2.23it/s]

audio_path: ../../dataset/IEMOCAP_full_release/Session1/sentences/wav/Ses01F_impro01/Ses01F_impro01_F004.wav
Ses01F_impro01_F004


8it [00:03,  2.17it/s]

audio_path: ../../dataset/IEMOCAP_full_release/Session1/sentences/wav/Ses01F_impro01/Ses01F_impro01_M003.wav
Ses01F_impro01_M003


9it [00:04,  1.93it/s]

audio_path: ../../dataset/IEMOCAP_full_release/Session1/sentences/wav/Ses01F_impro01/Ses01F_impro01_F005.wav
Ses01F_impro01_F005


10it [00:04,  1.84it/s]

audio_path: ../../dataset/IEMOCAP_full_release/Session1/sentences/wav/Ses01F_impro01/Ses01F_impro01_M004.wav
Ses01F_impro01_M004


11it [00:05,  1.86it/s]

audio_path: ../../dataset/IEMOCAP_full_release/Session1/sentences/wav/Ses01F_impro01/Ses01F_impro01_M005.wav
Ses01F_impro01_M005


12it [00:06,  1.19it/s]

audio_path: ../../dataset/IEMOCAP_full_release/Session1/sentences/wav/Ses01F_impro01/Ses01F_impro01_F006.wav
Ses01F_impro01_F006


13it [00:07,  1.20it/s]

audio_path: ../../dataset/IEMOCAP_full_release/Session1/sentences/wav/Ses01F_impro01/Ses01F_impro01_M006.wav
Ses01F_impro01_M006


13it [00:08,  1.56it/s]


KeyboardInterrupt: 