In [None]:
'''
#In this file, 
Go through the  df_iemocap_recognition.csv file, for every row, extract following data , store the data in a csv file.:

"audio_file": audio file name,
"audio_path": audio file full path,
"transcription": current text,
"audio_features": features extract from audio  by Robert
"input_ids" and "attention_mask" : features extract from text by Robert
"label": emotion label of current sentence


TODO:
1. use future emotion Label as for prediction
2. add context (3 sentences), use current sentence and previous 2 sentences for prediction
3. change to store in .pkl file

'''

import os
import soundfile as sf
import torch
from transformers import RobertaTokenizer, HubertModel, HubertConfig
import pandas as pd
import numpy as np
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# --- Configuration ---
IEMOCAP_DIR = "dataset/IEMOCAP_full_release"  
SESSIONS = ["Session1", "Session2", "Session3", "Session4", "Session5"]
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
OUTPUT_CSV_DIR = "data/preprocess/"
TRANSCRIPT_CSV = "data/df_iemocap_recognition.csv"  

# --- Models and Tokenizers ---
roberta_tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
hubert_config = HubertConfig.from_pretrained("facebook/hubert-base-ls960")
hubert_model = HubertModel.from_pretrained("facebook/hubert-base-ls960").to(DEVICE)
hubert_model.eval()

HubertModel(
  (feature_extractor): HubertFeatureEncoder(
    (conv_layers): ModuleList(
      (0): HubertGroupNormConvLayer(
        (conv): Conv1d(1, 512, kernel_size=(10,), stride=(5,), bias=False)
        (activation): GELUActivation()
        (layer_norm): GroupNorm(512, 512, eps=1e-05, affine=True)
      )
      (1-4): 4 x HubertNoLayerNormConvLayer(
        (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=False)
        (activation): GELUActivation()
      )
      (5-6): 2 x HubertNoLayerNormConvLayer(
        (conv): Conv1d(512, 512, kernel_size=(2,), stride=(2,), bias=False)
        (activation): GELUActivation()
      )
    )
  )
  (feature_projection): HubertFeatureProjection(
    (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
    (projection): Linear(in_features=512, out_features=768, bias=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): HubertEncoder(
    (pos_conv_embed): HubertPositionalConvEmbedding(
      (conv): Para

In [4]:
# use Hubert to extract audio feature
def extract_audio_features(audio_path):
    """Extract HuBERT features from audio."""
    audio, sr = sf.read(audio_path)
    audio_tensor = torch.tensor(audio).unsqueeze(0).to(DEVICE).float()
    with torch.no_grad():
        outputs = hubert_model(audio_tensor)
    return outputs.last_hidden_state.squeeze(0).cpu().numpy()

#  get transcript for audio file
def get_transcription(audio_file_path, transcript_df):
    """Get transcription from the provided CSV DataFrame."""
    audio_filename = os.path.basename(audio_file_path)
    print (audio_filename)
    if audio_filename in transcript_df['wav_file'].values:
        return transcript_df.loc[transcript_df['wav_file'] == audio_filename, 'script'].iloc[0]
    else:
        print(f"Warning: Transcription not found for {audio_filename}")
        return "Transcription not found."  # Or handle missing transcript as needed

In [None]:
def load_preprocessed_data(csv_dir):
    """Load preprocessed data from CSV files."""
    all_data = []
  
    for file in os.listdir(csv_dir):
        if file.endswith(".csv"):
            csv_path = os.path.join(csv_dir, file)
            df = pd.read_csv(csv_path)
            for _, row in df.iterrows():
                all_data.append({
                    "audio_file": row["audio_file"],
                    "audio_features": np.array(eval(row["audio_features"])), #convert string back to list, then to numpy array.                    
                    "input_ids": np.array(eval(row["input_ids"])),
                    "attention_mask": np.array(eval(row["attention_mask"])),
                    "transcription": row["transcription"],
                    "audio_path": row["audio_path"]
                })
    return all_data

In [None]:
# test on one file
def test_preprocess_iemocap(df_script, output_dir):
    """Preprocess data using audio filenames from the transcript CSV."""
    os.makedirs(output_dir, exist_ok=True)
    data = []
    
    row = df_script.iloc[0]
    audio_filename = row['wav_file']
    audio_subpath = audio_filename[:audio_filename.rfind("_")]
    audio_path = IEMOCAP_DIR +"/Session" + audio_filename[4]+ "/sentences/wav/"+ audio_subpath + "/"+ audio_filename+".wav"
    #audio_path = os.path.join(IEMOCAP_DIR, "Session" + audio_filename[3], "sentences", "wav", audio_filename)  # Reconstruct audio path.
        
    if os.path.exists(audio_path):       
        transcription = get_transcription(audio_filename, df_script)       
        audio_features = extract_audio_features(audio_path)
        #print("audio_features",audio_features)        
        print(f"Shape of audio_features: {audio_features.shape}")
        print(f"Data type of audio_features: {audio_features.dtype}")       
        flattened_list=audio_features.flatten().tolist()
        print(f"First 10 elements of flattened audio features: {flattened_list[:10]}")       
        text_tokens = roberta_tokenizer(transcription, padding=True, truncation=True, return_tensors="pt")        
        text_tokens = {key: value.squeeze(0).cpu().numpy() for key, value in text_tokens.items()}        
        #label = row['emotion'] if 'label' in df_script.columns else None
        label = row['emotion']         
        
        data.append({
            "audio_file": audio_filename,
            "audio_features": flattened_list,        
            "input_ids": text_tokens["input_ids"].flatten().tolist(),
            "attention_mask": text_tokens["attention_mask"].flatten().tolist(),
            "transcription": transcription,
            "audio_path": audio_path,
            "label": label
        })
        
        #print("data",data)   can't print because audio_features list is too big

    df = pd.DataFrame(data)
    csv_path = os.path.join(output_dir, "preprocessed_data.csv")  
    df.to_csv(csv_path, index=False)
    print(f"Data preprocessed and saved to {csv_path}")
    return


df_script = pd.read_csv(TRANSCRIPT_CSV)  # Load the transcript CSV
test_preprocess_iemocap(df_script, OUTPUT_CSV_DIR)
loaded_data = load_preprocessed_data(OUTPUT_CSV_DIR)

if loaded_data:
    example_data = loaded_data[0]
    print("Example Loaded Data:")     
    flattened_list =example_data['audio_features']
    original_shape=(97, 768)    
    reshaped_array =np.array(flattened_list).reshape(original_shape)
    print(f"Audio Features Shape: {reshaped_array.shape}")  
    print(f"Input IDs Shape: {example_data['input_ids'].shape}")
    print(f"Transcription: {example_data['transcription']}")
    print(f"Audio Path: {example_data['audio_path']}")
else:
    print("No preprocessed data loaded.")


Example Loaded Data:
Audio Features Shape: (97, 768)
Input IDs Shape: (6,)
Transcription: Excuse me.
Audio Path: dataset/IEMOCAP_full_release/Session1/sentences/wav/Ses01F_impro01/Ses01F_impro01_F000.wav


In [None]:
# preprocess all file
def preprocess_iemocap(df_script, output_dir):
    """Preprocess data using audio filenames from the transcript CSV."""
    os.makedirs(output_dir, exist_ok=True)
    data = []  
    
    for _, row in tqdm(df_script.iterrows()):
        # get audio filename and audio file full path
        audio_filename = row['wav_file']
        audio_subpath = audio_filename[:audio_filename.rfind("_")]
        audio_path = IEMOCAP_DIR +"/Session" + audio_filename[4]+ "/sentences/wav/"+ audio_subpath + "/"+ audio_filename+".wav"
        #audio_path = os.path.join(IEMOCAP_DIR, "Session" + audio_filename[3], "sentences", "wav", audio_filename)  # Reconstruct audio path.
        
        
        if os.path.exists(audio_path):
            print("audio_path:",audio_path)
            transcription = get_transcription(audio_filename, df_script)
            audio_features = extract_audio_features(audio_path)
            text_tokens = roberta_tokenizer(transcription, padding=True, truncation=True, return_tensors="pt")
            text_tokens = {key: value.squeeze(0).cpu().numpy() for key, value in text_tokens.items()}
            #label = row['emotion'] if 'label' in df_script.columns else None
            label = row['emotion'] 
            print("label",label)

            data.append({
                "audio_file": audio_filename,
                "audio_features": audio_features.flatten().tolist(),
                "input_ids": text_tokens["input_ids"].flatten().tolist(),
                "attention_mask": text_tokens["attention_mask"].flatten().tolist(),
                "transcription": transcription,
                "audio_path": audio_path,
                "label": label
            })            
        else:
            print(f"Warning: Audio file not found at {audio_path}")

    df = pd.DataFrame(data)
    csv_path = os.path.join(output_dir, "preprocessed_data.csv")  # Single CSV for all
    df.to_csv(csv_path, index=False)
    print(f"Data preprocessed and saved to {csv_path}")

In [None]:

df_script = pd.read_csv(TRANSCRIPT_CSV)  # Load the transcript CSV
preprocess_iemocap(df_script, OUTPUT_CSV_DIR)
loaded_data = load_preprocessed_data(OUTPUT_CSV_DIR)

# Example: Accessing loaded data
if loaded_data:
    example_data = loaded_data[0]
    print("Example Loaded Data:")
    print(f"Audio Features Shape: {example_data['audio_features'].shape}")
    print(f"Input IDs Shape: {example_data['input_ids'].shape}")
    print(f"Transcription: {example_data['transcription']}")
    print(f"Audio Path: {example_data['audio_path']}")
else:
    print("No preprocessed data loaded.")


0it [00:00, ?it/s]

audio_path: dataset/IEMOCAP_full_release/Session1/sentences/wav/Ses01F_impro01/Ses01F_impro01_F000.wav
Ses01F_impro01_F000


1it [00:00,  3.16it/s]

label neu
audio_path: dataset/IEMOCAP_full_release/Session1/sentences/wav/Ses01F_impro01/Ses01F_impro01_M000.wav
Ses01F_impro01_M000


2it [00:00,  2.74it/s]

label fru





Data preprocessed and saved to data/preprocess/preprocessed_data.csv
Example Loaded Data:
Audio Features Shape: (74496,)
Input IDs Shape: (6,)
Transcription: Excuse me.
Audio Path: dataset/IEMOCAP_full_release/Session1/sentences/wav/Ses01F_impro01/Ses01F_impro01_F000.wav
