## Setup//Environment//Libraries

In [1]:
# If running in Colab, install necessary libraries
!pip install transformers tqdm scikit-learn




In [2]:
from google.colab import drive
drive.mount('/content/drive')

import os

data_path = '/content/drive/MyDrive/IEMOCAP_full_release/IEMOCAP_full_release'  # Adjust if needed


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Unzip the file from Drive to Colab's local storage
!unzip -q '/content/drive/MyDrive/IEMOCAP_full_release.zip' -d '/content/IEMOCAP_full_release'

In [4]:
!rm -rf /content/IEMOCAP_full_release/__MACOSX
!ls /content/IEMOCAP_full_release/IEMOCAP_full_release

Documentation  Session1  Session2  Session3  Session4  Session5


In [5]:
# Audio processing
!pip install librosa soundfile

# Video processing
!pip install opencv-python moviepy

# Machine learning and deep learning
!pip install torch torchvision torchaudio transformers scikit-learn

# Data handling and visualization
!pip install pandas numpy matplotlib seaborn tqdm

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [6]:
import numpy as np
import pandas as pd
import librosa
import soundfile as sf
import cv2
import moviepy.editor as mpy
import torch
import torchvision
import torchaudio
import transformers
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

print("All libraries imported successfully!")


  if event.key is 'enter':



All libraries imported successfully!


## Text Preprocessing and Label Extaction

In [24]:
data_path = '/content/IEMOCAP_full_release/IEMOCAP_full_release'

In [25]:
import re

# Extract utterances and texts
transcripts = []
for session in [f'Session{i}' for i in range(1, 6)]:
    trans_dir = os.path.join(data_path, session, 'dialog', 'transcriptions')
    for file in os.listdir(trans_dir):
        if file.endswith('.txt'):
            with open(os.path.join(trans_dir, file), 'r', encoding='utf-8') as f:
                for line in f:
                    parts = line.strip().split(':', 1)
                    if len(parts) == 2:
                        utt_id, text = parts
                        transcripts.append({'utterance_id': utt_id.strip(), 'text': text.strip()})

# Extract emotion labels, merge 'exc' into 'hap'
valid_emotions = {'ang', 'hap', 'sad', 'neu', 'exc'}
label_dict = {}
for session in [f'Session{i}' for i in range(1, 6)]:
    emo_eval_dir = os.path.join(data_path, session, 'dialog', 'EmoEvaluation')
    for file in os.listdir(emo_eval_dir):
        if file.endswith('.txt'):
            with open(os.path.join(emo_eval_dir, file), 'r') as f:
                for line in f:
                    match = re.match(r'\[.*?\]\s+(\S+)\s+(\w+)', line)
                    if match:
                        utt_id, emotion = match.groups()
                        if emotion in valid_emotions:
                            if emotion == 'exc':
                                emotion = 'hap'
                            label_dict[utt_id] = emotion


In [26]:
texts, utt_ids, y = [], [], []
for item in transcripts:
    utt_id = item['utterance_id'].split()[0]
    if utt_id in label_dict:
        texts.append(item['text'])
        utt_ids.append(utt_id)
        y.append(label_dict[utt_id])
print(f"Number of labeled utterances: {len(y)}")


Number of labeled utterances: 5531


BERT EMBEDDINGS

A. [CLS] Token Embeddings (Batch Processing)

In [27]:
import torch
from transformers import BertTokenizer, BertModel
import numpy as np
from tqdm import tqdm

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')
model.eval()

batch_size = 32
cls_embeddings = []

with torch.no_grad():
    for i in tqdm(range(0, len(texts), batch_size)):
        batch_texts = texts[i:i+batch_size]
        inputs = tokenizer(batch_texts, return_tensors='pt', padding=True, truncation=True, max_length=128)
        outputs = model(**inputs)
        cls_emb = outputs.last_hidden_state[:, 0, :].cpu().numpy()
        cls_embeddings.append(cls_emb)

cls_embeddings = np.vstack(cls_embeddings)
np.save('iemocap_text_cls_embeddings.npy', cls_embeddings)


100%|██████████| 173/173 [02:23<00:00,  1.21it/s]


B. Mean-Pooled Embeddings (Batch Processing, Improved)

In [28]:
mean_embeddings = []

with torch.no_grad():
    for i in tqdm(range(0, len(texts), batch_size)):
        batch_texts = texts[i:i+batch_size]
        inputs = tokenizer(batch_texts, return_tensors='pt', padding=True, truncation=True, max_length=128)
        outputs = model(**inputs)
        last_hidden = outputs.last_hidden_state
        attention_mask = inputs['attention_mask']
        mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden.size()).float()
        sum_embeddings = torch.sum(last_hidden * mask_expanded, 1)
        sum_mask = mask_expanded.sum(1)
        mean_pooled = sum_embeddings / sum_mask
        mean_embeddings.append(mean_pooled.cpu().numpy())

mean_embeddings = np.vstack(mean_embeddings)
np.save('iemocap_text_mean_embeddings.npy', mean_embeddings)


100%|██████████| 173/173 [02:19<00:00,  1.24it/s]


In [29]:
 from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
y_encoded = le.fit_transform(y)
print("Emotion classes:", le.classes_)


Emotion classes: ['ang' 'hap' 'neu' 'sad']


In [30]:
from sklearn.model_selection import train_test_split

# Choose which embeddings to use: cls_embeddings or mean_embeddings
X = np.load('iemocap_text_mean_embeddings.npy')  # or 'iemocap_text_cls_embeddings.npy'

X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
)


In [31]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report

param_grid = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'rbf'],
    'class_weight': [None, 'balanced']
}

svc = SVC(probability=True, random_state=42)
grid = GridSearchCV(svc, param_grid, cv=3, scoring='f1_weighted', n_jobs=-1)
grid.fit(X_train, y_train)

print("Best parameters:", grid.best_params_)
print("Best CV score:", grid.best_score_)

y_pred = grid.predict(X_test)
print("Test Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred, target_names=le.classes_))


Best parameters: {'C': 10, 'class_weight': None, 'kernel': 'rbf'}
Best CV score: 0.6888840449818439
Test Accuracy: 0.7064137308039747
Classification Report:
               precision    recall  f1-score   support

         ang       0.74      0.75      0.75       221
         hap       0.75      0.72      0.74       327
         neu       0.64      0.70      0.67       342
         sad       0.73      0.64      0.68       217

    accuracy                           0.71      1107
   macro avg       0.71      0.70      0.71      1107
weighted avg       0.71      0.71      0.71      1107



In [32]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier

# Logistic Regression
lr = LogisticRegression(max_iter=1000, random_state=42, class_weight='balanced')
lr.fit(X_train, y_train)
print("Logistic Regression Accuracy:", accuracy_score(y_test, lr.predict(X_test)))

# Random Forest
rf = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')
rf.fit(X_train, y_train)
print("Random Forest Accuracy:", accuracy_score(y_test, rf.predict(X_test)))

# MLP (Neural Network)
mlp = MLPClassifier(hidden_layer_sizes=(256, 128), max_iter=300, random_state=42)
mlp.fit(X_train, y_train)
print("MLP Accuracy:", accuracy_score(y_test, mlp.predict(X_test)))


Logistic Regression Accuracy: 0.6549232158988256
Random Forest Accuracy: 0.6648599819331527
MLP Accuracy: 0.6775067750677507


In [33]:
from sklearn.ensemble import VotingClassifier

ensemble = VotingClassifier(
    estimators=[
        ('svm', grid.best_estimator_),
        ('rf', rf),
        ('mlp', mlp)
    ],
    voting='soft'
)
ensemble.fit(X_train, y_train)
print("Ensemble Accuracy:", accuracy_score(y_test, ensemble.predict(X_test)))


Ensemble Accuracy: 0.7000903342366757


In [34]:
print(os.listdir(f"{data_path}/Session1/sentences/wav"))

['Ses01M_impro05', 'Ses01F_script02_2', 'Ses01F_impro01', 'Ses01M_impro07', 'Ses01M_script02_2', 'Ses01F_script03_1', 'Ses01M_script03_2', 'Ses01F_impro03', 'Ses01M_script01_1', 'Ses01M_impro06', 'Ses01F_script01_2', 'Ses01M_impro03', 'Ses01M_impro01', 'Ses01F_script01_3', 'Ses01M_script01_3', 'Ses01F_script03_2', 'Ses01M_script03_1', 'Ses01F_impro04', 'Ses01F_impro06', 'Ses01M_impro02', 'Ses01F_script01_1', 'Ses01F_impro02', 'Ses01M_script02_1', 'Ses01F_impro05', 'Ses01M_impro04', 'Ses01F_impro07', 'Ses01M_script01_2', 'Ses01F_script02_1']
