In [None]:
import os
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import yaml
import torch
import torchaudio
import cv2
from pathlib import Path
from transformers import AutoTokenizer, AutoModel
from transformers.models.wav2vec2 import Wav2Vec2Processor, Wav2Vec2Model
from tqdm import tqdm
from facenet_pytorch import InceptionResnetV1, MTCNN
from deepface import DeepFace

In [None]:
# Directory holding the utterance clips of each dataset split.
DATA_DIRS = dict(
    train="./data/train/train_splits",
    val="./data/dev/dev_splits_complete",
    test="./data/test/output_repeated_splits_test",
)

# Root folder for everything this notebook writes; created up front so later
# cells can assume it exists.
OUTPUT_BASE = "./output"
os.makedirs(OUTPUT_BASE, exist_ok=True)

In [None]:
# Speakers and emotion labels referenced by name throughout the notebook.
main_speakers = ["Chandler", "Monica", "Ross", "Rachel", "Phoebe", "Joey"]
emotions = ["neutral", "joy", "surprise", "anger", "sadness", "disgust", "fear"]

# One evenly spaced viridis sample per emotion, in the order of `emotions`.
viridis_colors = plt.cm.viridis(np.linspace(0, 1, len(emotions)))
emotion_colors = {emotion: color for emotion, color in zip(emotions, viridis_colors)}

# Two labels get fixed hex colors instead of their viridis sample.
emotion_colors.update({"neutral": "#808080", "anger": "#DC143C"})

## Carregamento e Transformação

In [None]:
def to_seconds(t):
    """Convert an ``H:M:S,ms`` timestamp string into seconds.

    The string is split on ``:`` into exactly three fields and the last field
    is split on ``,`` into whole seconds and milliseconds. A string that does
    not have this shape raises ``ValueError``.
    """
    hours, minutes, remainder = t.split(":")
    seconds, millis = remainder.split(",")
    whole_seconds = int(hours) * 3600 + int(minutes) * 60 + int(seconds)
    return whole_seconds + int(millis) / 1000

def to_snake_case(name: str) -> str:
    """Normalize a label (e.g. a column name) to lowercase with underscores.

    Surrounding whitespace is trimmed, every character that is neither a word
    character nor whitespace is removed, each remaining run of whitespace
    becomes a single ``_`` and the result is lowercased. Existing underscores
    are kept; no underscore is inserted at CamelCase boundaries.
    """
    without_punctuation = re.sub(r"[^\w\s]", "", name.strip())
    pieces = re.split(r"\s+", without_punctuation)
    return "_".join(pieces).lower()

def prepare_df(df: pd.DataFrame) -> pd.DataFrame:
    """Return a cleaned copy of a raw annotation table.

    Adds ``start_s``, ``end_s`` and ``duration_s`` (seconds, derived from the
    ``StartTime`` / ``EndTime`` columns via ``to_seconds``), drops the columns
    not used further on, and renames the remaining columns with
    ``to_snake_case``.

    The input frame is left untouched: all work happens on a copy, so calling
    this twice on the same raw frame gives the same result.

    Raises:
        KeyError: if ``StartTime``/``EndTime`` or any dropped column is missing.
    """
    prepared = df.copy()

    prepared["start_s"] = prepared["StartTime"].apply(to_seconds)
    prepared["end_s"] = prepared["EndTime"].apply(to_seconds)
    prepared["duration_s"] = prepared["end_s"] - prepared["start_s"]

    # The previous `df[df["duration_s"] <= 0].head()` statement was removed:
    # inside a function its result was discarded, so it had no effect.

    prepared = prepared.drop(
        columns=["Sr No.", "StartTime", "EndTime", "Season", "Episode", "Sentiment"]
    )

    prepared.columns = [to_snake_case(c) for c in prepared.columns]

    return prepared

In [None]:
# Read the annotation CSV of each split and clean it with prepare_df.
# Note that the validation frame comes from the "dev" CSV.
train_df = prepare_df(pd.read_csv("./data/train_sent_emo.csv"))
test_df = prepare_df(pd.read_csv("./data/test_sent_emo.csv"))
val_df = prepare_df(pd.read_csv("./data/dev_sent_emo.csv"))

## Funções base

In [None]:
def load_config(path):
    """Parse the YAML file at ``path`` and return the loaded object.

    The file is read as UTF-8 explicitly so the result does not depend on the
    platform's default text encoding. ``yaml.safe_load`` is used, so only
    plain YAML data types are constructed.
    """
    with open(path, "r", encoding="utf-8") as f:
        return yaml.safe_load(f)

In [None]:
class BaseExtractor:
    """Common base for feature extractors: owns the output directory and
    writes one ``.npy`` file per utterance."""

    def __init__(self, save_dir):
        """Remember ``save_dir`` as a ``Path`` and create it (with parents) if needed."""
        target = Path(save_dir)
        target.mkdir(parents=True, exist_ok=True)
        self.save_dir = target

    def save(self, dialogue_id, utterance_id, vector):
        """Store ``vector`` as ``<dialogue_id>_<utterance_id>.npy`` inside
        ``save_dir`` and return the file path as a string."""
        destination = self.save_dir.joinpath(f"{dialogue_id}_{utterance_id}.npy")
        np.save(destination, vector)
        return str(destination)

## Features de Texto

In [None]:
class TextExtractor(BaseExtractor):
    """Text feature extractor built on a pretrained tokenizer/model pair."""

    def __init__(self, model_name, save_dir, device="cpu"):
        """Load the tokenizer and model named ``model_name`` and place the
        model on ``device``; ``save_dir`` is handled by ``BaseExtractor``."""
        super().__init__(save_dir)
        self.device = device
        self.tok = AutoTokenizer.from_pretrained(model_name)
        pretrained = AutoModel.from_pretrained(model_name)
        self.model = pretrained.to(device)

    def extract(self, text):
        """Return the last-layer hidden state at sequence position 0 for
        ``text`` as a NumPy array (size-1 dimensions squeezed away)."""
        encoded = self.tok(text, return_tensors="pt", truncation=True, padding=True)
        encoded = encoded.to(self.device)
        with torch.no_grad():
            hidden_states = self.model(**encoded).last_hidden_state
        # Position 0 of the sequence axis is the leading special token ([CLS]).
        first_token = hidden_states[:, 0, :]
        return first_token.squeeze().cpu().numpy()

In [None]:
def extract_text_with_progress(df, extractor, save_dir):
    """
    Extract text features for every row of ``df``, resuming if interrupted.

    - A row is skipped when ``<save_dir>/<dialogue_id>_<utterance_id>.npy``
      already exists.
    - The whole run is skipped only when the expected file of *every* row
      exists. (Previously this shortcut compared the number of ``.npy`` files
      in the directory with ``len(df)``, so unrelated or stale files could
      make it skip rows that had never been processed.)
    - Progress is shown with a tqdm bar.

    Args:
        df: table with ``dialogue_id``, ``utterance_id`` and ``utterance`` columns.
        extractor: object providing ``extract(text)`` and
            ``save(dialogue_id, utterance_id, vector)``.
        save_dir: directory checked for existing feature files. The files are
            written by ``extractor.save``, which uses the extractor's own
            directory, so both should point to the same place.

    Returns:
        None. Results are written to disk.
    """
    save_dir = Path(save_dir)
    save_dir.mkdir(parents=True, exist_ok=True)

    total = len(df)

    # Expected output file of each row, built exactly as in the loop below.
    out_paths = [
        save_dir / f"{row['dialogue_id']}_{row['utterance_id']}.npy"
        for _, row in df.iterrows()
    ]

    if all(path.exists() for path in out_paths):
        print(f"✅ Already processed {len(df)} utterances. Skipping.")
        return

    processed = 0
    rows = (row for _, row in df.iterrows())

    for row, out_path in tqdm(zip(rows, out_paths), total=total, desc="Extracting", ncols=80):
        if not out_path.exists():
            vec = extractor.extract(row["utterance"])
            extractor.save(row["dialogue_id"], row["utterance_id"], vec)

        processed += 1

    print(f"\n✅ Completed: {processed}/{total} utterances processed.")
    print(f"Features saved in: {save_dir}")

### RoBERTa


In [None]:
# Build a text extractor from the settings in configs/text/roberta.yaml
# (the config must provide model_name, save_dir and device).
cfg = load_config("configs/text/roberta.yaml")
extractor = TextExtractor(cfg["model_name"], cfg["save_dir"], cfg["device"])

# Only the training split is processed in this cell.
extract_text_with_progress(train_df, extractor, cfg["save_dir"])

### DistilBERT

In [None]:
# Build a text extractor from the settings in configs/text/distilbert.yaml
# (the config must provide model_name, save_dir and device).
# NOTE: `cfg` and `extractor` are rebound here, replacing the previous cell's values.
cfg = load_config("configs/text/distilbert.yaml")
extractor = TextExtractor(cfg["model_name"], cfg["save_dir"], cfg["device"])

# Only the training split is processed in this cell.
extract_text_with_progress(train_df, extractor, cfg["save_dir"])

### MPNet

In [None]:
# Build a text extractor from the settings in configs/text/mpnet.yaml
# (the config must provide model_name, save_dir and device).
# NOTE: `cfg` and `extractor` are rebound here, replacing the previous cell's values.
cfg = load_config("configs/text/mpnet.yaml")
extractor = TextExtractor(cfg["model_name"], cfg["save_dir"], cfg["device"])

# Only the training split is processed in this cell.
extract_text_with_progress(train_df, extractor, cfg["save_dir"])

## Features de Áudio

In [None]:
from pathlib import Path
import numpy as np
import torch
import torchaudio
from transformers import Wav2Vec2Processor, Wav2Vec2Model

class AudioExtractor(BaseExtractor):
    """Audio feature extractor producing one fixed-size vector per clip.

    Three back-ends are selected by ``model_name`` (case-insensitive):

    - ``"mfcc"``: ``torchaudio.transforms.MFCC``, averaged over time.
    - ``"melspectrogram"``: ``torchaudio.transforms.MelSpectrogram``,
      averaged over time.
    - any name containing ``"wav2vec"``: a pretrained ``Wav2Vec2Model`` whose
      last hidden state is averaged over the sequence axis.

    Args:
        model_name: back-end keyword or pretrained checkpoint identifier.
        save_dir: output directory (handled by ``BaseExtractor``).
        cfg: mapping of options. Keys read here: ``sample_rate`` (default
            16000), ``n_mfcc``, ``n_fft``, ``hop_length``, ``n_mels`` and
            ``max_len_seconds`` (default 30).

    Raises:
        ValueError: if ``model_name`` matches none of the back-ends.
    """

    def __init__(self, model_name, save_dir, cfg):
        super().__init__(save_dir)
        # The lowercased name is used only to pick the back-end. The original
        # spelling is kept for `from_pretrained`, because checkpoint paths and
        # identifiers can be case-sensitive.
        self.model_name = model_name.lower()
        self.pretrained_name = model_name
        self.cfg = cfg
        self.sample_rate = cfg.get("sample_rate", 16000)

        # --- MFCC ---
        if self.model_name == "mfcc":
            self.extractor = torchaudio.transforms.MFCC(
                sample_rate=self.sample_rate,
                n_mfcc=cfg.get("n_mfcc", 13),
                melkwargs={
                    "n_fft": cfg.get("n_fft", 400),
                    "hop_length": cfg.get("hop_length", 160),
                    "n_mels": cfg.get("n_mels", 23),
                },
            )

        # --- MelSpectrogram ---
        elif self.model_name == "melspectrogram":
            self.extractor = torchaudio.transforms.MelSpectrogram(
                sample_rate=self.sample_rate,
                n_fft=cfg.get("n_fft", 400),
                hop_length=cfg.get("hop_length", 160),
                n_mels=cfg.get("n_mels", 64),
            )

        # --- Wav2Vec2 / Wav2Vec2-like ---
        elif "wav2vec" in self.model_name:
            # NOTE(review): `use_safetensors` is passed to the processor but
            # not to the model below; kept as in the original. Confirm intent.
            self.processor = Wav2Vec2Processor.from_pretrained(
                self.pretrained_name, use_safetensors=True
            )
            self.model = Wav2Vec2Model.from_pretrained(self.pretrained_name)
            self.model.eval()
        else:
            raise ValueError(f"Unsupported model: {self.model_name}")

    # ----------------------------------------------------------------------

    def extract(self, audio_path: str):
        """Load ``audio_path`` and return its feature vector as a NumPy array.

        The waveform is mixed down to mono, resampled to ``self.sample_rate``
        when needed and truncated to ``max_len_seconds`` before features are
        computed.

        Raises:
            ValueError: if the decoded audio contains no samples.
        """
        # Load the audio directly from .mp4 (requires ffmpeg in system or Conda)
        waveform, sr = torchaudio.load(audio_path)

        # Fail loudly on an empty track instead of averaging over nothing.
        if waveform.shape[-1] == 0:
            raise ValueError(f"No audio samples decoded from {audio_path}")

        # Convert to mono if stereo
        if waveform.shape[0] > 1:
            waveform = waveform.mean(dim=0, keepdim=True)

        # Resample if necessary
        if sr != self.sample_rate:
            waveform = torchaudio.functional.resample(
                waveform, sr, self.sample_rate
            )

        # Ensure consistent max duration (avoid huge clips)
        max_len_seconds = self.cfg.get("max_len_seconds", 30)
        max_len_samples = int(max_len_seconds * self.sample_rate)
        if waveform.shape[1] > max_len_samples:
            waveform = waveform[:, :max_len_samples]

        # -------- MFCC or MelSpectrogram --------
        if self.model_name in ["mfcc", "melspectrogram"]:
            feat = self.extractor(waveform)
            # Average over the last (time-frame) axis -> one value per coefficient.
            vec = feat.mean(dim=-1).squeeze().numpy()
            return vec

        # -------- Wav2Vec2 --------
        if "wav2vec" in self.model_name:
            with torch.no_grad():
                inputs = self.processor(
                    waveform.squeeze(),
                    sampling_rate=self.sample_rate,
                    return_tensors="pt",
                    padding=True,
                )
                outputs = self.model(**inputs).last_hidden_state
                # Mean-pool over the sequence axis.
                vec = outputs.mean(dim=1).squeeze().cpu().numpy()
            return vec

        # __init__ rejects other names; guard against silently returning None.
        raise ValueError(f"Unsupported model: {self.model_name}")

In [16]:
def extract_audio_with_progress(df: pd.DataFrame, extractor: AudioExtractor, save_dir: str, audio_dir: str):
    """
    Run audio feature extraction over every row of ``df``, resuming earlier work.

    A row whose ``<dialogue_id>_<utterance_id>.npy`` file already exists in
    ``save_dir`` is counted as done without recomputing. A row whose clip
    fails to load or process is reported on stdout and left out of the final
    count.
    """
    target_dir = Path(save_dir)
    target_dir.mkdir(parents=True, exist_ok=True)

    n_rows = len(df)
    n_done = 0

    progress = tqdm(df.iterrows(), total=n_rows, desc="Audio extracting", ncols=80)
    for _, row in progress:
        feature_file = target_dir / f"{row['dialogue_id']}_{row['utterance_id']}.npy"

        if not feature_file.exists():
            try:
                vec = extractor.extract(f"{audio_dir}/dia{row['dialogue_id']}_utt{row['utterance_id']}.mp4")
                extractor.save(row["dialogue_id"], row["utterance_id"], vec)
            except Exception as e:
                # Best effort: log the failing clip and move on to the next row.
                print(f"⚠️ Error on {audio_dir}/dia{row['dialogue_id']}_utt{row['utterance_id']}.mp4: {e}")
                continue

        n_done += 1

    print(f"\n✅ Completed {n_done}/{n_rows} utterances.")

### MFCC

In [17]:
# Build an audio extractor from configs/audio/mfcc.yaml; the whole config
# mapping is passed along so the extractor can read its optional settings.
cfg = load_config("configs/audio/mfcc.yaml")
extractor = AudioExtractor(cfg["model_name"], cfg["save_dir"], cfg)

# Only the training split is processed in this cell.
extract_audio_with_progress(train_df, extractor, cfg["save_dir"], "./data/train/train_splits")

Audio extracting:  29%|█████▍             | 2890/9989 [00:00<00:01, 6063.18it/s]

⚠️ Error on ./data/train/train_splits/dia125_utt3.mp4: Failed to create AudioDecoder for ./data/train/train_splits/dia125_utt3.mp4: Could not open input file: ./data/train/train_splits/dia125_utt3.mp4 Invalid data found when processing input


Audio extracting: 100%|█████████████████████| 9989/9989 [05:25<00:00, 30.66it/s]


✅ Completed 9988/9989 utterances.





### MelSpectrogram

In [18]:
# Build an audio extractor from configs/audio/melspec.yaml; the whole config
# mapping is passed along so the extractor can read its optional settings.
cfg = load_config("configs/audio/melspec.yaml")
extractor = AudioExtractor(cfg["model_name"], cfg["save_dir"], cfg)

# Only the training split is processed in this cell.
extract_audio_with_progress(train_df, extractor, cfg["save_dir"], "./data/train/train_splits")

Audio extracting:  12%|██▍                  | 1169/9989 [00:47<08:42, 16.89it/s]

⚠️ Error on ./data/train/train_splits/dia125_utt3.mp4: Failed to create AudioDecoder for ./data/train/train_splits/dia125_utt3.mp4: Could not open input file: ./data/train/train_splits/dia125_utt3.mp4 Invalid data found when processing input


Audio extracting: 100%|█████████████████████| 9989/9989 [08:16<00:00, 20.10it/s]


✅ Completed 9988/9989 utterances.





### Wav2Vec2

In [None]:
# Build an audio extractor from configs/audio/wav2vec2.yaml; the whole config
# mapping is passed along so the extractor can read its optional settings.
cfg = load_config("configs/audio/wav2vec2.yaml")
extractor = AudioExtractor(cfg["model_name"], cfg["save_dir"], cfg)

# Only the training split is processed in this cell. The recorded output below
# shows this run taking seconds per clip, far slower than the MFCC/Mel cells.
extract_audio_with_progress(train_df, extractor, cfg["save_dir"], "./data/train/train_splits")

Some weights of Wav2Vec2Model were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Audio extracting:  12%|██▏                | 1166/9989 [32:16<4:08:49,  1.69s/it]

⚠️ Error on ./data/train/train_splits/dia125_utt3.mp4: Failed to create AudioDecoder for ./data/train/train_splits/dia125_utt3.mp4: Could not open input file: ./data/train/train_splits/dia125_utt3.mp4 Invalid data found when processing input


Audio extracting:  83%|██████████████   | 8251/9989 [4:09:13<1:25:17,  2.94s/it]

## Features Faciais

In [None]:
class VideoExtractor(BaseExtractor):
    """Face-embedding extractor for utterance video clips.

    Every ``frame_step``-th frame of a clip is embedded and the per-frame
    embeddings are averaged into one vector.

    Back-ends, selected by ``model_name`` (case-insensitive):

    - a name containing ``"facenet"``: MTCNN face detection followed by
      ``InceptionResnetV1(pretrained="vggface2")``.
    - a name containing ``"deepface"`` or ``"vgg-face"``:
      ``DeepFace.represent``.

    Args:
        model_name: back-end selector (see above).
        save_dir: output directory (handled by ``BaseExtractor``).
        cfg: mapping of options. Keys read here: ``frame_step`` (default 5),
            ``resize`` (default 160), ``detector_backend`` (default
            ``"opencv"``), and the optional ``embedding_dim`` and
            ``deepface_model`` overrides.

    Raises:
        ValueError: if ``model_name`` matches no back-end.
    """

    def __init__(self, model_name, save_dir, cfg):
        super().__init__(save_dir)
        self.model_name = model_name.lower()
        self.cfg = cfg
        self.frame_step = cfg.get("frame_step", 5)
        self.resize = cfg.get("resize", 160)
        # Length of the all-zero vector returned when a clip yields no
        # embedding. Refreshed from real embeddings as soon as one is seen.
        self.embedding_dim = cfg.get("embedding_dim")

        if "facenet" in self.model_name:
            self.detector = MTCNN(keep_all=False, device="cpu")
            self.model = InceptionResnetV1(pretrained="vggface2").eval()
            if self.embedding_dim is None:
                # InceptionResnetV1 embeddings are 512-dimensional.
                self.embedding_dim = 512
        elif "deepface" in self.model_name or "vgg-face" in self.model_name:
            self.detector_backend = cfg.get("detector_backend", "opencv")
            self.deepface = DeepFace
            # DeepFace model names are case-sensitive ("VGG-Face", "DeepFace"),
            # so the lowercased/uppercased selector cannot be passed through.
            self.deepface_model = cfg.get("deepface_model") or (
                "VGG-Face" if "vgg-face" in self.model_name else "DeepFace"
            )
        else:
            raise ValueError(f"Unsupported video model: {self.model_name}")

    def _embed_facenet(self, frame_bgr):
        """Return the FaceNet embedding of the first detected face, or None."""
        frame_rgb = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2RGB)
        boxes, _ = self.detector.detect(frame_rgb)
        if boxes is None or len(boxes) == 0:
            return None

        # Detected boxes may extend past the frame. Clip them so that negative
        # coordinates do not wrap around and produce an empty crop.
        height, width = frame_rgb.shape[:2]
        x1, y1, x2, y2 = boxes[0].astype(int)
        x1, y1 = max(x1, 0), max(y1, 0)
        x2, y2 = min(x2, width), min(y2, height)
        if x2 <= x1 or y2 <= y1:
            return None

        face = cv2.resize(frame_rgb[y1:y2, x1:x2], (self.resize, self.resize))
        # Same fixed standardization facenet-pytorch applies to faces fed to
        # its pretrained models: (pixel - 127.5) / 128.
        face = (np.transpose(face, (2, 0, 1)).astype(np.float32) - 127.5) / 128.0
        face = torch.tensor(face).unsqueeze(0).float()
        with torch.no_grad():
            return self.model(face).squeeze().numpy()

    def _embed_deepface(self, frame_bgr):
        """Return the DeepFace embedding of the first face entry, or None."""
        rep = self.deepface.represent(
            # DeepFace expects NumPy images in BGR order, i.e. as read by OpenCV.
            img_path=frame_bgr,
            model_name=self.deepface_model,
            detector_backend=self.detector_backend,
            enforce_detection=False,
        )
        if len(rep) > 0:
            return np.array(rep[0]["embedding"])
        return None

    def extract(self, video_path):
        """Extract averaged face embedding for one utterance clip (.mp4).

        Returns the mean of the per-frame embeddings, or an all-zero vector of
        length ``self.embedding_dim`` when no frame produced an embedding
        (length 128 if the dimension is still unknown).

        Raises:
            RuntimeError: if the clip cannot be opened.
        """
        cap = cv2.VideoCapture(video_path)
        if not cap.isOpened():
            raise RuntimeError(f"Could not open {video_path}")

        use_facenet = "facenet" in self.model_name
        frame_step = max(1, int(self.frame_step))  # guard against a zero step

        embeddings = []
        first_error = None

        try:
            frame_id = 0
            while True:
                ret, frame = cap.read()
                if not ret:
                    break

                sampled = frame_id % frame_step == 0
                frame_id += 1
                if not sampled:
                    continue

                if use_facenet:
                    emb = self._embed_facenet(frame)
                else:
                    try:
                        emb = self._embed_deepface(frame)
                    except Exception as exc:
                        # Per-frame failures stay best-effort, but the first
                        # one is remembered so it can be reported below.
                        if first_error is None:
                            first_error = exc
                        emb = None

                if emb is not None:
                    embeddings.append(emb)
        finally:
            # Release the capture even if a frame raised.
            cap.release()

        if len(embeddings) == 0:
            if first_error is not None:
                print(f"⚠️ No embedding for {video_path}; first DeepFace error: {first_error}")
            dim = self.embedding_dim if self.embedding_dim is not None else 128
            return np.zeros(int(dim))

        mean_embedding = np.mean(np.stack(embeddings), axis=0)
        # Remember the real size so later zero vectors match it.
        self.embedding_dim = int(mean_embedding.shape[-1])
        return mean_embedding

In [None]:
def extract_video_with_progress(df: pd.DataFrame, extractor: VideoExtractor, save_dir: str, video_dir: str):
    """
    Extract face embeddings for every row of ``df``, continuing from previous progress.

    A row whose ``<dialogue_id>_<utterance_id>.npy`` file already exists in
    ``save_dir`` is skipped. A clip that raises during extraction or saving is
    reported on stdout with its full path and then skipped.

    Args:
        df: table with ``dialogue_id`` and ``utterance_id`` columns.
        extractor: object providing ``extract(path)`` and
            ``save(dialogue_id, utterance_id, vector)``.
        save_dir: directory checked for existing feature files.
        video_dir: directory containing ``dia<dialogue_id>_utt<utterance_id>.mp4`` clips.
    """
    save_dir = Path(save_dir)
    save_dir.mkdir(parents=True, exist_ok=True)

    for _, row in tqdm(df.iterrows(), total=len(df), desc="Video extracting", ncols=80):
        filename = f"{row['dialogue_id']}_{row['utterance_id']}.npy"
        out_path = save_dir / filename

        if out_path.exists():
            continue

        video_path = f"{video_dir}/dia{row['dialogue_id']}_utt{row['utterance_id']}.mp4"

        try:
            vec = extractor.extract(video_path)
            extractor.save(row["dialogue_id"], row["utterance_id"], vec)
        except Exception as e:
            # Name the failing clip, not just the directory, so it can be found.
            print(f"⚠️ Error on {video_path}: {e}")
            continue

    print(f"✅ Saved embeddings in {save_dir}")