# **A Multimodal Sentiment Analysis Pipeline Incorporating Valence Detection and Topological Data Analysis - Model Prototype**
---
Authors/Researchers:
1. Alverio, Franz Tovie G.
2. Almarinez, Lucky Richmon C.
3. Jamilano, Kyla Celine L.

# Prototype
----


## **Stage 0: Setup**

### **Installation**:

In [None]:
!pip install missingpy



### **Libraries**:

In [None]:
# Suppress noisy warnings
import warnings
warnings.filterwarnings("ignore", category=SyntaxWarning)
warnings.filterwarnings("ignore", category=UserWarning)

In [None]:
# Cell 6iJgCVCX4Jl8: Initial Library Imports and Constants

# --- Standard Libraries ---
import os
import re
import argparse
import json
import urllib.request
from pathlib import Path
import shutil # Added shutil for find_metadata

# --- Core Third-Party Libraries ---
import numpy as np
import pandas as pd
from tqdm import tqdm
import joblib
import random # Added random for create_audio_subset

# --- PyTorch / Vision ---
import torch
import torch.nn as nn
import torchvision # Ensure torchvision is imported here
from torchvision import models, transforms
from PIL import Image

# --- Video ---
from moviepy.editor import VideoFileClip

# --- Audio ---
import librosa

# --- Text Encoders ---
_HAS_SBERT, _HAS_TRANSFORMERS = False, False
try:
    from sentence_transformers import SentenceTransformer
    from transformers import AutoTokenizer, AutoModel
    _HAS_SBERT, _HAS_TRANSFORMERS = True, True
    print("✓ Sentence-BERT and transformers available.")
except ImportError:
    try:
        from transformers import AutoTokenizer, AutoModel
        _HAS_TRANSFORMERS = True
        print("✓ transformers available (fallback, no Sentence-BERT).")
    except ImportError:
        print("✗ No text encoder libraries found. Install with:")
        print("  pip install sentence-transformers transformers")

# --- Imputation ---
from sklearn.impute import SimpleImputer

# --- Sklearn (Classifiers, Metrics, Preprocessing, Model Selection) ---
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import StratifiedKFold, train_test_split, GridSearchCV
from sklearn.metrics import (
    accuracy_score, f1_score, precision_score, recall_score,
    mean_absolute_error, confusion_matrix
)
from sklearn.preprocessing import StandardScaler, LabelEncoder


# --- Explainability ---
# import shap # Commented out for now as it's not used in the provided code

# --- Constants ---
DEFAULT_RESNET = "resnet50"
DEFAULT_BERT = "sentence-transformers/all-MiniLM-L6-v2"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {DEVICE}")
print(f"Default ResNet: {DEFAULT_RESNET}")
print(f"Default BERT: {DEFAULT_BERT}")

✓ Sentence-BERT and transformers available.
Using device: cpu
Default ResNet: resnet50
Default BERT: sentence-transformers/all-MiniLM-L6-v2


### **Sentiment Lexicon**:

In [None]:
# Core constant: keyword -> polarity (+1 positive, -1 negative, 0 neutral).
# Built group by group so the polarity of each word family is obvious at a glance.
sentiment_lexicon = {
    **dict.fromkeys(('happy', 'joy', 'glad', 'positive'), 1),
    **dict.fromkeys(
        ('sad', 'unhappy', 'grief', 'negative', 'angry', 'fear', 'disgust'), -1
    ),
    'neutral': 0,
}
print("✓ Sentiment lexicon initialized.")

✓ Sentiment lexicon initialized.


## **Stage 1: Multimodal Inputs**
---

### **Initial Import Dataset Functions:**

In [None]:
def clone_repo(path, url, name):
    """Clone the git repository at ``url`` into ``path`` unless ``path`` exists.

    Args:
        path: Destination directory for the clone.
        url: Remote repository URL.
        name: Human-readable dataset name used in progress messages.

    Returns:
        None. Progress and errors are reported on stdout; a failed clone is
        reported but does not raise, so the notebook can keep running.
    """
    # Local import so this cell does not depend on the top-of-notebook imports.
    import subprocess

    if os.path.exists(path):
        print(f"{name} already exists at {path}.")
        return

    print(f"Cloning {name} into {path}...")
    try:
        # Argument list (no shell): URLs/paths containing spaces or shell
        # metacharacters are passed to git verbatim. Plain Python, so the
        # function also works outside IPython (the old `!git` magic did not).
        result = subprocess.run(
            ["git", "clone", str(url), str(path)],
            capture_output=True,
            text=True,
        )
    except OSError as e:  # e.g. git is not installed
        print(f"✗ Error cloning {name}: {e}")
        return

    # Relay git's own output so it is visible in the notebook cell.
    for stream in (result.stdout, result.stderr):
        if stream:
            print(stream, end="" if stream.endswith("\n") else "\n")
    if result.returncode != 0:
        print(f"✗ Error cloning {name}: git exited with status {result.returncode}")

def download_csvs(base_url, dest_dir, files):
    """Download each file in ``files`` from ``base_url`` into ``dest_dir``.

    The destination directory is created when missing. Files that are already
    present are left alone; download failures are printed, not raised.
    """
    os.makedirs(dest_dir, exist_ok=True)
    for filename in files:
        target = os.path.join(dest_dir, filename)
        if os.path.exists(target):
            print(f"{filename} already exists.")
            continue
        try:
            urllib.request.urlretrieve(f"{base_url}{filename}", target)
            print(f"✓ Downloaded {filename}")
        except Exception as err:
            print(f"✗ Error downloading {filename}: {err}")

def load_csvs(path, files):
    """Read every CSV named in ``files`` from directory ``path``.

    Returns a list of DataFrames in the order of ``files``. If any file fails
    to load, the error is printed and a list of ``None`` of the same length is
    returned instead.
    """
    try:
        frames = []
        for filename in files:
            frames.append(pd.read_csv(os.path.join(path, filename)))
        print("✓ MELD loaded:", [frame.shape for frame in frames])
    except Exception as err:
        print(f"✗ Error loading MELD CSVs: {err}")
        return [None] * len(files)
    return frames

def find_metadata(base_path, expected):
    """Make sure a metadata CSV is available at ``expected`` and return its path.

    Resolution order:
      1. If ``expected`` already exists it is returned untouched (it is never
         overwritten by some other CSV).
      2. Otherwise ``base_path`` is searched recursively; a CSV with the same
         file name as ``expected`` is preferred.
      3. Failing that, the first ``.csv`` encountered is used as a fallback.
    The chosen file is copied to ``expected``.

    Args:
        base_path: Directory tree to search.
        expected: Path where the metadata CSV should live.

    Returns:
        ``expected`` when a metadata file is available there, else ``None``.
    """
    if os.path.isfile(expected):
        return expected

    wanted_name = os.path.basename(expected)
    exact_match = None
    first_csv = None
    for root, _, files in os.walk(base_path):
        for f in files:
            if not f.endswith(".csv"):
                continue
            candidate = os.path.join(root, f)
            if f == wanted_name:
                exact_match = candidate
                break
            if first_csv is None:
                first_csv = candidate
        if exact_match is not None:
            break

    source = exact_match or first_csv
    if source is None:
        return None

    # The destination folder may not exist yet when the caller did not create it.
    os.makedirs(os.path.dirname(os.path.abspath(expected)), exist_ok=True)
    shutil.copy(source, expected)
    return expected

def create_audio_subset(audio_dir, subset_dir, n=50, seed=None):
    """Copy a random sample of ``.mp3`` files from ``audio_dir`` to ``subset_dir``.

    Args:
        audio_dir: Directory containing the source ``.mp3`` files.
        subset_dir: Destination directory (created if needed).
        n: Maximum number of files to copy; fewer are copied when the source
            directory holds fewer ``.mp3`` files.
        seed: Optional seed. When given, the same files are selected on every
            call; when ``None`` the module-level ``random`` state is used.

    Returns:
        None. A summary (or an error when ``audio_dir`` is missing) is printed.
    """
    if not os.path.isdir(audio_dir):
        print(f"✗ No audio directory at {audio_dir}.")
        return

    os.makedirs(subset_dir, exist_ok=True)
    # Sorted so that a fixed seed picks the same files regardless of the
    # (arbitrary) order returned by os.listdir.
    candidates = sorted(f for f in os.listdir(audio_dir) if f.endswith(".mp3"))
    rng = random if seed is None else random.Random(seed)
    subset = rng.sample(candidates, max(0, min(n, len(candidates))))
    for f in subset:
        shutil.copy(os.path.join(audio_dir, f), os.path.join(subset_dir, f))
    print(f"✓ Copied {len(subset)} CREMA-D audio files.")

### **CREMA-D:**

In [None]:
# --- CREMA-D ---
# Clone the CREMA-D mirror into the Colab workspace (clone_repo is a no-op
# when the target path already exists).
crema_repo = "/content/CREMA-D"
clone_repo(crema_repo, "https://gitlab.com/cs-cooper-lab/crema-d-mirror.git", "CREMA-D")

# Location where the metadata table is expected. The parent folder is created
# up front so that find_metadata has somewhere to copy a CSV into.
crema_meta_path = os.path.join(crema_repo, "processedResults", "summaryTable.csv")
os.makedirs(os.path.dirname(crema_meta_path), exist_ok=True)
meta_file = find_metadata(crema_repo, crema_meta_path)

# crema_meta stays None unless a metadata file was found and parsed.
crema_meta = None
if meta_file and os.path.exists(meta_file):
    try:
        crema_meta = pd.read_csv(meta_file)
        print("✓ CREMA-D metadata:", crema_meta.shape)
    except Exception as e:
        # Best effort: report the problem and leave crema_meta as None.
        print(f"✗ Error loading CREMA-D metadata: {e}")

Cloning CREMA-D into /content/CREMA-D...
Cloning into '/content/CREMA-D'...
remote: Enumerating objects: 22366, done.[K
remote: Counting objects: 100% (15/15), done.[K
remote: Compressing objects: 100% (15/15), done.[K
remote: Total 22366 (delta 5), reused 0 (delta 0), pack-reused 22351 (from 1)[K
Receiving objects: 100% (22366/22366), 13.09 MiB | 10.56 MiB/s, done.
Resolving deltas: 100% (18/18), done.
Updating files: 100% (22341/22341), done.
Filtering content: 100% (22326/22326), 3.42 GiB | 3.01 MiB/s, done.
✓ CREMA-D metadata: (7442, 2)


In [None]:
print("\nStage 2: Data Import complete.")


Stage 2: Data Import complete.


In [None]:
crema_audio_dir = os.path.join(crema_repo, "AudioMP3")
print("Audio files found:", len(os.listdir(crema_audio_dir)))

Audio files found: 7442


## **Stage 2: Preprocessing**
---

#### **Video Preprocessing (Franz)**

In [None]:
def preprocess_video(path: str, frame_time: float = 1.0, sr: int = 22050):
    """Extract the audio waveform and one video frame from a video file.

    Args:
        path: Path to the video file.
        frame_time: Timestamp (seconds) of the frame to grab.
        sr: Sample rate used when exporting and loading the audio track.

    Returns:
        ``(audio_data, image_tensor)``. Either element is ``None`` when that
        modality could not be extracted; ``(None, None)`` when ``path`` is
        empty, missing, or the video cannot be opened.
    """
    # Local import so this cell does not depend on the top-of-notebook imports.
    import tempfile

    if not path or not os.path.exists(path):
        return None, None

    audio_data, image_tensor = None, None
    clip = None
    try:
        clip = VideoFileClip(path)

        # --- audio ---
        # The audio track is exported to a private temp file. Deriving the
        # temp name from `path` is unsafe: for any file that is not ".mp4"
        # the derived name equals `path`, so the source video would be
        # overwritten and then deleted.
        fd, audio_temp_path = tempfile.mkstemp(suffix="_temp_audio.wav")
        os.close(fd)
        try:
            if clip.audio is None:
                print(f"✗ Video audio extraction error {path}: no audio track")
            else:
                clip.audio.write_audiofile(audio_temp_path, fps=sr, verbose=False, logger=None)
                loaded = preprocess_audio(audio_temp_path, sr)
                if loaded is not None:
                    audio_data, _ = loaded
        except Exception as e:
            print(f"✗ Video audio extraction error {path}: {e}")
            audio_data = None
        finally:
            # Always clean up, including when export or loading failed.
            if os.path.exists(audio_temp_path):
                os.remove(audio_temp_path)

        # --- frame ---
        try:
            frame = clip.get_frame(frame_time)
            image_tensor = _img_transform(Image.fromarray(frame))
        except Exception as e:
            print(f"✗ Video frame extraction error {path}: {e}")
            image_tensor = None

    except Exception as e:
        print(f"✗ Video error {path}: {e}")
    finally:
        # Release the reader resources held by the clip.
        if clip is not None:
            try:
                clip.close()
            except Exception:
                pass  # closing is best effort; extraction results are kept
    return audio_data, image_tensor

#### **Text Preprocessing (Franz)**

In [None]:
# --- 1. Text ---
def preprocess_text(text: str) -> str:
    """Normalise text: lowercase, strip URLs, keep only a-z, 0-9 and whitespace.

    Falsy input (``None``, ``""``, ``0``) yields an empty string; any other
    value is converted with ``str`` first.
    """
    if not text:
        return ""
    lowered = str(text).lower()
    without_urls = re.sub(r"http\S+", "", lowered)
    return re.sub(r"[^a-z0-9\s]", "", without_urls).strip()

#### **Image Preprocessing (Kyla)**

In [None]:
# --- 2. Image ---
# Shared image pipeline, defined at module level because both preprocess_image
# and preprocess_video apply it: resize to 224x224, convert to a tensor, then
# normalise each channel.
# NOTE(review): the mean/std values are presumably the ImageNet channel
# statistics expected by the pretrained backbone — confirm.
_img_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225])
])
def preprocess_image(path: str):
    """Open the image at ``path`` as RGB and run it through ``_img_transform``.

    Returns the transformed tensor, or ``None`` when the path is empty/missing
    or the image cannot be read (the error is printed in that case).
    """
    if not (path and os.path.exists(path)):
        # Missing files are common, so they are skipped silently.
        return None
    try:
        rgb_image = Image.open(path).convert("RGB")
        return _img_transform(rgb_image)
    except Exception as err:
        print(f"✗ Image error {path}: {err}")
        return None

#### **Audio Preprocessing (Lucky)**

In [None]:
# --- 3. Audio ---
def preprocess_audio(path: str, sr: int = 22050):
    """Load an audio file with librosa at sample rate ``sr``.

    Returns ``(waveform, sample_rate)``, or ``None`` when the path is
    empty/missing or loading fails (the error is printed in that case).
    """
    if not (path and os.path.exists(path)):
        # Missing files are common, so they are skipped silently.
        return None
    try:
        waveform, rate = librosa.load(path, sr=sr)
        return waveform, rate
    except Exception as err:
        print(f"✗ Audio error {path}: {err}")
        return None


In [None]:
print("\nStage 3: Preprocessing functions defined.")


Stage 3: Preprocessing functions defined.


## **Stage 3: Feature Extraction**
---

## Feature Matrix Construction

### Feature Matrix Valence (Schema Definition)

In [None]:
def define_feature_schema(text_dim=768, img_dim=2048, aud_dim=13, lex_dim=1):
    """
    Build the ordered list of column names for the multimodal feature matrix.

    No extraction happens here. Columns are emitted per modality in the order
    text, image, audio, lexicon (``<prefix>_<index>``), followed by the
    metadata/label columns.
    """
    modality_dims = (
        ("text_feat", text_dim),
        ("img_feat", img_dim),
        ("aud_feat", aud_dim),
        ("lexicon_feat", lex_dim),
    )
    schema = [
        f"{prefix}_{i}"
        for prefix, dim in modality_dims
        for i in range(dim)
    ]

    # Metadata / labels
    schema.extend([
        "dialogue_id", "utterance_id", "speaker",
        "emotion_label", "valence_label", "valence_label_numeric",
        "feature_extraction_successful",
    ])
    return schema

### Feature Matrix Valence (Proper)

In [None]:
def _nan_features(prefix, dim):
    """Return a placeholder dict ``{prefix_i: NaN}`` with ``dim`` entries."""
    return {f"{prefix}_{i}": np.nan for i in range(dim)}


def _named_features(raw, prefix):
    """Turn an extractor result (ndarray or dict) into a name->value dict, else None."""
    if isinstance(raw, np.ndarray):
        return {f"{prefix}_{i}": val for i, val in enumerate(raw)}
    if isinstance(raw, dict):
        return raw
    return None


def _safe_extract(prefix, dim, compute):
    """Run ``compute()``; return ``(features, ok)`` with NaN placeholders on failure."""
    try:
        named = _named_features(compute(), prefix)
    except Exception as e:
        print(f"✗ {prefix} extraction failed: {e}")
        named = None
    if named is None:
        return _nan_features(prefix, dim), False
    return named, True


def _resolve_media_path(row, base_dir, filename_key, path_key):
    """Return the existing file path for a row's media column, or None if unavailable."""
    if not base_dir:
        return None
    for key in (filename_key, path_key):
        name = row.get(key)
        # Missing cells come through as NaN (a truthy float); only real
        # path-like values may be joined onto the base directory.
        if isinstance(name, (str, os.PathLike)) and name:
            full_path = os.path.join(base_dir, name)
            return full_path if os.path.exists(full_path) else None
    return None


def extract_features_from_row(row, text_ext, img_ext, aud_ext,
                              image_dir=None, audio_dir=None, sentiment_lexicon=None):
    """
    Extract multimodal features for a single row.

    Args:
        row: Mapping-like record (dict or pandas Series) read with ``.get``.
        text_ext: Object with ``extract(text)`` and optionally ``dim``.
        img_ext: Object with ``extract(image_tensor)`` and optionally ``dim``.
        aud_ext: Object with ``extract(audio, sr)`` and optionally ``n_mfcc``.
        image_dir: Base directory for the row's image file, or None to skip images.
        audio_dir: Base directory for the row's audio file, or None to skip audio.
        sentiment_lexicon: Word->score mapping, or None to skip lexicon features.

    Returns:
        dict of feature values plus metadata. A modality whose input is absent
        is filled with NaN without affecting ``feature_extraction_successful``;
        a modality whose input is present but whose extraction fails is filled
        with NaN and sets ``feature_extraction_successful`` to False.
    """
    row_feats = {}
    feature_extraction_successful = True

    has_text = 'text' in row and pd.notna(row['text'])

    # --- Text features ---
    text_dim = getattr(text_ext, 'dim', 768)
    if has_text:
        feats, ok = _safe_extract("text_feat", text_dim,
                                  lambda: text_ext.extract(row['text']))
        feature_extraction_successful = feature_extraction_successful and ok
    else:
        feats = _nan_features("text_feat", text_dim)
    row_feats.update(feats)

    # --- Image features ---
    img_dim = getattr(img_ext, 'dim', 2048)
    img_path = _resolve_media_path(row, image_dir, 'image_filename', 'image_path')
    if img_path is not None:
        def _image_features():
            tensor = preprocess_image(img_path)
            # None propagates as "unsupported result" -> NaN + failure flag.
            return None if tensor is None else img_ext.extract(tensor)

        feats, ok = _safe_extract("img_feat", img_dim, _image_features)
        feature_extraction_successful = feature_extraction_successful and ok
    else:
        feats = _nan_features("img_feat", img_dim)
    row_feats.update(feats)

    # --- Audio features ---
    aud_dim = getattr(aud_ext, 'n_mfcc', 13)
    aud_path = _resolve_media_path(row, audio_dir, 'audio_filename', 'audio_path')
    if aud_path is not None:
        def _audio_features():
            loaded = preprocess_audio(aud_path)
            if not loaded:
                return None
            audio_data, sr = loaded
            return aud_ext.extract(audio_data, sr)

        feats, ok = _safe_extract("aud_feat", aud_dim, _audio_features)
        feature_extraction_successful = feature_extraction_successful and ok
    else:
        feats = _nan_features("aud_feat", aud_dim)
    row_feats.update(feats)

    # --- Lexicon features ---
    if sentiment_lexicon and has_text:
        feats, ok = _safe_extract(
            "lexicon_feat", 1,
            lambda: get_lexicon_sentiment_features(row['text'], sentiment_lexicon))
        feature_extraction_successful = feature_extraction_successful and ok
    else:
        feats = _nan_features("lexicon_feat", 1)
    row_feats.update(feats)

    # --- Metadata ---
    row_feats["dialogue_id"] = row.get("Dialogue_ID")
    row_feats["utterance_id"] = row.get("Utterance_ID")
    row_feats["speaker"] = row.get("Speaker")
    row_feats["emotion_label"] = row.get("label")
    row_feats["valence_label"] = row.get("valence_label")

    valence_map = {"positive": 1, "negative": -1, "neutral": 0}
    row_feats["valence_label_numeric"] = valence_map.get(row_feats["valence_label"], np.nan)

    row_feats["feature_extraction_successful"] = feature_extraction_successful

    return row_feats

### Feature Matrix Valence (Orchestration)

In [None]:
def build_feature_matrix_valence(df, text_ext, img_ext, aud_ext,
                                 image_dir=None, audio_dir=None, sentiment_lexicon=None):
    """
    Build the feature matrix by applying extract_features_from_row
    to each row in the dataframe.

    The column schema is sized from the extractors themselves (``dim`` for the
    text/image extractors, ``n_mfcc`` for the audio extractor), falling back to
    768/2048/13 when an extractor does not expose the attribute. Using the
    fixed default schema would silently drop or pad columns whenever an
    extractor's output size differs from those defaults.

    Returns:
        pandas.DataFrame with one row per input row.
    """
    features = [
        extract_features_from_row(row, text_ext, img_ext, aud_ext,
                                  image_dir=image_dir,
                                  audio_dir=audio_dir,
                                  sentiment_lexicon=sentiment_lexicon)
        for _, row in df.iterrows()
    ]

    schema = define_feature_schema(
        text_dim=getattr(text_ext, 'dim', 768),
        img_dim=getattr(img_ext, 'dim', 2048),
        aud_dim=getattr(aud_ext, 'n_mfcc', 13),
    )
    features_df = pd.DataFrame(features, columns=schema)
    print("Feature matrix built:", features_df.shape)
    return features_df

### Sentiment Lexicon Extraction


In [None]:
def get_lexicon_sentiment_features(text, lexicon=sentiment_lexicon):
    """Return a 1-element array holding the mean lexicon score of the words in ``text``.

    Words missing from ``lexicon`` count as 0. Non-string or blank input, and
    text that is empty after cleaning, yield ``[0.0]``.
    """
    if not (isinstance(text, str) and text.strip()):
        return np.array([0.0])

    tokens = preprocess_text(text).split()
    if not tokens:
        return np.array([0.0])
    return np.array([np.mean([lexicon.get(token, 0) for token in tokens])])


### Audio Feature Extractor

In [None]:
# Cell A1D0oG7ooE4d: Stage 4 - Valence-specific Audio Feature Extractor
# Ensure necessary imports are available (should be covered by Stage 1)
# import torch.nn as nn, numpy as np, librosa

import librosa
import numpy as np
import torch.nn as nn

class AudioFeatureExtractorValence(nn.Module):
    """Time-averaged MFCC feature extractor used for valence analysis."""

    def __init__(self, n_mfcc=13, device='cpu'):
        super().__init__()
        self.n_mfcc = n_mfcc
        # Not used by the MFCC computation itself; kept so the constructor
        # matches the other extractors.
        self.device = device

    def extract(self, audio_data, sr):
        """Return ``{'aud_feat_i': value}`` for ``i < n_mfcc``.

        Each value is the mean of one MFCC coefficient over time. All values
        are NaN when ``audio_data`` or ``sr`` is None or when the computation
        fails (the error is printed).
        """
        feature_names = [f'aud_feat_{i}' for i in range(self.n_mfcc)]

        if audio_data is None or sr is None:
            return dict.fromkeys(feature_names, np.nan)

        try:
            samples = np.asarray(audio_data, dtype=np.float32)
            mfcc_matrix = librosa.feature.mfcc(
                y=samples,
                sr=sr,
                n_mfcc=self.n_mfcc,
                n_fft=2048,
                hop_length=512,
            )
            # Collapse the time axis: one mean value per coefficient.
            coefficient_means = mfcc_matrix.mean(axis=1)
            return {name: coefficient_means[i] for i, name in enumerate(feature_names)}
        except Exception as e:
            print(f"[AudioFeatureExtractorValence] Error: {e}")
            return dict.fromkeys(feature_names, np.nan)

print("\nStage 4: Valence-specific Audio Feature Extractor defined.")


Stage 4: Valence-specific Audio Feature Extractor defined.


### Text Feature Extractor

In [None]:
# Cell glbWaK4voQmd: Stage 4 - Base Feature Extractors (Text)

# Ensure necessary imports are available (should be covered by Stage 1)
# import torch, torch.nn as nn, numpy as np
# from transformers import AutoTokenizer, AutoModel # Ensure transformers are imported

class TextFeatureExtractor:
    """Mean-pooled transformer embedding extractor for a piece of text.

    Attributes:
        device: Device the model runs on.
        tokenizer: Hugging Face tokenizer for ``model_name``.
        model: Hugging Face model in eval mode.
        dim: Embedding size (the model's ``hidden_size``).
    """

    def __init__(self, model_name=DEFAULT_BERT, device=DEVICE):
        """Load tokenizer and model.

        Raises:
            ImportError: if the import-time availability flags are missing or
                report that no text-encoder library could be imported.
        """
        flags = globals()
        if ('_HAS_SBERT' not in flags or '_HAS_TRANSFORMERS' not in flags
                or not (flags['_HAS_SBERT'] or flags['_HAS_TRANSFORMERS'])):
            raise ImportError("Neither Sentence-BERT nor transformers available globally.")

        self.device = device
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name).to(device).eval()
        # Store dimension
        self.dim = self.model.config.hidden_size

    def extract(self, text):
        """Return ``{'text_feat_i': value}`` for ``i < dim``.

        Empty/falsy text yields all zeros; a failure during encoding is
        printed and yields all NaN.
        """
        if not text:
            return {f'text_feat_{i}': 0.0 for i in range(self.dim)}
        try:
            # NOTE: torch must be the module-level import. A conditional
            # `import torch` inside this method makes `torch` a local name,
            # so when the global already exists the import is skipped and
            # every use raises UnboundLocalError (then swallowed below,
            # producing all-NaN features).
            encoded = self.tokenizer(text, padding=True, truncation=True, return_tensors='pt').to(self.device)
            with torch.no_grad():
                output = self.model(**encoded)
            # Mean-pool the token embeddings into one vector.
            features_array = output.last_hidden_state.mean(dim=1).squeeze().cpu().numpy()
            return {f'text_feat_{i}': features_array[i] for i in range(self.dim)}

        except Exception as e:
            print(f"[TextFeatureExtractor] Error: {e}")
            # Return placeholder dictionary on error
            return {f'text_feat_{i}': np.nan for i in range(self.dim)}

print("\nStage 4: Text Feature Extractor defined.")


Stage 4: Text Feature Extractor defined.


### Image Feature Extractor

In [None]:
# Cell BY4OgFkMoR_V: Stage 4 - Base Feature Extractors (Image)

import torch
import torch.nn as nn
import torchvision

class ImageFeatureExtractor(nn.Module):
    """Pretrained torchvision backbone with its final layer removed.

    Attributes:
        device: Device the backbone runs on.
        model: Backbone without its last child module, in eval mode.
        dim: Length of the feature vector produced for one image.
    """

    def __init__(self, model_name=DEFAULT_RESNET, device=DEVICE):
        """Build the backbone named ``model_name`` from ``torchvision.models``.

        Raises:
            ValueError: if ``torchvision.models`` has no attribute ``model_name``.
        """
        super().__init__()
        self.device = device
        # NOTE: torchvision/torch/numpy must be the module-level imports. A
        # conditional `import x` inside a method makes `x` local to that
        # method; when the global already exists the import is skipped and the
        # first use raises UnboundLocalError.
        try:
            backbone = getattr(torchvision.models, model_name)(pretrained=True)
        except AttributeError as err:
            raise ValueError(f"Unsupported image model: {model_name}") from err
        # Remove classifier head
        self.model = nn.Sequential(*list(backbone.children())[:-1]).to(device).eval()
        # Determine output dimension by passing a dummy tensor
        try:
            with torch.no_grad():
                dummy_input = torch.randn(1, 3, 224, 224).to(device)
                self.dim = self.model(dummy_input).squeeze().shape[0]
        except Exception as e:
            print(f"Warning: Could not determine image feature dimension: {e}")
            self.dim = 2048  # Default for ResNet50/101/152 features before FC

    def extract(self, image_tensor):
        """Return ``{'img_feat_i': value}`` for ``i < dim``.

        ``image_tensor`` may be a single image (3 dimensions; a batch axis is
        added) or an already batched tensor. ``None`` input or any failure
        yields all NaN (failures are printed).
        """
        if image_tensor is None:
            return {f'img_feat_{i}': np.nan for i in range(self.dim)}
        try:
            with torch.no_grad():
                if image_tensor.ndim == 3:
                    image_tensor = image_tensor.unsqueeze(0)
                # .to() is a no-op when the tensor is already on the device.
                image_tensor = image_tensor.to(self.device)

                features_array = self.model(image_tensor).squeeze().cpu().numpy()
            return {f'img_feat_{i}': features_array[i] for i in range(self.dim)}

        except Exception as e:
            print(f"[ImageFeatureExtractor] Error: {e}")
            # Return placeholder dictionary on error
            return {f'img_feat_{i}': np.nan for i in range(self.dim)}

print("\nStage 4: Image Feature Extractor defined.")


Stage 4: Image Feature Extractor defined.


### Feature Extraction (Overall)

In [None]:
from tqdm import tqdm

In [None]:
# Builds the CREMA-D feature matrix: for every metadata row, extract text,
# lexicon, image (one video frame) and audio (video soundtrack) features and
# collect them into a DataFrame.
#
# Relies on definitions from earlier cells:
# TextFeatureExtractor, AudioFeatureExtractorValence, ImageFeatureExtractor
# preprocess_audio, preprocess_video, preprocess_image
# get_lexicon_sentiment_features, define_feature_schema, extract_features_from_row

# --- Initialize extractors ---
# NOTE(review): the text model here is "bert-base-uncased", not DEFAULT_BERT.
text_extractor = TextFeatureExtractor(model_name="bert-base-uncased", device='cpu')
audio_extractor = AudioFeatureExtractorValence(n_mfcc=13, device='cpu')
image_extractor = ImageFeatureExtractor(model_name="resnet50", device='cpu')

# Assume sentiment_lexicon is a dictionary {word: score}
lexicon = sentiment_lexicon

# Path to CREMA-D videos and metadata
# NOTE(review): confirm that this directory exists in the cloned repository.
crema_video_dir = "/content/CREMA-D/VideoClips"
crema_meta_path = "/content/CREMA-D/processedResults/summaryTable.csv"
crema_meta = pd.read_csv(crema_meta_path)

# Optional: if audio files exist separately
# NOTE(review): assigned but not used anywhere in this cell.
crema_audio_dir = "/content/CREMA-D/AudioClips"

# --- Build feature matrix ---
features_list = []

for idx, row in tqdm(crema_meta.iterrows(), total=len(crema_meta), desc="Processing CREMA-D"):
    row_dict = {}

    # --- 1. Text ---
    # Falls back to "" when the row has no "text" field; the text extractor
    # returns all-zero features for empty text.
    # NOTE(review): presumably the metadata has no "text" column — confirm.
    text_features = text_extractor.extract(row.get("text", ""))

    # --- 2. Lexicon ---
    # get_lexicon_sentiment_features returns a 1-element array.
    lex_raw = get_lexicon_sentiment_features(row.get("text", ""), lexicon)
    lex_dict = {f"lexicon_feat_0": lex_raw[0]}

    # --- 3. Video & Image ---
    # Uses an explicit "video_path" column if present, else joins the video
    # directory with the row's "Filename" value.
    # NOTE(review): "Filename" is joined as-is; confirm it includes the video
    # file extension.
    video_path = row.get("video_path") or os.path.join(crema_video_dir, row["Filename"])
    audio_data, frame_tensor = preprocess_video(video_path, frame_time=1.0, sr=22050)

    # Image features from frame
    img_features = image_extractor.extract(frame_tensor)

    # Audio features from extracted waveform (same 22050 Hz rate that was
    # passed to preprocess_video above)
    aud_features = audio_extractor.extract(audio_data, 22050)

    # --- 4. Combine features ---
    row_dict.update(text_features)
    row_dict.update(img_features)
    row_dict.update(aud_features)
    row_dict.update(lex_dict)

    # --- 5. Metadata ---
    row_dict["dialogue_id"] = row.get("Dialogue_ID")
    row_dict["utterance_id"] = row.get("Utterance_ID")
    row_dict["speaker"] = row.get("Speaker")
    row_dict["emotion_label"] = row.get("label")
    row_dict["valence_label"] = row.get("valence_label")
    valence_map = {"positive":1, "negative":-1, "neutral":0}
    row_dict["valence_label_numeric"] = valence_map.get(row_dict["valence_label"], np.nan)
    # NOTE(review): set to True unconditionally, even when an extractor
    # returned NaN placeholders for this row.
    row_dict["feature_extraction_successful"] = True

    features_list.append(row_dict)

# --- 6. Convert to DataFrame ---
# Columns follow define_feature_schema() with its default dimensions
# (768 text, 2048 image, 13 audio, 1 lexicon).
features_df = pd.DataFrame(features_list, columns=define_feature_schema())
print("CREMA-D feature matrix shape:", features_df.shape)

UnboundLocalError: cannot access local variable 'torchvision' where it is not associated with a value

## **Stage 4: Fusion & Robustness**
---

### Imputation

In [None]:
# Define the main training pipeline function
# This function encapsulates the steps for feature extraction, imputation, training, and evaluation.

# Ensure necessary imports are available (should be covered by Stage 1, but included for clarity)
import os
import pandas as pd
import joblib
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
import matplotlib.pyplot as plt # Added for plotting
import seaborn as sns # Added for plotting


def _map_valence_predictions(predictions, known_classes, numeric_to_string):
    """Snap each numeric prediction to its nearest known class and return the class names."""
    class_values = np.asarray(known_classes, dtype=float)
    names = []
    for pred in predictions:
        nearest = class_values[np.argmin(np.abs(class_values - pred))]
        # A float key hashes/compares equal to the int keys of the lookup (1.0 == 1).
        names.append(numeric_to_string.get(float(nearest), 'Unknown'))
    return names


def _evaluate_valence_model(model, scaler, imputer, label_encoder, feature_cols,
                            test_features_df, numeric_to_string):
    """Score a fitted model on an already-extracted test feature frame; return a metrics dict or None."""
    if 'valence_label_numeric' not in test_features_df.columns:
        print("Error: 'valence_label_numeric' column not found in test_features_df. Skipping evaluation.")
        return None

    # Keep only the rows that carry a usable numeric valence label.
    y_test_raw = test_features_df['valence_label_numeric'].values
    test_valid_mask = ~pd.isna(y_test_raw)
    if not test_valid_mask.any():
        print("Warning: Test set has no rows with a valid valence label. Skipping evaluation.")
        return None

    # Same feature columns, in the same order, as were used for training.
    X_test_valid = test_features_df[feature_cols].values[test_valid_mask]
    y_test_valid = y_test_raw[test_valid_mask]
    y_test_string_labels_valid = test_features_df['valence_label'].values[test_valid_mask].tolist()

    # Re-use the statistics learned on the training data; never refit on test data.
    print("Applying Imputation on valid test features...")
    if imputer is not None:
        X_test_imputed = imputer.transform(X_test_valid)
        print("Imputation applied to test features.")
    else:
        X_test_imputed = X_test_valid

    print("Applying Scaling on imputed test features...")
    if scaler is not None:
        X_test_scaled = scaler.transform(X_test_imputed)
        print("Scaling applied to test features.")
    else:
        print("Warning: Scaler not found from training. Skipping scaling on test features.")
        X_test_scaled = X_test_imputed

    print("Making predictions on test set...")
    y_pred_valid = model.predict(X_test_scaled)

    print("\n--- Evaluation Metrics (Valence) ---")
    model_kind = type(model).__name__

    if model_kind == 'RandomForestClassifier':
        accuracy = accuracy_score(y_test_valid, y_pred_valid)
        # Weighted averages account for class imbalance across the valence classes.
        f1 = f1_score(y_test_valid, y_pred_valid, average='weighted', zero_division=0)
        precision = precision_score(y_test_valid, y_pred_valid, average='weighted', zero_division=0)
        recall = recall_score(y_test_valid, y_pred_valid, average='weighted', zero_division=0)

        print(f"Accuracy: {accuracy:.4f}")
        print(f"F1 Score (weighted): {f1:.4f}")
        print(f"Precision (weighted): {precision:.4f}")
        print(f"Recall (weighted): {recall:.4f}")

        # The encoder was fitted on the numeric classes themselves, so its
        # inverse_transform cannot produce names; translate through the
        # numeric -> string lookup instead.
        return {
            'y_true': y_test_string_labels_valid,
            'y_pred': _map_valence_predictions(y_pred_valid, label_encoder.classes_, numeric_to_string),
            'class_labels': [numeric_to_string.get(float(c), 'Unknown') for c in label_encoder.classes_],
            'accuracy': accuracy,
            'f1_weighted': f1,
            'precision_weighted': precision,
            'recall_weighted': recall,
        }

    if model_kind == 'RandomForestRegressor':
        # Imported here because the surrounding cell does not import it.
        from sklearn.metrics import mean_absolute_error

        mae = mean_absolute_error(y_test_valid, y_pred_valid)
        print(f"Mean Absolute Error: {mae:.4f}")
        return {'y_true': y_test_valid, 'y_pred': y_pred_valid, 'mae': mae}

    return None


def run_training_pipeline(args):
    """
    Runs the end-to-end training and evaluation pipeline.

    Steps: load the manifest CSV, initialise the text/image/audio extractors,
    build the feature matrix, drop rows without a numeric valence label,
    impute, train a random forest, save the artifacts and, when a global
    ``meld_test`` frame is available, evaluate on it.

    Args:
        args (ConfigArgs): An object containing configuration parameters.
                             Attributes read here: manifest, out_dir,
                             bert_model, trees, max_depth.
    Returns:
        dict: A dictionary containing the trained model package (model, scaler,
                imputer, label_encoder) or None if training failed.

    Side effects:
        Writes joblib artifacts into ``args.out_dir`` and, after a successful
        evaluation, stores the metrics in the global
        ``evaluation_results_valence``.
    """
    # Declare global variables at the beginning of the function
    global evaluation_results_valence

    print("\n--- Running Training Pipeline ---")

    # --- 1. Load Data Manifest ---
    if not os.path.exists(args.manifest):
        print(f"Error: Manifest file not found at {args.manifest}")
        return None
    try:
        manifest_df = pd.read_csv(args.manifest)
        print(f"Manifest loaded: {manifest_df.shape}")
    except Exception as e:
        print(f"Error loading manifest: {e}")
        return None

    # --- 2. Initialize Feature Extractors ---
    print("\nInitializing Feature Extractors...")
    try:
        text_extractor = TextFeatureExtractor(model_name=args.bert_model)
        image_extractor = ImageFeatureExtractor()
        audio_extractor = AudioFeatureExtractorValence()
        print("Feature extractors initialized.")
    except ImportError as e:
        print(f"Error initializing feature extractors: {e}")
        print("Ensure necessary libraries are installed (transformers, torchvision, librosa).")
        return None
    except Exception as e:
        print(f"An unexpected error occurred during extractor initialization: {e}")
        return None

    # --- 3. Build Feature Matrix ---
    print("\nBuilding Feature Matrix...")
    try:
        features_df = build_feature_matrix_valence(
            manifest_df, text_extractor, image_extractor, audio_extractor,
            image_dir=None,
            audio_dir=None,
            sentiment_lexicon=sentiment_lexicon,  # global sentiment lexicon
        )
        print(f"Feature matrix built: {features_df.shape}")
        if not features_df['feature_extraction_successful'].all():
            print("Warning: Feature extraction failed for some rows. These rows will be excluded.")
            features_df = features_df[features_df['feature_extraction_successful']].drop(columns=['feature_extraction_successful'])
            print(f"Feature matrix after filtering unsuccessful extractions: {features_df.shape}")
        else:
            features_df = features_df.drop(columns=['feature_extraction_successful'])
    except Exception as e:
        print(f"An unexpected error occurred during feature matrix building: {e}")
        return None

    # Everything that is not a label or metadata column is treated as a feature.
    label_cols = ['emotion_label', 'valence_label', 'valence_label_numeric']
    meta_cols = ['dialogue_id', 'utterance_id', 'speaker']
    non_feature_cols = label_cols + meta_cols
    feature_cols = [col for col in features_df.columns if col not in non_feature_cols]

    if 'valence_label_numeric' not in features_df.columns:
        print("Error: 'valence_label_numeric' column not found in features_df.")
        return None

    X = features_df[feature_cols].values
    y = features_df['valence_label_numeric'].values

    print(f"Feature matrix shape (X): {X.shape}")
    print(f"Label vector shape (y): {y.shape}")

    # --- Handle Invalid Labels and Label Encoding ---
    print("\nHandling labels...")
    valence_map = {"positive": 1, "negative": -1, "neutral": 0}
    # Reverse lookup used to turn numeric classes/predictions back into names.
    numeric_to_string = {number: name for name, number in valence_map.items()}

    valid_label_mask = ~pd.isna(y)
    print(f"Total samples: {len(y)}")
    print(f"Valid samples for training: {valid_label_mask.sum()}")

    if not valid_label_mask.any():
        print("Error: No samples with a valid valence label. Cannot train.")
        return None

    X_valid = X[valid_label_mask]
    y_valid = y[valid_label_mask]

    # The encoder records the numeric classes seen in training; it is part of
    # the returned/saved package.
    label_encoder = LabelEncoder()
    label_encoder.fit(np.unique(y_valid))
    print(f"Label encoder fitted on unique valid numeric labels: {label_encoder.classes_}")

    # --- Imputation ---
    # Always fit on the training features so the very same statistics can be
    # applied to the test set, even when only the test set contains NaNs.
    print("\nApplying Imputation on valid features...")
    imputer = SimpleImputer(strategy='mean')
    X_imputed = imputer.fit_transform(X_valid)
    print("Imputation applied.")

    # --- 4. Train Model ---
    print("\nTraining Model...")
    try:
        training_result = train_random_forest(X_imputed, y_valid,
                                              n_estimators=args.trees,
                                              max_depth=args.max_depth)
        model = training_result.get('model')
        scaler = training_result.get('scaler')

        if model is None:
            print("Model training failed.")
            return None

        print("Model trained successfully.")
    except Exception as e:
        print(f"An unexpected error occurred during model training: {e}")
        return None

    # --- 5. Save Model Artifacts ---
    print("\nSaving Model Artifacts...")
    os.makedirs(args.out_dir, exist_ok=True)
    try:
        model_path = os.path.join(args.out_dir, "valence_model.joblib")
        scaler_path = os.path.join(args.out_dir, "valence_scaler.joblib")
        imputer_path = os.path.join(args.out_dir, "valence_imputer.joblib")
        label_encoder_path = os.path.join(args.out_dir, "valence_label_encoder.joblib")

        joblib.dump(model, model_path)
        joblib.dump(scaler, scaler_path)
        joblib.dump(imputer, imputer_path)
        joblib.dump(label_encoder, label_encoder_path)

        print(f"Model saved to {model_path}")
        print(f"Scaler saved to {scaler_path}")
        print(f"Imputer saved to {imputer_path}")
        print(f"Label Encoder saved to {label_encoder_path}")
    except Exception as e:
        # A failed save must not prevent the evaluation below from running.
        print(f"Error saving model artifacts: {e}")

    # --- 6. Evaluate Model ---
    # Uses the test frame `meld_test` when it exists at module level.
    print("\nEvaluating Model on Test Set...")
    test_manifest = globals().get('meld_test')
    if test_manifest is None:
        print("Test set (meld_test) not found or is None. Skipping evaluation.")
    else:
        try:
            print("Building feature matrix for test set...")
            test_features_df = build_feature_matrix_valence(
                test_manifest, text_extractor, image_extractor, audio_extractor,
                image_dir=None,
                audio_dir=None,
                sentiment_lexicon=sentiment_lexicon,
            )
            print(f"Test feature matrix built: {test_features_df.shape}")

            results = _evaluate_valence_model(
                model, scaler, imputer, label_encoder, feature_cols,
                test_features_df, numeric_to_string,
            )
            if results is not None:
                # Publish the metrics before plotting so a plotting failure
                # cannot discard them.
                evaluation_results_valence = results
                if 'class_labels' in results:
                    print("\nPlotting Confusion Matrix...")
                    plot_confusion_matrix(
                        results['y_true'],
                        results['y_pred'],
                        results['class_labels'],
                        save_path=os.path.join(args.out_dir, "valence_confusion_matrix.png")
                    )
                print("------------------------------")
        except Exception as e:
            # Evaluation is best-effort: the trained package is still returned.
            print(f"An unexpected error occurred during model evaluation: {e}")

    print("\n--- Training Pipeline Complete ---")

    # Return the trained model package
    return {'model': model, 'scaler': scaler, 'imputer': imputer, 'label_encoder': label_encoder}

print("run_training_pipeline function defined.")


run_training_pipeline function defined.


In [None]:
# This block contains the configuration and execution part of cell Vn1D5VmzP0BY.
# It assumes the run_training_pipeline function is already defined in the previous cell output.

# Define a simple class to hold the arguments (configuration)
class ConfigArgs:
    """Plain attribute container for the training pipeline's settings.

    Attributes:
        manifest: Path of the manifest CSV to train from (required).
        out_dir: Directory that receives the saved model artifacts.
        bert_model: Name of the text-encoder model.
        tda: Flag recording whether TDA features were requested.
        trees: Number of trees for the random forest.
        max_depth: Maximum tree depth; ``None`` leaves it unlimited.
    """
    def __init__(self, manifest, out_dir="./models", bert_model=DEFAULT_BERT, tda=False, trees=300, max_depth=None):
        # Input and output locations
        self.manifest, self.out_dir = manifest, out_dir
        # Feature-extraction options
        self.bert_model, self.tda = bert_model, tda
        # Random-forest hyperparameters
        self.trees, self.max_depth = trees, max_depth

# --- Configuration and execution ---
# run_training_pipeline reads its manifest from disk, so the in-memory MELD
# training frame (`meld_train`) is exported to a CSV first. Columns are renamed
# to the names build_feature_matrix_valence is expected to read.
dummy_manifest_path = "/content/meld_train_manifest_full.csv"

if globals().get('meld_train') is None:
    # Nothing to export: either the frame was never loaded or it is None.
    print("Warning: meld_train DataFrame not found or is None. Cannot create dummy manifest.")
    dummy_manifest_path = None
else:
    # Take whichever of the wanted MELD columns are actually present.
    required_meld_cols = ['Utterance', 'Emotion', 'Sentiment', 'Dialogue_ID', 'Utterance_ID', 'Speaker']
    available_meld_cols = [name for name in required_meld_cols if name in meld_train.columns]
    manifest_df = meld_train[available_meld_cols].copy()

    # MELD naming -> manifest naming (only for columns that were copied over).
    rename_map = {'Utterance': 'text', 'Emotion': 'label', 'Sentiment': 'valence_label'}
    cols_to_rename = {old: new for old, new in rename_map.items() if old in manifest_df.columns}
    manifest_df = manifest_df.rename(columns=cols_to_rename)

    # Pad the manifest with empty columns so every expected name exists.
    expected_manifest_cols = ['text', 'label', 'valence_label', 'Dialogue_ID', 'Utterance_ID', 'Speaker', 'image_filename', 'audio_filename']
    for col in expected_manifest_cols:
        if col in manifest_df.columns:
            continue
        manifest_df[col] = None

    try:
        manifest_df.to_csv(dummy_manifest_path, index=False)
    except Exception as e:
        print(f"Error saving dummy manifest to {dummy_manifest_path}: {e}. Cannot proceed.")
        dummy_manifest_path = None  # signals the run below to be skipped
    else:
        print(f"Created/Updated dummy manifest at: {dummy_manifest_path}")


# The pipeline only runs when the manifest was written successfully.
if dummy_manifest_path is None:
    print("\n--- Training and Evaluation pipeline execution skipped ---")
    print("Reason: Dummy manifest file could not be created.")
    print("------------------------------------------")
else:
    # Default the TDA availability flag to False when no earlier cell set it.
    _HAS_TDA = globals().get('_HAS_TDA', False)

    args_valence_train = ConfigArgs(
        manifest=dummy_manifest_path,   # manifest CSV written above
        out_dir="./models_valence",     # where the model artifacts are saved
        bert_model=DEFAULT_BERT,        # default text-encoder model
        tda=_HAS_TDA,                   # request TDA features only if available
        trees=300,                      # random forest size
        max_depth=None                  # no depth limit
    )

    print("\n--- Running training pipeline with configuration: ---")
    print(f"  Manifest: {args_valence_train.manifest}")
    print(f"  Output Directory: {args_valence_train.out_dir}")
    print(f"  BERT Model: {args_valence_train.bert_model}")
    print(f"  Use TDA: {args_valence_train.tda}")
    print(f"  Random Forest Trees: {args_valence_train.trees}")
    print(f"  Random Forest Max Depth: {args_valence_train.max_depth}")
    print("-------------------------------------------------")

    # Training and evaluation both happen inside run_training_pipeline; it
    # returns the model package, or None when training failed.
    print("\nCalling run_training_pipeline...")
    trained_model_package = run_training_pipeline(args_valence_train)

    print("\n--- Training and Evaluation pipeline execution complete. ---")
    if trained_model_package and trained_model_package.get('model') is not None:
        print("Model trained and evaluated successfully.")
    else:
        print("Model training failed. Evaluation skipped.")

Created/Updated dummy manifest at: /content/meld_train_manifest_full.csv

--- Running training pipeline with configuration: ---
  Manifest: /content/meld_train_manifest_full.csv
  Output Directory: ./models_valence
  BERT Model: sentence-transformers/all-MiniLM-L6-v2
  Use TDA: False
  Random Forest Trees: 300
  Random Forest Max Depth: None
-------------------------------------------------

Calling run_training_pipeline...

--- Running Training Pipeline ---
Manifest loaded: (9989, 8)

Initializing Feature Extractors...
An unexpected error occurred during extractor initialization: cannot access local variable 'torchvision' where it is not associated with a value

--- Training and Evaluation pipeline execution complete. ---
Model training failed. Evaluation skipped.


In [None]:
# This block contains the configuration and execution part of cell Vn1D5VmzP0BY.
# It assumes the run_training_pipeline function is already defined in the previous cell output.

# Define a simple class to hold the arguments (configuration)
class ConfigArgs:
    """Plain attribute container for the training pipeline's settings.

    Attributes:
        manifest: Path of the manifest CSV to train from (required).
        out_dir: Directory that receives the saved model artifacts.
        bert_model: Name of the text-encoder model.
        tda: Flag recording whether TDA features were requested.
        trees: Number of trees for the random forest.
        max_depth: Maximum tree depth; ``None`` leaves it unlimited.
    """
    def __init__(self, manifest, out_dir="./models", bert_model=DEFAULT_BERT, tda=False, trees=300, max_depth=None):
        # Input and output locations
        self.manifest, self.out_dir = manifest, out_dir
        # Feature-extraction options
        self.bert_model, self.tda = bert_model, tda
        # Random-forest hyperparameters
        self.trees, self.max_depth = trees, max_depth

# --- Configuration and execution (TDA disabled) ---
# run_training_pipeline reads its manifest from disk, so the in-memory MELD
# training frame (`meld_train`) is exported to a CSV first. Columns are renamed
# to the names build_feature_matrix_valence is expected to read.
dummy_manifest_path = "/content/meld_train_manifest_full.csv"

if globals().get('meld_train') is None:
    # Nothing to export: either the frame was never loaded or it is None.
    print("Warning: meld_train DataFrame not found or is None. Cannot create dummy manifest.")
    dummy_manifest_path = None
else:
    # Take whichever of the wanted MELD columns are actually present.
    required_meld_cols = ['Utterance', 'Emotion', 'Sentiment', 'Dialogue_ID', 'Utterance_ID', 'Speaker']
    available_meld_cols = [name for name in required_meld_cols if name in meld_train.columns]
    manifest_df = meld_train[available_meld_cols].copy()

    # MELD naming -> manifest naming (only for columns that were copied over).
    rename_map = {'Utterance': 'text', 'Emotion': 'label', 'Sentiment': 'valence_label'}
    cols_to_rename = {old: new for old, new in rename_map.items() if old in manifest_df.columns}
    manifest_df = manifest_df.rename(columns=cols_to_rename)

    # Pad the manifest with empty columns so every expected name exists.
    expected_manifest_cols = ['text', 'label', 'valence_label', 'Dialogue_ID', 'Utterance_ID', 'Speaker', 'image_filename', 'audio_filename']
    for col in expected_manifest_cols:
        if col in manifest_df.columns:
            continue
        manifest_df[col] = None

    try:
        manifest_df.to_csv(dummy_manifest_path, index=False)
    except Exception as e:
        print(f"Error saving dummy manifest to {dummy_manifest_path}: {e}. Cannot proceed.")
        dummy_manifest_path = None  # signals the run below to be skipped
    else:
        print(f"Created/Updated dummy manifest at: {dummy_manifest_path}")


# The pipeline only runs when the manifest was written successfully.
if dummy_manifest_path is None:
    print("\n--- Training and Evaluation pipeline execution skipped ---")
    print("Reason: Dummy manifest file could not be created.")
    print("------------------------------------------")
else:
    args_valence_train = ConfigArgs(
        manifest=dummy_manifest_path,   # manifest CSV written above
        out_dir="./models_valence",     # where the model artifacts are saved
        bert_model=DEFAULT_BERT,        # default text-encoder model
        tda=False,                      # TDA features are switched off in this run
        trees=300,                      # random forest size
        max_depth=None                  # no depth limit
    )

    print("\n--- Running training pipeline with configuration: ---")
    print(f"  Manifest: {args_valence_train.manifest}")
    print(f"  Output Directory: {args_valence_train.out_dir}")
    print(f"  BERT Model: {args_valence_train.bert_model}")
    print(f"  Use TDA: {args_valence_train.tda}")
    print(f"  Random Forest Trees: {args_valence_train.trees}")
    print(f"  Random Forest Max Depth: {args_valence_train.max_depth}")
    print("-------------------------------------------------")

    # Training and evaluation both happen inside run_training_pipeline; it
    # returns the model package, or None when training failed.
    print("\nCalling run_training_pipeline...")
    trained_model_package = run_training_pipeline(args_valence_train)

    print("\n--- Training and Evaluation pipeline execution complete. ---")
    if trained_model_package and trained_model_package.get('model') is not None:
        print("Model trained and evaluated successfully.")
    else:
        print("Model training failed. Evaluation skipped.")


In [None]:
# Cell USeDlns18i4Z: Stage 5 - Imputation Function

# Ensure necessary imports are available (should be covered by Stage 1)
# import numpy as np
# from sklearn.impute import SimpleImputer
# try: from missingpy import MissForest # Import if available
# except ImportError: pass # Handle import error

def impute_missing(X):
    """
    Fill in the missing entries of feature matrix X and return the filled matrix.

    MissForest is tried first when the module-level flag `_HAS_MISSFOREST`
    exists and is truthy. If the flag is absent or falsy, or if MissForest
    raises, a mean-strategy SimpleImputer is used instead.

    The imputer is fitted on X itself and is not handed back to the caller.
    """
    print("Applying imputation...")

    # A flag that was never defined is treated the same as "MissForest unavailable".
    if globals().get('_HAS_MISSFOREST'):
        try:
            forest_filled = MissForest().fit_transform(X)
            print("✓ MissForest imputation applied.")
            return forest_filled
        except Exception as e:
            # Any MissForest failure drops through to the mean imputation below.
            print(f"Error using MissForest: {e}. Falling back to SimpleImputer.")

    # Shared path: MissForest not available, or it failed above.
    mean_filled = SimpleImputer(strategy='mean').fit_transform(X)
    print("✓ SimpleImputer (mean) applied.")
    return mean_filled

print("\nStage 5: Imputation function defined.")


Stage 5: Imputation function defined.


###  Training

In [None]:
# Cell JclnoRXSR-uu: Stage 5 - Training Function

# Ensure necessary imports are available (should be covered by Stage 1)
# import numpy as np, pandas as pd
# from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
# from sklearn.preprocessing import StandardScaler
# from sklearn.impute import SimpleImputer # Only needed if fallback imputation is here

def train_random_forest(X, y, n_estimators=100, max_depth=None):
    """
    Fit a Random Forest on the labelled rows of X, after fallback imputation and scaling.

    Rows whose label is missing are dropped. When the remaining labels are
    numeric and all whole numbers they are cast to int and a
    RandomForestClassifier is trained; otherwise a RandomForestRegressor is.

    Args:
        X (np.ndarray): Feature matrix (may contain NaNs).
        y (np.ndarray): Target labels (numerical, may contain NaNs).
        n_estimators (int): Number of trees.
        max_depth (int or None): Maximum tree depth.

    Returns:
        dict: Keys 'model', 'scaler' and 'mask'. 'mask' flags the rows that
        had a label. 'model' and 'scaler' are None when no row had a label.
        The fallback imputer fitted inside this function is NOT returned, so
        callers that need a reusable imputer must impute before calling.
    """
    print("Starting Random Forest training...")

    # --- 1. Keep only the rows that actually have a label ---
    mask = ~pd.isna(y)
    if mask.sum() == 0:
        print("No valid labels found. Cannot train model.")
        return {'model': None, 'scaler': None, 'mask': mask}

    X_train = X[mask]
    y_train = y[mask]

    # --- 2. Classification vs. regression ---
    # Whole-number numeric labels are treated as class ids; anything else as a
    # continuous target. `and` keeps the modulo test from running on
    # non-numeric dtypes.
    labels_numeric = np.issubdtype(y_train.dtype, np.number)
    is_classification = labels_numeric and np.all(
        np.mod(y_train[~np.isnan(y_train)], 1) == 0
    )
    if is_classification:
        y_train = y_train.astype(int)
        print("Training a RandomForestClassifier.")
    else:
        print("Training a RandomForestRegressor.")

    # --- 3. Safety-net imputation ---
    # Features are expected to arrive already imputed; this only triggers if
    # NaNs slipped through. The imputer used here is discarded afterwards.
    if np.isnan(X_train).any():
        print("Warning: NaNs found in training features before model training. Applying SimpleImputer (mean).")
        X_train_imp = SimpleImputer(strategy='mean').fit_transform(X_train)
    else:
        X_train_imp = X_train

    # --- 4. Standardise features ---
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train_imp)
    print("Features scaled.")

    # --- 5. Build and fit the forest ---
    forest_cls = RandomForestClassifier if is_classification else RandomForestRegressor
    model = forest_cls(
        n_estimators=n_estimators,
        max_depth=max_depth,
        random_state=42,
        n_jobs=-1,
    )

    print(f"Training model with {n_estimators} trees and max_depth={max_depth}...")
    model.fit(X_train_scaled, y_train)
    print("Model training complete.")

    return {'model': model, 'scaler': scaler, 'mask': mask}


print("Stage 5: Training function defined.")

Stage 5: Training function defined.


### CLI & Orchestration

In [None]:
# Cell Vn1D5VmzP0BY: Stage 5: Fusion & Robustness (Orchestration - Configuration)

# This cell defines the configuration class for the pipeline.

# Ensure necessary imports are available (should be covered by Stage 1, but included for clarity)
import os
import pandas as pd
import joblib
import numpy as np
from sklearn.preprocessing import LabelEncoder
import argparse # Keep import if you want to potentially use argparse structure elsewhere


# Define a simple class to hold the arguments (configuration)
class ConfigArgs:
    """Plain attribute holder for the pipeline's configuration.

    Every constructor argument is stored unchanged on the instance under the
    same name.
    """
    def __init__(self, manifest, out_dir="./models", bert_model=DEFAULT_BERT, tda=False, trees=300, max_depth=None):
        # Manifest path and output directory.
        self.manifest, self.out_dir = manifest, out_dir
        # BERT model identifier and TDA on/off switch.
        self.bert_model, self.tda = bert_model, tda
        # Random Forest settings: tree count and depth limit.
        self.trees, self.max_depth = trees, max_depth

# Manifest creation and the call to run_training_pipeline live in a separate
# cell, which is executed after run_training_pipeline has been defined.

print("Configuration class ConfigArgs defined.")

Configuration class ConfigArgs defined.


## **Stage 5: Random Forest Heads**
* * *

In [None]:
# Cell 8e08cead: Stage 6 - Model Evaluation

# This cell is for evaluating the trained model on the test set.

# Ensure necessary imports are available (should be covered by Stage 1)
# import pandas as pd, numpy as np, os, joblib
# from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
# Ensure build_feature_matrix_valence, preprocess_image, preprocess_audio,
# TextFeatureExtractor, ImageFeatureExtractor, AudioFeatureExtractorValence,
# sentiment_lexicon, plot_confusion_matrix are defined globally.
# Ensure meld_test DataFrame is loaded globally.


# Module-level slot for evaluation results (e.g., for plotting the confusion matrix).
# It is initialised to None here; nothing in this cell assigns it a value.
evaluation_results_valence = None

# The evaluation logic is now integrated into the run_training_pipeline function
# in the Orchestration cell (Vn1D5VmzP0BY). This cell now primarily serves
# as a placeholder for the evaluation stage and the global results variable.

print("Stage 6: Model Evaluation stage defined.")

Stage 6: Model Evaluation stage defined.


## **Stage 7: Outputs**
---
*   Polarity Mapping (Positive/Neutral/Negative)
*   Explainability (SHAP/LIME, Confusion Matrices, Persistence Diagrams)



### *Confusion Matrix*

In [None]:
# Cell c63e159c: Stage 7 - Outputs (Confusion Matrix Plotting)

# This cell contains the function to plot the confusion matrix.

# Ensure necessary imports are available (should be covered by Stage 1)
# import matplotlib.pyplot as plt, seaborn as sns, pandas as pd, numpy as np, os
# from sklearn.metrics import confusion_matrix
# Ensure evaluation_results_valence is available globally if used outside the function

def plot_confusion_matrix(y_true, y_pred, class_labels, save_path=None):
    """
    Compute the confusion matrix for y_true vs. y_pred and draw it as a heatmap.

    The matrix is built over the full list of classes in `class_labels`
    whenever the label encoding can be identified, so a class that is absent
    from both `y_true` and `y_pred` still gets its (all-zero) row and column
    and the axis ticks stay aligned with `class_labels`.

    Args:
        y_true (np.ndarray): True labels (numerical or string, consistent with y_pred).
        y_pred (np.ndarray): Predicted labels (numerical or string, consistent with y_true).
        class_labels (list or np.ndarray): Class names used for the axis ticks.
            If the labels are integer codes, code i must correspond to
            class_labels[i].
        save_path (str, optional): Path to save the plot. If None, the plot is displayed.
                                    Defaults to None.

    Returns:
        None. Problems are reported with print() instead of being raised.
    """
    print("\n--- Confusion Matrix ---")

    # Guard clauses: nothing can be plotted without all three inputs.
    if y_true is None or y_pred is None or class_labels is None:
        print("Error: True labels, predicted labels, or class labels are missing.")
        return
    if len(y_true) == 0 or len(y_pred) == 0 or len(y_true) != len(y_pred):
        print("Error: True and predicted labels are empty or have inconsistent lengths.")
        return

    try:
        class_labels = list(class_labels)
        n_classes = len(class_labels)

        # Work out how the labels are encoded so the matrix order can be
        # pinned. Without an explicit `labels=`, sklearn only uses the values
        # that actually occur (sorted), which yields a matrix of the wrong
        # size/order whenever a class is missing from this particular split.
        observed = set(np.asarray(y_true).tolist()) | set(np.asarray(y_pred).tolist())
        if observed <= set(class_labels):
            # Labels are given as the class values themselves.
            cm_labels = class_labels
        elif observed <= set(range(n_classes)):
            # Labels are integer codes 0..n-1 indexing into class_labels.
            cm_labels = list(range(n_classes))
        else:
            # Unrecognised encoding: let sklearn infer the labels as before.
            cm_labels = None

        cm = confusion_matrix(y_true, y_pred, labels=cm_labels)

        # Wrap in a DataFrame so the heatmap shows class names on both axes.
        cm_df = pd.DataFrame(cm, index=class_labels, columns=class_labels)

        plt.figure(figsize=(10, 8))
        sns.heatmap(cm_df, annot=True, fmt='d', cmap='Blues')
        plt.title('Confusion Matrix')
        plt.xlabel('Predicted Label')
        plt.ylabel('True Label')

        # Save or display the plot
        if save_path:
            try:
                # Create the target directory if the path has one.
                save_dir = os.path.dirname(save_path)
                if save_dir:
                    os.makedirs(save_dir, exist_ok=True)
                plt.savefig(save_path, bbox_inches='tight')
                print(f"Saved confusion matrix plot to {save_path}")
            except Exception as e:
                print(f"Error saving confusion matrix plot to {save_path}: {e}")
                plt.show() # Display if saving fails
        else:
            plt.show()

        print("--------------------------")

    except ValueError as e:
        print(f"Error computing confusion matrix: {e}")
        print("This might be due to inconsistent labels between true values and predictions, or prediction issues.")
    except Exception as e:
        print(f"An unexpected error occurred during confusion matrix plotting: {e}")

# Note: The plotting function is defined here, but it is called from
# the run_training_pipeline function in the Orchestration cell (Vn1D5VmzP0BY)
# after evaluation is complete.

print("\nStage 7: Confusion Matrix plotting function defined.")


Stage 7: Confusion Matrix plotting function defined.


---