<a href="https://colab.research.google.com/github/Valiyantt/Thesis-Multimodal-Sentiment-Analysis/blob/main/Thesis%3A%20Multimodal%20Sentiment%20Analysis%20Prototype.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# A Multimodal Sentiment Analysis Pipeline Incorporating Valence Detection and Topological Data Analysis - Model Prototype   
---
Authors/Researchers:
1. Alverio, Franz Tovie G.
2. Almarinez, Lucky Richmon C.
3. Jamilano, Kyla Celine L.

# Prototype
----


### Import Libraries

In [None]:
import os
import argparse
import json
import numpy as np
import pandas as pd
from tqdm import tqdm
from pathlib import Path
import joblib

# ML / DL
import torch
import torch.nn as nn
from torchvision import models, transforms
from PIL import Image

# ML / DL
import torch
import torch.nn as nn
from torchvision import models, transforms
from PIL import Image

# Text encoders
try:
    from sentence_transformers import SentenceTransformer
    _HAS_SBERT = True
except Exception:
    _HAS_SBERT = False
    from transformers import AutoTokenizer, AutoModel
    _HAS_TRANSFORMERS = True

# Audio
import librosa

# TDA (optional)
try:
    from persim import PersImage
    from ripser import ripser
    _HAS_TDA = True
except Exception:
    _HAS_TDA = False

# Imputation
try:
    from missingpy import MissForest
    _HAS_MISSFOREST = True
except Exception:
    _HAS_MISSFOREST = False
    from sklearn.impute import SimpleImputer

# Classifier + metrics
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import StratifiedKFold, train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import StandardScaler

# Explainability optional
try:
    import shap
    _HAS_SHAP = True
except Exception:
    _HAS_SHAP = False

RuntimeError: duplicate registrations for aten.linspace.Tensor_Tensor

### Import Dataset

In [None]:
from google.colab import files
uploaded = files.upload()

# Load the uploaded data into a DataFrame
import pandas as pd
import io

# File name the dataset is expected to have; also used as the on-disk fallback.
DATASET_FILENAME = "Official Tourism Dataset (Synthetic).csv"

if DATASET_FILENAME in uploaded:
    # Read straight from the uploaded bytes instead of relying on the copy on disk.
    df = pd.read_csv(io.BytesIO(uploaded[DATASET_FILENAME]))
elif uploaded:
    # The upload arrived under another name (e.g. a re-upload saved with a
    # suffixed name), so read what was actually uploaded rather than a stale file.
    uploaded_name = next(iter(uploaded))
    print(f"Expected '{DATASET_FILENAME}', loading uploaded file '{uploaded_name}' instead")
    df = pd.read_csv(io.BytesIO(uploaded[uploaded_name]))
else:
    # Nothing was uploaded in this run: fall back to a copy already on disk.
    df = pd.read_csv(DATASET_FILENAME)
df.head()

Saving Official Tourism Dataset (Synthetic).csv to Official Tourism Dataset (Synthetic).csv


Unnamed: 0,ID,Platform,URL,Place/Topic,Date Posted,Username,Comment/Review Text,Language,Location,Likes,Replies,Rating,Sentiment,Emotion,Notes,Is Synthetic?,Preprocessing Notes,Media Metadata,Timestamp Metadata
0,SYNTH-00001,YouTube,http://keith.biz/,El Nido,2023-10-24,lovetara,There film should amount raise boy question ba...,English,Davao,375,90,5.0,Neutral,Joy,,Yes,Raw Text,Group Name: Boracay Lovers,12:45
1,SYNTH-00002,Facebook,https://stewart.com/,Baguio,2023-09-30,lesliewilson,Science treat ask development probably media s...,Filipino,Quezon City,69,96,5.0,Neutral,Surprise,Them impact ago evening wide thus.,Yes,Raw Text,Channel: TravelPH,12:45
2,SYNTH-00003,Facebook,http://www.myers.com/,Baguio,2020-01-11,ihall,Manager cause become key among player Mrs water.,Cebuano,Quezon City,699,194,4.0,Neutral,Fear,,Yes,"Tokenized, Lemmatized",Video Title: Amazing Sunset,12:45
3,SYNTH-00004,Google,https://www.gates.info/,El Nido,2025-01-26,debrarichardson,Meet fly school but mind course wife involve m...,English,Cebu,828,82,5.0,Positive,Sadness,,Yes,"Tokenized, Lemmatized",Video Title: Amazing Sunset,05:23
4,SYNTH-00005,Google,https://www.mills.com/,"San Juan, La Union",2024-08-07,gwilson,Easy authority audience involve itself around.,Cebuano,Quezon City,247,19,4.0,Neutral,Fear,,Yes,"Tokenized, Lemmatized",Video Title: Amazing Sunset,12:45


## Data Cleaning and Preprocessing

In [None]:
def preprocess_text(text):
    """Normalise one raw text value for the text encoder.

    Missing values (anything ``pd.isna`` reports as missing) become the empty
    string. Every other value is converted with ``str``, stripped of leading
    and trailing whitespace, and lower-cased.
    """
    # Basic preprocessing only (extend as needed).
    return "" if pd.isna(text) else str(text).strip().lower()

def preprocess_image(image_path, image_transform):
    """Load an image from disk, convert it to RGB and apply ``image_transform``.

    Parameters
    ----------
    image_path : path of the image file; anything that is not a non-empty
        string pointing at an existing path yields ``None``.
    image_transform : callable applied to the RGB image; its result is returned.

    Returns
    -------
    The transformed image, or ``None`` when the path is unusable or loading /
    transforming fails (the error is printed, not raised).
    """
    if not isinstance(image_path, str) or not image_path or not os.path.exists(image_path):
        return None
    try:
        # The context manager closes the underlying file handle once the pixel
        # data has been converted, instead of leaving that to garbage collection.
        with Image.open(image_path) as img:
            rgb_img = img.convert('RGB')
        return image_transform(rgb_img)
    except Exception as e:
        # Best effort: a broken image must not abort feature extraction.
        print(f"Image load error {image_path}: {e}")
        return None

def preprocess_audio(audio_path, sr=16000, duration=None):
    """Load a mono audio file and apply a pre-emphasis filter.

    Parameters
    ----------
    audio_path : path of the audio file; anything that is not a non-empty
        string pointing at an existing path yields ``(None, None)``.
    sr : sampling rate passed to ``librosa.load``.
    duration : forwarded to ``librosa.load``.

    Returns
    -------
    (y, sr) : the filtered signal and the sampling rate reported by
        ``librosa.load``, or ``(None, None)`` when the file is unusable,
        empty, or loading fails (the error is printed, not raised).
    """
    if not isinstance(audio_path, str) or not audio_path or not os.path.exists(audio_path):
        return None, None
    try:
        y, loaded_sr = librosa.load(audio_path, sr=sr, mono=True, duration=duration)
        if len(y) == 0:
            # No samples: there is nothing to filter or describe.
            print(f"Audio load error {audio_path}: empty signal")
            return None, None
        # simple denoise placeholder: pre-emphasis  y[t] - 0.97 * y[t-1]
        y = np.append(y[0], y[1:] - 0.97 * y[:-1])
        # Return the rate the loader reports rather than the requested one, so
        # that sr=None does not hand ``None`` to the feature extractor.
        return y, loaded_sr
    except Exception as e:
        # Best effort: a broken audio file must not abort feature extraction.
        print(f"Audio load error {audio_path}: {e}")
        return None, None

Unnamed: 0,Comment/Review Text,cleaned_text
0,There film should amount raise boy question ba...,there film should amount raise boy question ba...
1,Science treat ask development probably media s...,science treat ask development probably media s...
2,Manager cause become key among player Mrs water.,manager cause become key among player mrs water
3,Meet fly school but mind course wife involve m...,meet fly school but mind course wife involve m...
4,Easy authority audience involve itself around.,easy authority audience involve itself around


## Feature Extractors
---

In [None]:
class ImageFeatureExtractor:
    """Image embedding extractor built on a torchvision ResNet-50.

    The network is used without its last child module (the classifier), in
    eval mode, on ``device``. ``transform`` holds the preprocessing applied to
    images before they are passed to ``extract``.
    """

    def __init__(self, model_name=DEFAULT_RESNET, device=DEVICE):
        self.device = device
        if model_name != "resnet50":
            raise ValueError("Only resnet50 supported in this template")

        backbone = models.resnet50(pretrained=True)
        # Keep every child module except the last one (remove classifier).
        headless = nn.Sequential(*list(backbone.children())[:-1])
        headless.eval().to(self.device)
        self.model = headless
        self.out_dim = 2048

        # Resize -> tensor -> per-channel normalisation.
        preprocessing_steps = [
            transforms.Resize((224,224)),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485,0.456,0.406],
                                 std=[0.229,0.224,0.225]),
        ]
        self.transform = transforms.Compose(preprocessing_steps)

    def extract(self, pil_tensor):
        """Return the embedding of one preprocessed image as a 1-D numpy array.

        ``pil_tensor`` is presumably the output of ``self.transform`` for a
        single image (TODO confirm with callers); ``None`` is passed through.
        """
        if pil_tensor is None:
            return None
        with torch.no_grad():
            # Add a batch axis and move the input to the model's device.
            batch = pil_tensor.unsqueeze(0).to(self.device)
            pooled = self.model(batch)  # (1,2048,1,1)
            flattened = pooled.view(pooled.size(0), -1)
            return flattened.cpu().numpy()[0]

class TextFeatureExtractor:
    """Sentence embedding extractor.

    Uses a SentenceTransformer model when ``_HAS_SBERT`` is true; otherwise
    falls back to the ``distilbert-base-uncased`` tokenizer/model pair from
    transformers (``model_name`` is not used by the fallback).
    """

    def __init__(self, model_name=DEFAULT_BERT, device=DEVICE):
        self.device = device
        if not _HAS_SBERT:
            # fallback to transformers
            self.tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
            self.model = AutoModel.from_pretrained("distilbert-base-uncased").to(self.device)
            self.is_sbert = False
            self.out_dim = 768
            return

        self.model = SentenceTransformer(model_name, device=device)
        self.encode = self.model.encode
        self.is_sbert = True
        self.out_dim = self.model.get_sentence_embedding_dimension()

    def extract(self, text):
        """Return the embedding of ``text``, or ``None`` for ``None`` / empty text."""
        if text is None or text=="":
            return None

        if self.is_sbert:
            return self.encode(text, convert_to_numpy=True)

        # Fallback path: embedding is the hidden state at sequence position 0.
        encoded = self.tokenizer(text, truncation=True, padding=True, return_tensors='pt').to(self.device)
        with torch.no_grad():
            hidden = self.model(**encoded).last_hidden_state
            return hidden[:,0,:].cpu().numpy()[0]

class AudioFeatureExtractor:
    """Hand-crafted audio descriptor: MFCC statistics, mean pitch and mean RMS energy."""

    def __init__(self, n_mfcc=13):
        # Number of MFCC coefficients; the output has 2 * n_mfcc + 2 entries.
        self.n_mfcc = n_mfcc

    def extract(self, y, sr):
        """Describe one audio signal.

        Parameters
        ----------
        y : 1-D audio signal, or ``None``.
        sr : sampling rate of ``y``.

        Returns
        -------
        1-D array ``[mfcc means, mfcc stds, mean f0, mean RMS]``, or ``None``
        when ``y`` is ``None`` or contains no samples.
        """
        if y is None or len(y) == 0:
            return None
        # MFCC
        mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=self.n_mfcc)
        # aggregate statistics across time
        mfcc_mean = np.mean(mfcc, axis=1)
        mfcc_std = np.std(mfcc, axis=1)
        # Pitch via librosa.yin (if length sufficient)
        try:
            # Pass the real sampling rate: without it yin falls back to its own
            # default rate and the estimated frequencies are mis-scaled.
            f0 = librosa.yin(y, fmin=50, fmax=500, sr=sr)
            voiced = f0[np.isfinite(f0)]
            f0_mean = np.mean(voiced) if voiced.size > 0 else 0.0
        except Exception:
            # Best effort: pitch is optional, fall back to 0.0 when it cannot be estimated.
            f0_mean = 0.0
        energy = np.mean(librosa.feature.rms(y=y))
        feat = np.concatenate([mfcc_mean, mfcc_std, [f0_mean, energy]])
        return feat

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

(10000, 768)

## Optional TDA (persistence images)

In [None]:
def compute_persistence_image_from_pointcloud(X, persim_obj=None, maxdim=1):
    """Turn a point cloud into a flattened persistence image.

    Parameters
    ----------
    X : (n_points, dim) array handed to ``ripser``.
    persim_obj : persistence-image transformer; a 20x20 ``PersImage`` with
        spread 0.1 is created when omitted.
    maxdim : highest homology dimension requested from ``ripser``.

    Returns
    -------
    1-D image vector, an all-zero vector when no finite persistence pair
    exists, or ``None`` when TDA is unavailable or the computation fails
    (the error is printed, not raised).
    """
    if not _HAS_TDA:
        return None
    try:
        dgms = ripser(X, maxdim=maxdim)['dgms']
        # Combine the diagrams of all requested dimensions (H0, H1, ...).
        non_empty = [d for d in dgms if d.size > 0]
        dgm = np.vstack(non_empty) if non_empty else np.empty((0, 2))
        # Drop bars that never die (infinite death): they cannot be placed on
        # a finite image grid and would poison the image with inf/NaN values.
        dgm = dgm[np.isfinite(dgm).all(axis=1)]
        if persim_obj is None:
            persim_obj = PersImage(pixels=[20,20], spread=0.1)
        if dgm.shape[0] == 0:
            # Size the zero vector like the images of ``persim_obj`` when it
            # exposes its grid size; otherwise keep the 20x20 default.
            n_pixels = getattr(persim_obj, 'nx', 20) * getattr(persim_obj, 'ny', 20)
            return np.zeros((n_pixels,))
        img = persim_obj.transform(dgm)
        return np.asarray(img).flatten()
    except Exception as e:
        # Best effort: TDA features are optional.
        print("TDA error:", e)
        return None

(10000, 10000)

## Imputation

In [None]:
def impute_missing(X):
    """Fill the missing entries of ``X`` and return the imputed matrix.

    A ``MissForest`` imputer is used when ``_HAS_MISSFOREST`` is true;
    otherwise a ``SimpleImputer`` with the ``mean`` strategy. The imputer is
    fitted on ``X`` itself.
    """
    imputer = MissForest() if _HAS_MISSFOREST else SimpleImputer(strategy='mean')
    return imputer.fit_transform(X)

## Fusion + Training

In [None]:
def _load_row_text(row):
    """Return the preprocessed text of one manifest row, or None when the row has no text source."""
    if 'text' in row and pd.notna(row['text']):
        return preprocess_text(row['text'])
    if 'text_path' in row and pd.notna(row['text_path']):
        try:
            with open(row['text_path'], 'r', encoding='utf-8') as f:
                return preprocess_text(f.read())
        except Exception as e:
            # Best effort: report the unreadable file and treat its text as empty.
            print(f"Text load error {row['text_path']}: {e}")
            return ""
    return None


def _row_tda_vector(vectors, persim_obj):
    """Build a point cloud from the available modality vectors and return its persistence image (or None)."""
    points = [np.asarray(v, dtype=float).reshape(-1, 1) for v in vectors if v is not None]
    if not points:
        return None
    try:
        # Every feature value is one point on the real line: shape (n_points, 1).
        # (Transposing this would collapse the cloud into a single point, whose
        # persistence diagram is trivial.)
        point_cloud = np.concatenate(points, axis=0)
        return compute_persistence_image_from_pointcloud(point_cloud, persim_obj=persim_obj)
    except Exception as e:
        # Best effort: TDA features are optional.
        print("TDA feature error:", e)
        return None


def _pack_feature_slots(feats):
    """Lay per-sample slot vectors out in one 2-D array, with NaN wherever a slot is missing or short."""
    n_slots = len(feats[0])
    # Width of each slot = longest vector seen for that slot (0 if never present).
    slot_dims = [
        max((parts[i].shape[0] for parts in feats if parts[i] is not None), default=0)
        for i in range(n_slots)
    ]
    X = np.full((len(feats), sum(slot_dims)), np.nan)
    for idx, parts in enumerate(feats):
        offset = 0
        for vec, dim in zip(parts, slot_dims):
            if vec is not None:
                X[idx, offset:offset + len(vec)] = vec
            offset += dim
    return X


def build_feature_matrix(df, text_ext, img_ext, aud_ext, tda=False, persim_obj=None):
    """
    Extract and fuse the per-modality features of every manifest row.

    df: manifest dataframe with columns id, text (or text_path), image_path, audio_path, label
    text_ext / img_ext / aud_ext: extractors exposing ``extract`` (``img_ext``
        also exposes ``transform``)
    tda: when true, append a persistence-image slot computed from the
        extracted vectors of the row
    persim_obj: forwarded to compute_persistence_image_from_pointcloud

    Returns X (n_samples, n_features), y (labels), meta list

    X keeps a fixed slot order (text, image, audio, tda); slots a row does not
    have are NaN so they can be imputed later. An empty manifest yields an
    array of shape (0, 0), an empty label array and an empty meta list.
    """
    feats = []
    labels = []
    meta = []
    for _, row in tqdm(df.iterrows(), total=len(df)):
        # TEXT
        text = _load_row_text(row)
        text_feat = text_ext.extract(text) if text is not None else None

        # IMAGE
        img_raw = None
        if 'image_path' in row and pd.notna(row['image_path']):
            img_raw = preprocess_image(row['image_path'], img_ext.transform)
        img_feat = img_ext.extract(img_raw) if img_raw is not None else None

        # AUDIO
        signal, sr = (None, None)
        if 'audio_path' in row and pd.notna(row['audio_path']):
            signal, sr = preprocess_audio(row['audio_path'])
        aud_feat = aud_ext.extract(signal, sr) if signal is not None else None

        # Optional TDA features computed from whatever vectors are available
        tda_vector = _row_tda_vector((text_feat, img_feat, aud_feat), persim_obj) if tda else None

        # One entry per slot, in fixed order; missing slots stay None for now.
        feats.append([
            None if vec is None else np.asarray(vec).flatten()
            for vec in (text_feat, img_feat, aud_feat, tda_vector)
        ])
        labels.append(row['label'] if 'label' in row else np.nan)
        meta.append({'id': row.get('id', None)})

    y = np.array(labels)
    if not feats:
        # Empty manifest: nothing to pack.
        return np.empty((0, 0)), y, meta
    return _pack_feature_slots(feats), y, meta

def train_random_forest(X, y, n_estimators=200, max_depth=None):
    """Impute, split, scale and fit a random-forest classifier on the labeled rows.

    Parameters
    ----------
    X : feature matrix, may contain NaN (imputed with ``impute_missing``).
    y : labels aligned with ``X``; missing labels mark unlabeled rows, which
        are excluded from training and evaluation.
    n_estimators, max_depth : forwarded to ``RandomForestClassifier``.

    Returns
    -------
    dict with ``model`` (fitted classifier), ``scaler`` (fitted on the training
    split), ``mask`` (boolean array of labeled rows) and ``X_imp`` (imputed
    features of all rows).

    Raises
    ------
    ValueError : when no row carries a label.
    """
    # Impute
    # NOTE(review): imputation still sees every row (train and test) because
    # impute_missing only offers fit_transform; confirm this is acceptable.
    X_imp = impute_missing(X)

    y = np.asarray(y)
    mask = ~pd.isna(y)
    if not mask.any():
        raise ValueError("No labeled rows available for training")

    y_labeled = y[mask]
    try:
        y_labeled = y_labeled.astype(int)
    except (TypeError, ValueError):
        # Non-numeric labels (e.g. class names) are kept as strings.
        y_labeled = y_labeled.astype(str)
    X_labeled = X_imp[mask]

    # Stratification needs at least two samples of every class.
    _, class_counts = np.unique(y_labeled, return_counts=True)
    stratify = y_labeled if class_counts.min() >= 2 else None
    if stratify is None:
        print("Warning: a class has fewer than 2 samples; using a non-stratified split")
    X_train, X_test, y_train, y_test = train_test_split(
        X_labeled, y_labeled, test_size=0.2, stratify=stratify, random_state=42
    )

    # Scale features: fit on the training split only so that test statistics
    # do not leak into the model's inputs.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    rf = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, random_state=42, n_jobs=-1)
    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='macro')
    print(f"RF test acc={acc:.4f} macro-F1={f1:.4f}")
    return {'model': rf, 'scaler': scaler, 'mask': mask, 'X_imp': X_imp}

## CLI & Orchestration

In [None]:
def main(args):
    """Run the full pipeline: load manifest, extract features, train, save, explain.

    ``args`` must provide ``manifest``, ``bert_model``, ``tda``, ``trees``,
    ``max_depth`` and ``out_dir``. The fitted model and scaler are written to
    ``out_dir`` with joblib (plus SHAP values when shap is available).
    """
    print("Device:", DEVICE)
    # Load manifest
    df = pd.read_csv(args.manifest)
    print("Loaded manifest:", len(df), "rows")

    # Instantiate extractors
    text_ext = TextFeatureExtractor(model_name=args.bert_model, device=DEVICE)
    img_ext = ImageFeatureExtractor(model_name="resnet50", device=DEVICE)
    aud_ext = AudioFeatureExtractor(n_mfcc=13)
    persim_obj = PersImage(pixels=[20,20], spread=0.1) if _HAS_TDA else None
    if args.tda and not _HAS_TDA:
        # Make the silent no-op visible: the TDA slot will stay empty.
        print("Warning: --tda requested but ripser/persim are not available; TDA features are skipped")

    # Build features
    print("Building feature matrix (this may take a while)...")
    X, y, meta = build_feature_matrix(df, text_ext, img_ext, aud_ext, tda=args.tda, persim_obj=persim_obj)

    # Train
    print("Training Random Forest...")
    model_pack = train_random_forest(X, y, n_estimators=args.trees, max_depth=args.max_depth)

    # Save
    os.makedirs(args.out_dir, exist_ok=True)
    joblib.dump(model_pack['model'], os.path.join(args.out_dir, "rf_model.joblib"))
    joblib.dump(model_pack['scaler'], os.path.join(args.out_dir, "scaler.joblib"))
    print("Saved model + scaler to", args.out_dir)

    # Optional explainability
    if _HAS_SHAP:
        print("Computing SHAP on a sample subset")
        shap_sample_size = 200  # cap on the number of labeled rows explained
        explainer = shap.TreeExplainer(model_pack['model'])
        # Reuse the imputed matrix and labeled-row mask produced during
        # training instead of imputing a second time and re-deriving the mask
        # from y (which fails for non-numeric labels).
        Xs = model_pack['scaler'].transform(model_pack['X_imp'])
        sample = Xs[model_pack['mask']][:shap_sample_size]
        shap_vals = explainer.shap_values(sample)
        joblib.dump(shap_vals, os.path.join(args.out_dir, "shap_vals.joblib"))
        print("Saved SHAP")

if __name__ == "__main__":
    # Command-line entry point: declare the options as data, register them in
    # order, parse, then hand over to main().
    # NOTE(review): when this cell runs inside a notebook kernel, parse_args()
    # reads the kernel's own argv, so the required --manifest is presumably
    # missing there - confirm how the cell is meant to be invoked.
    cli_options = [
        (("--manifest",), dict(type=str, required=True, help="CSV manifest with columns: id,text,text_path,image_path,audio_path,label")),
        (("--out-dir",), dict(type=str, default="./models", help="output dir")),
        (("--bert-model",), dict(type=str, default=DEFAULT_BERT, help="BERT / SBERT model name")),
        (("--tda",), dict(action="store_true", help="enable TDA persistence images (requires ripser + persim)")),
        (("--trees",), dict(type=int, default=300, help="RF trees")),
        (("--max-depth",), dict(type=int, default=None, help="RF max depth")),
    ]
    parser = argparse.ArgumentParser(description="Multimodal Sentiment Runner")
    for flags, spec in cli_options:
        parser.add_argument(*flags, **spec)
    args = parser.parse_args()
    main(args)