In [None]:
import os
import tempfile

import cv2
import numpy as np
import torch
import torch.nn as nn
from PIL import Image
from transformers import AutoTokenizer, AutoModel

# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [None]:
class VideoEncoder(nn.Module):
    """Encode a 3-channel sequence into one 256-dimensional vector per sample.

    Three 1-D convolutions (3 -> 64 -> 128 -> 256 channels, kernel sizes
    3 / 5 / 7) are applied in turn, each followed by a ReLU, and the result
    is averaged over the last axis.
    """

    def __init__(self):
        super().__init__()
        # With stride 1, a kernel of size k padded by (k - 1) // 2 keeps the
        # sequence length unchanged through every layer.
        self.conv1 = nn.Conv1d(in_channels=3, out_channels=64, kernel_size=3, padding=1)
        self.conv2 = nn.Conv1d(in_channels=64, out_channels=128, kernel_size=5, padding=2)
        self.conv3 = nn.Conv1d(in_channels=128, out_channels=256, kernel_size=7, padding=3)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Return the mean-pooled convolutional features of ``x``.

        Args:
            x: tensor laid out as (batch, 3, length); the channel count of 3
                is required by ``conv1``.

        Returns:
            Tensor of shape (batch, 256): the mean over ``dim=2`` of the last
            activation map.
        """
        features = x
        for conv in (self.conv1, self.conv2, self.conv3):
            features = self.relu(conv(features))
        return features.mean(dim=2)

In [None]:
class ConsensusTransformer(nn.Module):
    """Two-layer Transformer encoder with 4 attention heads per layer.

    Args:
        hidden_dim: model width (``d_model``); PyTorch requires it to be
            divisible by the number of heads (4).
    """

    def __init__(self, hidden_dim=256):
        super().__init__()
        # batch_first is left at the PyTorch default (False).
        # NOTE(review): GoldenGrounding uses batch_first=True - confirm which
        # layout callers are expected to pass here.
        self.transformer = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(d_model=hidden_dim, nhead=4),
            num_layers=2,
        )

    def forward(self, x):
        """Run ``x`` through the encoder stack and return the result.

        Args:
            x: tensor whose last dimension is ``hidden_dim``; with the default
                ``batch_first=False`` the layout is (seq, batch, hidden_dim).

        Returns:
            Tensor with the same shape as ``x``.
        """
        encoded = self.transformer(x)
        return encoded

In [None]:
class GoldenGrounding(nn.Module):
    """Summarise a sequence with an LSTM followed by ReLU and mean pooling.

    Args:
        hidden_dim: size of both the LSTM input features and its hidden state.
    """

    def __init__(self, hidden_dim=256):
        super().__init__()
        self.lstm = nn.LSTM(input_size=hidden_dim, hidden_size=hidden_dim, batch_first=True)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Return one pooled vector per batch element.

        Args:
            x: tensor laid out as (batch, seq, hidden_dim), as required by the
                ``batch_first=True`` LSTM.

        Returns:
            Tensor of shape (batch, hidden_dim): the per-step LSTM outputs
            passed through ReLU and averaged over the sequence axis (dim 1).
        """
        # Only the per-step outputs are needed; the final (h, c) state is dropped.
        lstm_out, _final_state = self.lstm(x)
        activated = self.relu(lstm_out)
        return activated.mean(dim=1)

In [None]:
class FusionClassifier(nn.Module):
    """Shared hidden layer feeding two classification heads.

    Args:
        input_dim: width of the fused feature vector passed to ``forward``.
    """

    def __init__(self, input_dim=1024):
        super().__init__()
        self.fc1 = nn.Linear(in_features=input_dim, out_features=128)
        # Two heads share the 128-d hidden representation.
        self.opinion = nn.Linear(in_features=128, out_features=3)
        self.emotion = nn.Linear(in_features=128, out_features=8)

    def forward(self, x):
        """Compute opinion and emotion logits for ``x``.

        Args:
            x: tensor of shape (batch, input_dim).

        Returns:
            Tuple ``(opinion_logits, emotion_logits)`` with shapes
            (batch, 3) and (batch, 8). No softmax is applied.
        """
        shared = torch.relu(self.fc1(x))
        opinion_logits = self.opinion(shared)
        emotion_logits = self.emotion(shared)
        return opinion_logits, emotion_logits

In [None]:
class VCCSAModel(nn.Module):
    """Video + text model producing opinion and emotion logits.

    The video branch is encoded by ``VideoEncoder`` (256 features), joined
    with the text features along dim 1, and classified by ``FusionClassifier``.
    With the default fusion width of 1024, ``text_feats`` must therefore
    supply the remaining 768 features.
    """

    def __init__(self):
        super().__init__()
        self.video_encoder = VideoEncoder()
        # NOTE(review): `consensus` and `golden` are constructed (so their
        # parameters are part of the state dict) but forward() never calls
        # them - confirm whether they are meant to be wired in.
        self.consensus = ConsensusTransformer()
        self.golden = GoldenGrounding()
        self.fusion = FusionClassifier()

    def forward(self, video_feats, text_feats):
        """Fuse video and text features and classify them.

        Args:
            video_feats: either per-frame features shaped
                (batch, frames, 3), or raw frames shaped
                (batch, frames, height, width, 3). Raw frames are reduced to
                per-frame channel means before encoding.
            text_feats: tensor of shape (batch, text_dim).

        Returns:
            Tuple ``(opinion_logits, emotion_logits)`` from the fusion
            classifier.
        """
        if video_feats.dim() == 5:
            # A 5-D frame tensor cannot be permuted with three indices, so
            # collapse the spatial axes into the (batch, frames, channels)
            # layout the encoder path below expects.
            video_feats = video_feats.mean(dim=(2, 3))
        # Conv1d wants channels before the sequence axis: (batch, 3, frames).
        v = self.video_encoder(video_feats.permute(0, 2, 1))
        t = text_feats
        combined = torch.cat((v, t), dim=1)
        return self.fusion(combined)

In [None]:
def extract_frames_from_video(video_file, max_frames=16):
    """Sample up to ``max_frames`` RGB frames from a video and return them as a tensor.

    The video bytes are copied to a temporary file so OpenCV can open them by
    path. Every ``rate``-th frame is kept, where
    ``rate = max(total_frames // max_frames, 1)``. Each kept frame is
    converted from BGR to RGB and resized to 64x64.

    Args:
        video_file: binary file-like object whose ``read()`` returns the
            encoded video bytes.
        max_frames: maximum number of frames to keep; must be at least 1.

    Returns:
        float32 tensor on ``device`` with shape (1, n_frames, 64, 64, 3) and
        values scaled to [0, 1], where ``n_frames <= max_frames``.

    Raises:
        ValueError: if ``max_frames`` is smaller than 1, or if no frame could
            be decoded from the video.
    """
    if max_frames < 1:
        raise ValueError("max_frames must be at least 1")

    # A private temporary directory guarantees the copy is removed afterwards;
    # closing the file before OpenCV opens it guarantees the bytes are flushed.
    with tempfile.TemporaryDirectory() as tmp_dir:
        video_path = os.path.join(tmp_dir, "input.mp4")
        with open(video_path, "wb") as tmp_video:
            tmp_video.write(video_file.read())

        cap = cv2.VideoCapture(video_path)
        try:
            frames, count = [], 0
            total = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
            # Guard against a reported frame count of 0 (or fewer frames than
            # max_frames) by never using a stride below 1.
            rate = max(total // max_frames, 1)
            while cap.isOpened() and len(frames) < max_frames:
                ret, frame = cap.read()
                if not ret:
                    break
                if count % rate == 0:
                    f = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                    f = cv2.resize(f, (64, 64))
                    frames.append(f)
                count += 1
        finally:
            # Release the capture before the directory is cleaned up, and even
            # when decoding raises.
            cap.release()

    if not frames:
        raise ValueError("no frames could be decoded from the supplied video")
    return torch.tensor(np.stack(frames) / 255.0, dtype=torch.float32).unsqueeze(0).to(device)

In [None]:
def extract_text_features(text, tokenizer, model):
    """Embed ``text`` and return the hidden state at the first token position.

    Args:
        text: input passed straight to ``tokenizer``.
        tokenizer: callable returning PyTorch tensors that can be moved with
            ``.to(device)`` and unpacked as keyword arguments for ``model``.
        model: encoder whose output exposes ``last_hidden_state``; it must
            already live on ``device``.

    Returns:
        ``last_hidden_state[:, 0, :]`` - one vector per input sequence. For
        BERT-style tokenizers this is presumably the [CLS] position; verify
        for other tokenizers.
    """
    encoded = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    encoded = encoded.to(device)
    # Pure feature extraction: no autograd graph is needed.
    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state
    return hidden_states[:, 0, :]

In [None]:
def predict_sentiment(video_file, comment, tokenizer=None, text_model=None, model=None):
    """Predict opinion and emotion class indices for a video/comment pair.

    Args:
        video_file: binary file-like object holding the video; forwarded to
            ``extract_frames_from_video``.
        comment: the comment text. Assumed to be a single string, since the
            result is read with ``.item()`` - TODO confirm.
        tokenizer: optional tokenizer to reuse; defaults to the
            "distilbert-base-uncased" tokenizer, loaded on every call.
        text_model: optional text encoder already on ``device``; defaults to
            "distilbert-base-uncased", loaded on every call.
        model: optional ``VCCSAModel`` already on ``device``. It is switched
            to eval mode. When omitted, a new, untrained model is built.

    Returns:
        Tuple ``(opinion, emotion)`` of integer class indices (argmax over
        the 3 opinion logits and the 8 emotion logits).
    """
    if tokenizer is None:
        tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
    if text_model is None:
        text_model = AutoModel.from_pretrained("distilbert-base-uncased").to(device)
    if model is None:
        # NOTE(review): no checkpoint is loaded here, so this model has
        # freshly initialised weights and its predictions are not meaningful.
        # Pass a trained `model` for real inference.
        model = VCCSAModel().to(device)
    model.eval()

    v = extract_frames_from_video(video_file)
    t = extract_text_features(comment, tokenizer, text_model)
    # Inference only: skip autograd bookkeeping for the fusion model.
    with torch.no_grad():
        o_logits, e_logits = model(v, t)
    opinion = o_logits.argmax(1).item()
    emotion = e_logits.argmax(1).item()
    return opinion, emotion

In [None]:
# Human-readable names for the classifier outputs.
# NOTE(review): presumably indexed by the class ids returned by
# predict_sentiment - confirm the order against the training labels.
opinion_labels = [
    "Positive",
    "Negative",
    "Neutral",
]
emotion_labels = [
    "Fear",
    "Disgust",
    "Anger",
    "Sadness",
    "Joy",
    "Trust",
    "Anticipation",
    "Surprise",
]