In [2]:

import torch
print("torch:", torch.__version__, "cuda:", torch.version.cuda)
print("CUDA available:", torch.cuda.is_available())
print("Device:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "CPU")


torch: 2.8.0+cu128 cuda: 12.8
CUDA available: True
Device: NVIDIA GeForce RTX 3080


In [3]:
from transformers import pipeline, VideoMAEForVideoClassification
import accelerate
import scipy
import librosa as lr
import soundfile as sf
import numpy as np
import pandas as pd
import os
import tensorflow as tf
from datasets import load_dataset
from decord import VideoReader
from decord import cpu, gpu
import kagglehub


  from .autonotebook import tqdm as notebook_tqdm
2025-10-22 19:12:27.249576: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-10-22 19:12:28.028741: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-10-22 19:12:42.658116: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.


# Video emotion detection
- training a model to detect the emotion present in a video or scene
- the scene may have people in it or maybe it could convey a mood based on the color
- I selected VideoMAE because it is very effcient with data meaning that it can be used when i dont have enough video data


In [4]:
# Load VideoMAE model
MAE_model = VideoMAEForVideoClassification.from_pretrained("MCG-NJU/videomae-base-finetuned-kinetics", attn_implementation="sdpa", dtype=torch.float16)
print("VideoMAE is loaded")

VideoMAE is loaded


# Preprocessing the Video data for emotion detection


In [5]:
# using the `kagglehub` library to download the MSR-VTT dataset
path = kagglehub.dataset_download("vishnutheepb/msrvtt")
msrvtt_videos_path = "/home/zach/.cache/kagglehub/datasets/vishnutheepb/msrvtt/versions/1/TrainValVideo"
print("Path to dataset files:", path)

Path to dataset files: /home/zach/.cache/kagglehub/datasets/vishnutheepb/msrvtt/versions/1


In [6]:
videos_list = os.listdir(msrvtt_videos_path)
print("Number of videos in MSR-VTT dataset:", len(videos_list))
print("Sample video file:", videos_list[:5])
if isinstance(videos_list, list):
    print("my_variable is a list.")

Number of videos in MSR-VTT dataset: 7010
Sample video file: ['video1250.mp4', 'video4543.mp4', 'video6671.mp4', 'video1990.mp4', 'video6803.mp4']
my_variable is a list.


In [17]:
# getting all of the file paths to add to a dataframe
file_paths = [os.path.join(msrvtt_videos_path, f) for f in videos_list]
print("Sample file paths:", file_paths[:5])

# turn the video paths into a dataframe
path_df = pd.DataFrame({
    "file_path": file_paths,
})


# extract the video names from the file paths
path_df["video_name"] = path_df["file_path"].apply(os.path.basename)
path_df.head()

Sample file paths: ['/home/zach/.cache/kagglehub/datasets/vishnutheepb/msrvtt/versions/1/TrainValVideo/video1250.mp4', '/home/zach/.cache/kagglehub/datasets/vishnutheepb/msrvtt/versions/1/TrainValVideo/video4543.mp4', '/home/zach/.cache/kagglehub/datasets/vishnutheepb/msrvtt/versions/1/TrainValVideo/video6671.mp4', '/home/zach/.cache/kagglehub/datasets/vishnutheepb/msrvtt/versions/1/TrainValVideo/video1990.mp4', '/home/zach/.cache/kagglehub/datasets/vishnutheepb/msrvtt/versions/1/TrainValVideo/video6803.mp4']


Unnamed: 0,file_path,video_name
0,/home/zach/.cache/kagglehub/datasets/vishnuthe...,video1250.mp4
1,/home/zach/.cache/kagglehub/datasets/vishnuthe...,video4543.mp4
2,/home/zach/.cache/kagglehub/datasets/vishnuthe...,video6671.mp4
3,/home/zach/.cache/kagglehub/datasets/vishnuthe...,video1990.mp4
4,/home/zach/.cache/kagglehub/datasets/vishnuthe...,video6803.mp4


In [14]:
# loading MSR-VTT dataset with the train_7k split and convert it to a dataframe
msrvtt_data_train = load_dataset("friedrichor/MSR-VTT", name="train_7k")
msrvtt_data_train_df=msrvtt_data_train['train'].to_pandas()
msrvtt_data_train_df.head()
#print("MSR-VTT training dataset loaded with", len(msrvtt_data_train_df), "entries.")

Unnamed: 0,video_id,video,caption,source,category,url,start time,end time,id
0,video0,video0.mp4,"[a car is shown, a group is dancing, a man dri...",MSR-VTT,9,https://www.youtube.com/watch?v=9lZi22qLlEo,137.72,149.44,0
1,video1,video1.mp4,[in a kitchen a woman adds different ingredien...,MSR-VTT,16,https://www.youtube.com/watch?v=w4JM08PDEng,184.33,206.89,1
2,video2,video2.mp4,"[a guying showing a tool, a man fixes a car, a...",MSR-VTT,9,https://www.youtube.com/watch?v=QA7KVQq9vKA,31.17,41.24,2
3,video3,video3.mp4,"[a big door is being opened in a video game, a...",MSR-VTT,8,https://www.youtube.com/watch?v=QFmJZ0GU6yc,48.26,58.51,3
4,video4,video4.mp4,"[a girl wearing a black shirt, a man is arguin...",MSR-VTT,14,https://www.youtube.com/watch?v=2q-dONPhzis,268.58,278.83,4


In [25]:
# merging the dataframes on the video file names
full_msrvtt_df = msrvtt_data_train_df.merge(
    path_df,
    left_on="video",
    right_on="video_name",
    how="left"
)
# dropping the redundant video_name column
full_msrvtt_df.drop(columns=["video_name"], inplace=True)

# adding a clip_time column to the dataframe
full_msrvtt_df["clip_time"] = full_msrvtt_df["end time"] - full_msrvtt_df["start time"]

full_msrvtt_df.head()

Unnamed: 0,video_id,video,caption,source,category,url,start time,end time,id,file_path,clip_time
0,video0,video0.mp4,"[a car is shown, a group is dancing, a man drives a vehicle through the countryside, a man drives down the road in an audi, a man driving a car, a man is driving a car, a man is driving down a road, a man is driving in a car as part of a commercial, a man is driving, a man riding the car speedly in a narrow road, a man showing the various features of a car, a man silently narrates his experience driving an audi, a person is driving his car around curves in the road, a person telling about a car, guy driving a car down the road, man talking about a car while driving, the man drives the car, the man driving the audi as smooth as possible, a man is driving, guy driving a car down the road]",MSR-VTT,9,https://www.youtube.com/watch?v=9lZi22qLlEo,137.72,149.44,0,/home/zach/.cache/kagglehub/datasets/vishnutheepb/msrvtt/versions/1/TrainValVideo/video0.mp4,11.72
1,video1,video1.mp4,"[in a kitchen a woman adds different ingredients into the pot and stirs it, a woman puts prawns and seasonings into a large pot on a stove, in the kitchen a woman makes a dish by adding ingredients mixing and allowing to boil on flame, a woman adding ingredients to a pot on the stove and stirring, instructions on how to cook a dish of prawns or crayfish are given on screen while the chef prepares the dish, a woman is in the kitchen making a recipe in a large pot with many ingredients, a woman adds some packets of spices and spoonfuls of tomato sauce to a pot then stirs it and covers the pot, a person add ingredients to a pot in a counter than stirs it, a person puts items in a pot on the stove in the kitchen, a woman cooking food with a metal pan on top of a stove, a woman adds different ingredients into a a pot on the stove, a woman in a kitchen is cooking a stew in a large pan on her stove, a women in a multi-color outfit is cooking a stew type dish in a silver pot, a woman adds ingredients to a pot that is simmering on a stove, a woman is preparing a seafood stew recipe on a stove demonstrating each step herself while at the same time the easy to read directions, in a kitchen a lady preferred crayfish with mixing of curry powder, a woman and a bowl spoon mixing dish inside kitchen to prepare to serve to eat displaying on screen, cooking the dried smoked prawn in a vessel having boiled water and the lied closed, a lady is making dried prawns curry and she added tomato puree and salt in it, a woman in a colorful scarf is showing how to make a stew]",MSR-VTT,16,https://www.youtube.com/watch?v=w4JM08PDEng,184.33,206.89,1,/home/zach/.cache/kagglehub/datasets/vishnutheepb/msrvtt/versions/1/TrainValVideo/video1.mp4,22.56
2,video2,video2.mp4,"[a guying showing a tool, a man fixes a car, a man holding a combustion leak tester, a man is explaining how to use a car repair kit, a man is showing a pack, a man is showing how to test a vehicle for a leak, a man is talking about a combustion leak tester, a man is talking about a test for a car engine, a man is talking next to a car, a man is talking with a tool box, a man is talking, a man picks up a combustion leak test, a man presents a piece of equipment, a man shows an item which can be used for car repair, a man shows off his tools to fix his car, a man stands outside talking about an automotive combustion leak tester, an advertisement for tools, man talking about car things, scotty kilmer shows you how to test for leaks, a man is showing how to test a vehicle for a leak]",MSR-VTT,9,https://www.youtube.com/watch?v=QA7KVQq9vKA,31.17,41.24,2,/home/zach/.cache/kagglehub/datasets/vishnutheepb/msrvtt/versions/1/TrainValVideo/video2.mp4,10.07
3,video3,video3.mp4,"[a big door is being opened in a video game, a bright light is flashing, a cartoon of a door opening, a door is opened and a bright light is shown, a door opens and a pink light shines out of it, a door opens with bright lights, a door with different color boxes opens, a first-person perspective sees a futuristic door opening, a gate opens up and lets in light, a light rays comes when a door opens, a short 3d rendering taken in either a video game or movie, a short minecraft video showing a door being opened, a strange door opens, a video game transition sequence shows doors opening in a large hall, a virtual door made of blocks opening up, an animated door opened, an animated door opens up, an animated wall opened up, a door opens and a pink light shines out of it, a first-person perspective sees a futuristic door opening]",MSR-VTT,8,https://www.youtube.com/watch?v=QFmJZ0GU6yc,48.26,58.51,3,/home/zach/.cache/kagglehub/datasets/vishnutheepb/msrvtt/versions/1/TrainValVideo/video3.mp4,10.25
4,video4,video4.mp4,"[a girl wearing a black shirt, a man is arguing about the need for eyebrows with a woman, a man is chastising a woman over her eyebrows, a man is joking with a woman about not needing eyebrows, a man is talking to a woman about not needing eye brows, a man is telling about lady s eyebrows, a man is trying to convince a woman about the uselessness of eyebrows, a man talks to a woman about her eyebrows, a man yells at a woman about not needing eyebrows, a woman is in the kitchen, a woman is walking around her kitchen, a woman talks about her eyebrows, clip of person talking about eyebrows, in a kitchen a girl checks food in the oven, man talking to a woman about eyebrows, man tells woman that she doesn t need eyebrows, man trying to convince a woman that she doesn t need eybrows, people are in the kitchen, woman showing her kitchen oven, woman talking to a man]",MSR-VTT,14,https://www.youtube.com/watch?v=2q-dONPhzis,268.58,278.83,4,/home/zach/.cache/kagglehub/datasets/vishnutheepb/msrvtt/versions/1/TrainValVideo/video4.mp4,10.25


In [26]:
# displaying the full file paths in the dataframe in order to verify that file paths have been merged correctly
pd.set_option('display.max_colwidth', None)
full_msrvtt_df[["file_path"]].head()

Unnamed: 0,file_path
0,/home/zach/.cache/kagglehub/datasets/vishnutheepb/msrvtt/versions/1/TrainValVideo/video0.mp4
1,/home/zach/.cache/kagglehub/datasets/vishnutheepb/msrvtt/versions/1/TrainValVideo/video1.mp4
2,/home/zach/.cache/kagglehub/datasets/vishnutheepb/msrvtt/versions/1/TrainValVideo/video2.mp4
3,/home/zach/.cache/kagglehub/datasets/vishnutheepb/msrvtt/versions/1/TrainValVideo/video3.mp4
4,/home/zach/.cache/kagglehub/datasets/vishnutheepb/msrvtt/versions/1/TrainValVideo/video4.mp4


### Decode and Sample the frames from the videos

In [None]:
    
### Decode and sample frames from the videos
def video_clipping(video_path,max_frames=32):
    """Extract frames from a video file."""
    
    # vr is used create a VideoReader object to read the video file which allows for access to individual frames
    vr = VideoReader(video_path, ctx=cpu(0))
    
    # idx is used to create a list of frame indices to be extracted from the video or sampled uniformly across the video's duration
    idx = list(range(0, len(vr), max(1, len(vr)//max_frames)))[:max_frames]
    
    # frames is used to extract the frames from the video at the specified indices and convert them to a numpy array
    frames = vr.get_batch(idx).asnumpy()   # (T, H, W, 3) uint8
    
    return frames




TypeError: expected str, bytes or os.PathLike object, not list