# Preprocessing and EDA of SSBD+ dataset

All relevant imports

In [None]:
import os
import numpy as np
import pandas as pd
import cv2
import torch
from torch.utils.data import Dataset, DataLoader
import matplotlib.pyplot as plt
import imageio

Define constants

In [None]:

# Number of sampled frames per video chunk; chunks are started every
# VIDEO_CHUNK_SIZE // 2 sampled frames, so consecutive chunks overlap.
VIDEO_CHUNK_SIZE = 40
# Target frame size (in pixels) passed to cv2.resize as (FRAME_WIDTH, FRAME_HEIGHT).
FRAME_HEIGHT = 100
FRAME_WIDTH = 100
# Target sampling rate; the video's native frame rate is divided by this
# value to get the frame-skipping step.
FPS = 10
# Minimum fraction of 'stimming' entries in a chunk for it to be labelled 1.
LABEL_THRESHOLD = 0.75


Function to load and preprocess video chunks

In [None]:
def load_and_preprocess_video_chunk(video_path, frame_labels=None):
    """Split a video into overlapping, resized, normalised chunks.

    The video is subsampled to roughly ``FPS`` frames per second, cut into
    windows of ``VIDEO_CHUNK_SIZE`` sampled frames that start every
    ``VIDEO_CHUNK_SIZE // 2`` sampled frames, and every frame is resized to
    ``FRAME_WIDTH`` x ``FRAME_HEIGHT``.

    Args:
        video_path: Path of the video file, handed to ``cv2.VideoCapture``.
        frame_labels: Optional sequence holding one annotation per original
            (pre-sampling) frame. Entries equal to ``'stimming'`` count as
            positive. When omitted no annotation is available and every
            chunk has zero positive frames (label 0 for any positive
            ``LABEL_THRESHOLD``). Frames beyond the end of ``frame_labels``
            are treated as not stimming.

    Returns:
        A list with one dict per chunk:

        * ``'video_tensor'``: float tensor laid out as
          ``(channels, frames, FRAME_HEIGHT, FRAME_WIDTH)`` with values
          scaled to ``[0, 1]``. Channel order is whatever OpenCV decoded
          (BGR by default); it is not converted here.
        * ``'label'``: ``1`` if the fraction of ``'stimming'`` frames in the
          chunk is at least ``LABEL_THRESHOLD``, else ``0``.

        The trailing chunks may contain fewer than ``VIDEO_CHUNK_SIZE``
        frames. The list is empty when no frame could be decoded.
    """
    cap = cv2.VideoCapture(video_path)
    try:
        # The container may report 0 (unknown) or a rate below FPS; a slice
        # or modulo step of 0 would fail, so never go below 1.
        native_fps = cap.get(cv2.CAP_PROP_FPS)
        step = max(1, int(native_fps // FPS)) if native_fps > 0 else 1

        # Keep only every ``step``-th frame while decoding instead of
        # holding the whole video in memory and slicing afterwards.
        sampled_frames = []
        frame_index = 0
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            if frame_index % step == 0:
                sampled_frames.append(frame)
            frame_index += 1
    finally:
        # Always give the decoder/file handle back, even if decoding fails.
        cap.release()

    # Subsample the annotations with the same step so they stay aligned
    # with ``sampled_frames``.
    sampled_labels = [] if frame_labels is None else list(frame_labels)[::step]

    stride = max(1, VIDEO_CHUNK_SIZE // 2)
    preprocessed_chunks = []
    for start in range(0, len(sampled_frames), stride):
        chunk = sampled_frames[start:start + VIDEO_CHUNK_SIZE]

        # Stack into one (frames, H, W, C) array first; building a tensor
        # from a Python list of arrays is much slower.
        resized = np.stack(
            [cv2.resize(frame, (FRAME_WIDTH, FRAME_HEIGHT)) for frame in chunk]
        )
        video_tensor = torch.from_numpy(resized).permute(3, 0, 1, 2).float() / 255.0

        # The label comes from the annotations, not from the image data:
        # comparing frames with the string 'stimming' can never match.
        chunk_labels = sampled_labels[start:start + VIDEO_CHUNK_SIZE]
        positives = sum(1 for frame_label in chunk_labels if frame_label == 'stimming')
        label = 1 if positives / len(chunk) >= LABEL_THRESHOLD else 0

        preprocessed_chunks.append({'video_tensor': video_tensor, 'label': label})

    return preprocessed_chunks


# EDA:

List files in the dataset directory

In [None]:
dataset_directory = '/path/to/your/dataset'
# os.listdir returns entries in arbitrary order; sort so that the sample
# video (video_files[0]) and the EDA row order are reproducible across runs.
video_files = sorted(f for f in os.listdir(dataset_directory) if f.endswith('.mp4'))

In [None]:
# Report how many .mp4 files were found in the dataset directory
summary_line = f"Number of videos in the dataset: {len(video_files)}"
print(summary_line)


In [None]:

# Load and preprocess a sample video chunk
sample_video_path = os.path.join(dataset_directory, video_files[0])
sample_preprocessed_chunk = load_and_preprocess_video_chunk(sample_video_path)

# Display sample frames from the first preprocessed chunk.
# The tensor is laid out as (channels, frames, height, width), so frames are
# selected along dim 1, and a short video may yield fewer than
# VIDEO_CHUNK_SIZE frames in its first chunk.
sample_tensor = sample_preprocessed_chunk[0]['video_tensor']
num_sample_frames = sample_tensor.shape[1]
# squeeze=False keeps axs two-dimensional even when there is a single frame.
fig, axs = plt.subplots(1, num_sample_frames, figsize=(20, 2), squeeze=False)
for i in range(num_sample_frames):
    # (C, H, W) -> (H, W, C) for imshow; OpenCV decodes frames as BGR, so
    # reverse the channel axis to get the RGB order matplotlib expects.
    frame_hwc = sample_tensor[:, i].permute(1, 2, 0).flip(-1)
    axs[0, i].imshow(frame_hwc.numpy())
    axs[0, i].axis('off')
plt.show()

# Create a DataFrame for EDA: one row per video, labelled with the label of
# the video's first chunk.
eda_data = {'filename': [], 'label': []}
for video_file in video_files:
    video_path = os.path.join(dataset_directory, video_file)
    preprocessed_chunks = load_and_preprocess_video_chunk(video_path)
    if not preprocessed_chunks:
        # An unreadable or empty video yields no chunks; skip it rather than
        # aborting the whole pass with an IndexError.
        print(f"Skipping {video_file}: no frames could be decoded")
        continue
    label = preprocessed_chunks[0]['label']
    eda_data['filename'].append(video_file)
    eda_data['label'].append(label)

eda_df = pd.DataFrame(eda_data)

In [None]:

# Show how many videos fall into each label class
label_counts = eda_df['label'].value_counts()
print(label_counts)