# Data Filtering Pipeline for Text to Video Generation
Reducing training dataset size for *Text to Video Generation* by utilising static and dynamic video analysing techniques.



Developed by: team SuperShy

## Preparation
In this section, we perform the necessary preparations before starting the main analysis. This includes installing dependencies and creating helper functions for later inference.


### Install Dependencies
*source: requirements.txt*

Use the requirements.txt file to download all the required dependencies

In [None]:
# the required libraries are available on the requirements.txt file
!pip install -r "requirements.txt" -q

### Helper Functions 
*source: get_data_idx_range.py, dataset_generator.py*

#### Reading Data
The given dataset is in the form of JSON. We will need to create some helper functions in order to correctly extract the data. Since the given data is very large, we will only sample some of the data from the dataset.

Our code will sample the data by taking the video from index *start_idx* to *end_idx*.

In [None]:
# file: get_data_idx_range.py
import json

# get videos per batch
import json

def get_data_idx_range(data, list_keys, start_idx, end_idx, save_to_json=False):
    """Return the records whose keys sit at positions start_idx..end_idx (inclusive).

    Args:
        data: Mapping from a key to its metadata record.
        list_keys: Ordered sequence of the keys of ``data``; it defines the positions.
        start_idx: Position of the first key to take.
        end_idx: Position of the last key to take (inclusive).
        save_to_json: When true, also write the selection to
            ``metafiles/hdvg_batch_{start_idx}-{end_idx}.json``.

    Returns:
        A new dict with the selected key/record pairs, in ``list_keys`` order.

    Raises:
        KeyError: If a selected key is missing from ``data``.
    """
    # The selection is a contiguous slice (not a random sample) and includes end_idx.
    records = {key: data[key] for key in list_keys[start_idx:end_idx + 1]}

    if save_to_json:
        import os  # local import: this cell only imports json at module level

        # Create the output folder first so the write cannot fail on a fresh checkout.
        os.makedirs('metafiles', exist_ok=True)
        with open(f'metafiles/hdvg_batch_{start_idx}-{end_idx}.json', 'w') as f:
            json.dump(records, f)

    return records

#### Creating Dataset Generator

The videos in our dataset are identified by their YouTube ids. Hence, we create a generator that downloads the videos, splits them into clips, and cuts the clips into scenes concurrently, using multiple worker processes.

In [None]:
# file: dataset_generator.py
import json
from pytube import YouTube
import cv2
import subprocess
import os
import shutil
import multiprocessing as mp

class DatasetGenerator:
    """Download the videos listed in a metadata mapping and cut them into clips and scenes.

    The metadata maps a video id to a record holding a ``url`` and a ``clip``
    mapping. Every clip record provides ``span`` (start and end timestamp),
    ``fps`` and ``scene_split``; every scene provides ``clip_id`` and
    ``scene_cut`` (first and last frame index; ``-1`` as the last index means
    "copy the whole clip").

    The work can run sequentially (``run``) or as a three-stage pipeline of
    worker processes connected by queues (``run_threaded``).
    """

    # Seconds a worker waits on an empty queue before re-checking its exit condition.
    QUEUE_POLL_SECONDS = 1

    def _drain(self, q, upstream_finished, handle, action, describe):
        """Process items from ``q`` until it is empty and the upstream stage is done.

        Args:
            q: Joinable queue to consume from.
            upstream_finished: Event set once no more items will be queued.
            handle: Callable invoked with each item.
            action: Short text used in the error message ("downloading video", ...).
            describe: Callable turning an item into a short identifier for logging.
        """
        from queue import Empty  # local import: raised by Queue.get(timeout=...)

        while not q.empty() or not upstream_finished.is_set():
            try:
                # A timed get keeps the worker from blocking forever when another
                # worker takes the last item between the empty() check and get().
                item = q.get(timeout=self.QUEUE_POLL_SECONDS)
            except Empty:
                continue

            try:
                handle(item)
            except Exception as e:
                # Best effort: one failing item must not stop the pipeline, but
                # the reason is reported instead of being silently dropped.
                print(f"Error when {action}: {describe(item)} ({e!r})")
            finally:
                # Only acknowledge items that were actually fetched.
                q.task_done()

    def download_worker(self, q, vq, initialize_finished):
        """Download queued videos and queue each of their clips for splitting.

        Args:
            q: Queue of ``(video_id, video_info)`` items.
            vq: Queue receiving ``(video_id, clip_id, clip_info)`` items.
            initialize_finished: Event set once every video has been queued.
        """
        def handle(item):
            video_id, video_info = item
            self.download_video(video_id, video_info)

            for clip_id, clip_info in video_info['clip'].items():
                vq.put((video_id, clip_id, clip_info))
            print("Done downloading video: " + video_id)

        self._drain(q, initialize_finished, handle, "downloading video", lambda item: item[0])

    def video_split_worker(self, q, sq, download_finished):
        """Cut queued clips out of their videos and queue each scene of the clip.

        Args:
            q: Queue of ``(video_id, clip_id, clip_info)`` items.
            sq: Queue receiving ``(video_id, clip_id, scene)`` items.
            download_finished: Event set once the download stage is done.
        """
        def handle(item):
            video_id, clip_id, clip_info = item
            self.split_video(clip_info, video_id, clip_id)
            for scene in clip_info['scene_split']:
                sq.put((video_id, clip_id, scene))
            print("Done splitting video: " + video_id)

        self._drain(q, download_finished, handle, "splitting video", lambda item: item[0])

    def clip_split_worker(self, q, video_split_finished):
        """Cut queued scenes out of their clips.

        Args:
            q: Queue of ``(video_id, clip_id, scene)`` items.
            video_split_finished: Event set once the clip-cutting stage is done.
        """
        def handle(item):
            video_id, clip_id, info_scene = item
            fps = self.data[video_id]['clip'][clip_id]['fps']
            self.split_clip(info_scene, fps, video_id, clip_id, info_scene["clip_id"])
            print("Done splitting clip: " + video_id + "/" + clip_id + "/" + info_scene["clip_id"])

        self._drain(q, video_split_finished, handle, "splitting clip",
                    lambda item: f"{item[0]}/{item[1]}")

    def __init__(self, filename=None, generate_scene_video=True, generate_scene_samples=False, frame_output_folder="frames_output", scenes_output_folder="video_clips", download_output_folder="download_videos", tmp_output_folder="tmp_clips"):
        """Store the configuration and create the output folders.

        Args:
            filename: Optional path of a JSON metadata file (see ``load_from_file``).
            generate_scene_video: Write the frames of each scene to a video file.
            generate_scene_samples: Also save sampled frames of each scene as JPEGs.
            frame_output_folder: Root folder of the sampled frames.
            scenes_output_folder: Root folder of the scene videos.
            download_output_folder: Folder of the downloaded videos.
            tmp_output_folder: Folder of the intermediate clip files.
        """
        self.filename = filename
        self.generate_scene_video = generate_scene_video
        self.generate_scene_samples = generate_scene_samples
        self.frame_output_folder = frame_output_folder
        self.scenes_output_folder = scenes_output_folder
        self.download_output_folder = download_output_folder
        self.tmp_output_folder = tmp_output_folder
        self.data = None
        # Sampling period of the saved frames, in milliseconds.
        self.FRAME_INTERVAL = 250

        os.makedirs(download_output_folder, exist_ok=True)
        os.makedirs(tmp_output_folder, exist_ok=True)

        if self.generate_scene_video:
            os.makedirs(scenes_output_folder, exist_ok=True)

        if self.generate_scene_samples:
            os.makedirs(self.frame_output_folder, exist_ok=True)

    def cmd(self, cmd):
        """Run ``cmd`` (an argument list) and return its combined stdout/stderr as text."""
        proc = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        # Undecodable bytes in tool output must not abort the pipeline.
        return proc.stdout.decode('utf-8', errors='replace')

    @staticmethod
    def _timestamp_to_ms(timestamp):
        """Convert ``HH:MM:SS[.fff]`` to whole milliseconds."""
        hh, mm, s = timestamp.split(':')
        ss, _, fraction = s.partition('.')
        # Pad/truncate the fraction to exactly three digits so ".5" means 500 ms.
        ms = int((fraction + '000')[:3])
        return 3600 * 1000 * int(hh) + 60 * 1000 * int(mm) + 1000 * int(ss) + ms

    def hhmmss(self, timestamp1, timestamp2):
        """Return the seconds elapsed from ``timestamp1`` to ``timestamp2`` as a string.

        Both timestamps use the ``HH:MM:SS[.fff]`` form; the fractional part is
        optional and may have fewer than three digits.
        """
        dur = (self._timestamp_to_ms(timestamp2) - self._timestamp_to_ms(timestamp1)) / 1000
        return str(dur)

    def download_video(self, video_id, video_info):
        """Download the 360p stream of a video unless the file already exists.

        Args:
            video_id: Identifier used as the output file name (``{video_id}.mp4``).
            video_info: Record holding the video ``url``.

        Raises:
            RuntimeError: If the video offers no 360p stream.
        """
        file_name = f"{video_id}.mp4"

        # Checks if the video is already downloaded
        if os.path.exists(os.path.join(self.download_output_folder, file_name)):
            return

        yt = YouTube(video_info['url'])
        stream = yt.streams.get_by_resolution('360p')
        if stream is None:
            raise RuntimeError(f"{video_id}: no 360p stream available")
        stream.download(output_path=self.download_output_folder, filename=file_name)

    def split_video(self, info, video_id, output_name):
        """Cut one clip out of a downloaded video with ffmpeg.

        Args:
            info: Clip record; ``info['span']`` holds the start and end timestamp.
            video_id: Name (without extension) of the downloaded video.
            output_name: File name of the clip written to the temporary folder.

        Raises:
            Exception: If the clip file does not exist after running ffmpeg.
        """
        yt_video = os.path.join(self.download_output_folder, video_id + '.mp4')

        # The clip is written directly into the temporary folder.
        ori_clip_path = os.path.join(self.tmp_output_folder, output_name)
        # NOTE(review): this per-video folder is created but the clip path above
        # does not use it — confirm whether clips were meant to live inside it.
        os.makedirs(os.path.join(self.tmp_output_folder, video_id), exist_ok=True)
        if not os.path.exists(ori_clip_path):
            sb = info['span']
            cmd = ['ffmpeg', '-ss', sb[0], '-t', self.hhmmss(sb[0], sb[1]),'-accurate_seek', '-i', yt_video, '-c', 'copy',
                '-avoid_negative_ts', '1', '-reset_timestamps', '1',
                '-y', '-hide_banner', '-loglevel', 'panic', '-map', '0', ori_clip_path]
            self.cmd(cmd)

        if not os.path.isfile(ori_clip_path):
            raise Exception(f"{ori_clip_path}: ffmpeg clip extraction failed")

    def split_clip(self, info_scene, fps, video_id, clip_input_name, scene_output_name):
        """Cut one scene out of a clip and optionally save sampled frames.

        Errors are reported on stdout and not propagated.

        Args:
            info_scene: Scene record; ``info_scene['scene_cut']`` holds the first
                and last frame index (last index ``-1`` copies the whole clip).
            fps: Frame rate used for the output video and for the frame sampling.
            video_id: Name of the per-video output sub-folder.
            clip_input_name: File name of the clip inside the temporary folder.
            scene_output_name: Base name of the scene video / frame folder.
        """
        ori_clip_path = os.path.join(self.tmp_output_folder, clip_input_name)
        oricap = None
        writer = None

        try:
            start, end = int(info_scene['scene_cut'][0]), int(info_scene['scene_cut'][1])
            save_split_path = os.path.join(self.scenes_output_folder, video_id, scene_output_name + '.mp4')

            # Skip if the scene is already split
            if os.path.exists(save_split_path):
                return

            os.makedirs(os.path.join(self.scenes_output_folder, video_id), exist_ok=True)
            if end == -1:
                shutil.copy(ori_clip_path, save_split_path)
                return

            oricap = cv2.VideoCapture(ori_clip_path)
            h = oricap.get(cv2.CAP_PROP_FRAME_HEIGHT)
            w = oricap.get(cv2.CAP_PROP_FRAME_WIDTH)

            writer = cv2.VideoWriter(save_split_path, cv2.VideoWriter_fourcc(*"mp4v"), fps, (int(w), int(h)))
            oricap.set(cv2.CAP_PROP_POS_FRAMES, start + 1)
            current = start + 1
            frame_cnt = 0

            if self.generate_scene_samples:
                os.makedirs(os.path.join(self.frame_output_folder, video_id, scene_output_name), exist_ok=True)

            while current < end:
                ret, frame = oricap.read()
                if not ret:
                    # No more decodable frames: stop instead of handing an
                    # empty frame to imwrite()/write().
                    break

                # NOTE(review): the sampling clock uses the absolute frame index
                # `current`, not the offset from `start`, so a scene starting late
                # in the clip saves consecutive frames until frame_cnt catches up
                # — confirm this is intended.
                if self.generate_scene_samples and (frame_cnt * self.FRAME_INTERVAL < current / fps * 1000):
                    frame_name = os.path.join(self.frame_output_folder, video_id, scene_output_name, f"{frame_cnt:04d}.jpg")
                    cv2.imwrite(frame_name, frame)
                    frame_cnt += 1

                if self.generate_scene_video:
                    writer.write(frame)
                current += 1

        except Exception as e:
            print("Error occured")
            print(e)
        finally:
            # Always release the OpenCV handles, even after an error.
            if writer is not None:
                writer.release()
            if oricap is not None:
                oricap.release()

    def load_data(self, data):
        """Use an in-memory metadata mapping instead of a file."""
        self.data = data

    def load_from_file(self):
        """Load the metadata mapping from the JSON file ``self.filename``."""
        with open(self.filename, 'r') as f:
            self.data = json.load(f)

    def download(self):
        """Download every video of the metadata, one after another."""
        for video_id, video_info in self.data.items():
            self.download_video(video_id, video_info)

    def split_videos(self):
        """Cut every clip of every video, one after another."""
        for video_id, video_info in self.data.items():
            for clip_id, clip_info in video_info['clip'].items():
                self.split_video(clip_info, video_id, clip_id)

    def split_scenes(self):
        """Cut every scene of every clip, one after another."""
        for video_id, video_info in self.data.items():
            for clip_id, clip_info in video_info['clip'].items():
                for scene in clip_info['scene_split']:
                    self.split_clip(scene, clip_info['fps'], video_id, clip_id, scene["clip_id"])

    def run(self):
        """Run the whole pipeline sequentially in the current process."""
        # Same rule as run_threaded: keep metadata injected through load_data()
        # when no file name was configured.
        if self.filename is not None:
            self.load_from_file()
        self.download()
        self.split_videos()
        self.split_scenes()
        print("Done")

    def run_threaded(self, num_of_downloader_threads=2, num_of_video_splitter_threads=2, num_of_clip_splitter_threads=4):
        """Run the pipeline as three stages of worker processes.

        Args:
            num_of_downloader_threads: Number of download worker processes.
            num_of_video_splitter_threads: Number of clip-cutting worker processes.
            num_of_clip_splitter_threads: Number of scene-cutting worker processes.

        Raises:
            ValueError: If no metadata has been loaded.
        """
        if self.filename is not None:
            self.load_from_file()

        # Fail before spawning anything; otherwise the workers would be left
        # waiting on queues that never get filled.
        if self.data is None:
            raise ValueError("No metadata loaded: pass a filename or call load_data() first")

        dq = mp.JoinableQueue()  # (video_id, video_info)
        vq = mp.JoinableQueue()  # (video_id, clip_id, clip_info)
        sq = mp.JoinableQueue()  # (video_id, clip_id, scene)

        initialize_finished = mp.Event()
        download_finished = mp.Event()
        video_split_finished = mp.Event()

        processes = []

        for _ in range(num_of_downloader_threads):
            t = mp.Process(target=self.download_worker, args=(dq, vq, initialize_finished))
            t.start()
            processes.append(t)
        print("Spawned download workers")

        for _ in range(num_of_video_splitter_threads):
            t = mp.Process(target=self.video_split_worker, args=(vq, sq, download_finished,))
            t.start()
            processes.append(t)
        print("Spawned video split workers")

        for _ in range(num_of_clip_splitter_threads):
            t = mp.Process(target=self.clip_split_worker, args=(sq, video_split_finished,))
            t.start()
            processes.append(t)
        print("Spawned clip split workers")

        for video_id, video_info in self.data.items():
            dq.put((video_id, video_info))

        # Each stage is declared finished only after its input queue has been
        # fully processed, so the next stage never stops early.
        initialize_finished.set()
        print("Initialization finished")
        dq.join()
        print("Download finished")
        download_finished.set()
        vq.join()
        print("Video split finished")
        video_split_finished.set()
        sq.join()
        print("Clip split finished")

        for p in processes:
            # Workers leave on their own once their queue is drained; terminate
            # only those that are still stuck after a grace period.
            p.join(timeout=5)
            if p.is_alive():
                p.terminate()

        return

### Creating Dataset 
*source: dataset_class_batch.py*

From the dataset, extract the important detail of each *scene split* and store it as a single data point

Our code will utilise PyTorch framework to create the dataset

In [None]:
# file: dataset_class_batch.py
from torch.utils.data import Dataset
import os
import json

class VideoDataset(Dataset):
    """Dataset of scenes for the metadata positions ``start_idx..end_idx``.

    Building the dataset runs the whole download/split pipeline for the
    selected videos. A scene becomes a data point only when both its scene
    video and its frames folder exist and the folder holds 4 to 60 entries.
    """

    def __init__(self, data, start_idx, end_idx):
        generator = DatasetGenerator(generate_scene_samples=True, generate_scene_video=True)
        self.list_keys = list(data.keys())
        self.data = get_data_idx_range(
            data, self.list_keys, start_idx, end_idx, save_to_json=False
        )
        self.data_list = list(self.data.values())

        # Download the selected videos and cut them into clips and scenes.
        generator.load_data(self.data)
        generator.run_threaded(
            num_of_clip_splitter_threads=6,
            num_of_downloader_threads=2,
            num_of_video_splitter_threads=2,
        )

        self.scene_data = []
        for video_id, video_info in self.data.items():
            for clip_id, clip_info in video_info["clip"].items():
                for scene in clip_info["scene_split"]:
                    scene_id = scene["clip_id"]
                    record = {
                        "scene_id": scene_id,
                        "video_id": video_id,
                        # clip ids carry a 4-character suffix that is dropped here
                        "clip_id": clip_id[:-4],
                        "caption": scene["caption"],
                        "scene_cut": scene["scene_cut"],
                        "video_path": os.path.join('video_clips', video_id, scene_id + '.mp4').replace('\\','/'),
                        "frames_path": os.path.join('frames_output', video_id, scene_id).replace('\\','/'),
                    }

                    frames_path = record["frames_path"]
                    if not (os.path.exists(frames_path) and os.path.exists(record["video_path"])):
                        continue

                    # ranges from 1s to 15s
                    if 4 <= len(os.listdir(frames_path)) <= 60:
                        self.scene_data.append(record)

    def __len__(self):
        """Number of scenes that passed the checks."""
        return len(self.scene_data)

    def __getitem__(self, idx):
        """Return the record dict of scene number ``idx``."""
        return self.scene_data[idx]

## Filtering

This section focuses on how to filter the dataset so that only *presumably good video clips* are kept. It is the main pipeline of this project, in which we utilise several video analysis and classification techniques.

### Generating Filtering Metrics 
*source: static_check.py, optical_flow_check.py, image_to_embedding.py, inference.py*

Not every data point can be considered good, and some might even hurt the model being trained. Thus, it is essential to filter the data and keep only the best samples. As a result, a smaller, higher-quality dataset is obtained. 

The filtering process used in this notebook will be based on 2 features, *staticity of video* and *scene transitions*

#### Staticity of Video
Staticity of the video is a metric that scores whether the main object of the video is moving or stays in the same position. A static video is a poor choice for the filtered dataset because the text-to-video model cannot learn object movement from it.

#### Scene Transitions
Scene transitions determine whether there is a sudden change in the scene. Such a sudden change hurts the performance of the model because the model is likely to lose the context of the previous frames. In other words, we want each training video to keep the same context from the beginning to the end. 


We will discuss different approaches to scoring these metrics.

##### Energy-Based

The difference in energy content between two frames is calculated as the mean of the squared differences over every pixel and channel. We calculate this difference for each pair of consecutive frames of the scene. A small energy difference means that the main object does not move much. 

In [None]:
# file: static_check.py
import numpy as np
import torch

def get_static_difference(frames):
    """Return the (max, mean) mean-squared difference between consecutive frames.

    Every frame is multiplied by 255 before the difference is taken.
    """
    diff = [
        torch.mean(torch.square(earlier * 255 - later * 255))
        for earlier, later in zip(frames, frames[1:])
    ]
    return np.max(diff), np.mean(diff)

##### Optical Flow

Optical Flow is a method that can estimate the direction and magnitude of motion at each pixel or in regions of the image. By taking the sum of magnitudes, we can use optical flow as one of the key factors to determine the staticness of a video.

In [None]:
# file: optical_flow_check.py
import cv2
import numpy as np
from PIL import Image
import os
from torchvision import transforms

def image_transform(image):
    """Resize to (320, 240), centre-crop to 240, convert to a tensor and add a leading axis."""
    steps = [
        transforms.Resize((320, 240)),
        transforms.CenterCrop(240),
        transforms.ToTensor(),
    ]
    tensor = transforms.Compose(steps)(image)
    return tensor.unsqueeze(0)

def load_image(folder_path):
    """Load every file of ``folder_path`` (in sorted name order) as a transformed RGB tensor."""
    conv_frames = []
    for file_name in sorted(os.listdir(folder_path)):
        rgb_image = Image.open(os.path.join(folder_path, file_name)).convert('RGB')
        conv_frames.append(image_transform(rgb_image))
    return conv_frames

def get_optical_flow(frames):
    """Return the (max, mean) average optical-flow magnitude between consecutive frames.

    Args:
        frames: Sequence of frame tensors as produced by ``load_image`` in this
            file (RGB, channels first, with a leading axis of size 1).

    Returns:
        Tuple ``(max, mean)`` over the per-pair average flow magnitudes.

    Note:
        Earlier versions compared every frame with the FIRST frame (the gray
        reference was never advanced) and converted the RGB frames with the BGR
        conversion code. Metrics computed with this version therefore differ
        from previously stored ones; a classifier trained on the old values
        should be retrained.
    """
    def to_gray(frame):
        # (1, C, H, W) tensor -> (H, W, C) array scaled by 255 -> single channel
        rgb = frame.squeeze(0).numpy().transpose((1, 2, 0)) * 255
        # load_image() decodes with PIL in RGB order, so use the RGB conversion code.
        return cv2.cvtColor(rgb, cv2.COLOR_RGB2GRAY)

    avg_velocities = []
    prv_gray = to_gray(frames[0])

    for frame in frames[1:]:
        nxt_gray = to_gray(frame)

        flow = cv2.calcOpticalFlowFarneback(prv_gray, nxt_gray, None, 0.5, 3, 15, 3, 5, 1.2, 0)

        # Magnitude of the 2D flow vectors (the angle is not needed)
        magnitude, _ = cv2.cartToPolar(flow[..., 0], flow[..., 1])

        # The average magnitude corresponds to the average velocity of this pair
        avg_velocities.append(np.mean(magnitude))

        # Advance the reference so the next flow is measured between neighbours
        prv_gray = nxt_gray

    return np.max(avg_velocities), np.mean(avg_velocities)

##### Image Embedding

An image embedding is one way a machine can represent a given image. Similar embeddings for two images mean that the two images have similar context. An autoencoder can be used to encode the images, while cosine similarity can be used to evaluate the similarity of the embeddings.

In [None]:
# file: image_to_embedding.py
import torch
import numpy as np
import warnings
warnings.filterwarnings("ignore")

def tensor_to_flat_latent(tensor, model):
    """Run ``model.encoder`` on ``tensor`` in inference mode and flatten the result."""
    encode = model.encoder
    with torch.inference_mode():
        latent = encode(tensor)
    return latent.flatten()

def MSELoss(x1, x2):
    """Return the mean of the squared element-wise differences of ``x1`` and ``x2``."""
    squared_error = (x1 - x2) ** 2
    return np.mean(squared_error)

def cosine_similarity(x1, x2):
    """Return the dot product of ``x1`` and ``x2`` divided by the product of their norms."""
    scale = np.linalg.norm(x1) * np.linalg.norm(x2)
    return np.dot(x1, x2) / scale

def get_image_to_embedding(frames, model, device):
    """Return the average (MSE, cosine similarity) between latents of consecutive frames.

    Each frame is moved to ``device``, encoded with ``tensor_to_flat_latent`` and
    compared with the latent of the frame before it.
    """
    def encode(frame):
        return tensor_to_flat_latent(frame.to(device), model).cpu().numpy()

    mse_total = 0
    cos_total = 0

    previous = encode(frames[0])
    for frame in frames[1:]:
        current = encode(frame)
        mse_total += MSELoss(previous, current)
        cos_total += cosine_similarity(previous, current)
        previous = current

    transitions = len(frames) - 1
    return mse_total / transitions, cos_total / transitions

##### Combined Filtering Metric Generation Process

In [None]:
#file: inference.py

from diffusers import AutoencoderKL
import json
import os
from PIL import Image
import torch
from torchvision import transforms
import time
import pickle
import yaml
from tqdm import tqdm

# Silence all warnings for the rest of the session.
# NOTE(review): `warnings` is not imported in this cell; it only resolves because
# an earlier cell (image_to_embedding) already ran `import warnings`.
warnings.filterwarnings("ignore")
# NOTE(review): presumably a workaround for the "duplicate OpenMP runtime" abort
# when several libraries bundle their own copy — confirm it is still needed.
os.environ['KMP_DUPLICATE_LIB_OK']='True'
def image_transform(image):
    """Apply resize (320, 240), centre crop 240 and tensor conversion, then add a leading axis."""
    resize = transforms.Resize((320, 240))
    crop = transforms.CenterCrop(240)
    to_tensor = transforms.ToTensor()
    pipeline = transforms.Compose([resize, crop, to_tensor])
    return pipeline(image).unsqueeze(0)


def load_image(folder_path):
    """Return the transformed RGB tensors of all files in ``folder_path``, sorted by name."""
    def read_frame(file_name):
        picture = Image.open(os.path.join(folder_path, file_name)).convert('RGB')
        return image_transform(picture)

    return [read_frame(file_name) for file_name in sorted(os.listdir(folder_path))]


def get_model():
    """Load the pretrained "stabilityai/sdxl-vae" autoencoder."""
    return AutoencoderKL.from_pretrained("stabilityai/sdxl-vae")


def get_metrics(dataset, model, device):
    """Compute the filtering metrics of every scene in ``dataset``.

    Args:
        dataset: Sized iterable of scene records providing ``scene_id``,
            ``frames_path`` and ``clip_id``.
        model: Encoder model handed to ``get_image_to_embedding``.
        device: Device the frames are moved to before encoding.

    Returns:
        Dict mapping each scene id to its metrics: ``max_rgb_diff``,
        ``mean_rgb_diff``, ``max_velocity``, ``mean_velocity``, ``mse``,
        ``cos_sim``, ``no_frames``, plus the bookkeeping fields ``idx`` (position
        in ``dataset``) and ``clip_id``.
    """
    res = {}
    print("Processing the dataset metrics...")
    print("Total number of scene videos: ", len(dataset))
    # enumerate() hides the length from tqdm, so pass the total explicitly.
    for idx, data in tqdm(enumerate(dataset), total=len(dataset)):
        scene_id = data['scene_id']
        frames = load_image(data['frames_path'])

        # Getting static difference
        max_rgb_diff, mean_rgb_diff = get_static_difference(frames)

        # getting optical flow
        max_velocity, mean_velocity = get_optical_flow(frames)

        # Getting image context similarity
        frame_mse, frame_cos_sim = get_image_to_embedding(frames, model, device)

        # Storing the results as plain Python numbers
        res[scene_id] = {
            'max_rgb_diff': float(max_rgb_diff),
            'mean_rgb_diff': float(mean_rgb_diff),
            'max_velocity': float(max_velocity),
            'mean_velocity': float(mean_velocity),
            'mse': float(frame_mse),
            'cos_sim': float(frame_cos_sim),
            'no_frames': float(len(frames)),
            'idx': idx,
            'clip_id': data['clip_id'],
        }

    # The collected metrics were previously built and then discarded.
    return res

### Filtering Scenes
*source: filter_scenes.py*

Function used to filter the scene based on a classification model

In [None]:
#file: filter_scenes.py
import numpy
import pandas as pd
import json
import pickle

def filter_scenes(data, n_taken, classifier_model):
    """
    Pick the best-scoring scenes, taking at most one scene per clip.

    Args:
        data: Dict keyed by scene id. Every value is a dict with the bookkeeping
            fields ``idx`` and ``clip_id``; all remaining fields are the feature
            columns handed to the classifier. ``data`` is not modified.
        n_taken: Number of scenes to select.
        classifier_model: Object whose ``predict_proba(frame)`` returns an array
            with the positive-class probability in column 1.

    Returns:
        List with the ``idx`` of the selected scenes, best score first.
    """
    scenes = list(data.values())
    if not scenes:
        return []

    data_ids = [scene['idx'] for scene in scenes]
    clip_ids = [scene['clip_id'] for scene in scenes]

    # Build the feature table from copies so the caller's dicts keep their
    # 'idx' and 'clip_id' entries and the function can be called again.
    features = pd.DataFrame([
        {name: value for name, value in scene.items() if name not in ('idx', 'clip_id')}
        for scene in scenes
    ])

    # Probability of the positive class for every scene
    pred = classifier_model.predict_proba(features)[:, 1]

    # Positions of the scenes, best prediction first
    ranking = numpy.argsort(pred)[::-1]

    # Take the top n_taken but skip clips that already contributed a scene
    filtered_data = []
    clips_taken = set()
    for position in ranking:
        if len(filtered_data) == n_taken:
            break
        clip_id = clip_ids[position]
        if clip_id in clips_taken:
            continue
        clips_taken.add(clip_id)
        filtered_data.append(data_ids[position])

    return filtered_data

### Building XGBoost Model for Classification
*source: human_evaluation.ipynb, building_xg_boost.ipynb*

XGBoost is a powerful model for classification tasks based on numeric features.

#### Creating Dataset for Training

In [None]:
# file: human_evaluation.ipynb
import cv2
from IPython.display import display
import random

def display_video(video_path):
    """Play ``video_path`` in a window named 'Video'; pressing 'q' stops playback."""
    quit_key = ord('q')
    cap = cv2.VideoCapture(video_path)

    while True:
        if not cap.isOpened():
            break

        ok, frame = cap.read()
        if not ok:
            # no more frames to read
            break

        # Display the frame in a separate window
        cv2.imshow('Video', frame)

        pressed = cv2.waitKey(1) & 0xFF
        if pressed == quit_key:
            break

    cap.release()
    cv2.destroyAllWindows()

def annotate_videos(dataset, annotation_dict):
    """Show up to three random scenes of ``dataset`` and ask for a 0/1 label for each.

    Accepted labels are stored in ``annotation_dict`` (frames path -> int label).
    Entering 'q' stops the annotation of this dataset.

    Returns:
        Tuple ``(labels, annotation_dict)`` where ``labels`` holds the accepted
        answers as strings, in the order they were given.
    """
    prompt = "Enter the label for this video clip (0 or 1) (or 'q' to quit): "
    accepted_answers = ('0', '1', 'q')

    labels = []
    print(len(dataset))

    no_clips = min(len(dataset), 3)
    for sample_idx in random.sample(range(len(dataset)), no_clips):
        video_sample = dataset[sample_idx]
        video_path = video_sample["video_path"]
        frames_path = video_sample["frames_path"]
        print(f"video: {frames_path}")

        # Display the video in a separate window popup
        display_video(video_path)

        # Keep replaying and asking until the answer is valid
        label = input(prompt)
        while label not in accepted_answers:
            print("Invalid label. Please enter 0 or 1.")
            display_video(video_path)
            label = input(prompt)

        if label == 'q':
            break

        # Save the label under the frames folder of the scene
        annotation_dict[frames_path] = int(label)
        labels.append(label)

    return labels, annotation_dict

In [None]:
# file: human_evaluation.ipynb
import json

# Load the metadata mapping that VideoDataset slices by position.
with open("metafiles/hdvg_0.json", 'r') as f:
    data = json.load(f)
print("Data loaded")

# Running count of collected labels (not read again in this cell).
total = 0
# Filled by annotate_videos: frames folder path -> 0/1 label.
annotation_dict = {}
# One single-video dataset per metadata position 0..100; building each dataset
# downloads and splits that video before its scenes are shown for labelling.
for idx in range(0,101,1):
    dataset = VideoDataset(data, idx, idx)
    labels, annotation_dict = annotate_videos(dataset, annotation_dict)
    total += len(labels)
    print(labels)

print(annotation_dict)

In [None]:
# file: human_evaluation.ipynb
# Save the annotation dictionary to a json
import json
# Persist the collected labels (frames folder path -> 0/1) for the training notebook.
with open('annotations.json', 'w') as f:
    json.dump(annotation_dict, f)

#### Building The Model

In [None]:
# file: building_xg_boost_model.ipynb
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [None]:
# file: building_xg_boost_model.ipynb
from diffusers import AutoencoderKL
import torch

import cv2
import numpy as np
from PIL import Image
import os
from torchvision import transforms


def image_transform(image):
    """Resize (320, 240), centre-crop 240 and convert ``image`` to a tensor with a leading axis."""
    preprocess = transforms.Compose(
        [transforms.Resize((320, 240)), transforms.CenterCrop(240), transforms.ToTensor()]
    )
    transformed = preprocess(image)
    return transformed.unsqueeze(0)

def load_image(folder_path):
    """Read the files of ``folder_path`` in sorted order and return their transformed RGB tensors."""
    ordered_names = sorted(os.listdir(folder_path))

    conv_frames = []
    for name in ordered_names:
        full_path = os.path.join(folder_path, name)
        conv_frames.append(image_transform(Image.open(full_path).convert('RGB')))
    return conv_frames

def get_model():
    """Return the pretrained "stabilityai/sdxl-vae" autoencoder."""
    vae = AutoencoderKL.from_pretrained("stabilityai/sdxl-vae")
    return vae

def get_inference(folder_path, model, device):
    """Compute all filtering metrics for the frames stored in ``folder_path``.

    Returns:
        Tuple ``(rgb_diff_max, rgb_diff_mean, max_velocity, avg_velocity,
        frame_mse, frame_cos_sim, num_frames)``.
    """
    frames = load_image(folder_path)

    return (
        # static difference: max and mean
        *get_static_difference(frames),
        # optical flow: max and mean velocity
        *get_optical_flow(frames),
        # image context similarity: MSE and cosine similarity
        *get_image_to_embedding(frames, model, device),
        # number of frames
        len(frames),
    )

In [None]:
# file: building_xg_boost_model.ipynb
# Load from annotations.json
import json
# Read back the human labels: frames folder path -> 0/1 label.
with open('annotations.json', 'r') as f:
    labeled_data = json.load(f)

In [None]:
# file: building_xg_boost_model.ipynb
# One list per feature column plus the labels; the next cell appends one
# entry per annotated scene to each list.
rgb_diff_maxs = []
rgb_diff_means = []
max_velocities = []
avg_velocities = []
frame_mses = []
frame_cos_sims = []
num_frames_list = []
labels = []

In [None]:
# file: building_xg_boost_model.ipynb
# Use the GPU when available; `model` is the autoencoder here (it is rebound
# to the XGBoost classifier in a later cell).
device = "cuda" if torch.cuda.is_available() else "cpu"
model = get_model().to(device)

# The keys of labeled_data are the frames folders of the annotated scenes.
for i, path in enumerate(labeled_data.keys()):
    rgb_diff_max, rgb_diff_mean, max_velocity, avg_velocity, frame_mse, frame_cos_sim, num_frames = get_inference(path, model, device)
    rgb_diff_maxs.append(rgb_diff_max)
    rgb_diff_means.append(rgb_diff_mean)
    max_velocities.append(max_velocity)
    avg_velocities.append(avg_velocity)
    frame_mses.append(frame_mse)
    frame_cos_sims.append(frame_cos_sim)
    num_frames_list.append(num_frames)
    labels.append(labeled_data[path])
    print(f'Enumerated {i+1} paths')

In [None]:
# file: building_xg_boost_model.ipynb
import pandas as pd

# Column names match the metric keys produced by get_metrics (inference cell),
# which is what filter_scenes later feeds to the classifier.
dataset = pd.DataFrame({
    "max_rgb_diff": rgb_diff_maxs,
    "mean_rgb_diff": rgb_diff_means,
    "max_velocity": max_velocities,
    "mean_velocity": avg_velocities,
    "mse": frame_mses,
    "cos_sim": frame_cos_sims,
    "no_frames": num_frames_list,
    "label": labels
})

In [None]:
# file: building_xg_boost_model.ipynb
# Features: every column except the label; target: the 0/1 human label.
X = dataset.drop(columns=["label"])
y = dataset["label"]

In [None]:
# file: building_xg_boost_model.ipynb
# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Creating an XGBoost classifier
# (this rebinds `model`, which held the autoencoder in an earlier cell)
model = xgb.XGBClassifier()

# Training the model
# NOTE(review): early stopping is driven by the test split, so the test accuracy
# printed below is not an unbiased estimate — consider a separate validation split.
# NOTE(review): newer xgboost releases expect early_stopping_rounds/eval_metric on
# the estimator constructor instead of fit() — confirm against the pinned version.
eval_set = [(X_test, y_test)]
model.fit(X_train, y_train, early_stopping_rounds=10, eval_metric="logloss", eval_set=eval_set, verbose=False)

y_pred_train = model.predict(X_train)
accuracy_train = accuracy_score(y_train, y_pred_train)
print("Train Accuracy:", accuracy_train)

# Making predictions on the test set
y_pred = model.predict(X_test)

# Evaluating the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

In [None]:
# file: building_xg_boost_model.ipynb
import pickle

# Save the trained model to a file
filename = 'xgboost_model.pkl'
# The context manager closes (and flushes) the file even if pickling fails.
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)

In [None]:
# file: building_xg_boost_model.ipynb
# TO TEST LOADING THE SAVED MODEL

# Load the saved model from file
# NOTE: unpickling runs arbitrary code — only load model files you created yourself.
with open(filename, 'rb') as model_file:
    classifier_model = pickle.load(model_file)

# Use the loaded model for predictions
y_pred_loaded = classifier_model.predict(X_test)

# Evaluate the loaded model
accuracy_loaded = accuracy_score(y_test, y_pred_loaded)
print("Accuracy of loaded model:", accuracy_loaded)

## Re-captioning the Chosen Dataset

In this section, the obtained dataset will be improved accordingly by the help of the already available Generative AI

### Gemini Recaptioning Class
*source: gemini_recaptioning.py*

Re-captioning is essential to provide more details of the generated video to the dataset.

In [None]:
# file: gemini_recaptioning.py
from IPython.display import Markdown
import google.generativeai as genai

import json
import textwrap
import PIL.Image
import os

from dotenv import load_dotenv
load_dotenv()

def to_markdown(text):
    """Render ``text`` as a Markdown block quote, turning '•' bullets into '*' list items."""
    bulleted = text.replace('•', '  *')
    # The predicate prefixes every line, including blank ones.
    quoted = textwrap.indent(bulleted, '> ', predicate=lambda _line: True)
    return Markdown(quoted)

class GeminiRecaptioning:
    """Ask the Gemini vision model for a more detailed caption of a scene.

    The class keeps the dataset metadata so that the original caption of a
    scene can be looked up from the scene's frame-folder name.
    """

    def __init__(self, api_key, data):
        """Configure the Gemini client.

        Args:
            api_key: key handed to ``genai.configure``.
            data: dataset metadata; ``run`` reads
                ``data[video_id]['clip'][clip_id + '.mp4']['scene_split']``.
        """
        # Every listed harm category is set to BLOCK_NONE.
        self.safety_settings = [
            {
                "category": "HARM_CATEGORY_DANGEROUS",
                "threshold": "BLOCK_NONE",
            },
            {
                "category": "HARM_CATEGORY_HARASSMENT",
                "threshold": "BLOCK_NONE",
            },
            {
                "category": "HARM_CATEGORY_HATE_SPEECH",
                "threshold": "BLOCK_NONE",
            },
            {
                "category": "HARM_CATEGORY_SEXUALLY_EXPLICIT",
                "threshold": "BLOCK_NONE",
            },
            {
                "category": "HARM_CATEGORY_DANGEROUS_CONTENT",
                "threshold": "BLOCK_NONE",
            },
        ]
        self.api_key = api_key
        self.data = data

        genai.configure(api_key=api_key)
        self.model = genai.GenerativeModel('gemini-pro-vision')

        # Frames are resized to this size before being added to the prompt.
        self.RESOLUTION = (240, 240)

    def __generate_prompt(self, images, caption):
        """Build the prompt parts: instruction strings followed by the frames.

        Args:
            images: frames to attach; each one is appended once, in order.
            caption: original caption quoted in the first instruction.

        Returns:
            A list with the six instruction strings followed by ``images``.
        """
        prompt = [f"The original caption to be improved is: {caption}.",
                  "Provide additional visual details from these video frames with these following rules:",
                  "All lowercase",
                  "No more than 100 words",
                  "Only use information available in the image and no assumptions",
                  "Answer in English only",]
        prompt.extend(images)
        return prompt

    def run(self, frames_folder_path: str) -> str:
        """Return a new caption for the scene stored in *frames_folder_path*.

        The last path component is the scene name. The video id is the part
        before its last '.', and the clip id is the part before its last '_'.

        Args:
            frames_folder_path: '/'-separated path whose last component names
                the scene.

        Returns:
            The text of the model response.

        Raises:
            KeyError: the video or clip is missing from the metadata.
            IndexError: no scene of the clip has a matching ``clip_id``.
            FileNotFoundError: the frame folder is empty or a frame is missing.
        """
        # Tolerate a trailing slash, which would otherwise yield an empty name.
        scene_name = frames_folder_path.rstrip('/').split('/')[-1]
        video_id = scene_name.rpartition('.')[0]
        clip_id = scene_name.rpartition('_')[0]

        scenes = self.data[video_id]['clip'][clip_id + ".mp4"]['scene_split']
        scene_info = next((scene for scene in scenes if scene['clip_id'] == scene_name), None)
        if scene_info is None:
            # IndexError is what the previous `[0]` lookup raised.
            raise IndexError(f"scene {scene_name!r} not found in metadata of clip {clip_id!r}")

        # NOTE(review): frames are read from this fixed location, not from
        # frames_folder_path itself - confirm both always point to the same folder.
        folder_path = f'./frames_output/{video_id}/{scene_name}'
        count = len(os.listdir(folder_path))
        if count == 0:
            raise FileNotFoundError(f"no frames found in {folder_path}")

        # First, middle and last frame; the set collapses duplicates when the
        # folder holds fewer than three entries.
        # Assumes frames are named 0000.jpg, 0001.jpg, ... without gaps.
        indexes = sorted({0, count // 2, count - 1})

        images = []
        for index in indexes:
            image_path = f'{folder_path}/{index:0>4}.jpg'
            # resize() returns an independent image, so the file handle can be
            # released right away.
            with PIL.Image.open(image_path) as frame:
                images.append(frame.resize(self.RESOLUTION))

        caption = scene_info['caption']
        # The prompt builder already attaches the frames; appending them here
        # again would send every frame twice.
        content = self.__generate_prompt(images, caption)

        response = self.model.generate_content(content, safety_settings=self.safety_settings)
        return response.text

## Evaluation

In this section, we will provide evidence to prove that our chosen dataset works better than a randomly chosen dataset of the same size

### Evaluation Function
*source: evaluation.py*

Evaluation is necessary to check whether our reduced dataset scores better than the base dataset. Since our reduced dataset is smaller, the base dataset must be reduced to the same size, which is done through random sampling. There are 2 features that we consider crucial for the comparison: video clips and captions. Our whole evaluation process is done through 2 different pipelines: video quality checking and caption quality checking.

In [None]:
# file: evaluation.py
import open_clip
import torch
import PIL
import json
import os
import warnings
warnings.filterwarnings("ignore")

import logging
logging.basicConfig(filename = "eval.log", level = logging.INFO)

def get_eval_dataset(path):
    """Read the JSON file at *path* and return the value of its 'scenes' key."""
    with open(path, 'r') as handle:
        payload = json.load(handle)

    return payload['scenes']


def get_eval_model():
    """Load the pretrained OpenCLIP checkpoint used for evaluation.

    Returns:
        A ``(model, preprocess, tokenizer)`` tuple, all created from the same
        hub identifier.
    """
    hub_id = 'hf-hub:laion/CLIP-ViT-g-14-laion2B-s12B-b42K'
    clip_model, image_transform = open_clip.create_model_from_pretrained(hub_id)
    text_tokenizer = open_clip.get_tokenizer(hub_id)

    return clip_model, image_transform, text_tokenizer

#### Video Quality Checking

To compare the quality of video, it is essential to keep the other variable (the caption) modified the same way. In our case, both the captions are re-generated, and thus, we are comparing *Re-captioned Reduced Dataset* vs *Re-captioned Randomly Sampled Dataset*. 

In [None]:
# file: evaluation.py
def eval_different_dataset(eval_dataset1, eval_dataset2, preprocess, model, tokenizer, device, logging):
    """Compare caption/frame similarity of two recaptioned datasets.

    The datasets are walked in lockstep. For every pair of scenes, every
    second frame (in sorted file-name order, up to the shorter scene's frame
    count) is embedded and compared with the scene's recaption. Negative
    similarities are clamped to 0 and the per-scene score is the mean over
    the frames that were actually scored.

    Args:
        eval_dataset1: scenes of the first dataset; each scene is a mapping
            with 'frames_path' and 'recaption'.
        eval_dataset2: scenes of the second dataset, same layout.
        preprocess: callable turning a PIL image into a model input tensor.
        model: object exposing ``encode_text`` and ``encode_image``.
        tokenizer: callable turning a list of strings into a token tensor.
        device: device the inputs are moved to.
        logging: logger-like object; only ``info`` is called.

    Returns:
        ``(score1, score2)``: mean per-scene score of each dataset over the
        scene pairs that were compared, or ``(0.0, 0.0)`` when there are none.
    """
    # This evaluation pipeline is used to evaluate the video quality
    cosine = torch.nn.functional.cosine_similarity
    print('Evaluating caption and image similarity on sampled original and refined datasets')
    total_score1 = 0.0
    total_score2 = 0.0
    n_pairs = 0

    # Evaluate the similarity between the caption and the image of randomized and refined datasets
    # Both of these datasets have been recaptioned
    for pair_idx, (data1, data2) in enumerate(zip(eval_dataset1, eval_dataset2)):
        frames_path1, caption1 = data1['frames_path'], data1['recaption']
        frames_path2, caption2 = data2['frames_path'], data2['recaption']

        text_input1 = tokenizer([caption1]).to(device)
        text_input2 = tokenizer([caption2]).to(device)

        # os.listdir gives no ordering guarantee; sort so that the sampled
        # frames are reproducible.
        frame_list1 = sorted(os.listdir(frames_path1))
        frame_list2 = sorted(os.listdir(frames_path2))

        with torch.inference_mode():
            text_features1 = model.encode_text(text_input1)
            text_features2 = model.encode_text(text_input2)

        sim1, sim2 = 0.0, 0.0

        length = min(len(frame_list1), len(frame_list2))
        print(f'length: {length}')

        # Only every second frame is scored. A dedicated index name keeps the
        # scene counter of the outer loop intact.
        sampled_indexes = range(0, length, 2)
        for frame_idx in sampled_indexes:
            endpath1 = frame_list1[frame_idx]
            endpath2 = frame_list2[frame_idx]
            print(f'{endpath1} | {endpath2}')
            imagepath1 = frames_path1 + '/' + endpath1
            imagepath2 = frames_path2 + '/' + endpath2

            # Preprocess inside the context manager so the file handles are released.
            with PIL.Image.open(imagepath1) as image1:
                image_input1 = preprocess(image1).unsqueeze(0).to(device)
            with PIL.Image.open(imagepath2) as image2:
                image_input2 = preprocess(image2).unsqueeze(0).to(device)

            with torch.inference_mode():
                image_features1 = model.encode_image(image_input1)
                image_features2 = model.encode_image(image_input2)

            # Calculate the similarity between the image and the caption of these two datasets.
            # Converting to float first keeps the accumulator a plain number
            # even when every similarity is clamped to 0.
            sim1 += max(100 * cosine(image_features1, text_features1).item(), 0.0)
            sim2 += max(100 * cosine(image_features2, text_features2).item(), 0.0)

        # Average over the frames that were scored, not over all frames.
        n_sampled = len(sampled_indexes)
        if n_sampled:
            sim1 /= n_sampled
            sim2 /= n_sampled

        # Update the total score of each dataset
        total_score1 += sim1
        total_score2 += sim2
        n_pairs = pair_idx + 1
        print(f'Random Dataset similarity: {total_score1/n_pairs} | Filtered Dataset similarity: {total_score2/n_pairs}')
        logging.info(f'Random Dataset similarity: {total_score1/n_pairs} | Filtered Dataset similarity: {total_score2/n_pairs}')

    if n_pairs == 0:
        return 0.0, 0.0

    # zip() stops at the shorter dataset, so both means use the pair count.
    return total_score1 / n_pairs, total_score2 / n_pairs

#### Caption Quality Checking
Similarly, to compare the quality of the captions, it is essential to keep the videos modified the same way. Since the videos are not processed, we are comparing *Re-captioned Reduced Dataset* vs *Original Reduced Dataset*.

In [None]:
def eval_same_dataset(eval_dataset, preprocess, model, tokenizer, device, logging):
    """Compare original captions with recaptions on the same scenes.

    For every scene, every second frame (in sorted file-name order) is
    embedded and compared with both the original caption and the recaption.
    Negative similarities are clamped to 0 and the per-scene score is the
    mean over the frames that were actually scored.

    Args:
        eval_dataset: scenes; each scene is a mapping with 'frames_path',
            'caption' and 'recaption'.
        preprocess: callable turning a PIL image into a model input tensor.
        model: object exposing ``encode_text`` and ``encode_image``.
        tokenizer: callable turning a list of strings into a token tensor.
        device: device the inputs are moved to.
        logging: logger-like object; only ``info`` is called.

    Returns:
        ``(caption_score, recaption_score)``: mean per-scene scores, or
        ``(0.0, 0.0)`` when the dataset is empty.
    """
    # This evaluation pipeline is used to evaluate the caption quality
    cosine = torch.nn.functional.cosine_similarity
    print('Evaluating caption and recaption similarity on the same dataset')
    total_caption_score = 0.0
    total_recaption_score = 0.0
    n_scenes = 0

    for scene_idx, data in enumerate(eval_dataset):
        frames_path, caption, recaption = data['frames_path'], data['caption'], data['recaption']

        caption_input = tokenizer([caption]).to(device)
        recaption_input = tokenizer([recaption]).to(device)

        with torch.inference_mode():
            caption_features = model.encode_text(caption_input)
            recaption_features = model.encode_text(recaption_input)

        caption_image_sim = 0.0
        recaption_image_sim = 0.0

        # os.listdir gives no ordering guarantee; sort so that the sampled
        # frames are reproducible, then keep every second one.
        sampled_frames = sorted(os.listdir(frames_path))[::2]
        for endpath in sampled_frames:
            imagepath = frames_path + '/' + endpath
            print(endpath)
            # Preprocess inside the context manager so the file handle is released.
            with PIL.Image.open(imagepath) as image:
                image_input = preprocess(image).unsqueeze(0).to(device)
            with torch.inference_mode():
                image_features = model.encode_image(image_input)

            # Calculate the similarity between the image and the caption.
            # Converting to float first keeps the accumulator a plain number
            # even when every similarity is clamped to 0.
            caption_image_sim += max(100 * cosine(image_features, caption_features).item(), 0.0)
            recaption_image_sim += max(100 * cosine(image_features, recaption_features).item(), 0.0)

        # Average over the frames that were scored, not over all frames;
        # a scene without frames contributes 0.
        if sampled_frames:
            caption_score = caption_image_sim / len(sampled_frames)
            recaption_score = recaption_image_sim / len(sampled_frames)
        else:
            caption_score = 0.0
            recaption_score = 0.0

        # Update the total score of each dataset
        total_caption_score += caption_score
        total_recaption_score += recaption_score
        n_scenes = scene_idx + 1
        print(f'image {n_scenes} | caption: {total_caption_score/n_scenes} | recaption: {total_recaption_score/n_scenes}')
        logging.info(f'image {n_scenes} | caption: {total_caption_score/n_scenes} | recaption: {total_recaption_score/n_scenes}')

    if n_scenes == 0:
        return 0.0, 0.0

    return total_caption_score / n_scenes, total_recaption_score / n_scenes


## Inference

In this section, the whole process would be executed to obtain the result needed based on the given dataset

### Inference Process
*source: inference.py*

We will follow this pipeline in order to filter the dataset:
1. Prepare the PyTorch Dataset (sample only each 100 videos due to time and memory constraints). This includes downloading videos, splitting and cutting clips.
2. Calculate the filtering metrics, i.e. Staticity and Motion of Video
3. Use XGBoost to classify good/bad video, with filtering metrics as the features
4. Recaption the chosen video clips
5. Evaluate whether the chosen video is better than random picking
6. Evaluate whether re-captioning is better than not recaptioning

In [None]:
import warnings
warnings.filterwarnings("ignore")

import logging
logging.basicConfig(filename = "eval.log", level = logging.INFO)

from dotenv import load_dotenv
load_dotenv()
os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'

def select_random_scenes(dataset, n_taken):
    """Return up to *n_taken* distinct random indices into *dataset* as a tensor."""
    total = len(dataset)
    shuffled = torch.randperm(total)
    return torch.arange(total)[shuffled[:n_taken]]

In [None]:
# Batching and sampling configuration.
# NOTE(review): the inference cell reads its batching values from config.yaml
# rather than from these upper-case constants - confirm which is authoritative.
N_VIDEOS_PER_BATCH = 4
N_TOTAL_VIDEOS = 18_750
N_TOTAL_CLIPS = 1_500_000
TOTAL_CLIPS_TAKEN = 10_000
# Inclusive index range [CLIPS_IDX_START, CLIPS_IDX_END] covering LENGTH entries.
CLIPS_IDX_START = 100
LENGTH = 4
CLIPS_IDX_END = CLIPS_IDX_START + LENGTH - 1
# Scale the overall clip budget by one batch's share of all videos, keeping at least one clip.
CLIPS_TAKEN_PER_BATCH = max(1, int(N_VIDEOS_PER_BATCH / N_TOTAL_VIDEOS * TOTAL_CLIPS_TAKEN))

# Inputs used by the inference cell: dataset metafile, pickled classifier,
# and the Gemini API key read from the environment.
metafile_path = './metafiles/hdvg_0.json'
classifier_filename = 'xgboost_model.pkl'
api_key = os.getenv("GEMINI_API_KEY")

In [None]:
# file: inference.py

def _recaption_scenes(scenes, recaptioner, max_attempts=3):
    """Store a 'recaption' entry on every scene dict, in place.

    Each scene is attempted up to *max_attempts* times. A scene whose attempts
    all fail keeps an empty recaption so the rest of the batch can continue;
    every failure is logged instead of being silently dropped.

    Raises:
        FileNotFoundError: a scene's 'frames_path' does not exist.
    """
    for scene in scenes:
        frames_path = scene['frames_path']
        if not os.path.exists(frames_path):
            raise FileNotFoundError(f"frames path does not exist: {frames_path}")
        print(f"Recaptioning for {frames_path}")

        recaption = ""
        for attempt in range(1, max_attempts + 1):
            try:
                recaption = recaptioner.run(frames_path).strip()
                break
            except Exception as exc:
                # Best effort: keep going, but leave a trace of what failed.
                # KeyboardInterrupt / SystemExit are deliberately not caught.
                logging.warning("Recaptioning attempt %d/%d failed for %s: %r",
                                attempt, max_attempts, frames_path, exc)

        if not recaption:
            logging.warning("No recaption obtained for %s; storing an empty string", frames_path)
        scene['recaption'] = recaption


# read the config file
with open('config.yaml', 'r') as f:
    config = yaml.safe_load(f)

# setting the device
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f'using device: {device}')

# Parse the arguments
n_videos_per_batch = config["n_videos_per_batch"]
clip_idx_start = config["clip_idx_start"]
clip_idx_end = config["clip_idx_end"]
store_intermediate_json = config["store_intermediate_json"]
clips_taken_per_batch = max(1, int(config["clips_taken_per_batch"]))
print(f'CLIPS_TAKEN_PER_BATCH: {clips_taken_per_batch}')

# reading the metafile data
with open(metafile_path, 'r') as f:
    data = json.load(f)

# Load the model for filtering
model = get_model()
model.to(device)

inference_output_dir = 'frame_score_results'
os.makedirs(inference_output_dir, exist_ok=True)

# Load the classifier model.
# NOTE: pickle.load executes arbitrary code from the file - only load a
# classifier file produced by a trusted source.
with open(classifier_filename, 'rb') as f:
    classifier_model = pickle.load(f)

# Create an instance of the GeminiRecaptioning class
gemini_recaptioning = GeminiRecaptioning(api_key, data)

# Load the model for evaluation
eval_model, preprocess, tokenizer = get_eval_model()
eval_model = eval_model.to(device)

# NOTE(review): for the last batch, j can exceed clip_idx_end when the range
# is not a multiple of n_videos_per_batch - confirm how VideoDataset treats it.
for i in range(clip_idx_start, clip_idx_end+1, n_videos_per_batch):
    j = i + n_videos_per_batch - 1
    print(f'Processing Video Index {i}-{j}...')
    logging.info(f'Processing Video Index {i}-{j}...')

    starttime = time.time()
    # getting the dataset
    dataset = VideoDataset(data, i, j)

    # getting the filtering metrics
    res = get_metrics(dataset, model, device)

    if store_intermediate_json:
        # save the result if specified
        with open(os.path.join(inference_output_dir, f'inference_result_{i}-{j}.json'), 'w') as f:
            json.dump(res, f)
        print(f"Saved to json: inference_result_{i}-{j}.json")

    # filter the scenes, then recaption the ones that were kept
    filtered_scenes = filter_scenes(res, clips_taken_per_batch, classifier_model)
    filtered_scenes = [dataset[idx] for idx in filtered_scenes]
    _recaption_scenes(filtered_scenes, gemini_recaptioning)

    # save information of selected into json
    json_info_selected = {
        'length': len(filtered_scenes),
        'scenes': filtered_scenes
    }

    print(f'Total time: {(time.time() - starttime):.2f}s')

    # select CLIPS_TAKEN_PER_BATCH random idx from the dataset as the baseline
    random_scenes = select_random_scenes(dataset, clips_taken_per_batch)
    random_scenes = [dataset[idx] for idx in random_scenes]
    _recaption_scenes(random_scenes, gemini_recaptioning)

    json_info_random = {
        'length': len(random_scenes),
        'scenes': random_scenes
    }

    if store_intermediate_json:
        json_filename = f'random_scenes_{i}-{j}.json'
        with open(os.path.join(inference_output_dir, json_filename), 'w') as f:
            json.dump(json_info_random, f)
        print(f"Saved to json: {json_filename}")

    # Run the evaluation
    print("Running the evaluation...")

    total_caption_score, total_recaption_score = eval_same_dataset(json_info_selected["scenes"], preprocess, eval_model, tokenizer, device, logging)
    print(f"Total caption score: {total_caption_score}")
    print(f"Total recaption score: {total_recaption_score}")

    total_score_random, total_score_selected  = eval_different_dataset(json_info_random["scenes"], json_info_selected["scenes"], preprocess, eval_model, tokenizer, device, logging)
    print(f"Total score random: {total_score_random}")
    print(f"Total score selected: {total_score_selected}")

    # Add logging
    logging.info(f"Total caption score: {total_caption_score}")
    logging.info(f"Total recaption score: {total_recaption_score}")
    logging.info(f"Total score random: {total_score_random}")
    logging.info(f"Total score selected: {total_score_selected}")

    json_info_selected["total_caption_score"] = total_caption_score
    json_info_selected["total_recaption_score"] = total_recaption_score
    json_info_selected["total_score_random"] = total_score_random
    json_info_selected["total_score_selected"] = total_score_selected

    # Save the result to a json file
    json_filename = f'{str(i).zfill(5)}-{str(j).zfill(5)}.json'
    with open(os.path.join(inference_output_dir, json_filename), 'w') as f:
        json.dump(json_info_selected, f)

    print(f"Saved to json: {json_filename}")