# Setup

In [1]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:
# add to path to allow import
import sys
sys.path.append('/content/gdrive/MyDrive/CIS6200ProjectWork/cis6200projectwork2/TempoTokens')

In [3]:
# at this point it has the exact same thing as the outer directory
%cd /content/gdrive/MyDrive/CIS6200ProjectWorkNOTGIT/cis6200projectwork2/TempoTokens

/content/gdrive/MyDrive/CIS6200ProjectWorkNOTGIT/cis6200projectwork2/TempoTokens


In [None]:
! pip install git+https://github.com/openai/CLIP.git # get clip - this is for CLIP Score

! pip install moviepy
! pip install torchaudio
! pip install compel

! pip install fvcore # this one's needed for the 3d resnet model
! pip install av # for pytorch video - inception score and fvd score
! pip install "git+https://github.com/facebookresearch/pytorchvideo.git"

In [5]:
# imports for
import cv2 # for reading video frames
import torch
from PIL import Image # used for casting the images for inception score, may be used for more
import pandas as pd # for dealing with the csv files
import os

import numpy as np

In [6]:
device = "cpu" # "cuda"

# this is for fvd and inception score
model = torch.hub.load('facebookresearch/pytorchvideo', 'slow_r50', pretrained=True) # keep a copy of the model out of the class
model = model.eval()
model = model.to(device)

Downloading: "https://github.com/facebookresearch/pytorchvideo/zipball/main" to /root/.cache/torch/hub/main.zip
Downloading: "https://dl.fbaipublicfiles.com/pytorchvideo/model_zoo/kinetics/SLOW_8x8_R50.pyth" to /root/.cache/torch/hub/checkpoints/SLOW_8x8_R50.pyth
100%|██████████| 248M/248M [00:08<00:00, 30.2MB/s]


# Set up parameters and directories

In [53]:
generated_vids_path = "/content/gdrive/Shareddrives/CIS 6200 Final Project/data/Mix/dog/" # this must be changed
gen_vids_npy_path = "/content/gdrive/Shareddrives/CIS 6200 Final Project/data/Mix/mix_dog_fvd.npy"

# do not change these:
real_vids_path = "/content/gdrive/Shareddrives/CIS 6200 Final Project/data/Dog/dog_test_video/"
real_vids_npy_path = "/content/gdrive/Shareddrives/CIS 6200 Final Project/data/Dog/dog_test_video_fvd.npy"
# this one is for CLIP score
text_desc = "Dog Barking"
# this one is only for av align
audio_path = "/content/gdrive/Shareddrives/CIS 6200 Final Project/data/Dog/dog_test_audio/"


## Inception

In [79]:
import evaluation.Inception_Score as inceptionscore

In [80]:
cavp_inception_score_calc = inceptionscore.InceptionScoreCalculator(generated_vids_path,
                                                        device = device, model = model)

Inception Score: Using 149 videos in directory: /content/gdrive/Shareddrives/CIS 6200 Final Project/data/volleyball/Tempotoken_CAVP_generated/


In [81]:
cavp_inception_score_calc.calculate_inception_score_batched(splits = 10)

100%|██████████| 10/10 [10:35<00:00, 63.59s/it]


5.01893

## CLIP score (working)

In [54]:
import evaluation.CLIP_Score as clipscore
import clip

In [55]:
clip_model, _ = clip.load("ViT-B/32", device=device)
_ = clip_model.eval()

In [56]:
clipscorecalc = clipscore.CLIPScoreCalculator(generated_vids_path,
                                     model = clip_model, text_desc = text_desc,
                                     device = device)
clipscorecalc.calculate_clip_score()

CLIP Score: Using 100 videos in directory: /content/gdrive/Shareddrives/CIS 6200 Final Project/data/Mix/dog/
Using text description: Dog Barking


100%|██████████| 100/100 [10:18<00:00,  6.19s/it]


0.2283387390275796

# AV-Align

In [57]:
import av_align as av_score

In [58]:
# get wav files
video_dir = generated_vids_path
filenames = [i[:-6] for i in os.listdir(video_dir) if (i[-6:] == '_c.mp4')]
audio_dir = audio_path # directory with input audios of the form filename.wav

In [None]:
score = 0
for file in filenames:

    video_path = f'{video_dir}{file}_c.mp4'
    print(video_path)
    audio_path = f'{audio_dir}{file}.wav'
    print(audio_path)

    frames, fps = av_score.extract_frames(video_path)

    audio_peaks = av_score.detect_audio_peaks(audio_path)
    flow_trajectory, video_peaks = av_score.detect_video_peaks(frames, fps)

    score += av_score.calc_intersection_over_union(audio_peaks, video_peaks, fps)

print("Done!")
print('AV-Align: ', score/len(filenames))

# FVD Score

In [60]:
model = torch.hub.load('facebookresearch/pytorchvideo', 'slow_r50', pretrained=True)
device = "cpu"
model = model.to(device)
_ = model.eval()

Using cache found in /root/.cache/torch/hub/facebookresearch_pytorchvideo_main


In [62]:
from scipy.linalg import sqrtm
import os
import torch
import torch.nn as nn
import torchvision.transforms as transforms
import torch.nn.functional as F
from PIL import Image
import numpy as np
from scipy.stats import entropy
import os
from tqdm import tqdm
from pytorchvideo.data.encoded_video import EncodedVideo

from torchvision.transforms import Compose, Lambda
from torchvision.transforms._transforms_video import (
    CenterCropVideo,
    NormalizeVideo,
)
from pytorchvideo.transforms import (
    ApplyTransformToKey,
    ShortSideScale,
    UniformTemporalSubsample
)

class VideoBatchPrepper:

    def __init__(self, vid_dir_path, device = "cpu", model = None):
        self.device = device

        if model is None:
            self.model = torch.hub.load('facebookresearch/pytorchvideo', 'slow_r50', pretrained=True)
            self.model = self.model.eval()
            self.model = self.model.to(self.device)
        else:
            self.model = model # assumes that self.model has been set to eval mode and sent to device already

        # specified for the model
        self.side_size = 256
        self.mean = [0.45, 0.45, 0.45]
        self.std = [0.225, 0.225, 0.225]
        self.crop_size = 256
        self.num_frames = 8
        self.sampling_rate = 8
        self.frames_per_second = 30

        self.transform =  ApplyTransformToKey(
            key="video",
            transform=Compose(
                [
                    UniformTemporalSubsample(self.num_frames),
                    Lambda(lambda x: x/255.0),
                    NormalizeVideo(self.mean, self.std),
                    ShortSideScale(
                        size=self.side_size
                    ),
                    CenterCropVideo(crop_size=(self.crop_size, self.crop_size))
                ]
            ),
        )

        self.clip_duration = (self.num_frames * self.sampling_rate)/self.frames_per_second

        self.start_sec = 0 # should correspond to where the action happens in the video, but set to 0 for this.
        self.end_sec = self.start_sec + self.clip_duration

        # get the videos
        self.vid_dir_path = vid_dir_path
        self.vid_names = [i for i in os.listdir(vid_dir_path) if (i[-4:] == '.mp4')]
        print(f"Video Batch Preparer: Using {len(self.vid_names)} videos in directory: {self.vid_dir_path}")

    def get_video_batch(self, batch_start = 0, batch_end = None):
        if batch_end is None:
            batch_end = len(self.vid_names)
        vid_data_tgt = []
        for vid_name in self.vid_names[batch_start:batch_end]:
          vid_file_path = self.vid_dir_path + vid_name
          vid_data = self.transform(EncodedVideo.from_path(vid_file_path).get_clip(start_sec=self.start_sec,end_sec=self.end_sec))["video"]
          vid_data_tgt.append(vid_data)
        return(torch.stack(vid_data_tgt)) # end user must put it on the device

In [63]:
real_vid_dir_path = real_vids_path
real_vid_names = [i for i in os.listdir(real_vid_dir_path) if (i[-4:] == '.mp4')]
real_vb = VideoBatchPrepper(real_vid_dir_path, device = "cuda", model = model) # this is the exact same thing as for the inception score

real_features = np.zeros((len(real_vid_names), 400)) # we know it's 8192 bc thats what the model automatically pools to
batch_size = 4
if len(real_vid_names) % batch_size != 0:
  num_batches = len(real_vid_names) // batch_size + 1
else:
  num_batches = len(real_vid_names) // batch_size
for i in tqdm(range(num_batches)):
  start_idx = i * batch_size
  end_idx = (i + 1) * batch_size if i < num_batches - 1 else len(real_vid_names)
  realbatch1 = real_vb.get_video_batch(batch_start = start_idx, batch_end = end_idx) # todo: copy and paste this code into the fvd thing too, clean up
  realbatch1 = realbatch1.to(device)
  with torch.no_grad():
    testres = model(realbatch1)
  testres = testres.cpu().numpy()
  real_features[start_idx:end_idx, :] = testres
print(f"\nsaving to: {real_vids_npy_path}")
np.save(real_vids_npy_path, real_features)

Video Batch Preparer: Using 101 videos in directory: /content/gdrive/Shareddrives/CIS 6200 Final Project/data/Dog/dog_test_video/


100%|██████████| 26/26 [06:30<00:00, 15.02s/it]



saving to: /content/gdrive/Shareddrives/CIS 6200 Final Project/data/Dog/dog_test_video_fvd.npy


In [69]:
real_features = np.load(real_vids_npy_path)

In [65]:
gen_vid_dir_path = generated_vids_path
gen_vid_names = [i for i in os.listdir(gen_vid_dir_path) if (i[-4:] == '.mp4')]
gen_vb = VideoBatchPrepper(gen_vid_dir_path, device = "cuda", model = model) # this is the exact same thing as for the inception score

Video Batch Preparer: Using 100 videos in directory: /content/gdrive/Shareddrives/CIS 6200 Final Project/data/Mix/dog/


In [66]:
gen_features = np.zeros((len(gen_vid_names), 400))
batch_size = 4
if len(gen_vid_names) % batch_size != 0:
  num_batches = len(gen_vid_names) // batch_size + 1
else:
  num_batches = len(gen_vid_names) // batch_size
for i in tqdm(range(num_batches)):
  start_idx = i * batch_size
  end_idx = (i + 1) * batch_size if i < num_batches - 1 else len(gen_vid_names)
  batch = gen_vb.get_video_batch(batch_start = start_idx, batch_end = end_idx) # todo: copy and paste this code into the fvd thing too, clean up
  batch = batch.to(device)
  with torch.no_grad():
    res = model.forward(batch)
  res = res.cpu().numpy() # these are the features
  gen_features[start_idx:end_idx, :] = res
print(f"\nsaving to: {gen_vids_npy_path}")
np.save(gen_vids_npy_path, gen_features)

100%|██████████| 25/25 [04:20<00:00, 10.43s/it]


saving to: /content/gdrive/Shareddrives/CIS 6200 Final Project/data/Mix/mix_dog_fvd.npy





In [34]:
# optional:
# gen_features = np.load(gen_vids_npy_path)

In [67]:
# get the mean and covariance of real features
realmean = np.mean(real_features, axis = 0)
realcov = np.cov(real_features, rowvar = False)
# get mean and cov of generated features
genmean = np.mean(gen_features, axis = 0)
gencov = np.cov(gen_features, rowvar = False)

In [68]:
ssdiff = np.sum((realmean - genmean)**2)
covmean = sqrtm(realcov @ gencov, disp = False)[0]
if np.iscomplexobj(covmean):
    covmean = covmean.real

fvd = ssdiff + np.trace(realcov + gencov - 2 * covmean)
print(fvd)

-1744828002.4575338
