# Setup
This section sets up the environment for dataset creation. First, we install and import the required packages and modules: PyAV, PyTorch (and PyTorch Lightning), youtube-dl and FFmpeg. We then create the dataset folders and configure our crawler parameters.

In [None]:
# Mount Google Drive at /content/drive; the final cells move the saved
# dataset files to a folder below this mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Installs
# Python packages: av (PyAV), plus youtube_dl and pytorch_lightning, which are
# upgraded to their newest release.
!pip install av
!pip install --upgrade youtube_dl
!pip install --upgrade pytorch_lightning
# System packages ffmpeg and lame; -y answers the apt prompt automatically.
!apt -y install ffmpeg lame

In [None]:
# Imports
import requests
import os

import os.path as osp
import math
import random
import pickle
import warnings

import glob
import h5py
import numpy as np

import torch
import torch.utils.data as data
import torch.nn.functional as F
from torchvision.datasets.video_utils import VideoClips
import pytorch_lightning as pl

In [None]:
# Setup Dataset
# exist_ok=True keeps this cell re-runnable: without it os.makedirs raises
# FileExistsError whenever the folders of a previous run are still present.
os.makedirs("video_dataset", exist_ok=True)
os.makedirs("video_dataset/trainTANAY", exist_ok=True)
os.makedirs("video_dataset/testTANAY", exist_ok=True)

In [None]:
# CONFIG
# NOTE(review): the three lines below are presumably the index ranges of the
# video list assigned to each team member - confirm with the authors.
# Tanay is 1400 - 3000
# Nithya is 3000 - 4800 (Start at 30)
# Cathleen is 4800 - END (Roughly 6300) (Start at 48)
# Probability with which VideoDataset keeps each extracted clip.
KEEP_PROB = 0.2
# Increment every time
# (the chunk numbers become part of the saved file names,
# e.g. train_dataset_part15.pt / test_dataset_part14.pt)
OVERALL_CHUNK_TRAIN = 15
OVERALL_CHUNK_TEST = 14
# Increase by between 100 and 250
# Train videos are video_list[START_TRAIN:START_TEST],
# test videos are video_list[START_TEST:END_TEST].
START_TRAIN = 2000
START_TEST = 2200
END_TEST = 2250 # 80% Train, 20% Test

# Datasets and Dataloaders

In [None]:
# Fetch video list
video_list_url = "https://data.csail.mit.edu/tofu/dataset/original_video_list.txt"
# NOTE(review): ('user', 'pass') look like placeholder credentials - supply the
# real ones at run time instead of committing them to the notebook.
# The timeout keeps the cell from hanging forever on an unresponsive server.
r = requests.get(video_list_url, auth=('user', 'pass'), timeout=30)
# Fail loudly on HTTP errors (e.g. 401 or 404); otherwise the body of the error
# page would be split below and treated as a list of video URLs.
r.raise_for_status()
# One list entry per whitespace-separated token of the response body.
full_video_list = r.text.split()

def splits(video_list, start_train=None, start_test=None, end_test=None):
    """Slice a video list into contiguous train and test portions.

    Args:
        video_list: Sequence of videos to split.
        start_train: Index of the first training item. Defaults to the
            module-level ``START_TRAIN``.
        start_test: Index of the first test item, which is also one past the
            last training item. Defaults to the module-level ``START_TEST``.
        end_test: Index one past the last test item. Defaults to the
            module-level ``END_TEST``.

    Returns:
        Tuple ``(train_list, test_list)`` holding
        ``video_list[start_train:start_test]`` and
        ``video_list[start_test:end_test]``.
    """
    # The globals are looked up at call time, so re-running the CONFIG cell
    # takes effect without redefining this function.
    if start_train is None:
        start_train = START_TRAIN
    if start_test is None:
        start_test = START_TEST
    if end_test is None:
        end_test = END_TEST

    train_list = video_list[start_train:start_test]
    test_list = video_list[start_test:end_test]
    return train_list, test_list

# Test and train splits
train_list, test_list = splits(full_video_list)

# Size of the complete fetched list (not of either split).
print(len(full_video_list))

In [None]:
# download train videos
# youtube-dl options: -f picks the best video-only format whose height is at
# most 480, -o names the file after the video title inside the split folder,
# -q suppresses the downloader's output.
# NOTE(review): `ind` is never used, so enumerate() is not needed here.
for ind,url in enumerate(train_list):
  !youtube-dl $url -f 'bestvideo[height<=480]' -o 'video_dataset/trainTANAY/%(title)s.mp4' -q

# download test videos
# Same options as above; only the output folder differs.
for ind,url in enumerate(test_list):
  !youtube-dl $url -f 'bestvideo[height<=480]' -o 'video_dataset/testTANAY/%(title)s.mp4' -q

In [None]:
# Adapted from https://github.com/wilson1yan/VideoGPT
def preprocess(video, resolution, sequence_length=None):
    """Turn a decoded clip into a normalised, square, channel-first tensor.

    The clip is optionally cut to its first ``sequence_length`` frames, scaled
    so that its shorter side equals ``resolution``, centre-cropped to
    ``resolution x resolution`` and shifted by -0.5.

    Args:
        video: Tensor laid out as (T, H, W, C) with values in {0, ..., 255}.
        resolution: Side length in pixels of the square output frames.
        sequence_length: If given, only the first ``sequence_length`` frames
            are kept. ``None`` (the default) keeps every frame.

    Returns:
        Float tensor laid out as (C, T, resolution, resolution); for inputs in
        {0, ..., 255} the values lie in [-0.5, 0.5].

    Raises:
        ValueError: If ``sequence_length`` exceeds the number of frames.
    """
    # video: THWC, {0, ..., 255}
    video = video.permute(0, 3, 1, 2).float() / 255.  # TCHW
    t, c, h, w = video.shape

    # Temporal crop. The parameter used to be accepted but silently ignored.
    if sequence_length is not None:
        if sequence_length > t:
            raise ValueError(
                f"sequence_length={sequence_length} exceeds the {t} frames "
                "available in the clip"
            )
        video = video[:sequence_length]

    # Scale the shorter side to `resolution`, keeping the aspect ratio.
    scale = resolution / min(h, w)
    if h < w:
        target_size = (resolution, math.ceil(w * scale))
    else:
        target_size = (math.ceil(h * scale), resolution)
    video = F.interpolate(video, size=target_size, mode='bilinear',
                          align_corners=False)

    # center crop
    t, c, h, w = video.shape
    w_start = (w - resolution) // 2
    h_start = (h - resolution) // 2
    video = video[:, :, h_start:h_start + resolution, w_start:w_start + resolution]
    video = video.permute(1, 0, 2, 3).contiguous()  # CTHW

    # Centre the [0, 1] range around zero.
    video -= 0.5

    return video

# Custom dataset class
class VideoDataset(data.Dataset):
    """In-memory dataset of fixed-length, preprocessed video clips.

    Every ``*.mp4`` file found recursively under ``data_folder/subfolder`` is
    cut into clips of ``sequence_length`` frames. Each clip is kept with
    probability ``keep_prob``, passed through ``preprocess`` and stored in
    ``self.clips`` for the lifetime of the object.
    """

    def __init__(self, data_folder, sequence_length, resolution=64,
                 subfolder='trainTANAY', keep_prob=None):
        """Decode, sample and preprocess the clips eagerly.

        Args:
            data_folder: Root folder of the downloaded videos.
            sequence_length: Number of frames per clip; also used as the
                spacing between clip starts handed to ``VideoClips``.
            resolution: Side length passed on to ``preprocess``.
            subfolder: Folder below ``data_folder`` that is searched for
                videos. Defaults to the training folder.
            keep_prob: Probability of keeping each clip. ``None`` (default)
                uses the module-level ``KEEP_PROB``.
        """
        super().__init__()
        self.sequence_length = sequence_length
        self.resolution = resolution
        if keep_prob is None:
            keep_prob = KEEP_PROB

        folder = osp.join(data_folder, subfolder)
        files = glob.glob(osp.join(folder, '**', '*.mp4'), recursive=True)

        print(f"Found {len(files)} files")

        # NOTE: this silences warnings for the whole process, not only for
        # the video indexing below.
        warnings.filterwarnings('ignore')
        clips = VideoClips(files, clip_length_in_frames=sequence_length, frames_between_clips=sequence_length, num_workers=1)
        print("files converted")

        self.clips = []
        skipped = 0

        for i in range(clips.num_clips()):
            # Sample with probability
            if random.random() > keep_prob:
                continue
            try:
                video, _, _, _ = clips.get_clip(i)
                self.clips.append(preprocess(video, resolution))
            except Exception:
                # Best effort: a clip that cannot be read or preprocessed is
                # dropped, but counted so that failures do not go unnoticed.
                # Unlike a bare `except`, this lets KeyboardInterrupt through.
                skipped += 1

        if skipped:
            print(f"Skipped {skipped} clips that could not be decoded or preprocessed")

    def __len__(self):
        """Return the number of clips held in memory."""
        return len(self.clips)

    def __getitem__(self, idx):
        """Return ``(clip, clip[:, ::2])``.

        The second element keeps every second index along dimension 1, which
        is the time axis of the (C, T, H, W) tensors produced by
        ``preprocess``.
        """
        clip = self.clips[idx]
        return clip, clip[:, ::2]

In [None]:
# Build the training set from ./video_dataset: 31-frame clips, resolution 64.
train_dataset = VideoDataset('./video_dataset', 31, resolution=64)
# Sanity check: shapes of the two tensors of the first sample and the dataset
# size. Indexing [0] raises IndexError if no clip was kept.
print((train_dataset[0][0].shape, train_dataset[0][1].shape), len(train_dataset))

In [None]:
# Adapted from https://github.com/wilson1yan/VideoGPT
def preprocess(video, resolution, sequence_length=None):
    """Turn a decoded clip into a normalised, square, channel-first tensor.

    The clip is optionally cut to its first ``sequence_length`` frames, scaled
    so that its shorter side equals ``resolution``, centre-cropped to
    ``resolution x resolution`` and shifted by -0.5.

    Args:
        video: Tensor laid out as (T, H, W, C) with values in {0, ..., 255}.
        resolution: Side length in pixels of the square output frames.
        sequence_length: If given, only the first ``sequence_length`` frames
            are kept. ``None`` (the default) keeps every frame.

    Returns:
        Float tensor laid out as (C, T, resolution, resolution); for inputs in
        {0, ..., 255} the values lie in [-0.5, 0.5].

    Raises:
        ValueError: If ``sequence_length`` exceeds the number of frames.
    """
    # video: THWC, {0, ..., 255}
    video = video.permute(0, 3, 1, 2).float() / 255.  # TCHW
    t, c, h, w = video.shape

    # Temporal crop. The parameter used to be accepted but silently ignored.
    if sequence_length is not None:
        if sequence_length > t:
            raise ValueError(
                f"sequence_length={sequence_length} exceeds the {t} frames "
                "available in the clip"
            )
        video = video[:sequence_length]

    # Scale the shorter side to `resolution`, keeping the aspect ratio.
    scale = resolution / min(h, w)
    if h < w:
        target_size = (resolution, math.ceil(w * scale))
    else:
        target_size = (math.ceil(h * scale), resolution)
    video = F.interpolate(video, size=target_size, mode='bilinear',
                          align_corners=False)

    # center crop
    t, c, h, w = video.shape
    w_start = (w - resolution) // 2
    h_start = (h - resolution) // 2
    video = video[:, :, h_start:h_start + resolution, w_start:w_start + resolution]
    video = video.permute(1, 0, 2, 3).contiguous()  # CTHW

    # Centre the [0, 1] range around zero.
    video -= 0.5

    return video

# Custom dataset class
class VideoDataset(data.Dataset):
    """In-memory dataset of fixed-length, preprocessed video clips.

    Every ``*.mp4`` file found recursively under ``data_folder/subfolder`` is
    cut into clips of ``sequence_length`` frames. Each clip is kept with
    probability ``keep_prob``, passed through ``preprocess`` and stored in
    ``self.clips`` for the lifetime of the object.
    """

    def __init__(self, data_folder, sequence_length, resolution=64,
                 subfolder='testTANAY', keep_prob=None):
        """Decode, sample and preprocess the clips eagerly.

        Args:
            data_folder: Root folder of the downloaded videos.
            sequence_length: Number of frames per clip; also used as the
                spacing between clip starts handed to ``VideoClips``.
            resolution: Side length passed on to ``preprocess``.
            subfolder: Folder below ``data_folder`` that is searched for
                videos. Defaults to the test folder.
            keep_prob: Probability of keeping each clip. ``None`` (default)
                uses the module-level ``KEEP_PROB``.
        """
        super().__init__()
        self.sequence_length = sequence_length
        self.resolution = resolution
        if keep_prob is None:
            keep_prob = KEEP_PROB

        folder = osp.join(data_folder, subfolder)
        files = glob.glob(osp.join(folder, '**', '*.mp4'), recursive=True)

        print(f"Found {len(files)} files")

        # NOTE: this silences warnings for the whole process, not only for
        # the video indexing below.
        warnings.filterwarnings('ignore')
        clips = VideoClips(files, clip_length_in_frames=sequence_length, frames_between_clips=sequence_length, num_workers=1)
        print("files converted")

        self.clips = []
        skipped = 0

        for i in range(clips.num_clips()):
            # Sample with probability
            if random.random() > keep_prob:
                continue
            try:
                video, _, _, _ = clips.get_clip(i)
                self.clips.append(preprocess(video, resolution))
            except Exception:
                # Best effort: a clip that cannot be read or preprocessed is
                # dropped, but counted so that failures do not go unnoticed.
                # Unlike a bare `except`, this lets KeyboardInterrupt through.
                skipped += 1

        if skipped:
            print(f"Skipped {skipped} clips that could not be decoded or preprocessed")

    def __len__(self):
        """Return the number of clips held in memory."""
        return len(self.clips)

    def __getitem__(self, idx):
        """Return ``(clip, clip[:, ::2])``.

        The second element keeps every second index along dimension 1, which
        is the time axis of the (C, T, H, W) tensors produced by
        ``preprocess``.
        """
        clip = self.clips[idx]
        return clip, clip[:, ::2]

In [None]:
# Build the test set with the VideoDataset defined in the cell above (the one
# reading from the testTANAY folder): 31-frame clips, resolution 64.
test_dataset = VideoDataset('./video_dataset', 31, resolution=64)
# Sanity check: shapes of the two tensors of the first sample and the dataset
# size. Indexing [0] raises IndexError if no clip was kept.
print((test_dataset[0][0].shape, test_dataset[0][1].shape), len(test_dataset))

# Saving and Cleanup

In [None]:
# Save to disk
# The file names carry the chunk counters set in the CONFIG cell.
train_file = f'train_dataset_part{OVERALL_CHUNK_TRAIN}.pt'
test_file = f'test_dataset_part{OVERALL_CHUNK_TEST}.pt'
# Stack the clips of each split into a single tensor and store it as float16.
torch.save(torch.stack(train_dataset.clips).to(torch.float16), train_file)
torch.save(torch.stack(test_dataset.clips).to(torch.float16), test_file)

In [None]:
# Destination paths on the mounted Google Drive.
train_loc = f'/content/drive/MyDrive/GenAIFinalProject/Data/Train/{train_file}'
test_loc = f'/content/drive/MyDrive/GenAIFinalProject/Data/Test/{test_file}'

# Move the saved tensors out of the working directory onto Drive.
# NOTE(review): mv does not create missing folders, so the Train/ and Test/
# directories presumably have to exist already - confirm.
!mv $train_file $train_loc
!mv $test_file $test_loc


In [None]:
# Delete the downloaded videos; the preprocessed clips were saved as .pt files
# by the cells above.
!rm -rf video_dataset