In [79]:
from distutils.log import error
import os
import shutil
from google.cloud import storage
import argparse
from pytube import YouTube
import ffmpeg
import pandas as pd
from pytube.exceptions import VideoUnavailable
import subprocess
from decord import VideoReader
import os
import multiprocessing
from multiprocessing import Pool
import concurrent.futures
import time

In [80]:
# Google Cloud Storage setup: point the client library at the service-account
# key file, then create the client and name the bucket used by later cells.
os.environ.update(GOOGLE_APPLICATION_CREDENTIALS='survai-data-connect.json')
bucket_name = 'js_test_bucket'
storage_client = storage.Client()



In [81]:
# Path of the CSV file that is read into a DataFrame in a later cell.
csv_path = 'var_data.csv'

# Known clarity levels; 'none' is also the value used to fill missing entries
# when the CSV is cleaned.
# NOTE(review): the clip list printed further down contains a 'bad' level that
# is not listed here - confirm whether it should be added.
clarity_level = [
    'none',
    'easy',
    'medium',
    'hard',
]

In [82]:
# read csv, clean data
df = pd.read_csv(csv_path)
df = df.reset_index()
# rows without a clarity level are treated as 'none'
df['clarity_level'] = df['clarity_level'].fillna('none')
df['split'] = 'train'

# open label_map.txt (one label per line) and map each label to its
# zero-based line number; the context manager closes the file handle
with open('label_map.txt', 'r') as txt:
    lines = [s.strip('\n') for s in txt]
label_map = {label: index for index, label in enumerate(lines)}

# clips whose download/processing failed are collected here by download_videos
broken_videos = []

# names of the blobs already stored under the master_videos prefix, used to
# skip clips that were uploaded previously
videos = storage_client.list_blobs(bucket_name, prefix='datasets/var/master_videos/')
video_list = [video.name for video in videos]

has_dash = False
# NOTE(review): only the first 10 rows are processed - looks like a debugging
# limit; confirm before running on the full dataset
df = df.head(10)

In [83]:
# Accumulator filled by create_urls: one
# [url, label, start, end, id, clarity_level] list per processed row.
videos = []


def create_urls(row):
    """Append a clip description for ``row`` to the module-level ``videos`` list.

    Args:
        row: object exposing ``label``, ``time_start``, ``time_end``, ``id``
            and ``clarity_level`` attributes (e.g. a DataFrame row).

    Returns:
        None. The result is stored as a side effect in ``videos``.
    """
    label, start, end = row.label, row.time_start, row.time_end
    clip_id = str(row.id)

    # only the first 11 characters of the id go into the watch URL; the full
    # id (including any trailing characters) is kept as its own field
    videos.append([
        f'https://www.youtube.com/watch?v={clip_id[0:11]}',
        label,
        start,
        end,
        clip_id,
        row.clarity_level,
    ])


# Call create_urls once per DataFrame row (axis=1). The call is made for its
# side effect of appending to `videos`; the value returned by apply is unused.
df.apply(create_urls, axis=1)

# Last expression of the cell: display the collected clip descriptions.
videos

[['https://www.youtube.com/watch?v=4pRPc04xoEo',
  'crowd',
  0,
  12,
  '4pRPc04xoEo1',
  'none'],
 ['https://www.youtube.com/watch?v=4pRPc04xoEo',
  'spray',
  12,
  18,
  '4pRPc04xoEo2',
  'hard'],
 ['https://www.youtube.com/watch?v=4pRPc04xoEo',
  'running',
  12,
  21,
  '4pRPc04xoEo3',
  'hard'],
 ['https://www.youtube.com/watch?v=4pRPc04xoEo',
  'crowd',
  34,
  44,
  '4pRPc04xoEo4',
  'none'],
 ['https://www.youtube.com/watch?v=4pRPc04xoEo',
  'crowd',
  66,
  71,
  '4pRPc04xoEo5',
  'none'],
 ['https://www.youtube.com/watch?v=4pRPc04xoEo',
  'spray',
  87,
  93,
  '4pRPc04xoEo6',
  'bad'],
 ['https://www.youtube.com/watch?v=4pRPc04xoEo',
  'crowd',
  95,
  107,
  '4pRPc04xoEo7',
  'none'],
 ['https://www.youtube.com/watch?v=4pRPc04xoEo',
  'crowd',
  135,
  153,
  '4pRPc04xoEo8',
  'none'],
 ['https://www.youtube.com/watch?v=4pRPc04xoEo',
  'crowd',
  167,
  173,
  '4pRPc04xoEo9',
  'none'],
 ['https://www.youtube.com/watch?v=4pRPc04xoEo',
  'crowd',
  179,
  192,
  '4pRPc04xo

In [84]:
# function to upload videos to gcloud bucket
def upload_to_bucket(blob_name, file_path, bucket_name):
    """Upload a local file to a Google Cloud Storage bucket.

    Args:
        blob_name: destination object name inside the bucket.
        file_path: path of the local file to upload.
        bucket_name: name of the target bucket.

    Returns:
        True if the upload finished without raising, False otherwise. On
        failure the exception is printed rather than propagated, so callers
        that need to react to a failed upload must check the return value.
    """
    try:
        bucket = storage_client.get_bucket(bucket_name)
        blob = bucket.blob(blob_name)
        blob.upload_from_filename(file_path)
    except Exception as e:
        print(e)
        return False
    return True

In [85]:
# function to download videos, trim, and upload to gcloud storage
def download_videos(video):
    """Download one source video, cut the requested clip and upload it.

    Args:
        video: sequence whose first five items are
            ``[url, label, start, end, id]`` (as built by ``create_urls``).

    Returns:
        None in the normal, skipped and download/upload-error cases. If ffmpeg
        exits with a non-zero status, its captured output is returned.

    Side effects:
        Writes temporary files under ``test/<label>/`` and removes them again
        whether or not processing succeeded. Clips that fail are appended to
        the module-level ``broken_videos`` list.
    """
    url, label, start, end, video_id = video[:5]
    clip_name = f'{video_id}_{str(start).zfill(6)}_{str(end).zfill(6)}.mp4'
    blob = f'datasets/var/master_videos/{label}/{clip_name}'

    # ignore labels not in label map or already in gcloud
    if label not in label_map or blob in video_list:
        print(f'{clip_name} SKIPPED')
        return None

    full_path = f'test/{label}/{video_id}.mp4'
    clip_path = f'test/{label}/{clip_name}'

    try:
        # download full video from youtube
        vid = YouTube(url, use_oauth=True, allow_oauth_cache=True)
        yt_video = vid.streams.get_highest_resolution()
        yt_video.download(output_path=f'test/{label}', filename=f"{video_id}.mp4")
        print(f'{video_id} downloaded')

        # trim video to specified start and end time; the command is passed
        # as an argument list (no shell) so labels/ids containing spaces or
        # shell metacharacters cannot break or alter the command.
        # '-y' overwrites a stale output file left by an interrupted run.
        command = [
            'ffmpeg', '-y', '-i',
            full_path, '-ss',
            str(start), '-t',
            str(end - start), '-c:v', 'libx264', '-c:a', 'copy',
            '-threads', '1', '-loglevel', 'panic',
            clip_path
        ]
        try:
            subprocess.check_output(command, stderr=subprocess.STDOUT)
        except subprocess.CalledProcessError as err:
            # keep returning ffmpeg's output as before, but also record the
            # clip as broken so the failure is not lost
            broken_videos.append(video)
            return err.output
        # only report success once ffmpeg has actually finished
        print(f'{video_id} clipped successfully')

        # upload to gcloud bucket
        upload_to_bucket(blob, clip_path, bucket_name)
        print(f'{clip_name} uploaded to gcloud successfully')

    except Exception as e:
        # report the reason instead of silently swallowing it; catching
        # Exception (not a bare except) lets KeyboardInterrupt propagate
        print(f'{video_id} failed: {e}')
        broken_videos.append(video)

    finally:
        # remove videos from local machine, also on failure paths
        for path in (full_path, clip_path):
            if os.path.exists(path):
                os.remove(path)

In [86]:
# execute download videos as multithreaded process
with concurrent.futures.ThreadPoolExecutor() as executor:
    # executor.map returns a lazy iterator; an exception raised inside a
    # worker call is only re-raised when its result is retrieved. Consuming
    # the iterator surfaces such errors and keeps each call's return value
    # (in the same order as `videos`).
    download_results = list(executor.map(download_videos, videos))

4pRPc04xoEo1_000000_000012.mp4 SKIPPED
4pRPc04xoEo2_000012_000018.mp4 SKIPPED
4pRPc04xoEo3_000012_000021.mp4 SKIPPED
4pRPc04xoEo4_000034_000044.mp4 SKIPPED
4pRPc04xoEo5_000066_000071.mp4 SKIPPED
4pRPc04xoEo6_000087_000093.mp4 SKIPPED
4pRPc04xoEo7_000095_000107.mp4 SKIPPED
4pRPc04xoEo8_000135_000153.mp4 SKIPPED
4pRPc04xoEo9_000167_000173.mp4 SKIPPED
4pRPc04xoEo10_000179_000192.mp4 SKIPPED


In [8]:
# Sequential pass: call download_videos for every entry in `videos`, one at a
# time, on the calling thread.
# NOTE(review): this cell's execution count (In [8]) is out of sequence with
# the cells above (In [79]-[86]), and it repeats the work of the thread-pool
# cell - confirm whether it is still needed.
for video in videos:
    download_videos(video)