In [1]:
import os
import shutil
import pandas as pd
from pytube.exceptions import VideoUnavailable
from pytube import YouTube
from moviepy.video.io.ffmpeg_tools import ffmpeg_extract_subclip

In [None]:
# Load the clip annotations from the CSV.
# reset_index() (called without drop=True) stores the previous index in an
# extra 'index' column and gives the frame a fresh default index.
test_path = '11-11-21_03-17-22_LEGACY_test.csv'
df = pd.read_csv(test_path).reset_index()
df.head()


In [3]:
# file names (youtube ids) whose download raised VideoUnavailable on every attempt
broken_videos = set()


def retry_download(tries, url, file_name, label, clip_start=None, clip_end=None):
    """Download a video and cut the annotated clip, retrying on VideoUnavailable.

    The full video is saved as ``videos/<label>/<file_name>.mp4``, the clip is
    written next to it as ``videos/<label>/<file_name>_<start>_<end>.mp4`` with
    both times zero-padded to six digits (the naming the rest of the notebook
    uses), and the full video is then deleted.

    Args:
        tries: Maximum number of download attempts.
        url: YouTube URL of the video.
        file_name: Base name used for the downloaded file and the clip.
        label: Name of the sub-directory of ``videos/`` to write into.
        clip_start: Clip start time passed to ffmpeg as ``t1``. When omitted,
            the notebook-level global ``start`` is used.
        clip_end: Clip end time passed to ffmpeg as ``t2``. When omitted,
            the notebook-level global ``end`` is used.

    Side effects:
        Adds ``file_name`` to ``broken_videos`` when every attempt raised
        VideoUnavailable. Any other exception propagates to the caller.
    """
    # Fall back to the globals set by the download loop so that existing
    # four-argument calls keep working.
    t1 = start if clip_start is None else clip_start
    t2 = end if clip_end is None else clip_end

    folder = f'videos/{label}/'
    full_video = f'{folder}{file_name}.mp4'
    clip = f'{folder}{file_name}_{str(t1).zfill(6)}_{str(t2).zfill(6)}.mp4'

    unavailable = False
    for _ in range(tries):
        try:
            stream = YouTube(url).streams.get_highest_resolution()
            stream.download(output_path=folder, filename=f"{file_name}.mp4")

            # cut the clip out of the full download, then discard the download
            ffmpeg_extract_subclip(filename=full_video, t1=t1, t2=t2, targetname=clip)
            os.remove(full_video)
            # success: stop here instead of downloading the video again
            return
        except VideoUnavailable:
            unavailable = True

    # only reached when no attempt succeeded
    if unavailable:
        broken_videos.add(file_name)

In [None]:
# Iterates all videos in csv: download the full video, cut the annotated clip
# out of it, then delete the full download.
for index, row in df.iterrows():
    file_name = str(row['youtube_id'])
    # NOTE: retry_download() reads `start` / `end` as notebook globals when it
    # is called without explicit clip bounds, so these names must stay.
    start = int(row['time_start'])
    end = int(row['time_end'])
    label = str(row['label'])
    # assumes the first 11 characters of youtube_id are the video id -- TODO confirm
    url = f'https://www.youtube.com/watch?v={file_name[0:11]}'
    # zero-pad the times so every clip name has the same number of digits
    fill_start = str(start).zfill(6)
    fill_end = str(end).zfill(6)

    full_video = f'videos/{label}/{file_name}.mp4'
    clip = f'videos/{label}/{file_name}_{fill_start}_{fill_end}.mp4'

    # Skip clips already on disk. The check must use the zero-padded name the
    # clip is written under, otherwise it never matches and every clip is
    # downloaded again on each run.
    if os.path.exists(clip):
        print('file already exists')
        continue

    try:
        video = YouTube(url)
        yt_video = video.streams.get_highest_resolution()
        yt_video.download(output_path=f'videos/{label}/', filename=f"{file_name}.mp4")

        # cut the clip into the label folder and drop the full download
        ffmpeg_extract_subclip(filename=full_video, t1=start, t2=end, targetname=clip)
        os.remove(full_video)

    except Exception:
        # `Exception` rather than a bare except, so interrupting the kernel
        # (KeyboardInterrupt) stops the loop instead of triggering a retry.
        print('retrying...', file_name)
        retry_download(1, url, file_name, label)


In [5]:
# Deletes broken videos: drop every row whose youtube_id is in broken_videos
df = df.drop(index=df.index[df['youtube_id'].isin(broken_videos).to_numpy()])

In [6]:
# Reads label_map.txt into a list: one entry per line, with leading and
# trailing whitespace (including the newline) stripped
with open('label_map.txt', 'r') as label_file:
    label_list = list(map(str.strip, label_file))


In [7]:
# Keeps only the rows whose label appears in label_list.
# One vectorised membership test replaces the per-row loop, which re-filtered
# the whole frame for every row and never dropped rows with a missing (NaN)
# label because `== NaN` matches nothing.
# .copy() gives an independent frame for the in-place edits made further down.
df = df[df['label'].isin(label_list)].copy()
    
    

In [8]:
# Share of the least common label's rows that each label contributes to validation
VAL_FRACTION = .35
# Fixed seed so a re-run of the notebook draws the same validation rows
VAL_SEED = 42

# Finding 35 percent of the least common label (truncated to a whole number of rows)
least_label_35 = int(df['label'].value_counts().min() * VAL_FRACTION)
# Groups randomized youtube_id's with corresponding labels, time_start, time_end:
# the same number of rows is drawn, without replacement, from every label
df_val = df.groupby('label')[['youtube_id', 'time_start', 'time_end', 'split']].apply(
    lambda s: s.sample(least_label_35, random_state=VAL_SEED)
)

In [9]:
# Turn the index levels of the grouped sample into ordinary columns, then
# discard the 'level_1' column (presumably the original row index -- confirm).
df_val = pd.DataFrame(df_val).reset_index()
df_validation = df_val.drop(columns=['level_1'])

In [10]:
# remove df_val from df: drop, in place, every row whose youtube_id also
# occurs in df_validation
cond = df['youtube_id'].isin(df_validation['youtube_id'])
df.drop(index=df.index[cond.to_numpy()], inplace=True)

In [11]:
# copy val videos to new directory (the clips under videos/ are left in place)
for ind, row in df_validation.iterrows():

    video_id = str(row['youtube_id'])
    label = str(row['label'])
    start = row['time_start']
    end = row['time_end']
    fill_start = str(start).zfill(6) # make times the same number of digits
    fill_end = str(end).zfill(6)

    # ignore bad labels
    # (the former `label not in label_list == True` was a chained comparison
    # that is always False, so this guard never fired)
    if label not in label_list:
        continue

    clip_name = f"{video_id}_{fill_start}_{fill_end}.mp4"
    source_clip = f"videos/{label}/{clip_name}"
    target_clip = f"val/{label}/{clip_name}"

    # create directory to copy val videos into
    val_path = f'val/{label}'
    os.makedirs(val_path, exist_ok=True)

    # Ignore files already copied
    if os.path.exists(target_clip):
        print('File already exists')
        continue

    # ignore bad videos: a clip that was never produced is reported and
    # skipped instead of aborting the whole loop with FileNotFoundError
    if not os.path.exists(source_clip):
        print('Missing source clip, skipping:', source_clip)
        continue

    # copy video to val videos
    shutil.copyfile(source_clip, target_clip)



In [40]:
def list_clip_entries(root):
    """Return the set of '<label>/<file>' entries for the files below ``root``.

    The label part is the directory the file actually sits in, relative to
    ``root`` and written with forward slashes. Files lying directly in
    ``root`` have no label directory and are left out.

    Args:
        root: Directory to walk.

    Returns:
        A set of strings; empty when ``root`` does not exist or holds no files
        inside sub-directories.
    """
    entries = set()
    for subdir, dirs, files in os.walk(root):
        label_dir = os.path.relpath(subdir, root)
        if label_dir == os.curdir:
            # files directly in root: no label directory to prefix them with
            continue
        label_dir = label_dir.replace(os.sep, '/')
        for file in files:
            entries.add(f'{label_dir}/{file}')
    return entries


# Adds every video file name from the val subdirectories into a set.
# Each file is paired with its own label directory only; pairing every file
# with every label in df_validation produced label/file entries that do not
# exist on disk.
file_list = list_clip_entries('val')

In [41]:
# Writes video file name from set into a val txt, one entry per line.
# The entries are sorted because a set has no fixed iteration order; without
# sorting, the line order of the file can change from one run to the next.
with open('val_list6.txt', 'w') as f:
    f.writelines(f'{entry}\n' for entry in sorted(file_list))

In [14]:
# copy train videos to new directory (the clips under videos/ are left in place)
for ind, row in df.iterrows():

    video_id = str(row['youtube_id'])
    label = str(row['label'])
    start = row['time_start']
    end = row['time_end']
    fill_start = str(start).zfill(6) # make times the same number of digits
    fill_end = str(end).zfill(6)

    # ignore bad labels
    if label not in label_list:
        continue

    clip_name = f"{video_id}_{fill_start}_{fill_end}.mp4"
    source_clip = f"videos/{label}/{clip_name}"
    target_clip = f"train/{label}/{clip_name}"

    # create directory for train videos
    train_path = f'train/{label}'
    os.makedirs(train_path, exist_ok=True)

    # Ignore files already copied
    if os.path.exists(target_clip):
        print('File already exists')
        continue

    # ignore bad videos: a clip that was never produced is reported and
    # skipped instead of aborting the whole loop with FileNotFoundError
    if not os.path.exists(source_clip):
        print('Missing source clip, skipping:', source_clip)
        continue

    # copy video to train videos
    shutil.copyfile(source_clip, target_clip)