In [33]:
# IMPORTS
from pathlib import Path
import json
import pandas as pd
import numpy as np
import sys
import csv
from collections import defaultdict
import json
from math import pi

from bokeh.io import output_file, show, output_notebook, export_png
from bokeh.palettes import Category20c
from bokeh.plotting import figure
from bokeh.transform import cumsum

from IPython.display import HTML, display
import tabulate

# Route Bokeh output to inline notebook cells.
output_notebook()
# Debug aid: dumps the whole Category20c mapping (palette size -> list of hex colours).
print(Category20c)
# Raise the csv module's per-field size cap to sys.maxsize.
# NOTE(review): presumably needed because some logged fields are very long — confirm.
csv.field_size_limit(sys.maxsize)

{3: ['#3182bd', '#6baed6', '#9ecae1'], 4: ['#3182bd', '#6baed6', '#9ecae1', '#c6dbef'], 5: ['#3182bd', '#6baed6', '#9ecae1', '#c6dbef', '#e6550d'], 6: ['#3182bd', '#6baed6', '#9ecae1', '#c6dbef', '#e6550d', '#fd8d3c'], 7: ['#3182bd', '#6baed6', '#9ecae1', '#c6dbef', '#e6550d', '#fd8d3c', '#fdae6b'], 8: ['#3182bd', '#6baed6', '#9ecae1', '#c6dbef', '#e6550d', '#fd8d3c', '#fdae6b', '#fdd0a2'], 9: ['#3182bd', '#6baed6', '#9ecae1', '#c6dbef', '#e6550d', '#fd8d3c', '#fdae6b', '#fdd0a2', '#31a354'], 10: ['#3182bd', '#6baed6', '#9ecae1', '#c6dbef', '#e6550d', '#fd8d3c', '#fdae6b', '#fdd0a2', '#31a354', '#74c476'], 11: ['#3182bd', '#6baed6', '#9ecae1', '#c6dbef', '#e6550d', '#fd8d3c', '#fdae6b', '#fdd0a2', '#31a354', '#74c476', '#a1d99b'], 12: ['#3182bd', '#6baed6', '#9ecae1', '#c6dbef', '#e6550d', '#fd8d3c', '#fdae6b', '#fdd0a2', '#31a354', '#74c476', '#a1d99b', '#c7e9c0'], 13: ['#3182bd', '#6baed6', '#9ecae1', '#c6dbef', '#e6550d', '#fd8d3c', '#fdae6b', '#fdd0a2', '#31a354', '#74c476', '#a1d9

9223372036854775807

In [2]:
# CONSTANTS
# Project root: the parent of the current working directory.
BASE_DIR = Path.cwd().parent
# Folder that is scanned for the CSV logs and that holds result.json.
STATS_DIR = BASE_DIR / 'stats'
# Folder whose test/train/val entries hold the JSON annotation files.
KINETICS_DIR = BASE_DIR / 'work_dir' / 'kinetics_700'

In [3]:
def plot_pie_chart(source, label):
    """Show a pie chart of ``source`` in the notebook and export it as a PNG.

    Args:
        source: Mapping of slice name -> numeric value; it is converted with
            ``pd.Series(source)``, so the keys become the ``class`` column and
            the values the ``value`` column.
        label: Prefix used for the chart title ("<label> Class Distribution")
            and for the exported file name ("<label> pie.png").

    Side effects:
        Displays the figure via ``show`` and writes "<label> pie.png" via
        ``export_png``.
    """
    data = pd.Series(source).reset_index(name='value').rename(columns={'index':'class'})
    data['angle'] = data['value'] / data['value'].sum() * 2*pi

    # Pick exactly one colour per slice. Category20c only defines palettes for
    # a limited set of sizes, so smaller inputs use the fixed two-colour pair
    # (truncated to the slice count) and larger inputs cycle through the
    # biggest palette available instead of raising KeyError.
    slice_count = len(source)
    if slice_count <= 2:
        colors = ['#3182bd', '#c6dbef'][:slice_count]
    elif slice_count in Category20c:
        colors = list(Category20c[slice_count])
    else:
        widest = Category20c[max(Category20c)]
        colors = [widest[i % len(widest)] for i in range(slice_count)]
    data['color'] = colors

    # NOTE(review): plot_height/plot_width and the `legend=` keyword below are
    # the older Bokeh spellings; newer releases use height/width and
    # legend_field — confirm against the pinned Bokeh version before upgrading.
    p = figure(
        plot_height=400,
        plot_width=700,
        title="{} Class Distribution".format(label),
        toolbar_location=None,
        tools="hover",
        tooltips="@class: @value",
        x_range=(-0.5, 1.0)
    )

    # Each wedge spans from the running total of the previous angles to the
    # running total including its own angle.
    p.wedge(
        x=0,
        y=1,
        radius=0.4,
        start_angle=cumsum('angle', include_zero=True),
        end_angle=cumsum('angle'),
        line_color="white",
        fill_color='color',
        legend='class',
        source=data
    )

    # A pie chart has no meaningful axes or grid.
    p.axis.axis_label=None
    p.axis.visible=False
    p.grid.grid_line_color = None

    show(p)

    export_png(p, '{} pie.png'.format(label))

In [4]:
def get_stats_files(stats_dir=None):
    """Collect per-video download stats and failure rows from the CSV logs.

    Every file in ``stats_dir`` whose suffix contains ``csv`` is read:

    * files with ``stats`` in the stem are parsed using their own first line
      as the header; rows whose ``video_id`` is the literal ``'video_id'``
      (repeated header lines) are skipped;
    * files with ``failed`` in the stem are parsed as header-less rows with
      the columns ``video_id``, ``class`` and ``error``.

    Args:
        stats_dir: Directory to scan (``str`` or ``Path``). Defaults to the
            notebook-level ``STATS_DIR``.

    Returns:
        ``(stats_dict, failed_dict)`` — two dicts keyed by ``video_id``. When
        an id occurs more than once, the last row read replaces earlier ones.
    """
    if stats_dir is None:
        stats_dir = STATS_DIR

    stats_dict = {}
    failed_dict = {}  # keyed by video_id, so repeated failures collapse to one entry

    # Sorted so that "last row wins" does not depend on directory listing order.
    for file_path in sorted(Path(stats_dir).iterdir()):
        if 'csv' not in file_path.suffix:
            continue
        # newline='' lets the csv module do its own line-ending handling, which
        # keeps quoted fields containing line breaks intact.
        with file_path.open(mode='r', newline='') as lf:
            if 'stats' in file_path.stem:
                for row in csv.DictReader(lf, delimiter=','):
                    if row['video_id'] != 'video_id':
                        stats_dict[row['video_id']] = dict(row)
            elif 'failed' in file_path.stem:
                reader = csv.DictReader(
                    lf, delimiter=',', fieldnames=['video_id', 'class', 'error']
                )
                for row in reader:
                    failed_dict[row['video_id']] = dict(row)
    return stats_dict, failed_dict
# Load the CSV logs once; both results are dicts keyed by video_id.
stat_results, failed = get_stats_files()

In [5]:
# Number of distinct video ids in the stats logs and in the failure logs.
print(len(stat_results.keys()), len(failed.keys()))

298651 15541


In [6]:
def read_results_file(source):
    """Load and return the parsed content of a JSON file.

    Args:
        source: Location of the JSON file, as a ``pathlib.Path`` or a string.

    Returns:
        The decoded JSON value (whatever ``json.load`` produces for the file).
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default text encoding.
    with Path(source).open(mode='r', encoding='utf-8') as results_json:
        return json.load(results_json)

# Parsed content of result.json; later cells use its '.mp4' keys and each entry's 'size'.
results_data = read_results_file(STATS_DIR / 'result.json')
# print(pd_json.head())

In [7]:
# Number of top-level entries in result.json (all keys, not only '.mp4' ones).
print(len(results_data.keys()))

646257


In [8]:
def read_kinetics_data():
    """Merge every JSON annotation file from the split folders in KINETICS_DIR.

    Only entries of ``KINETICS_DIR`` whose stem is ``test``, ``train`` or
    ``val`` are visited, and only their ``.json`` files are loaded.

    Returns:
        One dict holding the union of all loaded files. If the same key
        appears in several files, the file read last wins.
    """
    all_data = {}
    # Sorted so the merge order (and therefore which duplicate wins) is stable.
    for split_dir in sorted(KINETICS_DIR.iterdir()):
        if split_dir.stem not in ('test', 'train', 'val'):
            continue
        # A plain file such as "train.csv" also has the stem "train"; skip it
        # instead of letting iterdir() raise NotADirectoryError.
        if not split_dir.is_dir():
            continue
        for annotation_file in sorted(split_dir.iterdir()):
            if annotation_file.suffix == '.json':
                all_data.update(read_results_file(annotation_file))
    return all_data
            

# Union of all split annotation files, as returned by read_kinetics_data().
kinetics_data = read_kinetics_data()    

In [9]:
# Number of annotation entries after merging the splits.
print(len(kinetics_data.keys()))
# File stems of every '.mp4' key in results_data, stored as {stem: True} so
# later cells can test membership with .get().
results_keys_dict = {Path(key).stem: True for key in results_data.keys() if Path(key).suffix == '.mp4'}
# Same {key: True} membership shape for the annotation keys.
kinetics_keys_dict = {key: True for key in kinetics_data.keys()}

646984


## Categorize Download Errors

In [10]:
# Ordered classification rules: (category label, substrings to look for).
# Rules are tried top to bottom and the first one with any matching substring
# wins, so e.g. a message mentioning both "copyright" and "is not available"
# is filed under 'Copyright'.
_ERROR_RULES = (
    ('HTTP Error 404', ('Got server HTTP error: HTTP Error 404: Not Found.',)),
    ('Copyright', ('copyright',)),
    ('Video Not Available', (
        'This video is unavailable',
        'no longer available',
        'is not available',
        'this video available',
    )),
    ('HTTP Error 503', ('Got server HTTP error: HTTP Error 503: Service Unavailable',)),
    ('Content Warning', ('Content Warning',)),
    ('Video Removed By User', ('This video has been removed by the user',)),
    ("Violating YouTube's Terms", ('been removed for violating',)),
    ('Miscellaneous', (
        'nable to download',
        'object has no attribute',
        'Name or service not known',
    )),
    ('Duplicate Video', ('duplicate',)),
)


def _classify_error(message):
    """Return the category label for one raw error message."""
    for label, needles in _ERROR_RULES:
        if any(needle in message for needle in needles):
            return label
    # Unrecognised messages become their own category: the text that follows
    # the last "ERROR" marker (or the whole message if there is none).
    return message.split("ERROR")[-1]


def generate_unique_error(failed_list, downloaded=None):
    """Group the distinct error messages of failed downloads into categories.

    Args:
        failed_list: Iterable of dicts with at least the keys ``'video_id'``
            and ``'error'``.
        downloaded: Mapping used to drop rows whose video was downloaded after
            all: a row is ignored when ``downloaded.get(video_id)`` is truthy.
            Defaults to the notebook-level ``results_keys_dict``.

    Returns:
        ``(labels, error_dict)`` where ``error_dict`` maps each category label
        to the list of distinct raw messages filed under it, and ``labels`` is
        the set of those category labels. Messages are de-duplicated before
        grouping, so list lengths count distinct messages, not rows.
    """
    if downloaded is None:
        downloaded = results_keys_dict

    # A row with no error column yields None; treat it as an empty message
    # rather than failing on the substring checks.
    unique_error = {
        item['error'] or ''
        for item in failed_list
        if not downloaded.get(item['video_id'])
    }

    error_dict = defaultdict(list)
    for message in unique_error:
        error_dict[_classify_error(message)].append(message)

    return set(error_dict), error_dict

# Categorise the failure rows: filtered_error is the set of category labels,
# error_dict maps each label to the error messages filed under it.
filtered_error, error_dict = generate_unique_error(list(failed.values()))

In [11]:
def generate_plot():
    """Print the size of every error category and draw them as a pie chart.

    Reads the notebook-level ``filtered_error`` and ``error_dict``. The first
    printed line reports how many category labels exist; each following line
    reports one category and the number of messages filed under it. Pie slice
    names have the form "<category> (<count>)".
    """
    print('Total Errors = ', len(filtered_error))

    counts = {category: len(messages) for category, messages in error_dict.items()}
    for category, count in counts.items():
        print(category, "=" , count)

    pie_dict = {
        "{} ({})".format(category, count): count
        for category, count in counts.items()
    }
    plot_pie_chart(pie_dict, 'Error')

# Print the per-category counts and render/export the 'Error' pie chart.
generate_plot()

Total Errors =  9
Video Not Available = 10696
Copyright = 672
HTTP Error 404 = 943
Violating YouTube's Terms = 134
Video Removed By User = 337
Miscellaneous = 144
Duplicate Video = 2
HTTP Error 503 = 1


In [12]:
# Peek at the first five ids in results_keys_dict (falsy/empty ids are filtered out).
print([key for key in list(results_keys_dict.keys())[:5] if key])

['---v8pgm1eQ', '--0kKuQu4Gs', '--1f2DTKcwg', '--2V_kDPfDg', '--3X_T3dnAE']


In [13]:
# Ids present in the results file but absent from the annotations ...
results_not_in_kinetics = [key for key in results_keys_dict.keys() if not kinetics_keys_dict.get(key)]
# ... and annotated ids that have no '.mp4' entry in the results file.
kinetics_not_in_results = [key for key in kinetics_keys_dict.keys() if not results_keys_dict.get(key)]

# . missing_in_failed = [key for key in kinetics_not_in_results if failed_dict.get(key)]
# failed_not_in_missing = [key for key in failed_dict.keys() if kinetics_not_in_results.get(key)]

In [14]:
# print('results_not_in_kinetics', len(results_not_in_kinetics))
# print('kinetics_not_in_results', len(kinetics_not_in_results), len(kinetics_not_in_results) - len(failed))
# print('missing_in_failed', len(missing_in_failed), len(failed))

## Total File Size (628.43 GiB)

In [15]:
# Sum the 'size' field of every '.mp4' entry in results_data.
# NOTE(review): sizes are presumably in bytes (the next cell divides by 1024**3) — confirm.
total_file_size = sum([v['size'] for k, v in results_data.items() if Path(k).suffix == '.mp4'])
print(total_file_size)

674766307490


In [16]:
# Scale by 1024**3, i.e. to GiB if total_file_size is a byte count.
print(total_file_size / (1024 * 1024 * 1024))

628.4250947553664


## Overall Stats

In [17]:
def render_overall_results():
    """Print the failure percentage and plot downloaded / failed / unaccounted.

    Uses the notebook-level ``kinetics_keys_dict``, ``results_keys_dict`` and
    ``error_dict``:

    * downloaded = result ids that also appear in the annotations;
    * failed     = total number of messages stored in ``error_dict``;
    * unaccounted = annotation count minus (downloaded + failed).

    The printed percentage is failed / (downloaded + failed).
    """
    downloaded_count = sum(
        1 for key in results_keys_dict.keys() if kinetics_keys_dict.get(key)
    )
    failed_count = sum(len(messages) for messages in error_dict.values())

    total_count = failed_count + downloaded_count
    missing_count = len(kinetics_keys_dict.keys()) - total_count

    print("{} %".format(100 * failed_count / total_count))

    pie_dict = {}
    pie_dict['Downloaded Videos ({})'.format(downloaded_count)] = downloaded_count
    pie_dict['Failed Videos ({})'.format(failed_count)] = failed_count
    pie_dict['Unaccounted ({})'.format(missing_count)] = missing_count

    plot_pie_chart(pie_dict, 'Downloaded')
    
# Print the failure percentage and render/export the 'Downloaded' pie chart.
render_overall_results()

2.377183979820212 %


## Stats Results

In [18]:
# One row per entry of stat_results (one per distinct video_id in the stats logs).
# Values are still the raw CSV strings; get_stats_results() casts the timing columns to float.
stats_df = pd.DataFrame(stat_results.values())

In [32]:
def remove_outlier(df_in, col_name):
    """Keep only the rows whose ``col_name`` lies strictly inside the IQR fences.

    The fences are ``Q1 - 1.5 * IQR`` and ``Q3 + 1.5 * IQR``, computed from the
    25 % and 75 % quantiles of the column. Values equal to a fence are dropped.

    Args:
        df_in: Input DataFrame; it is not modified.
        col_name: Name of the numeric column to filter on.

    Returns:
        The selected rows of ``df_in`` with their original index labels.
    """
    column = df_in[col_name]
    lower_quartile = column.quantile(0.25)
    upper_quartile = column.quantile(0.75)
    spread = upper_quartile - lower_quartile  # interquartile range

    lower_fence = lower_quartile - 1.5 * spread
    upper_fence = upper_quartile + 1.5 * spread

    inside_fences = (column > lower_fence) & (column < upper_fence)
    return df_in.loc[inside_fences]

def get_quantile_stats(df):
    """Drop outlier rows on 'download_duration', then on 'ffmpeg_duration'.

    The second filter is computed on the rows that survived the first one.
    """
    without_download_outliers = remove_outlier(df, 'download_duration')
    return remove_outlier(without_download_outliers, 'ffmpeg_duration')

def get_download_time(df):
    """Scale the mean 'total_duration' of ``df`` to every entry in ``kinetics_data``.

    Returns mean(total_duration) * len(kinetics_data) / 86400, i.e. a number of
    days if the durations are seconds (NOTE(review): unit assumed — confirm).
    """
    seconds_per_day = 3600 * 24
    video_count = len(kinetics_data.keys())
    return df['total_duration'].mean() * video_count / seconds_per_day

def _round(val):
    return round(val, 2)

def render_table(ls_df):
    """Display an HTML table of rounded duration figures.

    Args:
        ls_df: Iterable of ``(label, values)`` pairs, where ``values`` can be
            indexed by 'total_duration', 'download_duration' and
            'ffmpeg_duration'.

    Each pair becomes one table row: the label followed by the three figures
    rounded with ``_round``. A header row precedes them.
    """
    header = ['', 'Total Duration', 'Download Duration', 'FFMPEG Duration']
    columns = ('total_duration', 'download_duration', 'ffmpeg_duration')

    rows = [
        [label] + [_round(values[column]) for column in columns]
        for label, values in ls_df
    ]

    display(HTML(tabulate.tabulate([header] + rows, tablefmt='html')))
        
        
def get_stats_results():
    """Print and tabulate timing statistics for the logged downloads.

    Works on the notebook-level ``stats_df``:

    1. casts the three timing columns to float and derives
       ``download_dominates`` / ``ffmpeg_dominates`` (1/0 flags for which step
       took longer, ties counted as ffmpeg) and ``total_duration``
       (download + ffmpeg);
    2. prints the column means and maxima;
    3. prints the projected total time from ``get_download_time`` for the full
       data and for the outlier-trimmed data;
    4. renders a table with mean/max for the full data, the trimmed data and
       the trimmed ffmpeg-dominated rows;
    5. prints how many rows each step dominates and the ffmpeg share in percent.
    """
    timings = stats_df[
        ['average_duration', 'download_duration', 'ffmpeg_duration']
    ].astype(float)

    download = timings['download_duration']
    ffmpeg = timings['ffmpeg_duration']
    timings['download_dominates'] = np.where(download > ffmpeg, 1, 0)
    timings['ffmpeg_dominates'] = np.where(download <= ffmpeg, 1, 0)
    timings['total_duration'] = download + ffmpeg

    print("\n\nMean \n", timings.mean())
    print("\n\nMax \n", timings.max())

    trimmed = get_quantile_stats(timings)

    print('\n\nTotal Download Time', get_download_time(timings))
    print('Total Quantile Download Time', get_download_time(trimmed))

    download_bound = timings[timings["download_dominates"] == 1]
    ffmpeg_bound = timings[timings["ffmpeg_dominates"] == 1]
    ffmpeg_bound_trimmed = get_quantile_stats(ffmpeg_bound)

    table_rows = [
        ['Full Mean', timings.mean()],
        ['Full Max', timings.max()],
        ['Quantile Mean', trimmed.mean()],
        ['Quantile Max', trimmed.max()],
        ['FFMPEG Dominates Mean', ffmpeg_bound_trimmed.mean()],
        ['FFMPEG Dominates Max', ffmpeg_bound_trimmed.max()],
    ]
    render_table(table_rows)

    download_rows = len(download_bound)
    ffmpeg_rows = len(ffmpeg_bound)
    print("\n\nDownload Dominates", download_rows)
    print("FFMPEG Dominates", ffmpeg_rows)
    print("Frac FFMPEG Dominates", 100 * ffmpeg_rows / (ffmpeg_rows + download_rows) )
    
    # print(df["download_duration"].quantile(0.75))
    
#     p = figure(plot_width=300, plot_height=300)
#     p.multi_line(
#         xs=[df1.index.values] * 2,
#         ys=[df1['download_duration'].values, df1['ffmpeg_duration'].values],
#         color=['red','green']
#     )

#     show(p)
# Print the timing summaries and render the duration table.
get_stats_results()



Mean 
 average_duration      20.959834
download_duration     21.469640
ffmpeg_duration        2.054902
download_dominates     0.983285
ffmpeg_dominates       0.016715
total_duration        23.524541
dtype: float64


Max 
 average_duration        273.4
download_duration     11223.7
ffmpeg_duration         488.9
download_dominates        1.0
ffmpeg_dominates          1.0
total_duration        11246.6
dtype: float64


Total Download Time 176.15742897643312
Total Quantile Download Time 80.70279296939054


0,1,2,3
,Total Duration,Download Duration,FFMPEG Duration
Full Mean,23.52,21.47,2.05
Full Max,11246.6,11223.7,488.9
Quantile Mean,10.78,9.58,1.2
Quantile Max,48.8,45.1,3.8
FFMPEG Dominates Mean,9.84,3.67,6.17
FFMPEG Dominates Max,29.4,10.5,19.0




Download Dominates 293659
FFMPEG Dominates 4992
Frac FFMPEG Dominates 1.6715162514105093
