In [1]:
# IMPORTS
from pathlib import Path
import json
import pandas as pd
import numpy as np
import sys
import csv
from collections import defaultdict
import json

# Raise the csv module's per-field size cap as high as the platform allows.
# sys.maxsize does not fit in a C long on every platform (e.g. Windows), in
# which case csv.field_size_limit raises OverflowError; halve the requested
# limit until the call is accepted.
_csv_field_limit = sys.maxsize
while True:
    try:
        csv.field_size_limit(_csv_field_limit)
        break
    except OverflowError:
        _csv_field_limit //= 2

131072

In [2]:
# CONSTANTS
# Parent of the directory the kernel was started in.
BASE_DIR = Path.cwd().parent
# Holds the stats/failed CSV files and result.json read by the cells below.
STATS_DIR = BASE_DIR.joinpath('stats')
# Scanned below for test/train/val entries containing .json files.
KINETICS_DIR = BASE_DIR.joinpath('work_dir', 'kinetics_700')

In [3]:
def get_stats_files(stats_dir=None):
    """Collect per-video stats rows and failure rows from the CSV files of a directory.

    Every regular file whose suffix contains ``csv`` is considered:

    * stem contains ``'stats'``: the first line is used as the header and each
      data row is stored under its ``video_id`` (a later row with the same id
      replaces the earlier one). Rows whose ``video_id`` is literally
      ``'video_id'`` -- e.g. a header line repeated inside the file -- are
      skipped.
    * stem contains ``'failed'``: the file is read without a header line, using
      the column names ``video_id``, ``class`` and ``error``.

    Args:
        stats_dir: Directory to scan. Defaults to the module-level ``STATS_DIR``.

    Returns:
        tuple: ``(stats_dict, failed)`` -- ``stats_dict`` maps ``video_id`` to
        its row dict, ``failed`` is the list of failure rows.

    Raises:
        KeyError: if a stats file has no ``video_id`` column.
    """
    if stats_dir is None:
        stats_dir = STATS_DIR

    stats_dict = {}
    failed = []
    # Sorted so that "last row wins" for duplicated ids, and the order of the
    # failure list, do not depend on the arbitrary directory listing order.
    for file_path in sorted(Path(stats_dir).iterdir()):
        if 'csv' not in file_path.suffix or not file_path.is_file():
            continue
        # newline='' is what the csv module expects, so that line breaks
        # inside quoted fields are parsed by the reader itself.
        with file_path.open(mode='r', newline='') as lf:
            if 'stats' in file_path.stem:
                for row in csv.DictReader(lf, delimiter=','):
                    if row['video_id'] != 'video_id':
                        stats_dict[row['video_id']] = dict(row)

            if 'failed' in file_path.stem:
                failed_reader = csv.DictReader(
                    lf, delimiter=',', fieldnames=['video_id', 'class', 'error']
                )
                failed.extend(failed_reader)

    return stats_dict, failed
# Read the stats directory once; `output` is the video_id -> stats-row mapping
# and `failed` the list of failure rows that later cells reuse.
output, failed = get_stats_files()

In [4]:
# Number of distinct video ids with a stats row, then number of failure rows.
print(f"{len(output)} {len(failed)}")

298651 15792


In [5]:
def generate_unique_error(failed_list):
    """Group the distinct error messages of failed downloads into categories.

    The messages are de-duplicated first, so every list in the returned mapping
    holds *distinct* raw messages, not one entry per failed video.

    A message is assigned to the first rule (in the order listed below) for
    which it contains at least one of the rule's substrings. A message matching
    no rule is filed under whatever follows the last ``"ERROR"`` in it (the
    whole message when it does not contain ``"ERROR"``).

    Args:
        failed_list: Iterable of mappings that each have an ``'error'`` string.

    Returns:
        tuple: ``(categories, error_dict)`` -- ``categories`` is the set of
        category names that occurred, ``error_dict`` is a ``defaultdict(list)``
        mapping each category name to its raw messages.

    Raises:
        KeyError: if an item has no ``'error'`` key.
    """
    # (category name, substrings) -- order matters, the first match wins.
    rules = (
        ('Got server HTTP error: HTTP Error 404: Not Found.',
         ('Got server HTTP error: HTTP Error 404: Not Found.',)),
        ('copyright',
         ('copyright',)),
        ('video is not available',
         ('This video is unavailable', 'no longer available',
          'is not available', 'this video available')),
        ('Got server HTTP error: HTTP Error 503: Service Unavailable',
         ('Got server HTTP error: HTTP Error 503: Service Unavailable',)),
        ('Content Warning',
         ('Content Warning',)),
        ('This video has been removed by the user',
         ('This video has been removed by the user',)),
        ('been removed for violating',
         ('been removed for violating',)),
        # 'nable to download' matches both "Unable ..." and "unable ...".
        # The category spelling is kept as-is because it is part of the output.
        ('miscelleneous',
         ('nable to download', 'object has no attribute', 'Name or service not known')),
    )

    error_dict = defaultdict(list)
    # Use the argument (not the notebook-global `failed`) and sort the distinct
    # messages so the grouping order is reproducible between runs.
    for message in sorted({item['error'] for item in failed_list}):
        category = next(
            (name for name, needles in rules
             if any(needle in message for needle in needles)),
            None,
        )
        if category is None:
            category = message.split("ERROR")[-1]
        error_dict[category].append(message)

    return set(error_dict), error_dict
    
# Categorise the failure messages, then report the number of categories
# followed by one "<count> <category>" line per category.
filtered_error, error_dict = generate_unique_error(failed)
print(f"{len(filtered_error)}")
for key in error_dict:
    value = error_dict[key]
    print(f"{len(value)} {key}")

9
977 Got server HTTP error: HTTP Error 404: Not Found.
10817 video is not available
167 miscelleneous
341 This video has been removed by the user
135 been removed for violating
672 copyright
2 : This video is a duplicate of another YouTube video\n'
1 Got server HTTP error: HTTP Error 503: Service Unavailable


In [6]:
def read_results_file(source):
    """Return the decoded content of the JSON file at ``source``.

    Args:
        source: Path-like object exposing ``open(mode=...)`` (e.g. ``pathlib.Path``).

    Returns:
        The Python object produced by parsing the file's JSON text.
    """
    with source.open(mode='r') as handle:
        raw_text = handle.read()
    return json.loads(raw_text)

# Parse stats/result.json once; later cells work with its keys.
results_data = read_results_file(STATS_DIR / 'result.json')

In [7]:
# Number of top-level keys in result.json.
print(len(results_data.keys()))

646257


In [8]:
def read_kinetics_data(kinetics_dir=None):
    """Merge the JSON files of the test/train/val directories into one dict.

    Every directory directly under ``kinetics_dir`` whose stem is ``test``,
    ``train`` or ``val`` is scanned, and each ``.json`` file in it is parsed
    with ``read_results_file`` and merged into the result. When the same key
    occurs in several files, the file processed last wins; directories and
    files are processed in sorted order so that the outcome is reproducible.

    Args:
        kinetics_dir: Directory to scan. Defaults to the module-level
            ``KINETICS_DIR``.

    Returns:
        dict: union of all parsed JSON objects.
    """
    if kinetics_dir is None:
        kinetics_dir = KINETICS_DIR

    all_data = {}
    for split_dir in sorted(Path(kinetics_dir).iterdir()):
        # A plain file such as ``train.csv`` also has the stem ``train`` but
        # cannot be listed, so only real directories are descended into.
        if split_dir.stem not in ('test', 'train', 'val') or not split_dir.is_dir():
            continue
        for json_path in sorted(split_dir.iterdir()):
            if json_path.suffix == '.json':
                all_data.update(read_results_file(json_path))
    return all_data
            

# Merged content of every .json file found under the test/train/val entries.
kinetics_data = read_kinetics_data()

In [9]:
# Number of distinct keys across the merged Kinetics JSON files.
print(len(kinetics_data.keys()))

646984


In [38]:
# Lookup tables (id -> True) used for the membership checks in later cells.
# result.json keys are reduced to the stem of every entry ending in '.mp4';
# the Kinetics keys are used unchanged.
results_keys = dict.fromkeys(
    (Path(key).stem for key in results_data.keys() if Path(key).suffix == '.mp4'),
    True,
)
kinetics_keys = dict.fromkeys(kinetics_data.keys(), True)

In [39]:
# Peek at the first five result ids, dropping any empty one.
print(list(filter(None, list(results_keys)[:5])))

['---v8pgm1eQ', '--0kKuQu4Gs', '--1f2DTKcwg', '--2V_kDPfDg', '--3X_T3dnAE']


In [45]:
# Both lookup tables only ever store True, so a plain membership test is
# equivalent to checking the truthiness of .get(key).

# Ids found in result.json that the Kinetics JSON files do not list.
results_not_in_kinetics = [key for key in results_keys if key not in kinetics_keys]
# Ids listed in the Kinetics JSON files that have no .mp4 entry in result.json.
kinetics_not_in_results = [key for key in kinetics_keys if key not in results_keys]

# Ids that have at least one row in the failure CSV files.
failed_dict = dict.fromkeys((item['video_id'] for item in failed), True)
# Despite the name, these are the missing ids that DO have a failure row.
missing_in_failed = [key for key in kinetics_not_in_results if key in failed_dict]

In [47]:
# Summary: sizes of both differences, how the missing count compares with the
# number of failure rows, and how many missing ids have a failure row.
print(f"results_not_in_kinetics {len(results_not_in_kinetics)}")
print(f"kinetics_not_in_results {len(kinetics_not_in_results)} {len(kinetics_not_in_results) - len(failed)}")
print(f"missing_in_failed {len(missing_in_failed)} {len(failed)}")

results_not_in_kinetics 12278
kinetics_not_in_results 15380 -412
missing_in_failed 15380 15792
