In [1]:
import os
import json
import shutil
import requests
import datetime
from multiprocessing import Pool
from itertools import repeat
from tqdm import tqdm_notebook as tqdm
import pandas as pd
from runtimestamp.runtimestamp import runtimestamp
runtimestamp()

Updated 2018-03-12 18:42:44.418604
By ly501
Using Python 3.6.1
On Linux-3.10.0-514.10.2.el7.x86_64-x86_64-with-centos-7.3.1611-Core


In [2]:
# Notebook-wide settings: output root on disk, the timestamp stamped onto
# collected rows, and the YouTube API key read from the environment.
root_dir = '/beegfs/work/smapp/youtube/'
today = datetime.datetime.now()
key = os.environ.get('YT_KEY')

In [3]:
def get_playlist_id(username, key):
    '''
    Look up the channel ID that belongs to a YouTube username.

    Queries the `channels` endpoint with `forUsername` and returns the `id`
    field of the first item in the response. Despite the function name, the
    value returned is the channel's `id`, not a playlist ID.

    Parameters
    ----------
    username : str
        The user name to resolve (e.g. the last path segment of a
        `youtube.com/user/<name>` URL).
    key : str
        YouTube Data API key.

    Returns
    -------
    str or int
        The channel ID, or -1 when the request fails or no channel matches.
    '''
    # Hand the query arguments to requests via `params` so they are
    # percent-encoded; formatting them into the URL by hand corrupts the
    # query for names containing characters such as '&', '#' or spaces.
    response = requests.get(
        "https://www.googleapis.com/youtube/v3/channels",
        params={
            "part": "contentDetails",
            "forUsername": username,
            "key": key,
        },
    )
    if not response.ok:
        return -1

    items = json.loads(response.text).get("items")
    if not items:
        # Key missing or list empty: no channel for this username.
        return -1
    return items[0]["id"]

In [89]:
def get_video_urls_from_playlist_id(playlist_id, key):
    '''
    Collect the video IDs of every item in a playlist.

    Pages through the `playlistItems` endpoint 50 items at a time, following
    `nextPageToken` until the response no longer carries one. Note that the
    values returned are video IDs, not full URLs.

    Parameters
    ----------
    playlist_id : str
        ID of the playlist to list.
    key : str
        YouTube Data API key.

    Returns
    -------
    list of str
        Video IDs in the order the API returned them. If a request fails,
        the response is printed and whatever was collected so far is
        returned (possibly an empty list).
    '''
    endpoint = "https://www.googleapis.com/youtube/v3/playlistItems"
    base_params = {
        "part": "snippet",
        "playlistId": playlist_id,
        "maxResults": 50,
        "key": key,
    }

    ids = []
    next_page_token = None
    while True:
        # Build the query fresh for every page. Appending "&pageToken=..."
        # to one growing URL string leaves every earlier token in the query,
        # so later requests would carry several conflicting pageToken values.
        page_params = dict(base_params)
        if next_page_token:
            page_params["pageToken"] = next_page_token

        response = requests.get(endpoint, params=page_params)
        if not response.ok:
            print(response)
            break

        response_json = json.loads(response.text)
        for item in response_json.get('items', []):
            ids.append(item['snippet']['resourceId']['videoId'])
        print("{} Videos".format(len(ids)))

        # No token on the page means this was the last one.
        next_page_token = response_json.get('nextPageToken')
        if not next_page_token:
            break

    return ids

In [5]:
def parse_video_metadata(item):
    '''
    Flatten one video item from the API response into a dict of the fields
    this notebook keeps.

    Parameters
    ----------
    item : dict
        A single element of the `items` list returned by the `videos`
        endpoint when requested with `part=statistics,snippet`.

    Returns
    -------
    dict
        Keys: channel_title, channel_id, video_publish_date, video_title,
        video_description, video_category, video_view_count,
        video_comment_count, video_like_count, video_dislike_count,
        video_thumbnail, collection_date.
        The statistics counters and the thumbnail URL are None when the
        response does not contain them. `collection_date` is the
        module-level `today` timestamp.

    Raises
    ------
    KeyError
        If `item` has no `snippet`, or the snippet lacks one of the core
        fields (channelTitle, channelId, publishedAt, title, description,
        categoryId).
    '''
    snippet = item["snippet"]
    # Individual counters are not guaranteed to be in the response
    # (presumably omitted when hidden or disabled for a video -- see the
    # API reference). Look them up leniently so one sparse video does not
    # raise KeyError and abort the whole batch.
    statistics = item.get("statistics", {})
    thumbnail_url = snippet.get("thumbnails", {}).get("high", {}).get("url")

    video_meta = dict(
        channel_title = snippet["channelTitle"],
        channel_id = snippet["channelId"],
        video_publish_date = snippet["publishedAt"],
        video_title = snippet["title"],
        video_description = snippet["description"],
        video_category = snippet["categoryId"],
        video_view_count = statistics.get("viewCount"),
        video_comment_count = statistics.get("commentCount"),
        video_like_count = statistics.get("likeCount"),
        video_dislike_count = statistics.get("dislikeCount"),
        video_thumbnail = thumbnail_url,
        collection_date = today
    )

    return video_meta

In [6]:
def get_video_metadata(video_id, key):
    '''
    Fetch the raw metadata for one video and parse it.

    Requests the `statistics` and `snippet` parts from the `videos`
    endpoint and passes the first returned item to `parse_video_metadata`.

    Parameters
    ----------
    video_id : str
        ID of the video to look up.
    key : str
        YouTube Data API key.

    Returns
    -------
    dict or int
        The parsed metadata dict, or -1 when the request fails or the
        response contains no items.
    '''
    # `params` lets requests percent-encode the values instead of splicing
    # them into the URL unescaped.
    response = requests.get(
        "https://www.googleapis.com/youtube/v3/videos",
        params={
            "part": "statistics,snippet",
            "id": video_id,
            "key": key,
        },
    )
    if not response.ok:
        return -1

    items = json.loads(response.text).get('items')
    # The `items` key can be present but empty; indexing [0] on it would
    # raise IndexError, so treat an empty list as a failed lookup.
    if not items:
        return -1
    return parse_video_metadata(items[0])

In [92]:
def get_context(playlist_id):
    '''
    Prepares the on-disk folder for one playlist (channel).

    Creates `<root_dir>/<playlist_id>` when it does not exist yet, assigns
    the folder to the `smapp` group, and returns a 2-tuple with the paths of
    the metadata TSV and the video-URL CSV inside it, in that order.
    '''
    folder = os.path.join(root_dir, playlist_id)

    # Create the folder first, then hand it over to the shared group.
    os.makedirs(folder, exist_ok=True)
    shutil.chown(folder, group='smapp')

    return tuple(
        os.path.join(folder, basename)
        for basename in ('video_metadata.tsv', 'video_urls.csv')
    )

In [8]:
def is_user(channel_url):
    '''
    Classifies a YouTube URL as a user page or a channel page.

    Returns True for a `youtube.com/user/` URL, False for a
    `youtube.com/channel/` URL, and None when neither marker is found.
    The user marker is checked first.
    '''
    markers = (
        ('youtube.com/user/', True),
        ('youtube.com/channel/', False),
    )
    for fragment, verdict in markers:
        if fragment in channel_url:
            return verdict
    return None

In [9]:
def get_youtube_id(channel_url):
    '''
    Extracts the YT ID from a URL: the final path segment once any
    trailing slashes have been removed.
    '''
    trimmed = channel_url.rstrip('/')
    # rpartition yields the text after the last '/', or the whole string
    # when there is no '/' at all.
    _, _, last_segment = trimmed.rpartition('/')
    return last_segment

In [10]:
# Hard-coded sample of user and channel page URLs to collect.
channels = [
    'https://www.youtube.com/user/SargonofAkkad100/',
    'https://www.youtube.com/channel/UCL0u5uz7KZ9q-pe-VC8TY-w/',
    'https://www.youtube.com/channel/UCla6APLHX6W3FeNLc8PYuvg',
    'https://www.youtube.com/user/Timcasts/',
]

In [119]:
# Load the table of accounts to collect; the `users` column is read from it below.
df_users = pd.read_csv('../data/users.csv')

In [121]:
# Rebinds `channels` (previously the hard-coded sample list) to the `users`
# column of the CSV. Presumably these are channel/user page URLs -- TODO confirm.
channels = df_users['users'].tolist()

In [111]:
# Ad-hoc check: count the lines in one channel directory's cached video_urls.csv.
!wc -l /beegfs/work/smapp/youtube/SargonofAkkad100/video_urls.csv


23701 /beegfs/work/smapp/youtube/SargonofAkkad100/video_urls.csv


In [122]:
# Upper bound on how many videos per channel get their metadata fetched.
# The cell previously hard-coded this as the slice `video_urls[:5]`.
# NOTE(review): this looks like a leftover debugging limit -- set to None to
# process every listed video.
MAX_VIDEOS_PER_CHANNEL = 5

# For every channel: resolve its playlist ID, list (or load the cached list
# of) its video IDs, fetch metadata in parallel, and write it to a TSV.
# Channels whose metadata file already exists are skipped.
for channel in channels:
    print(channel)
    yt_id = get_youtube_id(channel)
    # User URLs carry a username that has to be resolved through the API;
    # anything else is used as the ID directly.
    channel_id = get_playlist_id(yt_id, key) if is_user(channel) else yt_id
    # get_playlist_id reports failure with -1. Validate BEFORE slicing:
    # slicing an int raises TypeError, and once 'UU' is prepended the value
    # can never be falsy, so the check has to happen on the raw ID.
    if not isinstance(channel_id, str) or not channel_id:
        print("Getting the playlist ID is not working")
        continue
    # Replace the first two characters of the ID with 'UU'
    # (presumably yielding the channel's uploads playlist -- TODO confirm).
    playlist_id = 'UU' + channel_id[2:]

    metadata_filename, urls_filename = get_context(playlist_id)
    if os.path.exists(metadata_filename):
        continue

    if not os.path.exists(urls_filename):
        video_urls = get_video_urls_from_playlist_id(playlist_id, key)
        if not video_urls:
            print("Listing Video URLs is not working")
            continue
        # to_csv returns None when given a path, so there is nothing to keep.
        pd.DataFrame(video_urls).to_csv(urls_filename, index=False)
    else:
        print("Cached that list")
        # The single unnamed column was written with the header '0'.
        video_urls = pd.read_csv(urls_filename)['0'].tolist()

    # parse each video from the user
    with Pool(4) as pool:
        video_meta = pool.starmap(
            get_video_metadata,
            zip(video_urls[:MAX_VIDEOS_PER_CHANNEL], repeat(key)),
        )
    # get_video_metadata returns -1 for failed lookups; keep only real
    # records so the frame is built from dicts alone.
    records = [meta for meta in video_meta if isinstance(meta, dict)]
    if not records:
        # Do not write an empty file: it would mark the channel as done and
        # prevent a retry on the next run.
        print("Fetching video metadata is not working")
        continue
    df = pd.DataFrame(records)
    # write to csv
    df.to_csv(metadata_filename, index=False, sep='\t')
    shutil.chown(metadata_filename, group='smapp')

In [138]:
def get_all_keys(d, key=None):
    '''
    A recursive function that traverses json keys in a dict `d`,
    and prints the path to every non-dict value.

    Each path is printed on its own line in the form `["a"]["b"]`.

    Parameters
    ----------
    d : object
        The value to traverse. Dicts are descended into; anything else
        (including lists) is treated as a leaf.
    key : list, optional
        Keys leading to `d`; used by the recursion. Defaults to an empty
        path.

    Returns
    -------
    None
    '''
    path = [] if key is None else key

    if not isinstance(d, dict):
        # format() instead of string concatenation so non-string keys
        # (e.g. ints) do not raise TypeError.
        print(''.join('["{}"]'.format(part) for part in path))
        return

    for k, v in d.items():
        # Recurse under this function's own name; the previous body called
        # `get_all_columns`, which this cell never defined.
        get_all_keys(v, path + [k])


# The notebook invokes this helper as `get_all_columns`; keep that name bound
# so the call works on a fresh kernel.
get_all_columns = get_all_keys

In [139]:
# Print the key paths of the first item of an API response.
# NOTE(review): `response_json` is not assigned at notebook level in the cells
# visible here, so this call presumably relied on leftover kernel state --
# TODO confirm and assign a sample API response before this cell.
get_all_columns(response_json['items'][0])

["kind"]
["etag"]
["id"]
["snippet"]["publishedAt"]
["snippet"]["channelId"]
["snippet"]["title"]
["snippet"]["description"]
["snippet"]["thumbnails"]["default"]["url"]
["snippet"]["thumbnails"]["default"]["width"]
["snippet"]["thumbnails"]["default"]["height"]
["snippet"]["thumbnails"]["medium"]["url"]
["snippet"]["thumbnails"]["medium"]["width"]
["snippet"]["thumbnails"]["medium"]["height"]
["snippet"]["thumbnails"]["high"]["url"]
["snippet"]["thumbnails"]["high"]["width"]
["snippet"]["thumbnails"]["high"]["height"]
["snippet"]["thumbnails"]["standard"]["url"]
["snippet"]["thumbnails"]["standard"]["width"]
["snippet"]["thumbnails"]["standard"]["height"]
["snippet"]["thumbnails"]["maxres"]["url"]
["snippet"]["thumbnails"]["maxres"]["width"]
["snippet"]["thumbnails"]["maxres"]["height"]
["snippet"]["channelTitle"]
["snippet"]["categoryId"]
["snippet"]["liveBroadcastContent"]
["snippet"]["localized"]["title"]
["snippet"]["localized"]["description"]
["snippet"]["defaultAudioLanguage"]
["