In [None]:
from googleapiclient.discovery import build
import numpy as np
import pandas as pd

In [None]:
# Prints video data to the screen
def display_data(y_playlist_data):
    """Print the main metadata of each playlist item to the screen.

    Args:
        y_playlist_data: Iterable of playlist item dicts. Each item must
            contain ``snippet`` (with ``publishedAt``, ``title`` and
            ``description``) and ``contentDetails`` (with ``videoId``).

    A thumbnail size that is absent from an item is printed as ``None``
    instead of raising, so a single item lacking e.g. ``maxres`` does not
    abort the whole listing.
    """

    def thumbnail_url(snippet, size):
        # Not every item carries every thumbnail size; report absence as None.
        try:
            return snippet['thumbnails'][size]['url']
        except (KeyError, TypeError):
            return None

    for video in y_playlist_data:
        snippet = video['snippet']
        default_url = thumbnail_url(snippet, 'default')
        max_res_url = thumbnail_url(snippet, 'maxres')
        standard_url = thumbnail_url(snippet, 'standard')

        # The continuation lines of this literal are part of the printed
        # text, so their leading whitespace is kept exactly as it was.
        print(f'''Published Date: {snippet['publishedAt']}
        \n\nTitle: {snippet['title']}
        \n\nDescription: {snippet['description']}
        \n\nVideo Id: {video['contentDetails']['videoId']}
        \n\nThumbnails: 
        \nDefault: {default_url},
        \nMax Res: {max_res_url},
        \nStandard: {standard_url}\n\n''')

# helper function to create a file for the video descriptions        
def path_to_data_path(path):
    """Derive the descriptions-file path that belongs to a ``.csv`` path.

    Only the trailing ``.csv`` extension is rewritten, so a ``.csv``
    substring elsewhere in the path (for example in a directory name) is
    left alone.

    Args:
        path: Path of the main CSV file, as a string.

    Returns:
        ``path`` with its final ``.csv`` replaced by ``_Descriptions.csv``.
        A path that does not end in ``.csv`` is returned unchanged.
    """
    extension = '.csv'
    if not path.endswith(extension):
        return path
    return path[:-len(extension)] + '_Descriptions.csv'

# Saves the video data to a new file. If the file exists the data is appended.
def save_data(y_playlist_data,save_path,file_exists=False):
    """Save playlist metadata and video descriptions to two CSV files.

    The video metadata goes to ``save_path``; the descriptions go to the
    companion path produced by ``path_to_data_path(save_path)``.

    Args:
        y_playlist_data: Iterable of playlist item dicts. Each item must
            contain ``snippet`` (``publishedAt``, ``title``, ``description``)
            and ``contentDetails`` (``videoId``). Thumbnails are optional.
        save_path: Destination of the metadata file; must end in ``.csv``.
        file_exists: When true, both CSV files are read first and the new
            rows are appended to their contents. When false, the files are
            written from scratch.

    Returns:
        The dataframe holding the metadata of the items passed in (not the
        merged file contents). If ``save_path`` does not end in ``.csv`` a
        message is printed, nothing is written and ``[]`` is returned.

    Raises:
        FileNotFoundError: If ``file_exists`` is true but one of the two
            CSV files is missing (raised by ``pd.read_csv``).
    """
    # Check that the file is being saved as a csv
    if not save_path.endswith('.csv'):
        print('File must be saved as a .csv')
        return []

    # Create the path for the video descriptions
    save_desc_path = path_to_data_path(save_path)

    def thumbnail_url(video, size):
        # Not every item carries every thumbnail size; store None for the
        # missing ones rather than failing the whole export.
        try:
            return video['snippet']['thumbnails'][size]['url']
        except (KeyError, TypeError):
            return None

    def write_csv(frame, path):
        # Append to the existing file contents when requested, then write.
        if file_exists:
            frame = pd.concat([pd.read_csv(path), frame])
        frame.to_csv(path, encoding='utf-8', index=False)

    # Materialize once so a one-shot iterator can feed every column
    videos = list(y_playlist_data)
    video_ids = [video['contentDetails']['videoId'] for video in videos]

    # Create dataframes for the video and description data
    playlist_data = pd.DataFrame({
        'Published_Date': [video['snippet']['publishedAt'] for video in videos],
        'Video_Title': [video['snippet']['title'] for video in videos],
        'Video_ID': video_ids,
        'Thumbnail_Default': [thumbnail_url(video, 'default') for video in videos],
        'Thumbnail_Standard': [thumbnail_url(video, 'standard') for video in videos],
        'Thumbnail_Max_Res': [thumbnail_url(video, 'maxres') for video in videos],
    })
    playlist_desc = pd.DataFrame({
        'Video_ID': video_ids,
        'Description': [video['snippet']['description'] for video in videos],
    })

    # Save the files
    write_csv(playlist_data, save_path)
    write_csv(playlist_desc, save_desc_path)

    # Return the video dataframe
    return playlist_data

In [None]:
# Provide your API key: replace the placeholder string below with your own key.
# The value is handed to the client as `developerKey` when it is built.
api_key = 'Your API Key Goes Here'

In [None]:
# Create the YouTube client (service 'youtube', version 'v3') used to make
# API calls, passing `api_key` as the developer key.
youtube = build('youtube', 'v3', developerKey=api_key)

In [None]:
# Notes:

# To gather all (up to the last 20,000) uploaded videos
# we must have the ID of the Uploads playlist on a YouTube channel.
# The Uploads playlist ID is the same as the channel ID but with the
# second letter changed from a C to a U.

# Fox News Channel id: UCXIJgqnII2ZOINSWNOGFThA
# Fox News Uploads id: UUXIJgqnII2ZOINSWNOGFThA

# CNN Channel id: UCupvZG-5ko_eiXAupbDfxWw
# CNN Uploads id: UUupvZG-5ko_eiXAupbDfxWw

# PBS Channel id: UC6ZFN9Tx6xh-skXCuRHCDpQ
# PBS Uploads id: UU6ZFN9Tx6xh-skXCuRHCDpQ

# NBC News Channel id: UCeY0bbntWzzVIaj2z3QigXg
# NBC News Uploads id: UUeY0bbntWzzVIaj2z3QigXg

# BlazeTV Channel id: UCKgJEs_v0JB-6jWb8lIy9Xw
# BlazeTV Uploads id: UUKgJEs_v0JB-6jWb8lIy9Xw


In [None]:
# Search for the news channel by name to locate its channel id.
# The query is restricted to channel-type results and at most 5 matches.
request = youtube.search().list(q='BlazeTV',part='snippet',type='channel',maxResults=5)
response= request.execute()
# Print the raw response so the channel id can be read from it
print(response)

In [None]:
# https://medium.com/analytics-vidhya/how-to-extract-youtube-video-titles-using-the-youtube-data-api-45d3f4998486

# Uses the uploads playlist id to get all uploaded videos (capped at 20,000)
def youtube_playlist_data(playlist_id, cut_off=20000, results_per_page=50, token=None):
    """Collect the items of a playlist page by page.

    Uses the module-level ``youtube`` client to request pages of playlist
    items until the playlist is exhausted or at least ``cut_off`` items
    have been collected.

    Args:
        playlist_id: Id of the playlist to read (e.g. a channel's uploads
            playlist).
        cut_off: Stop requesting further pages once this many items have
            been collected. The last page is kept whole, so the result may
            exceed ``cut_off`` by up to one page.
        results_per_page: Items requested per call; must be between 1 and 50.
        token: Page token to start from, or ``None`` to start at the
            first page.

    Returns:
        A ``(video_data, token)`` tuple. ``video_data`` is the list of
        collected items. ``token`` is the next page token, or ``None`` when
        the end of the playlist was reached. If ``results_per_page`` is
        out of range a message is printed and ``([], token)`` is returned
        with the token that was passed in, so callers can always unpack
        two values.
    """
    # Results are capped at 50 per request via YouTube
    if results_per_page > 50:
        print('You can only request 50 results per page max. Please lower the number of results per page.')
        return [], token
    if results_per_page < 1:
        print('You must request at least 1 result per page.')
        return [], token

    video_data = []

    # Keep requesting pages until there is no next page or the cutoff is hit
    while True:
        # Create and execute the request for the current page
        page = youtube.playlistItems().list(playlistId=playlist_id,
                                            part='snippet, contentDetails',
                                            maxResults=results_per_page,
                                            pageToken=token).execute()

        # Store the data
        video_data += page.get('items', [])

        # Update the token so it can be used to get the next page of data
        token = page.get('nextPageToken')

        # Count the items actually received (a page can hold fewer than
        # requested) when checking the cutoff
        if token is None or len(video_data) >= cut_off:
            break

    # Return the final collected data and the token
    return video_data, token

In [None]:
# Get the data: read the BlazeTV uploads playlist, 50 items per request,
# starting from the first page (token=None).
# `data` receives the collected items, `last_token` the returned page token.
data, last_token= youtube_playlist_data('UUKgJEs_v0JB-6jWb8lIy9Xw', results_per_page=50, token=None)

In [None]:
# Display the token returned by the collection run.
# If last_token is None, we've reached the end of the videos.
# If there is a value, it can be passed to the youtube_playlist_data
# function (as `token`) to set a starting point for more data collection.
last_token

In [None]:
# Set the file name prefix; it is interpolated into the CSV file name
prefix = 'BlazeTV'

In [None]:
# Create the file names: the main CSV path and, derived from it, the
# path of the descriptions CSV.
# NOTE(review): '/Data/...' is an absolute path from the filesystem root;
# confirm that is intended rather than a relative 'Data/' folder.
file_path = f'/Data/{prefix}_Data.csv'
data_file_path = path_to_data_path(file_path)
# Print both paths for a visual check
print(file_path,'\n\n',data_file_path)

In [None]:
# Save the files. With file_exists=False both CSVs are written from scratch
# rather than appended to; `df` is the dataframe returned by save_data.
df = save_data(data,file_path,file_exists=False)

In [None]:
# Check that the data looks good: reload the saved video CSV and display it
df_data = pd.read_csv(file_path)
df_data

In [None]:
# Check that the descriptions look good: reload the saved descriptions CSV
# and display it
df_desc = pd.read_csv(data_file_path)
df_desc