<a href="https://colab.research.google.com/github/SBalas/Web-scraping-and-data-prep/blob/main/YouTubeAPI.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Extracting playlist and videos from a YouTube channel

- YouTube API source page - https://developers.google.com/youtube/v3/getting-started
- Github for Google APIs - https://github.com/googleapis/google-api-python-client
- YouTube APIv3 info - https://googleapis.github.io/google-api-python-client/docs/dyn/youtube_v3.html


In [2]:
import googleapiclient.discovery
from IPython.display import JSON
import json
import pandas as pd

In [42]:
# Kenji Lopez Alt's Youtube channel
# https://www.youtube.com/@JKenjiLopezAlt

forHandle = '@JKenjiLopezAlt'

In [4]:
# API key stored in google collab secrets
from google.colab import userdata
api_key = userdata.get('Youtube_api_key')

In [5]:
api_service_name = "youtube"
api_version = "v3"

# Get credentials and create an API client
youtube = googleapiclient.discovery.build(
    api_service_name, api_version, developerKey = api_key)

request = youtube.channels().list(
    part = "snippet,contentDetails,statistics",
    forHandle = forHandle
)

response = request.execute()
JSON(response)

<IPython.core.display.JSON object>

In [16]:
# This function calls the YouTube channels API based on the forHandle (channel name)
# It then extracts relevant info about the channel and returns a dictionary

def channel_info(youtube, forHandle):

    '''
    Takes in the youtube instance and channel handle
    Returns a dictionary with info abou the channel
    '''

    # request the channels info from youtube build service
    request = youtube.channels().list(
        part = 'ContentDetails, statistics, snippet',
        forHandle = forHandle,
        maxResults = 50
    )

    # execute the request
    response = request.execute()

    # Go through the dictionary of 'response' to get the relevant data
    channelName = response['items'][0]['snippet']['title']
    channelId = response['items'][0]['id']
    viewCount = int(response['items'][0]['statistics']['viewCount'])
    subscriberCount = int(response['items'][0]['statistics']['subscriberCount'])
    videoCount = int(response['items'][0]['statistics']['videoCount'])

    # convert to dictionary
    data = {'channelName': channelName,
    'channelId': channelId,
    'viewCount': viewCount,
    'subscriberCount': subscriberCount,
    'videoCount': videoCount
    }

    return data


In [17]:
channel_data = channel_info(youtube, forHandle)
channel_data

{'channelName': 'J. Kenji López-Alt',
 'channelId': 'UCqqJQ_cXSat0KIAVfIfKkVA',
 'viewCount': 258616723,
 'subscriberCount': 1680000,
 'videoCount': 609}

In [18]:
channelID = channel_data['channelId']
channelID


'UCqqJQ_cXSat0KIAVfIfKkVA'

In [19]:
# Request sent to Playlist API to see the IDs and info about the playlists

request = youtube.playlists().list(
  part="contentDetails, snippet",
  channelId = channelID,
  maxResults = 50
)

response = request.execute()

JSON(response)



<IPython.core.display.JSON object>

In [20]:
# Function to extract the list of playlist IDs, title, and descriptuon

playlist_info = []
playlist_ID_list = []

for dict in response['items']:
  playlist_id = dict['id']
  title = dict['snippet']['title']
  description = dict['snippet']['description']

  info = {'playlist_id': playlist_id, 'title': title, 'description': description}

  playlist_info.append(info)
  playlist_ID_list.append(playlist_id)

len(playlist_info)

print(f"Number of playlists in {forHandle}'s channel is: {len(playlist_info)}")


Number of playlists in @JKenjiLopezAlt's channel is: 27


In [21]:
pd.DataFrame(playlist_info)

Unnamed: 0,playlist_id,title,description
0,PLXonhhg5tUSI7F_F_1TASoAWLfYNc-RUe,Taste Tests,Taste Tests of Ingredients and Products
1,PLXonhhg5tUSKJZhLDZ-FnxgQPFv6uJVEh,The Food Lab,"Recipes from Serious Eats and my book, The Foo..."
2,PLXonhhg5tUSLzIskt_tXUBp4cMstyfMrq,All the Teriyaki in Seattle,
3,PLXonhhg5tUSK_sOzqUkiJz8ACDeB85HgU,Kenji Lopez-Main,
4,PLXonhhg5tUSJ1okkBuKY3CKIov-vx9td6,Ask Kenji,Videos exploring common cooking questions
5,PLXonhhg5tUSKSN7D04ComGnHUI1zthRHl,Experiments,
6,PLXonhhg5tUSKnGuurp0wPSgO_2QgoY2zD,"The Recipe, with Kenji and Deb",Videos associated with me and Deb Perelman's p...
7,PLXonhhg5tUSL2SW-L0qZO44rL-GBsyt1h,Kenji Plays,This is my gaming playlist.
8,PLXonhhg5tUSJYKMMCNfZCKugerw6D42c_,Quick Cooking,Quick videos shot while I'm making dinner that...
9,PLXonhhg5tUSKOCmSKZKzcnPOutjIPL5Do,Viral Recipe Reviews,I cook promising-looking viral recipes and tal...


In [22]:
playlist_ID_list

['PLXonhhg5tUSI7F_F_1TASoAWLfYNc-RUe',
 'PLXonhhg5tUSKJZhLDZ-FnxgQPFv6uJVEh',
 'PLXonhhg5tUSLzIskt_tXUBp4cMstyfMrq',
 'PLXonhhg5tUSK_sOzqUkiJz8ACDeB85HgU',
 'PLXonhhg5tUSJ1okkBuKY3CKIov-vx9td6',
 'PLXonhhg5tUSKSN7D04ComGnHUI1zthRHl',
 'PLXonhhg5tUSKnGuurp0wPSgO_2QgoY2zD',
 'PLXonhhg5tUSL2SW-L0qZO44rL-GBsyt1h',
 'PLXonhhg5tUSJYKMMCNfZCKugerw6D42c_',
 'PLXonhhg5tUSKOCmSKZKzcnPOutjIPL5Do',
 'PLXonhhg5tUSK7OlSDiEyqkn_47NXSWGeS',
 'PLXonhhg5tUSI6q-In5StCtD3xr0lfBj55',
 'PLXonhhg5tUSIl44pT_pW1DgJhhhDZhc7d',
 'PLXonhhg5tUSJYbsuOFfcNI7LgbCSrEH38',
 'PLXonhhg5tUSICRE99VKwT0NRWQ1AmFQvg',
 'PLXonhhg5tUSLyoNv_972NmmfdBfoT4WEo',
 'PLXonhhg5tUSJvVvHMLsXJG0TbSNmk7j-_',
 'PLXonhhg5tUSJuH6YK5UNOoT_I8TC5Sx_h',
 'PLXonhhg5tUSI85nEAOUOaO-1EoCWuPPR6',
 'PLXonhhg5tUSLOFkX2SsqrYB5CPasqaZQl',
 'PLXonhhg5tUSKX9LWuxUMPgZmWe0M2KVzh',
 'PLXonhhg5tUSK2xM3KBqICCoDyKrOwkBHZ',
 'PLXonhhg5tUSLPMH2kZI67RYGb5V-fX-83',
 'PLXonhhg5tUSIwnMVQpgZUOEjZQ4hYCNnJ',
 'PLXonhhg5tUSI4oUv4kSbwe6Km9d4A7xL5',
 'PLXonhhg5tUSKECHu9tL9if

In [23]:
# Request sent to 'playlistItems' API to get video IDs in each playlist
# Checking just for one playlist_ID[2]

request = youtube.playlistItems().list(
    part = 'contentDetails, id, snippet, status',
    maxResults = 50,
    playlistId = playlist_ID_list[2]
)

response = request.execute()
JSON(response)

<IPython.core.display.JSON object>

In [25]:
# Get video IDs for just one playlist

video_IDs = []

for dict in response['items']:
  playlist_id = dict['snippet']['playlistId']
  video_id = dict['contentDetails']['videoId']

  data = {'playlist_id': playlist_id, 'video_id': video_id}

  video_IDs.append(data)


In [26]:
pd.DataFrame(video_IDs)

Unnamed: 0,playlist_id,video_id
0,PLXonhhg5tUSLzIskt_tXUBp4cMstyfMrq,_UrxR8d3EkQ
1,PLXonhhg5tUSLzIskt_tXUBp4cMstyfMrq,Gic3YyUGtoE
2,PLXonhhg5tUSLzIskt_tXUBp4cMstyfMrq,2KaLHmT6XMg
3,PLXonhhg5tUSLzIskt_tXUBp4cMstyfMrq,oGNH9LSS1ak
4,PLXonhhg5tUSLzIskt_tXUBp4cMstyfMrq,1f48YuUfn-0
5,PLXonhhg5tUSLzIskt_tXUBp4cMstyfMrq,_EzxA-0eT3I
6,PLXonhhg5tUSLzIskt_tXUBp4cMstyfMrq,LUbl6WDab2Y
7,PLXonhhg5tUSLzIskt_tXUBp4cMstyfMrq,CIgsowpYfS0
8,PLXonhhg5tUSLzIskt_tXUBp4cMstyfMrq,J6sUfY7FHD4
9,PLXonhhg5tUSLzIskt_tXUBp4cMstyfMrq,o7n-ch5tdi8


In [27]:
# get all the video IDs in a playlist
# need to make sure that the nextPageToken is used to ensure that next pages are accessed
# Since the page limit is only 50 but many playlists have many more than 50 videos, need to get the nextPageToken and use that to keep going to the next pages
# Use while loop to access all the pages until there is no next page

def get_video_id(youtube, playlist_Id):

  '''
  Takes in the youtube instance and list of playlist IDs
  Returns a list of dictionaries that contain the info about the playlist
  '''

  video_IDs = []
  count = 0
  nextPageToken = ""

  while nextPageToken is not None:

    request = youtube.playlistItems().list(
        part = 'contentDetails, id, snippet, status',
        maxResults = 50,
        playlistId = playlist_Id,
        pageToken = nextPageToken
    )
    response = request.execute()

    for dict in response['items']:
      playlist_id = dict['snippet']['playlistId']
      video_id = dict['contentDetails']['videoId']

      data = {'playlist_id': playlist_id, 'video_id': video_id}

      video_IDs.append(data)

    count += 1
    #print(count)

    nextPageToken = response.get('nextPageToken')

  return video_IDs






In [28]:
# This loops through all the playlist IDs (27 of them), calls the functions to get all the video IDs in each playlist, and then adds all the list together
# The combined list is then converted to a Data Frame


video_id_df = []
for i in playlist_ID_list:
  #print(i)
  video_id_df += get_video_id(youtube, i)


In [29]:
video_id_df = pd.DataFrame(video_id_df)
video_id_df

Unnamed: 0,playlist_id,video_id
0,PLXonhhg5tUSI7F_F_1TASoAWLfYNc-RUe,zIPZQ6yBA2c
1,PLXonhhg5tUSI7F_F_1TASoAWLfYNc-RUe,vG-NsFXNYUw
2,PLXonhhg5tUSKJZhLDZ-FnxgQPFv6uJVEh,2Sz40WqBg8E
3,PLXonhhg5tUSKJZhLDZ-FnxgQPFv6uJVEh,U5vJ3hlopUk
4,PLXonhhg5tUSKJZhLDZ-FnxgQPFv6uJVEh,Azarsj8xlBk
...,...,...
791,PLXonhhg5tUSKTPt4s5ZOBSnPOchewStj7,Tg2K0pmNXsA
792,PLXonhhg5tUSKTPt4s5ZOBSnPOchewStj7,3oTFE6FAIrw
793,PLXonhhg5tUSKTPt4s5ZOBSnPOchewStj7,neLwcOqrucc
794,PLXonhhg5tUSKTPt4s5ZOBSnPOchewStj7,L5TF6YjEZT8


In [31]:
# convert the datafram column into a list which will be used to gather info about the videos

video_id_list = video_id_df['video_id'].to_list()

In [32]:
print(f"Totat number of videos in playlist is: {len(video_id_list)}")

Totat number of videos in playlist is: 796


In [34]:
# checking
video_id_list

['zIPZQ6yBA2c',
 'vG-NsFXNYUw',
 '2Sz40WqBg8E',
 'U5vJ3hlopUk',
 'Azarsj8xlBk',
 'ptxrb2k7Y-s',
 '_UrxR8d3EkQ',
 'Gic3YyUGtoE',
 '2KaLHmT6XMg',
 'oGNH9LSS1ak',
 '1f48YuUfn-0',
 '_EzxA-0eT3I',
 'LUbl6WDab2Y',
 'CIgsowpYfS0',
 'J6sUfY7FHD4',
 'o7n-ch5tdi8',
 '6AW9tIAHUXg',
 'QJjpXEzjVNQ',
 '31SSbqQGlxM',
 '6nFKCWrF_N8',
 '35DToZpqJXY',
 'Y4bGqM-QqWs',
 'hzLZs7g3P-4',
 'eNjv3A_pd2M',
 'ozGdp8a-mLM',
 'WrZ14kzuyK8',
 'zYY0b8n0COc',
 'Wbtish4P98s',
 'R-frrl0pFgw',
 'ysEZuCXF_04',
 'sYFYzeBHjsc',
 '1wMUzUys6Rk',
 'DzlUP22vIJs',
 'iDEWVJp_HfM',
 'kmIbyd9pAhI',
 '_UrxR8d3EkQ',
 'Gic3YyUGtoE',
 '2KaLHmT6XMg',
 'oGNH9LSS1ak',
 'zUKsR_SkjgM',
 '1f48YuUfn-0',
 'mZpce9xlXa4',
 '_EzxA-0eT3I',
 'LUbl6WDab2Y',
 'Oy-7YY0PGVw',
 'CIgsowpYfS0',
 'J6sUfY7FHD4',
 'o7n-ch5tdi8',
 '6AW9tIAHUXg',
 'QJjpXEzjVNQ',
 '31SSbqQGlxM',
 '6nFKCWrF_N8',
 '35DToZpqJXY',
 'Y4bGqM-QqWs',
 'hzLZs7g3P-4',
 'eNjv3A_pd2M',
 '-1CrOQLIqMg',
 'ozGdp8a-mLM',
 'WrZ14kzuyK8',
 'zYY0b8n0COc',
 'Wbtish4P98s',
 'R-frrl0pFgw',
 'ysEZuC

In [35]:
# Let's get info about the videos from the 'videos' API
# Start with one video and then expand code to all the videos --- one video instance not shown here

request = youtube.videos().list(

    part = 'contentDetails, snippet, statistics, topicDetails',
    maxResults = 50,
    id = ','.join(video_id_list[0:50])
    )

response = request.execute()
#JSON(response)

In [36]:
# Function that takes in a list of video IDs, parses the JSON file to get the relevant info, and returns a list

def get_video_info(youtube, video_id_list):

  '''
  Takes in the youtube instance and list of video IDs
  Returns a list of dictionaries that contain the info about the videos
  '''

  video_info = []

  for i in range(0, len(video_id_list), 50):

    # for loop to go through all the video IDs at steps of 50
    # 50 because that is the max number of videos that will be listed in the API page

    request = youtube.videos().list(

        part = 'contentDetails, snippet, statistics, topicDetails',
        maxResults = 50,
        id = ','.join(video_id_list[i:i+50])
        )

    response = request.execute()
    #JSON(response)


    for dict in response['items']:
      video_id = dict.get('id', 'NA')
      title = dict['snippet'].get('title', 'NA')
      publishedAt = dict['snippet'].get('publishedAt', 'NA')

      catgoryId = dict['snippet'].get('categoryId', 'NA')

      duration = dict['contentDetails'].get('duration', 'NA')
      caption = dict['contentDetails'].get('caption', 'NA')

      viewCount = dict['statistics'].get('viewCount', 'NA')
      likeCount = dict['statistics'].get('likeCount', 'NA')
      favoriteCount = dict['statistics'].get('favoriteCount', 'NA')
      commentCount = dict['statistics'].get('commentCount', 'NA')

      # convert to dictionary

      data = {
        'video_id': video_id,
        'title': title,
        'publishedAt': publishedAt,
        'catgoryId': catgoryId,
        'duration': duration,
        'caption': caption,

        'viewCount': viewCount,
        'likeCount': likeCount,
        'favoriteCount': favoriteCount,
        'commentCount': commentCount
      }

      video_info.append(data)

  return video_info




In [39]:
# call the function, get the list of all video info, and convert to a data frame

video_info = get_video_info(youtube, video_id_list)
video_info_df = pd.DataFrame(video_info)

In [41]:
video_info_df

Unnamed: 0,video_id,title,publishedAt,catgoryId,duration,caption,viewCount,likeCount,favoriteCount,commentCount
0,z3zs76WmYdk,Kenji's Cooking Show | Keema Matar,2020-05-11T08:30:41Z,26,PT32M1S,false,606579,7774,0,659
1,2Sz40WqBg8E,"For the Best Roast Chicken, Slather Your Spatc...",2025-05-16T18:12:49Z,26,PT19M51S,false,46425,2082,0,227
2,6hVb1ViS66I,Freedom Mortadella (AKA Bologna) Sandwiched wi...,2025-05-09T17:17:16Z,26,PT10M17S,false,79586,2105,0,128
3,UE_1KhwCwT0,Chorizo and Corn Fried Rice | Kenji’s Cooking ...,2025-04-25T15:00:57Z,26,PT12M8S,false,85520,2730,0,138
4,vG-NsFXNYUw,Taste Test: Are Pre-Chopped Garlics Any Good? ...,2025-04-23T15:01:40Z,26,PT15M5S,false,138852,3639,0,298
...,...,...,...,...,...,...,...,...,...,...
772,Tg2K0pmNXsA,First Person Garlic Parmesan French Toast,2016-09-19T08:31:03Z,26,PT1M1S,false,209058,3005,0,68
773,3oTFE6FAIrw,First Person French Omelette,2016-09-15T08:07:05Z,26,PT2M9S,false,3004504,38798,0,1596
774,neLwcOqrucc,Taco Bell Breakfast Wrap,2016-09-07T16:35:16Z,26,PT54S,false,289352,5117,0,73
775,L5TF6YjEZT8,Late Night Spicy Sesame Noodles,2016-08-10T09:16:40Z,26,PT2M25S,false,1176634,18013,0,643
