In [11]:
# pip install --force-reinstall google-api-python-client==2.11.0  # if you have previously installed google-api-python-client

Requirement already up-to-date: six in c:\users\hengs\anaconda3\lib\site-packages (1.16.0)


In [None]:
# pip install google-api-python-client==2.11.0  # if you don't have google-api-python-client yet

In [None]:
# pip install --upgrade six

In [None]:
# pip install youtube_transcript_api

In [31]:
import os
import pprint

import pandas as pd
from googleapiclient.discovery import build
from youtube_transcript_api import YouTubeTranscriptApi

Replace the value of `api_key` with your own API key, which is linked to your Google account (do not share this key with anyone). To learn how to create an API key, see [Python YouTube API Tutorial: Getting Started - Creating an API Key and Querying the API](https://www.youtube.com/watch?v=th5_9woFJmk&list=RDCMUCCezIgC97PvUuR4_gbFUs5g&start_radio=1&t=1164s&ab_channel=CoreySchafer).

In [2]:
# Prefer a key supplied through the YOUTUBE_API_KEY environment variable so the
# credential never has to be saved inside the notebook; the placeholder below is
# only used when that variable is not set.
api_key = os.environ.get('YOUTUBE_API_KEY', 'CREATE_YOUR_API_KEY')

# Client for version 3 of the YouTube Data API; the functions below use it to send requests.
service = build('youtube', 'v3', developerKey=api_key)

Refer to the [YouTube Data API documentation](https://developers.google.com/youtube/v3/docs) for more information.

In [35]:
def search(service, query, maxResults=20):
    '''
    Search YouTube for videos matching a query.

    service: google resource as shown above
    query: string of search query, e.g. 'harassment (sexual|harass|obscene)', can also use the Boolean NOT (-) and OR (|) operators
    to exclude videos or to find videos that are associated with one of several search terms.
    maxResults: max number of search results that should be returned, default: 20.
    The API serves at most 50 results per request, so larger values are collected
    page by page (one additional API request per page).
    returns: (videoIDs, video_titles), two parallel lists with one entry per video found;
    both are shorter than maxResults when the search runs out of results.

    relevant link: https://developers.google.com/youtube/v3/docs/search/list#usage
    '''
    PAGE_LIMIT = 50  # upper bound accepted by the maxResults parameter of search.list

    videoIDs, video_titles = [], []
    page_token = None

    while len(videoIDs) < maxResults:
        params = {
            'part': "snippet",
            'q': query,
            'type': 'video',
            # only ask for what is still missing, capped at the per-request limit
            'maxResults': min(PAGE_LIMIT, maxResults - len(videoIDs)),
        }
        if page_token:
            params['pageToken'] = page_token

        response = service.search().list(**params).execute()

        videos = response.get('items', [])
        for vid in videos:
            videoIDs.append(vid['id']['videoId'])
            video_titles.append(vid['snippet']['title'])

        # stop when there is no further page, or when a page comes back empty
        page_token = response.get('nextPageToken')
        if not videos or not page_token:
            break

    return videoIDs, video_titles

def get_videos_info(videoIDs):
    '''
    Look up basic metadata for a set of videos.

    videoIDs: list of video IDs (a single ID string is also accepted)
    returns: list of dictionary of basic videos information, such as ID, title, channel name,
    date of publication, description. Only videos present in the API response are included,
    and an empty input yields an empty list without contacting the API.

    Uses the notebook-level `service` resource.
    '''
    BATCH_SIZE = 50  # ids sent per videos.list request

    # a bare string would otherwise be sliced character by character below
    if isinstance(videoIDs, str):
        videoIDs = [videoIDs]
    else:
        videoIDs = list(videoIDs)

    video_info = []
    for start in range(0, len(videoIDs), BATCH_SIZE):
        batch = videoIDs[start:start + BATCH_SIZE]

        # the `id` filter is documented as a comma-separated list of video ids
        response = service.videos().list(
                        part="snippet",
                        id=','.join(batch)).execute()

        for vid in response.get('items', []):
            snippet = vid['snippet']
            video_info.append({'id': vid['id'],
                               'title': snippet['title'],
                               'channel_name': snippet['channelTitle'],
                               'published_date': snippet['publishedAt'],
                               'description': snippet['description']})

    return video_info

def get_subtitles(videoID):
    '''
    Fetch the transcript of a video as one string.

    videoID: id of ONE youtube video
    returns: subtitles of video, i.e. every transcript segment followed by a single space;
    an empty string when the transcript cannot be retrieved

    time: ~0.7s per videoID
    '''
    try:
        transcript = YouTubeTranscriptApi.get_transcript(videoID)
    except Exception:
        # Best effort: a video without a retrievable transcript yields no subtitle.
        # `Exception` (rather than a bare except) lets Ctrl-C / kernel interrupts through.
        return ''

    return ''.join(segment['text'] + ' ' for segment in transcript)
    
def get_comments(videoID, maxComments=20):
    '''
    Fetch the top-level comments of a video, ordered by relevance.

    videoID: id of ONE youtube video
    maxComments: maximum number of top level comments to extract, default = 20 (max: 100)

    returns: list of comment IDs, list of top-level comments (original text);
    two empty lists when the request fails

    Uses the notebook-level `service` resource.
    '''
    try:
        response = service.commentThreads().list(
            part=['snippet'],  # add 'replies' to also fetch replies to top-level comments (i.e. nested comments)
            videoId=videoID,
            maxResults=maxComments,
            order='relevance'
        ).execute()
    except Exception:
        # Best effort, e.g. when comments are disabled for the video.
        # `Exception` (rather than a bare except) lets Ctrl-C / kernel interrupts through.
        return [], []

    comments = response.get('items', [])
    commentIDs = [c['id'] for c in comments]
    comment_texts = [c['snippet']['topLevelComment']['snippet']['textOriginal'] for c in comments]

    return commentIDs, comment_texts

def convert_comments_to_one_string(comment_texts):
    '''
    comment_texts: list of comment strings
    returns: one string in which every comment is followed by the separator " | "
    (so a non-empty result also ends with that separator)
    '''
    return ''.join(comment + " | " for comment in comment_texts)

def youtube_scrapper(search_query, max_num_of_videos, max_comments=50):
    '''
    Run the whole pipeline for one search query: search videos, fetch their basic
    information, then attach comments and subtitles to every video.

    search_query: string of search query passed to `search`
    max_num_of_videos: maximum number of videos to request from `search`
    max_comments: maximum number of top-level comments fetched per video, default = 50

    returns: dataframe with one row per video and the columns
    id, channel_name, title, published_date, description, subtitle, comments
    (an empty dataframe with these columns when the search finds nothing)

    Uses the notebook-level `service` resource and prints progress messages.
    '''
    columns = ['id','channel_name','title','published_date','description','subtitle','comments']

    print("(1/4) Scraping videoID...")
    videoIDs, _ = search(service, search_query , max_num_of_videos)

    print("(2/4) Scraping video information...")
    # nothing to look up when the search came back empty
    video_info = get_videos_info(videoIDs) if videoIDs else []

    print("(3/4) Scraping video subtitles and comments...")
    for vid in video_info:
        videoID = vid['id']

        _, comment_texts = get_comments(videoID, max_comments)
        vid['comments'] = convert_comments_to_one_string(comment_texts)

        vid['subtitle'] = get_subtitles(videoID)

    print("(4/4) Converting to dataframe...")
    # passing `columns` fixes the column order and keeps the columns even for zero rows
    df = pd.DataFrame(video_info, columns=columns)

    print("Completed scraping for: '{}' ".format(search_query))
    print("Number of videos scrapped: {}".format(len(video_info)))

    return df

In [36]:
# Search terms handed to youtube_scrapper.
search_query = 'covid mental health singapore'
# Number of videos requested from the search step.
max_num_of_videos = 50

# Run the full pipeline (search -> video info -> comments and subtitles -> dataframe).
df = youtube_scrapper(search_query, max_num_of_videos)
# Preview the first rows of the scraped table.
df.head()

(1/4) Scraping videoID...
(2/4) Scraping video information...
(3/4) Scraping video subtitles and comments...
(4/4) Converting to dataframe...
Completed scraping for: 'covid mental health singapore' 
Number of videos scrapped: 50


Unnamed: 0,id,channel_name,title,published_date,description,subtitle,comments
0,AIB_pFuSfpQ,CNA,COVID-19: Singapore authorities on how the men...,2020-08-06T11:32:40Z,How are the mental health needs of migrant wor...,uh so my question is about the number of uh re...,"At this point in time, I really can't care les..."
1,QShkVkVOlzs,The Straits Times,Living with anxiety during Covid-19,2020-10-10T15:48:14Z,Four Singaporeans share how Covid-19 had an im...,i graduated just thrusted into like the workin...,Living in the climate of fear. A mental tortur...
2,LWOmhmkyZBU,CNA,Mental health experts warn of psychological tr...,2020-04-17T16:12:08Z,Concern over the mental health impact of COVID...,concern over the mental health impact of cover...,"In Japan, rental apartments are offered as ti..."
3,gD7k4o8JBJI,CNN-News18,CBSE News | Delhi News | Maharashtra News | Co...,2021-02-04T06:35:31Z,Pushkar Singh Dhami on Sunday took oath as the...,,
4,53vfPhhbh-g,British Chamber of Commerce Singapore,Mental Health at Work during COVID 19 | BritCh...,2020-07-24T01:17:52Z,Watch this webinar for insights from thought l...,good afternoon and welcome everybody to this e...,


In [37]:
# Save the scraped table as youtube_data.csv (relative path, so it lands in the working directory).
# NOTE(review): index=False is not passed, so the dataframe's row index is written to the file
# as well - confirm that this extra leading column is wanted.
df.to_csv('youtube_data.csv')

## Example code to show the outputs of different API requests

In [29]:
# .search()
# Example: raw response of a search request limited to a single video result.
# `q` is a single search string, so the query is passed as a plain string (not a list).
input_query = 'covid mental health'
maxResults = 1
response = service.search().list(
                part="snippet",
                q=input_query,
                type='video',
                maxResults=maxResults).execute()

# pprint.pprint(response)
response

{'kind': 'youtube#searchListResponse',
 'etag': 'DjVIAcUGK66S_Jqj3d9kDHdyyKo',
 'nextPageToken': 'CAEQAA',
 'regionCode': 'SG',
 'pageInfo': {'totalResults': 1000000, 'resultsPerPage': 1},
 'items': [{'kind': 'youtube#searchResult',
   'etag': 'IrSjIja2tDs-AGIHrdDVB6DJ3V0',
   'id': {'kind': 'youtube#video', 'videoId': 'f4gBmOxtIzA'},
   'snippet': {'publishedAt': '2021-02-11T01:08:45Z',
    'channelId': 'UCrp_UI8XtuYfpiqluWLD7Lw',
    'title': 'Looking at the long-term mental health effects of Covid-19',
    'description': "Dr. Patrice Harris, former president of the American Medical Association, says it's important to talk about mental health to address the problem. People should ...",
    'thumbnails': {'default': {'url': 'https://i.ytimg.com/vi/f4gBmOxtIzA/default.jpg',
      'width': 120,
      'height': 90},
     'medium': {'url': 'https://i.ytimg.com/vi/f4gBmOxtIzA/mqdefault.jpg',
      'width': 320,
      'height': 180},
     'high': {'url': 'https://i.ytimg.com/vi/f4gBmOxtIzA/

In [28]:
# .videos()
# Example: raw response of a videos request for one video ID (snippet part only).
input_videoID = 'LWOmhmkyZBU'
response = service.videos().list(
            part="snippet",
            id=input_videoID).execute()

# pprint.pprint(response) #pretty print
# Leaving `response` as the last expression makes the notebook display it.
response

{'kind': 'youtube#videoListResponse',
 'etag': 'cxfw3vdkR5s0wi-ZI7cn01sCNBY',
 'items': [{'kind': 'youtube#video',
   'etag': 'N0PyyKatu2okgqZCYFg1XvMO-dQ',
   'id': 'LWOmhmkyZBU',
   'snippet': {'publishedAt': '2020-04-17T16:12:08Z',
    'channelId': 'UC83jt4dlz1Gjl58fzQrrKZg',
    'title': 'Mental health experts warn of psychological trauma caused by COVID-19 pandemic',
    'thumbnails': {'default': {'url': 'https://i.ytimg.com/vi/LWOmhmkyZBU/default.jpg',
      'width': 120,
      'height': 90},
     'medium': {'url': 'https://i.ytimg.com/vi/LWOmhmkyZBU/mqdefault.jpg',
      'width': 320,
      'height': 180},
     'high': {'url': 'https://i.ytimg.com/vi/LWOmhmkyZBU/hqdefault.jpg',
      'width': 480,
      'height': 360},
     'standard': {'url': 'https://i.ytimg.com/vi/LWOmhmkyZBU/sddefault.jpg',
      'width': 640,
      'height': 480},
     'maxres': {'url': 'https://i.ytimg.com/vi/LWOmhmkyZBU/maxresdefault.jpg',
      'width': 1280,
      'height': 720}},
    'channelTitle': 'C

In [27]:
# .commentThreads()
# Example: raw response of a commentThreads request for one video, limited to a
# single top-level comment and ordered by relevance.
input_videoID = 'LWOmhmkyZBU'
maxComments = 1

response = service.commentThreads().list(
            part=['snippet'], # add 'replies' to also fetch replies to top-level comments (i.e. nested comments)
            videoId=input_videoID,
            maxResults=maxComments,
            order='relevance'
        ).execute()

# pprint.pprint(response)
# Leaving `response` as the last expression makes the notebook display it.
response

{'kind': 'youtube#commentThreadListResponse',
 'etag': 'OfekVZ633Ly7YJGaFlhbqoLhcRY',
 'nextPageToken': 'QURTSl9pMWVkMy1mRUVkdUYtOHA1UmJrSUZDY0UyNW1RcEw3TWxDM0pvRTdJSW9hLTUxVDExSjlTaWF5R0pDcDVDbjN4T2pLYjIzQTdla3RDQQ==',
 'pageInfo': {'totalResults': 1, 'resultsPerPage': 1},
 'items': [{'kind': 'youtube#commentThread',
   'etag': '5RXAvRAKS0limKZXlVLW4vnaE5Y',
   'id': 'UgxUHzPE1Gvc_Ha7nCp4AaABAg',
   'snippet': {'videoId': 'LWOmhmkyZBU',
    'topLevelComment': {'kind': 'youtube#comment',
     'etag': 'DhfcDl2VUg2Z4aWGOsnsdXulPjc',
     'id': 'UgxUHzPE1Gvc_Ha7nCp4AaABAg',
     'snippet': {'videoId': 'LWOmhmkyZBU',
      'textDisplay': 'In Japan, rental apartments are offered  as time apart  for couples so as to curb divorce during this pandemic. Maybe here the hotels can offer some similar services to help with the mental stress during this difficult period',
      'textOriginal': 'In Japan, rental apartments are offered  as time apart  for couples so as to curb divorce during this pand