In [11]:
# pip install --force-reinstall google-api-python-client==2.11.0  # run this if google-api-python-client was previously installed

Requirement already up-to-date: six in c:\users\hengs\anaconda3\lib\site-packages (1.16.0)


In [None]:
# pip install google-api-python-client==2.11.0  # run this if you don't have google-api-python-client yet

In [None]:
# pip install --upgrade six

In [None]:
# pip install youtube_transcript_api

In [24]:
from googleapiclient.discovery import build
import pprint
from youtube_transcript_api import YouTubeTranscriptApi
import pandas as pd
import re
import numpy as np

Replace the value of `api_key` with your own API key, which is linked to your Google account (do not share this key with anyone). To learn how to create an API key, see [Python YouTube API Tutorial: Getting Started - Creating an API Key and Querying the API](https://www.youtube.com/watch?v=th5_9woFJmk&list=RDCMUCCezIgC97PvUuR4_gbFUs5g&start_radio=1&t=1164s&ab_channel=CoreySchafer).

In [55]:
# Placeholder only: paste your own key here and never commit or share a real key.
api_key = 'CREATE_YOUR_API_KEY'
# Client for the YouTube Data API v3; the API helper functions below send their requests through it.
service = build('youtube', 'v3', developerKey=api_key)

# Run this cell to restart the local quota counter at 10000 units.
# The counter is only this notebook's own estimate of the daily allowance; per the API
# documentation the real quota resets at midnight Pacific Time (3pm or 4pm SGT, depending on
# daylight saving).
ans = input("Are you sure you want to restart your quota count? (Enter Y/N)")
if ans == "Y":
    remaining_quota = 10000
    print("Quota counter reset to {}".format(remaining_quota))
else:
    print("Quota counter reset unsuccessful")
    # On a fresh kernel the counter does not exist yet, and every helper that updates it would
    # raise NameError. Keep an existing count; otherwise start from the full daily allowance.
    if 'remaining_quota' not in globals():
        remaining_quota = 10000
        print("Quota counter initialised to {}".format(remaining_quota))

def check_remaining_quota():
    '''
    Prints the notebook's running estimate of the quota units left for today.

    Reads the module-level `remaining_quota` counter, which is set by the quota reset prompt
    and decremented by the API helper functions.
    '''
    print('Remaining quota for today: {}'.format(remaining_quota))

Quota counter reset unsuccessful


Refer to [Google API documentation](https://developers.google.com/youtube/v3/docs) for more information

In [66]:
def search(query, maxResults=20):
    '''
    Runs ONE YouTube video search and returns the IDs and titles of the results.

    Uses the module-level `service` resource and decrements the module-level `remaining_quota`.

    query: string of search query, e.g. 'harassment (sexual|harass|obscene)', can also use the Boolean NOT (-) and OR (|) operators 
    to exclude videos or to find videos that are associated with one of several search terms.
    maxResults: max number of search results that should be returned, default: 20
                (the search.list endpoint documents an accepted range of 0-50 per request)
    returns: list of videoID and list of video titles of search results

    relevant link: https://developers.google.com/youtube/v3/docs/search/list#usage
    '''
    global remaining_quota

    response = service.search().list(
        part="snippet",
        q=query,
        type='video',
        maxResults=maxResults).execute()

    videos = response['items']
    videoIDs = [vid['id']['videoId'] for vid in videos]
    video_titles = [vid['snippet']['title'] for vid in videos]

    # A search.list request is billed a flat 100 units however many items come back, so the
    # cost must not be derived from the result count (which is 0 for an empty result page).
    remaining_quota -= 100

    return videoIDs, video_titles

def search_thru_list(list_of_queries, maxResultsPerSearch=20):
    '''
    Calls search() once per query, after the user confirms the estimated quota cost.

    list_of_queries: list of search query strings
    maxResultsPerSearch: forwarded to search() as maxResults, default: 20

    returns: pd.DataFrame with columns video_id and video_title, one row per distinct video_id
             (empty when the user answers anything other than "Y")
    '''
    num_queries = len(list_of_queries)
    quota_units = num_queries * np.ceil(maxResultsPerSearch/50) * 100
    ans = input('''You currently have about {} remaining quota units, are you sure you want to query {} times? It will cost approximately {} quota units. 
                 (Enter Y/N)'''.format(remaining_quota, num_queries, quota_units))

    collected_ids = []
    collected_titles = []
    if ans != "Y":
        print("Search not executed")
    else:
        for done, query in enumerate(list_of_queries, start=1):
            ids, titles = search(query, maxResultsPerSearch)
            collected_ids += ids
            collected_titles += titles

            # progress message after every tenth search
            if done % 10 == 0:
                print("Completed {} searches...".format(done))
        print("{} searches executed".format(num_queries))

    results = pd.DataFrame({"video_id": collected_ids, 'video_title': collected_titles})
    # the same video can be returned by several queries; keep its first occurrence only
    return results.drop_duplicates(subset=['video_id'])

def split_into_batch(lst, batch_size):
    '''
    Splits lst into consecutive slices of at most batch_size items each.

    returns: list of the slices, in the original order; only the last one may be shorter
    '''
    total = len(lst)
    num_batches = int(np.ceil(total/batch_size))
    # slice k covers positions [k*batch_size, (k+1)*batch_size), clipped to the end of lst
    return [lst[k*batch_size: min((k+1)*batch_size, total)]
            for k in range(num_batches)]

def get_videos_info(videoIDs):
    '''
    Looks up basic metadata for a collection of videos.

    Uses the module-level `service` resource and decrements the module-level `remaining_quota`.

    videoIDs: list of video IDs
    returns: list of dictionaries, one per video returned by the API, with keys
             id, title, channel_name, published_date and description.
             An empty list is returned (after printing a message) when videoIDs is empty.
    '''
    global remaining_quota

    id_list = list(videoIDs)
    if len(id_list) == 0:
        print("videoIDs must contain at least one ID")
        # return an empty list rather than None so callers can always iterate / take len()
        return []

    batch_size = 20  # number of IDs sent in each videos.list request
    all_videos = []

    for sub_id_list in split_into_batch(id_list, batch_size):
        response = service.videos().list(
            part="snippet",
            id=",".join(sub_id_list)).execute()
        # videos.list is billed 1 unit per request, not per video returned
        remaining_quota -= 1
        all_videos.extend(response['items'])

    return [{'id': vid['id'],
             'title': vid['snippet']['title'],
             'channel_name': vid['snippet']['channelTitle'],
             'published_date': vid['snippet']['publishedAt'],
             'description': vid['snippet']['description']}
            for vid in all_videos]

def get_subtitles(videoID):
    '''
    Fetches the transcript of one video and flattens it into a single string.

    videoID: id of ONE youtube video
    returns: the transcript segments joined in order, each followed by a space;
             an empty string when the transcript cannot be fetched

    time: ~0.7s per videoID
    '''
    try:
        transcript = YouTubeTranscriptApi.get_transcript(videoID)
    except Exception:
        # Best effort: any failure to fetch (presumably a missing or disabled transcript) counts
        # as "no subtitles". Unlike a bare `except:`, this lets KeyboardInterrupt/SystemExit
        # through, so a long scrape can still be stopped.
        return ''

    return ''.join(segment['text'] + ' ' for segment in transcript)
    
def get_comments(videoID, maxComments=20):
    '''
    Fetches the top-level comments of one video, ordered by relevance.

    Uses the module-level `service` resource and decrements the module-level `remaining_quota`.

    videoID: id of ONE youtube video
    maxComments: maximum number of top level comments to extract, default = 20 (max: 100)

    returns: list of comment IDs, list of top-level comments
             (two empty lists when the request fails)
    '''
    global remaining_quota

    try:
        response = service.commentThreads().list(
            part=['snippet'],  # add 'replies' to also fetch replies to top level comments (i.e. nested comments)
            videoId=videoID,
            maxResults=maxComments,
            order='relevance'
        ).execute()
    except Exception:
        # Best effort: the request fails e.g. if comments are disabled. Unlike a bare `except:`,
        # KeyboardInterrupt/SystemExit still propagate.
        return [], []

    # commentThreads.list is billed 1 unit per request, not per comment returned
    remaining_quota -= 1

    comments = response['items']
    commentIDs = [c['id'] for c in comments]
    comment_texts = [c['snippet']['topLevelComment']['snippet']['textOriginal'] for c in comments]

    return commentIDs, comment_texts

def convert_comments_to_one_string(comment_texts):
    '''
    Concatenates all comments into one string, writing " | " after every comment
    (including the last one).

    comment_texts: list of comment strings
    returns: the combined string, '' for an empty list
    '''
    return ''.join(comment + " | " for comment in comment_texts)

def youtube_scrapper(videoIDs): 
    '''
    Collects metadata and subtitles for a set of videos.

    videoIDs: list/pd.Series of IDs to scrape
    
    returns: pandas df containing id, channel name, title, published date, desc, subtitles
             (an empty df with those columns when no video information is returned)
    '''
    columns = ['id','channel_name','title','published_date','description','subtitle']

    print("Scraping video information...")
    videoIDs = list(videoIDs)
    # get_videos_info may hand back nothing for an empty ID list; normalise that to an empty list
    video_info = get_videos_info(videoIDs) or []

    num_vid = len(video_info)
    # Report progress about five times. The step is at least 1 so that fewer than 5 videos
    # does not lead to a modulo by zero.
    progress_step = max(1, num_vid // 5)

    for counter, vid in enumerate(video_info, start=1):
        videoID = vid['id']

        # optional: also attach the top-level comments of each video
        # commentIDs, comment_texts = get_comments(videoID, 50)
        # comment_string = convert_comments_to_one_string(comment_texts)
        # vid['comments'] = comment_string

        vid['subtitle'] = get_subtitles(videoID)
        if counter % progress_step == 0:
            print("Scraped {}/{} videos".format(counter,num_vid))
    
    print("Completed scraping for {} videos".format(num_vid))

    # columns= fixes the column order and still yields the right headers when nothing was scraped
    return pd.DataFrame(video_info, columns=columns)

In [36]:
def check_for_relevant_keywords(df, col_to_search, keyword_list):
    '''
    Flags each row in which at least one of the searched columns contains at least one keyword.
    Matching is a case-insensitive substring test on the string form of each cell.

    df: pd.DataFrame
    col_to_search: list of col name
    keyword_list: list of keyword strings to search for

    returns: series of boolean values
    '''
    def row_has_keyword(row):
        # True as soon as any (cell, keyword) pair matches
        return any(
            str(keyword).lower() in str(cell).lower()
            for cell in row
            for keyword in keyword_list
        )

    searched = df[col_to_search]
    return searched.apply(row_has_keyword, axis=1)

In [38]:
## Build the search queries: "covid singapore" followed by one mental health keyword,
## using only the first num_MH_keywords keywords of the list below.
mental_health_keywords = ['depression', 'mental', 'illness', 'unalive', 'social', 'anxiety', 'loneliness', 'stress', 'lonely', 'isolation', 'suicide', 
                        'abuse', 'death', 'post', 'traumatic', 'stress', 'disorder', 'no', 'motivation', 'therapy', 'trauma', 'counselling', 'mood', 
                        'disorder', 'mood', 'swings', 'mental', 'health', 'angst', 'emotion', 'phobia', 'addiction', 'stigma', 'self-harm', 'neurosis', 
                        'abuse', 'disorder', 'dependence', 'socialize', 'help', 'dead', 'melancholia', 'dysthemia', 'tired', 'trapped', 'paranoia', 
                        'overwhelmed', 'irritable', 'bipolar', 'psychologist', 'well-being', 'imh', 'sos', 'counsellor', 'toxic']

num_MH_keywords = 10

# one query per selected keyword
search_query_list = ["covid singapore " + keyword
                     for keyword in mental_health_keywords[:num_MH_keywords]]

print(search_query_list)


['covid singapore depression', 'covid singapore mental', 'covid singapore illness', 'covid singapore unalive', 'covid singapore social', 'covid singapore anxiety', 'covid singapore loneliness', 'covid singapore stress', 'covid singapore lonely', 'covid singapore isolation']


In [67]:
# Step 1: run every query in search_query_list and collect the distinct video IDs and titles.
# search_thru_list asks for confirmation first, because each search spends API quota.
maxResultsPerSearch = 20
yt_id_df = search_thru_list(search_query_list, maxResultsPerSearch)
print(yt_id_df.head())

# Step 2: for each ID found above, look up channel_name, published date, description and subtitles
search_results_df = youtube_scrapper(yt_id_df['video_id'])
search_results_df.head()

      video_id                                        video_title
0  U_5igBy5L3s  Depression, increased anxiety linked to COVID-...
1  YIAQu1mHQR8  Family of COVID-19 patient in Singapore appeal...
2  54INb3nvLNU  Singapore will enter recession in 2020 due to ...
3  AqOhXQ6kaL0  Singapore remains in &quot;critical situation&...
4  azZoksSWL5I  How Are Singapore&#39;s Migrant Workers Coping...
Scraping video information...
Scraped 33/169 videos
Scraped 66/169 videos
Scraped 99/169 videos
Scraped 132/169 videos
Scraped 165/169 videos
Completed scraping for 169 videos


Unnamed: 0,id,channel_name,title,published_date,description,subtitle
0,U_5igBy5L3s,CNA,"Depression, increased anxiety linked to COVID-...",2020-09-07T16:07:25Z,The amount of time spent getting updates on CO...,the amount of time spent getting updates on co...
1,YIAQu1mHQR8,CNA,Family of COVID-19 patient in Singapore appeal...,2020-03-16T14:14:38Z,The family member of a COVID-19 patient in Sin...,
2,54INb3nvLNU,CNA,Singapore will enter recession in 2020 due to ...,2020-04-28T14:47:20Z,Singapore's central bank has warned that the C...,pickets and job losses are looming for Singapo...
3,AqOhXQ6kaL0,CNA,"Singapore remains in ""critical situation"" amid...",2020-04-14T16:24:07Z,Singapore authorities have now made it mandato...,as of 12 p.m. today we have an additional two ...
4,azZoksSWL5I,CNA Insider,How Are Singapore's Migrant Workers Coping Wit...,2021-02-17T12:00:00Z,It’s been one year since the deadly COVID-19 v...,it's very squeezy and uncomfortable to having ...


In [68]:
## Check if either title or description contains important keywords
# Work on a copy: a plain assignment would alias search_results_df, so the flag columns added
# below would silently appear on the scraped results as well.
video_df = search_results_df.copy()
text_cols = ['title','description']
video_df['contain_covid'] = check_for_relevant_keywords(video_df, text_cols, ['covid', 'coronavirus'])
video_df['contain_sg'] = check_for_relevant_keywords(video_df, text_cols, ['singapore', 'singaporean'])
video_df['contain_mental_health'] = check_for_relevant_keywords(video_df, text_cols, mental_health_keywords)

video_df['relevant'] = video_df['contain_covid'] & video_df['contain_sg'] & video_df['contain_mental_health'] # True if contains all types of keywords

video_df.head()

Unnamed: 0,id,channel_name,title,published_date,description,subtitle,contain_covid,contain_sg,contain_mental_health,relevant
0,U_5igBy5L3s,CNA,"Depression, increased anxiety linked to COVID-...",2020-09-07T16:07:25Z,The amount of time spent getting updates on CO...,the amount of time spent getting updates on co...,True,False,True,False
1,YIAQu1mHQR8,CNA,Family of COVID-19 patient in Singapore appeal...,2020-03-16T14:14:38Z,The family member of a COVID-19 patient in Sin...,,True,True,True,True
2,54INb3nvLNU,CNA,Singapore will enter recession in 2020 due to ...,2020-04-28T14:47:20Z,Singapore's central bank has warned that the C...,pickets and job losses are looming for Singapo...,True,True,False,False
3,AqOhXQ6kaL0,CNA,"Singapore remains in ""critical situation"" amid...",2020-04-14T16:24:07Z,Singapore authorities have now made it mandato...,as of 12 p.m. today we have an additional two ...,True,True,True,True
4,azZoksSWL5I,CNA Insider,How Are Singapore's Migrant Workers Coping Wit...,2021-02-17T12:00:00Z,It’s been one year since the deadly COVID-19 v...,it's very squeezy and uncomfortable to having ...,True,True,True,True


In [86]:
# pd.set_option('display.max_colwidth', None) # to see non-truncated text (None replaces the deprecated -1)
# pd.set_option('display.max_colwidth', 50) # to reset back
irrelevant_df = video_df[video_df['relevant'] == False]
print(len(irrelevant_df))
# sample() raises ValueError when asked for more rows than exist, so cap the request
irrelevant_df.sample(min(5, len(irrelevant_df))) # randomly choose up to 5 to look at

76


Unnamed: 0,id,channel_name,title,published_date,description,subtitle,contain_covid,contain_sg,contain_mental_health,relevant
63,KvGmSdIbMFA,hmtv News,Is Kim Jong-un alive or is it his duplicate? | Jordar News | hmtv,2020-05-07T15:45:31Z,#Jordarnews #hmtv\nFor Breaking Telugu News Please Subscribe to Our Telegram : https://t.me/hmtv...,ah Moonachie Marty Supriya Omar Antonia paramiku Sachin do Sachin wanna mention Burt Kamins woul...,False,False,False,False
70,6zBbX7QqvVs,ABC News (Australia),Kim Jong Un's weight loss sparks debate about leader's grip on power | ABC News,2021-06-12T02:19:27Z,"North Korean leader Kim jong Un's new, slimmed-down appearance has sparked discussions over his ...",,True,False,True,False
78,z1QZz79XjjA,CNA,COVID-19 circuit breaker: Social workers replace human interaction with technology,2020-05-15T17:41:42Z,The jobs of social workers have been made more difficult by stricter COVID-19 circuit breaker me...,the jobs of social workers have been made more difficult by stricter circuit-breaker measures it...,True,False,True,False
143,jneYzo9NGkE,CNA Insider,3-Year-Old Goes Wet Market Shopping On His Own,2019-08-18T02:30:00Z,"For more, SUBSCRIBE to CNA INSIDER! \nhttps://www.youtube.com/cnainsider\n\n""We think he’s too y...",[Music] [Music] [Music] what a burn better things get our Stevie [Music] so across the room we k...,False,False,True,False
24,LWOmhmkyZBU,CNA,Mental health experts warn of psychological trauma caused by COVID-19 pandemic,2020-04-17T16:12:08Z,Concern over the mental health impact of COVID-19 measures is rising. As quarantines and social ...,concern over the mental health impact of covert 19 measures is rising as quarantines and social ...,True,False,True,False


In [87]:
# save data: write the flagged results table. (No notebook-level `df` exists; that name is only
# used inside the helper functions, so `df.to_csv` fails on a fresh kernel.)
video_df.to_csv('video_df.csv')

In [69]:
# read data: use this if you already have the video IDs in a csv file and only want to scrape
# the other information (description, subtitles, ...). The first csv column is used as the index.
# NOTE(review): presumably the IDs are then passed to youtube_scrapper; input_df is not used
# further in the cells shown here.
input_df = pd.read_csv("test.csv", index_col=0)


## Example code to show the outputs of different API requests

In [29]:
# .search()
# Raw response of a single search.list request.
# NOTE: this call spends quota but does not update the remaining_quota counter.
# q is documented as one search string (as passed by search() above), so the query is given
# as a string rather than wrapped in a list.
input_query = 'covid mental health'
maxResults = 1
response = service.search().list(
                part="snippet",
                q=input_query,
                type='video',
                maxResults=maxResults).execute()

# pprint.pprint(response)
response

{'kind': 'youtube#searchListResponse',
 'etag': 'DjVIAcUGK66S_Jqj3d9kDHdyyKo',
 'nextPageToken': 'CAEQAA',
 'regionCode': 'SG',
 'pageInfo': {'totalResults': 1000000, 'resultsPerPage': 1},
 'items': [{'kind': 'youtube#searchResult',
   'etag': 'IrSjIja2tDs-AGIHrdDVB6DJ3V0',
   'id': {'kind': 'youtube#video', 'videoId': 'f4gBmOxtIzA'},
   'snippet': {'publishedAt': '2021-02-11T01:08:45Z',
    'channelId': 'UCrp_UI8XtuYfpiqluWLD7Lw',
    'title': 'Looking at the long-term mental health effects of Covid-19',
    'description': "Dr. Patrice Harris, former president of the American Medical Association, says it's important to talk about mental health to address the problem. People should ...",
    'thumbnails': {'default': {'url': 'https://i.ytimg.com/vi/f4gBmOxtIzA/default.jpg',
      'width': 120,
      'height': 90},
     'medium': {'url': 'https://i.ytimg.com/vi/f4gBmOxtIzA/mqdefault.jpg',
      'width': 320,
      'height': 180},
     'high': {'url': 'https://i.ytimg.com/vi/f4gBmOxtIzA/

In [58]:
# .videos()
# Raw response of a single videos.list request for one video ID (snippet part only).
# NOTE: this call spends quota but does not update the remaining_quota counter.
input_videoID = 'LWOmhmkyZBU'
response = service.videos().list(
            part="snippet",
            id=input_videoID).execute()

# pprint.pprint(response) # pretty print
response

{'kind': 'youtube#videoListResponse',
 'etag': 'ijnckNpnlNYnp9wc2x6R_mhFmKw',
 'items': [{'kind': 'youtube#video',
   'etag': 'tBwIlxi3rIbG-9HSmmg2h64RzZc',
   'id': 'LWOmhmkyZBU',
   'snippet': {'publishedAt': '2020-04-17T16:12:08Z',
    'channelId': 'UC83jt4dlz1Gjl58fzQrrKZg',
    'title': 'Mental health experts warn of psychological trauma caused by COVID-19 pandemic',
    'thumbnails': {'default': {'url': 'https://i.ytimg.com/vi/LWOmhmkyZBU/default.jpg',
      'width': 120,
      'height': 90},
     'medium': {'url': 'https://i.ytimg.com/vi/LWOmhmkyZBU/mqdefault.jpg',
      'width': 320,
      'height': 180},
     'high': {'url': 'https://i.ytimg.com/vi/LWOmhmkyZBU/hqdefault.jpg',
      'width': 480,
      'height': 360},
     'standard': {'url': 'https://i.ytimg.com/vi/LWOmhmkyZBU/sddefault.jpg',
      'width': 640,
      'height': 480},
     'maxres': {'url': 'https://i.ytimg.com/vi/LWOmhmkyZBU/maxresdefault.jpg',
      'width': 1280,
      'height': 720}},
    'channelTitle': 'C

In [27]:
# .commentThreads()
# Raw response of a single commentThreads.list request: the most relevant top-level comment
# of one video.
# NOTE: this call spends quota but does not update the remaining_quota counter.
input_videoID = 'LWOmhmkyZBU'
maxComments = 1

response = service.commentThreads().list(
            part=['snippet'], # add 'replies' to also fetch replies to top level comments (i.e. nested comments)
            videoId=input_videoID,
            maxResults=maxComments,
            order='relevance'
        ).execute()

# pprint.pprint(response)
response

{'kind': 'youtube#commentThreadListResponse',
 'etag': 'OfekVZ633Ly7YJGaFlhbqoLhcRY',
 'nextPageToken': 'QURTSl9pMWVkMy1mRUVkdUYtOHA1UmJrSUZDY0UyNW1RcEw3TWxDM0pvRTdJSW9hLTUxVDExSjlTaWF5R0pDcDVDbjN4T2pLYjIzQTdla3RDQQ==',
 'pageInfo': {'totalResults': 1, 'resultsPerPage': 1},
 'items': [{'kind': 'youtube#commentThread',
   'etag': '5RXAvRAKS0limKZXlVLW4vnaE5Y',
   'id': 'UgxUHzPE1Gvc_Ha7nCp4AaABAg',
   'snippet': {'videoId': 'LWOmhmkyZBU',
    'topLevelComment': {'kind': 'youtube#comment',
     'etag': 'DhfcDl2VUg2Z4aWGOsnsdXulPjc',
     'id': 'UgxUHzPE1Gvc_Ha7nCp4AaABAg',
     'snippet': {'videoId': 'LWOmhmkyZBU',
      'textDisplay': 'In Japan, rental apartments are offered  as time apart  for couples so as to curb divorce during this pandemic. Maybe here the hotels can offer some similar services to help with the mental stress during this difficult period',
      'textOriginal': 'In Japan, rental apartments are offered  as time apart  for couples so as to curb divorce during this pand