In [1]:

import os                                    # Read secrets (API key) from the environment
from functools import partial                # Use with Map to fix an argument

import pandas as pd
from googleapiclient.discovery import build  # Google API
from IPython.display import JSON             # Disply JSON


## Data Extraction

In [2]:

# The API key is a secret, so it is read from the environment instead of being
# written into the notebook (where it would end up in every shared copy and in
# version history). Any key that was previously committed in this file should be
# treated as compromised and rotated in the Google Cloud console.
api_key = os.environ.get('YOUTUBE_API_KEY')
if not api_key:
    raise RuntimeError(
        "Set the YOUTUBE_API_KEY environment variable to a YouTube Data API v3 key "
        "before running this notebook."
    )

# Channels analysed in this notebook (channel id -> channel named in the comment).
channel_ids = ['UCpQ34afVgk8cRQBjSJ1xuJQ', # MadFit
               'UCvGEK5_U-kLgO6-AMDPeTUQ', # EmiWong
               'UCIJwWYOfsCfz6PjxbONYXSg', # Blogilates
               'UCCgLoMYIyP0U56dEhEL1wXQ', # ChloeTing
               'UCi0AqmA_3DGPFCu5qY0LLSg', # Rebecca-Louise
              ]

# Create the YouTube Data API v3 client, authenticated with the developer key.
youtube = build("youtube", "v3", developerKey=api_key)

In [5]:
def get_channel_stats(youtube,list_channel_ids):
    """
    Get channel statistics of given channels.

    Params: youtube: build object from googleapiclient.discovery,
            list_channel_ids: list of youtube channel ids.
    Returns: dataframe with one row per channel returned by the API and the columns
             ChannelName, ChannelDescription, PublishedDate, TotalSubscribers,
             TotalViews, TotalVideos, playlistID (the channel's uploads playlist).
             Rows follow the order of the API response, which is not necessarily
             the order of list_channel_ids. A statistic that is missing from the
             response is stored as None. An empty list yields an empty dataframe
             without issuing any request.

    """
    # Same batch size the notebook uses when requesting videos by id; keeps the
    # comma-joined id parameter bounded for long channel lists.
    batch_size = 50

    all_data = []
    for start in range(0, len(list_channel_ids), batch_size):
        # Use the function argument here (not the notebook-level channel_ids
        # global) so the function honours whatever list the caller passes in.
        request = youtube.channels().list(
            part="snippet,contentDetails,statistics",
            id=','.join(list_channel_ids[start:start + batch_size]))
        response = request.execute()

        # Tolerate a response without an 'items' key (e.g. no id matched).
        for item in response.get('items', []):
            statistics = item['statistics']
            data = dict(ChannelName = item['snippet']['title'],
                        ChannelDescription = item['snippet']['description'],
                        PublishedDate = item['snippet']['publishedAt'],
                        # .get(): a channel may not expose every counter
                        # (presumably e.g. a hidden subscriber count) - record None
                        # rather than aborting the whole table.
                        TotalSubscribers = statistics.get('subscriberCount'),
                        TotalViews = statistics.get('viewCount'),
                        TotalVideos = statistics.get('videoCount'),
                        playlistID = item['contentDetails']['relatedPlaylists']['uploads']
                        )
            all_data.append(data)

    return pd.DataFrame(all_data)
    

In [7]:
# Build the statistics table for the channels in channel_ids; as the last
# expression of the cell, the returned dataframe is displayed below.
get_channel_stats(youtube, channel_ids)

Unnamed: 0,ChannelName,ChannelDescription,PublishedDate,TotalSubscribers,TotalViews,TotalVideos,playlistID
0,blogilates,"Hey guys! My name is Cassey Ho, I am a certifi...",2009-06-13T09:05:48Z,8570000,2717016358,1177,UUIJwWYOfsCfz6PjxbONYXSg
1,Chloe Ting,Subscribe to my channel and find weekly workou...,2011-08-17T04:29:09Z,24600000,2964839459,405,UUCgLoMYIyP0U56dEhEL1wXQ
2,emi wong,welcome to my channel!\nhope my videos can hel...,2014-11-02T14:43:34Z,6060000,809944803,490,UUvGEK5_U-kLgO6-AMDPeTUQ
3,MadFit,"This is a place where I post REAL TIME, AT HOM...",2018-03-02T01:46:06Z,7940000,932098011,709,UUpQ34afVgk8cRQBjSJ1xuJQ
4,Rebecca-Louise,"Hey, \n\nWelcome to #TEAMBURN 🙌🏻 \n\nI am so e...",2012-09-22T18:04:00Z,718000,116943876,1238,UUi0AqmA_3DGPFCu5qY0LLSg


In [14]:
def get_video_ids(youtube,playlist_id):

    """
    Collect the video id of every item in a playlist, following pagination.
    Params: youtube: build object of googleapiclient.discovery
            playlist_id: id of the playlist to walk (e.g. a channel's uploads playlist)
    Returns: list of video ids, in the order the API returned them across all pages

    """
    collected_ids = []
    # Parameters of the first request; a page token is only added for follow-up pages.
    query = dict(part="contentDetails", maxResults=50, playlistId=playlist_id)

    while True:
        page = youtube.playlistItems().list(**query).execute()
        collected_ids.extend(entry['contentDetails']['videoId'] for entry in page['items'])

        token = page.get('nextPageToken')
        if token is None:
            # No further page advertised: everything has been collected.
            return collected_ids
        query['pageToken'] = token
    

In [15]:
# Collect every video id from one hard-coded uploads playlist (the playlistID
# shown for MadFit in the channel statistics table above).
list_video_ids = get_video_ids(youtube,'UUpQ34afVgk8cRQBjSJ1xuJQ')
# Last expression of the cell: displays how many video ids were collected.
len(list_video_ids)

710

In [None]:
def get_video_stats(youtube, list_video_ids):

    """
    Get all desired video stats of the given videos.
    Params: youtube: build object from googleapiclient.discovery
            list_video_ids: list of video ids
    Returns: dataframe with one row per video returned by the API and the columns
                'video_id',
                'channelTitle', 'title', 'description', 'tags', 'publishedAt',
                'viewCount', 'likeCount', 'favouriteCount', 'commentCount', 'duration', 'definition'
            None value is given for any stat not available for a given video.
            The 'favouriteCount' column keeps its original (British) name but is
            filled from the API field 'favoriteCount'.

    """
    # Output columns to extract, grouped by the part of the video resource they live in.
    stats_to_keep = {'snippet': ['channelTitle', 'title', 'description', 'tags', 'publishedAt'],
                     'statistics': ['viewCount', 'likeCount', 'favouriteCount', 'commentCount'],
                     'contentDetails': ['duration', 'definition']
                    }
    # Columns whose name differs from the API field name. The API spells the
    # field 'favoriteCount'; looking up 'favouriteCount' always produced None.
    api_field_names = {'favouriteCount': 'favoriteCount'}

    all_video_stats = []

    # Ids are sent in comma-joined batches of 50 per request.
    for i in range(0,len(list_video_ids),50):
        request = youtube.videos().list(
            part="snippet,contentDetails,statistics",
            id = ','.join(list_video_ids[i:i+50]))
        response = request.execute()

        # Tolerate a response without an 'items' key (e.g. none of the ids matched).
        for video in response.get('items', []):
            video_stats = {}
            video_stats['video_id'] = video['id']

            for part, columns in stats_to_keep.items():
                # A missing part or field yields None instead of an exception,
                # without hiding unrelated errors behind a bare except.
                part_data = video.get(part) or {}
                for column in columns:
                    video_stats[column] = part_data.get(api_field_names.get(column, column))

            all_video_stats.append(video_stats)

    return pd.DataFrame(all_video_stats)
    
    

In [None]:
def get_video_comments(youtube, list_video_ids):

    """
    Get up to 10 top-level comments for each video in the provided list.
    Params: youtube: build object from googleapiclient.discovery
            list_video_ids: list of video ids
    Returns: dataframe with the columns 'video_id' and 'comments', where 'comments'
             is a list with the original text of the first 10 comment threads the
             API returned for that video. No ordering is requested, so the API's
             default ordering applies. Videos whose comments could not be fetched
             are reported on stdout and left out of the dataframe.

    """
    all_comments = []

    for video_id in list_video_ids:
        try:
            request = youtube.commentThreads().list(
                part="snippet,replies",
                videoId=video_id)
            response = request.execute()

            comments_in_video = [comment['snippet']['topLevelComment']['snippet']['textOriginal'] for comment in response['items'][0:10]]
            comments_in_video_info = {'video_id': video_id, 'comments': comments_in_video}

            all_comments.append(comments_in_video_info)

        except Exception as err:
            # Best effort per video - most likely comments are disabled on it.
            # Catching Exception (not a bare except) keeps KeyboardInterrupt and
            # SystemExit working, so a long run can still be stopped, and the
            # reason is printed so other failures are not mistaken for
            # disabled comments.
            print('Could not get comments for video ' + video_id + ': ' + str(err))

    return pd.DataFrame(all_comments)

## Data Preprocessing