This file contains the execution of cells based on the project report. 


In [1]:
#Import statements
#------------------------------------------------

from googleapiclient.discovery import build
import pandas as pd
from IPython.display import JSON
import numpy as np
from dateutil import parser
import isodate

# Data visualisation
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import seaborn as sns
sns.set(style="darkgrid", color_codes=True)

# NLP libraries
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
nltk.download('stopwords')
nltk.download('punkt')
from wordcloud import WordCloud
from google.cloud import bigquery # top upload dataframes to bigquery

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sruth\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\sruth\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
api_key = 'AIzaSyCGYJLXOPmDHRw810nos-vfqIoFxCU2u3U'

In [3]:
# Below are channel ids of most subscribed 10 youtube channels in the world
channel_ids = ['UC-lHJZR3Gqxm24_Vd_AJ5Yw',
               'UCq-Fj5jknLsUf-MWSy4_brA',
               'UCbCmjCuTUZos6Inko4u57UQ',
               'UCpEhnqL0y41EpW2TvWAHD7Q',
               'UCX6OQ3DkcsbYNE6H8uQQuVA',
               'UCk8GzjMOrta8yxDcKfylJYw',
               'UCvlE5gTbOvjiolFlEm-c_Ow',
               'UCJplp5SjeGSdVdwsfb9Q7lQ',
               'UCFFbwnve3yF62-tVXkTyHqg',
               'UCJ5v_MCY6GNUBTO8-D3XoAg'
              ]

In [4]:
# Code to establish connection to youtube data API using  API Key

api_service_name = "youtube"
api_version = "v3"

# Get credentials and create an API client
youtube = build(
    api_service_name, api_version, developerKey=api_key)

In [5]:
channel_stats = pd.DataFrame() # Data frame to store channel statistics
video_df = pd.DataFrame() # Data frame to store video details
comments_df = pd.DataFrame() # Data frame to store comments info
most_popular_videos = pd.DataFrame() # Data frame to store region wise most popular video info

Functions to collect data. 

In [6]:
# fucntion to get channel statistics

def get_channel_stats(youtube, channel_ids): 
    all_data = []

    request = youtube.channels().list(
        part="snippet,contentDetails,statistics",
        id=','.join(channel_ids))
    response = request.execute()
    
    #loop through items
    for item in response['items']:
        data = {'channelName': item['snippet']['title'],
               'subscribers': item['statistics']['subscriberCount'],
               'views': item['statistics']['viewCount'],
               'totalVideos': item['statistics']['videoCount'],
               'playlistId': item['contentDetails']['relatedPlaylists']['uploads']
                }
        all_data.append(data)
        
    return(pd.DataFrame(all_data)) #Returns a data frame

# Function to get video ids

def get_video_ids(youtube, playlist_id):
    
    video_ids = []
    
    request = youtube.playlistItems().list(
        part="snippet,contentDetails",
        playlistId= playlist_id,
        maxResults = 50
    )
    response = request.execute()
    
    for item in response['items']:
        video_ids.append(item['contentDetails']['videoId'])
        
    next_page_token = response.get('nextPageToken')
    while next_page_token is not None:
        
        request = youtube.playlistItems().list(
                    part="snippet,contentDetails",
                    playlistId= playlist_id,
                    maxResults = 50,
                    pageToken = next_page_token)
        
        response = request.execute()

        for item in response['items']:
            video_ids.append(item['contentDetails']['videoId'])

        next_page_token = response.get('nextPageToken')
        
    return video_ids


# Function to get video details from video ids. Video ids are passed to this function as arguments and a data frame is returned

def get_video_details(youtube, video_ids):

    all_video_info = []

    for i in range(0, len(video_ids), 50):
        request = youtube.videos().list(
            part="snippet,contentDetails,statistics",
            id= ','.join(video_ids[i:i+50])
        )
        response = request.execute()

        for video in response['items']:
            stats_to_keep = {'snippet': ['channelTitle', 'title', 'description', 'tag', 'publishedAt'],
                            'statistics':['viewCount', 'likeCount','favoriteCount', 'commentCount'],
                            'contentDetails': ['duration', 'definition', 'caption']}
            video_info = {}
            video_info['video_id'] = video['id']
            for k in stats_to_keep.keys():
                for v in stats_to_keep[k]:
                    try:
                        video_info[v] = video[k][v]
                    except:
                        video_info[v] = None

            all_video_info.append(video_info)
        
    return pd.DataFrame(all_video_info)

#Function to get comments under videos

def get_comments_in_videos(youtube, video_ids):


    all_comments = []
    
    for video_id in video_ids:
        try:   
            request = youtube.commentThreads().list(
                part="snippet,replies",
                videoId=video_id
            )
            response = request.execute()
        
            comments_in_video = [comment['snippet']['topLevelComment']['snippet']['textOriginal'] for comment in response['items'][0:10]]
            comments_in_video_info = {'video_id': video_id, 'comments': comments_in_video}

            all_comments.append(comments_in_video_info)
            
        except: 
            # When error occurs - most likely because comments are disabled on a video
            print('Could not get comments for video ' + video_id)
        
    return pd.DataFrame(all_comments)

# Below functions are used to collect data of most popular videos based on a region

def get_response(region_code):
    request = youtube.videos().list(
            part="snippet, contentDetails, statistics",
            chart="mostPopular",
            regionCode=region_code
        )
    response = request.execute()
    return response

def get_next_page_data(response, next_page_token, region_code):
    if(next_page_token): # at the end of the reponse, next_page_token will be null
        request = youtube.videos().list(
                part="snippet, contentDetails, statistics",
                chart="mostPopular",
                regionCode=region_code,
                maxResults = 50,
                pageToken = next_page_token)
        
        response = request.execute()
        return response
    else:
        return None
    
def filter_api_response(response):

    all_video_info = []


    for video in response['items']:
        stats_to_keep = {'snippet': ['channelTitle', 'title', 'description', 'tag', 'publishedAt'],
                        'statistics':['viewCount', 'likeCount','favoriteCount', 'commentCount'],
                        'contentDetails': ['duration', 'definition', 'caption']}
        video_info = {}
        video_info['video_id'] = video['id']
        for k in stats_to_keep.keys():
            for v in stats_to_keep[k]:
                try:
                    video_info[v] = video[k][v]
                except:
                    video_info[v] = None

        all_video_info.append(video_info)
    
    return pd.DataFrame(all_video_info)
#JSON(response)

def get_n_rows_data(response, max_number_of_rows, region_code):
    # response = get_data()
    df_list = []
    df_list.append(filter_api_response(response))

    number_of_rows = 0

    next_page_token = response.get('nextPageToken')
    while(response.get('nextPageToken')):
        df_list.append(filter_api_response(get_next_page_data(response, next_page_token, region_code)))
        number_of_rows += 50
        if(number_of_rows == max_number_of_rows):
            break

    return pd.concat(df_list)

def get_region_wise_data(region_code_list):
    video_data_df_list = []
    for region_code in region_code_list:
        response = get_response(region_code)
        video_data_df_list.append(get_n_rows_data(response, 50, region_code)) # will contain duplicates
    return pd.concat(video_data_df_list)

Now performing Data collection to above Dataframes using the functions.

In [9]:
#Loading channel stats datafame

channel_stats = get_channel_stats(youtube, channel_ids) 

In [12]:
#collecting video ids of all the videos. 

#Copying playlist id and channel name to a list from channel stats dataframe. 

playlist_id_copy = channel_stats['playlistId'].tolist()  # Copying playlist ids of each channel into a seperate list. 
print(len(playlist_id_copy))

channel_name = channel_stats['channelName'].tolist() # copying all the channel name into a seperate4 list
print(len(channel_name))

# Creating a dict to store video ids of correspoding channels

dict_channel_vids = {}

for i in range(len(channel_name)):
    dict_channel_vids[channel_name[i]] = 0 # initializing the values in dict with 0. 

# storing all video ids as a list as values to corresponding YT channels
for i in range(len(playlist_id_copy)):                            
    video_ids = get_video_ids(youtube, playlist_id_copy[i])
    dict_channel_vids[channel_name[i]] = video_ids

10
10


In [None]:
# Loading video_df dataframe

dict_copy = dict_channel_vids
dict_copy.keys()

for i in range(len(dict_channel_vids.keys())):                            
    video_df= video_df.append(get_video_details(youtube, dict_channel_vids[channel_name[i]]))

In [None]:
# Loading comments_df dataframe

# Taking only the first 10 comments of first 100 videos in a channel

for i in range(len(dict_channel_vids.keys())):                            
    temp = dict_copy[channel_name[i]]
    newList = temp[:100] 
    comments_df= comments_df.append(get_comments_in_videos(youtube, newList), ignore_index=True)

In [15]:
# Loading most_popular_videos dataframe

region_code_list = ['GB', 'BE', 'DK', 'DE', 'EE','IE'] # Country code of few european countries
most_popular_videos = get_region_wise_data(region_code_list)

Data exploration and cleaning

In [16]:
channel_stats

Unnamed: 0,channelName,subscribers,views,totalVideos,playlistId
0,Like Nastya,105000000,89475741563,786,UUJplp5SjeGSdVdwsfb9Q7lQ
1,Zee Music Company,94700000,56140625139,8151,UUFFbwnve3yF62-tVXkTyHqg
2,T-Series,241000000,222369697948,19586,UUq-Fj5jknLsUf-MWSy4_brA
3,SET India,155000000,144390080595,111056,UUpEhnqL0y41EpW2TvWAHD7Q
4,Vlad and Niki,96600000,75429474956,551,UUvlE5gTbOvjiolFlEm-c_Ow
5,PewDiePie,111000000,28984754763,4709,UU-lHJZR3Gqxm24_Vd_AJ5Yw
6,MrBeast,150000000,25527580754,741,UUX6OQ3DkcsbYNE6H8uQQuVA
7,WWE,94700000,75897588295,68408,UUJ5v_MCY6GNUBTO8-D3XoAg
8,✿ Kids Diana Show,110000000,91133967627,1088,UUk8GzjMOrta8yxDcKfylJYw
9,Cocomelon - Nursery Rhymes,158000000,159060348408,901,UUbCmjCuTUZos6Inko4u57UQ


In [17]:
video_df

Unnamed: 0,video_id,channelTitle,title,description,tag,publishedAt,viewCount,likeCount,favoriteCount,commentCount,duration,definition,caption
0,eaHkDAQ6wdo,Like Nastya,Nastya and Evelyn help each other as best friends,"Friendship is not only fun, but also helping e...",,2023-05-04T08:00:17Z,1230842,5357,0,0,PT5M27S,hd,false
1,kPoi8UYlluU,Like Nastya,Nastya and Flower dance trend,Nastya and Flower dance trend #shorts,,2023-05-02T08:00:29Z,1241515,37294,0,0,PT17S,hd,false
2,Dk7mJHOSUfg,Like Nastya,Nastya and stories about diversity among kids,A collection of stories about the diversity of...,,2023-04-30T09:00:08Z,1191566,4960,0,0,PT16M32S,hd,false
3,Ar4gmOGp6qA,Like Nastya,Nastya arranged a test of patience for the kids,Nastya arranged an endurance test for 5 kids. ...,,2023-04-27T09:00:05Z,2294545,7652,0,0,PT4M32S,hd,false
4,PB1STAJV3LY,Like Nastya,Nastya and Evelyn - funny dance,Dance fun dances with us and upload to the #sh...,,2023-04-25T13:08:33Z,2275831,47851,0,0,PT18S,hd,false
...,...,...,...,...,...,...,...,...,...,...,...,...,...
897,tVpgEiBcw7M,Cocomelon - Nursery Rhymes,"Learn the ABCs: ""P"" is for Pig and Penguin","Featuring the letter ""P""! \nThis series goes t...",,2007-06-20T03:41:46Z,9272776,4678,0,0,PT1M31S,sd,false
898,7W6fEFixi5U,Cocomelon - Nursery Rhymes,"Learn the ABCs: ""L"" is for Lion and Ladybug","Featuring the letter ""L""! \nThis series goes t...",,2007-06-20T03:34:33Z,24207837,21365,0,0,PT1M48S,sd,false
899,cgC8BC1OINQ,Cocomelon - Nursery Rhymes,"Learn the ABCs: ""K"" is for Kangaroo","Featuring the letter ""K""! \nThis series goes t...",,2007-06-20T01:31:32Z,8514801,4386,0,0,PT2M13S,sd,false
900,0fw3l1z9CUQ,Cocomelon - Nursery Rhymes,ABC Song with Cute Ending,This ABC Song is one of the most popular ABC s...,,2006-09-02T01:34:53Z,289835096,328422,0,0,PT45S,sd,false


In [18]:
comments_df

Unnamed: 0,video_id,comments
0,gu-LlRSEDv0,"[the edits are 🔥, Super, Ye kuposhit bacha kya..."
1,rC-tcyFaUUs,[Touching lyrics accompanied by melody full wi...
2,qWd30w6f1qE,"[😢😢, ❤❤❤, ❤❤❤❤, 😢😢😢😢, 😮😮, 🔥 🚩M"" Respect Everyo..."
3,tRUtoAw_e8Y,"[beautiful welcome 🥰🥰, https://youtu.be/8AFNV..."
4,eCXqMHPlaoE,"[Superb🎉❤, Nice work from vinod paliwal and al..."
...,...,...
592,rvUJr1QXor8,"[😂😂😂😂👎👎👎👎👎👎👎👎, Who will stop SmackDown Womens ..."
593,L0pGH1D6P1Y,"[We miss Pat McAfee & Sarah Schreiber., 😂, I d..."
594,G56Bhb-kPCA,[Brock lesnar I want to see compete for that b...
595,l1e8sShMF0I,"[walah, i love that, We need Randy Orton baaaa..."


In [19]:
most_popular_videos

Unnamed: 0,video_id,channelTitle,title,description,tag,publishedAt,viewCount,likeCount,favoriteCount,commentCount,duration,definition,caption
0,zcbsqsLEL_Y,The United Stand,BRIGHTON vs MANCHESTER UNITED LIVE STREAM Watc...,Unlock an EXCLUSIVE 40% Off ALL* boohooMAN Men...,,2023-05-04T21:13:06Z,451821,7602,0,451,PT3H37S,hd,false
1,ibVqF9NuGDU,Tion Wayne,Tion Wayne - Healing (Official Music Video),Directed by Wowa (https://www.instagram.com/wo...,,2023-05-04T20:22:17Z,496914,97196,0,11365,PT3M1S,hd,false
2,vPvTkhIntjw,The United Stand,ATTACK TO BLAME! Brighton 1-0 Manchester Unite...,Brighton 1-0 Manchester United! Mark Goldbridg...,,2023-05-04T21:40:27Z,233758,5741,0,826,PT29M54S,hd,false
3,DIAT31WlZqI,Behzinga,I’m Having A Surgery…,💪 Gymshark: https://gym.sh/EthanGS\n💾 Second C...,,2023-05-04T17:09:11Z,465318,48923,0,975,PT6M1S,hd,false
4,F4mUnmFbVNg,Ren,Ren - Animal Flow,http://found.ee/Ren-Animal-Flow\n\nhttp://www....,,2023-05-04T19:00:08Z,251303,34996,0,4625,PT2M56S,hd,false
...,...,...,...,...,...,...,...,...,...,...,...,...,...
45,eDnrAGSjP3k,DAZN UEFA Women's Champions League,Arsenal vs. Wolfsburg | UEFA Women's Champions...,🇩🇪 🎙️ 👉 https://youtube.com/live/ET86gczHcrI?f...,,2023-05-01T20:25:12Z,1095379,13759,0,98,PT3H51M20S,hd,false
46,j3ILbiok_1E,Nutshell Animations,1 2 BUCKLE MY SHOE (Animation Meme),Subscribe to My Gaming Channel:\nhttps://www.y...,,2023-04-24T21:00:18Z,8721683,451832,0,5461,PT18S,hd,false
47,Vo8dSsbIppg,Scene City,Miles Intentionally Fails Test | Spider-Man: I...,Miles (Shameik Moore) fails his test on purpos...,,2023-04-19T03:00:12Z,4474821,390719,0,4412,PT28S,hd,false
48,vPvTkhIntjw,The United Stand,ATTACK TO BLAME! Brighton 1-0 Manchester Unite...,Brighton 1-0 Manchester United! Mark Goldbridg...,,2023-05-04T21:40:27Z,233758,5741,0,826,PT29M54S,hd,false


In [20]:
channel_stats.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   channelName  10 non-null     object
 1   subscribers  10 non-null     object
 2   views        10 non-null     object
 3   totalVideos  10 non-null     object
 4   playlistId   10 non-null     object
dtypes: object(5)
memory usage: 528.0+ bytes


In [21]:
video_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 75794 entries, 0 to 901
Data columns (total 13 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   video_id       75794 non-null  object
 1   channelTitle   75794 non-null  object
 2   title          75794 non-null  object
 3   description    75794 non-null  object
 4   tag            0 non-null      object
 5   publishedAt    75794 non-null  object
 6   viewCount      75784 non-null  object
 7   likeCount      75299 non-null  object
 8   favoriteCount  75794 non-null  object
 9   commentCount   73353 non-null  object
 10  duration       75794 non-null  object
 11  definition     75794 non-null  object
 12  caption        75794 non-null  object
dtypes: object(13)
memory usage: 8.1+ MB


In [22]:
comments_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 597 entries, 0 to 596
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   video_id  597 non-null    object
 1   comments  597 non-null    object
dtypes: object(2)
memory usage: 9.5+ KB


In [23]:
most_popular_videos.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 330 entries, 0 to 49
Data columns (total 13 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   video_id       330 non-null    object
 1   channelTitle   330 non-null    object
 2   title          330 non-null    object
 3   description    330 non-null    object
 4   tag            0 non-null      object
 5   publishedAt    330 non-null    object
 6   viewCount      330 non-null    object
 7   likeCount      327 non-null    object
 8   favoriteCount  330 non-null    object
 9   commentCount   325 non-null    object
 10  duration       330 non-null    object
 11  definition     330 non-null    object
 12  caption        330 non-null    object
dtypes: object(13)
memory usage: 36.1+ KB


In [26]:
channel_stats.shape

(10, 5)

In [27]:
video_df.shape

(75794, 13)

In [28]:
comments_df.shape

(597, 2)

In [29]:
most_popular_videos.shape

(330, 13)