In [1]:
from googleapiclient.discovery import build
import pandas as pd
from IPython.display import JSON

## 1. Prepare the API key and channel IDs

In [2]:
# Which Google API, and which version of it, the client should talk to.
api_service_name = "youtube"
api_version = "v3"

# Build the API client. `api_key` is not defined in this cell and must already exist.
youtube = build(serviceName=api_service_name,
                version=api_version,
                developerKey=api_key)

# IDs of the channels to analyse.
channel_ids = [
    'UCFMEYTv6N64hIL9FlQ_hxBw',  # 1. Đông Tây Promotion
    'UCkna2OcuN1E6u5I8GVtdkOw',  # 2. Vie Channel
    'UC0jDoh3tVXCaqJ6oTve8ebA',  # 3. FapTV
    'UCuJ5k3GndbHnXLYyiIR6Z8Q',  # 4. VTV Giai Tri
    'UCwmurIyZ6FHyVPtKppe_2_A',  # 5. Dien Quan Comedy
]

## 2. Collect channels

In [3]:
def collect_stats_channel(youtube, channel_ids):
    '''
    Fetch summary statistics for a set of YouTube channels.

    Params:
        youtube -> API client exposing channels().list(...).execute()
        channel_ids -> list of channel ID strings
    Return:
        the statistical information of these channels -> DataFrame
        Columns: Channel, PlaylistId, PublishedTime, Views, Subscribers, Videos.
        One row per channel returned by the API; an empty DataFrame (with the
        same columns) when nothing is returned. Counts are kept exactly as the
        API delivers them; a statistic absent from the response becomes None.
    '''
    columns = ['Channel', 'PlaylistId', 'PublishedTime',
               'Views', 'Subscribers', 'Videos']
    # Send the IDs in chunks so a long list does not end up in one oversized request.
    batch_size = 50

    channel_ids = list(channel_ids)
    all_data = []

    for start in range(0, len(channel_ids), batch_size):
        request = youtube.channels().list(
            part="snippet,contentDetails,statistics",
            id=','.join(channel_ids[start:start + batch_size])
        )
        response = request.execute()

        # 'items' may be absent when no requested channel matched.
        for item in response.get('items', []):
            statistics = item.get('statistics') or {}
            all_data.append({
                'Channel': item['snippet']['title'],
                'PlaylistId': item['contentDetails']['relatedPlaylists']['uploads'],
                'PublishedTime': item['snippet']['publishedAt'],
                'Views': statistics.get('viewCount'),
                'Subscribers': statistics.get('subscriberCount'),
                'Videos': statistics.get('videoCount'),
            })

    # Explicit columns keep the schema stable even when no rows were collected.
    return pd.DataFrame(all_data, columns=columns)

In [4]:
# Collect the statistics of every configured channel and display the table.
channels_df = collect_stats_channel(youtube=youtube, channel_ids=channel_ids)
channels_df

Unnamed: 0,Channel,PlaylistId,PublishedTime,Views,Subscribers,Videos
0,VTV Giải Trí Official,UUuJ5k3GndbHnXLYyiIR6Z8Q,2018-12-20T07:33:59Z,2078335082,2450000,8906
1,FAPTV,UU0jDoh3tVXCaqJ6oTve8ebA,2014-02-26T15:44:42Z,4921739826,13300000,567
2,ĐÔNG TÂY PROMOTION OFFICIAL,UUFMEYTv6N64hIL9FlQ_hxBw,2014-12-17T03:07:44Z,4870093296,8080000,7774
3,DIEN QUAN Comedy / Hài,UUwmurIyZ6FHyVPtKppe_2_A,2014-12-17T06:57:46Z,3941828903,5940000,4052
4,Vie Channel - HTV2,UUkna2OcuN1E6u5I8GVtdkOw,2012-06-04T08:47:14Z,4312902548,9870000,8597


## 3. Collect video_id by playlist_id

In [5]:
# Uploads-playlist ID of each collected channel, as a plain Python list.
playlist_ids = channels_df['PlaylistId'].tolist()

In [6]:
def collect_video_id(youtube, playlist_id):
    '''
    Collect the ID of every video in one playlist, following all result pages.

    Params:
        youtube -> API client exposing playlistItems().list(...).execute()
        playlist_id -> str, ID of the playlist to read
    Return:
        video_id -> DataFrame with columns Channel, VideoId
        (one row per playlist item; empty when the playlist has no items)
    '''
    video_ids = []
    next_page_token = None

    while True:
        params = {
            'part': "snippet,contentDetails",
            'maxResults': 50,
            'playlistId': playlist_id,
        }
        # Only pass a page token once the API has handed one out.
        if next_page_token:
            params['pageToken'] = next_page_token

        response = youtube.playlistItems().list(**params).execute()

        for item in response.get('items', []):
            video_ids.append({'Channel': item['snippet']['channelTitle'],
                              'VideoId': item['contentDetails']['videoId']})

        # The last page carries no 'nextPageToken'; stop there.
        next_page_token = response.get('nextPageToken')
        if not next_page_token:
            break

    return pd.DataFrame(video_ids, columns=['Channel', 'VideoId'])

In [7]:
# Collect one DataFrame per playlist, then concatenate once: growing a DataFrame
# with pd.concat inside the loop re-copies all earlier rows on every iteration.
video_id_frames = [collect_video_id(youtube, playlist_id)
                   for playlist_id in playlist_ids]

# pd.concat rejects an empty list, so fall back to an empty frame with the same columns.
video_ids_df = (
    pd.concat(video_id_frames, axis=0, ignore_index=True)
    if video_id_frames
    else pd.DataFrame(columns=['Channel', 'VideoId'])
)
video_ids_df.head()

Unnamed: 0,Channel,VideoId
0,VTV Giải Trí Official,LObx_aVfpuA
1,VTV Giải Trí Official,a8-B8yByqK0
2,VTV Giải Trí Official,B2WuOIw9w9U
3,VTV Giải Trí Official,0kjiyMoIeU0
4,VTV Giải Trí Official,lL_UZK0Zdxo


## 4. Collect video information by video_id

In [8]:
def _video_record(item):
    '''Flatten one videos.list result item into a single row dict.'''
    snippet = item['snippet']
    content_details = item['contentDetails']
    # Statistics (and individual counters inside it) can be absent for a video.
    statistics = item.get('statistics') or {}

    return {
        'VideoId': item['id'],
        'Channel': snippet['channelTitle'],
        'Title': snippet['title'],
        'publishedTime': snippet['publishedAt'],
        'CategoryId': snippet['categoryId'],
        'Description': snippet['description'],
        'Duration': content_details['duration'],      # ISO 8601 duration
        'Dimension': content_details['dimension'],    # 2D or 3D
        'Caption': content_details['caption'],        # subtitle
        # Optional attributes default to 0 when the API omits them.
        'Tags': snippet.get('tags', 0),
        'Views': statistics.get('viewCount', 0),
        'Likes': statistics.get('likeCount', 0),
        'Comments': statistics.get('commentCount', 0),
    }


def collect_information_video(youtube, video_ids):
    '''
    Fetch details and statistics for a list of videos.

    Solution:
        The number of videos can be large, so the IDs are sent in batches of
        at most 50. The final batch simply holds whatever remains; no request
        is sent when nothing remains.
    Params:
        youtube -> API client exposing videos().list(...).execute()
        video_ids -> list of video ID strings
    Return:
        information of video -> DataFrame
        Columns: VideoId, Channel, Title, publishedTime, CategoryId,
        Description, Duration, Dimension, Caption, Tags, Views, Likes, Comments.
        Empty (with the same columns) when no video is returned.
    '''
    columns = ['VideoId', 'Channel', 'Title', 'publishedTime', 'CategoryId',
               'Description', 'Duration', 'Dimension', 'Caption',
               'Tags', 'Views', 'Likes', 'Comments']
    batch_size = 50

    # Batch over the argument itself, not over any notebook-level DataFrame.
    video_ids = list(video_ids)
    all_information_video = []

    for start in range(0, len(video_ids), batch_size):
        batch = video_ids[start:start + batch_size]
        # maxResults is omitted: the videos.list reference states it is not
        # supported together with the `id` filter.
        request = youtube.videos().list(
            part="snippet,contentDetails,statistics",
            id=','.join(batch))
        response = request.execute()

        all_information_video.extend(
            _video_record(item) for item in response.get('items', []))

    return pd.DataFrame(all_information_video, columns=columns)


In [9]:
# Fetch the details of every collected video ID and preview the first rows.
information_videos_df = collect_information_video(youtube, video_ids_df['VideoId'].tolist())
information_videos_df.head()

Unnamed: 0,VideoId,Channel,Title,publishedTime,CategoryId,Description,Duration,Dimension,Caption,Tags,Views,Likes,Comments
0,LObx_aVfpuA,VTV Giải Trí Official,Chồng cũ vợ cũ người yêu cũ tập 22 | Một pha '...,2022-07-08T15:30:09Z,24,▶️ Full Chồng cũ vợ cũ người yêu cũ tập 22: ht...,PT5M9S,2d,False,"[vtv giải trí, vtv giai tri, phim hay, phim vt...",10890,91,5
1,a8-B8yByqK0,VTV Giải Trí Official,TRỰC TIẾP VTV3 | Chồng cũ vợ cũ người yêu cũ t...,2022-07-08T14:30:12Z,24,TRỰC TIẾP VTV3 | Chồng cũ vợ cũ người yêu cũ t...,PT4M9S,2d,False,"[vtv giải trí, vtv giai tri, phim hay, phim vt...",30827,158,5
2,B2WuOIw9w9U,VTV Giải Trí Official,Lối Nhỏ Vào Đời - Tập 21 | Bác Thành đau khổ c...,2022-07-08T14:30:02Z,24,Lối Nhỏ Vào Đời - Tập 21 | Bác Thành đau khổ c...,PT5M8S,2d,False,"[vtv giải trí, vtv giai tri, phim hay, phim vt...",24188,184,16
3,0kjiyMoIeU0,VTV Giải Trí Official,TRỰC TIẾP VTV1 | TẬP 21: Lối Nhỏ Vào Đời,2022-07-08T14:00:13Z,24,TRỰC TIẾP VTV1 | TẬP 21: Lối Nhỏ Vào Đời \n👉 X...,PT4M1S,2d,False,"[vtv giải trí, vtv giai tri, phim hay, phim vt...",33323,163,17
4,lL_UZK0Zdxo,VTV Giải Trí Official,Chồng cũ vợ cũ người yêu cũ tập 21 | Vợ mới từ...,2022-07-08T11:45:03Z,24,▶️ Full Chồng cũ vợ cũ người yêu cũ tập 21: ht...,PT4M9S,2d,False,"[vtv giải trí, vtv giai tri, phim hay, phim vt...",8730,40,1


## 5. Import data into MongoDB Atlas

In [10]:
! pip install pymongo[srv]
! pip install dnspython

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [11]:
from urllib.parse import quote_plus

from pymongo import MongoClient

# `username`, `password` and `hostname` are not defined in this cell and must already exist.
# The credentials are percent-escaped so that reserved characters in them
# (such as '@', ':' or '/') cannot corrupt the connection URI.
client = MongoClient(
    f'mongodb+srv://{quote_plus(username)}:{quote_plus(password)}@{hostname}'
    '/?retryWrites=true&w=majority'
)

# Select the target database.
database = client['Youtube_API']

In [15]:
# Store each DataFrame in its own collection, one document per row.
frames_by_collection = {
    'channel': channels_df,
    'video_id': video_ids_df,
    'information_video': information_videos_df,
}
for collection_name, frame in frames_by_collection.items():
    database[collection_name].insert_many(frame.to_dict(orient='records'))

# The category lookup table is read from a CSV file (provided via GitHub).
# Note: despite its name, category_df holds a list of row dicts, not a DataFrame.
category_df = pd.read_csv('Data/category_video.csv').to_dict(orient='records')
database['category'].insert_many(category_df)

<pymongo.results.InsertManyResult at 0x7f8669619dd0>