# Collect data from Youtube Data API
Last update: 30/08/2021

## Import libraries

In [1]:
import requests
import requests_cache
import time
import json
import pandas as pd 
from datetime import datetime, timedelta

## Global setup

In [2]:
sleep_time = 0.5

key = 'AIzaSyDGT1d7zucp9jCcmldgdTJFqXgdJWKyBJM'
channel_id = 'UCq-Fj5jknLsUf-MWSy4_brA'

channel_parts = ['snippet', 'statistics', 'status', 'topicDetails']
video_parts = ['snippet', 'statistics', 'status', 'topicDetails', 'contentDetails']

In [3]:
def check_cache(r):
    if 'error' in json.loads(r.text).keys():
        return False
    return True

requests_cache.install_cache(filter_fn=check_cache)

## Retrieve channel info

In [4]:
def get_channel_info(channel_info_file, key, channel_id, channel_parts):
    # Request channel data
    channel_parts_str = ','.join(channel_parts)
    url = f'https://www.googleapis.com/youtube/v3/channels?key={key}&id={channel_id}'\
    f'&part={channel_parts_str}'
    r = requests.get(url)
    json_data = json.loads(r.text)
    channel_data = json_data['items'][0]
    
    # Initialize the info row
    columns = ['id']
    channel_info = [channel_data['id']]
    
    # snippet
    if 'snippet' in channel_parts:
        columns += ['title', 'description', 'publishedAt', 'country']
        snippet = channel_data['snippet']
        channel_info += [snippet['title'], 
                         snippet['description'], 
                         snippet['publishedAt'], 
                         snippet['country']]
    
    # statistics
    if 'statistics' in channel_parts:
        columns += ['viewCount', 'subscriberCount', 'hiddenSubscriberCount', 'videoCount']
        statistics = channel_data['statistics']
        channel_info += [statistics['viewCount'], 
                         statistics['subscriberCount'], 
                         statistics['hiddenSubscriberCount'], 
                         statistics['videoCount']]
    
    # status
    if 'status' in channel_parts:
        columns += ['privacyStatus', 'isLinked', 'longUploadsStatus', 'madeForKids']
        status = channel_data['status']
        channel_info += [status['privacyStatus'], 
                         status['isLinked'], 
                         status['longUploadsStatus'], 
                         status['madeForKids']]
    
    # topicDetails
    if 'topicDetails' in channel_parts:
        columns += ['topicIds', 'topicCategories']
        topic_details = channel_data['topicDetails']
        channel_info += [topic_details['topicIds'], 
                         topic_details['topicCategories']]
    
    # Make a dataframe to write csv
    channel_df = pd.DataFrame(columns=columns)
    channel_df.loc[0] = channel_info
    
    # Write to csv file
    channel_df.to_csv(channel_info_file, sep='\t', encoding='utf-8', index=False)

In [5]:
get_channel_info('data/channel_info.csv', key, channel_id, channel_parts)
channel_df = pd.read_csv('data/channel_info.csv', sep='\t')
channel_df

Unnamed: 0,id,title,description,publishedAt,country,viewCount,subscriberCount,hiddenSubscriberCount,videoCount,privacyStatus,isLinked,longUploadsStatus,madeForKids,topicIds,topicCategories
0,UCq-Fj5jknLsUf-MWSy4_brA,T-Series,"""Music can change the world"". T-Series is Indi...",2006-03-13T14:27:05Z,IN,162554005444,191000000,False,15704,public,True,longUploadsUnspecified,False,"['/m/028sqc', '/m/04rlf']",['https://en.wikipedia.org/wiki/Music_of_Asia'...


## Retrieve video ids of channel

In [6]:
def collect_latest_video_ids(key, channel_id, start_date, end_date, 
                             step=30, max_video_ids=1000, per_page=50):
    video_ids = []
    date_format = '%Y-%m-%dT%H:%M:%SZ'
    end_date_obj = datetime.strptime(end_date, date_format).date()
    start_date_obj = datetime.strptime(start_date, date_format).date()
    
    period_end_date_obj = end_date_obj
    period_start_date_obj = end_date_obj - timedelta(days=step-1)
    
    while period_start_date_obj >= start_date_obj:
        period_end_date = period_end_date_obj.strftime(date_format)
        period_start_date = period_start_date_obj.strftime(date_format)
        
        nextPageToken = ''
        while True:
            url = f'https://www.googleapis.com/youtube/v3/search?key={key}&channelId={channel_id}'\
            f'&maxResults={per_page}&pageToken={nextPageToken}'\
            f'&part=id&type=video&order=date'\
            f'&publishedAfter={period_start_date}&publishedBefore={period_end_date}'\

            json_data = json.loads(requests.get(url).text)

            for item in json_data['items']: 
                video_id = item['id']['videoId']
                if video_id not in video_ids:
                    video_ids.append(video_id)

            if len(video_ids) >= max_video_ids:
                return video_ids

            if 'nextPageToken' in json_data.keys():
                nextPageToken = json_data['nextPageToken']
            else:
                break
            
            time.sleep(sleep_time)
        
        period_end_date_obj = period_start_date_obj
        period_start_date_obj = period_start_date_obj - timedelta(days=step-1)
        
        if period_start_date_obj < start_date_obj and period_end_date_obj > start_date_obj:
            period_start_date_obj = start_date_obj
        
        print(f'Period from {period_start_date} to {period_end_date}: done')
    
    return video_ids 

In [7]:
video_ids = collect_latest_video_ids(key, channel_id, 
                                     start_date='2020-08-30T00:00:00Z', 
                                     end_date='2021-08-30T00:00:00Z')
len(set(video_ids))

Period from 2021-08-01T00:00:00Z to 2021-08-30T00:00:00Z: done
Period from 2021-07-03T00:00:00Z to 2021-08-01T00:00:00Z: done
Period from 2021-06-04T00:00:00Z to 2021-07-03T00:00:00Z: done
Period from 2021-05-06T00:00:00Z to 2021-06-04T00:00:00Z: done
Period from 2021-04-07T00:00:00Z to 2021-05-06T00:00:00Z: done
Period from 2021-03-09T00:00:00Z to 2021-04-07T00:00:00Z: done
Period from 2021-02-08T00:00:00Z to 2021-03-09T00:00:00Z: done
Period from 2021-01-10T00:00:00Z to 2021-02-08T00:00:00Z: done
Period from 2020-12-12T00:00:00Z to 2021-01-10T00:00:00Z: done
Period from 2020-11-13T00:00:00Z to 2020-12-12T00:00:00Z: done


1002

## Retrieve info of videos (main data)

In [8]:
def collect_videos_info(videos_info_file, video_ids, key, video_parts):
    # Initialize variables
    start_indx = 0
    end_indx = 50 # Max number of ids for one time retrieve
    end_ids = False
    video_parts_str = ','.join(video_parts)
    
    # Make the first line (columns name)
    columns = ['id']
    if 'snippet' in video_parts:
        columns += ['title', 'publishedAt', 'thumbnailUrl', 'categoryId', 
                    'defaultAudioLanguage', 'tags']
    if 'statistics' in video_parts:
        columns += ['viewCount', 'likeCount', 'dislikeCount', 'commentCount']
    if 'status' in video_parts:
        columns += ['uploadStatus', 'privacyStatus', 'license', 'embeddable', 
                    'publicStatsViewable', 'madeForKids']
    if 'topicDetails' in video_parts:
        columns += ['topicCategories']
    if 'contentDetails' in video_parts:
        columns += ['duration', 'dimension', 'definition', 'caption', 'licensedContent', 
                    'contentRating', 'projection']
    
    # Make a dataframe to write csv
    video_df = pd.DataFrame(columns=columns)
    video_df.to_csv(videos_info_file, sep='\t', encoding='utf-8', index=False)
        
    # Request videos data
    while end_ids != True:
        video_ids_str = ','.join(video_ids[start_indx:end_indx])
        url = f'https://www.googleapis.com/youtube/v3/videos?key={key}&id={video_ids_str}'\
        f'&part={video_parts_str}'
        json_data = json.loads(requests.get(url).text)

        for item in json_data['items']: 
            video_data = [item['id']]

            # snippet
            if 'snippet' in video_parts:
                snippet = item.get('snippet', None)
                if snippet != None:
                    video_data += [snippet.get('title', None), 
                                   snippet.get('publishedAt', None), 
                                   snippet['thumbnails']['default']['url'],
                                   snippet.get('categoryId', None),
                                   snippet.get('defaultAudioLanguage', None),
                                   snippet.get('tags', None)]
                else:
                    video_data += [None]*6

            # statistics
            if 'statistics' in video_parts:
                statistics = item.get('statistics', None)
                if statistics != None:
                    video_data += [statistics.get('viewCount', None), 
                                   statistics.get('likeCount', None), 
                                   statistics.get('dislikeCount', None),
                                   statistics.get('commentCount', None)]
                else:
                    video_data += [None]*4

            # status
            if 'status' in video_parts:
                status = item.get('status', None)
                if status != None:
                    video_data += [status.get('uploadStatus', None), 
                                   status.get('privacyStatus', None), 
                                   status.get('license', None), 
                                   status.get('embeddable', None),
                                   status.get('publicStatsViewable', None),
                                   status.get('madeForKids', None)]
                else:
                    video_data += [None]*6

            # topicDetails
            if 'topicDetails' in video_parts:
                topic_details = item.get('topicDetails', None)
                if topic_details != None:
                    video_data += [topic_details.get('topicCategories', None)]
                else:
                    video_data += [None]

            # contentDetails
            if 'contentDetails' in video_parts:
                content_details = item.get('contentDetails', None)
                if content_details != None:
                    video_data += [content_details.get('duration', None), 
                                   content_details.get('dimension', None),
                                   content_details.get('definition', None), 
                                   content_details.get('caption', None),
                                   content_details.get('licensedContent', None), 
                                   content_details.get('contentRating', None), 
                                   content_details.get('projection', None)]
                else:
                    video_data += [None]*7

            # Write a video to csv file
            video_df.loc[0] = video_data
            video_df.to_csv(videos_info_file, mode='a', header=False, 
                            sep='\t', encoding='utf-8', index=False)
            
        if end_indx >= len(video_ids):
            end_ids = True
        else:
            start_indx = end_indx
            end_indx += 50
        
        time.sleep(sleep_time)

In [9]:
collect_videos_info('data/videos_data.csv', video_ids, key, video_parts)
video_df = pd.read_csv('data/videos_data.csv', sep='\t')
video_df

  return array(a, dtype, copy=False, order=order)


Unnamed: 0,id,title,publishedAt,thumbnailUrl,categoryId,defaultAudioLanguage,tags,viewCount,likeCount,dislikeCount,...,publicStatsViewable,madeForKids,topicCategories,duration,dimension,definition,caption,licensedContent,contentRating,projection
0,vuoaLan88i4,Making of Main Teri Mohabbat Mein /Jaane Jigar...,2021-08-29T09:42:39Z,https://i.ytimg.com/vi/vuoaLan88i4/default.jpg,10,hi,"['hindi songs', '2021 hindi songs', 'new hindi...",218972,8079,592,...,False,False,"['https://en.wikipedia.org/wiki/Music', 'https...",PT1M34S,2d,hd,False,True,{},rectangular
1,oxTJkkcqk7M,Aao Milo/Kya Mujhe Pyar Hai Teaser Ep10 |Sukri...,2021-08-28T07:30:03Z,https://i.ytimg.com/vi/oxTJkkcqk7M/default.jpg,10,hi,"['hindi songs', '2021 hindi songs', 'new hindi...",306069,8679,414,...,False,False,"['https://en.wikipedia.org/wiki/Music', 'https...",PT19S,2d,hd,False,True,{},rectangular
2,9qXBo2-rUHk,Janmashtami Celebrations | Audio Jukebox | Hap...,2021-08-28T03:58:39Z,https://i.ytimg.com/vi/9qXBo2-rUHk/default.jpg,10,hi,"['krishan janmashtami', 'krishan bhajans', 'ja...",450907,28169,672,...,False,False,"['https://en.wikipedia.org/wiki/Music', 'https...",PT25M59S,2d,hd,False,True,{},rectangular
3,_VR6rWWL-9g,Ladka Aaya Hai Dekhne | Movie Clip 1| Batti Gu...,2021-08-27T12:30:13Z,https://i.ytimg.com/vi/_VR6rWWL-9g/default.jpg,10,hi,"['latest hindi songs', 'bollywood songs', 'son...",75708,1656,188,...,False,False,['https://en.wikipedia.org/wiki/Entertainment'...,PT4M17S,2d,hd,False,True,{},rectangular
4,ZFUr3nN8GR8,TERI AANKHON MEIN Teaser ► THALAIVII | Kangana...,2021-08-27T11:30:05Z,https://i.ytimg.com/vi/ZFUr3nN8GR8/default.jpg,10,hi,"['hindi songs', '2021 hindi songs', 'new hindi...",372334,11241,1305,...,False,False,['https://en.wikipedia.org/wiki/Music'],PT29S,2d,hd,False,True,{},rectangular
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
996,iXvfb02mzK0,PM Narendra Modi: Chetavani (Dialogue Promo) |...,2020-10-16T06:30:03Z,https://i.ytimg.com/vi/iXvfb02mzK0/default.jpg,10,hi,"['hindi songs', '2020 hindi songs', 'new hindi...",309329,10898,4409,...,False,False,['https://en.wikipedia.org/wiki/Entertainment'...,PT10S,2d,hd,False,False,{},rectangular
997,CLJ7qlH7XEo,Jubin Nautiyal: Main Balak Tu Mata | Gulshan K...,2020-10-16T05:31:05Z,https://i.ytimg.com/vi/CLJ7qlH7XEo/default.jpg,10,hi,"['main balak tu mata sherawaliye', 'main balak...",8946017,239516,6480,...,False,False,"['https://en.wikipedia.org/wiki/Music', 'https...",PT3M38S,2d,hd,False,True,{},rectangular
998,m2hcrFZ9NUI,'Yaara Re'| Lyrical Video | Remix BY DJ SHIVA ...,2020-10-15T13:30:03Z,https://i.ytimg.com/vi/m2hcrFZ9NUI/default.jpg,10,hi,"['hindi songs', '2020 hindi songs', 'new hindi...",441329,12074,1290,...,False,False,['https://en.wikipedia.org/wiki/Electronic_mus...,PT5M38S,2d,hd,False,True,{},rectangular
999,ZtxkJuA_BYs,PM Narendra Modi: Tarakki Ka Nasha (Dialogue P...,2020-10-15T10:34:00Z,https://i.ytimg.com/vi/ZtxkJuA_BYs/default.jpg,10,hi,"['hindi songs', '2020 hindi songs', 'new hindi...",324088,13899,5037,...,False,False,['https://en.wikipedia.org/wiki/Entertainment'...,PT15S,2d,hd,False,False,{},rectangular


In [10]:
# Video ids without info found
set(video_ids) - set(video_df['id'].tolist())

{'Yce3-SAW3jY'}

## The Description of Columns

In [11]:
f = open('data/videos_data_description.txt')
print(f.read())

The description is summarized from "A video resource represents a YouTube video.", which includes 5 parts of video resources: id, snippet, statistics, status, topicDetails, contentDetails.
Link full: https://developers.google.com/youtube/v3/docs/videos

* id: string
The ID that YouTube uses to uniquely identify the video.

---

* snippet: The snippet object contains basic details about the video, such as its title, description, and category.

- title: string
The video's title.

- publishedAt: datetime
The date and time that the video was published. Note that this time might be different than the time that the video was uploaded. For example, if a video is uploaded as a private video and then made public at a later time, this property will specify the time that the video was made public. The value is specified in ISO 8601 format.

- thumbnailUrl: string
The default thumbnail image's URL.

- categoryId: string
The YouTube video category associated with the video.

- defaultAudioLanguage: