In [31]:
import os
from pprint import *
import googleapiclient.discovery
import pandas as pd
import numpy as np
import seaborn as sns
from dateutil import parser
import matplotlib.pyplot as plt

sns.set(rc={'figure.figsize':(11.7,8.27)})

In [32]:
# -*- coding: utf-8 -*-

# Sample Python code for youtube.channels.list
# See instructions for running these code samples locally:
# https://developers.google.com/explorer-help/code-samples#python
api_service_name = "youtube"
api_version = "v3"
DEVELOPER_KEY = "AIzaSyCR-locoMVUuDjx__ibmaA0dbAE_mY6Xps"

youtube = googleapiclient.discovery.build(
    api_service_name, api_version, developerKey = DEVELOPER_KEY)

In [33]:
def get_videos_id_from_channel_id(channel_id):
    ids = []
    first_request = True
    nextPageToken = "Something"
    
    #Init request
    while nextPageToken:
        if first_request:
            request = youtube.search().list( part = "snippet", maxResults=3000, channelId = channel_id, order="date")
            first_request = False
        else:
            request = youtube.search().list( part = "snippet", maxResults=3000, channelId = channel_id, order="date", pageToken = nextPageToken)
            
        response = request.execute()
        if 'nextPageToken' in response.keys():
            nextPageToken = response['nextPageToken']
        else:
            nextPageToken = False
        ids += [video_data['id']['videoId'] for video_data in response['items'] if video_data['id']['kind']=='youtube#video']

    return ids

In [91]:
def retreive_dataframe_from_id_list(id_list, verbose = False):
    
    request = youtube.videos().list(part="statistics,snippet", id=id_list)
    response = request.execute()

    if verbose:
        pprint(response)
    
    df = pd.DataFrame()
    tags = pd.DataFrame()
    
    for video_data in response['items']:
        
        data = {}
        
        if "channelId" in video_data['snippet'].keys():
            data['channelId'] = video_data['snippet']['channelId']
        else:
            data['channelId'] = np.nan
        
        if "id" in video_data.keys():
            data['videoId'] =  video_data['id']
        else:
            data['videoId'] = np.nan
            
        if "publishedAt" in video_data['snippet'].keys():
            upload_date = video_data['snippet']['publishedAt'] 
            upload_date_parsed = parser.parse(upload_date)
            data['publishedAt'] = upload_date
            data['publishedYear'] =  upload_date_parsed.year
            data['publishedMonth']= upload_date_parsed.month
            data['publishedDay'] =  upload_date_parsed.day
            data['publishedWeekday']= upload_date_parsed.weekday()
            data['publishedHour']= (upload_date_parsed.hour +2) %24
        else:
            data['publishedAt'] = np.nan
            #data['publishedYear'] =  np.nan
            data['publishedMonth']= np.nan
            data['publishedDay'] =  np.nan
            data['publishedWeekday']= np.nan
            data['publishedHour']= np.nan 
        
        if 'title' in video_data['snippet'].keys():
            data['title'] = video_data['snippet']['title']
        else:
            data['title'] = np.nan
            
        stat = ['viewCount','likeCount','commentCount']
        for elt in stat:
            if elt in video_data['statistics'].keys():
                data[elt] = video_data['statistics'][elt]
            else:
                data[elt] = np.nan
            
        if "tags" in video_data['snippet'].keys():
            for elt in video_data['snippet']['tags']:
                tags_data = {
                    'videoId': video_data['id'],
                    'tag': elt
                }  
                tags = tags.append(tags_data, ignore_index = True)
            
        df = df.append(data, ignore_index = True)
    
    # ----------- Clean data type ----------------
    """
    df.videoId = df.videoId.astype(str)
    df.publishedAt = df.publishedAt.astype(str)
    df.publishedYear = df.publishedYear.astype(int)
    df.publishedMonth = df.publishedMonth.astype(int)
    df.publishedDay = df.publishedDay.astype(int)
    df.publishedWeekday = df.publishedWeekday.astype(int)
    df.publishedHour = df.publishedHour.astype(int)
    df.viewCount = df.viewCount.astype(int)
    df.likeCount = df.likeCount.astype(int)
    df.commentCount = df.commentCount.astype(int)
    """
    
    return df,tags

In [67]:
def get_stats_from_channel_id(channel_id):
    ids = get_videos_id_from_channel_id(channel_id)
    data = pd.DataFrame()
    data_tags = pd.DataFrame()
    
    for i in range(len(ids)//50 +1):
        sub_df,sub_df_tags = retreive_dataframe_from_id_list(ids[50*i:50*(i+1)])
        data = pd.concat([data, sub_df], ignore_index=True)
        data_tags = pd.concat([data_tags, sub_df_tags], ignore_index = True)
        
    return data, data_tags

In [77]:
def get_channels_data(id_list):
    
    df = pd.DataFrame()
    
    request = youtube.channels().list( part="snippet,contentDetails,statistics", id=id_list)
    response = request.execute()
    
    for channel_data in response['items']:
        data = {}
        
        if "id" in channel_data.keys():
            data['channelId'] = channel_data['id']
        else:
            data['channelId'] = np.nan
         
        if "title" in channel_data['snippet'].keys():
            data['title'] = channel_data['snippet']['title']
        else:
            data['title'] = np.nan
            
        if "publishedAt" in channel_data['snippet'].keys():
            data['publishedAt'] = channel_data['snippet']['publishedAt']
        else:
            data['publishedAt'] = np.nan
            
        stat = ['viewCount','subscriberCount','videoCount']
        for elt in stat:
            if elt in channel_data['statistics'].keys():
                data[elt] = channel_data['statistics'][elt]
            else:
                data[elt] = np.nan
        
        
        df = df.append(data, ignore_index = True)
        
        
    return df

In [82]:
youtube_channels = [
    'UCYpRDnhk5H8h16jpS84uqsA',
    'UCO6K_kkdP-lnSCiO3tPx7WA',
    'UC9GGzAhhvhJO1hL10-BcgNA',
    'UCCDz_XYeKWd0OIyjp95dqyQ',
    'UCfHn_8-ehdem86fEvlFg-Gw',
    'UChZWs6PJY0hND384d2_RrhQ',
    'UCyIV8rkza5Uk_sJIhqilBvQ',
    'UC9x4Iyu-D257P0P-VDRwSzw',
    'UCtJk8WNzhAu37r-QBmY7pRA',
    'UCar0yo-PpeMk51m116826pQ',
    'UCfsTAnYne5szhoMc381A5wg',
    'UCSKdvgqdnj72_SLggp7BDTg',
    'UCXwDLMDV86ldKoFVc_g8P0g',
    'UC1ObaaFz4XHVPN2T5IFsU4w',
    'UCU0DxnpHTAIO5WZ1Iq7GDqA',
    'UCnIWNujSpiiuH29-TBCz9aw',
    'UCrpsdF7UreiF-2r5Jp0YpRg',
    'UCXKJrYczY2_fJEZgFPGY0HQ',
    'UCD25QI36ly-8hWm9OPOz7aw',
    'UCvrNoLkkDgcFrLV2oX8mbEg',
    'UCUo1RqYV8tGjV38sQ8S5p9A',
    'UCAcAnMF0OrCtUep3Y4M-ZPw',
    'UCIMGfEAERXjmWwQeg15BFsg',
    'UCpi-4daExRSchtsJzO06rwA',
    'UCwI-JbGNsojunnHbFAc0M4Q',
    'UCewhc0fvja891XkpIPGRMxQ',
    'UCCCPCZNChQdGa9EkATeye4g' 
]
res = get_channels_data(youtube_channels)

In [83]:
res

Unnamed: 0,channelId,title,publishedAt,viewCount,subscriberCount,videoCount
0,UCYpRDnhk5H8h16jpS84uqsA,Le Monde,2006-03-17T14:41:41Z,310639457,1320000,2055
1,UCSKdvgqdnj72_SLggp7BDTg,Brut,2016-10-28T12:43:30Z,537272835,1400000,6028
2,UCewhc0fvja891XkpIPGRMxQ,LCI,2015-06-17T14:23:07Z,164788340,507000,3416
3,UC1ObaaFz4XHVPN2T5IFsU4w,L'Obs,2006-10-02T20:49:21Z,169434054,268000,6836
4,UCfHn_8-ehdem86fEvlFg-Gw,Le Parisien,2014-10-06T16:42:09Z,844038565,808000,9011
5,UCyIV8rkza5Uk_sJIhqilBvQ,L'ÉQUIPE,2013-02-28T18:07:07Z,132364578,504000,9856
6,UCpi-4daExRSchtsJzO06rwA,Quotidien,2016-09-07T09:05:05Z,119647060,478000,273
7,UCAcAnMF0OrCtUep3Y4M-ZPw,HugoDécrypte - Actus du jour,2015-11-19T19:48:59Z,265596633,1580000,967
8,UCvrNoLkkDgcFrLV2oX8mbEg,TVMag,2006-07-17T18:16:05Z,6861437,34000,399
9,UCCDz_XYeKWd0OIyjp95dqyQ,Figaro Live,2006-05-21T15:43:59Z,246857172,435000,10553


In [84]:
from pathlib import Path  

filepath = Path('./csv/channels.csv')  

filepath.parent.mkdir(parents=True, exist_ok=True)  

res.to_csv(filepath) 

In [97]:
def concat_data_from_channel_ids(videos, tags, channelIds):
    for channelId in channelIds:
        sub_videos, sub_tags = get_stats_from_channel_id(channelId)
        videos = pd.concat([videos, sub_videos], ignore_index=True)
        tags = pd.concat([tags, sub_tags], ignore_index=True)
        
    return videos, tags
        
    

In [98]:
videos = pd.DataFrame()
tags = pd.DataFrame()

In [99]:
videos, data = concat_data_from_channel_ids(videos, tags, [
    'UCYpRDnhk5H8h16jpS84uqsA',
    'UCO6K_kkdP-lnSCiO3tPx7WA',
    'UC9GGzAhhvhJO1hL10-BcgNA',
    'UCCDz_XYeKWd0OIyjp95dqyQ',
    'UCfHn_8-ehdem86fEvlFg-Gw'])

HttpError: <HttpError 403 when requesting https://youtube.googleapis.com/youtube/v3/search?part=snippet&maxResults=3000&channelId=UCfHn_8-ehdem86fEvlFg-Gw&order=date&pageToken=CMIDEAA&key=AIzaSyCR-locoMVUuDjx__ibmaA0dbAE_mY6Xps&alt=json returned "The request cannot be completed because you have exceeded your <a href="/youtube/v3/getting-started#quota">quota</a>.". Details: "[{'message': 'The request cannot be completed because you have exceeded your <a href="/youtube/v3/getting-started#quota">quota</a>.', 'domain': 'youtube.quota', 'reason': 'quotaExceeded'}]">

In [101]:
videos.info()

<class 'pandas.core.frame.DataFrame'>
Index: 0 entries
Empty DataFrame

In [102]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 57 entries, 0 to 56
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   channelId         57 non-null     object
 1   videoId           57 non-null     object
 2   publishedAt       57 non-null     object
 3   publishedYear     57 non-null     int64 
 4   publishedMonth    57 non-null     int64 
 5   publishedDay      57 non-null     int64 
 6   publishedWeekday  57 non-null     int64 
 7   publishedHour     57 non-null     int64 
 8   title             57 non-null     object
 9   viewCount         57 non-null     int64 
 10  likeCount         57 non-null     int64 
 11  commentCount      57 non-null     int64 
dtypes: int64(8), object(4)
memory usage: 5.5+ KB
