In [5]:
import requests
import pandas as pd
import json
import datetime
import os
from datetime import datetime, timedelta
import glob

# YouTube Data API v3 search endpoint used by every request in this notebook.
search_url = "https://www.googleapis.com/youtube/v3/search"
# API key. It is read from the YOUTUBE_API_KEY environment variable so the key
# never has to be saved inside the notebook; when the variable is not set this
# falls back to an empty string, and you can still assign your own key here.
API_keys = os.environ.get("YOUTUBE_API_KEY", "")

# Channel display name -> YouTube channel ID.
channel_list = {"BBC":"UC16niRr50-MSBwiO3YDb3RA",
                "Guardian":"UCIRYBXDze5krPDzAEOxFGVA",
                "Sun":"UCIzXayRP7-P0ANpq-nD-h5g",
                "DailyMail":"UCw3fku0sH3qA3c3pZeJwdAw",
                "Independent":"UCshwRhftzkiov5wKR7M_LsQ"}

def getHTMLText(url, kv):
    """Send a GET request and return the response body as text.

    Parameters
    ----------
    url : str
        Endpoint to query.
    kv : dict
        Query-string parameters sent with the request.

    Returns
    -------
    str
        The response body, decoded with the encoding detected from its
        content. If the request fails (connection problem, timeout or a
        non-success HTTP status) the sentinel string "Error" is returned
        instead, so callers must check for it before parsing.
    """
    try:
        # Query the URL that was passed in rather than a module-level global.
        r = requests.request('GET', url, params=kv, timeout=30)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException as err:
        # Only request-level failures are handled here; anything else
        # (KeyboardInterrupt, programming errors) is allowed to propagate.
        print("Request failed (the API quota may be exhausted): %s" % err)
        return "Error"

def getVideosbyWeek(channel_name, year, first_week, last_week):
    """Download a channel's video list in one-week windows and save each window.

    For every week number ``i`` in ``range(first_week + 1, last_week)`` one
    search request is sent for the window that starts on the Monday of week
    ``i - 1`` and ends on the Monday of week ``i``. The raw response is saved
    as ``result_<year>_week_<i>.json`` and the flattened table as
    ``result_<year>_week_<i>.csv`` under ``./videolist/<channel_name>/``.

    Each request asks for at most 50 results and no further pages are
    fetched, so a window holding more videos than that is truncated.

    Parameters
    ----------
    channel_name : str
        Key of ``channel_list``.
    year : int
        Year the week numbers refer to.
    first_week : int
        Week number whose Monday opens the first window (parsed with ``%W``).
    last_week : int
        Exclusive upper bound of the week numbers that close a window.
    """
    # Output dir
    output_dir_json = "./videolist/%s/json_byWeek/" % channel_name
    output_dir_csv = "./videolist/%s/csv_byWeek/" % channel_name
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_dir_json, exist_ok=True)
    os.makedirs(output_dir_csv, exist_ok=True)

    ChannelId = channel_list[channel_name]

    def monday_timestamp(week_no):
        # Convert "<year>-W<week>-1" (the Monday of that week) into the
        # timestamp format the API expects.
        monday = datetime.strptime("%s-W%s-1" % (year, week_no), "%Y-W%W-%w")
        return monday.strftime("%Y-%m-%d") + "T00:00:00Z"

    publishedAfter = monday_timestamp(first_week)

    for i in range(first_week + 1, last_week):
        publishedBefore = monday_timestamp(i)
        print(publishedAfter, publishedBefore)

        param_kv = {'key': API_keys,
                    'channelId': ChannelId,
                    'part': "snippet,id",
                    'order': 'date',
                    'publishedAfter': publishedAfter,
                    'publishedBefore': publishedBefore,
                    'maxResults': 50,
                    'safeSearch': "none"}

        result = getHTMLText(search_url, param_kv)
        if result == "Error":
            # getHTMLText signals a failed request with this sentinel; parsing
            # it as JSON would only raise an unrelated decode error.
            print("Request failed for %s - %s; stopping. Re-run with first_week=%s to resume."
                  % (publishedAfter, publishedBefore, i - 1))
            break
        data_dict = json.loads(result)

        # output to json
        with open(output_dir_json + 'result_%s_week_%s.json' % (str(year), str(i)), 'w') as f:
            json.dump(data_dict, f)
        # output to csv
        data_df = jsonToDf(data_dict)
        data_df.to_csv(output_dir_csv + 'result_%s_week_%s.csv' % (str(year), str(i)), header=True)

        # The next window starts where this one ended.
        publishedAfter = publishedBefore

def getVideosbyDay(channel_name, first_day, last_day):
    """Download a channel's video list in one-day windows and save each window.

    One search request is sent per day from ``first_day`` up to (but not
    including) ``last_day``. The raw response is saved as
    ``result_<YYYY-MM-DD>.json`` and the flattened table as
    ``result_<YYYY-MM-DD>.csv`` under ``./videolist/<channel_name>/``, where
    the date is the start of the window.

    Each request asks for at most 50 results and no further pages are
    fetched, so a day holding more videos than that is truncated.

    Parameters
    ----------
    channel_name : str
        Key of ``channel_list``.
    first_day : str
        First day to download, formatted like ``2021-09-30``.
    last_day : str
        Day on which to stop (exclusive), formatted like ``2021-09-30``.
    """
    # set output dir
    output_dir_json = "./videolist/%s/json_byDay/" % channel_name
    output_dir_csv = "./videolist/%s/csv_byDay/" % channel_name
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_dir_json, exist_ok=True)
    os.makedirs(output_dir_csv, exist_ok=True)

    ChannelId = channel_list[channel_name]
    # parse the date strings
    day_start = datetime.strptime(first_day, '%Y-%m-%d')
    stop_day = datetime.strptime(last_day, '%Y-%m-%d')

    while day_start < stop_day:
        day_end = day_start + timedelta(days=1)
        # timestamp format expected by the API: 1970-01-01T00:00:00Z
        publishedAfter = day_start.strftime('%Y-%m-%d') + "T00:00:00Z"
        publishedBefore = day_end.strftime('%Y-%m-%d') + "T00:00:00Z"
        print("Processing: ", publishedAfter, publishedBefore)

        param_kv = {'key': API_keys,
                    'channelId': ChannelId,
                    'part': "snippet,id",
                    'order': 'date',
                    'publishedAfter': publishedAfter,
                    'publishedBefore': publishedBefore,
                    'maxResults': 50,
                    'safeSearch': "none"}

        result = getHTMLText(search_url, param_kv)
        if result == "Error":
            # getHTMLText signals a failed request with this sentinel; parsing
            # it as JSON would only raise an unrelated decode error.
            print("Request failed for %s; stopping. Re-run with first_day=%s to resume."
                  % (publishedAfter[:10], publishedAfter[:10]))
            break
        data_dict = json.loads(result)

        # output to json
        with open(output_dir_json + 'result_%s.json' % (publishedAfter[:10]), 'w') as f:
            json.dump(data_dict, f)
        # output to csv
        data_df = jsonToDf(data_dict)
        data_df.to_csv(output_dir_csv + 'result_%s.csv' % (publishedAfter[:10]), header=True)

        # The next window starts where this one ended.
        day_start = day_end

def jsonToDf(data_json):
    """Flatten a search response into a DataFrame with one row per video.

    Parameters
    ----------
    data_json : dict
        Parsed search response; its ``'items'`` list is read. Every item is
        expected to carry an ``'id'`` and a ``'snippet'`` dictionary.

    Returns
    -------
    pandas.DataFrame
        Columns: channelId, channelTitle, videoId, vidoeTitle, description,
        link, time, year, month, day. Items without a ``videoId`` (playlists
        and any other non-video result) are skipped. An empty ``'items'``
        list yields an empty frame that still has these columns.
    """
    # NOTE: 'vidoeTitle' is misspelled, but the same spelling is used by the
    # CSV files this notebook writes and reads, so it is kept as is.
    column_names = ['channelId',
                    'channelTitle',
                    'videoId',
                    'vidoeTitle',
                    'description',
                    'link', 'time',
                    'year',
                    'month',
                    'day']

    # Collect plain dicts and build the frame once: DataFrame.append no longer
    # exists in pandas 2.x, and growing a frame row by row is quadratic.
    records = []
    for item in data_json['items']:
        video_id = item['id'].get('videoId')
        if video_id is None:
            # Playlists and other non-video results have no videoId.
            continue
        snippet = item['snippet']
        published_at = snippet['publishedAt']
        records.append({"videoId": video_id,
                        "vidoeTitle": snippet['title'],
                        "channelId": snippet['channelId'],
                        "channelTitle": snippet['channelTitle'],
                        "description": snippet['description'],
                        "link": "https://www.youtube.com/watch?v=" + video_id,
                        "time": published_at,
                        # publishedAt starts with YYYY-MM-DD
                        "year": int(published_at[:4]),
                        "month": int(published_at[5:7]),
                        "day": int(published_at[8:10])})

    return pd.DataFrame(records, columns=column_names)

def combineCsv(channel_name, byWhat):
    """Merge the per-window CSV files of a channel into one sorted CSV.

    Reads every ``*.csv`` in ``./videolist/<channel_name>/csv_by<byWhat>/``,
    concatenates them, removes duplicate rows, sorts by year, month and day,
    and writes ``./videolist/<channel_name>/<channel_name>_videolist_by<byWhat>.csv``.

    Any file holding 50 or more rows is reported as a possible overflow (the
    downloads request at most 50 results per window): its file name is
    printed, followed by the date of its first row and the date 7 days
    earlier.

    Parameters
    ----------
    channel_name : str
        Channel folder name under ``./videolist``.
    byWhat : str
        "Week" or "Day"; selects the input folder and the output file name.
    """
    path = os.path.join(os.getcwd(), "videolist", channel_name, "csv_by%s" % byWhat)
    # Sorted so the merge order does not depend on the file system.
    csv_files = sorted(glob.glob(os.path.join(path, "*.csv")))
    if not csv_files:
        print("No CSV files found in %s" % path)
        return

    column_names = ['channelId',
                    'channelTitle',
                    'videoId',
                    'vidoeTitle',
                    'description',
                    'link', 'time',
                    'year',
                    'month',
                    'day']

    frames = []
    for f in csv_files:
        df = pd.read_csv(f)
        # Find overflow windows (files that reached the 50-result limit)
        if len(df) >= 50:
            print(os.path.basename(f))
            # print the date range to re-check for the overflow window
            miss_last = df.loc[0, 'time'][:10]
            miss_first = datetime.strptime(miss_last, '%Y-%m-%d') - timedelta(days=7)
            miss_first = str(miss_first)[:10]
            print(miss_first, miss_last)
        if not df.empty:
            frames.append(df)

    # Concatenate once; DataFrame.append no longer exists in pandas 2.x.
    if frames:
        df_all = pd.concat(frames, ignore_index=True)
    else:
        df_all = pd.DataFrame(columns=column_names)

    # Drop the per-file row index BEFORE de-duplicating: otherwise the same
    # video stored at different row positions in two files is never detected
    # as a duplicate.
    df_all = df_all.drop(columns=['Unnamed: 0'], errors='ignore')
    df_all = df_all.sort_values(by=['year', 'month', 'day'])
    df_all = df_all.drop_duplicates(keep='first')
    df_all = df_all.reset_index(drop=True)
    df_all.to_csv('./videolist/%s/%s_videolist_by%s.csv' % (channel_name, channel_name, byWhat), header=True)

In [6]:
# Download the video list one week at a time (at most 50 videos are returned per week)
getVideosbyWeek(channel_name="Sun", year=2020, first_week=0, last_week=40)

2019-12-30T00:00:00Z 2020-01-06T00:00:00Z
2020-01-06T00:00:00Z 2020-01-13T00:00:00Z
2020-01-13T00:00:00Z 2020-01-20T00:00:00Z
2020-01-20T00:00:00Z 2020-01-27T00:00:00Z
2020-01-27T00:00:00Z 2020-02-03T00:00:00Z
2020-02-03T00:00:00Z 2020-02-10T00:00:00Z
2020-02-10T00:00:00Z 2020-02-17T00:00:00Z
2020-02-17T00:00:00Z 2020-02-24T00:00:00Z
2020-02-24T00:00:00Z 2020-03-02T00:00:00Z
2020-03-02T00:00:00Z 2020-03-09T00:00:00Z
2020-03-09T00:00:00Z 2020-03-16T00:00:00Z
2020-03-16T00:00:00Z 2020-03-23T00:00:00Z
2020-03-23T00:00:00Z 2020-03-30T00:00:00Z
2020-03-30T00:00:00Z 2020-04-06T00:00:00Z
2020-04-06T00:00:00Z 2020-04-13T00:00:00Z
2020-04-13T00:00:00Z 2020-04-20T00:00:00Z
2020-04-20T00:00:00Z 2020-04-27T00:00:00Z
2020-04-27T00:00:00Z 2020-05-04T00:00:00Z
2020-05-04T00:00:00Z 2020-05-11T00:00:00Z
2020-05-11T00:00:00Z 2020-05-18T00:00:00Z
2020-05-18T00:00:00Z 2020-05-25T00:00:00Z
2020-05-25T00:00:00Z 2020-06-01T00:00:00Z
2020-06-01T00:00:00Z 2020-06-08T00:00:00Z
2020-06-08T00:00:00Z 2020-06-15T00

In [None]:
# Download the video list one day at a time (at most 50 videos are returned per day)
# Quota note: about 100 search requests per day
getVideosbyDay(channel_name="DailyMail", first_day="2020-09-29", last_day="2021-07-01")

In [7]:
# Merge the per-day CSV files into a single video list
combineCsv(channel_name="DailyMail", byWhat="Day")