In [163]:
import requests
import pandas as pd
import json
import datetime
import os
from datetime import datetime, timedelta
import glob

# API 信息
search_url = "https://www.googleapis.com/youtube/v3/search"
API_keys = "" # 用自己的

# 频道信息
channel_list = {"BBC":"UC16niRr50-MSBwiO3YDb3RA",
                "Guardian":"UCIRYBXDze5krPDzAEOxFGVA",
                "Sun":"UCIzXayRP7-P0ANpq-nD-h5g",
                "DailyMail":"UCw3fku0sH3qA3c3pZeJwdAw",
                "Independent":"UCshwRhftzkiov5wKR7M_LsQ"}

def getHTMLText(url, kv):
    try:
        # r = requests.get(url, timeout=30)
        r = requests.request('GET', search_url, params = kv, timeout=30)
        # print(r.url)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except:
        print("爬取失败")
        return "Error"

def getVideosbyWeek(channel_name, year, first_week, last_week):
    # 设置输出地址
    output_dir = "/videolist/%s/json_byWeek/"%(channel_name)
    folder = os.path.exists(output_dir)
    if not folder:
        os.makedirs(output_dir)
    
    ChannelId = channel_list[channel_name]
    # 按照周数，转换成日期，每次抓取一周内的新闻 
    start_week = "%s-W%s"%(str(year),str(first_week))
    start_day = datetime.datetime.strptime(start_week + '-1', "%Y-W%W-%w")
    publishedAfter = str(start_day)[:10]+"T00:00:00Z"
    
    for i in range(first_week + 1 , last_week):
        week_i = "%s-W%s"%(str(year),str(i))
        start_i = datetime.datetime.strptime(week_i + '-1', "%Y-W%W-%w")
        week_i_Mon = str(start_i)[:10]
        # print(str(r)[:10])
        publishedBefore= week_i_Mon+"T00:00:00Z"
        print(publishedAfter, publishedBefore)

        param_kv = {'key':API_keys, \
                    'channelId':ChannelId, \
                    'part': "snippet,id", \
                    'order':'date', \
                    'publishedAfter':publishedAfter, \
                    'publishedBefore':publishedBefore, \
                    'maxResults':50, \
                    'safeSearch':"none"}

        result = getHTMLText(search_url, param_kv)
        data_dict = json.loads(result)
        with open(output_dir + 'result_%s_week_%s.json'%(str(year),str(i)), 'w') as f:
            json.dump(data_dict, f)
        publishedAfter = publishedBefore

def getVideosbyDay(channel_name, first_day, last_day):
    """
    输入格式：2021-09-30
    """
    # 设置输出地址
    output_dir_json = "./videolist/%s/json_byDay/"%(channel_name)
    output_dir_csv = "./videolist/%s/csv_byDay/"%(channel_name)
    #folder1 = os.path.exists(output_dir_json)
    #folder2 = os.path.exists(output_dir_csv)
    if not os.path.exists(output_dir_json):
        os.makedirs(output_dir_json)
    if not os.path.exists(output_dir_csv):
        os.makedirs(output_dir_csv)
        
    ChannelId = channel_list[channel_name]
    # 调整日期格式
    d1 = datetime.strptime(first_day, '%Y-%m-%d')
    d2 = datetime.strptime(last_day, '%Y-%m-%d')
    
    d_now = d1
    d_now_format = str(d_now)[:10]+"T00:00:00Z"
    publishedAfter = d_now_format

    while (d_now < d2):
        d_now = d_now + timedelta(days=1)
        # 日期格式：1970-01-01T00:00:00Z
        d_now_format = str(d_now)[:10]+"T00:00:00Z"
        publishedBefore = d_now_format
        print("Processing: ",publishedAfter, publishedBefore)
        
        param_kv = {'key':API_keys, \
                    'channelId':ChannelId, \
                    'part': "snippet,id", \
                    'order':'date', \
                    'publishedAfter':publishedAfter, \
                    'publishedBefore':publishedBefore, \
                    'maxResults':50, \
                    'safeSearch':"none"}

        result = getHTMLText(search_url, param_kv)
        data_dict = json.loads(result)
        
        # 把结果输出到
        with open(output_dir_json + 'result_%s.json'%(publishedAfter[:10]), 'w') as f:
            json.dump(data_dict, f)
        # 处理为csv
        data_df = jsonToDf(data_dict)
        data_df.to_csv(output_dir_csv + 'result_%s.csv'%(publishedAfter[:10]),header = True)

        publishedAfter = publishedBefore

def jsonToDf(data_json):
    """
    读取json数据，处理成DataFrame
    """
    # 原始数据
    df_data = pd.DataFrame(data_json['items'], columns=['id','snippet'])
    # 初始化一个空df
    column_names = ['channelId', 
                    'channelTitle',
                    'videoId',
                    'vidoeTitle',
                    'description',
                    'link','time',
                    'year',
                    'month',
                    'day']
    df = pd.DataFrame(columns = column_names)

    for i in range(len(df_data)):
        # json里大部分是video有一些是playlist，会报错
        if (df_data['id'][i]['kind'] == "youtube#playlist"):
            continue
        # 最终的df会和原本的df_data长度不一样，所以用append来搞
        datetime = df_data['snippet'][i]['publishedAt']
        new_row = pd.DataFrame({"videoId" : df_data['id'][i]['videoId'],
                                "vidoeTitle" : df_data['snippet'][i]['title'],
                                "channelId" : df_data['snippet'][i]['channelId'],
                                "channelTitle" : df_data['snippet'][i]['channelTitle'],
                                "description" : df_data['snippet'][i]['description'],
                                "link" : "https://www.youtube.com/watch?v="+df_data['id'][i]['videoId'],
                                "time" : datetime,
                                "year" : int(datetime[:4]),
                                "month" : int(datetime[5:7]),
                                "day" : int(datetime[8:10]),
                               },
                               index=["0"]) 
        df = df.append(new_row,ignore_index=True)
        
    return df

def combineCsv(channel_name):
    path = os.getcwd() + "/videolist/%s/csv_byDay/"%(channel_name)
    csv_files = glob.glob(os.path.join(path, "*.csv"))
    column_names = ['channelId', 
                    'channelTitle',
                    'videoId',
                    'vidoeTitle',
                    'description',
                    'link','time',
                    'year',
                    'month',
                    'day']
    df_all = pd.DataFrame(columns = column_names)

    for f in csv_files:
        df = pd.read_csv(f)
        df_all = df_all.append(df, ignore_index=True)
    df_all = df_all.sort_values(by=['year','month','day'])
    df_all = df_all.reset_index(drop=True)
    df_all.to_csv('./videolist/%s/%s_videolist.csv'%(channel_name,channel_name),header = True)    

In [164]:
getVideosbyDay("Guardian","2020-03-01","2021-05-01")
# 每天只能进行100次search，API配额用完了会报错

Processing:  2020-03-01T00:00:00Z 2020-03-02T00:00:00Z
Processing:  2020-03-02T00:00:00Z 2020-03-03T00:00:00Z
Processing:  2020-03-03T00:00:00Z 2020-03-04T00:00:00Z
Processing:  2020-03-04T00:00:00Z 2020-03-05T00:00:00Z
Processing:  2020-03-05T00:00:00Z 2020-03-06T00:00:00Z
Processing:  2020-03-06T00:00:00Z 2020-03-07T00:00:00Z
Processing:  2020-03-07T00:00:00Z 2020-03-08T00:00:00Z
Processing:  2020-03-08T00:00:00Z 2020-03-09T00:00:00Z
Processing:  2020-03-09T00:00:00Z 2020-03-10T00:00:00Z
Processing:  2020-03-10T00:00:00Z 2020-03-11T00:00:00Z
Processing:  2020-03-11T00:00:00Z 2020-03-12T00:00:00Z
Processing:  2020-03-12T00:00:00Z 2020-03-13T00:00:00Z
Processing:  2020-03-13T00:00:00Z 2020-03-14T00:00:00Z
Processing:  2020-03-14T00:00:00Z 2020-03-15T00:00:00Z
Processing:  2020-03-15T00:00:00Z 2020-03-16T00:00:00Z
Processing:  2020-03-16T00:00:00Z 2020-03-17T00:00:00Z
Processing:  2020-03-17T00:00:00Z 2020-03-18T00:00:00Z
Processing:  2020-03-18T00:00:00Z 2020-03-19T00:00:00Z
Processing

JSONDecodeError: Expecting value: line 1 column 1 (char 0)

In [165]:
# 合并csv文件到大表
combineCsv("Guradian")