# get_related

* 履歴の歌動画の関連動画を取得します

In [1]:
# Pythonの基本ライブラリ
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# ファイル操作
import os
import glob

# Jupyter上にHTMLを表示する
from IPython.display import HTML

# YoutubeAPIの利用
from apiclient.discovery import build
from apiclient.errors import HttpError
from oauth2client.client import flow_from_clientsecrets
from oauth2client.tools import run_flow
import google_auth_oauthlib.flow

In [5]:
# Target month; the value looks like YYYYMM and selects the sub-directory under ./output
month = '202208'

# Location of the labelled watch-history table for that month
input_path = './output/{}/labeled_viewed_videos.csv'.format(month)

# Load the table and preview its first two rows as a quick sanity check
df_viewed = pd.read_csv(input_path)
df_viewed.head(2)

Unnamed: 0,Id,Name,ChannelId,Date,Title,Thumbnail,CategoryId,Duration,DurationOriginal,Description,ViewCount,LikeCount,DislikeCount,CommentCount,MyViewCount,VisitTimes,Label
0,--uUHn9GLUY,ヰ世界情緒 -Isekaijoucho-,UCah4_WVjmr8XA7i5aigwV-Q,2022-06-11T10:00:00Z,【歌ってみた】熱愛発覚中 / covered by ヰ世界情緒,https://i.ytimg.com/vi/--uUHn9GLUY/hqdefault.jpg,10,221,PT3M41S,わたしの好きな歌を歌います。46曲目。\n\n\nカバーさせていただきました。\n「熱愛発覚...,166063,7420,0,156,2,"2022-06-13 12:17:31.491781,2022-06-13 12:34:15...",1
1,-7mkO-IpY1Y,ば一ちゃるこれくしょん,UCbKfih0vuc_UOJDOi5C-riw,2022-06-23T12:29:48Z,【借りてきたイッヌ】ソニックアンバサダーに就任、さらに吹替版声優としてデビューしたころさん【...,https://i.ytimg.com/vi/-7mkO-IpY1Y/hqdefault.jpg,24,220,PT3M40S,▼元動画▼\nソニックステーションLIVE！ソニックバースデー＆『ソニックオリジンズ』発売記...,50712,913,0,70,1,2022-06-24 00:11:19.072884,0


In [6]:
# Prediction starts here
# Prepare the YouTube API client

# Read the API key from a file.
# The context manager closes the file even if reading raises, and strip()
# removes the trailing newline most editors append, which would otherwise be
# sent to the API as part of the key.
with open('../secret/apikey', 'r') as f:
    api_key = f.read().strip()

# Build the request client using the API key
YOUTUBE_API_SERVICE_NAME = "youtube"
YOUTUBE_API_VERSION = "v3"
youtube = build(YOUTUBE_API_SERVICE_NAME, YOUTUBE_API_VERSION, developerKey=api_key)

In [None]:
# Fetch the list of related videos for every watched video labelled as a song
df_viewed_song = df_viewed[df_viewed['Label'] == 1]
video_ids = np.array(df_viewed_song['Id'])

# IDs that already have a row in the output file are skipped
output_path = './output/'+month+'/related_ids.csv'
exists_prev = False
prev_ids = []
if os.path.exists(output_path):
    prev_df = pd.read_csv(output_path)
    prev_ids = np.array(prev_df['Id'])
    exists_prev = True

# Track finished IDs in a set: constant-time lookups, and IDs handled during
# this run are remembered too, so a duplicated ID in video_ids is neither
# requested twice nor written as a duplicate row.
done_ids = set(prev_ids)

for video_id in video_ids:
    if video_id in done_ids:
        continue

    # NOTE(review): the YouTube Data API revision history reports the
    # relatedToVideoId parameter as deprecated (Aug 2023) - confirm that this
    # request still returns results before relying on it.
    try:
        videos = youtube.search().list(
            part = 'id',
            relatedToVideoId = video_id,
            order = 'relevance',
            type = 'video',
            maxResults = 50,
        ).execute()
    except HttpError as e:
        print('データ参照中にエラーが発生しました')
        # Report only the status code: the full error text presumably embeds
        # the request URL, which would leak the API key into the saved
        # notebook output - TODO confirm.
        print('HTTP status:', e.resp.status)
        break

    # Collect the related IDs and store them as one comma-separated string
    related_video_ids = [video_item['id']['videoId'] for video_item in videos['items']]
    add_df = pd.DataFrame({'Id': [video_id],
                           'RelatedVideoIds': [','.join(related_video_ids)]})

    # Append to the accumulated frame and write it out after every request so
    # that progress survives an interruption (e.g. quota exhaustion)
    if exists_prev:
        prev_df = pd.concat([prev_df, add_df], ignore_index=True)
    else:
        prev_df = add_df
        exists_prev = True
    done_ids.add(video_id)
    prev_df.to_csv(output_path, index=False)

In [8]:
# サムネイルを表示する
def output_html_related(base_ids, related_ids, top_n=10):
    """Build an HTML snippet showing each base video with its related thumbnails.

    For every ID in ``base_ids`` the snippet contains a 100px thumbnail and a
    link to the video, followed by 50px thumbnails of up to ``top_n`` entries
    taken from the sequence at the same position in ``related_ids``.

    Args:
        base_ids: Sequence of video ID strings.
        related_ids: Sequence indexed like ``base_ids``; element ``i`` is the
            sequence of related video ID strings for ``base_ids[i]``.
        top_n: Maximum number of related thumbnails per base video.

    Returns:
        The HTML markup as a single string.

    Raises:
        IndexError: If ``related_ids`` has fewer entries than ``base_ids``.
    """
    parts = ['<h1>動画一覧を表示</h1>', '<div style="float:left;">']
    for i, base_id in enumerate(base_ids):
        # The closing quote of src must come directly after the URL and be
        # followed by a space; otherwise the tag is malformed.
        parts.append('<img src="http://img.youtube.com/vi/'+base_id+'/sddefault.jpg" alt="取得できませんでした" width="100">')
        parts.append('<a href="https://www.youtube.com/watch?v='+base_id+'">'+base_id+'</a><br>')
        for related_id in related_ids[i][0:top_n]:
            parts.append('<img src="http://img.youtube.com/vi/'+related_id+'/sddefault.jpg" alt="取得できませんでした" width="50">')
        parts.append('<br>')
    parts.append('</div>')
    return ''.join(parts)

def output_html_top_related(video_ids, top_n=10):
    """Build an HTML snippet listing the first ``top_n`` videos.

    Each listed video gets a 100px thumbnail followed by a link to the video.

    Args:
        video_ids: Sequence of video ID strings, in display order.
        top_n: Maximum number of videos to include.

    Returns:
        The HTML markup as a single string.
    """
    parts = ['<h1>動画一覧を表示</h1>', '<div style="float:left;">']
    for video_id in video_ids[:top_n]:
        # The closing quote of src must come directly after the URL and be
        # followed by a space; otherwise the tag is malformed.
        parts.append('<img src="http://img.youtube.com/vi/'+video_id+'/sddefault.jpg" alt="取得できませんでした" width="100">')
        parts.append('<a href="https://www.youtube.com/watch?v='+video_id+'">'+video_id+'</a><br>')
    parts.append('</div>')
    return ''.join(parts)

In [11]:
# Inspect what the related videos look like
input_path = './output/'+month+'/related_ids.csv'
related_ids_df = pd.read_csv(input_path)

video_ids = np.array(related_ids_df['Id'])
# A video with no related results is stored as an empty field, which read_csv
# returns as NaN (a float); replace it with '' so that split() below works.
related_ids = np.array(related_ids_df['RelatedVideoIds'].fillna(''))

# Build both a per-video (hierarchical) list and a flattened list
related_ids_flat = []
related_ids_hierarchy = []
related_num = 30  # keep at most this many related IDs per video
for related_id in related_ids:
    # Drop empty tokens so that '' never becomes a bogus video ID
    related_id_str = [token for token in related_id.split(',') if token]
    related_ids_hierarchy_tmp = related_id_str[:related_num]
    related_ids_flat.extend(related_ids_hierarchy_tmp)
    # Always append, even when empty, to stay aligned with video_ids
    related_ids_hierarchy.append(related_ids_hierarchy_tmp)

# Count how often each related ID was suggested and order IDs by that count,
# most frequent first
related_ids_flat = np.array(related_ids_flat)
unique_related_ids_flat, unique_related_ids_flat_counts = np.unique(related_ids_flat, return_counts=True)
top_ids_argsort = np.argsort(unique_related_ids_flat_counts)[::-1]
top_ids = unique_related_ids_flat[top_ids_argsort]

# Alternative view (most frequently suggested videos):
# HTML(output_html_top_related(top_ids, top_n=5))
HTML(output_html_related(video_ids[0:5], related_ids_hierarchy, top_n=10))

FileNotFoundError: [Errno 2] No such file or directory: './output/202208labeled_viewed_videos.csv'

In [12]:
# ISO表記の動画時間を秒に変換
def pt2sec(pt_time):
    """Convert an ISO 8601 duration string into a number of seconds.

    Supports the week, day, hour, minute and second components with integer
    values (e.g. ``'PT3M41S'``, ``'PT1H2M3S'``, ``'P1DT2H'``, ``'P0D'``).
    Unlike a parser that only reads the part after ``T``, the day and week
    components are included in the total.

    Args:
        pt_time: Duration string such as ``'PT3M41S'``.

    Returns:
        Total duration in seconds as an ``int``.

    Raises:
        ValueError: If ``pt_time`` is not in the supported format (including
            year/month components, whose length in seconds is ambiguous).
    """
    import re  # local import keeps this cell runnable on its own

    match = re.fullmatch(
        r'P(?:(\d+)W)?(?:(\d+)D)?(?:T(?:(\d+)H)?(?:(\d+)M)?(?:(\d+)S)?)?',
        pt_time,
    )
    if match is None:
        raise ValueError('unsupported ISO 8601 duration: ' + repr(pt_time))

    # Components missing from the string come back as None and count as zero
    weeks, days, hours, minutes, seconds = (
        int(group) if group else 0 for group in match.groups()
    )
    return (weeks*7 + days)*86400 + hours*3600 + minutes*60 + seconds

In [28]:
# Fetch detailed information for each related video ID

# IDs that already have a row in the output file are skipped
output_path = './output/'+month+'/related_videos.csv'
exists_prev = False
prev_ids = []
if os.path.exists(output_path):
    prev_df = pd.read_csv(output_path)
    prev_ids = np.array(prev_df['Id'])
    exists_prev = True
done_ids = set(prev_ids)  # constant-time membership test

# IDs of the videos in the watch history, used for the 'Viewed' column.
# NOTE(review): viewed_ids was not defined in any visible cell (NameError on
# a fresh kernel); it is derived here from df_viewed['Id'] - confirm that this
# matches the intended meaning.
viewed_ids = set(df_viewed['Id'])

# Print progress about 30 times over the run; max(1, ...) avoids a modulo by
# zero when there are fewer than 30 IDs.
progress_step = max(1, unique_related_ids_flat.size // 30)

for i, video_id in enumerate(unique_related_ids_flat):
    if i % progress_step == 0:
        print(i, "/", unique_related_ids_flat.size)
    if video_id in done_ids:
        continue

    # NOTE(review): videos.list is documented to accept a comma-separated list
    # of IDs; batching requests would reduce the number of API calls.
    try:
        video_detail = youtube.videos().list(
            part = 'snippet,statistics,contentDetails',
            id = video_id,
        ).execute()
    except HttpError as e:
        print('エラーが発生しました')
        # Report only the status code: the full error text presumably embeds
        # the request URL, which would leak the API key into the saved
        # notebook output - TODO confirm.
        print('HTTP status:', e.resp.status)
        break

    # Some videos (e.g. ones that are not public) return no item at all
    if len(video_detail['items']) == 0:
        continue
    video_item = video_detail['items'][0]
    video_snippet = video_item['snippet']
    video_statistics = video_item['statistics']
    video_content_details = video_item['contentDetails']

    # Tags are optional; store them comma-separated, or a marker when absent
    if 'tags' in video_snippet:
        tags_str = ','.join(video_snippet['tags'])
    else:
        tags_str = 'no_tag'

    duration_origin = video_content_details['duration']

    record = {
        # From snippet
        'Id': video_id,
        'Name': video_snippet['channelTitle'],
        'ChannelId': video_snippet['channelId'],
        'Date': video_snippet['publishedAt'],
        'Title': video_snippet['title'],
        'Thumbnail': video_snippet['thumbnails']['high']['url'],
        'CategoryId': video_snippet['categoryId'],
        'Tags': tags_str,
        # From contentDetails: duration in seconds plus the original string
        'Duration': pt2sec(duration_origin),
        'DurationOriginal': duration_origin,
        'Description': video_snippet['description'],
        # From statistics: counts that are hidden/absent are filled with 0
        'ViewCount': video_statistics.get('viewCount', 0),
        'LikeCount': video_statistics.get('likeCount', 0),
        'DislikeCount': video_statistics.get('dislikeCount', 0),
        'CommentCount': video_statistics.get('commentCount', 0),
        # History information: how often this video was suggested and whether
        # it already appears in the watch history
        'SuggestCount': unique_related_ids_flat_counts[i],
        'Viewed': video_id in viewed_ids,
    }
    add_df = pd.DataFrame([record])

    # Append to the accumulated frame and write it out after every request so
    # that progress survives an interruption (e.g. quota exhaustion)
    if exists_prev:
        prev_df = pd.concat([prev_df, add_df], ignore_index=True)
    else:
        prev_df = add_df
        exists_prev = True
    prev_df.to_csv(output_path, index=False)

0 / 2950
98 / 2950
196 / 2950
294 / 2950
392 / 2950
490 / 2950
588 / 2950
686 / 2950
784 / 2950
882 / 2950
980 / 2950
1078 / 2950
1176 / 2950
1274 / 2950
1372 / 2950
1470 / 2950
1568 / 2950
1666 / 2950
1764 / 2950
1862 / 2950
1960 / 2950
2058 / 2950
2156 / 2950
2254 / 2950
2352 / 2950
2450 / 2950
2548 / 2950
2646 / 2950
2744 / 2950
2842 / 2950
2940 / 2950
