# get_suggest

* Chromeの履歴を元にオススメの動画一覧を取得します

In [None]:
# Pythonの基本ライブラリ
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# ファイル操作
import os
import glob

# Jupyter上にHTMLを表示する
from IPython.display import HTML

# 画像の保存
import requests

# YoutubeAPIの利用
from apiclient.discovery import build
from apiclient.errors import HttpError
from oauth2client.client import flow_from_clientsecrets
from oauth2client.tools import run_flow
import google_auth_oauthlib.flow

# 機械学習
from transformers import BertJapaneseTokenizer, BertForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.utils import shuffle

In [None]:
# Load ./output/viewed_videos.csv into df_viewed.
# `outputPath` is a module-level name that later cells reassign to other files.
outputPath = './output/viewed_videos.csv'
df_viewed = pd.read_csv(outputPath)
# df_viewed.head(2)

In [None]:
# Extract the song videos from the dataset to build the recommendation source data.

# Rough filter on playback time: keep rows whose Duration is strictly between
# 90 and 480 (presumably seconds — confirm against the producer of the CSV).
# .copy() makes the result an independent frame, so adding the 'Label' column
# to it later does not raise SettingWithCopyWarning or depend on whether
# pandas returned a view or a copy of df_viewed.
filetered_df_viewed = df_viewed[
    (df_viewed['Duration'] > 90) & (df_viewed['Duration'] < 480)
].copy()
print("再生時間によるフィルタリング後：", filetered_df_viewed.shape[0])

In [None]:
# ラベリング用に画像を作成
# os.makedirs('./train/unlabeled/thumbnails/', exist_ok=True)
# os.makedirs('./train/label_0/thumbnails/', exist_ok=True)
# os.makedirs('./train/label_1/thumbnails/', exist_ok=True)

# for i, row in filetered_df_viewed.iterrows():
#     video_id, title, thumbnail = row['Id'], row['Title'], row['Thumbnail']
    
#     response = requests.get(thumbnail)
#     image = response.content
    
#     title = title.replace('/', '')
#     file_name = './train/unlabeled/thumbnails/'+title+'&separate&'+video_id+'.png'
#     with open(file_name, "wb") as write_file:
#         write_file.write(image)

In [None]:
# Rebuild the table from the hand-labelled thumbnails.
# Files under ./train/thumbnails/label_<n>/ are expected to be named
# '<title>&separate&<video id>.png'; the directory number is the label.
video_ids = np.array(filetered_df_viewed['Id'])
labels = np.zeros(video_ids.size, dtype=int)  # rows without a thumbnail keep label 0
for label in range(2):
    file_names = glob.glob('./train/thumbnails/label_'+str(label)+'/*')
    for file_name in file_names:
        # Take the text after the LAST separator so a title that happens to
        # contain the separator (or a file without one) cannot raise IndexError.
        video_id = file_name.split('&separate&')[-1].replace('.png', '')
        matches = video_ids == video_id
        if not np.any(matches):
            # The thumbnail belongs to a video that is no longer in the
            # filtered table; skip it instead of crashing on an empty lookup.
            print('対応する動画IDがテーブルに存在しないためスキップします：', file_name)
            continue
        # Label every row carrying this id (duplicate rows are the same video).
        labels[matches] = label

# assign() returns a new frame, so this never writes into a slice of df_viewed;
# reset_index(drop=True) renumbers the rows without keeping the old index.
filetered_df_viewed = filetered_df_viewed.assign(Label=labels).reset_index(drop=True)

# Fill in rows whose Description is null
filetered_df_viewed = filetered_df_viewed.fillna({'Description': 'Descriptionが記入されていません'})
# filetered_df_viewed = filetered_df_viewed.replace({'Description': {'NULL': 'Descriptionが記入されていません'}})
filetered_df_viewed.to_csv('./output/labeled_viewed_videos.csv', index=False)

In [None]:
# Re-read the labelled table written to ./output/labeled_viewed_videos.csv;
# from here on df_viewed includes the 'Label' column.
df_viewed = pd.read_csv('./output/labeled_viewed_videos.csv')
# df_viewed.isnull().sum() # sanity check
# df_viewed.head(2)

In [None]:
# Prepare the tokenizer (pretrained Japanese BERT vocabulary)
MODEL_NAME = 'cl-tohoku/bert-base-japanese-whole-word-masking'
tokenizer = BertJapaneseTokenizer.from_pretrained(MODEL_NAME)
# print(bert_sc.config) # confirm that the vocabulary size is 32000

In [None]:
def _one_hot(value, categories):
    """Return a float vector over ``categories`` with 1 where it equals ``value``.

    The vector is all zeros when ``value`` is not among ``categories``.
    """
    onehot = np.zeros(categories.size)
    onehot[categories == value] = 1
    return onehot


def make_feature(row, unique_channel_id, unique_category_id):
    """Build the feature vector for one video.

    The result is the concatenation of
      * a 32000-long binary bag-of-tokens over Title + Description,
      * a one-hot of ``row['ChannelId']`` over ``unique_channel_id``,
      * a one-hot of ``row['CategoryId']`` over ``unique_category_id``.

    Reads the module-level ``tokenizer`` and ``max_length``.

    Args:
        row: mapping / Series providing 'Title', 'Description', 'ChannelId'
            and 'CategoryId'.
        unique_channel_id: 1-D NumPy array of known channel ids, without
            duplicates (the callers build it with ``np.unique``).
        unique_category_id: 1-D NumPy array of known category ids, without
            duplicates.

    Returns:
        1-D float array of length
        ``32000 + unique_channel_id.size + unique_category_id.size``.
        Unknown channel / category ids produce an all-zero one-hot part.
    """
    vocab_size = 32000

    # An empty cell in a CSV is read back as NaN (a float), and str + float
    # raises TypeError.  Use the same placeholder the labelled table is filled
    # with so both data sets are encoded consistently.
    title, description = row['Title'], row['Description']
    if pd.isna(title):
        title = ''
    if pd.isna(description):
        description = 'Descriptionが記入されていません'
    text = title + description

    encoding = tokenizer(
        text,
        max_length=max_length,
        padding='max_length',
        truncation=True
    )

    # Token counts are ignored for now: only presence / absence is encoded.
    input_ids = np.unique(encoding['input_ids'])
    # Drop ids below 5: [PAD], [UNK], [CLS], [SEP], [MASK].
    input_ids = input_ids[input_ids >= 5]

    words = np.zeros(vocab_size)
    words[input_ids] = 1

    # A boolean-mask assignment gives the same vector as picking a row of an
    # identity matrix, without allocating an N x N matrix on every call.
    channel_onehot = _one_hot(row['ChannelId'], unique_channel_id)
    category_onehot = _one_hot(row['CategoryId'], unique_category_id)

    # Duration / ViewCount / LikeCount / CommentCount are left out because
    # their scale differs from the binary features.
    return np.concatenate([words, channel_onehot, category_onehot], 0)

In [None]:
# Tokenise every labelled video and assemble the training arrays.

max_length = 512 # upper limit is 512
unique_channel_id = np.unique(np.array(df_viewed['ChannelId']))
unique_category_id = np.unique(np.array(df_viewed['CategoryId']))

# Materialise the rows once, then derive the features and the targets from them.
training_rows = [record for _, record in df_viewed.iterrows()]

X_train = np.array([
    make_feature(record, unique_channel_id, unique_category_id)
    for record in training_rows
])
# Drop unused features (enable once the computation becomes too heavy)
# no_use_idx = np.where(np.sum(X, axis=0) == 0)[0]
# print(no_use_idx)
# X = np.delete(X, no_use_idx, 1)

Y_train = np.array([record['Label'] for record in training_rows])

In [None]:
# CVで正解率を評価
def cross_validation(X, Y, k=5):
    """Estimate classification accuracy with a rotating k-fold split.

    The data is shuffled with a fixed seed and cut into ``k`` contiguous
    folds (the last fold absorbs the remainder).  For each rotation an
    L1-regularised logistic regression is fitted on ``k - 2`` folds and
    scored on one validation fold; the remaining fold of that rotation is
    held out and not used.  No hyper-parameter search is performed.

    Args:
        X: 2-D feature array, one row per sample.
        Y: 1-D label array aligned with ``X``.
        k: number of folds; must satisfy ``3 <= k <= number of samples``.

    Returns:
        Fraction of samples predicted correctly, pooled over the ``k``
        validation folds (each fold is the validation fold exactly once).

    Raises:
        ValueError: if ``k`` is smaller than 3 or larger than the number of
            samples.
    """
    n = X.shape[0]
    if k < 3:
        raise ValueError('k must be at least 3 (k - 2 folds are used for training)')
    if n < k:
        raise ValueError('k must not exceed the number of samples')

    # Shuffle X and Y together
    X, Y = shuffle(X, Y, random_state=0)

    # Split into k folds.  The fold size follows k (it used to be fixed at
    # n // 5, which produced wrong or empty folds for any k other than 5).
    fold_size = n // k
    bounds = [i * fold_size for i in range(k)] + [n]
    X_devs = [X[lo:hi] for lo, hi in zip(bounds[:-1], bounds[1:])]
    Y_devs = [Y[lo:hi] for lo, hi in zip(bounds[:-1], bounds[1:])]

    correct = 0
    for i in range(k):
        print('k-cross-validation :', i+1, '/', k)
        # Folds i .. i+k-3 train the model, fold i+k-2 validates it and
        # fold i+k-1 is the unused hold-out (all indices modulo k).
        train_folds = [(i + j) % k for j in range(k - 2)]
        val_fold = (i + k - 2) % k
        X_fit = np.concatenate([X_devs[f] for f in train_folds])
        Y_fit = np.concatenate([Y_devs[f] for f in train_folds])

        lr = LogisticRegression(penalty='l1', solver='liblinear')
        lr.fit(X_fit, Y_fit)
        Y_pred = lr.predict(X_devs[val_fold])
        correct += np.sum(Y_pred == Y_devs[val_fold])

    return correct/n

In [None]:
# Run the cross-validation on the training arrays and print the accuracy it returns
print('CV Accuracy', cross_validation(X_train, Y_train))

In [None]:
# Prepare the YouTube Data API client

# Read the API key from a file.  The context manager closes the file even if
# reading fails, and strip() removes the trailing newline most editors append,
# which would otherwise be sent as part of the key.
with open('secret/apikey', 'r') as f:
    api_key = f.read().strip()

# Build the request object with the API key
YOUTUBE_API_SERVICE_NAME = "youtube"
YOUTUBE_API_VERSION = "v3"
youtube = build(YOUTUBE_API_SERVICE_NAME, YOUTUBE_API_VERSION, developerKey=api_key)

In [None]:
# Fetch the list of related videos for every song video (Label == 1).
# NOTE(review): the cells further down read './output/suggest_videos_tmp.csv',
# not the file written here — confirm which file is the intended input.
# NOTE(review): confirm that search.list still accepts `relatedToVideoId` in
# the API version in use.
df_viewed_song = df_viewed[df_viewed['Label'] == 1]
video_ids = np.array(df_viewed_song['Id'])

# Resume support: ids already stored in the output CSV are skipped.
outputPath = './output/related_ids.csv'
prev_df, prev_ids = None, []
if os.path.exists(outputPath):
    prev_df = pd.read_csv(outputPath)
    prev_ids = np.array(prev_df['Id'])
# A set gives O(1) membership tests and also lets ids fetched during this run
# be skipped when they appear again in video_ids.
done_ids = set(prev_ids)

for video_id in video_ids:
    if video_id in done_ids:
        continue

    try:
        videos = youtube.search().list(
            part = 'id',
            relatedToVideoId = video_id,
            order = 'relevance',
            type = 'video',
            maxResults = 50,
        ).execute()
    except HttpError as e:
        print('データ参照中にエラーが発生しました')
        print(e)
        break

    related_video_ids = [video_item['id']['videoId'] for video_item in videos.get('items', [])]
    related_video_ids_str = ','.join(related_video_ids)

    # One row per source video; the columns are given as equal-length lists so
    # the frame has shape (1, 2).
    add_df = pd.DataFrame({'Id': [video_id], 'RelatedVideoIds': [related_video_ids_str]})
    # `is None` is required here: comparing a DataFrame with `!= None` yields
    # a DataFrame, whose truth value is ambiguous.
    if prev_df is None:
        prev_df = add_df
    else:
        prev_df = pd.concat([prev_df, add_df], ignore_index=True)
    done_ids.add(video_id)

    # Save after every request so an interrupted run keeps what it fetched.
    prev_df.to_csv(outputPath, index=False)

In [None]:
# サムネイルを表示する
def output_html_related(base_ids, related_ids, top_n=10):
    """Render base videos and their related videos as an HTML snippet.

    For every id in ``base_ids`` the snippet contains the video thumbnail
    (width 100), a link to the video, and then the thumbnails (width 50) of
    the first ``top_n`` ids in the matching entry of ``related_ids``.

    Args:
        base_ids: sequence of video ids.
        related_ids: sequence indexed in parallel with ``base_ids``; entry
            ``i`` holds the ids related to ``base_ids[i]``.
        top_n: maximum number of related thumbnails per base video.

    Returns:
        The HTML as a single string.

    Raises:
        IndexError: if ``related_ids`` has fewer entries than ``base_ids``.
    """
    # Fragments are collected and joined once instead of growing a string.
    parts = ['<h1>動画一覧を表示</h1>', '<div style="float:left;">']
    for i, base_id in enumerate(base_ids):
        # The closing quote of src comes right after '.jpg' and a space
        # separates it from alt; the earlier markup had them the wrong way round.
        parts.append(f'<img src="http://img.youtube.com/vi/{base_id}/sddefault.jpg" alt="取得できませんでした" width="100">')
        parts.append(f'<a href="https://www.youtube.com/watch?v={base_id}">{base_id}</a><br>')
        for related_id in related_ids[i][0:top_n]:
            parts.append(f'<img src="http://img.youtube.com/vi/{related_id}/sddefault.jpg" alt="取得できませんでした" width="50">')
        parts.append('<br>')
    parts.append('</div>')
    return ''.join(parts)

In [None]:
# Inspect what the related videos look like
related_ids_df = pd.read_csv('./output/suggest_videos_tmp.csv')
video_ids = np.array(related_ids_df['Id'])
related_ids = np.array(related_ids_df['RelatedVideoIds'])

# One list of related ids per row.  An empty RelatedVideoIds cell is read back
# as NaN (a float, which has no .split), so it maps to an empty list.
related_ids_tmp1 = [
    [] if pd.isna(joined_ids) else joined_ids.split(',')
    for joined_ids in related_ids
]

# HTML(output_html_related(video_ids[0:5], related_ids_tmp1, top_n=18))

In [None]:
# サムネイルを表示する
def output_html(video_ids, top_n=10):
    """Render the first ``top_n`` videos as an HTML snippet.

    Each video contributes its thumbnail (width 100) followed by a link to
    the video and a line break.

    Args:
        video_ids: sequence of video ids.
        top_n: maximum number of videos to render.

    Returns:
        The HTML as a single string.
    """
    # Fragments are collected and joined once instead of growing a string.
    parts = ['<h1>動画一覧を表示</h1>', '<div style="float:left;">']
    for video_id in video_ids[:top_n]:
        # The closing quote of src comes right after '.jpg' and a space
        # separates it from alt; the earlier markup had them the wrong way round.
        parts.append(f'<img src="http://img.youtube.com/vi/{video_id}/sddefault.jpg" alt="取得できませんでした" width="100">')
        parts.append(f'<a href="https://www.youtube.com/watch?v={video_id}">{video_id}</a><br>')
    parts.append('</div>')
    return ''.join(parts)

In [None]:
# Count how often each video is suggested and rank the ids by that count.
related_ids_df = pd.read_csv('./output/suggest_videos_tmp.csv')
viewed_ids = np.array(related_ids_df['Id'])
related_ids = np.array(related_ids_df['RelatedVideoIds'])

# Flatten the comma-separated id lists into one array.  An empty
# RelatedVideoIds cell is read back as NaN (a float, which has no .split),
# so such rows are skipped.
related_ids_flat = np.array([
    related_id
    for joined_ids in related_ids
    if not pd.isna(joined_ids)
    for related_id in joined_ids.split(',')
])
unique_related_ids_flat, unique_related_ids_flat_counts = np.unique(related_ids_flat, return_counts=True)

# Most frequently suggested ids first
top_ids_argsort = np.argsort(unique_related_ids_flat_counts)[::-1]
top_ids = unique_related_ids_flat[top_ids_argsort]
# HTML(output_html(top_ids, top_n=5))

In [None]:
# ISO表記の動画時間を秒に変換
def pt2sec(pt_time):
    """Convert an ISO 8601 duration string to a number of seconds.

    Handles the week / day components before the ``T`` designator and the
    hour / minute / second components after it, e.g. ``'PT1H2M3S'`` -> 3723
    and ``'P1DT2H3M4S'`` -> 93784.  Year and month components are not
    supported.  Text that contains no recognised component yields 0.

    Args:
        pt_time: duration string such as ``'PT15M33S'``.

    Returns:
        The duration in whole seconds as an int.
    """
    import re  # local import: the file's import cell does not provide `re`

    date_part, designator, time_part = pt_time.rpartition('T')
    if not designator:
        # Without a 'T' the whole string is scanned for both kinds of
        # component, so 'P1D' counts its day while a bare '1H2M3S' is still
        # read as a time.
        date_part = time_part

    seconds_per_date_unit = {'W': 7 * 24 * 3600, 'D': 24 * 3600}
    seconds_per_time_unit = {'H': 3600, 'M': 60, 'S': 1}

    times = 0
    # The day part used to be ignored, which under-reported long videos.
    for amount, unit in re.findall(r'(\d+)([WD])', date_part):
        times += int(amount) * seconds_per_date_unit[unit]
    for amount, unit in re.findall(r'(\d+)([HMS])', time_part):
        times += int(amount) * seconds_per_time_unit[unit]
    return times

In [None]:
# Fetch the details of every related video id (about 5000 ids can be fetched,
# per the original note).

# Set the destination before the loop: the following cell reads `outputPath`,
# and leaving it untouched when nothing was fetched would make that cell read
# the file an earlier cell pointed it at.
outputPath = './output/related_videos.csv'

video_details = []
for i, video_id in enumerate(unique_related_ids_flat):
    try:
        video_detail = youtube.videos().list(
            part = 'snippet,statistics,contentDetails',
            id = video_id,
        ).execute()
    except HttpError as e:
        print('エラーが発生しました')
        print(e)  # show the cause instead of discarding it
        break

    # Some videos (e.g. private ones) cannot be retrieved
    items = video_detail['items']
    if len(items) == 0:
        continue
    video_snippet = items[0]['snippet']
    video_statistics = items[0]['statistics']
    video_content_details = items[0]['contentDetails']

    # From snippet
    date = video_snippet['publishedAt']
    title = video_snippet['title']
    channel_name = video_snippet['channelTitle']
    channel_id = video_snippet['channelId']
    description = video_snippet['description']
    thumbnail = video_snippet['thumbnails']['high']['url']
    category_id = video_snippet['categoryId']

    # From contentDetails
    duration_origin = video_content_details['duration']
    duration = pt2sec(duration_origin)

    # From statistics: counts that are hidden are filled with 0
    view_count = video_statistics.get('viewCount', 0)
    like_count = video_statistics.get('likeCount', 0)
    dislike_count = video_statistics.get('dislikeCount', 0)
    comment_count = video_statistics.get('commentCount', 0)

    # History information
    suggest_counts = unique_related_ids_flat_counts[i]
    viewed = video_id in viewed_ids

    # One list per video
    video_details.append([video_id, channel_name, channel_id, date, title,
                          thumbnail, category_id, duration, duration_origin, description,
                          view_count, like_count, dislike_count, comment_count,
                          suggest_counts, viewed])

if video_details:
    # Build the frame straight from the rows.  Going through np.array first
    # turns every cell into a fixed-width string as wide as the longest
    # description, which wastes a large amount of memory.
    video_details_pandas = pd.DataFrame(data=video_details,
                                        columns=['Id', 'Name', 'ChannelId', 'Date', 'Title',
                                                'Thumbnail', 'CategoryId', 'Duration', 'DurationOriginal', 'Description',
                                                'ViewCount', 'LikeCount', 'DislikeCount', 'CommentCount',
                                                'SuggestCount', 'Viewed'])
    video_details_pandas.to_csv(outputPath, index=False)

In [None]:
# Train the logistic regression on the full training set (class imbalance is
# left alone for now) and predict a label for every related video.
related_videos_df = pd.read_csv(outputPath)

# One feature vector per related video, encoded like the training data.
X_test = np.array([
    make_feature(candidate, unique_channel_id, unique_category_id)
    for _, candidate in related_videos_df.iterrows()
])

lr = LogisticRegression(penalty='l1', solver='liblinear')
lr.fit(X_train, Y_train)
Y_test = lr.predict(X_test)

In [None]:
# Attach the predicted labels to the related-video table and write it back to
# the CSV at outputPath (the file it was read from).
related_videos_df['Label'] = Y_test
related_videos_df.to_csv(outputPath, index=False)