# classify_related

* 関連動画を歌動画とそうでないものに分類します

In [1]:
# Pythonの基本ライブラリ
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# ファイル操作
import os
import glob

# Jupyter上にHTMLを表示する
from IPython.display import HTML

# 画像の保存
import requests

# 機械学習
from transformers import BertJapaneseTokenizer, BertForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.utils import shuffle
# LightGBMは精度の向上が見込め無さそうなので削除
# import lightgbm as lgb

In [None]:
# Load the videos to classify for the target month.
month = '202208'
input_path = './output/'+month+'/viewed_videos.csv'
non_filtered_df_test = pd.read_csv(input_path)
# Preview the frame that was just loaded. df_test is only created by the
# filtering cell that follows, so referencing it here fails with a NameError
# when the notebook is run from top to bottom.
non_filtered_df_test.head(2)

In [None]:
# Fill missing Description values with a fixed placeholder string.
non_filtered_df_test = non_filtered_df_test.fillna(
    value={'Description': 'Descriptionが記入されていません'}
)

# Rough duration filter for inspection: keep rows whose Duration is strictly
# greater than 90 and strictly less than 480, then report how many remain.
df_test = non_filtered_df_test.loc[
    lambda frame: (frame['Duration'] > 90) & (frame['Duration'] < 480)
]
print("再生時間によるフィルタリング後：", len(df_test))

In [None]:
# Sorted unique channel IDs and category IDs; make_feature one-hot encodes
# ChannelId / CategoryId against these arrays.
# The two assignments previously had no right-hand side (a syntax error). The
# expressions below are the ones the training-data cell of this notebook uses
# for the same names.
# NOTE(review): df_viewed is not created anywhere in the visible notebook -
# confirm it is loaded before this cell runs.
unique_channel_id = np.unique(np.array(df_viewed['ChannelId']))
unique_category_id = np.unique(np.array(df_viewed['CategoryId']))

In [None]:
def _one_hot(values, key):
    """Return a float vector of length ``values.size`` with a 1 at the first
    position where ``values == key``, or all zeros when ``key`` is absent."""
    onehot = np.zeros(values.size)
    hits = np.flatnonzero(values == key)
    if hits.size:
        onehot[hits[0]] = 1
    return onehot


def make_feature(row, unique_channel_id, unique_category_id, vocab_size=32000):
    """Build the feature vector for one video row.

    The result is the concatenation of three parts:

    1. a 0/1 indicator of length ``vocab_size`` marking which token ids occur
       in the tokenized ``Title + Description`` text (counts are ignored),
    2. a one-hot encoding of ``row['ChannelId']`` over ``unique_channel_id``,
    3. a one-hot encoding of ``row['CategoryId']`` over ``unique_category_id``.

    An ID that does not occur in its vocabulary array yields an all-zero part.

    This function reads the module-level names ``tokenizer`` and
    ``max_length``; both must be defined before it is called.
    NOTE(review): ``tokenizer`` is presumably a BertJapaneseTokenizer (the
    notebook imports it) - confirm where it is instantiated.

    Args:
        row: mapping-like object with 'Title', 'Description', 'ChannelId' and
            'CategoryId' entries; Title and Description must be strings.
        unique_channel_id: NumPy array of known channel IDs.
        unique_category_id: NumPy array of known category IDs.
        vocab_size: length of the token indicator part. Defaults to 32000,
            the value that was previously hard-coded.

    Returns:
        1-D float NumPy array of length
        ``vocab_size + unique_channel_id.size + unique_category_id.size``.
    """
    text = row['Title'] + row['Description']

    encoding = tokenizer(
        text,
        max_length=max_length,
        padding='max_length',
        truncation=True
    )

    # Ignore token counts for now: keep each token id at most once.
    token_ids = np.unique(encoding['input_ids'])
    # Drop ids below 5, i.e. [PAD], [UNK], [CLS], [SEP], [MASK].
    token_ids = token_ids[token_ids >= 5]

    words = np.zeros(vocab_size)
    words[token_ids] = 1

    # The one-hot vectors are built directly: indexing np.eye(n) would
    # allocate an n x n matrix on every call just to read a single row.
    channel_onehot = _one_hot(unique_channel_id, row['ChannelId'])
    category_onehot = _one_hot(unique_category_id, row['CategoryId'])

    return np.concatenate([words, channel_onehot, category_onehot], 0)

In [None]:
# Build the feature matrix for the new data that will be predicted.

max_length = 512  # token length passed to the tokenizer; 512 is the maximum
X_test = np.array([
    make_feature(video_row, unique_channel_id, unique_category_id)
    for _, video_row in df_test.iterrows()
])

In [None]:
# Fit an L1-penalised logistic regression (liblinear solver) on the
# training features and labels.
lr = LogisticRegression(solver='liblinear', penalty='l1')
lr.fit(X=X_train, y=Y_train)

In [None]:
# Predict one label per row of the test feature matrix with the fitted model.
Y_test = lr.predict(X_test)

In [None]:
# Attach the predicted labels and write the labeled rows to this month's
# output folder.
# df_test comes from boolean-mask filtering of another DataFrame, so adding a
# column in place can trigger pandas' SettingWithCopyWarning. assign() returns
# a new frame with the extra column instead.
df_test = df_test.assign(Label=Y_test)
df_test.to_csv('./output/'+month+'/labeled_related_videos.csv', index=False)

In [None]:
# Render thumbnails with labeled links.
def output_html_labeled(video_ids, counts, labels, titles, top_n=10):
    """Build an HTML snippet listing videos as thumbnail + labeled link.

    For each of the first ``top_n`` videos the snippet contains a YouTube
    thumbnail image followed by a link to the video whose text is
    ``<title>,<count> ,<label>``.

    The four inputs are walked in parallel by position, so pandas Series with
    a non-default index (for example after filtering or sorting) work as well
    as plain lists. Title, count and label are HTML-escaped before insertion.

    Args:
        video_ids: iterable of YouTube video IDs.
        counts: iterable of values shown after the title.
        labels: iterable of labels shown after the count.
        titles: iterable of video titles.
        top_n: number of leading videos to render (applied as a slice bound).

    Returns:
        The HTML markup as a single string.
    """
    # Imported locally so the function does not rely on a file-level import.
    from html import escape

    parts = ['<h1>動画一覧を表示</h1>', '<div style="float:left;">']
    # Materialise before slicing so the cut is positional for any iterable.
    shown_ids = list(video_ids)[:top_n]
    for video_id, count, label, title in zip(shown_ids, counts, labels, titles):
        safe_id = escape(str(video_id))
        # The closing quote of src now sits before the space that precedes alt.
        parts.append('<img src="http://img.youtube.com/vi/'+safe_id+'/sddefault.jpg" alt="取得できませんでした" width="100">')
        parts.append('<a href="https://www.youtube.com/watch?v='+safe_id+'">'+escape(str(title))+','+escape(str(count))+' ,'+escape(str(label))+'</a><br>')
    parts.append('</div>')
    return ''.join(parts)

In [None]:
# Pull the columns needed for the preview out of related_videos_df.
video_ids, suggest_counts, labels, titles = (
    related_videos_df[column]
    for column in ('Id', 'SuggestCount', 'Label', 'Title')
)

# Results look decent; what remains is deciding how to add variety.
HTML(output_html_labeled(
    video_ids,
    suggest_counts,
    labels,
    titles,
    top_n=10,
))

In [None]:
# Put this part on hold for now?

# Classify the songs once more into VTuber vs. non-VTuber.
# Label 3000 videos and build a classifier from them.
# It may be better to move this into a separate notebook and only fetch the model.
# It may be good to keep this data as training data, separate from the data used next.

outputPath = './output/related_videos.csv'
related_videos_df = pd.read_csv(outputPath)
X_test = []

max_length = 512  # 512 is the maximum
unique_channel_id, unique_category_id = (
    np.unique(np.array(df_viewed[column]))
    for column in ('ChannelId', 'CategoryId')
)

# Collect the labeled rows once, then derive features and targets from them.
labeled_rows = [labeled_row for _, labeled_row in df_viewed.iterrows()]

X_train = np.array([
    make_feature(labeled_row, unique_channel_id, unique_category_id)
    for labeled_row in labeled_rows
])
# Drop unused features (use this once the computation becomes heavy)
# no_use_idx = np.where(np.sum(X, axis=0) == 0)[0]
# print(no_use_idx)
# X = np.delete(X, no_use_idx, 1)

Y_train = np.array([labeled_row['Label'] for labeled_row in labeled_rows])