# classify_history

* 履歴の動画を歌動画とそうでないものに分類します

In [36]:
# Pythonの基本ライブラリ
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# ファイル操作
import os
import glob

# Jupyter上にHTMLを表示する
from IPython.display import HTML

# 画像の保存
import requests

# 機械学習
from transformers import BertJapaneseTokenizer, BertForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.utils import shuffle
# LightGBMは精度の向上が見込め無さそうなので削除
# import lightgbm as lgb

## アノテーション

* 追加したデータを訓練データとして扱う場合は以下でアノテーションしてください
* テストデータとして利用する場合は、以下の実行は不要です

In [None]:
# ラベリング用に画像を作成
os.makedirs('./train/unlabeled/thumbnails/', exist_ok=True)
os.makedirs('./train/label_0/thumbnails/', exist_ok=True)
os.makedirs('./train/label_1/thumbnails/', exist_ok=True)

for i, row in filetered_df_viewed.iterrows():
    video_id, title, thumbnail = row['Id'], row['Title'], row['Thumbnail']
    
    response = requests.get(thumbnail)
    image = response.content
    
    title = title.replace('/', '')
    file_name = './train/unlabeled/thumbnails/'+title+'&separate&'+video_id+'.png'
    with open(file_name, "wb") as write_file:
        write_file.write(image)

* ここで生成された画像ファイルをlabel_0, label_1に振り分け

In [None]:
# ラベリングされた動画からテーブルを作成し直す
video_ids = np.array(filetered_df_viewed['Id'])
labels = np.zeros(video_ids.size, dtype=int)
for label in range(2):
    file_names = glob.glob('./train/thumbnails/label_'+str(label)+'/*')
    for i, file_name in enumerate(file_names):
        video_id = file_name.split('&separate&')[1].replace('.png', '')        
        idx = np.where(video_ids == video_id)[0][0]
        labels[idx] = label

filetered_df_viewed['Label'] = labels

filetered_df_viewed = filetered_df_viewed.reset_index()
filetered_df_viewed = filetered_df_viewed.drop(['index'], axis=1)

# Descriptionがnullのデータを補完
filetered_df_viewed = filetered_df_viewed.fillna({'Description': 'Descriptionが記入されていません'})
# filetered_df_viewed = filetered_df_viewed.replace({'Description': {'NULL': 'Descriptionが記入されていません'}})
filetered_df_viewed.to_csv('./train/videos/labeled_viewed_videos.csv', index=False)

## 分類器の作成
* アノテーションしない場合はここから実施してください

In [25]:
# 訓練データを取得
df_train = pd.read_csv('./train/videos/labeled_viewed_videos.csv')
df_train.head(2)

Unnamed: 0,Id,Name,ChannelId,Date,Title,Thumbnail,CategoryId,Duration,DurationOriginal,Description,ViewCount,LikeCount,DislikeCount,CommentCount,MyViewCount,VisitTimes,Label
0,--uUHn9GLUY,ヰ世界情緒 -Isekaijoucho-,UCah4_WVjmr8XA7i5aigwV-Q,2022-06-11T10:00:00Z,【歌ってみた】熱愛発覚中 / covered by ヰ世界情緒,https://i.ytimg.com/vi/--uUHn9GLUY/hqdefault.jpg,10,221,PT3M41S,わたしの好きな歌を歌います。46曲目。\n\n\nカバーさせていただきました。\n「熱愛発覚...,126236,6998,0,153,2,"2022-06-13 12:17:31.491781,2022-06-13 12:34:15...",1
1,-7mkO-IpY1Y,ば一ちゃるこれくしょん,UCbKfih0vuc_UOJDOi5C-riw,2022-06-23T12:29:48Z,【借りてきたイッヌ】ソニックアンバサダーに就任、さらに吹替版声優としてデビューしたころさん【...,https://i.ytimg.com/vi/-7mkO-IpY1Y/hqdefault.jpg,24,220,PT3M40S,▼元動画▼\nソニックステーションLIVE！ソニックバースデー＆『ソニックオリジンズ』発売記...,50223,904,0,68,1,2022-06-24 00:11:19.072884,0


In [10]:
# トークナイザの準備
MODEL_NAME = 'cl-tohoku/bert-base-japanese-whole-word-masking'
tokenizer = BertJapaneseTokenizer.from_pretrained(MODEL_NAME)
# print(bert_sc.config) # 語彙数が32000であることを確認

In [11]:
def make_feature(row, unique_channel_id, unique_category_id):
    text = row['Title'] + row['Description']
        
    encoding = tokenizer(
        text,
        max_length=max_length, 
        padding='max_length',
        truncation=True
    )
    
    input_ids = encoding['input_ids']
    input_ids = np.unique(input_ids) # countは一旦無視してuniqueにする
    input_ids = input_ids[input_ids>=5] # [PAD], [UNK], [CLS], [SEP], [MASK]を削除
    
    words = np.zeros(32000)
    words[input_ids] = 1
    
    channel_id = row['ChannelId']
    if channel_id in unique_channel_id:
        channel_onehot = np.eye(unique_channel_id.size)[unique_channel_id == channel_id][0]
    else:
        channel_onehot = np.zeros(unique_channel_id.size)
    
    category_id = row['CategoryId']
    if category_id in unique_category_id:
        category_onehot = np.eye(unique_category_id.size)[unique_category_id == category_id][0]
    else:
        category_onehot = np.zeros(unique_category_id.size)
    
    # scaleが違うので外しておく
    # duration = row['Duration']
    # view_count = row['ViewCount']
    # like_count = row['LikeCount']
    # comment_count = row['CommentCount']
    # stats = np.array([duration, view_count, like_count, comment_count])
    return np.concatenate([words, channel_onehot, category_onehot], 0)

In [14]:
# 単語分割して学習データを作成

max_length = 512 # 最大で512
X_train, Y_train = [], []
# ここはモデルを利用する際に保存しておく必要がある
unique_channel_id = np.unique(np.array(df_train['ChannelId']))
unique_category_id = np.unique(np.array(df_train['CategoryId']))

for i, row in df_train.iterrows():
    X_row = make_feature(row, unique_channel_id, unique_category_id)
    X_train.append(X_row)
    Y_train.append(row['Label'])

X_train = np.array(X_train)
# 不要な特徴量を削除（計算量が重くなってきたら利用する）
# no_use_idx = np.where(np.sum(X, axis=0) == 0)[0]
# print(no_use_idx)
# X = np.delete(X, no_use_idx, 1)

Y_train = np.array(Y_train)

In [13]:
# CVによりモデルを評価
def cross_validation(X, Y, k=5):
    # XとYをシャッフル
    X, Y = shuffle(X, Y, random_state=0)
    
    # XとYをk分割
    n = X.shape[0]
    X_devs, Y_devs = [], []
    for i in range(k):
        if i != k-1:
            X_dev, Y_dev = X[i*(n//5):(i+1)*(n//5)], Y[i*(n//5):(i+1)*(n//5)]
        else:
            X_dev, Y_dev = X[i*(n//5):], Y[i*(n//5):]
        X_devs.append(X_dev)
        Y_devs.append(Y_dev)
        
    # 1つをvalidation, 1つをテストとしてテスト誤差を計算する
    test_accuracy = 0
    for i in range(k):
        print('k-cross-validation :', i+1, '/', k)
        X_train_tmp, Y_train_tmp = [], []
        for j in range(k-2):
            X_train_tmp.append(X_devs[(i+j)%k])
            Y_train_tmp.append(Y_devs[(i+j)%k])
        X_train = np.concatenate(X_train_tmp)
        Y_train = np.concatenate(Y_train_tmp)
        X_val, Y_val = X_devs[(i+k-2)%k], Y_devs[(i+k-2)%k]
        X_test, Y_test = X_devs[(i+k-1)%k], Y_devs[(i+k-1)%k]
        
        lr = LogisticRegression(penalty='l1', solver='liblinear')
        lr.fit(X_train, Y_train)    
        Y_pred = lr.predict(X_val)
        test_accuracy_k = np.sum(Y_pred==Y_val)
        test_accuracy += test_accuracy_k 

    return test_accuracy/n

print('CV Accuracy', cross_validation(X_train, Y_train))

k-cross-validation : 1 / 5
k-cross-validation : 2 / 5
k-cross-validation : 3 / 5
k-cross-validation : 4 / 5
k-cross-validation : 5 / 5
CV Accuracy 0.9616519174041298


## 予測

In [28]:
# テストデータを取得
month = '202208'
input_path = './output/'+month+'/viewed_videos.csv'
non_filtered_df_test = pd.read_csv(input_path)
df_test.head(2)

Unnamed: 0,Id,Name,ChannelId,Date,Title,Thumbnail,CategoryId,Duration,DurationOriginal,Description,ViewCount,LikeCount,DislikeCount,CommentCount,MyViewCount,VisitTimes
1,--uUHn9GLUY,ヰ世界情緒 -Isekaijoucho-,UCah4_WVjmr8XA7i5aigwV-Q,2022-06-11T10:00:00Z,【歌ってみた】熱愛発覚中 / covered by ヰ世界情緒,https://i.ytimg.com/vi/--uUHn9GLUY/hqdefault.jpg,10,221,PT3M41S,わたしの好きな歌を歌います。46曲目。\n\n\nカバーさせていただきました。\n「熱愛発覚...,166063,7420,0,156,2,"2022-06-13 12:17:31.491781,2022-06-13 12:34:15..."
2,-7mkO-IpY1Y,ば一ちゃるこれくしょん,UCbKfih0vuc_UOJDOi5C-riw,2022-06-23T12:29:48Z,【借りてきたイッヌ】ソニックアンバサダーに就任、さらに吹替版声優としてデビューしたころさん【...,https://i.ytimg.com/vi/-7mkO-IpY1Y/hqdefault.jpg,24,220,PT3M40S,▼元動画▼\nソニックステーションLIVE！ソニックバースデー＆『ソニックオリジンズ』発売記...,50712,913,0,70,1,2022-06-24 00:11:19.072884


In [29]:
# Descriptionがnullのデータを補完
non_filtered_df_test = non_filtered_df_test.fillna({'Description': 'Descriptionが記入されていません'})

# 適当に時間でフィルタリングして確認する
df_test = non_filtered_df_test[np.logical_and(non_filtered_df_test['Duration']<480, non_filtered_df_test['Duration']>90)]
print("再生時間によるフィルタリング後：", df_test.shape[0])

再生時間によるフィルタリング後： 647


In [30]:
# ここで新規データの予測

max_length = 512 # 最大で512
X_test = []

for i, row in df_test.iterrows():
    X_row = make_feature(row, unique_channel_id, unique_category_id)
    X_test.append(X_row)
X_test = np.array(X_test)

In [31]:
lr = LogisticRegression(penalty='l1', solver='liblinear')
lr.fit(X_train, Y_train)

LogisticRegression(penalty='l1', solver='liblinear')

In [32]:
Y_test = lr.predict(X_test)

In [35]:
df_test['Label'] = Y_test
df_test.to_csv('./output/'+month+'/labeled_viewed_videos.csv', index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
