In [1]:
import torch
from torch import nn
import torch.nn.functional as F
from transformers import AutoModel, AutoTokenizer
import pandas as pd

In [2]:
from tqdm.notebook import trange

In [3]:
# The tokenizer and the encoder weights are loaded from two different
# checkpoints. NOTE(review): this presumably relies on both checkpoints
# sharing one vocabulary — confirm against the encoder's model card.
TOKENIZER_CHECKPOINT = "cl-tohoku/bert-base-japanese-whole-word-masking"
ENCODER_CHECKPOINT = "bandainamco-mirai/distilbert-base-japanese"

tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_CHECKPOINT, max_length=512)
model = AutoModel.from_pretrained(ENCODER_CHECKPOINT)

Some weights of the model checkpoint at bandainamco-mirai/distilbert-base-japanese were not used when initializing DistilBertModel: ['vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [4]:
# NOTE: unpickling can execute arbitrary code — only load pickle files that
# were produced by a trusted source.
playlists_df = pd.read_pickle('all_playlists.pkl')
playlists_text = pd.read_pickle('playlists_text.pkl')

# One row per distinct category. reset_index() turns the Series into a frame
# whose "index" column holds the playlists_df row label of each category's
# first occurrence (drop_duplicates keeps the first by default).
categories = (
    playlists_df["category"]
    .drop_duplicates()
    .reset_index()
)

### カテゴリ名とプレイリストのテキストを分散表現（ベクトル）に変換する（保存処理は現在コメントアウト）

In [5]:
def getTextVector(model, tokenizer, text, max_length=512):
    """Embed ``text`` as the hidden state of the first token of the last layer.

    Args:
        model: Callable encoder; it is switched to eval mode, called with the
            token-id tensor, and must return an object exposing
            ``last_hidden_state``.
        tokenizer: Object whose ``encode`` accepts ``return_tensors``,
            ``truncation`` and ``max_length`` keyword arguments.
        text: The string to embed.
        max_length: Upper bound on the number of tokens passed to the model.
            Longer inputs are truncated instead of being fed to the encoder
            beyond its position limit. Defaults to 512, the value this
            notebook already passes when loading the tokenizer.

    Returns:
        ``last_hidden_state[:, 0, :]`` — the first-token vector for each row
        of the encoded batch (a single row for a single ``text``).
    """
    token_ids = tokenizer.encode(
        text,
        return_tensors='pt',
        truncation=True,
        max_length=max_length,
    )
    # Inference only: eval mode plus no_grad so no autograd graph is kept.
    model.eval()
    with torch.no_grad():
        output = model(token_ids)
    return output.last_hidden_state[:, 0, :]

カテゴリ

In [8]:
# Embed every category name, then stack the per-category vectors in one call.
# Collecting into a list and concatenating once avoids re-copying the growing
# tensor on every iteration, which the previous cat-inside-the-loop did.
category_vectors = [
    getTextVector(model, tokenizer, categories["category"][i])
    for i in trange(len(categories))
]
categories_tensor = torch.cat(category_vectors, 0)
# Optional: persist the embeddings so they need not be recomputed.
# torch.save(categories_tensor, 'categories_vectors.pt')

  0%|          | 0/44 [00:00<?, ?it/s]

プレイリスト全体

In [9]:
# Embed every playlist text, then stack the vectors in one call. Row i of
# tvs_tensor is the embedding of playlists_text[i]. Concatenating once at the
# end avoids re-copying the growing tensor on every iteration.
playlist_vectors = [
    getTextVector(model, tokenizer, playlists_text[i])
    for i in trange(len(playlists_text))
]
tvs_tensor = torch.cat(playlist_vectors, 0)

# Optional: persist the embeddings so they need not be recomputed.
# torch.save(tvs_tensor, 'playlist_text_vectors.pt')

  0%|          | 0/1760 [00:00<?, ?it/s]

In [10]:
#load_pt = torch.load('playlist_text_vectors.pt')

In [11]:
# Inspect the stacked playlist embeddings (rows follow the order of playlists_text).
tvs_tensor

tensor([[ 0.0010, -0.0024, -0.0277,  ..., -0.0048, -0.0022,  0.0038],
        [ 0.0015, -0.0046, -0.0253,  ..., -0.0068, -0.0100, -0.0002],
        [ 0.0039, -0.0005, -0.0245,  ..., -0.0117, -0.0028, -0.0063],
        ...,
        [ 0.0005, -0.0014, -0.0205,  ..., -0.0181, -0.0064, -0.0108],
        [ 0.0036, -0.0039, -0.0214,  ..., -0.0159, -0.0070, -0.0100],
        [ 0.0019, -0.0003, -0.0203,  ..., -0.0163, -0.0006, -0.0050]])

### 入力テキストとのコサイン類似度を計算

In [12]:
def calcCosineSimilarity(sampletv, datanum, tensor):
    """Cosine similarity between one query vector and the first rows of ``tensor``.

    Args:
        sampletv: Query tensor of shape ``(1, dim)``.
        datanum: Number of leading rows of ``tensor`` to compare against.
        tensor: Candidate tensor of shape ``(rows, dim)``.

    Returns:
        A float64 ``pandas.Series`` named ``0`` with ``datanum`` entries, where
        entry ``i`` is the cosine similarity between ``sampletv`` and
        ``tensor[i]``. An empty Series is returned when ``datanum`` is not
        positive.

    Raises:
        IndexError: If ``datanum`` exceeds the number of rows in ``tensor``.
    """
    if datanum > len(tensor):
        raise IndexError(
            f"datanum ({datanum}) exceeds the number of rows in tensor ({len(tensor)})"
        )

    # Clamp at zero so a negative count cannot slice from the end.
    candidates = tensor[:max(datanum, 0)]
    if candidates.shape[0] == 0:
        return pd.Series([], dtype="float64", name=0)

    # One broadcasted call replaces the per-row Python loop: (1, dim) against
    # (n, dim) reduced over dim=1 yields n similarities.
    sims = F.cosine_similarity(sampletv, candidates, dim=1)
    return pd.Series(sims.tolist(), dtype="float64", name=0)

似ているカテゴリを推定 → そのカテゴリ内で最も似ているプレイリストを検索

In [16]:
input_text = "home 聞きたい"
sampletv = getTextVector(model, tokenizer, input_text)

# Stage 1: pick the category whose name embedding is closest to the query.
print(" categ inference")
cos_sims = calcCosineSimilarity(sampletv, len(categories), categories_tensor)
most_sim = cos_sims.idxmax()
# Name of the winning category.
ctg_name = categories["category"][most_sim]
print(cos_sims[most_sim], ":", ctg_name)

# Stage 2: restrict the playlist embeddings to that category.
# ctg_positions records the row position (in playlists_df / tvs_tensor) of
# every playlist in the category, so the in-category winner can be mapped back
# without assuming that a category's rows are contiguous or that playlists_df
# has a default 0..n-1 index.
ctg_mask = torch.as_tensor((playlists_df["category"] == ctg_name).to_numpy())
ctg_positions = torch.nonzero(ctg_mask).flatten()
ctg_tvs_tensor = tvs_tensor[ctg_mask]

print("\n playlist inference")
cos_sims = calcCosineSimilarity(sampletv, len(ctg_tvs_tensor), ctg_tvs_tensor)
most_sim = cos_sims.idxmax()
# Translate the position inside the category subset to the global row position.
playlist_pos = int(ctg_positions[int(most_sim)])
print(cos_sims[most_sim], ":", playlists_text[playlist_pos])
playlistId = playlists_df["id"].iloc[playlist_pos]
print("id = ", playlistId)

 categ inference


  0%|          | 0/45 [00:00<?, ?it/s]

0.9959666132926941 : funk

 playlist inference


  0%|          | 0/25 [00:00<?, ?it/s]

0.9962126016616821 : funk Funk Outta Here Funk to these good ol' funky soul classics!
id =  37i9dQZF1DX70TzPK5buVf


カテゴリを経由せず、全プレイリストから直接最も似ているプレイリストを検索

In [17]:
# Query text to match against every playlist embedding.
input_text = "home 聞きたい"
sampletv = getTextVector(model, tokenizer, input_text)

# Score the query against all rows of tvs_tensor and keep the best one.
cos_sims = calcCosineSimilarity(
    sampletv,
    len(tvs_tensor),
    tvs_tensor,
)
most_sim = cos_sims.idxmax()

# Report the winning score, its playlist text, and the playlist id.
print("similarity: ", cos_sims[most_sim])
print("playlist: ", playlists_text[most_sim])
playlistId = playlists_df["id"][most_sim]
print("id = ", playlistId)

  0%|          | 0/1761 [00:00<?, ?it/s]

similarity:  0.997899055480957
playlist:  karaoke 歌うJ-Rap - Sing-Along - Spotifyの新機能シンガロングを使って、ラッパー気分！*歌詞ページ左下の🎤ボタンを押してお楽しみください。
id =  37i9dQZF1DXbXiYi0xyeiI
