In [1]:
#必要ライブラリのインポート
#pip install --upgrade jupyter ipywidgets

import os
import json
import torch
import numpy as np
from torch.nn.functional import cosine_similarity
from sentence_transformers import SentenceTransformer

In [2]:
#モデルの定義
model = SentenceTransformer('all-MiniLM-L12-v2')

In [3]:
#使用データの選択
dataset_path = '/taiga/Datasets/scicap_plus'

cap_path = 'captions/test/1001.0196v1-Figure2-1.json' #captionデータ1
men_paths = [
    'mentions_paragraph/test/1001.0196v1-Figure2-1_mentions.npy', #mentionデータ1
    'mentions_paragraph/test/1001.1020v1-Figure1-1_mentions.npy', #mentionデータ2
    'mentions_paragraph/test/1001.3663v1-Figure4-1_mentions.npy', #mentionデータ3
    'mentions_paragraph/test/1001.3689v1-Figure5-1_mentions.npy', #mentionデータ4
    'mentions_paragraph/test/1001.4519v1-Figure6-1_mentions.npy', #mentionデータ5
]


datas = []

#capデータ
with open(os.path.join(dataset_path, cap_path), 'r', encoding='utf-8') as file:
    data = json.load(file)
    datas.append(data['2-normalized']['2-1-basic-num']['caption']) #captionデータは'2-normalized'されたデータを使用

#menデータ
for men_path in men_paths:
    data = np.load(os.path.join(dataset_path, men_path), allow_pickle=True).item()
    datas.append(data['mentions'][0])

In [4]:
#sentence_embedingの実行
embed_datas = []

for data in datas:
    embed_sentence = model.encode(data)
    embed_datas.append(embed_sentence)

print("embed_sentences : ", np.shape(embed_sentence)) #埋め込みベクトルの埋め込み次元数の確認 >> (384,) 384次元のベクトルが生成されていることを確認

embed_sentences :  (384,)


In [5]:
# data[0] (cap1) とdat[1:] (men1-5) の埋め込み表現のコサイン類似度を算出

similarity_scores = []
print("caption : ", datas[0])
for index, embed_data in enumerate(embed_datas[1:]):
     similarity_score = cosine_similarity(torch.unsqueeze(torch.tensor(embed_datas[0]), dim=0), torch.unsqueeze(torch.tensor(embed_data), dim=0))
     similarity_scores.append(similarity_score)
     print('-'*50)
     print(f"cap1, men{index+1} : {similarity_score}")
     print("mention : ", datas[index+1])

caption :  write performance of the wan file systems .
--------------------------------------------------
cap1, men1 : tensor([0.5802])
mention :  operation in all our measurements to include the cost of cache flushes. The throughput for the write and read performance is shown in Figure 2 and Figure 3.
--------------------------------------------------
cap1, men2 : tensor([0.1575])
mention :  Figure 1 provides the comparisons on the six (correlated) noise datasets: M-Noise1 to M-Noise6. Table 4 compares the error rates on M-Basic, M-Rotate, M-Image, M-Rand, and M-RotImg.
--------------------------------------------------
cap1, men3 : tensor([0.1917])
mention :  This section shows the comparison of the topological measurements across the three level collaboration networks. Figure 4 depicts the network sizes at the three level collaboration networks over the eight years. Because the author level collaboration networks have a large number of nodes and edges than corresponding networks at 

In [6]:
def extract_first_num_words(text, num=10):
    # 文字列を空白で分割して単語のリストを作成
    words = text.split()
    # 最初の10単語を取得し、空白で連結して文字列にする
    first_num_words = ' '.join(words[:num])
    return first_num_words

In [7]:
#文章の長さに関係があるのか調査
#全文章を先頭から20単語分のみ抽出

extracted_datas = []
for data in datas:
    extract_text = extract_first_num_words(text=data, num=20)
    extracted_datas.append(extract_text)


#sentence_embedingの実行
embed_extracted_datas = []

for extracted_data in extracted_datas:
    embed_extracted_sentence = model.encode(extracted_data)
    embed_extracted_datas.append(embed_extracted_sentence)

print("embed_sentences : ", np.shape(embed_extracted_sentence)) #埋め込みベクトルの埋め込み次元数の確認 >> (384,) 384次元のベクトルが生成されていることを確認

# data[0] (cap1) とdat[1:] (men1-5) の埋め込み表現のコサイン類似度を算出

similarity_scores_extracted = []
print("caption : ", datas[0])
for index, embed_extracted_data in enumerate(embed_extracted_datas[1:]):
     similarity_score_extracted = cosine_similarity(torch.unsqueeze(torch.tensor(embed_extracted_datas[0]), dim=0), torch.unsqueeze(torch.tensor(embed_extracted_data), dim=0))
     similarity_scores_extracted.append(similarity_score_extracted)
     print('-'*50)
     print(f"cap1, men{index+1} : {similarity_score_extracted}")
     print("mention : ", datas[index+1])

embed_sentences :  (384,)
caption :  write performance of the wan file systems .
--------------------------------------------------
cap1, men1 : tensor([0.5995])
mention :  operation in all our measurements to include the cost of cache flushes. The throughput for the write and read performance is shown in Figure 2 and Figure 3.
--------------------------------------------------
cap1, men2 : tensor([0.1617])
mention :  Figure 1 provides the comparisons on the six (correlated) noise datasets: M-Noise1 to M-Noise6. Table 4 compares the error rates on M-Basic, M-Rotate, M-Image, M-Rand, and M-RotImg.
--------------------------------------------------
cap1, men3 : tensor([0.2110])
mention :  This section shows the comparison of the topological measurements across the three level collaboration networks. Figure 4 depicts the network sizes at the three level collaboration networks over the eight years. Because the author level collaboration networks have a large number of nodes and edges than 