In [1]:
# Single-sentence samples. sentences1 holds five sentences with different
# subjects/activities; every entry of sentences2 starts with "私は公園で".
# NOTE: both names are reassigned by the next cell, so later cells see those
# values instead of the ones defined here.
sentences1 = [
    "私は公園で遊びます。",
    "彼女は本を読みます。",
    "彼はテニスをします。",
    "私たちは映画を見に行きます。",
    "私は日本語を勉強しています。",
]

sentences2 = [
    "私は公園で遊びます。",
    "私は公園で楽しみます。",
    "私は公園で転びます。",
    # NOTE(review): this entry has no trailing "。" unlike the others — confirm intent.
    "私は公園で騒ぎます",
    "私は公園で歩きます。",
]


In [2]:
# Two-sentence samples: each entry is two sentences joined by one space
# (calc_similarity_word2vec splits its inputs on " " before tokenising).
# These assignments replace the single-sentence lists from the previous cell.
sentences1 = [
    "私は公園で遊びます。 彼女は本を読みます。",
    "彼はテニスをします。 私たちは映画を見に行きます。",
    "私は公園で騒ぎます。 私は日本語を勉強しています。",
]

sentences2 = [
    "私は公園で遊びます。 私は公園で楽しみます。",
    # NOTE(review): the second sentence below has no trailing "。" — confirm intent.
    "私は公園で転びます。 私は公園で騒ぎます",
    "私は公園で転びます。 私は公園で歩きます。",
]

Embedding methods:

- Pre-trained BERT
- word2vec
- doc2vec
- Sentence-BERT

In [3]:
# pre-trained BERT

import numpy as np
import torch
from transformers import BertJapaneseTokenizer, BertModel

# Checkpoint identifier passed to both from_pretrained() calls below.
BERT_MODEL_NAME = 'cl-tohoku/bert-base-japanese-whole-word-masking'
# Module-level tokenizer and bare BertModel loaded from that checkpoint.
# sentence_to_vector_bert receives these as explicit arguments rather than
# reading the globals.
tokenizer = BertJapaneseTokenizer.from_pretrained(BERT_MODEL_NAME)
model_bert = BertModel.from_pretrained(BERT_MODEL_NAME)

def sentence_to_vector_bert(model_bert, tokenizer, sentence):
    """Encode one sentence and return the model's vector at token position 0.

    The sentence is tokenised with special tokens enabled, wrapped in a batch
    of size 1 and passed through ``model_bert`` with gradients disabled. The
    result is the first token's row of ``outputs[0]`` (for BERT tokenizers
    position 0 is the ``[CLS]`` token when special tokens are added).

    Args:
        model_bert: Callable model; ``model_bert(input_ids)[0]`` must be
            indexable as ``[batch, token, feature]``.
        tokenizer: Callable returning a mapping with an ``"input_ids"`` list
            for a single text.
        sentence: The text to encode (one sentence/string per call; the token
            ids are placed in a single-item batch).

    Returns:
        A 1-D ``torch.Tensor`` holding the position-0 vector.
    """
    token_ids = tokenizer(sentence, add_special_tokens=True)["input_ids"]
    # Leading batch dimension of size 1 -> shape (1, seq_len).
    input_ids = torch.tensor(token_ids).unsqueeze(0)
    with torch.no_grad():
        # Only outputs[0] is used, so the per-layer hidden states are not
        # requested (the earlier version asked for them and discarded them).
        outputs = model_bert(input_ids)
    # The batch holds exactly one item, so taking row 0 equals the former
    # mean over the batch axis. clone() returns an independent tensor instead
    # of a view that would keep the whole output alive.
    return outputs[0][0, 0, :].clone()


Some weights of the model checkpoint at cl-tohoku/bert-base-japanese-whole-word-masking were not used when initializing BertModel: ['cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [4]:
# sentence_to_vector_bert(model_bert, tokenizer, sentences1[0])

In [5]:
# word2vec

import csv
import os

import MeCab
import numpy as np
from gensim.models import KeyedVectors
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity

# Word vectors stored in word2vec binary format (binary=True).
# Reference link kept from the original cell:
# http://www.cl.ecei.tohoku.ac.jp/~m-suzuki/jawiki_vector/
#
# The file location can be overridden with the ENTITY_VECTOR_MODEL_PATH
# environment variable so the notebook runs on other machines; the default is
# the path that was previously hard-coded. Despite its name, model_dir holds
# the path of the model *file*.
model_dir = os.environ.get(
    "ENTITY_VECTOR_MODEL_PATH",
    '/Users/iomacbookair2/Documents/lab/DEIM2023/entity_vector/entity_vector.model.bin',
)
model_word2vec = KeyedVectors.load_word2vec_format(model_dir, binary=True)


In [6]:
def _mecab_surface_forms(tagger, text):
    """Return the surface form of every token ``tagger`` finds in ``text``."""
    surfaces = []
    # The text is split on single spaces and each chunk is parsed on its own.
    for chunk in text.split(" "):
        # Each kept line is "<surface>\t<features>". The two trailing entries
        # of the split are dropped (presumably MeCab's "EOS" marker and the
        # empty string after the final newline).
        parsed_lines = tagger.parse(chunk).split("\n")[:-2]
        surfaces.extend(line.split("\t")[0] for line in parsed_lines)
    return surfaces


def _mean_word_vector(model, words, label):
    """Average the vectors of the in-vocabulary ``words``; fail if none exist."""
    vectors = [model[word] for word in words if word in model]
    if not vectors:
        raise ValueError(
            f"{label} has no tokens in the word2vec vocabulary: {words}")
    return np.mean(vectors, axis=0)


def calc_similarity_word2vec(sentence1, sentence2, model, tagger=None):
    """Cosine similarity between the mean word vectors of two texts.

    Both texts are tokenised with MeCab, each token's surface form is looked
    up in ``model``, and the per-text mean vectors are compared. The token
    lists are printed, as before.

    Args:
        sentence1: First text; may contain several sentences separated by " ".
        sentence2: Second text, same format.
        model: Word-vector lookup supporting ``word in model`` and
            ``model[word]`` (e.g. gensim ``KeyedVectors``).
        tagger: Optional object with a ``parse(text)`` method. A new
            ``MeCab.Tagger()`` is created when omitted; pass one in to avoid
            rebuilding it on every call.

    Returns:
        The cosine similarity of the two mean vectors.

    Raises:
        ValueError: If one of the texts has no token present in ``model``.
            Tokens missing from the vocabulary are otherwise skipped instead
            of raising ``KeyError``.
    """
    if tagger is None:
        tagger = MeCab.Tagger()

    sentence1_words = _mecab_surface_forms(tagger, sentence1)
    print(f"sentence1_words: {sentence1_words}")

    sentence2_words = _mecab_surface_forms(tagger, sentence2)
    print(f"sentence2_words: {sentence2_words}")

    # One embedding per text: the mean of its in-vocabulary word vectors.
    sentence1_embedding = _mean_word_vector(model, sentence1_words, "sentence1")
    sentence2_embedding = _mean_word_vector(model, sentence2_words, "sentence2")

    # cosine_similarity works on 2-D inputs and returns a 1x1 matrix here.
    similarity = cosine_similarity(
        [sentence1_embedding], [sentence2_embedding])[0][0]

    return similarity


def calc_average_similarity_word2vec(sentences, model=None):
    """Summarise the pairwise word2vec similarities within ``sentences``.

    Every unordered pair ``(sentences[i], sentences[j])`` with ``i < j`` is
    scored with ``calc_similarity_word2vec``.

    Args:
        sentences: Sequence of texts to compare with each other.
        model: Word-vector model handed to ``calc_similarity_word2vec``.
            Defaults to the module-level ``model_word2vec``.

    Returns:
        A tuple ``(quartiles, width_q, size)`` where ``quartiles`` maps
        ``"quantile_0"`` .. ``"quantile_4"`` to the 0/0.25/0.5/0.75/1
        quantiles and ``"average_similarity"`` to the mean, ``width_q`` is
        ``quantile_3 - quantile_1``, and ``size`` is the number of pairs.

    Raises:
        ValueError: If fewer than two sentences are given, so there is no
            pair to compare.
    """
    if model is None:
        model = model_word2vec

    similarities = [
        calc_similarity_word2vec(sentences[i], sentences[j], model)
        for i in range(len(sentences))
        for j in range(i + 1, len(sentences))
    ]
    if not similarities:
        raise ValueError(
            "at least two sentences are required to compute pairwise similarities")

    q = [0, 0.25, 0.5, 0.75, 1]
    # One scalar np.quantile call per level, keyed by the level's index in q.
    quartiles = {
        f"quantile_{i}": np.quantile(similarities, level)
        for i, level in enumerate(q)
    }
    quartiles["average_similarity"] = np.mean(similarities)
    size = len(similarities)
    # Distance between the 0.75 and 0.25 quantiles.
    width_q = quartiles["quantile_3"] - quartiles["quantile_1"]

    return quartiles, width_q, size


In [7]:
# Pairwise word2vec similarity statistics over sentences1.
quartiles, width_q, size = calc_average_similarity_word2vec(sentences1)
# Bare tuple as the last expression so the cell displays the three results.
quartiles, width_q, size


sentence1_words: ['私', 'は', '公園', 'で', '遊び', 'ます', '。', '彼女', 'は', '本', 'を', '読み', 'ます', '。']
sentence2_words: ['彼', 'は', 'テニス', 'を', 'し', 'ます', '。', '私', 'たち', 'は', '映画', 'を', '見', 'に', '行き', 'ます', '。']
sentence1_words: ['私', 'は', '公園', 'で', '遊び', 'ます', '。', '彼女', 'は', '本', 'を', '読み', 'ます', '。']
sentence2_words: ['私', 'は', '公園', 'で', '騒ぎ', 'ます', '。', '私', 'は', '日本語', 'を', '勉強', 'し', 'て', 'い', 'ます', '。']
sentence1_words: ['彼', 'は', 'テニス', 'を', 'し', 'ます', '。', '私', 'たち', 'は', '映画', 'を', '見', 'に', '行き', 'ます', '。']
sentence2_words: ['私', 'は', '公園', 'で', '騒ぎ', 'ます', '。', '私', 'は', '日本語', 'を', '勉強', 'し', 'て', 'い', 'ます', '。']


({'quantile_0': 0.9181925,
  'quantile_1': 0.9320336282253265,
  'quantile_2': 0.9458747506141663,
  'quantile_3': 0.9484353065490723,
  'quantile_4': 0.95099586,
  'average_similarity': 0.9383543},
 0.016401678323745728,
 3)

In [8]:
# Same statistics over sentences2; this reassigns quartiles, width_q and size
# from the previous cell.
quartiles, width_q, size = calc_average_similarity_word2vec(sentences2)
# Bare tuple as the last expression so the cell displays the three results.
quartiles, width_q, size


sentence1_words: ['私', 'は', '公園', 'で', '遊び', 'ます', '。', '私', 'は', '公園', 'で', '楽しみ', 'ます', '。']
sentence2_words: ['私', 'は', '公園', 'で', '転び', 'ます', '。', '私', 'は', '公園', 'で', '騒ぎ', 'ます']
sentence1_words: ['私', 'は', '公園', 'で', '遊び', 'ます', '。', '私', 'は', '公園', 'で', '楽しみ', 'ます', '。']
sentence2_words: ['私', 'は', '公園', 'で', '転び', 'ます', '。', '私', 'は', '公園', 'で', '歩き', 'ます', '。']
sentence1_words: ['私', 'は', '公園', 'で', '転び', 'ます', '。', '私', 'は', '公園', 'で', '騒ぎ', 'ます']
sentence2_words: ['私', 'は', '公園', 'で', '転び', 'ます', '。', '私', 'は', '公園', 'で', '歩き', 'ます', '。']


({'quantile_0': 0.9716927,
  'quantile_1': 0.9763145446777344,
  'quantile_2': 0.9809364080429077,
  'quantile_3': 0.9834160506725311,
  'quantile_4': 0.9858957,
  'average_similarity': 0.9795082},
 0.007101505994796753,
 3)