In [1]:
import logging
from gensim.models import Word2Vec, KeyedVectors

from wiki_analyzer.corpus_manager import SQLiteCorpus
from wiki_analyzer.config import (
    DB_PATH,
    WIKI_MODEL_PATH,
    WORD2_VEC_MODEL_PATH
)

# gensim's save/load calls below are given a plain string path.
wiki_model_path_str = str(WIKI_MODEL_PATH)

# Show INFO-level log records (timestamp : level : message) in the cell output.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

# Build the word2vec corpus from the Wikipedia data

In [2]:
# Corpus object constructed from DB_PATH; it is handed to Word2Vec as the
# sentence source.
corpus = SQLiteCorpus(DB_PATH)

# Train the Word2Vec model on the corpus.
model = Word2Vec(
    corpus,
    sg=0,  # 0: CBOW, 1: Skip-gram
    vector_size=200,
    window=5,
    negative=5,
    sample=1e-3,
    min_count=5,
    workers=4,
)

# Persist the trained model so later cells can reload it without retraining.
model.save(wiki_model_path_str)

2024-05-29 22:49:59,540 : INFO : collecting all words and their counts
2024-05-29 22:52:48,135 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2024-05-29 22:52:55,387 : INFO : PROGRESS: at sentence #10000, processed 13352025 words, keeping 288409 word types
2024-05-29 22:53:03,384 : INFO : PROGRESS: at sentence #20000, processed 23556931 words, keeping 408139 word types
2024-05-29 22:53:08,399 : INFO : PROGRESS: at sentence #30000, processed 31701601 words, keeping 491842 word types
2024-05-29 22:53:13,416 : INFO : PROGRESS: at sentence #40000, processed 40123853 words, keeping 565150 word types
2024-05-29 22:53:17,271 : INFO : PROGRESS: at sentence #50000, processed 47266457 words, keeping 622157 word types
2024-05-29 22:53:21,960 : INFO : PROGRESS: at sentence #60000, processed 54410060 words, keeping 676361 word types
2024-05-29 22:53:24,988 : INFO : PROGRESS: at sentence #70000, processed 61090809 words, keeping 724588 word types
2024-05-29 22:53:28,920 :

In [3]:
# Reload the full model that was saved to wiki_model_path_str.
model = Word2Vec.load(wiki_model_path_str)

# Export only the word vectors in the binary word2vec format.
model.wv.save_word2vec_format(
    fname=WORD2_VEC_MODEL_PATH,
    binary=True,
)

2024-05-30 00:53:51,301 : INFO : loading Word2Vec object from data\word2vec.model
2024-05-30 00:53:52,475 : INFO : loading wv recursively from data\word2vec.model.wv.* with mmap=None
2024-05-30 00:53:52,477 : INFO : loading vectors from data\word2vec.model.wv.vectors.npy with mmap=None
2024-05-30 00:53:53,369 : INFO : loading syn1neg from data\word2vec.model.syn1neg.npy with mmap=None
2024-05-30 00:53:54,154 : INFO : setting ignored attribute cum_table to None
2024-05-30 00:54:05,931 : INFO : Word2Vec lifecycle event {'fname': 'data\\word2vec.model', 'datetime': '2024-05-30T00:54:05.931421', 'gensim': '4.3.2', 'python': '3.11.4 (tags/v3.11.4:d2340ef, Jun  7 2023, 05:45:37) [MSC v.1934 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22635-SP0', 'event': 'loaded'}
2024-05-30 00:54:07,684 : INFO : storing 993253x200 projection weights into data\word2vec.model.pt


In [2]:
# Load the word vectors from the binary word2vec file written at
# WORD2_VEC_MODEL_PATH; `wv` is what the query cells below operate on.
wv = KeyedVectors.load_word2vec_format(
    fname=WORD2_VEC_MODEL_PATH,
    encoding='utf-8',
    binary=True,
)

2024-10-28 16:43:55,731 : INFO : loading projection weights from data\word2vec.model.pt
2024-10-28 16:44:02,035 : INFO : KeyedVectors lifecycle event {'msg': 'loaded (993253, 200) matrix of type float32 from data\\word2vec.model.pt', 'binary': True, 'encoding': 'utf-8', 'datetime': '2024-10-28T16:44:02.035730', 'gensim': '4.3.2', 'python': '3.11.4 (tags/v3.11.4:d2340ef, Jun  7 2023, 05:45:37) [MSC v.1934 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22635-SP0', 'event': 'load_word2vec_format'}


In [38]:
# Look up the words most similar to a user-supplied word.
#
# Reads one word from stdin, checks that it is in the loaded vectors (`wv`),
# and prints the `top_n` nearest entries as (word, similarity) tuples.
top_n = 20  # single source of truth for both the query and the printed header
target_word = input('Enter a word: ')

if target_word in wv:
    results = wv.most_similar(positive=target_word, topn=top_n)
    print(f'Top {top_n} words similar to {target_word}:')

    # Show the results, one (word, similarity) tuple per line.
    for result in results:
        print(result)
else:
    # The word is not in the model's vocabulary. Report it and fall through;
    # calling exit() here would shut down the Jupyter kernel and throw away
    # the vectors that were loaded in an earlier cell.
    print(f'{target_word} is not in the vocabulary.')

Top 10 words similar to 呪術廻戦:
('HUNTER×HUNTER', 0.7779816389083862)
('チェンソーマン', 0.7658897638320923)
('進撃の巨人', 0.749148428440094)
('僕のヒーローアカデミア', 0.7490714192390442)
('カードキャプターさくら', 0.7444055676460266)
('ガールズ&パンツァー', 0.7430760860443115)
('幽☆遊☆白書', 0.7356335520744324)
('天才バカボン', 0.7304477095603943)
('魔法先生ネギま!', 0.7299984693527222)
('あの日見た花の名前を僕達はまだ知らない。', 0.7292018532752991)
('銀魂', 0.7291019558906555)
('おそ松さん', 0.7290772199630737)
('咲-Saki-', 0.7281158566474915)
('ハヤテのごとく!', 0.7250310778617859)
('ローゼンメイデン', 0.7234790325164795)
('ジョジョの奇妙な冒険', 0.7226316928863525)
('ハクション大魔王', 0.7222316265106201)
('家庭教師ヒットマンREBORN!', 0.7217860817909241)
('STEINS;GATE', 0.7207491993904114)
('ハイキュー!!', 0.717651903629303)


In [39]:
# Compute the similarity score between two specific words and print it.
similarity = wv.similarity(
    w1='HUNTER×HUNTER',
    w2='呪術廻戦',
)
print(similarity)

0.7779816
