In [2]:
import logging
from gensim.models import Word2Vec, KeyedVectors

from wiki_analyzer.corpus import SQLiteCorpus
from wiki_analyzer.config import (
    DATABASE_PATH,
    WIKI_MODEL_PATH,
    WORD2_VEC_MODEL_PATH
)

# Log at INFO level with a "timestamp : level : message" layout so that the
# progress messages of the long-running cells below are visible.
LOG_FORMAT = '%(asctime)s : %(levelname)s : %(message)s'
logging.basicConfig(level=logging.INFO, format=LOG_FORMAT)

# Plain-string form of the model path, used by the save/load calls below.
wiki_model_path_str = str(WIKI_MODEL_PATH)

# Next: build the word2vec corpus from the Wikipedia data.

In [2]:
# Open the corpus backed by the SQLite database.
corpus = SQLiteCorpus(DATABASE_PATH)

# Word2Vec hyperparameters, gathered in one place so they are easy to tune.
word2vec_params = dict(
    vector_size=200,  # dimensionality of the word vectors
    window=5,         # context window size
    min_count=5,      # drop words that occur fewer times than this
    workers=4,        # number of worker threads
    sample=1e-3,      # down-sampling threshold for frequent words
    negative=5,       # number of negative samples
    sg=0,             # 0: CBOW, 1: Skip-gram
)

# Train the Word2Vec model on the corpus.
model = Word2Vec(sentences=corpus, **word2vec_params)

# Save the trained model to disk.
model.save(wiki_model_path_str)

2025-01-02 23:00:39,740 : INFO : collecting all words and their counts
2025-01-02 23:01:27,294 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2025-01-02 23:01:31,467 : INFO : PROGRESS: at sentence #10000, processed 16484481 words, keeping 331548 word types
2025-01-02 23:01:34,794 : INFO : PROGRESS: at sentence #20000, processed 29768024 words, keeping 465511 word types
2025-01-02 23:01:37,500 : INFO : PROGRESS: at sentence #30000, processed 40677516 words, keeping 562895 word types
2025-01-02 23:01:40,030 : INFO : PROGRESS: at sentence #40000, processed 50602426 words, keeping 636536 word types
2025-01-02 23:01:42,381 : INFO : PROGRESS: at sentence #50000, processed 59654618 words, keeping 699853 word types
2025-01-02 23:01:44,914 : INFO : PROGRESS: at sentence #60000, processed 68330797 words, keeping 756248 word types
2025-01-02 23:01:47,400 : INFO : PROGRESS: at sentence #70000, processed 76570782 words, keeping 806389 word types
2025-01-02 23:01:49,436 :

In [3]:
# Reload the trained model from disk.
model = Word2Vec.load(wiki_model_path_str)

# Export just the word vectors in the binary word2vec format.
trained_vectors = model.wv
trained_vectors.save_word2vec_format(WORD2_VEC_MODEL_PATH, binary=True)

2025-01-03 00:35:27,229 : INFO : loading Word2Vec object from data\word2vec.model
2025-01-03 00:35:27,929 : INFO : loading wv recursively from data\word2vec.model.wv.* with mmap=None
2025-01-03 00:35:27,930 : INFO : loading vectors from data\word2vec.model.wv.vectors.npy with mmap=None
2025-01-03 00:35:29,315 : INFO : loading syn1neg from data\word2vec.model.syn1neg.npy with mmap=None
2025-01-03 00:35:30,665 : INFO : setting ignored attribute cum_table to None
2025-01-03 00:35:38,036 : INFO : Word2Vec lifecycle event {'fname': 'data\\word2vec.model', 'datetime': '2025-01-03T00:35:38.036227', 'gensim': '4.3.3', 'python': '3.10.10 (tags/v3.10.10:aad5f6a, Feb  7 2023, 17:20:36) [MSC v.1929 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22635-SP0', 'event': 'loaded'}
2025-01-03 00:35:39,364 : INFO : storing 1069493x200 projection weights into data\word2vec.model.pt


In [3]:
# Load the exported word vectors (binary word2vec format, UTF-8 vocabulary)
# as a KeyedVectors instance for the similarity queries below.
wv = KeyedVectors.load_word2vec_format(
    WORD2_VEC_MODEL_PATH,
    binary=True,
    encoding='utf-8',
)

2025-01-03 01:25:16,916 : INFO : loading projection weights from data\word2vec.model.pt
2025-01-03 01:25:31,925 : INFO : KeyedVectors lifecycle event {'msg': 'loaded (1069493, 200) matrix of type float32 from data\\word2vec.model.pt', 'binary': True, 'encoding': 'utf-8', 'datetime': '2025-01-03T01:25:31.925179', 'gensim': '4.3.3', 'python': '3.10.10 (tags/v3.10.10:aad5f6a, Feb  7 2023, 17:20:36) [MSC v.1929 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22635-SP0', 'event': 'load_word2vec_format'}


In [11]:
# Look up the words most similar to a word entered by the user.
TOP_N = 20  # number of neighbours to request; also used in the header line

target_word = input('Enter a word: ')

if target_word not in wv:
    # The word is not in the model's vocabulary. Querying it anyway would
    # raise KeyError, so report the problem and skip the lookup.
    print(f'{target_word} is not in the vocabulary.')
else:
    results = wv.most_similar(positive=target_word, topn=TOP_N)
    print(f'Top {TOP_N} words similar to {target_word}:')

    # Show each (word, similarity score) pair on its own line.
    for result in results:
        print(result)

Top 10 words similar to 月姫:
('真月譚', 0.7661639451980591)
('ドラゴンクエストIV', 0.7615541219711304)
('Fate', 0.7576828002929688)
('ふしぎ遊戯', 0.7534662485122681)
('ギャラクシーエンジェル', 0.7447072267532349)
('涼宮ハルヒシリーズ', 0.7416298389434814)
('天地無用!', 0.7401193976402283)
('咲-Saki-', 0.7382599711418152)
('Kanon', 0.7350034713745117)
('げんしけん', 0.733918309211731)
('ロードス島戦記', 0.7335675954818726)
('うたわれるもの', 0.7328515648841858)
('CLANNAD', 0.7322487831115723)
('遊☆戯☆王デュエルモンスターズGX', 0.7306022644042969)
('三國志', 0.7305446267127991)
('とある魔術の禁書目録', 0.7282618284225464)
('スクールランブル', 0.7276273965835571)
('魔法少女リリカルなのは', 0.7268280386924744)
('スレイヤーズ', 0.7266961336135864)
('ときめきメモリアル', 0.7260203957557678)


In [12]:
# Compute the similarity score between two specific words.
word_pair = ('Fate', 'TYPE-MOON')
similarity = wv.similarity(*word_pair)
print(similarity)

0.5723701


In [15]:
# Vector arithmetic between words: 'Fate' and '月姫' contribute positively,
# '魔法少女リリカルなのは' contributes negatively to the query.
results = wv.most_similar(
    positive=['Fate', '月姫'],
    negative=['魔法少女リリカルなのは'],
)

# Show each (word, similarity score) pair on its own line.
for neighbour in results:
    print(neighbour)

('Kanon', 0.6125396490097046)
('スーパーロボット大戦シリーズ', 0.6076908707618713)
('涼宮ハルヒシリーズ', 0.6052647233009338)
('ロードス島戦記', 0.6046294569969177)
('スーパードンキーコング', 0.6044653058052063)
('聖剣伝説', 0.6006618738174438)
('ときめきメモリアル', 0.5943046808242798)
('ファイアーエムブレム', 0.5925250053405762)
('.hack', 0.5915224552154541)
('天外魔境', 0.5886648297309875)
