In [7]:
import re
import pickle
from janome.tokenizer import Tokenizer

# Read the raw text of the novel from disk.
with open("wagahaiwa_nekodearu.txt", mode="r", encoding="utf-8") as f:
    wagahai_original = f.read()

# Strip markup that should not reach the tokenizer, one pass per kind.
wagahai = re.sub("《[^》]+》", "", wagahai_original)  # ruby (reading) annotations 《…》
wagahai = re.sub("［[^］]+］", "", wagahai)  # bracketed reading notes ［…］
wagahai = re.sub("[｜ 　「」\n]", "", wagahai)  # ｜, half/full-width spaces, 「」 quotes, newlines

seperator = "。"  # sentences are delimited by the Japanese full stop
wagahai_list = wagahai.split(seperator)  # one piece per sentence, separator removed
# split() only yields a trailing "" when the text ends with the separator.
# Drop just that empty piece, so that a final sentence lacking a closing "。"
# is kept instead of being silently discarded.
if not wagahai_list[-1]:
    wagahai_list.pop()
wagahai_list = [x + seperator for x in wagahai_list]  # re-attach "。" to every sentence
        
t = Tokenizer()

# Split every sentence into words (wakati mode), giving one list of words
# per sentence.
# list() materializes the result: Janome >= 0.4.0 returns a generator from
# tokenize(), which cannot be pickled and can only be iterated once. On older
# versions that already return a list, list() is a harmless copy.
wagahai_words = [list(t.tokenize(sentence, wakati=True)) for sentence in wagahai_list]

# Persist the tokenized corpus so later cells can reload it without re-tokenizing.
with open('wagahai_words.pickle', mode='wb') as f:
    pickle.dump(wagahai_words, f)

In [9]:
# Reload the tokenized corpus from the pickle file.
# NOTE: unpickling can execute arbitrary code, so only load a pickle file
# that was produced locally / comes from a trusted source.
with open('wagahai_words.pickle', 'rb') as pickle_file:
    wagahai_words = pickle.load(pickle_file)

In [11]:
from gensim.models import word2vec

# Training hyperparameters, using gensim 3.x keyword names.
# NOTE(review): gensim >= 4.0 renamed `size` -> `vector_size` and
# `iter` -> `epochs`; adjust here if the environment is upgraded.
w2v_params = dict(
    size=100,     # dimensionality of each word vector
    min_count=5,  # ignore words that occur fewer than 5 times
    window=5,     # context window size
    iter=20,      # number of passes over the corpus
    sg=0,         # 0 selects CBOW, 1 would select skip-gram
)

# Train the model on the tokenized sentences.
model = word2vec.Word2Vec(wagahai_words, **w2v_params)

In [12]:
# Inspect the learned embedding matrix: its shape first, then its contents.
embedding_matrix = model.wv.vectors
print(embedding_matrix.shape)
print(embedding_matrix)

(3309, 100)
[[ 0.3477761  -0.26250306  0.21070902 ...  0.1269266   0.89546144
   0.44100738]
 [-0.09539624 -1.2965348  -0.6089768  ... -0.92097396  0.3817033
  -1.3129643 ]
 [-0.7266322   0.24112898 -0.16109174 ...  0.08816173  0.8394262
   0.47669557]
 ...
 [-0.20145875 -0.08004301  0.00278761 ...  0.21030714 -0.26648027
   0.21512267]
 [ 0.04634978  0.16751134  0.05343492 ...  0.106382    0.0874413
  -0.08753338]
 [ 0.09112152  0.04584696  0.02083238 ... -0.00239505 -0.01751359
  -0.09843313]]


In [14]:
# Show the vocabulary size and its first ten entries.
# NOTE(review): `index2word` is the gensim 3.x attribute name; gensim >= 4.0
# exposes the same list as `index_to_key`.
vocab = model.wv.index2word
print(len(vocab))
print(vocab[:10])

3309
['の', '。', 'て', '、', 'は', 'に', 'を', 'と', 'が', 'た']


In [16]:
# Print row 0 of the embedding matrix next to the vector looked up by word,
# so the two can be compared by eye.
print(model.wv.vectors[0])
print(model.wv["の"])  # subscript lookup; equivalent to calling __getitem__

[ 0.3477761  -0.26250306  0.21070902 -0.30162653 -1.0978594   0.12515591
 -0.43797913  0.6782027   0.6267708   0.34921595 -0.00596868  1.0759177
 -1.3425335   0.36570206  0.50163573 -0.4645328   0.68633944  0.24988528
 -0.5217439   0.31614736 -0.2586805  -1.0627158   0.14498131 -0.67572314
  0.50936216  1.0069667   0.05948824 -0.89010143  0.609023    0.7215569
 -0.6875353  -1.0485088   1.033036   -0.5729574   0.67277384 -0.48509452
 -0.21755464  0.7032668  -0.46789923  0.08748992  0.3638896  -0.08979254
 -0.8436585  -0.49327317  0.5899161   0.00321887 -0.23258394  0.8969044
  0.37736684 -0.07837201  0.03698083 -0.41281605  0.00604828  0.33612573
  0.13183753 -0.28815687 -0.18619378 -0.37625545  0.66813433  0.607646
 -0.17127855 -0.18090063 -0.13942415  0.2598807   0.53892505  0.30539176
 -0.07634597 -0.8268861  -1.0695198   0.17276376 -0.5139004  -0.38974205
  0.36168593  0.02061707 -0.31228796 -0.6114346  -0.4298325   0.02243316
  0.8277563   0.10814247  0.07656042 -0.38366172  0.5205

In [17]:
# Print row 0 of the embedding matrix next to the vector for "猫".
# NOTE(review): in the recorded run index2word[0] was "の", so these two
# vectors are expected to differ -- confirm that this contrast is intended.
print(model.wv.vectors[0])
print(model.wv["猫"])  # subscript lookup; equivalent to calling __getitem__

[ 0.3477761  -0.26250306  0.21070902 -0.30162653 -1.0978594   0.12515591
 -0.43797913  0.6782027   0.6267708   0.34921595 -0.00596868  1.0759177
 -1.3425335   0.36570206  0.50163573 -0.4645328   0.68633944  0.24988528
 -0.5217439   0.31614736 -0.2586805  -1.0627158   0.14498131 -0.67572314
  0.50936216  1.0069667   0.05948824 -0.89010143  0.609023    0.7215569
 -0.6875353  -1.0485088   1.033036   -0.5729574   0.67277384 -0.48509452
 -0.21755464  0.7032668  -0.46789923  0.08748992  0.3638896  -0.08979254
 -0.8436585  -0.49327317  0.5899161   0.00321887 -0.23258394  0.8969044
  0.37736684 -0.07837201  0.03698083 -0.41281605  0.00604828  0.33612573
  0.13183753 -0.28815687 -0.18619378 -0.37625545  0.66813433  0.607646
 -0.17127855 -0.18090063 -0.13942415  0.2598807   0.53892505  0.30539176
 -0.07634597 -0.8268861  -1.0695198   0.17276376 -0.5139004  -0.38974205
  0.36168593  0.02061707 -0.31228796 -0.6114346  -0.4298325   0.02243316
  0.8277563   0.10814247  0.07656042 -0.38366172  0.5205