# In [7]:
import zipfile
import os.path
import urllib.request as req
import MeCab
from gensim import models
from gensim.models.doc2vec import TaggedDocument

# Initialise the MeCab tagger used by split_words() below.
mecab = MeCab.Tagger()
# Parse an empty string once before any real input.
# NOTE(review): this looks like the commonly used workaround for
# node.surface coming back empty/garbled from parseToNode() in some
# mecab-python3 versions — confirm it is still needed with the installed
# binding.
mecab.parse("")

# Aozora Bunko works used as the training corpus.
#
# `list` holds one dict per author:
#   "author": {"name": display name, "url": base URL the zip names are
#              appended to}
#   "books":  [{"name": title, "zipname": archive file name}, ...]
#
# NOTE(review): the name `list` shadows the builtin; it is kept because
# book_list() reads the catalogue under this name.
# NOTE(review): some titles look misspelled ("セロ弾きのゴージュ",
# "虜美人草", "坊ちゃん"). They are reproduced unchanged because they end up
# in the document tags — confirm before correcting them.
_CATALOG_ROWS = (
    ("宮沢賢治", "https://www.aozora.gr.jp/cards/000081/files/", (
        ("銀河鉄道の夜", "43737_ruby_19028.zip"),
        ("注文の多い料理店", "1927_ruby_17835.zip"),
        ("セロ弾きのゴージュ", "470_ruby_3987.zip"),
        ("やまなし", "46605_ruby_29758.zip"),
        ("どんぐりと山猫", "43752_ruby_17595.zip"),
    )),
    ("芥川龍之介", "https://www.aozora.gr.jp/cards/000879/files/", (
        ("羅生門", "127_ruby_150.zip"),
        ("鼻", "42_ruby_154.zip"),
        ("河童", "69_ruby_1321.zip"),
        ("歯車", "42377_ruby_34744.zip"),
        ("老年", "131_ruby_241.zip"),
    )),
    ("太宰治", "https://www.aozora.gr.jp/cards/000035/files/", (
        ("斜陽", "1565_ruby_8220.zip"),
        ("走れメロス", "1567_ruby_4948.zip"),
        ("津軽", "2282_ruby_1996.zip"),
        ("お伽草紙", "307_ruby_3042.zip"),
        ("人間失格", "301_ruby_5915.zip"),
    )),
    ("夏目漱石", "https://www.aozora.gr.jp/cards/000148/files/", (
        ("吾輩は猫である", "789_ruby_5639.zip"),
        ("坊ちゃん", "752_ruby_2438.zip"),
        ("草枕", "776_ruby_6020.zip"),
        ("虜美人草", "761_ruby_1861.zip"),
        ("三四郎", "794_ruby_4237.zip"),
    )),
)

# Expand the compact rows into the nested dict/list shape the rest of the
# script consumes.
list = [
    {
        "author": {"name": writer, "url": base_url},
        "books": [
            {"name": title, "zipname": archive} for title, archive in works
        ],
    }
    for writer, base_url, works in _CATALOG_ROWS
]

# Walk the catalogue and hand out one (author, book) pair at a time.
def book_list():
    """Yield an ``(author, book)`` pair for every book in the catalogue.

    Iterates the module-level ``list``; ``author`` is the entry's
    ``"author"`` value and ``book`` is one element of its ``"books"``
    value. Pairs are produced lazily, in catalogue order.
    """
    for entry in list:
        writer = entry["author"]
        # Delegate to a generator expression so the caller still receives
        # the pairs one at a time.
        yield from ((writer, work) for work in entry["books"])

# Fetch (if necessary) one book's zip archive and return its text.
def read_book(author, book):
    """Return the decoded text of ``book``, downloading its zip on first use.

    Args:
        author: Mapping with a ``"url"`` key; the archive is downloaded from
            ``author["url"] + book["zipname"]``.
        book: Mapping with a ``"zipname"`` key. The same value is used as the
            local path of the archive, so an existing file is reused.

    Returns:
        The contents of the first ``.txt`` member of the archive (or, when
        there is no ``.txt`` member, of its first file) decoded as Shift-JIS.

    Raises:
        ValueError: If the archive contains no files at all.
    """
    zipname = book["zipname"]
    # Only hit the network when the archive is not already on disk.
    if not os.path.exists(zipname):
        req.urlretrieve(author["url"] + zipname, zipname)
    with zipfile.ZipFile(zipname, "r") as zf:
        # Directory entries end with "/" and carry no data.
        members = [name for name in zf.namelist() if not name.endswith("/")]
        # An archive may hold non-text members (e.g. images) next to the
        # text; decoding those as Shift-JIS would fail or return garbage, so
        # prefer the text file and fall back to the first member otherwise.
        text_members = [name for name in members if name.lower().endswith(".txt")]
        candidates = text_members or members
        if not candidates:
            raise ValueError("no readable file in archive: {}".format(zipname))
        # The file is Shift-JIS encoded, so decode it explicitly.
        return zf.read(candidates[0]).decode("shift-jis")

# Tokenise the given text with MeCab and collect the content words.
def split_words(text):
    """Return the nouns, verbs and adjectives of ``text`` as a list of strings.

    The text is analysed with the module-level ``mecab`` tagger. Nouns are
    kept in their surface form; verbs and adjectives are replaced by their
    base form. All other parts of speech are dropped.

    NOTE(review): the base form is read from the 7th comma-separated field
    of ``node.feature``, which matches an IPADIC-style layout — confirm
    against the dictionary actually installed.

    Args:
        text: The string handed to ``mecab.parseToNode``.

    Returns:
        List of tokens in the order they occur in ``text``.
    """
    wakati_words = []
    node = mecab.parseToNode(text)
    while node is not None:
        # Split the feature string once per node.
        features = node.feature.split(",")
        hinshi = features[0]
        if hinshi == "名詞":
            # Nouns: keep the surface string as-is.
            wakati_words.append(node.surface)
        elif hinshi in ("動詞", "形容詞"):
            # Verbs/adjectives: use the base form. When the feature string
            # is too short or holds the "*" placeholder there is no base
            # form, so fall back to the surface rather than raising
            # IndexError or emitting "*" as a token.
            base_form = features[6] if len(features) > 6 else "*"
            if base_form in ("", "*"):
                base_form = node.surface
            wakati_words.append(base_form)
        node = node.next
    return wakati_words

# Collect every work as a TaggedDocument, the input format Doc2Vec reads.
documents = []

# Loop over every (author, book) pair in the catalogue.
for author, book in book_list():
    # Fetch the whole work as a single string.
    words = read_book(author, book)
    # Split the text into the list of tokens used for training.
    wakati_words = split_words(words)
    # Build one TaggedDocument per work:
    #   TaggedDocument(words=token list, tags=["<author name>:<book title>"])
    document = TaggedDocument(wakati_words,[author["name"] + ":" + book["name"]])
    documents.append(document)
    
# Train a Doc2Vec model on the list of TaggedDocuments.
# Per the gensim Doc2Vec documentation: dm=1 selects the distributed-memory
# (PV-DM) algorithm, vector_size is the dimensionality of the learned
# vectors, window is the context window size, and min_count=1 keeps every
# word, including those that occur only once.
model = models.Doc2Vec(documents, dm = 1, vector_size =300, window = 5, min_count = 1)

# Save the trained model to 'aozora.model' (relative to the working directory).
model.save('aozora.model')

# Report completion; the message reads "model creation complete".
print("モデル作成完了")
    

# Output: モデル作成完了
