データのスクレイピング（URLの部分に、pdfに記載のURLを入力して実行）

In [None]:
import os
import re
import bs4
import time
import requests
import pprint

def load(url, timeout=10):
    """Download ``url`` and return the response body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely on a stalled
            connection.

    Returns:
        The decoded body of the response (``Response.text``).

    Raises:
        requests.HTTPError: If the server replies with an error status
            (raised by ``raise_for_status``).
    """
    res = requests.get(url, timeout=timeout)
    res.raise_for_status()

    return res.text

def pickup_tag(html, find_tag):
    """Return every ``find_tag`` element contained in ``html``.

    ``html`` is passed through ``str()`` before parsing, so both raw markup
    and already-parsed tags can be given.
    """
    document = bs4.BeautifulSoup(str(html), 'html.parser')
    return document.find_all(find_tag)

def parse(html):
    """Reduce a lyrics ``<div>`` to bare lyric text.

    Strips the markup, line breaks, full-width spaces, ASCII letters and
    digits, a fixed set of symbols, and finally everything from the
    "注意：" notice onwards.
    """
    # Drop the HTML tags and keep only the text content.
    text = bs4.BeautifulSoup(str(html), 'html.parser').getText()

    # Remove line breaks and full-width spaces.
    for unwanted in ('\n', '　'):
        text = text.replace(unwanted, '')

    # Regex clean-ups, applied in order: alphanumerics, symbols, notice.
    for pattern in (
        r'[a-zA-Z0-9]',
        r'[ ＜＞♪`‘’“”・…_！？!-/:-@[-`{-~]',
        r'注意：.+',
    ):
        text = re.sub(pattern, '', text)

    return text

def main():
    """Scrape the lyrics of every song listed on one search-result page.

    The song links are collected from the ``<a>`` elements inside the
    page's ``<td>`` cells, each song page is downloaded, and the cleaned
    lyrics are appended to ``yonedu_kashi.txt`` (one song per line).
    """
    # Address of the artist / search-result page.
    url = 'https://www.uta-net.com/search/?Aselect=6&Bselect=3&Keyword=あ&sort=&pnum=1'

    # Site root; the song links found on the page are prefixed with it.
    base_url = 'https://www.uta-net.com'

    html = load(url)

    # Collect the URL of every song page.
    musics_url = []
    for td in pickup_tag(html, 'td'):
        for a in pickup_tag(td, 'a'):
            href = a.get('href')
            # Anchors without an href yield None; skip them instead of
            # failing on the membership test.
            if href and 'song' in href:
                musics_url.append(base_url + href)
    # pprint.pprint(musics_url)

    # Download and clean the lyrics of each song.
    kashis = []
    for i, page in enumerate(musics_url):
        print('{}曲目:{}'.format(i + 1, page))
        html = load(page)
        for div in pickup_tag(html, 'div'):
            # Searching by attribute did not work reliably, so the tag is
            # converted to a string and matched textually.
            div = str(div)
            if r'itemprop="text"' in div:
                kashi = parse(div)
                print(kashi, end='\n\n')
                kashis.append(kashi + '\n')
                break
        # Wait one second after every request, whether or not lyrics were
        # found, to avoid hammering the server.
        time.sleep(1)

    # Append all lyrics at once; the encoding is explicit because the text
    # is Japanese and the platform default may not be UTF-8.
    with open('yonedu_kashi.txt', 'a', encoding='utf-8') as f:
        f.write(''.join(kashis))


if __name__ == '__main__':
    main()

LSTMを実行してみる

In [None]:
from keras.models import Sequential
from keras.layers import Dense, Activation, LSTM
from keras.optimizers import RMSprop
from keras.utils.data_utils import get_file
import numpy as np
import random
import sys

# Lyrics corpus used to train the character-level model.
path = '/Users/yamashitashiori/Desktop/python3/yonedu_kashi.txt'
# Read with an explicit encoding and close the handle as soon as the text
# is loaded.
with open(path, "r", encoding="utf-8") as f:
    text = f.read()

# Character <-> index lookup tables over the distinct characters of the corpus.
chars = sorted(set(text))
char_indices = {c: i for i, c in enumerate(chars)}
indices_char = {i: c for i, c in enumerate(chars)}

# Cut the text into overlapping windows of `maxlen` characters, advancing
# `step` characters at a time; the character following each window is the
# prediction target.
maxlen = 40
step = 3
sentences = []
next_chars = []
for i in range(0, len(text) - maxlen, step):
    sentences.append(text[i: i + maxlen])
    next_chars.append(text[i + maxlen])

# One-hot encode the windows and targets.
# The builtin `bool` is used because the `np.bool` alias no longer exists
# in current NumPy releases.
X = np.zeros((len(sentences), maxlen, len(chars)), dtype=bool)
y = np.zeros((len(sentences), len(chars)), dtype=bool)

for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        X[i, t, char_indices[char]] = 1
    y[i, char_indices[next_chars[i]]] = 1

# Define the model: one LSTM layer followed by a softmax over characters.
model = Sequential()
model.add(LSTM(128, input_shape=(maxlen, len(chars))))
model.add(Dense(len(chars)))
model.add(Activation('softmax'))
# NOTE(review): newer Keras releases name this argument `learning_rate`;
# `lr` is kept for the Keras version this notebook was written for - confirm.
optimizer = RMSprop(lr=0.01)
model.compile(loss='categorical_crossentropy', optimizer=optimizer)

def sample(preds, temperature=1.0):
    """Draw one index from ``preds`` after temperature re-scaling.

    The probabilities are moved to log space, divided by ``temperature``,
    re-normalised, and a single multinomial draw picks the returned index.
    """
    logits = np.log(np.asarray(preds).astype('float64')) / temperature
    weights = np.exp(logits)
    distribution = weights / np.sum(weights)
    draw = np.random.multinomial(1, distribution, 1)
    return np.argmax(draw)

# Train for one epoch per iteration (iterations 1..119) and, after each
# epoch, print text generated from a random seed window at several
# sampling temperatures.
for iteration in range(1,120):
    print()
    print('-' *50)
    print('繰り返し回数: ', iteration)
    model.fit(X, y, batch_size=128, epochs=1)

    # Random start position of the seed window inside the corpus.
    start_index = random.randint(0, len(text)-maxlen-1)

    # The same seed is reused for every temperature ("diversity").
    for diversity in [0.2, 0.5, 1.0, 1.2]:
        print()
        print('-----diveristy', diversity)

        generated = ''
        sentence = text[start_index: start_index + maxlen]
        generated += sentence
        print('----- Seedを生成しました: "' + sentence + '"')
        sys.stdout.write(generated)

        # Generate 100 characters, one at a time.
        for i in range(100):
            # One-hot encode the current window as a batch of one.
            x = np.zeros((1,maxlen,len(chars)))
            for t, char in enumerate(sentence):
                x[0, t, char_indices[char]] = 1.

            preds = model.predict(x, verbose=0)[0]
            next_index = sample(preds, diversity)
            next_char = indices_char[next_index]

            generated += next_char
            # Slide the window: drop the oldest character, append the new one.
            sentence = sentence[1:] + next_char

            sys.stdout.write(next_char)
            sys.stdout.flush()
        print()


データの前処理（分かち書き）

In [None]:
import MeCab
# Tokenise kashi.txt line by line with MeCab's space-separated ("wakati")
# output format and append the result to kashi_wakati.txt.
tagger = MeCab.Tagger('-Owakati')
# Both files are opened once; the output used to be reopened for every
# input line. Lines are streamed instead of being loaded into a list.
with open("/Users/yamashitashiori/Desktop/Python3/kashi.txt", "r", encoding="utf-8") as f, \
        open("Desktop/Python3/kashi_wakati.txt", "a", encoding="utf-8") as of:
    for line in f:
        of.write(tagger.parse(line))

print("Finish")

In [None]:
import MeCab
# Tokenise yonedu_kashi.txt line by line with MeCab's space-separated
# ("wakati") output format and append the result to yonezu_wakati.txt.
tagger = MeCab.Tagger('-Owakati')
# Both files are opened once; the output used to be reopened for every
# input line. Lines are streamed instead of being loaded into a list.
with open("/Users/yamashitashiori/Desktop/Python3/yonedu_kashi.txt", "r", encoding="utf-8") as f, \
        open("Desktop/Python3/yonezu_wakati.txt", "a", encoding="utf-8") as of:
    for line in f:
        of.write(tagger.parse(line))

print("Finish")

データの前処理（id変換）

In [None]:
import torch
from torch.utils.data import DataLoader
from tensorflow.keras.preprocessing.sequence import pad_sequences

class EncoderDecoder(object):
    """Word <-> integer-id vocabulary built from tokenised sentences.

    Ids 0-3 are reserved for ``<pad>``, ``<s>``, ``</s>`` and ``<unk>``.
    Corpus words are numbered from 4 upwards in sorted order, so the same
    corpus always produces the same mapping.
    """

    def __init__(self, sentences, bos=False, eos=False):
        """Build the vocabulary from ``sentences``.

        Args:
            sentences: Iterable of sentences, each an iterable of words.
            bos: If true, ``transform`` prepends the ``<s>`` token.
            eos: If true, ``transform`` appends the ``</s>`` token.
        """
        # word -> id dictionary
        self.w2i = {}
        # id -> word dictionary
        self.i2w = {}
        self.bos = bos
        self.eos = eos
        # Reserved tokens: padding, sentence start, sentence end, unknown.
        self.special_chars = ['<pad>', '<s>', '</s>', '<unk>']
        self.bos_char = self.special_chars[1]
        self.eos_char = self.special_chars[2]
        self.oov_char = self.special_chars[3]
        # Read every word and build the dictionaries.
        self.fit(sentences)

    def __call__(self, sentence):
        """Shorthand for ``transform(sentence)``."""
        return self.transform(sentence)

    def fit(self, sentences):
        """Create ``w2i`` / ``i2w`` from all words in ``sentences``."""
        self._words = set()
        for sentence in sentences:
            self._words.update(sentence)

        # Reserved tokens take ids 0..3.
        self.w2i = {w: i for i, w in enumerate(self.special_chars)}

        # Number the remaining words after the reserved ids. Sorting makes
        # the ids reproducible (set iteration order is not), and reserved
        # tokens that also occur in the corpus are excluded so the id range
        # stays contiguous.
        corpus_words = sorted(self._words.difference(self.special_chars))
        for i, w in enumerate(corpus_words, start=len(self.special_chars)):
            self.w2i[w] = i

        self.i2w = {i: w for w, i in self.w2i.items()}

    def transform(self, sentence):
        """Convert one sentence to ids, adding ``<s>``/``</s>`` if enabled."""
        # Copy into a list so that any iterable of words is accepted.
        sentence = list(sentence)
        if self.bos:
            sentence = [self.bos_char] + sentence
        if self.eos:
            sentence = sentence + [self.eos_char]
        return self.encode(sentence)

    def encode(self, sentence):
        """Map each word to its id; unknown words map to the ``<unk>`` id."""
        unk_id = self.w2i[self.oov_char]
        return [self.w2i.get(w, unk_id) for w in sentence]

    def decode(self, sentence):
        """Map each id back to its word; unknown ids map to ``<unk>``."""
        return [self.i2w.get(i, self.oov_char) for i in sentence]

# Load the tokenised corpus: one list of words per line.
with open('/Users/yamashitashiori/Desktop/Python3/wakati.txt', encoding='utf-8') as f:
    # split() without an argument ignores the trailing space and newline,
    # so no empty-string "word" ends up in the vocabulary (split(' ') on a
    # line ending in a space produced one per line).
    lst = [line.split() for line in f]

transform = EncoderDecoder(lst, bos=True, eos=True)

# Write every line as space-separated ids.
with open('Desktop/Python3/kashi_id.txt', 'w', encoding='utf-8') as f:
    for words in lst:
        # transform() wraps the ids in <s> ... </s>; drop both markers.
        ids = transform(words)[1:-1]
        f.write(' '.join(str(i) for i in ids) + '\n')

# The lines for the first 80 songs of this output were then moved by hand
# into yonezu_id.txt.

本体（SeqGAN）

In [None]:
!wget https://www.shoeisha.co.jp/static/book/download/9784798159928/RL_Book.zip

In [None]:
!unzip RL_Book.zip
%cd ./RL_Book/
!pip install -r docker/requirements.txt
!git clone https://github.com/benelot/pybullet-gym
%cd pybullet-gym
!pip install -e .
%cd ..
!apt-get -qq -y install xvfb freeglut3-dev ffmpeg> /dev/null
!pip install pyglet
!pip install pyopengl
!pip install pyvirtualdisplay

Tensorflowのバージョン変更

In [None]:
%tensorflow_version 1.x

In [None]:
import io
import base64
from IPython.display import HTML

def play_movie(mp4_path):
  """Return an HTML ``<video>`` element with the MP4 file embedded inline.

  The file is read in binary mode, base64-encoded and placed in a
  ``data:`` URI, so the returned object carries the whole video.

  Args:
    mp4_path: Path of the MP4 file to embed.
  """
  # Read-only binary mode is enough (no write permission needed), and the
  # context manager closes the handle.
  with open(mp4_path, 'rb') as f:
    video = f.read()
  encoded = base64.b64encode(video)
  return HTML(data='''<video alt="test" controls>
                      <source src="data:video/mp4;base64,{0}" type="video/mp4" />
                      </video>'''.format(encoded.decode('ascii')))

In [None]:
%cd ./RL_Book/
%ls contents

コードはここから


In [None]:
import numpy as np
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Embedding, LSTM
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import TimeDistributed
from tensorflow.keras.utils import to_categorical
import tensorflow as tf


class Agent(object):
    """Generator side of SeqGAN.

    Holds three networks with the same embedding -> LSTM -> softmax layout:

    * ``pre_generator``: Keras model used for supervised pre-training;
    * ``generator``: ``Actor`` that is updated during adversarial training;
    * ``rollouter``: ``Actor`` used to complete partial sentences.
    """

    def __init__(self, sess, vocab_size, emb_size, hidden_size,
                 T, lr):
        """Create the pre-training model and the two actors.

        Args:
            sess: TensorFlow session shared with the actors.
            vocab_size: Number of word ids.
            emb_size: Embedding dimension.
            hidden_size: LSTM state size.
            T: Sentence length.
            lr: Learning rate passed to the actors.
        """
        self.sess = sess
        # Number of sequences handled at once by the actors.
        self.size = 1
        self.vocab_size = vocab_size
        self.emb_size = emb_size
        self.hidden_size = hidden_size
        self.lr = lr
        self.T = T  # sentence_size

        self.pre_generator = self._build_pre_generator(
            vocab_size,
            emb_size,
            hidden_size
        )
        self.generator = Actor(
            sess,
            self.size,
            vocab_size,
            emb_size,
            hidden_size,
            T,
            lr
        )
        self.rollouter = Actor(
            sess,
            self.size,
            vocab_size,
            emb_size,
            hidden_size,
            T,
            lr
        )

    def _build_pre_generator(self, vocab_size, emb_size, hidden_size):
        """Build the Keras model that predicts a word distribution per step."""
        data_inp = Input(shape=(None, ), dtype='int32', name='input')
        out = Embedding(
            vocab_size, emb_size, mask_zero=False, name='embedding'
        )(data_inp)
        out = LSTM(hidden_size, return_sequences=True, name='LSTM')(out)
        out = TimeDistributed(
            Dense(vocab_size, activation='softmax', name='dense_softmax'),
            name='time_dense_softmax')(out)
        pre_generator = Model(data_inp, out)
        return pre_generator

    def pre_train(self, g_data, g_pre_episodes, weight_path, g_pre_lr):
        """Fit ``pre_generator`` on ``g_data`` and copy it into both actors.

        The trained weights are also saved to ``weight_path`` and the fit
        history is kept in ``self.pre_hist``.
        """
        g_optimizer = Adam(g_pre_lr)
        self.pre_generator.compile(g_optimizer, 'categorical_crossentropy')
        self.pre_generator.summary()
        self.pre_hist = self.pre_generator.fit_generator(
            g_data,
            steps_per_epoch=None,
            epochs=g_pre_episodes
        )
        self.pre_generator.save_weights(weight_path)
        self.inherit_weights(self.pre_generator, self.generator)
        self.inherit_weights(self.pre_generator, self.rollouter)

    def sample_words(self, prob):
        """Sample one word id per row of ``prob`` (rows are distributions)."""
        action = np.zeros((self.size, ), dtype=np.int32)
        for i in range(self.size):
            action[i] = np.random.choice(self.vocab_size, p=prob[i])
        return action

    def sample_sentences(self, actor, T, BOS=1):
        """Sample ``T`` word ids per sequence from ``actor``, starting at BOS.

        The actor's recurrent state is reset before and after sampling. The
        returned array does not include the leading BOS column.
        """
        actor.reset_rnn_state()
        action = np.zeros([self.size, 1], dtype=np.int32)
        action[:, 0] = BOS
        actions = action
        for _ in range(T):
            prob, _h, _c = actor.predict(action)
            action = self.sample_words(prob).reshape(-1, 1)
            actions = np.concatenate([actions, action], axis=-1)
        actions = actions[:, 1:]
        actor.reset_rnn_state()
        return actions

    def generate_id_samples(self, actor, T, sample_num, output_file):
        """Sample sentences from ``actor`` and write them as id lines.

        ``sample_sentences`` is called ``sample_num`` times and the first
        ``sample_num`` sampled sentences are written to ``output_file``, one
        space-separated line of ids per sentence.
        """
        sentences_ids = []
        for _ in range(sample_num):
            actions = self.sample_sentences(actor, T)
            for ids in actions.tolist():
                sentences_ids.append([str(token_id) for token_id in ids])
        lines = [' '.join(ids) + '\n' for ids in sentences_ids[:sample_num]]
        with open(output_file, 'w', encoding='utf-8') as f:
            f.write(''.join(lines))

    def get_action(self, state):
        """Sample the generator's next word given the last word of ``state``.

        Returns:
            ``(action, h, c)``: the sampled id (forced to 0 when the last
            word of ``state`` is PAD or EOS) and the generator's LSTM state
            after the step.
        """
        s_t = state[:, -1:].reshape([-1, 1])
        prob, h, c = self.generator.predict(s_t)
        action_t = self.sample_words(prob).reshape([1, 1])
        is_end = self.projection(state)
        return action_t * is_end, h, c

    def rollout_sampling(self, action, epsilon=0.0):
        """Sample the rollouter's next word after ``action``.

        The result is forced to 0 when ``action`` is PAD or EOS.
        ``epsilon`` is accepted but not used.
        """
        prob, _h, _c = self.rollouter.predict(action)
        action_t = self.sample_words(prob).reshape([1, 1])
        is_end = self.projection(action)
        return action_t * is_end

    def projection(self, state, PAD=0, EOS=2):
        """Return a (1, 1) mask: 0 if the last word is PAD or EOS, else 1."""
        last = state[:, -1:]
        is_PAD = last == PAD
        is_EOS = last == EOS
        # Builtin `int` is used because the `np.int` alias no longer exists
        # in current NumPy releases (it was the same type).
        is_END = 1 - is_PAD.astype(int) - is_EOS.astype(int)
        return is_END.reshape([1, 1])

    def rollout(self, step, state, action):
        """Complete a partial sentence with the rollouter.

        The first column of ``state`` is dropped, ``action`` is appended,
        and ``self.T - 1 - step`` further words are sampled.
        """
        Y_i = state[:, 1:]
        Y_i = np.concatenate([Y_i, action], axis=-1)
        for _ in range(self.T - 1 - step):
            _action = self.rollout_sampling(action)
            Y_i = np.concatenate([Y_i, _action], axis=-1)
            action = _action
        return Y_i

    def inherit_weights(self, agent, to_agent):
        """Copy weights from ``agent`` to ``to_agent`` layer by layer.

        Layers of ``agent`` without weights are skipped; the n-th layer that
        has weights is copied into ``to_agent.layers[n]``.
        """
        i = 0
        for layer in agent.layers:
            w = layer.get_weights()
            if w:
                to_agent.layers[i].set_weights(w)
                i += 1

    def initialize(self, g_pre_weight):
        """Load pre-trained weights and copy them into both actors."""
        self.pre_generator.load_weights(g_pre_weight)
        self.inherit_weights(self.pre_generator, self.generator)
        self.inherit_weights(self.pre_generator, self.rollouter)

    def reset_rnn_states(self):
        """Zero the recurrent state of the generator and the rollouter."""
        self.generator.reset_rnn_state()
        self.rollouter.reset_rnn_state()


class Actor(object):
    """Single-step policy network (embedding -> LSTM -> softmax) in TF1 style.

    The LSTM state is carried outside the graph in ``self.h`` / ``self.c``
    and fed back in on every call.
    """

    def __init__(self, sess, size, vocab_size, emb_size, hidden_size,
                 T, lr):
        """Build the graph in ``sess`` and zero the recurrent state.

        Args:
            sess: TensorFlow session used to run the graph.
            size: Number of rows of the stored recurrent state.
            vocab_size: Number of word ids.
            emb_size: Embedding dimension.
            hidden_size: LSTM state size.
            T: Sentence length (stored only).
            lr: Learning rate of the Adam optimizer.
        """
        self.sess = sess
        self.size = size
        self.vocab_size = vocab_size
        self.emb_size = emb_size
        self.hidden_size = hidden_size
        self.T = T
        self.lr = lr
        self._build_graph()
        self.reset_rnn_state()

    def _build_graph(self):
        """Create placeholders, layers, the loss and the training op."""
        state_in = tf.placeholder(tf.float32, shape=(None, 1))
        h_in = tf.placeholder(tf.float32, shape=(None, self.hidden_size))
        c_in = tf.placeholder(tf.float32, shape=(None, self.hidden_size))
        action = tf.placeholder(tf.float32, shape=(None, self.vocab_size))
        reward = tf.placeholder(tf.float32, shape=(None, 1))

        # Layers are kept in order so that weights can be copied by index.
        self.layers = []

        embedding = Embedding(
            self.vocab_size, self.emb_size, mask_zero=False, name='embedding'
        )
        out = embedding(state_in)
        self.layers.append(embedding)

        lstm = LSTM(
            self.hidden_size, return_state=True, name='LSTM'
        )
        out, next_h, next_c = lstm(out, initial_state=[h_in, c_in])
        self.layers.append(lstm)

        dense = Dense(
            self.vocab_size, activation='softmax', name='densesoftmax'
        )
        prob = dense(out)
        self.layers.append(dense)

        # Log-probability of the chosen (one-hot) action, shape (batch,).
        log_prob = tf.log(tf.reduce_sum(prob * action, axis=-1))
        # `reward` is (batch, 1). Multiplying it directly with the (batch,)
        # log-probabilities broadcasts to a (batch, batch) matrix, which
        # weights every action by the sum of all rewards. Flatten it so each
        # action is weighted by its own reward.
        loss = - log_prob * tf.reshape(reward, [-1])
        optimizer = tf.train.AdamOptimizer(learning_rate=self.lr)
        minimize = optimizer.minimize(loss)

        self.state_in = state_in
        self.h_in = h_in
        self.c_in = c_in
        self.action = action
        self.reward = reward
        self.prob = prob
        self.next_h = next_h
        self.next_c = next_c
        self.minimize = minimize
        self.loss = loss

        self.init_op = tf.global_variables_initializer()
        self.sess.run(self.init_op)

    def reset_rnn_state(self):
        """Set the stored LSTM state to zeros of shape (size, hidden_size)."""
        self.h = np.zeros([self.size, self.hidden_size])
        self.c = np.zeros([self.size, self.hidden_size])

    def set_rnn_state(self, h, c):
        """Replace the stored LSTM state."""
        self.h = h
        self.c = c

    def get_rnn_state(self):
        """Return the stored LSTM state as ``(h, c)``."""
        return self.h, self.c

    def predict(self, state, stateful=True):
        """Run one step from the stored state and advance it.

        ``stateful`` is accepted but not consulted: the stored state is
        always replaced by the new one.

        Returns:
            ``(prob, next_h, next_c)``.
        """
        feed_dict = {
            self.state_in: state,
            self.h_in: self.h,
            self.c_in: self.c
        }
        prob, next_h, next_c = self.sess.run(
            [self.prob, self.next_h, self.next_c],
            feed_dict
        )

        self.h = next_h
        self.c = next_c
        return prob, next_h, next_c

    def update(self, state, action, reward, h=None, c=None, stateful=True):
        """Run one optimizer step on ``-log pi(action | state) * reward``.

        Args:
            state, action, reward: Arrays that are reshaped to one column.
            h, c: LSTM states to start from; default to the stored state.
            stateful: If true, store the resulting LSTM state and return
                only the loss; otherwise return ``(loss, next_h, next_c)``.
        """
        if h is None:
            h = self.h
        if c is None:
            c = self.c
        state = state.reshape(-1, 1)
        action = action.reshape(-1, 1)
        reward = reward.reshape(-1, 1)
        feed_dict = {
            self.state_in: state,
            self.h_in: h,
            self.c_in: c,
            self.action: to_categorical(action, self.vocab_size),
            self.reward: reward
        }
        _, loss, next_h, next_c = self.sess.run(
            [self.minimize, self.loss, self.next_h, self.next_c],
            feed_dict
        )

        if stateful:
            self.h = next_h
            self.c = next_c
            return loss
        else:
            return loss, next_h, next_c

In [None]:
import linecache
import random

import numpy as np
from tensorflow.keras.utils import Sequence
from tensorflow.keras.utils import to_categorical


class Vocab(object):
    """Word <-> id vocabulary built from a tokenised text file.

    Ids 0-3 are reserved for the PAD, BOS, EOS and UNK tokens; the words of
    the file follow in order of decreasing frequency.
    """

    def __init__(self, sentences_path):
        """Load the sentences at ``sentences_path`` and build the vocabulary."""
        self.PAD = 0
        self.BOS = 1
        self.EOS = 2
        self.UNK = 3
        self.PAD_TOKEN = '<PAD>'
        self.BOS_TOKEN = '<S>'
        self.EOS_TOKEN = '</S>'
        self.UNK_TOKEN = '<UNK>'
        self.word2id = {
            self.PAD_TOKEN: self.PAD,
            self.BOS_TOKEN: self.BOS,
            self.EOS_TOKEN: self.EOS,
            self.UNK_TOKEN: self.UNK,
        }
        self.id2word = {v: k for k, v in self.word2id.items()}
        self.sentences = load_data(sentences_path)
        self.build_vocab(self.sentences)
        self.vocab_num = len(self.word2id)
        self.sentence_num = len(self.sentences)

    def build_vocab(self, sentences, min_count=1):
        """Add the words of ``sentences`` to the vocabulary.

        Words are added in order of decreasing count; words seen fewer than
        ``min_count`` times are left out.
        """
        word_counter = {}
        for sentence in sentences:
            for word in sentence:
                word_counter[word] = word_counter.get(word, 0) + 1
        for word, count in sorted(
            word_counter.items(), key=lambda x: x[1], reverse=True
        ):
            if count < min_count:
                # Sorted by count, so every remaining word is too rare.
                break
            if word in self.word2id:
                # Already known (e.g. a reserved token occurring in the
                # text): keep its id and do not create a second entry.
                continue
            _id = len(self.word2id)
            self.word2id[word] = _id
            self.id2word[_id] = word

    def write_word2id(self, sentences_path, output_path):
        """Convert a file of words to a file of ids, line by line.

        Unknown words are written as the UNK id. The number of converted
        lines is stored in ``self.data_num``.
        """
        with open(sentences_path, encoding='utf-8', errors='ignore') as f:
            ids_sentences = [
                [str(self.word2id.get(word, self.UNK))
                 for word in line.strip().split()]
                for line in f
            ]
        self.data_num = len(ids_sentences)
        with open(output_path, 'w', encoding='utf-8') as f:
            f.writelines(' '.join(ids) + '\n' for ids in ids_sentences)

    def write_id2word(self, ids_path, output_path):
        """Convert a file of ids back to a file of words, line by line.

        Unknown ids are written as the UNK token.
        """
        with open(ids_path, encoding='utf-8', errors='ignore') as f:
            sentences = [
                [self.id2word.get(int(token_id), self.UNK_TOKEN)
                 for token_id in line.strip().split()]
                for line in f
            ]
        with open(output_path, 'w', encoding='utf-8') as f:
            f.writelines(' '.join(words) + '\n' for words in sentences)


def load_data(file_path):
    """Read ``file_path`` and return one list of whitespace-split words per line.

    The file is decoded as UTF-8 with undecodable bytes ignored. Blank
    lines yield empty lists.
    """
    # The context manager closes the file once all lines are read.
    with open(file_path, encoding='utf-8', errors='ignore') as f:
        return [line.strip().split() for line in f]


def count_data(file_path):
    """Return the number of lines in ``file_path``.

    The file is decoded as UTF-8 with undecodable bytes ignored.
    """
    # The context manager closes the file once all lines are counted.
    with open(file_path, encoding='utf-8', errors='ignore') as f:
        return sum(1 for _ in f)


def padding(sentences, T, PAD=0):
    """Extend ``sentences`` in place with ``PAD`` up to length ``T``.

    A list that already has ``T`` or more items is left unchanged. The same
    list object is returned.
    """
    missing = T - len(sentences)
    sentences += [PAD] * missing
    return sentences


def sentence_to_ids(vocab, sentence, UNK=3):
    """Look up each word of ``sentence`` in ``vocab.word2id``.

    Words missing from the mapping are replaced by ``UNK``.
    """
    ids = []
    for token in sentence:
        ids.append(vocab.word2id.get(token, UNK))
    return ids


class DataForGenerator(Sequence):
    """Keras ``Sequence`` that yields generator pre-training batches.

    Each batch is ``(batch_x, batch_y)``: the input ids start with BOS, the
    targets are the same sentence without BOS (so shifted by one step),
    one-hot encoded over ``vocab.vocab_num`` classes. Both are cut or
    padded to length ``T``.
    """

    def __init__(self, ids_path, batch_size, T, vocab, shuffle=True):
        """Store the settings and prepare the line order.

        Args:
            ids_path: File with one space-separated line of ids per sentence.
            batch_size: Sentences per batch.
            T: Sentence length after truncation/padding.
            vocab: Object providing ``BOS``, ``EOS`` and ``vocab_num``.
            shuffle: Whether the line order is shuffled on every reset.
        """
        self.ids_path = ids_path
        self.batch_size = batch_size
        self.n_data = count_data(ids_path)
        self.T = T
        self.shuffle = shuffle
        self.vocab = vocab
        self.reset()

    def __len__(self):
        """Number of full batches per epoch."""
        return self.n_data // self.batch_size

    def __getitem__(self, idx):
        """Return batch number ``idx`` as ``(batch_x, batch_y)``."""
        batch_x = []
        batch_y = []
        batch_start = idx * self.batch_size
        batch_end = (idx + 1) * self.batch_size
        for i in range(batch_start, batch_end):
            # linecache line numbers start at 1.
            line_index = self.shuffled_indices[i] + 1
            id_sentence = linecache.getline(self.ids_path, line_index)
            id_words = id_sentence.strip().split()

            batch_x.append([self.vocab.BOS, *id_words, self.vocab.EOS][:self.T])
            batch_y.append([*id_words, self.vocab.EOS][:self.T])

        batch_x = [padding(sentences, self.T) for sentences in batch_x]
        batch_x = np.array(batch_x, dtype=np.int32)

        batch_y = [padding(sentences, self.T) for sentences in batch_y]
        batch_y = np.array(batch_y, dtype=np.int32)
        batch_y = to_categorical(batch_y, num_classes=self.vocab.vocab_num)

        return batch_x, batch_y

    def reset(self):
        """Restart the epoch and rebuild the line order."""
        self.idx = 0
        # Always create the index array so that shuffle=False works too
        # (it was only created when shuffling was enabled).
        self.shuffled_indices = np.arange(self.n_data)
        if self.shuffle:
            random.shuffle(self.shuffled_indices)

    def on_epoch_end(self):
        """Keras hook: reshuffle at the end of every epoch."""
        self.reset()


class DataForDiscriminator(Sequence):
    """Keras ``Sequence`` that yields discriminator training batches.

    Sentences are drawn from a positive and a negative id file; each batch
    is ``(batch_x, batch_y)`` with the ids cut or padded to length ``T`` and
    labels 1 (positive file) or 0 (negative file).
    """

    def __init__(self, pos_path, neg_path, batch_size, T, vocab, shuffle=True):
        """Store the settings and prepare the line order.

        Args:
            pos_path: Id file whose sentences are labelled 1.
            neg_path: Id file whose sentences are labelled 0.
            batch_size: Sentences per batch.
            T: Sentence length after truncation/padding.
            vocab: Object providing ``EOS``.
            shuffle: Whether the line order is shuffled on every reset.
        """
        self.pos_path = pos_path
        self.neg_path = neg_path
        self.batch_size = batch_size
        self.T = T
        self.shuffle = shuffle
        self.vocab = vocab
        self.pos_n_data = count_data(pos_path)
        self.neg_n_data = count_data(neg_path)
        self.n_data = self.pos_n_data + self.neg_n_data
        self.reset()

    def __len__(self):
        """Number of full batches per epoch."""
        return self.n_data // self.batch_size

    def __getitem__(self, idx):
        """Return batch number ``idx`` as ``(batch_x, batch_y)``."""
        batch_x = []
        batch_y = []
        batch_start = idx * self.batch_size
        batch_end = (idx + 1) * self.batch_size
        # Drop cached file contents once per batch so that a rewritten file
        # is re-read, without reloading both files for every single line.
        linecache.clearcache()
        for i in range(batch_start, batch_end):
            # 1-based position in the concatenation "positive file lines,
            # then negative file lines".
            line_index = self.shuffled_indices[i] + 1
            # `<=` so that the last positive line is used as a positive;
            # with `<` it was skipped and line 0 of the negative file (an
            # empty string) was read instead.
            if line_index <= self.pos_n_data:
                id_sentence = linecache.getline(self.pos_path, line_index)
                is_pos = 1
            else:
                line_index = line_index - self.pos_n_data
                id_sentence = linecache.getline(self.neg_path, line_index)
                is_pos = 0
            id_words = id_sentence.strip().split()

            batch_x.append([*id_words, self.vocab.EOS][:self.T])
            batch_y.append(is_pos)

        batch_x = [padding(sentences, self.T) for sentences in batch_x]
        batch_x = np.array(batch_x, dtype=np.int32)

        return batch_x, batch_y

    def reset(self):
        """Restart the epoch and rebuild the line order."""
        self.idx = 0
        # Always create the index array so that shuffle=False works too
        # (it was only created when shuffling was enabled).
        self.shuffled_indices = np.arange(self.n_data)
        if self.shuffle:
            random.shuffle(self.shuffled_indices)

    def on_epoch_end(self):
        """Keras hook: reshuffle at the end of every epoch."""
        self.reset()

In [None]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dropout, Dense, Embedding, LSTM
from tensorflow.keras.optimizers import Adam


class Environment(object):
    """Holds the discriminator: embedding -> LSTM -> dropout -> sigmoid."""

    def __init__(self, batch_size, vocab_size, emb_size, hidden_size,
                 T, dropout, lr):
        """Store the hyperparameters and build the discriminator model."""
        self.batch_size = batch_size
        self.T = T
        self.lr = lr
        self.vocab_size = vocab_size
        self.emb_size = emb_size
        self.hidden_size = hidden_size
        self.dropout = dropout
        self.discriminator = self._build_graph(
            vocab_size, emb_size, hidden_size, dropout
        )

    def _build_graph(self, vocab_size, emb_size, hidden_size, dropout):
        """Return a Keras model mapping id sequences to one sigmoid output."""
        token_ids = Input(shape=(None, ), dtype='int32', name='input')
        embedded = Embedding(
            vocab_size, emb_size, mask_zero=False, name='embedding'
        )(token_ids)
        encoded = LSTM(hidden_size)(embedded)
        dropped = Dropout(dropout, name='dropout')(encoded)
        score = Dense(1, activation='sigmoid', name='dense_sigmoid')(dropped)
        return Model(token_ids, score)

    def pre_train(self, d_data, d_pre_episodes, d_pre_weight, d_pre_lr):
        """Compile, fit on ``d_data`` and save weights to ``d_pre_weight``."""
        self.discriminator.compile(Adam(d_pre_lr), 'binary_crossentropy')
        self.discriminator.summary()
        self.discriminator.fit_generator(
            d_data, steps_per_epoch=None, epochs=d_pre_episodes
        )
        self.discriminator.save_weights(d_pre_weight)

    def initialize(self, d_pre_weight):
        """Load weights from ``d_pre_weight`` and recompile with ``self.lr``."""
        self.discriminator.load_weights(d_pre_weight)
        self.discriminator.compile(Adam(self.lr), 'binary_crossentropy')


In [None]:
"""
overview:
SeqGANの学習を行う

args:
各種パラメータ設定値は、本コード中に明記される

output:
フォルダ data に以下の要素が出力される
- pre_generated_sentences.txt: 事前学習した生成器が生成した文章データ
- pre_id_generated_sentences.txt: 事前学習した生成器が生成した文章idデータ
- generated_sentences.txt: 強化学習した生成器が生成した文章データ
- id_generated_sentences.txt: 強化学習した生成器が生成した文章idデータ
フォルダ save に以下の要素が出力される
- pre_d_weights.h5: 事前学習した識別器の重みパラメータ
- pre_g_weights.h5: 事前学習した生成器の重みパラメータ
- adversarial_n_generated_sentences.txt: nエピソード時の生成器が生成した文章データ
- adversarial_n_id_generated_sentences.txt: nエピソード時の生成器が生成した文章idデータ

usage-example:
python3 main.py
"""

import os

import numpy as np
import tensorflow as tf
import tensorflow.keras.backend as K

#モジュールのインポートは省略

# One TF1 session shared by Keras and the hand-written actor graphs.
sess = tf.Session()
K.set_session(sess)


# hyperparameters
batch_size = 30
T = 25  # max_length of sentences
emb_size = 128  # embedding size
g_hidden = 128  # generator hidden size
d_hidden = 64  # discriminator hidden size
g_lr = 1e-3  # generator learning rate in the reinforcement learning
d_lr = 1e-3  # discriminator learning rate in the reinforcement learning
dropout = 0.0

# pretraining parameters
g_pre_lr = 1e-2  # generator pre_training learning rate
d_pre_lr = 1e-2  # discriminator pre_training learning rate
g_pre_episodes = 10  # generator pre_training epochs
d_pre_episodes = 4  # discriminator pre_training epochs
d_epochs = 1

# training parameters
adversarial_nums = 10
g_train_nums = 1  # number of generator train per adversarial learning
d_train_nums = 1  # number of discriminator train per adversarial learning
g_episodes = 50  # sentence num per generator update
n_sampling = 16  # number of monte carlo tree
frequency = 1

# preprocess
# The input files live at absolute paths. They were previously wrapped in
# os.path.join('data', ...), which discards 'data' when the next component
# is absolute, so the resulting paths are unchanged.
input_data = '/content/yonezu_wakati.txt'
id_input_data = '/content/yonezu_id.txt'

# Additional corpus used to train the generator.
input_data2 = '/content/kashi_wakati.txt'
id_input_data2 = '/content/kashi_id.txt'

# Output locations (relative to the working directory).
pre_output_data = os.path.join(
    'data', 'pre_generated_sentences.txt')
pre_id_output_data = os.path.join(
    'data', 'pre_id_generated_sentences.txt')
output_data = os.path.join('data',
                           'generated_sentences.txt')
id_output_data = os.path.join(
    'data', 'id_generated_sentences.txt')
os.makedirs('data/save', exist_ok=True)
g_pre_weight = os.path.join('data', 'save',
                            'pre_g_weights.h5')
d_pre_weight = os.path.join('data', 'save',
                            'pre_d_weights.h5')

# Vocabulary and id file of the target corpus (discriminator positives).
vocab = Vocab(input_data)
vocab_size = vocab.vocab_num
pos_sentence_num = vocab.sentence_num
vocab.write_word2id(input_data, id_input_data)
sampling_num = vocab.data_num

# Vocabulary and id file of the additional corpus (generator training data).
vocab2 = Vocab(input_data2)
vocab_size2 = vocab2.vocab_num
pos_sentence_num2 = vocab2.sentence_num
vocab2.write_word2id(input_data2, id_input_data2)
sampling_num2 = vocab2.data_num

# The discriminator is fed positives encoded with `vocab` and generated
# negatives whose ids range over `vocab2`, so its embedding must cover the
# larger of the two id ranges; otherwise generated ids can fall outside the
# embedding table.
# NOTE(review): the two vocabularies number words independently, so the
# same id can stand for different words in positives and negatives -
# consider encoding both corpora with one shared Vocab.
env = Environment(batch_size, max(vocab_size, vocab_size2), emb_size,
                  d_hidden, T, dropout, d_lr)
agent = Agent(sess, vocab_size2, emb_size, g_hidden, T,
              g_lr)

def pre_train():
    """Pre-train the generator, sample from it, then pre-train the discriminator."""
    # Generator: supervised pre-training on the additional corpus (vocab2).
    g_data = DataForGenerator(
        ids_path=id_input_data2,
        batch_size=batch_size,
        T=T,
        vocab=vocab2,
    )
    agent.pre_train(g_data, g_pre_episodes, g_pre_weight, g_pre_lr)

    # Sample sentences from the pre-trained generator; they are written both
    # as ids and, converted through vocab2, as words.
    agent.generate_id_samples(
        agent.generator, T, sampling_num2, pre_id_output_data)
    vocab2.write_id2word(pre_id_output_data, pre_output_data)

    # Discriminator: target-corpus ids are positives, samples are negatives.
    d_data = DataForDiscriminator(
        pos_path=id_input_data,
        neg_path=pre_id_output_data,
        batch_size=batch_size,
        T=T,
        vocab=vocab,
    )
    env.pre_train(d_data, d_pre_episodes, d_pre_weight, d_pre_lr)


def train():
    """Run adversarial training from the pre-trained weights.

    Every round trains the generator ``g_train_nums`` times and the
    discriminator ``d_train_nums`` times; every ``frequency``-th round a
    set of generated sentences is written out.
    """
    agent.initialize(g_pre_weight)
    env.initialize(d_pre_weight)

    for round_index in range(adversarial_nums):
        print('---------------------------------------------')
        print('Adversarial Training: ', round_index + 1)

        for _ in range(g_train_nums):
            g_train()
        print('Generator is trained')

        for _ in range(d_train_nums):
            d_train()
        print('Discriminator is trained')

        if round_index % frequency != 0:
            continue
        # Samples are generated and decoded with the generator's vocabulary.
        sentences_history(round_index, agent, T, vocab2, sampling_num2)


def g_train():
    """Generate ``g_episodes`` sentences and update the generator once.

    For every step of every sentence the state, the sampled action, its
    Monte Carlo reward and the LSTM state that preceded the step are
    collected; the generator is then updated on the whole collection and
    its weights are copied to the rollouter.
    """
    # Accumulators over all episodes; they start empty with shape (1, 0).
    batch_states = np.array([[]], dtype=np.int32)
    batch_actions = np.array([[]], dtype=np.int32)
    batch_rewards = np.array([[]], dtype=np.float32)
    batch_hs = np.array([[]], dtype=np.float32)
    batch_cs = np.array([[]], dtype=np.float32)
    for g_episode in range(g_episodes):
        agent.reset_rnn_states()
        # Every sentence starts from the BOS token.
        states = np.zeros([1, 1], dtype=np.int32)
        states[:, 0] = vocab2.BOS          #vocab -> vocab2
        actions = np.array([[]], dtype=np.int32)
        rewards = np.array([[]], dtype=np.float32)
        # Row 0 is the zero state used for the first step.
        hs = np.zeros([1, g_hidden], dtype=np.float32)
        cs = np.zeros([1, g_hidden], dtype=np.float32)
        for step in range(T):
            action, next_h, next_c = agent.get_action(
                states)
            agent.rollouter.reset_rnn_state()
            reward = mc_search(step, states, action,
                               next_h, next_c)
            states = np.concatenate([states, action],
                                    axis=-1)
            rewards = np.concatenate([rewards, reward],
                                     axis=-1)
            actions = np.concatenate([actions, action],
                                     axis=-1)
            hs = np.concatenate([hs, next_h], axis=0)
            cs = np.concatenate([cs, next_c], axis=0)
        # Drop the final entries so that states/hs/cs hold, for each of the
        # T steps, the input and LSTM state from before that step.
        states = states[:, :-1]
        hs = hs[:-1]
        cs = cs[:-1]
        batch_states = np.concatenate(
            [batch_states, states], axis=-1)
        batch_actions = np.concatenate(
            [batch_actions, actions], axis=-1)
        batch_rewards = np.concatenate(
            [batch_rewards, rewards], axis=-1)
        # np.append flattens its inputs, hence the reshape to rows of size
        # g_hidden.
        batch_hs = np.append(batch_hs,
                             hs).reshape(-1, g_hidden)
        batch_cs = np.append(batch_cs,
                             cs).reshape(-1, g_hidden)
    agent.generator.update(batch_states, batch_actions,
                           batch_rewards, batch_hs,
                           batch_cs)
    # Keep the rollouter in sync with the updated generator.
    agent.inherit_weights(agent.generator,
                          agent.rollouter)


def d_train():
    """Run one round of discriminator training on newly generated samples.

    Works entirely on module-level globals: asks ``agent`` to generate id
    samples into ``id_output_data``, has ``vocab2`` write the word version
    to ``output_data``, then fits ``env.discriminator`` for a single epoch
    on a ``DataForDiscriminator`` built from the input and generated id
    files. Returns nothing.
    """
    # Generate id sequences with the current generator and write them out.
    agent.generate_id_samples(
        agent.generator, T, sampling_num2, id_output_data)
    vocab2.write_id2word(id_output_data, output_data)

    # NOTE(review): `vocab` is passed here while the calls above use
    # `vocab2` -- confirm this is intentional.
    discriminator_batches = DataForDiscriminator(
        id_input_data, id_output_data, batch_size, T, vocab)

    env.discriminator.fit_generator(
        discriminator_batches, steps_per_epoch=None, epochs=1)


def sentences_history(episode, agent, T, vocab,
                      sampling_num):
    """Write the generator's samples for one adversarial episode to ``data/``.

    Args:
        episode: Zero-based episode index; file names use ``episode + 1``.
        agent: Object exposing ``generator`` and ``generate_id_samples``.
        T: Passed through to ``agent.generate_id_samples``.
        vocab: Object exposing ``write_id2word(id_path, text_path)``.
        sampling_num: Passed through to ``agent.generate_id_samples``.
    """
    episode_number = episode + 1

    def history_path(template):
        # Both history files live under 'data' and differ only in template.
        return os.path.join('data', template.format(episode_number))

    id_path = history_path('adversarial_{}_id_generated_sentences.txt')
    text_path = history_path('adversarial_{}_generated_sentences.txt')

    agent.generate_id_samples(agent.generator, T, sampling_num, id_path)
    vocab.write_id2word(id_path, text_path)


def mc_search(step, states, action, next_h, next_c):
    """Return the discriminator-based reward for ``action`` at ``step``.

    On the last step (``step >= T - 1``) the sequence ``states[:, 1:]``
    followed by ``action`` is scored directly by ``env.discriminator``.
    On earlier steps the reward is the mean of the discriminator's
    predictions over ``n_sampling`` rollouts produced by ``agent.rollout``.

    Relies on the module-level ``agent``, ``env``, ``T`` and ``n_sampling``.
    """
    agent.rollouter.reset_rnn_state()

    if step >= T - 1:
        # Sequence is complete: drop the first column of `states`, append
        # the final action, and score it as is.
        finished = np.concatenate([states[:, 1:], action], axis=-1)
        return env.discriminator.predict(finished)

    # The rollouter state is set once, before the sampling loop.
    agent.rollouter.set_rnn_state(next_h, next_c)
    mean_reward = np.zeros([1, 1], dtype=np.float32)
    for _ in range(n_sampling):
        rolled_out = agent.rollout(step, states, action)
        mean_reward += env.discriminator.predict(rolled_out) / n_sampling
    return mean_reward


# Script entry point: run pre_train() first, then train().
if __name__ == "__main__":
    pre_train()
    train()


パーセプトロンによる生成データの精度確認

In [None]:
#coding=UTF-8

import pandas as pd
import requests
import nltk
from nltk.corpus import wordnet as wn

# Build the lyrics/artist DataFrame `df`.
#
# Each entry pairs a lyrics file (one row per line of the file) with the
# artist label assigned to every line read from it.
LYRICS_DIR = 'Desktop/Python3'
LYRICS_SOURCES = [
    ('AKB.txt', 'AKB48'),
    ('arashi.txt', '嵐'),
    ('BUMP.txt', 'BUMP OF CHICKEN'),
    ('ikimonogakari.txt', 'いきものがかり'),
    ('nakajimamiyuki.txt', '中島みゆき'),
    ('nishinokana.txt', '西野カナ'),
    ('poruno.txt', 'ポルノグラフティ'),
    ('shiinaringo.txt', '椎名林檎'),
    ('SMAP.txt', 'SMAP'),
    ('yonedu_kashi.txt', '米津玄師'),
]

lst = []         # lyric lines, in file order
lst_artist = []  # artist label for the line at the same position in `lst`

for file_name, artist in LYRICS_SOURCES:
    # NOTE(review): files are opened with the platform default encoding;
    # confirm it matches the encoding the lyrics files were written with.
    with open(f'{LYRICS_DIR}/{file_name}', 'r') as f:
        lines = f.readlines()
    lst.extend(lines)
    # Labels are derived from the number of lines actually read, so lyrics
    # and labels cannot drift out of step when a file's line count changes.
    lst_artist.extend([artist] * len(lines))

df = pd.DataFrame({'lyrics': lst, 'artist': lst_artist})


import argparse
import csv
import os
import pickle

import pandas as pd
import MeCab
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder



#データセットの前処理
def tokenize(tagger, text):
    """Return ``tagger.parse(text)`` with surrounding whitespace stripped.

    Args:
        tagger: Object exposing ``parse(text)`` that returns a string.
        text: Text handed to the tagger.
    """
    return tagger.parse(text).strip()



# Command-line options: --out is the directory the pickles are written to.
# --dir is accepted but not read by this step.
parser = argparse.ArgumentParser()
parser.add_argument('--dir')
parser.add_argument('--out', default='data')
args = parser.parse_args()

# Tokenize every lyric through the shared tokenize() helper so training
# text is preprocessed exactly like the text tokenized at prediction time.
tagger = MeCab.Tagger('-Owakati')
df['lyrics'] = df['lyrics'].map(lambda x: tokenize(tagger, str(x)))
print(df.head(10))

# Split into training and test data.
train, test = train_test_split(df)

vectorizer = TfidfVectorizer(
    input='content')
# Fit the feature vectorizer on the training data and transform it.
train_titles = train['lyrics']
train_X = vectorizer.fit_transform(train_titles)

# Apply the fitted vectorizer to the test data.
test_titles = test['lyrics']
test_X = vectorizer.transform(test_titles)

label_encoder = LabelEncoder()
# Fit the label-to-id encoder on the training labels and transform them.
train_categories = train['artist']
train_y = label_encoder.fit_transform(train_categories)

# Apply the fitted label encoder to the test labels.
test_categories = test['artist']
test_y = label_encoder.transform(test_categories)

# Create the output directory if needed; exist_ok avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(args.out, exist_ok=True)

train_file = os.path.join(args.out, 'train.pickle')
test_file = os.path.join(args.out, 'test.pickle')
vectorizer_file = os.path.join(args.out, 'vectorizer.pickle')
label_encoder_file = os.path.join(args.out, 'label_encoder.pickle')

# Persist both data splits and the fitted transformers for the later steps.
for path, payload in (
    (train_file, [train_X, train_y]),
    (test_file, [test_X, test_y]),
    (vectorizer_file, vectorizer),
    (label_encoder_file, label_encoder),
):
    with open(path, 'wb') as f:
        pickle.dump(payload, f)



#分類器の学習
import argparse
import pickle
import os

from sklearn.linear_model import Perceptron
from sklearn.metrics import precision_recall_fscore_support

# Command-line options: --dir is the directory holding the pickled data.
parser = argparse.ArgumentParser()
parser.add_argument('--dir', default='data')
args = parser.parse_args()

# Load the pickled training split and report its shapes.
train_file = os.path.join(args.dir, 'train.pickle')
with open(train_file, 'rb') as f:
    train_X, train_y = pickle.load(f)
print('train', train_X.shape, train_y.shape)

# Load the pickled test split and report its shapes.
test_file = os.path.join(args.dir, 'test.pickle')
with open(test_file, 'rb') as f:
    test_X, test_y = pickle.load(f)
print('test:', test_X.shape, test_y.shape)

model = Perceptron(penalty='l2', shuffle=True, verbose=2)

# Train the perceptron, then predict on the test data.
model.fit(train_X, train_y)
test_y_pred = model.predict(test_X)

# Compute micro-averaged precision, recall and F-score on the test data.
scores = precision_recall_fscore_support(
    test_y, test_y_pred, average='micro')
precision, recall, fscore, _ = scores

for caption, value in (('Precision:', precision),
                       ('Recall:', recall),
                       ('F-score:', fscore)):
    print(caption, value)

# Save the trained model next to the data pickles.
model_file = os.path.join(args.dir, 'model.pickle')
with open(model_file, 'wb') as f:
    pickle.dump(model, f)


import argparse
import os
import pickle

import MeCab

# NOTE: tokenize() used below is the preprocessing helper defined earlier in this file.


# Command-line options: --dir is the directory holding the pickled artifacts.
parser = argparse.ArgumentParser()
parser.add_argument('--dir', default='data')
args = parser.parse_args()

model_file = os.path.join(args.dir, 'model.pickle')
label_encoder_file = os.path.join(args.dir, 'label_encoder.pickle')
vectorizer_file = os.path.join(args.dir, 'vectorizer.pickle')

# Unpickle the model, label encoder and vectorizer, in that order.
loaded = []
for path in (model_file, label_encoder_file, vectorizer_file):
    with open(path, 'rb') as f:
        loaded.append(pickle.load(f))
model, label_encoder, vectorizer = loaded

tagger = MeCab.Tagger('-Owakati')

# Text to classify (placeholder string).
text = 'ここにテキストを入力'
tokenized = tokenize(tagger, text)

x = vectorizer.transform([tokenized])
y = model.predict(x)
# Convert the predicted label id back to its original label.
label = label_encoder.inverse_transform(y)[0]
print('Tokenized:', tokenized)
print('Label:', label)