In [1]:
# Mount Google Drive at /content/drive so files stored there can be opened by path.
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [2]:
import random
import os

import keras
import numpy as np
from keras.callbacks import LambdaCallback
from keras.models import Input, Model, load_model
from keras.layers import LSTM, Dropout, Dense
from keras.optimizers import Adam

#from data_utils import *


Using TensorFlow backend.


In [0]:
def preprocess_file(Config):
    """Read the poetry corpus and build the character vocabulary.

    Every line of ``Config.poetry_file`` is stripped, suffixed with the
    poem terminator ``]`` and split on ``:``; the second field is taken as
    the poem text. A poem is kept only when it is longer than five
    characters and its sixth character is the full-width comma ``，``
    (i.e. its first line has five characters).

    Lines that are blank or contain no ``:`` are skipped instead of
    raising ``IndexError``.

    Args:
        Config: object exposing a ``poetry_file`` path attribute.

    Returns:
        A 4-tuple ``(word2numF, num2word, words, files_content)``:
          * ``word2numF`` - callable mapping a character to its index;
            characters outside the vocabulary map to the last index.
          * ``num2word`` - dict mapping index to character.
          * ``words`` - tuple of vocabulary characters ordered by
            descending frequency (ties in ascending character order),
            followed by a single ``" "`` entry used for unknown characters.
          * ``files_content`` - all kept poems concatenated, each ending
            with ``]``.
    """
    # Collect the kept poems in a list and join once at the end.
    kept_poems = []
    with open(Config.poetry_file, 'r', encoding='UTF-8') as f:
        for line in f:
            fields = (line.strip() + "]").split(":")
            if len(fields) < 2:
                # Blank line or no title separator: nothing to extract.
                continue
            poem = fields[1]
            if len(poem) <= 5:
                continue
            if poem[5] == '，':
                kept_poems.append(poem)
    files_content = ''.join(kept_poems)

    # Character frequencies over the kept text (the ']' terminator included).
    counted_words = {}
    for ch in files_content:
        counted_words[ch] = counted_words.get(ch, 0) + 1

    # Drop characters seen at most twice, then order by descending count.
    # The secondary key keeps ties in ascending character order.
    word_pairs = sorted(
        ((ch, count) for ch, count in counted_words.items() if count > 2),
        key=lambda pair: (-pair[1], pair[0]),
    )

    # Building the tuple directly also works when nothing survives the filter.
    words = tuple(ch for ch, _ in word_pairs) + (" ",)

    # Character <-> index lookup tables.
    word2num = dict((c, i) for i, c in enumerate(words))
    num2word = dict((i, c) for i, c in enumerate(words))
    unknown_index = len(words) - 1

    def word2numF(x):
        """Return the index of ``x``, or the last index when ``x`` is unknown."""
        return word2num.get(x, unknown_index)

    return word2numF, num2word, words, files_content

In [0]:
class PoetryModel(object):
    """Character-level LSTM that generates four-line, five-character poems.

    The corpus is read through ``preprocess_file``. The network receives
    the previous ``config.max_len`` characters, one-hot encoded, and
    predicts a distribution over the next character.

    Attributes:
        model: the Keras model, or ``None`` until it is loaded or built.
        do_train: the ``train`` constructor argument. NOTE(review): it is
            stored but never consulted; whether training runs depends only
            on whether ``config.weight_file`` exists.
        config: settings object (``max_len``, ``batch_size``,
            ``learning_rate``, ``weight_file``, ``folder_path``, ...).
        poems: ``files_content`` split on ``]``; because every poem ends
            with ``]`` the last entry is an empty string.
        poems_num: ``len(poems)``.
    """

    def __init__(self, config, train):
        """Preprocess the corpus, then load saved weights or train.

        Args:
            config: settings object, see the class docstring.
            train: stored as ``self.do_train``.
        """
        self.model = None
        self.do_train = train
        self.loaded_model = True
        self.config = config

        # Preprocess the corpus.
        self.word2numF, self.num2word, self.words, self.files_content = preprocess_file(self.config)

        # List of poems.
        self.poems = self.files_content.split(']')
        # Number of entries in the list.
        self.poems_num = len(self.poems)

        # Load the saved model when it exists, otherwise start training.
        if os.path.exists(self.config.weight_file) and self.loaded_model:
            self.model = load_model(self.config.weight_file)
        else:
            self.train()

    def build_model(self):
        '''Build and compile the two-layer LSTM network into ``self.model``.'''
        print('building model')

        # Input is (max_len, vocabulary size): one one-hot row per character.
        input_tensor = Input(shape=(self.config.max_len, len(self.words)))
        lstm = LSTM(512, return_sequences=True)(input_tensor)
        dropout = Dropout(0.6)(lstm)
        lstm = LSTM(256)(dropout)
        dropout = Dropout(0.6)(lstm)
        dense = Dense(len(self.words), activation='softmax')(dropout)
        self.model = Model(inputs=input_tensor, outputs=dense)
        optimizer = Adam(lr=self.config.learning_rate)
        self.model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])

    def sample(self, preds, temperature=1.0):
        '''Draw one index from ``preds`` after temperature re-weighting.

        Each probability is raised to ``1 / temperature`` and the result is
        renormalised, so:
          * ``temperature == 1.0`` samples from ``preds`` unchanged;
          * ``temperature < 1.0`` sharpens the distribution (more
            conservative output);
          * ``temperature > 1.0`` flattens it (more varied output).

        Args:
            preds: 1-D sequence of probabilities.
            temperature: positive re-weighting factor.

        Returns:
            The sampled index as an ``int``.
        '''
        preds = np.asarray(preds).astype('float64')
        exp_preds = np.power(preds, 1. / temperature)
        preds = exp_preds / np.sum(exp_preds)
        pro = np.random.choice(range(len(preds)), 1, p=preds)
        return int(pro.squeeze())

    def generate_sample_result(self, epoch, logs):
        '''Print and log sample poems on every fourth epoch.

        Used as the ``on_epoch_end`` callback. Samples are appended to
        ``<folder_path>/out/out.txt``; the directory is created when it is
        missing, and a sample that could not be generated is printed but
        not written.
        '''
        if epoch % 4 != 0:
            return

        out_path = os.path.join(self.config.folder_path, 'out/out.txt')
        out_dir = os.path.dirname(out_path)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)

        with open(out_path, 'a', encoding='utf-8') as f:
            f.write('==================Epoch {}=====================\n'.format(epoch))

            print("\n==================Epoch {}=====================".format(epoch))
            for diversity in [0.7, 1.0, 1.3]:
                print("------------Diversity {}--------------".format(diversity))
                generate = self.predict_random(temperature=diversity)
                print(generate)
                # predict_random returns None when it cannot generate.
                if generate is not None:
                    f.write(generate + '\n')

    def _random_poem(self, min_len):
        '''Return a random poem with at least ``min_len`` characters, or None.'''
        candidates = [poem for poem in self.poems if len(poem) >= min_len]
        if not candidates:
            return None
        return random.choice(candidates)

    def _seed_context(self, first_char):
        '''Return the last ``max_len - 1`` characters of a random poem plus ``first_char``.

        Returns None when no poem is long enough to supply the context.
        '''
        tail_len = self.config.max_len - 1
        poem = self._random_poem(tail_len)
        if poem is None:
            return None
        # Explicit start index so that tail_len == 0 yields an empty tail.
        return poem[len(poem) - tail_len:] + first_char

    def predict_random(self, temperature=1):
        '''Pick the opening of a random poem and generate the rest from it.'''
        if not self.model:
            print('model not loaded')
            return

        # Only poems long enough to supply max_len characters are eligible;
        # with none available predict_sen reports the length problem.
        poem = self._random_poem(self.config.max_len) or ''
        sentence = poem[: self.config.max_len]
        generate = self.predict_sen(sentence, temperature=temperature)
        return generate

    def predict_first(self, char, temperature=1):
        '''Generate a poem that starts with the given first character.'''
        if not self.model:
            print('model not loaded')
            return

        # Seed with the tail of a random poem followed by the given character.
        sentence = self._seed_context(char)
        if sentence is None:
            print('no poem long enough to seed generation')
            return
        generate = str(char)
        # Predict the following 23 characters.
        generate += self._preds(sentence, length=23, temperature=temperature)
        return generate

    def predict_sen(self, text, temperature=1):
        '''Generate a poem from a first line.

        The last ``max_len`` characters of ``text`` (the first line
        including its comma) seed the model; ``24 - max_len`` further
        characters are predicted.
        '''
        if not self.model:
            return
        max_len = self.config.max_len
        if len(text) < max_len:
            print('length should not be less than ', max_len)
            return

        sentence = text[-max_len:]
        print('the first line:', sentence)
        generate = str(sentence)
        generate += self._preds(sentence, length=24 - max_len, temperature=temperature)
        return generate

    def predict_hide(self, text, temperature=1):
        '''Generate an acrostic poem whose four lines start with the four characters of ``text``.'''
        if not self.model:
            print('model not loaded')
            return
        if len(text) != 4:
            print('藏頭詩輸入必須是4個字！')
            return

        # Seed with the tail of a random poem followed by the first head character.
        sentence = self._seed_context(text[0])
        if sentence is None:
            print('no poem long enough to seed generation')
            return
        print('first line = ', sentence)

        generate = ''
        for position, head in enumerate(text):
            if position:
                # The first head character is already part of the seed.
                sentence = sentence[1:] + head
            generate += head
            # Five predicted characters complete the line.
            for _ in range(5):
                next_char = self._pred(sentence, temperature)
                sentence = sentence[1:] + next_char
                generate += next_char

        return generate

    def _preds(self, sentence, length=23, temperature=1):
        '''Predict ``length`` characters, sliding the context window each step.

        Args:
            sentence: seed text; its last ``max_len`` characters are used,
                matching what ``_pred`` consumes.
            length: number of characters to predict.
            temperature: forwarded to ``sample``.

        Returns:
            The predicted characters as a string. It is shorter than
            ``length`` when ``_pred`` cannot predict from the seed.
        '''
        sentence = sentence[-self.config.max_len:]
        generate = ''
        for _ in range(length):
            pred = self._pred(sentence, temperature)
            if pred is None:
                # _pred already reported the problem.
                break
            generate += pred
            sentence = sentence[1:] + pred
        return generate

    def _pred(self, sentence, temperature=1):
        '''Predict a single character from the last ``max_len`` characters of ``sentence``.

        Prints a message and returns None when ``sentence`` is too short.
        '''
        if len(sentence) < self.config.max_len:
            print('in def _pred,length error ')
            return

        sentence = sentence[-self.config.max_len:]
        x_pred = np.zeros((1, self.config.max_len, len(self.words)))
        for t, char in enumerate(sentence):
            x_pred[0, t, self.word2numF(char)] = 1.
        preds = self.model.predict(x_pred, verbose=0)[0]
        next_index = self.sample(preds, temperature=temperature)
        next_char = self.num2word[next_index]

        return next_char

    def data_generator(self):
        '''Yield one-hot ``(x, y)`` training pairs indefinitely.

        ``x`` has shape ``(1, max_len, vocabulary size)`` and encodes a
        window of ``max_len`` characters; ``y`` has shape
        ``(1, vocabulary size)`` and encodes the character that follows.
        Windows containing the poem terminator ``]`` in either part are
        skipped. After the last window the generator starts again from the
        beginning of the corpus. It stops when the corpus holds no usable
        window at all.
        '''
        content = self.files_content
        max_len = self.config.max_len
        vocab_size = len(self.words)
        # Largest start index whose target character still exists.
        last_start = len(content) - max_len - 1

        i = 0
        produced = False
        while True:
            if i > last_start:
                if not produced:
                    # A full pass produced nothing: avoid spinning forever.
                    return
                i = 0
                produced = False

            x = content[i: i + max_len]
            y = content[i + max_len]

            if ']' in x or ']' in y:
                i += 1
                continue

            # Built-in bool: the np.bool alias no longer exists in current NumPy.
            y_vec = np.zeros(shape=(1, vocab_size), dtype=bool)
            y_vec[0, self.word2numF(y)] = 1.0

            x_vec = np.zeros(shape=(1, max_len, vocab_size), dtype=bool)
            for t, char in enumerate(x):
                x_vec[0, t, self.word2numF(char)] = 1.0

            produced = True
            yield x_vec, y_vec
            i += 1

    def train(self):
        '''Train the model from ``data_generator``, saving a checkpoint after every epoch.

        The epoch count is derived from the corpus size. Each epoch runs
        ``config.batch_size`` generator steps, and every generator item
        holds a single sample.
        '''
        print('training')
        number_of_epoch = len(self.files_content) - (self.config.max_len + 1) * self.poems_num
        number_of_epoch /= self.config.batch_size
        number_of_epoch = int(number_of_epoch / 1.5)
        print('epoches = ', number_of_epoch)
        print('poems_num = ', self.poems_num)
        print('len(self.files_content) = ', len(self.files_content))

        if not self.model:
            self.build_model()

        self.model.fit_generator(
            generator=self.data_generator(),
            verbose=True,
            steps_per_epoch=self.config.batch_size,
            epochs=number_of_epoch,
            callbacks=[
                keras.callbacks.ModelCheckpoint(self.config.weight_file, save_weights_only=False),
                LambdaCallback(on_epoch_end=self.generate_sample_result)
            ]
        )



In [0]:
class Config(object):
    """File locations and hyperparameters for the poetry model."""
    # Project folder; a relative path, so it is resolved against the working directory.
    folder_path = 'drive/My Drive/class/勞動部/week9/RNN/poems_gen'
    # Corpus file inside the project folder.
    poetry_file = os.path.join(folder_path, 'dataset/poetry2.txt') 
    # Saved model file inside the project folder.
    weight_file = os.path.join(folder_path, 'poetry_model2.h5')
    # Alternative model files used previously:
    #weight_file = os.path.join(folder_path, 'poetry_model20200507.h5')
    # weight_file = 'poetry_model_test.h5'
    # Context length: the first six characters predict the seventh.
    max_len = 6
    batch_size = 32
    learning_rate = 0.001


In [0]:
# Instantiate the settings and the poetry model.
config = Config()
# Alternative call with the train flag set:
#model = PoetryModel(config, train=True)
model = PoetryModel(config, train=False)

In [9]:
# Generate three acrostic poems from the same four head characters.
for i in range(3):
    # Acrostic poem
    # Alternative input: sen = model.predict_hide('中午飲水')
    sen = model.predict_hide('深度學習')
    print(sen)

first line =  句寄須頻。深
深川白軍秋，度分對遠華。學吹亭青望，習春依柳微。
first line =  沙埋皓齒。深
深徒悠神為，度和君春天。學不如身白，習成還懸閒。
first line =  夜蒼龍是。深
深不君秋何，度事士相去。學有蒼猶可，習古發傳煙。


In [0]:
# Generate three poems that continue the same given first line.
for i in range(3):
    # Supply the first line
    # Alternative input: sen = model.predict_sen('山為斜好機，')
    sen = model.predict_sen('白日依山盡，')
    print(sen)

the first line: 白日依山盡，
白日依山盡，江雲西海石。中發無日草，草多葉難動。
the first line: 白日依山盡，
白日依山盡，去相州合未。老鄉別西更，雨林過逢下。
the first line: 白日依山盡，
白日依山盡，吹鄉復里月。欲自中年山，方客見事家。


In [0]:
# Generate three poems that start with the same given character.
for i in range(3):
    # Supply the first character
    sen = model.predict_first('新')
    print(sen)

新陽傳會與，故沙竹行海。鳳何上還別，太遠君空連。
新陽驚門言，難將塵蒼已。萬雲非徒將，望門來清落。
新天海自花，楚開夕花鳥。萬此見者衣，越東生動無。


In [0]:
# Generate one poem per sampling temperature.
for temp in [0.5,1,1.5]:
    # Continue from a randomly chosen first line
    sen = model.predict_random(temperature=temp)
    print(sen)

the first line: 見說南來處，
見說南來處，獨空道水國。百雲木何遊，入鳥春無水。
the first line: 氣殺高隼擊，
氣殺高隼擊，白江白永人。來深晚歲夕，木數光親雨。
the first line: 夜泊江門外，
夜泊江門外，有枝齊子望。暫人暗見萬，清城宿望山。
