# Character Level RNN 字母级RNN
### 这个文件用没有全部完成，可以运行，但训练时没有收敛，也就是说不成功

# Introduction

In this notebook, we're going to be looking at how you can generate text that is similar to the input text that you give the network. This work is inspired by the great [blog post](http://karpathy.github.io/2015/05/21/rnn-effectiveness/) by Andrej Karpathy and most of the code is adapted from [this tutorial](http://machinelearningmastery.com/text-generation-lstm-recurrent-neural-networks-python-keras/). The network that we're going to be building is a **character level recurrent neural network**.

In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd

# Load in Input Text

We'll be loading in a book from Thomas Paine, obtained from this [website](http://www.textfiles.com/etext/NONFICTION/). We'll need to clean up the sentences in the file, mainly removing the extra spaces.

In [2]:
# Removes punctuation, parentheses, question marks, etc., and leaves only alphanumeric characters
# 除了空格，其他的标点符号都去除了，这一点好像不是很合理
import re
strip_special_chars = re.compile("[^A-Za-z0-9 ]+")

def cleanSentences(string):
    string = string.lower().replace("       ", " ")
    string = string.lower().replace("     ", " ")
    string = string.lower().replace("    ", " ")
    string = string.lower().replace("   ", " ")
    string = string.lower().replace("  ", " ")
    string = string.lower().replace("   ", " ")
    return re.sub(strip_special_chars, "", string.lower())

In [3]:
allText = ""
with open("Data/paine.txt", "r") as f:
    lines=f.readlines()
    numWords = 0
    for line in lines:
        allText += (cleanSentences(line))
        numWords += len(line.split())
    chars = sorted(list(set(allText)))

nChars = len(allText)
nVocab = len(chars)
seqLength = 100
print ("字典中一共有%d个单词"%numWords)
print ("文本中一共有%d个字母级部分"%nVocab)
print ("文本中一共有%d个单词"%nChars)


字典中一共有7280个单词
文本中一共有37个字母级部分
文本中一共有40064个单词


In [4]:
chars

[' ',
 '0',
 '1',
 '2',
 '3',
 '4',
 '5',
 '6',
 '7',
 '8',
 '9',
 'a',
 'b',
 'c',
 'd',
 'e',
 'f',
 'g',
 'h',
 'i',
 'j',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 'q',
 'r',
 's',
 't',
 'u',
 'v',
 'w',
 'x',
 'y',
 'z']

In [5]:
charToInt = dict((c, i) for i, c in enumerate(chars))

In [6]:
charToInt

{' ': 0,
 '0': 1,
 '1': 2,
 '2': 3,
 '3': 4,
 '4': 5,
 '5': 6,
 '6': 7,
 '7': 8,
 '8': 9,
 '9': 10,
 'a': 11,
 'b': 12,
 'c': 13,
 'd': 14,
 'e': 15,
 'f': 16,
 'g': 17,
 'h': 18,
 'i': 19,
 'j': 20,
 'k': 21,
 'l': 22,
 'm': 23,
 'n': 24,
 'o': 25,
 'p': 26,
 'q': 27,
 'r': 28,
 's': 29,
 't': 30,
 'u': 31,
 'v': 32,
 'w': 33,
 'x': 34,
 'y': 35,
 'z': 36}

In [7]:
# prepare the dataset of input to output pairs encoded as integers
# 把文本分组，以seqLength长度为一组，这里seqLength=100,一共组成(nChars-seqLength)组
dataX = []
dataY = []
for i in range(0, nChars - seqLength, 1):  # 0,1,2,3,...nChars-seqLength
    seq_in = allText[i:i + seqLength]
    seq_out = allText[i + seqLength]
    dataX.append([charToInt[char] for char in seq_in])
    dataY.append(charToInt[seq_out])
nExamples = len(dataX)
print ("Total Examples: ", nExamples)

('Total Examples: ', 39964)


In [8]:
print("每一个序列的长度为%d"%len(dataX[0]))
print(dataX[0])
print(dataY[0]) #Y从第100个字母开始

每一个序列的长度为100
[0, 2, 5, 0, 26, 11, 17, 15, 0, 26, 28, 19, 24, 30, 25, 31, 30, 0, 28, 15, 26, 28, 25, 14, 31, 13, 19, 12, 22, 15, 0, 15, 22, 15, 13, 30, 28, 25, 24, 19, 13, 0, 26, 31, 12, 22, 19, 29, 18, 19, 24, 17, 0, 13, 11, 24, 0, 14, 15, 16, 15, 11, 30, 0, 13, 15, 24, 29, 25, 28, 29, 18, 19, 26, 0, 0, 0, 30, 18, 19, 29, 0, 16, 19, 22, 15, 0, 19, 30, 29, 0, 26, 28, 19, 24, 30, 25, 31, 30, 0]
25


In [9]:
# reshape X to be [samples, time steps, features]
X = np.reshape(dataX, (nExamples, seqLength, 1)) # 这样重构后，其实只有一个输入节点
# normalize 除以37
X = X / float(nVocab)
# one hot encode the output variable
# 把输出置为one-hot-coding
y = np.zeros([nExamples, nVocab])
for i, example in enumerate(dataY):
    lis = np.zeros(nVocab)
    lis[example] = 1
    y[i] = lis

In [10]:
# 举例说明
y[1]

array([ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.])

# Tensorflow Model

In [11]:
batchSize = 24
lstmUnits = 48
iterations = 100000
numDimensions = 1
numClasses = nVocab

In [12]:
import tensorflow as tf
from tensorflow.contrib import rnn
tf.reset_default_graph()
learning_rate = 0.001 # 学习速率

labels = tf.placeholder(tf.float32, [None, numClasses])
input_data = tf.placeholder(tf.float32, [None, seqLength, numDimensions])

lstmCell = tf.contrib.rnn.BasicLSTMCell(lstmUnits)
#lstmCell = tf.contrib.rnn.DropoutWrapper(cell=lstmCell, output_keep_prob=0.85)
value, _ = tf.nn.dynamic_rnn(lstmCell, input_data, dtype=tf.float32)
#value,_ = rnn.static_rnn(lstmCell, input_data, dtype=tf.float32)

weight = tf.Variable(tf.truncated_normal([lstmUnits, numClasses]))
bias = tf.Variable(tf.constant(0.1, shape=[numClasses]))
value = tf.transpose(value, [1, 0, 2])
last = tf.gather(value, int(value.get_shape()[0]) - 1)
prediction = (tf.matmul(last, weight) + bias)

correctPred = tf.equal(tf.argmax(prediction,1), tf.argmax(labels,1))
accuracy = tf.reduce_mean(tf.cast(correctPred, tf.float32))

loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=prediction, labels=labels))
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(loss)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


# Training

In [13]:
import datetime

sess = tf.InteractiveSession()
tf.summary.scalar('Loss', loss)
tf.summary.scalar('Accuracy', accuracy)
merged = tf.summary.merge_all()
logdir = "tensorboard/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S") + "/"
writer = tf.summary.FileWriter(logdir, sess.graph)

In [None]:
from random import randint
def getTrainBatch():
    num = randint(0,nExamples - batchSize - 1)
    labels = y[num:num+batchSize]
    arr = X[num:num+batchSize]
    return arr, labels

In [None]:
saver = tf.train.Saver()
sess.run(tf.global_variables_initializer())
avg_cost = 0.

for i in range(iterations):
    #Next Batch of reviews
    nextBatch, nextBatchLabels = getTrainBatch();
    _,c = sess.run([optimizer,loss], {input_data: nextBatch, labels: nextBatchLabels})
    avg_cost =c
    #Write summary to Tensorboard
    if (i % 25 == 0):
        #summary = sess.run(merged, {input_data: nextBatch, labels: nextBatchLabels})
        #writer.add_summary(summary, i)
        c,acc = sess.run([loss,accuracy], {input_data: nextBatch, labels: nextBatchLabels})
        print "iter:", '%04d' % (i+1), "cost=", \
                "{:.9f}".format(avg_cost)+", Training acc:"+"{:.9f}".format(acc)

# writer.close()# 不想用Tensorboard

iter: 0001 cost= 3.681311131, Training acc:0.041666668
iter: 0026 cost= 3.205077887, Training acc:0.208333343
iter: 0051 cost= 3.046351910, Training acc:0.166666672
iter: 0076 cost= 3.029627800, Training acc:0.166666672
iter: 0101 cost= 2.871790171, Training acc:0.125000000
iter: 0126 cost= 2.880517483, Training acc:0.166666672
iter: 0151 cost= 2.701065540, Training acc:0.166666672
iter: 0176 cost= 2.835211515, Training acc:0.166666672
iter: 0201 cost= 2.915950537, Training acc:0.125000000
iter: 0226 cost= 2.978018522, Training acc:0.166666672
iter: 0251 cost= 2.728186131, Training acc:0.250000000
iter: 0276 cost= 2.876547337, Training acc:0.125000000
iter: 0301 cost= 2.902638435, Training acc:0.041666668
iter: 0326 cost= 2.603354931, Training acc:0.208333343
iter: 0351 cost= 2.915779829, Training acc:0.166666672
iter: 0376 cost= 2.888105869, Training acc:0.208333343
iter: 0401 cost= 2.785046816, Training acc:0.166666672
iter: 0426 cost= 2.905962944, Training acc:0.125000000
iter: 0451

In [None]:
nextBatch, nextBatchLabels = getTrainBatch();
pred = sess.run(prediction, {input_data: nextBatch[0], labels: nextBatchLabels[0]})
p = (tf.argmax(pred,1).eval())
print ([chars[x] for x in p])

ValueError: Cannot feed value of shape (100, 1) for Tensor u'Placeholder_1:0', which has shape '(?, 100, 1)'

# What You Can Do (IN PROGRESS)

Really cool stuff right? Now that you know about the model, and how it works, you can try this generate new text based on your own datasets. In order to try the model with your own datasets, save the txt file in the Data folder, and then 