In [0]:
###############################################################################
# rnn (recurrent neural net) character language model
###############################################################################
import time, os, sys, random, datetime, math
import pickle
import argparse
import numpy as np
#import matplotlib.pyplot as plt

import torch
import torch.nn as nn

###############################################################################
# Probe for a usable GPU once; every helper below consults this flag.
use_cuda = torch.cuda.is_available()
# Show the PyTorch version and whether CUDA was found.
print(torch.version.__version__, use_cuda)
###############################################################################
def cuda(arr):
    """Return `arr` moved to the GPU when CUDA is available, else `arr` itself.

    `arr` only needs a ``.cuda()`` method (e.g. a tensor); that method is not
    called at all when `use_cuda` is false.
    """
    return arr.cuda() if use_cuda else arr

###############################################################################
# Names shared across this notebook.  A `global` statement has no effect at
# module scope, so the shared names are simply listed here:
#   log_file   - referenced only by commented-out logging code in this cell
#   CharWeight - tensor of per-character loss weights, created in the main section
#   CharDict   - character  -> Char record
#   CharList   - index      -> Char record
#   cover_list - characters removed from the text before it is used
CharDict = {}
CharList = list()
cover_list = (
    ['\n', '\t', '　', ' ', '（', '）']
    + list('1234567890')
    + ['“', '”', '…', '‘']
)

###############################################################################
# tunable hyper-parameters - default values
###############################################################################
# these values are used to create the model
###############################################################################
# character embedding
# width of each character embedding vector
char_embed_size = 30

# recurrent stack, passed to the model as [hidden size, number of layers]
rnn_size, rnn_nLayers = 300, 2        # alternative hidden size: 400
RNN_layers = [rnn_size, rnn_nLayers]

# widths of the feed-forward layers that follow the recurrent stack
layer0, layer1, layer2 = 128, 64, 128
layer3, layer4 = 100, 100             # only referenced by the deeper variants below

FFNN_layers = [layer0, layer1, layer2]
#FFNN_layers = [layer0, layer1, layer2, layer3]
#FFNN_layers = [layer0, layer1, layer2, layer3, layer4]

# dropout probability handed to the model (alternative: 0.0)
dropout = 0.1

###############################################################################
# these values are used within the training code
###############################################################################
# Training settings, read as module-level names by the code further down.
# Where the cell used to assign several values in a row, only the last
# (effective) one is kept; the others are listed as alternatives.

Max_Vocab = 3600          # default vocabulary cap for TextConverter

learning_rate = 0.0003    # alternatives: 0.0001, 0.001, 0.003

batch_size = 20           # chunks per optimizer step; alternatives: 5, 100

chunk_size = 100          # characters per training chunk; alternatives: 50, 200

nEpochs = 200             # alternatives: 1, 2, 10, 1000

L2_lambda = 0.0           # optimizer weight decay; alternative: 0.002

###############################################################################
class RNN(nn.Module):
    """Character language model: embedding -> GRU -> feed-forward stack -> scores.

    `specs` is a 5-item sequence
    ``[nChars, embed_size, [rnn_size, rnn_nLayers], ffnn_layers, dropout]``:
    vocabulary size, embedding width, recurrent hidden size and depth, the
    list of feed-forward layer widths, and the dropout probability (used both
    between GRU layers and after every feed-forward layer).
    """

    def __init__(self, specs):
        super().__init__()

        vocab_size, emb_dim, (hidden_dim, n_rnn_layers), ff_widths, p_drop = specs

        self.CharEmbed = nn.Embedding(vocab_size, emb_dim)

        # batch_first=True: inputs and outputs are laid out (batch, position, feature)
        self.rnn = nn.GRU(emb_dim, hidden_dim, n_rnn_layers,
                          dropout=p_drop, batch_first=True)
        # drop-in alternatives with the same arguments: nn.RNN, nn.LSTM

        # each feed-forward layer consumes the output width of the one before it
        in_widths = [hidden_dim] + list(ff_widths[:-1])
        self.layers = nn.ModuleList(
            nn.Linear(n_in, n_out) for n_in, n_out in zip(in_widths, ff_widths)
        )

        # final projection to one score per vocabulary entry (for CrossEntropy)
        last_width = ff_widths[-1] if len(ff_widths) > 0 else hidden_dim
        self.out = nn.Linear(last_width, vocab_size)

        self.non_linear = nn.LeakyReLU(negative_slope=0.01)
        self.dropout = nn.Dropout(p_drop)

        # re-initialise every parameter that has more than one dimension
        for weight in (p for p in self.parameters() if p.dim() > 1):
            nn.init.kaiming_normal_(weight)

    #################################################################
    def forward(self, seqs, hidden=None):
        """Score the next character at every position of every sequence.

        Args:
            seqs: list of 1-D long tensors, all the same length (already on
                the GPU when one is used).
            hidden: optional recurrent state from an earlier call.

        Returns:
            (out, hidden): `out` holds the scores with shape
            (len(seqs), len(seqs[0]), nChars); `hidden` is the state returned
            by the recurrent layer.
        """
        batch = torch.cat(seqs).view(len(seqs), len(seqs[0]))

        features, hidden = self.rnn(self.CharEmbed(batch), hidden)

        for fc in self.layers:
            features = self.dropout(self.non_linear(fc(features)))

        return self.out(features), hidden

###############################################################################
def RNN_train(model, optimizer, criterion, chunks, update=True):
    """Run one batch through `model` and optionally take one optimizer step.

    Position ``t`` of the model output is scored against character ``t + 1``
    of the same chunk.  The first `skip` characters of every chunk only serve
    as context and are never used as targets.

    Args:
        model: called as ``model(chunks)``; must return ``(out, hidden)`` with
            ``out[i]`` holding one row of scores per position of ``chunks[i]``.
        optimizer: stepped once when `update` is true.
        criterion: loss callable, invoked as ``criterion(scores, targets)``
            once per chunk; the per-chunk losses are added up.
        chunks: list of equal-length 1-D long tensors.
        update: True  -> back-propagate and step the optimizer;
                False -> evaluate only (no autograd graph is built).

    Returns:
        (loss, nFrames): the accumulated loss as a Python float and the number
        of target positions that contributed to it.  ``(0.0, 0)`` when there
        was nothing to score.
    """
    model.zero_grad()

    if len(chunks) == 0:
        # nothing to run; the original code crashed on `loss.data` here
        return 0.0, 0

    skip = 5
    loss = None
    nFrames = 0

    # Gradients are only needed when we are going to call backward().
    with torch.set_grad_enabled(bool(update)):
        out, hidden = model(chunks)

        for scores, chunk in zip(out, chunks):
            targets = chunk[skip:]
            if len(targets) == 0:
                # chunk no longer than `skip`: no targets to score
                continue
            chunk_loss = criterion(scores[skip - 1:-1, :], targets)
            loss = chunk_loss if loss is None else loss + chunk_loss
            nFrames += len(targets)

    if loss is None:
        return 0.0, 0

    if update:
        loss.backward()
        optimizer.step()

    return loss.item(), nFrames

###############################################################################
def train_rnn_model(model, data_train, data_dev=None):
    """Train `model` on randomly positioned chunks of `data_train`.

    Settings come from the module-level names `learning_rate`, `L2_lambda`,
    `batch_size`, `chunk_size`, `nEpochs`, `CharWeight` and `use_cuda`.

    Every epoch draws ``1 + len(data_train) // (batch_size * chunk_size)``
    batches, prints ``frames, summed loss, loss per frame`` and then prints a
    100-character sample generated from the seed "这".  When `data_dev` is
    given it is cut into consecutive non-overlapping chunks and scored without
    updating the model; a line starting with ``dev`` reports the result.

    Args:
        model: the network to train (moved to the GPU when `use_cuda` is set).
        data_train: 1-D sequence of character indices, longer than `chunk_size`.
        data_dev: optional 1-D sequence of character indices for evaluation.

    Returns:
        The trained model.

    Raises:
        ValueError: if `data_train` is not longer than `chunk_size`.
    """
    if use_cuda:
        model = model.cuda()

    # summed (not averaged) loss, with one weight per character class
    criterion = nn.CrossEntropyLoss(reduction='sum', weight=CharWeight)

    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=L2_lambda)

    # number of valid chunk start positions
    data_len = len(data_train) - chunk_size
    if data_len <= 0:
        raise ValueError(
            "data_train has %d items but must be longer than chunk_size=%d"
            % (len(data_train), chunk_size)
        )
    nLoops = 1 + len(data_train) // (batch_size * chunk_size)

    for e in range(nEpochs):
        train_frames = 0
        train_loss = 0
        model.train()
        for _ in range(nLoops):
            chunks = []
            for _ in range(batch_size):
                start = np.random.randint(data_len)
                chunks.append(data_train[start:start + chunk_size])

            loss, nFrames = RNN_train(model, optimizer, criterion, chunks, update=True)

            train_frames += nFrames
            train_loss += loss

        # max(..., 1) only matters when no frame was scored at all
        print(train_frames, train_loss, train_loss / max(train_frames, 1))
        sample = generate(model, seed="这", n=100)
        print('epoch'+ str(e) + '#  ' +sample)

        if data_dev is not None:
            model.eval()
            dev_chunks = [data_dev[start:start + chunk_size]
                          for start in range(0, len(data_dev) - chunk_size, chunk_size)]

            dev_frames = 0
            dev_loss = 0
            # Score the dev set in mini-batches and without autograd, instead
            # of pushing the whole set through the model as one giant batch.
            with torch.no_grad():
                for first in range(0, len(dev_chunks), batch_size):
                    loss, nFrames = RNN_train(model, optimizer, criterion,
                                              dev_chunks[first:first + batch_size],
                                              update=False)
                    dev_frames += nFrames
                    dev_loss += loss

            # previously the dev loss was computed and then thrown away
            print('dev', dev_frames, dev_loss, dev_loss / max(dev_frames, 1))

    # torch.save(model, 'model/charlm-temp.pth')

    return model

###############################################################################
def log_message(outf, message):
    """Print `message`; if `outf` is not None, also write it there as one line.

    `outf` needs ``write`` and ``flush`` methods.  The message is followed by
    a newline and the stream is flushed immediately.
    """
    print(message)
    if outf is None:
        return
    outf.write(message)
    outf.write("\n")
    outf.flush()

def generate(model, seed="The ", n=100):
    """Extend `seed` by `n` characters, always taking the highest-scoring one.

    The seed is converted with `convert_data(seed, CharDict)` and fed to the
    model in one call.  After that each newly chosen character is fed back on
    its own together with the hidden state returned by the previous call.

    The model is switched to eval mode and left there.  Decoding runs under
    ``torch.no_grad()`` so the hidden state carried from step to step does not
    keep an ever-growing autograd graph alive.

    Args:
        model: called as ``model([indices])`` / ``model([indices], hidden)``
            and returning ``(scores, hidden)``.
        seed: non-empty string to start from.
        n: number of characters to append.

    Returns:
        `seed` followed by the `n` generated characters, as one string.

    Raises:
        ValueError: if `seed` is empty.
    """
    if len(seed) == 0:
        raise ValueError("seed must contain at least one character")

    model.eval()
    text = list(seed)

    with torch.no_grad():
        ndx_data = cuda(convert_data(seed, CharDict))
        c, h = model([ndx_data])
        for step in range(n):
            scores = c[0, -1]            # scores following the last position
            _, best = scores.max(0)
            best = best.item()
            text.append(CharList[best].char)
            c_in = cuda(torch.tensor([best], dtype=torch.long))
            c, h = model([c_in], h)

    return ''.join(text)

# input a list of chars, output a tensor made of index of chars
def convert_data(data, CharDict, quit_early=0):
    """Turn a sequence of characters into a 1-D long tensor of their indices.

    Args:
        data: string or list of characters.
        CharDict: mapping from character to a record with an ``ndx`` attribute.
        quit_early: when positive, only the first `quit_early` characters of
            `data` are converted.

    Returns:
        ``torch.long`` tensor with one index per converted character.

    Raises:
        KeyError: for a character missing from `CharDict` when `CharDict` has
            no ``'<unk>'`` entry to fall back on.
    """
    if quit_early > 0:
        data = data[:quit_early]

    # Characters outside the vocabulary map to the '<unk>' record when there is
    # one.  Without this even the default seed of generate() fails, because
    # the space character is filtered out before the vocabulary is built.
    unknown = CharDict.get('<unk>')

    indices = []
    for c in data:
        entry = CharDict.get(c, unknown)
        if entry is None:
            raise KeyError(c)
        indices.append(entry.ndx)

    # build the tensor in one go instead of writing it element by element
    return torch.tensor(indices, dtype=torch.long)

###############################################################################
class TextConverter:
    """Character vocabulary built from a text file, with index <-> char helpers.

    The file is read as UTF-8, characters listed in the module-level
    `cover_list` are dropped, and the `max_vocab` most frequent remaining
    characters become the vocabulary (index 0 = most frequent; ties keep the
    order of first appearance, so the mapping is the same on every run).
    Index ``len(vocab)`` stands for every other character and is rendered as
    ``'<unk>'``.

    Attributes:
        vocab: list of vocabulary characters, most frequent first.
        weight: list of ``len(vocab) + 1`` floats.  Entry ``i`` is
            ``(avg_count / count_i) ** 0.3`` where ``avg_count`` is the text
            length divided by the number of distinct characters.  The last
            entry belongs to the '<unk>' bucket and is
            ``sqrt(avg_count / unk_count)``, or 0.0 when no character fell
            outside the vocabulary.
        word_to_int_table / int_to_word_table: the two lookup dicts.
    """

    def __init__(self, text_path, max_vocab=Max_Vocab):
        from collections import Counter  # local import keeps the block self-contained

        with open(text_path, 'r', encoding='utf-8') as f:
            raw = f.read()
        text = [c for c in raw if c not in cover_list]

        # (char, count) pairs, most frequent first; stable for equal counts
        vocab_count_list = Counter(text).most_common()

        # computed over ALL distinct characters, before the cut to max_vocab
        n_distinct = len(vocab_count_list)
        avg_count = len(text) / n_distinct if n_distinct else 0.0

        vocab_count_list = vocab_count_list[:max_vocab]
        num_val = sum(count for _, count in vocab_count_list)

        self.weight = [math.pow(avg_count / count, 0.3)
                       for _, count in vocab_count_list]
        # self.weight = [math.sqrt(avg_count / count) for ...]   # earlier variant

        # weight of the '<unk>' bucket; guard against "nothing was cut off",
        # which used to raise ZeroDivisionError
        unk_count = len(text) - num_val
        self.weight.append(math.sqrt(avg_count / unk_count) if unk_count > 0 else 0.0)

        self.vocab = [char for char, _ in vocab_count_list]

        self.word_to_int_table = {c: i for i, c in enumerate(self.vocab)}
        self.int_to_word_table = dict(enumerate(self.vocab))

    def vocab_size(self):
        """Number of indices in use: the vocabulary plus the '<unk>' slot."""
        return len(self.vocab) + 1

    def word_to_int(self, word):
        """Index of `word`, or ``len(vocab)`` (the '<unk>' slot) if unknown."""
        return self.word_to_int_table.get(word, len(self.vocab))

    def int_to_word(self, index):
        """Character for `index`; ``'<unk>'`` for the slot after the vocabulary.

        Raises:
            Exception: if `index` is larger than ``len(vocab)``.
        """
        if index == len(self.vocab):
            return '<unk>'
        elif index < len(self.vocab):
            return self.int_to_word_table[index]
        else:
            raise Exception('Unknown index!')

    def text_to_arr(self, text):
        """Convert an iterable of characters to an int64 numpy array of indices."""
        # explicit dtype: the default integer width is platform dependent and
        # an empty input would otherwise produce a float array
        return np.array([self.word_to_int(word) for word in text], dtype=np.int64)

    def arr_to_text(self, arr):
        """Convert an iterable of indices back to a string."""
        return ''.join(self.int_to_word(index) for index in arr)


class Char:
    """Record for one vocabulary character."""

    def __init__(self, char):
        # char:  the character itself
        # ndx:   its index number, None until the caller assigns one
        # count: counter that starts at 0
        self.char, self.ndx, self.count = char, None, 0

###############################################################################
# start main
###############################################################################
# Corpus file; used both to build the vocabulary and as training text.
corpus_path = './tlbb.txt'

convert = TextConverter(corpus_path)

# Fill the shared lookup tables: CharList[i] is the Char record with ndx == i
# and CharDict[ch] is the record for character ch.  The last index is the
# '<unk>' slot.
for i in range(convert.vocab_size()):
  c = convert.int_to_word(i)
  char = Char(c)
  char.ndx = i
  CharDict[c] = char
  CharList.append(char)

# char = Char('<unk>')
# char.ndx = convert.word_to_int(c)
# CharDict['<unk>'] = char
# CharList.append(char)

# Per-character loss weights.  The '<unk>' slot (last index) gets weight 0.
CharWeight = torch.zeros(len(CharList), dtype=torch.float)
for i in range(len(convert.weight)):
  CharWeight[i] = convert.weight[i]
  # CharWeight[i] = 1
CharWeight[-1] = 0
CharWeight = cuda(CharWeight)

# Show the first few entries and the last two; the bounds keep this from
# failing on a very small vocabulary.
for i in range(min(10, len(CharList))):
  print(CharList[i].char)
  print(CharWeight[i])
if len(CharList) >= 2:
  print(CharList[-2].char)
  print(CharWeight[-2])
print(CharList[-1].char)
print(CharWeight[-1])

# Read the corpus again as training text, dropping the cover_list characters.
with open(corpus_path, 'r', encoding='utf-8') as f:
  data = f.read()
corpus = [c for c in data if c not in cover_list]

# corpus = corpus[:500000]

train_data = convert.text_to_arr(corpus)
ndx_train_data = torch.from_numpy(train_data)
ndx_train_data = cuda(ndx_train_data)
nChars = convert.vocab_size()
specs = [nChars, char_embed_size, RNN_layers, FFNN_layers, dropout]
model = RNN(specs)
print(model)

train_rnn_model(model, ndx_train_data, data_dev=None)



1.4.0 True
，
tensor(0.1783, device='cuda:0')
。
tensor(0.2523, device='cuda:0')
不
tensor(0.2767, device='cuda:0')
一
tensor(0.2824, device='cuda:0')
的
tensor(0.2837, device='cuda:0')
是
tensor(0.2934, device='cuda:0')
道
tensor(0.2992, device='cuda:0')
了
tensor(0.3000, device='cuda:0')
：
tensor(0.3013, device='cuda:0')
人
tensor(0.3183, device='cuda:0')
悭
tensor(4.3816, device='cuda:0')
<unk>
tensor(0., device='cuda:0')
RNN(
  (CharEmbed): Embedding(3601, 30)
  (rnn): GRU(30, 300, num_layers=2, batch_first=True, dropout=0.1)
  (layers): ModuleList(
    (0): Linear(in_features=300, out_features=128, bias=True)
    (1): Linear(in_features=128, out_features=64, bias=True)
    (2): Linear(in_features=64, out_features=128, bias=True)
  )
  (out): Linear(in_features=128, out_features=3601, bias=True)
  (non_linear): LeakyReLU(negative_slope=0.01)
  (dropout): Dropout(p=0.1, inplace=False)
)
1105800 5095559.920410156 4.608030313266554
epoch0#  这，这一一，你一，你一，你一，你你这，你你你你你你你你你你你你你你你你你你我我我你不你不不，你我你你你你你我

RNN(
  (CharEmbed): Embedding(3601, 30)
  (rnn): GRU(30, 300, num_layers=2, batch_first=True, dropout=0.1)
  (layers): ModuleList(
    (0): Linear(in_features=300, out_features=128, bias=True)
    (1): Linear(in_features=128, out_features=64, bias=True)
    (2): Linear(in_features=64, out_features=128, bias=True)
  )
  (out): Linear(in_features=128, out_features=3601, bias=True)
  (non_linear): LeakyReLU(negative_slope=0.01)
  (dropout): Dropout(p=0.1, inplace=False)
)

In [0]:

# Extend the seed "第一" by 100 characters with the `model` built in the
# previous cell; generate() always takes the highest-scoring next character.
sample = generate(model, seed="第一", n=100)
print(sample)

第一个铭下一团漆黑，说道：你们不肯娶了驸马，我们也不用好了。段誉道：你们不肯让我杀了。段誉道：你们不肯让我瞧瞧，我们我们不肯杀我？你们们我们是我们我们这个珍珑棋局，却也不妨。段誉道：你们不肯让我杀了。段誉


## Character Language Model
1. Upload the 'tlbb.txt' corpus into the Colab notebook and start running. The corpus has 1 million Chinese characters, approximately 4000 of them distinct.
2. Map every character whose frequency rank is greater than Max_Vocab to the '\<unk\>' character.
3. After adding per-character weights to nn.CrossEntropyLoss, the model can generate Chinese sentences character by character.
4. GRU and LSTM both perform better than a vanilla RNN.