In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [2]:
# read data
# Decode explicitly as UTF-8 so the text is read the same way on every platform
# instead of depending on the locale's default encoding.
with open("shakespeare.txt", encoding="utf-8") as f:
    data = f.read()

# Work with at most the first 1,000,000 characters of the corpus.
data = data[:1000000]

# Sort the vocabulary: iterating a bare set of str follows hash order, which
# changes between interpreter sessions, so the char <-> index mapping would
# not be reproducible from one kernel restart to the next.
chars = sorted(set(data))
char2idx = {ch: i for i, ch in enumerate(chars)}
idx2char = dict(enumerate(chars))

In [3]:
def get_seqs(data: str, seq_length: int):
    """Yield (input, target) index tensors for consecutive, non-overlapping windows.

    The text is cut into ``(len(data) - 1) // seq_length`` windows of
    ``seq_length`` characters. For each window the target is the same window
    shifted one character to the right (next-character prediction). Characters
    are mapped to indices through the module-level ``char2idx`` dictionary.

    Args:
        data: the text to slice.
        seq_length: number of characters per window.

    Yields:
        Tuple ``(X, y_tg)`` where ``X`` is a ``torch.long`` tensor of shape
        ``(1, seq_length)`` and ``y_tg`` is a ``torch.long`` tensor of shape
        ``(seq_length,)``.

    Raises:
        KeyError: if a character of ``data`` is missing from ``char2idx``.
        ZeroDivisionError: if ``seq_length`` is 0.
    """
    num_seq = (len(data) - 1) // seq_length

    for i in range(num_seq):
        begin = i * seq_length
        end = begin + seq_length

        in_ids = [char2idx[ch] for ch in data[begin:end]]
        tg_ids = [char2idx[ch] for ch in data[begin + 1:end + 1]]

        # Build both tensors directly as int64. Going through np.array would
        # inherit NumPy's platform-dependent default integer width, and the
        # loss function requires long class indices.
        X = torch.tensor(in_ids, dtype=torch.long).unsqueeze(0)
        y_tg = torch.tensor(tg_ids, dtype=torch.long)
        yield (X, y_tg)

In [4]:
class CharRNN(nn.Module):
    """Character-level model: embedding -> stacked recurrent layers -> linear decoder.

    Args:
        input_size: vocabulary size; used as the number of embeddings and as
            the number of output logits.
        hidden_size: embedding dimension and width of every recurrent layer.
        num_layers: number of stacked recurrent layers.
        bias: forwarded to the recurrent module.
        batch_first: forwarded to the recurrent module.
        rnn_type: ``"LSTM"`` or ``"GRU"``; any other value selects ``nn.RNN``.
    """

    def __init__(self, input_size, hidden_size, num_layers=2, bias=True, batch_first=True, rnn_type="LSTM"):
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        # The attribute used to be misspelled; the old name is kept as an alias
        # so any code that read it keeps working.
        self.num_layesr = num_layers
        self.bias = bias
        self.batch_first = batch_first
        self.rnn_type = rnn_type

        self.embedding = nn.Embedding(num_embeddings=input_size, embedding_dim=hidden_size)

        # All three recurrent modules take the same constructor arguments, so
        # pick the class once. Unknown names fall back to the plain RNN.
        rnn_cls = {"LSTM": nn.LSTM, "GRU": nn.GRU}.get(rnn_type, nn.RNN)
        self.rnn = rnn_cls(input_size=hidden_size, hidden_size=hidden_size,
                           num_layers=num_layers, bias=bias, batch_first=batch_first)

        self.fc = nn.Linear(hidden_size, input_size)

    def forward(self, input, hidden=None):
        """Run the model on a tensor of character indices.

        Args:
            input: integer tensor of character indices.
            hidden: recurrent state to start from (a tensor for GRU/RNN, a
                ``(h, c)`` tuple for LSTM). ``None`` lets the recurrent module
                start from its default zero state.

        Returns:
            Tuple ``(out, hidden)``: ``out`` holds the logits flattened to
            ``(-1, input_size)``, ``hidden`` is the updated recurrent state.
        """
        out = self.embedding(input)
        out, hidden = self.rnn(out, hidden)
        # Merge all leading dimensions so each row is one time step's logits.
        out = out.reshape(-1, self.hidden_size)
        out = self.fc(out)
        return out, hidden

In [5]:
# Hyper-parameters shared by the training and sampling cells below.
hidden_size = 128        # embedding dimension and width of each recurrent layer
num_layers = 2           # stacked recurrent layers; also sizes the initial hidden state
len_seq = 100            # characters per training sequence
input_size = len(chars)  # vocabulary size: one index per distinct character

In [6]:
def init_hidden(rnn_type: str, device=None):
    """Create an all-zero initial recurrent state for a batch of one sequence.

    The state has shape ``(num_layers, 1, hidden_size)``, taken from the
    module-level ``num_layers`` and ``hidden_size``.

    Args:
        rnn_type: ``"LSTM"`` returns an ``(h, c)`` tuple of two separate
            tensors; any other value returns a single tensor.
        device: where to allocate the state. Defaults to CUDA when it is
            available and to the CPU otherwise.

    Returns:
        A tensor, or a tuple of two tensors for ``"LSTM"``.
    """
    if device is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    shape = (num_layers, 1, hidden_size)
    # Allocate directly on the target device instead of creating the tensor on
    # the host and copying it over.
    if rnn_type == "LSTM":
        return (torch.zeros(shape, device=device),
                torch.zeros(shape, device=device))
    return torch.zeros(shape, device=device)

In [7]:
def detach_hidden(hidden: torch.tensor, rnn_type: str):
    """Return the recurrent state cut off from the autograd graph.

    For ``rnn_type == "LSTM"`` the state is treated as an ``(h, c)`` pair and a
    tuple of the two detached tensors is returned; for every other type the
    state is a single tensor and its detached version is returned.
    """
    if rnn_type != "LSTM":
        return hidden.detach()

    h_state = hidden[0]
    c_state = hidden[1]
    return (h_state.detach(), c_state.detach())

In [8]:
def train(num_epochs: int = 20, rnn_type: str = "LSTM"):
    """Train a CharRNN on the module-level ``data`` and return it.

    The text is split into sequences of ``len_seq`` characters. Within an
    epoch the recurrent state is carried from one sequence to the next and
    detached after every optimisation step, so gradients do not flow across
    sequence boundaries. The state is reset at the start of each epoch.

    Optimisation uses Adam (lr 1e-4) on a cross-entropy loss, with the gradient
    norm clipped to 1. An exponentially smoothed loss is printed every 1000
    iterations.

    Args:
        num_epochs: number of passes over the sequences.
        rnn_type: recurrent cell type, forwarded to ``CharRNN`` and to the
            hidden-state helpers.

    Returns:
        The trained network, left in training mode.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Pass num_layers explicitly: the initial hidden state is sized from the
    # same module-level value, so the model must not fall back on its own default.
    net = CharRNN(input_size, hidden_size, num_layers=num_layers, rnn_type=rnn_type)
    net = net.to(device).train()
    print(net)

    optimizer = optim.Adam(net.parameters(), lr=1e-4)
    criterion = nn.CrossEntropyLoss()

    running_loss = None
    # Move every sequence to the device once instead of on every step of every epoch.
    seqs = [(X.to(device), y_tg.to(device)) for X, y_tg in get_seqs(data, len_seq)]

    for epoch in range(num_epochs):
        hidden = init_hidden(rnn_type=rnn_type)
        # Make sure the state lives where the model does.
        if isinstance(hidden, tuple):
            hidden = tuple(h.to(device) for h in hidden)
        else:
            hidden = hidden.to(device)

        for idx, (X, y_tg) in enumerate(seqs):
            optimizer.zero_grad()

            # pass through network
            Y, hidden = net(X, hidden)

            # compute loss and update the weights
            loss = criterion(Y, y_tg)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(net.parameters(), 1)
            optimizer.step()
            hidden = detach_hidden(hidden=hidden, rnn_type=rnn_type)

            # .item() synchronises with the device, so read it only once.
            step_loss = loss.item()
            running_loss = step_loss if running_loss is None else running_loss * 0.99 + 0.01 * step_loss

            if idx % 1000 == 0:
                print("Epoch: %d, Iter: %d, Loss: %.3f" % (epoch, idx, running_loss))

    return net

In [9]:
def sample(net, rnn_type: str = "LSTM", len_seq: int = 1000, k: int = 5):
    """Generate text from a trained network with top-k sampling.

    Generation starts from a uniformly random character index. At every step
    the network's softmax output is restricted to its ``k`` most probable
    characters, renormalised, and the next character is drawn from it.

    Args:
        net: the trained model; it is switched to eval mode.
        rnn_type: recurrent cell type, used to build the initial hidden state.
        len_seq: number of characters to generate.
        k: how many of the most probable characters to sample from. Values
            larger than the number of output classes are clamped.

    Returns:
        The generated string, ``len_seq`` characters long.

    Raises:
        ValueError: if ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError("k must be at least 1")

    net.eval()

    # Run on whatever device the model already lives on.
    first_param = next(net.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    hidden = init_hidden(rnn_type=rnn_type)
    if isinstance(hidden, tuple):
        hidden = tuple(h.to(device) for h in hidden)
    else:
        hidden = hidden.to(device)

    ch = np.random.randint(0, len(chars))
    sentence = []

    with torch.no_grad():
        # range(len_seq), not range(1, len_seq): the latter produced one
        # character fewer than requested.
        for _ in range(len_seq):
            sentence.append(ch)
            X = torch.tensor([[ch]], dtype=torch.long, device=device)

            logits, hidden = net(X, hidden)
            probs = F.softmax(logits, 1)

            # topk raises when asked for more entries than there are classes.
            top_k = min(k, probs.shape[1])
            values, indices = probs[0].topk(top_k)
            values = values.cpu().numpy().astype(np.float64)
            indices = indices.cpu().numpy()
            ch = np.random.choice(indices, p=values / values.sum())

    return "".join(idx2char[idx] for idx in sentence)

### LSTM

In [10]:
# Train the LSTM variant for five epochs, then print text sampled from the
# ten most probable characters at each step.
rnn_type = "LSTM"
net = train(rnn_type=rnn_type, num_epochs=5)
text = sample(net, k=10, len_seq=1000, rnn_type=rnn_type)
print(text)

CharRNN(
  (embedding): Embedding(82, 128)
  (rnn): LSTM(128, 128, num_layers=2, batch_first=True)
  (fc): Linear(in_features=128, out_features=82, bias=True)
)
Epoch: 0, Iter: 0, Loss: 4.427
Epoch: 0, Iter: 1000, Loss: 2.646
Epoch: 0, Iter: 2000, Loss: 2.489
Epoch: 0, Iter: 3000, Loss: 2.286
Epoch: 0, Iter: 4000, Loss: 2.112
Epoch: 0, Iter: 5000, Loss: 2.065
Epoch: 0, Iter: 6000, Loss: 2.157
Epoch: 0, Iter: 7000, Loss: 2.047
Epoch: 0, Iter: 8000, Loss: 1.941
Epoch: 0, Iter: 9000, Loss: 1.945
Epoch: 1, Iter: 0, Loss: 1.899
Epoch: 1, Iter: 1000, Loss: 1.803
Epoch: 1, Iter: 2000, Loss: 1.859
Epoch: 1, Iter: 3000, Loss: 1.851
Epoch: 1, Iter: 4000, Loss: 1.748
Epoch: 1, Iter: 5000, Loss: 1.768
Epoch: 1, Iter: 6000, Loss: 1.775
Epoch: 1, Iter: 7000, Loss: 1.793
Epoch: 1, Iter: 8000, Loss: 1.688
Epoch: 1, Iter: 9000, Loss: 1.744
Epoch: 2, Iter: 0, Loss: 1.717
Epoch: 2, Iter: 1000, Loss: 1.654
Epoch: 2, Iter: 2000, Loss: 1.666
Epoch: 2, Iter: 3000, Loss: 1.687
Epoch: 2, Iter: 4000, Loss: 1.60

### GRU

In [11]:
# Train the GRU variant for five epochs, then print text sampled from the
# ten most probable characters at each step.
rnn_type = "GRU"
net = train(rnn_type=rnn_type, num_epochs=5)
text = sample(net, k=10, len_seq=1000, rnn_type=rnn_type)
print(text)

CharRNN(
  (embedding): Embedding(82, 128)
  (rnn): GRU(128, 128, num_layers=2, batch_first=True)
  (fc): Linear(in_features=128, out_features=82, bias=True)
)
Epoch: 0, Iter: 0, Loss: 4.426
Epoch: 0, Iter: 1000, Loss: 2.290
Epoch: 0, Iter: 2000, Loss: 2.256
Epoch: 0, Iter: 3000, Loss: 2.117
Epoch: 0, Iter: 4000, Loss: 1.945
Epoch: 0, Iter: 5000, Loss: 1.916
Epoch: 0, Iter: 6000, Loss: 1.984
Epoch: 0, Iter: 7000, Loss: 1.923
Epoch: 0, Iter: 8000, Loss: 1.815
Epoch: 0, Iter: 9000, Loss: 1.823
Epoch: 1, Iter: 0, Loss: 1.772
Epoch: 1, Iter: 1000, Loss: 1.686
Epoch: 1, Iter: 2000, Loss: 1.746
Epoch: 1, Iter: 3000, Loss: 1.736
Epoch: 1, Iter: 4000, Loss: 1.629
Epoch: 1, Iter: 5000, Loss: 1.673
Epoch: 1, Iter: 6000, Loss: 1.653
Epoch: 1, Iter: 7000, Loss: 1.699
Epoch: 1, Iter: 8000, Loss: 1.590
Epoch: 1, Iter: 9000, Loss: 1.638
Epoch: 2, Iter: 0, Loss: 1.605
Epoch: 2, Iter: 1000, Loss: 1.558
Epoch: 2, Iter: 2000, Loss: 1.578
Epoch: 2, Iter: 3000, Loss: 1.585
Epoch: 2, Iter: 4000, Loss: 1.507

### RNN

In [12]:
# Train the plain RNN variant for five epochs, then print text sampled from
# the ten most probable characters at each step.
rnn_type = "RNN"
net = train(rnn_type=rnn_type, num_epochs=5)
text = sample(net, k=10, len_seq=1000, rnn_type=rnn_type)
print(text)

CharRNN(
  (embedding): Embedding(82, 128)
  (rnn): RNN(128, 128, num_layers=2, batch_first=True)
  (fc): Linear(in_features=128, out_features=82, bias=True)
)
Epoch: 0, Iter: 0, Loss: 4.400
Epoch: 0, Iter: 1000, Loss: 2.257
Epoch: 0, Iter: 2000, Loss: 2.258
Epoch: 0, Iter: 3000, Loss: 2.118
Epoch: 0, Iter: 4000, Loss: 1.963
Epoch: 0, Iter: 5000, Loss: 1.927
Epoch: 0, Iter: 6000, Loss: 1.998
Epoch: 0, Iter: 7000, Loss: 1.951
Epoch: 0, Iter: 8000, Loss: 1.852
Epoch: 0, Iter: 9000, Loss: 1.858
Epoch: 1, Iter: 0, Loss: 1.807
Epoch: 1, Iter: 1000, Loss: 1.734
Epoch: 1, Iter: 2000, Loss: 1.795
Epoch: 1, Iter: 3000, Loss: 1.792
Epoch: 1, Iter: 4000, Loss: 1.695
Epoch: 1, Iter: 5000, Loss: 1.707
Epoch: 1, Iter: 6000, Loss: 1.707
Epoch: 1, Iter: 7000, Loss: 1.767
Epoch: 1, Iter: 8000, Loss: 1.665
Epoch: 1, Iter: 9000, Loss: 1.703
Epoch: 2, Iter: 0, Loss: 1.667
Epoch: 2, Iter: 1000, Loss: 1.633
Epoch: 2, Iter: 2000, Loss: 1.656
Epoch: 2, Iter: 3000, Loss: 1.667
Epoch: 2, Iter: 4000, Loss: 1.589