# Language Model

- Data: Jaychou Lyrics
- Data Process:  
    - Use only the first part of the dataset (the first 10,000 characters by default)
    - Build index dictionary
    - Transform data to indices
- Model:
    - Recurrent Network built from scratch
    - Use built-in tools

In [1]:
import torch
import random
import zipfile

import renyan_utils as ry

## Data

### Read

In [2]:
# saved to renyan_utils.py
def load_data_jay_lyrics(clip_num=10000,
                         zip_path='data/book_data/jaychou_lyrics.txt.zip'):
    """Load the Jay Chou lyrics corpus and encode it as character indices.

    Args:
        clip_num: Number of leading characters of the corpus to keep.
        zip_path: Path to the zip archive that contains
            ``jaychou_lyrics.txt``. Defaults to the location used by this
            notebook.

    Returns:
        A 4-tuple ``(corpus_indices, char_to_idx, idx_to_char, vocab_size)``:
        the clipped corpus as a list of integer indices, the
        character-to-index dict, the index-to-character list, and the number
        of distinct characters.
    """
    with zipfile.ZipFile(zip_path) as zin:
        with zin.open("jaychou_lyrics.txt") as f:
            corpus_chars = f.read().decode('utf-8')
    # Line breaks become spaces so the corpus is one continuous sequence.
    corpus_chars = corpus_chars.replace('\n', ' ').replace('\r', ' ')
    corpus_chars = corpus_chars[:clip_num]
    # Sort the vocabulary: iterating a bare set of strings yields a different
    # order in different interpreter runs, which would make the mapping (and
    # anything trained on it) irreproducible.
    idx_to_char = sorted(set(corpus_chars))
    char_to_idx = {char: i for i, char in enumerate(idx_to_char)}
    vocab_size = len(char_to_idx)
    corpus_indices = [char_to_idx[char] for char in corpus_chars]
    return corpus_indices, char_to_idx, idx_to_char, vocab_size

In [2]:
# The character-to-index mapping may differ from run to run (presumably
# because the vocabulary order comes from an unordered set — verify in
# renyan_utils).
corpus_indices, char_to_idx, idx_to_char, vocab_size = ry.load_data_jay_lyrics()

### Sampling-Random

In [4]:
# saved to renyan_utils.py
def data_iter_random(corpus_indices, batch_size, num_steps, device=None):
    """Yield randomly sampled (X, Y) minibatches from an index sequence.

    The sequence is cut into non-overlapping windows of ``num_steps``
    elements, the windows are shuffled, and each minibatch stacks
    ``batch_size`` of them. ``Y`` holds the same windows shifted one position
    ahead, i.e. the next-element targets for ``X``.

    Args:
        corpus_indices: Sequence of indices supporting ``len`` and slicing.
        batch_size: Number of windows per minibatch.
        num_steps: Length of each window.
        device: Target ``torch.device``. When ``None``, CUDA is used if
            available, otherwise the CPU.

    Yields:
        Pairs ``(X, Y)`` of float32 tensors of shape
        ``(batch_size, num_steps)`` on ``device``.
    """
    if device is None:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # The "- 1" leaves room for the label window, which starts one position
    # after its input window.
    num_examples = (len(corpus_indices) - 1) // num_steps
    epoch_size = num_examples // batch_size
    example_indices = list(range(num_examples))
    random.shuffle(example_indices)

    def _data(pos):
        # Window of num_steps elements starting at pos.
        return corpus_indices[pos:pos + num_steps]

    for start in range(0, epoch_size * batch_size, batch_size):
        batch_indices = example_indices[start:start + batch_size]
        X = [_data(j * num_steps) for j in batch_indices]
        Y = [_data(j * num_steps + 1) for j in batch_indices]
        yield (torch.tensor(X, dtype=torch.float32, device=device),
               torch.tensor(Y, dtype=torch.float32, device=device))

In [3]:
# Toy sequence 0..29 so the X/Y relationship is easy to read off the printout.
my_seq = list(range(30))
# Print every minibatch; each Y is expected to be its X shifted ahead by one.
for X, Y in ry.data_iter_random(my_seq, batch_size = 2, num_steps = 6):
    print('X: ', X, '\nY: ', Y, '\n')

X:  tensor([[ 6.,  7.,  8.,  9., 10., 11.],
        [ 0.,  1.,  2.,  3.,  4.,  5.]]) 
Y:  tensor([[ 7.,  8.,  9., 10., 11., 12.],
        [ 1.,  2.,  3.,  4.,  5.,  6.]]) 

X:  tensor([[18., 19., 20., 21., 22., 23.],
        [12., 13., 14., 15., 16., 17.]]) 
Y:  tensor([[19., 20., 21., 22., 23., 24.],
        [13., 14., 15., 16., 17., 18.]]) 



### Sampling-Consecutive

In [6]:
# saved to renyan_utils.py
def data_iter_consecutive(corpus_indices, batch_size, num_steps, device=None):
    """Yield (X, Y) minibatches whose rows continue from batch to batch.

    The sequence is converted to a float32 tensor, truncated to a multiple of
    ``batch_size`` and laid out as ``batch_size`` rows. Successive minibatches
    take adjacent column windows of ``num_steps`` elements, so row ``r`` of
    one minibatch is followed in the corpus by row ``r`` of the next. ``Y`` is
    the same window shifted one column ahead.

    Args:
        corpus_indices: Sequence of indices accepted by ``torch.tensor``.
        batch_size: Number of rows per minibatch.
        num_steps: Number of columns per minibatch.
        device: Target ``torch.device``. When ``None``, CUDA is used if
            available, otherwise the CPU.

    Yields:
        Pairs ``(X, Y)`` of float32 tensors (views into one shared tensor) of
        shape ``(batch_size, num_steps)``.
    """
    target = device
    if target is None:
        target = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    flat = torch.tensor(corpus_indices, dtype=torch.float32, device=target)
    row_len = len(flat) // batch_size
    # Drop the tail that does not fill a complete row, then fold into rows.
    grid = flat[:batch_size * row_len].view(batch_size, row_len)

    # One column is reserved so the shifted label window always exists.
    n_batches = (row_len - 1) // num_steps
    for begin in range(0, n_batches * num_steps, num_steps):
        end = begin + num_steps
        yield grid[:, begin:end], grid[:, begin + 1:end + 1]

In [4]:
# Same toy sequence (`my_seq`, defined in the random-sampling demo cell), now
# sampled consecutively.
for X, Y in ry.data_iter_consecutive(my_seq, batch_size = 2, num_steps = 6):
    print('X: ', X, '\nY: ', Y, '\n')

X:  tensor([[ 0.,  1.,  2.,  3.,  4.,  5.],
        [15., 16., 17., 18., 19., 20.]]) 
Y:  tensor([[ 1.,  2.,  3.,  4.,  5.,  6.],
        [16., 17., 18., 19., 20., 21.]]) 

X:  tensor([[ 6.,  7.,  8.,  9., 10., 11.],
        [21., 22., 23., 24., 25., 26.]]) 
Y:  tensor([[ 7.,  8.,  9., 10., 11., 12.],
        [22., 23., 24., 25., 26., 27.]]) 



---

# Recurrent Network

In [9]:
import time
import math
import numpy as np
import torch
from torch import nn, optim
import torch.nn.functional as F
import renyan_utils as ry

In [19]:
# Hyperparameters: input and output widths both equal the vocabulary size;
# the hidden layer has 256 units.
# NOTE(review): `vocab_size` must already be defined (by an earlier
# data-loading cell) when this cell runs.
num_inputs, num_hiddens, num_outputs = vocab_size, 256, vocab_size
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [20]:
# Load the corpus and vocabulary through the renyan_utils helper.
# NOTE(review): this rebinds `vocab_size` after num_inputs/num_outputs were
# derived from it in the previous cell; presumably the value is unchanged —
# confirm.
corpus_indices, char_to_idx, idx_to_char, vocab_size = ry.load_data_jay_lyrics()

## Build from scratch

In [21]:
def one_hot(x, n_class, dtype=torch.float32):
    """One-hot encode a 1-D tensor of class indices.

    Args:
        x: Tensor of class indices; it is cast to int64 before use.
        n_class: Number of classes, i.e. the width of the result.
        dtype: dtype of the returned tensor.

    Returns:
        Tensor of shape ``(x.shape[0], n_class)`` on the same device as ``x``,
        with a 1 in column ``x[i]`` of row ``i`` and 0 elsewhere.
    """
    class_ids = x.long()
    encoded = torch.zeros(
        class_ids.shape[0], n_class, dtype=dtype, device=class_ids.device
    )
    # scatter_ along dim 1 writes the value 1 at column class_ids[i] of row i
    # and returns the same (modified) tensor.
    return encoded.scatter_(1, class_ids.view(-1, 1), 1)

In [22]:
def to_onehot(X, n_class):
    """One-hot encode a batch of index sequences, one matrix per column.

    Args:
        X: Tensor of indices; each slice along dimension 1 is encoded
            separately.
        n_class: Number of classes passed on to ``one_hot``.

    Returns:
        A list with ``X.shape[1]`` entries; entry ``i`` is
        ``one_hot(X[:, i], n_class)``.
    """
    # unbind(dim=1) yields the slices X[:, 0], X[:, 1], ... in order.
    return [one_hot(column, n_class) for column in X.unbind(dim=1)]

In [23]:
# Dummy batch: the indices 0..9 arranged as 2 rows x 5 columns.
X = torch.arange(10).view(2, 5)
# One one-hot matrix per column of X.
inputs = to_onehot(X, vocab_size)
# Number of matrices and the shape of the first one.
print(len(inputs), inputs[0].shape)

5 torch.Size([2, 1027])


In [24]:
def get_params():
    """Create the trainable parameters of the from-scratch RNN.

    Reads the module-level ``num_inputs``, ``num_hiddens``, ``num_outputs``
    and ``device``. Weight matrices are drawn from a normal distribution with
    mean 0 and standard deviation 0.01; biases start at zero.

    Returns:
        ``nn.ParameterList`` holding, in order, ``W_xh``, ``W_hh``, ``b_h``,
        ``W_hq`` and ``b_q``.
    """
    def _gaussian(shape):
        # Sample with NumPy, then move to the target device as float32.
        values = np.random.normal(0, 0.01, size=shape)
        weights = torch.tensor(values, device=device, dtype=torch.float32)
        return torch.nn.Parameter(weights, requires_grad=True)

    def _zero_bias(size):
        return torch.nn.Parameter(
            torch.zeros(size, device=device, requires_grad=True)
        )

    # Hidden-layer parameters (sampling order matters for reproducibility
    # under a fixed NumPy seed: W_xh, W_hh, then W_hq).
    W_xh = _gaussian((num_inputs, num_hiddens))
    W_hh = _gaussian((num_hiddens, num_hiddens))
    b_h = _zero_bias(num_hiddens)
    # Output-layer parameters.
    W_hq = _gaussian((num_hiddens, num_outputs))
    b_q = _zero_bias(num_outputs)
    return nn.ParameterList((W_xh, W_hh, b_h, W_hq, b_q))

In [33]:
# model
def init_rnn_state(batch_size, num_hiddens, device):
    """Return the initial RNN state: a 1-tuple holding an all-zero tensor.

    Args:
        batch_size: Number of rows of the hidden-state tensor.
        num_hiddens: Number of columns of the hidden-state tensor.
        device: ``torch.device`` on which the tensor is allocated.

    Returns:
        ``(H,)`` where ``H`` is a zero tensor of shape
        ``(batch_size, num_hiddens)``.
    """
    hidden = torch.zeros(batch_size, num_hiddens, device=device)
    return (hidden,)

In [34]:
def rnn(inputs, state, params):
    """Run a tanh recurrent layer over a sequence of time-step inputs.

    Args:
        inputs: Iterable of 2-D tensors, one per time step. In this notebook
            each is a one-hot matrix of shape ``(batch_size, vocab_size)``.
        state: 1-tuple ``(H,)`` with the initial hidden state.
        params: The five tensors ``(W_xh, W_hh, b_h, W_hq, b_q)``.

    Returns:
        ``(outputs, (H,))``: ``outputs`` is a list with one tensor
        ``H @ W_hq + b_q`` per time step, and ``H`` is the hidden state after
        the last step (the initial state if ``inputs`` is empty).
    """
    W_xh, W_hh, b_h, W_hq, b_q = params
    (hidden,) = state
    outputs = []
    # Each step_input is a whole minibatch for one time step (one row per
    # sequence in the batch).
    for step_input in inputs:
        pre_activation = step_input @ W_xh + hidden @ W_hh + b_h
        hidden = torch.tanh(pre_activation)
        outputs.append(hidden @ W_hq + b_q)
    return outputs, (hidden,)

In [36]:
# Smoke test: run the from-scratch RNN on the dummy batch X defined earlier.
state = init_rnn_state(X.shape[0], num_hiddens, device)
# Move X to the target device before encoding so the one-hot inputs live there.
inputs = to_onehot(X.to(device), vocab_size)
params = get_params()
outputs, state_new = rnn(inputs, state, params)
# Number of time steps, shape of one output, shape of the final hidden state.
print(len(outputs), outputs[0].shape, state_new[0].shape)

5 torch.Size([2, 1027]) torch.Size([2, 256])
