# Language Model

- Data: Jay Chou lyrics
- Data processing:  
    - Use only part of the dataset (the first 10,000 characters by default)
    - Build the character-to-index dictionary
    - Convert the text to a list of indices

In [4]:
import torch
import random
import zipfile

import renyan_utils as ry

## Data

### Read

In [None]:
# saved to renyan_utils.py
def load_data_jay_lyrics(clip_num=10000,
                         zip_path='data/book_data/jaychou_lyrics.txt.zip'):
    """Load the Jay Chou lyrics corpus and encode it as character indices.

    The archive member ``jaychou_lyrics.txt`` is read as UTF-8, line breaks
    are replaced by spaces, and the text is truncated to ``clip_num``
    characters before the vocabulary is built.

    Args:
        clip_num: Number of leading characters to keep. ``None`` keeps the
            whole corpus (plain slice semantics).
        zip_path: Path of the zip archive containing ``jaychou_lyrics.txt``.

    Returns:
        A tuple ``(corpus_indices, char_to_idx, idx_to_char, vocab_size)``:
        the clipped text as a list of integer indices, the character-to-index
        dict, the index-to-character list, and the number of distinct
        characters.
    """
    with zipfile.ZipFile(zip_path) as zin:
        with zin.open("jaychou_lyrics.txt") as f:
            corpus_chars = f.read().decode('utf-8')
    corpus_chars = corpus_chars.replace('\n', ' ').replace('\r', ' ')
    corpus_chars = corpus_chars[:clip_num]
    # Sort the vocabulary: iterating a bare set of strings yields an order
    # that changes between interpreter sessions, which would make indices
    # produced in one run meaningless in the next.
    idx_to_char = sorted(set(corpus_chars))
    char_to_idx = {char: i for i, char in enumerate(idx_to_char)}
    vocab_size = len(char_to_idx)
    corpus_indices = [char_to_idx[char] for char in corpus_chars]
    return corpus_indices, char_to_idx, idx_to_char, vocab_size

In [5]:
# Load the encoded corpus and its vocabulary from the helper module.
# NOTE(review): the original remark here was "different every time" --
# presumably the character-to-index assignment varies between runs when the
# loader builds its vocabulary from an unordered set; verify against
# renyan_utils.
corpus_indices, char_to_idx, idx_to_char, vocab_size = ry.load_data_jay_lyrics()

### Sampling-Random

In [38]:
def data_iter_random(corpus_indices, batch_size, num_steps, device=None):
    """Yield minibatches of randomly ordered, non-overlapping windows.

    The sequence is cut into consecutive windows of ``num_steps`` elements;
    the windows are shuffled and grouped ``batch_size`` at a time. Windows
    that do not fill a complete batch are dropped.

    Args:
        corpus_indices: Sequence of numeric indices (sliced with
            ``[pos:pos + num_steps]``).
        batch_size: Number of windows per minibatch.
        num_steps: Length of each window.
        device: Target ``torch.device``. When ``None``, CUDA is used if it is
            available, otherwise the CPU.

    Yields:
        ``(X, Y)`` pairs of ``float32`` tensors of shape
        ``(batch_size, num_steps)``, where each row of ``Y`` is the
        corresponding row of ``X`` shifted one position to the right in the
        source sequence.
    """
    if device is None:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # The "- 1" reserves one trailing element so that the label window, which
    # starts one position later, never runs past the end of the sequence.
    num_examples = (len(corpus_indices) - 1) // num_steps
    epoch_size = num_examples // batch_size
    example_indices = list(range(num_examples))
    random.shuffle(example_indices)

    def _window(pos):
        """Return the ``num_steps`` elements starting at ``pos``."""
        return corpus_indices[pos:pos + num_steps]

    for batch_no in range(epoch_size):
        start = batch_no * batch_size
        batch_indices = example_indices[start:start + batch_size]
        X = [_window(j * num_steps) for j in batch_indices]
        Y = [_window(j * num_steps + 1) for j in batch_indices]
        yield (
            torch.tensor(X, dtype=torch.float32, device=device),
            torch.tensor(Y, dtype=torch.float32, device=device),
        )

In [39]:
# Toy sequence 0..29: with consecutive integers it is easy to see where each
# window was cut and that every Y row is its X row shifted by one position.
my_seq = list(range(30))
# Batch order depends on the state of the `random` module, so the printed
# batches can differ from run to run.
for X, Y in data_iter_random(my_seq, batch_size = 2, num_steps = 6):
    print('X: ', X, '\nY: ', Y, '\n')

X:  tensor([[12., 13., 14., 15., 16., 17.],
        [18., 19., 20., 21., 22., 23.]]) 
Y:  tensor([[13., 14., 15., 16., 17., 18.],
        [19., 20., 21., 22., 23., 24.]]) 

X:  tensor([[ 0.,  1.,  2.,  3.,  4.,  5.],
        [ 6.,  7.,  8.,  9., 10., 11.]]) 
Y:  tensor([[ 1.,  2.,  3.,  4.,  5.,  6.],
        [ 7.,  8.,  9., 10., 11., 12.]]) 



### Sampling-Consecutive

In [42]:
def data_iter_consecutive(corpus_indices, batch_size, num_steps, device = None):
    """Yield minibatches whose rows continue from one batch to the next.

    The sequence is converted to a ``float32`` tensor, trimmed to a multiple
    of ``batch_size`` and laid out as ``batch_size`` contiguous rows. Windows
    of ``num_steps`` columns are then read left to right, so row ``r`` of one
    batch is followed in the source sequence by row ``r`` of the next batch.

    Args:
        corpus_indices: Sequence of numeric indices.
        batch_size: Number of rows the sequence is split into.
        num_steps: Number of columns per yielded window.
        device: Target ``torch.device``; when ``None``, CUDA is chosen if
            available, otherwise the CPU.

    Yields:
        ``(X, Y)`` tensor pairs, each of shape ``(batch_size, num_steps)``,
        with ``Y`` being ``X`` shifted one column to the right.
    """
    target = device
    if target is None:
        target = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    flat = torch.tensor(corpus_indices, dtype = torch.float32, device = target)
    row_len = len(flat) // batch_size
    # Drop the tail that does not fill a whole row, then fold into rows.
    grid = flat[:batch_size * row_len].view(batch_size, row_len)

    # One column per row is held back so the shifted label window fits.
    n_batches = (row_len - 1) // num_steps
    for k in range(n_batches):
        lo = k * num_steps
        hi = lo + num_steps
        yield grid[:, lo:hi], grid[:, lo + 1:hi + 1]

In [43]:
# Same toy sequence as in the random-sampling demo (my_seq = 0..29, defined in
# an earlier cell); here the rows of successive batches continue one another.
for X, Y in data_iter_consecutive(my_seq, batch_size = 2, num_steps = 6):
    print('X: ', X, '\nY: ', Y, '\n')

X:  tensor([[ 0.,  1.,  2.,  3.,  4.,  5.],
        [15., 16., 17., 18., 19., 20.]]) 
Y:  tensor([[ 1.,  2.,  3.,  4.,  5.,  6.],
        [16., 17., 18., 19., 20., 21.]]) 

X:  tensor([[ 6.,  7.,  8.,  9., 10., 11.],
        [21., 22., 23., 24., 25., 26.]]) 
Y:  tensor([[ 7.,  8.,  9., 10., 11., 12.],
        [22., 23., 24., 25., 26., 27.]]) 

