# Recurrent Neural Networks and Language Models

You guys probably very excited about ChatGPT.  In today class, we will be implementing a very simple language model, which is basically what ChatGPT is, but with a simple LSTM.  You will be surprised that it is not so difficult at all.

Paper that we base on is *Regularizing and Optimizing LSTM Language Models*, https://arxiv.org/abs/1708.02182

In [1]:
# !pip install datasets

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim

import torchtext, datasets, math
from tqdm import tqdm

In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

#make our work comparable if restarted the kernel
SEED = 1234
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

# torch.cuda.get_device_name(0)

cpu


In [3]:
! pip install pythainlp

Collecting pythainlp
  Downloading pythainlp-3.1.1-py3-none-any.whl (9.6 MB)
Installing collected packages: pythainlp
Successfully installed pythainlp-3.1.1


In [3]:
from pythainlp.tokenize import word_tokenize

## 1. Load data - Thaisum
We will be using wikitext which contains a large corpus of text, perfect for language modeling task.  This time, we will use the `datasets` library from HuggingFace to load.

In [6]:
from datasets import load_dataset

ds = datasets.load_dataset("thaisum")
# dataset = datasets.load_dataset('wikitext', 'wikitext-2-raw-v1')
# print(next(iter(ds))["code"])





  0%|          | 0/3 [00:00<?, ?it/s]

In [7]:
type(ds)

datasets.dataset_dict.DatasetDict

## 2. Preprocessing

### Tokenizing

Simply tokenize the given text to tokens.

In [8]:
word_tokenize(text = "ผมไปกินข้าว")

['ผม', 'ไป', 'กินข้าว']

In [9]:
tokenizer = word_tokenize

#function to tokenize
tokenize_data = lambda example, tokenizer: {'tokens': word_tokenize(example['summary'])}  

#map the function to each example
tokenized_dataset = ds.map(tokenize_data, remove_columns=['summary'], fn_kwargs={'tokenizer': tokenizer})
print(tokenized_dataset['train'][50000]['tokens'])



['อินดี้', 'จาก', 'อุดร', ' ', 'โอ๊ค', '-', 'สุ', 'พัฒน', '์', 'กิจ', ' ', 'ถวิล', 'การ', ' ', 'เล่น', 'กีตาร์', 'และ', 'ร้อง', 'นำ', ' ', 'ดูโอ', 'กับ', ' ', 'ทัด', '-', 'ณัตฐ', 'พงษ์', ' ', 'สุทธิ', 'วงศ์', 'กร', ' ', 'ตี', 'กลอง', ' ', 'พวกเขา', 'ทั้งสอง', 'ทำ', 'เพลง', 'มีเสน่ห์', ' ', 'การเขียน', 'เนื้อเพลง', 'แบบ', 'ย้ำคำ', 'ซ้ำๆ', ' ', 'วน', 'ไป', 'ให้', 'ติด', 'หู', 'แต่', 'ไม่', 'เข้าใจ', ' ', 'กับ', 'กีตาร์', 'ที่', 'มี', 'ความ', 'พริ้ม', 'เป็น', ' ', 'Dream', ' ', 'Pop', ' ', 'ล่องลอย', ' ', 'บาง', 'เพลง', 'เป็น', ' ', 'Shoegaze', ' ', 'ได้', 'เฉย', 'เลย', ' ']


In [10]:
print(tokenized_dataset['train'][50]['tokens'])

['ครอบครัว', 'หนุ่ม', 'สุราษฎร์ฯ', ' ', 'เข้าพบ', 'ผู้ใหญ่บ้าน', 'หมู่', ' ', '10', ' ', 'เมือง', 'คอน', ' ', 'แสดงตัว', 'เป็นเจ้าของ', ' ', 'เจ้า', ' ', 'บิ๊ก', 'บู', ' ', 'สุนัข', 'ตกรถ', ' ', 'เพศ', 'ผู้', ' ', 'อารมณ์ดี', ' ', 'เผย', 'วินาที', 'แรก', 'ที่', 'เจอ', 'หน้า', 'กลับ', 'เฉย', ' ', 'แต่', 'พอ', 'ถูก', 'ลูบคลำ', 'และ', 'เอา', 'ผล', 'ปาล์ม', 'ให้', 'กิน', 'กลับ', 'แสดง', 'ความดีใจ']


test thailand language tokenizer

In [11]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['title', 'body', 'type', 'tags', 'url', 'tokens'],
        num_rows: 358868
    })
    validation: Dataset({
        features: ['title', 'body', 'type', 'tags', 'url', 'tokens'],
        num_rows: 11000
    })
    test: Dataset({
        features: ['title', 'body', 'type', 'tags', 'url', 'tokens'],
        num_rows: 11000
    })
})

### Numericalizing

We will tell torchtext to add any word that has occurred at least three times in the dataset to the vocabulary because otherwise it would be too big.  Also we shall make sure to add `unk` and `eos`.

In [12]:
## numericalizing
vocab = torchtext.vocab.build_vocab_from_iterator(tokenized_dataset['train']['tokens'], 
min_freq=3) 
vocab.insert_token('<unk>', 0)           
vocab.insert_token('<eos>', 1)            
vocab.set_default_index(vocab['<unk>'])   
print(len(vocab))                         
print(vocab.get_itos()[:10])       

44161
['<unk>', '<eos>', ' ', 'ที่', 'ใน', 'และ', 'ไม่', 'ของ', 'มี', 'การ']


## 3. Prepare the batch loader

### Prepare data

Given "Chaky loves eating at AIT", and "I really love deep learning", and given batch size = 3, we will get three batches of data "Chaky loves eating at", "AIT `<eos>` I really", "love deep learning `<eos>`".  

In [13]:
def get_data(dataset, vocab, batch_size):
    data = []                                                   
    for example in dataset:
        if example['tokens']:         
            #appends eos so we know it ends....so model learn how to end...                             
            tokens = example['tokens'].append('<eos>')   
            #numericalize          
            tokens = [vocab[token] for token in example['tokens']] 
            data.extend(tokens)                                    
    data = torch.LongTensor(data)                                 
    num_batches = data.shape[0] // batch_size 
    data = data[:num_batches * batch_size]                       
    data = data.view(batch_size, num_batches)          
    return data


In [14]:
len(tokenized_dataset['train'])

358868

In [15]:
batch_size = 200
train_data = get_data(tokenized_dataset['train'], vocab, batch_size)
valid_data = get_data(tokenized_dataset['validation'], vocab, batch_size)
test_data  = get_data(tokenized_dataset['test'], vocab, batch_size)

In [16]:
type(train_data)

torch.Tensor

## 4. Modeling 

In [4]:
class LSTMLanguageModel(nn.Module):
    def __init__(self, vocab_size, emb_dim, hid_dim, num_layers, dropout_rate):
                
        super().__init__()
        self.num_layers = num_layers
        self.hid_dim = hid_dim
        self.emb_dim = emb_dim

        self.embedding = nn.Embedding(vocab_size, emb_dim)
        self.lstm = nn.LSTM(emb_dim, hid_dim, num_layers=num_layers, 
                    dropout=dropout_rate, batch_first=True)
        self.dropout = nn.Dropout(dropout_rate)
        self.fc = nn.Linear(hid_dim, vocab_size)
        
        self.init_weights()
        
    def init_weights(self):
        init_range_emb = 0.1
        init_range_other = 1/math.sqrt(self.hid_dim)
        self.embedding.weight.data.uniform_(-init_range_emb, init_range_emb)
        self.fc.weight.data.uniform_(-init_range_other, init_range_other)
        self.fc.bias.data.zero_()
        for i in range(self.num_layers):
            self.lstm.all_weights[i][0] = torch.FloatTensor(self.emb_dim,
                    self.hid_dim).uniform_(-init_range_other, init_range_other) 
            self.lstm.all_weights[i][1] = torch.FloatTensor(self.hid_dim, 
                    self.hid_dim).uniform_(-init_range_other, init_range_other) 

    def init_hidden(self, batch_size, device):
        hidden = torch.zeros(self.num_layers, batch_size, self.hid_dim).to(device)
        cell   = torch.zeros(self.num_layers, batch_size, self.hid_dim).to(device)
        return hidden, cell
    
    def detach_hidden(self, hidden):
        hidden, cell = hidden
        hidden = hidden.detach()
        cell = cell.detach()
        return hidden, cell

    def forward(self, src, hidden):
        #src: [batch size, seq len]
        embedding = self.dropout(self.embedding(src))
        #embedding: [batch size, seq len, emb_dim]
        output, hidden = self.lstm(embedding, hidden)      
        #output: [batch size, seq len, hid_dim]
        #hidden = h, c = [num_layers * direction, seq len, hid_dim)
        output = self.dropout(output) 
        prediction = self.fc(output)
        #prediction: [batch size, seq_len, vocab size]
        return prediction, hidden

## 5. Training 

Follows very basic procedure.  One note is that some of the sequences that will be fed to the model may involve parts from different sequences in the original dataset or be a subset of one (depending on the decoding length). For this reason we will reset the hidden state every epoch, this is like assuming that the next batch of sequences is probably always a follow up on the previous in the original dataset.

In [5]:
import pickle
from torchtext.vocab import Vocab

# define file path for loading
file_path = 'vocab.pkl'

# load the Vocab object
with open(file_path, 'rb') as f:
    vocab = pickle.load(f)



In [6]:
vocab_size = len(vocab)
emb_dim = 1024                # 400 in the paper
hid_dim = 1024                # 1150 in the paper
num_layers = 2                # 3 in the paper
dropout_rate = 0.65              
lr = 1e-3                     
model = LSTMLanguageModel(vocab_size, emb_dim, hid_dim, num_layers, dropout_rate).to(device)

In [19]:
model = LSTMLanguageModel(vocab_size, emb_dim, hid_dim, num_layers, dropout_rate).to(device)
optimizer = optim.Adam(model.parameters(), lr=lr)
criterion = nn.CrossEntropyLoss()
num_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f'The model has {num_params:,} trainable parameters')

The model has 107,279,489 trainable parameters


In [20]:
def get_batch(data, seq_len, idx):
    src    = data[:, idx:idx+seq_len]                   
    target = data[:, idx+1:idx+seq_len+1]  #target simply is ahead of src by 1            
    return src, target

In [21]:
def train(model, data, optimizer, criterion, batch_size, seq_len, clip, device):
    
    epoch_loss = 0
    model.train()
    # drop all batches that are not a multiple of seq_len
    num_batches = data.shape[-1]
    data = data[:, :num_batches - (num_batches -1) % seq_len]
    num_batches = data.shape[-1]

    hidden = model.init_hidden(batch_size, device)
    
    for idx in tqdm(range(0, num_batches - 1, seq_len), desc='Training: ',leave=False):
        optimizer.zero_grad()
        hidden = model.detach_hidden(hidden)

        src, target = get_batch(data, seq_len, idx) #src, target: [batch size, seq len]
        src, target = src.to(device), target.to(device)
        batch_size = src.shape[0]
        prediction, hidden = model(src, hidden)               

        prediction = prediction.reshape(batch_size * seq_len, -1)  #prediction: [batch size * seq len, vocab size]  
        target = target.reshape(-1)
        loss = criterion(prediction, target)
        
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        epoch_loss += loss.item() * seq_len
    return epoch_loss / num_batches

In [5]:
def evaluate(model, data, criterion, batch_size, seq_len, device):

    epoch_loss = 0
    model.eval()
    num_batches = data.shape[-1]
    data = data[:, :num_batches - (num_batches -1) % seq_len]
    num_batches = data.shape[-1]

    hidden = model.init_hidden(batch_size, device)

    with torch.no_grad():
        for idx in range(0, num_batches - 1, seq_len):
            hidden = model.detach_hidden(hidden)
            src, target = get_batch(data, seq_len, idx)
            src, target = src.to(device), target.to(device)
            batch_size= src.shape[0]

            prediction, hidden = model(src, hidden)
            prediction = prediction.reshape(batch_size * seq_len, -1)
            target = target.reshape(-1)

            loss = criterion(prediction, target)
            epoch_loss += loss.item() * seq_len
    return epoch_loss / num_batches

In [None]:
n_epochs = 5
seq_len  = 50
clip    = 0.25

lr_scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.5, patience=0)

best_valid_loss = float('inf')

for epoch in range(n_epochs):
    train_loss = train(model, train_data, optimizer, criterion, 
                batch_size, seq_len, clip, device)
    valid_loss = evaluate(model, valid_data, criterion, batch_size, 
                seq_len, device)

    lr_scheduler.step(valid_loss)

    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'best-val-lstm_lm.pt')

    print(f'\tTrain Perplexity: {math.exp(train_loss):.3f}')
    print(f'\tValid Perplexity: {math.exp(valid_loss):.3f}')

## 6. Testing

In [None]:
model.load_state_dict(torch.load('best-val-lstm_lm.pt',  map_location=device))
test_loss = evaluate(model, test_data, criterion, batch_size, seq_len, device)
print(f'Test Perplexity: {math.exp(test_loss):.3f}')

In [7]:
model.load_state_dict(torch.load('best-val-lstm_lm.pt',  map_location=device))

<All keys matched successfully>

## 7. Real-world inference

Here we take the prompt, tokenize, encode and feed it into the model to get the predictions.  We then apply softmax while specifying that we want the output due to the last word in the sequence which represents the prediction for the next word.  We divide the logits by a temperature value to alter the model’s confidence by adjusting the softmax probability distribution.

Once we have the Softmax distribution, we randomly sample it to make our prediction on the next word. If we get <unk> then we give that another try.  Once we get <eos> we stop predicting.
    
We decode the prediction back to strings last lines.

In [8]:
def generate(prompt, max_seq_len, temperature, model, tokenizer, vocab, device, seed=None):
    if seed is not None:
        torch.manual_seed(seed)
    model.eval()
    tokens = tokenizer(prompt)
    indices = [vocab[t] for t in tokens]
    batch_size = 1
    hidden = model.init_hidden(batch_size, device)
    with torch.no_grad():
        for i in range(max_seq_len):
            src = torch.LongTensor([indices]).to(device)
            prediction, hidden = model(src, hidden)
            
            #prediction: [batch size, seq len, vocab size]
            #prediction[:, -1]: [batch size, vocab size] #probability of last vocab
            
            probs = torch.softmax(prediction[:, -1] / temperature, dim=-1)  
            prediction = torch.multinomial(probs, num_samples=1).item()    
            
            while prediction == vocab['<unk>']: #if it is unk, we sample again
                prediction = torch.multinomial(probs, num_samples=1).item()

            if prediction == vocab['<eos>']:    #if it is eos, we stop
                break

            indices.append(prediction) #autoregressive, thus output becomes input

    itos = vocab.get_itos()
    tokens = [itos[i] for i in indices]
    return tokens

In [9]:
tokenizer = word_tokenize

In [13]:
prompt = 'ควย'
max_seq_len = 60
seed = 0

temperatures = [0.5, 0.7, 0.75, 0.8, 1.0]
for temperature in temperatures:
    generation = generate(prompt, max_seq_len, temperature, model, tokenizer, 
                          vocab, device, seed)
    
    print(str(temperature)+'\n'+''.join(generation))

0.5
ควยช้างป่า จ.เลย ถูกขวิดเสียชีวิต 
0.7
ควยช้างป่า จ.เลย ถูกพัดทำร้ายลงมา มีรอยช้ำ 
0.75
ควยช้างป่า จ.เลย ถูกพัดทำร้ายลงมา มีรอยช้ำที่แขน 
0.8
ควยช้างป่า จ.เลย ถูกพัดทำร้ายลงมาอย่างมีเงื่อนงำและความรุนแรงในช่วงหน้าแล้ง
1.0
ควยหัวใจสลาย ขาดหมอ ส่วนเหยื่อคนซ้อนไม่มีใครและความเข้าเวรไทย ทำเอาคนฟังต่างฝ่ายต่างบอกบ้านเราในเรื่องราวที่อยากให้ลูกหนีข่าวได้ คาดเป็นคนที่เล่า มีแม่ม่ายสุนัขช่วยบริจาคผู้พิการหลายแสน เนื่องจากมีคนแจ้งว่าเป็นของขวัญ
