# A2 - Language Modeling

In [55]:
import torch
import torch.nn as nn
import torch.optim as optim
import math
import torchtext, datasets
from tqdm import tqdm
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
import pickle

In [56]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(device)

cuda:0


In [57]:
SEED = 1234
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

# 1. Load Data

In [58]:
import os
os.environ['http_proxy']  = 'http://192.41.170.23:3128'
os.environ['https_proxy'] = 'http://192.41.170.23:3128'

In [59]:
dataset = datasets.load_dataset('KaungHtetCho/Harry_Potter_LSTM')
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 57435
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 5897
    })
    test: Dataset({
        features: ['text'],
        num_rows: 6589
    })
})


In [60]:
print(dataset['train'][512]['text'])

Harry went back to the kitchen, still staring at his letter. He handed Uncle Vernon the bill and the postcard, sat down, and slowly began to open the yellow envelope. 


# 2.Preprocessing

### Tokenizing

In [61]:
tokenizer = get_tokenizer('basic_english')

#function to tokenize
tokenize_data = lambda example, tokenizer: {'tokens': tokenizer(example['text'])}  

#map the function to each example
tokenized_dataset = dataset.map(tokenize_data, remove_columns=['text'], fn_kwargs={'tokenizer': tokenizer})
print(tokenized_dataset['train'][512]['tokens'])

['harry', 'went', 'back', 'to', 'the', 'kitchen', ',', 'still', 'staring', 'at', 'his', 'letter', '.', 'he', 'handed', 'uncle', 'vernon', 'the', 'bill', 'and', 'the', 'postcard', ',', 'sat', 'down', ',', 'and', 'slowly', 'began', 'to', 'open', 'the', 'yellow', 'envelope', '.']


### Numericalizing

In [62]:
## numericalizing

# Define special symbols and indices
UNK_IDX, PAD_IDX, SOS_IDX, EOS_IDX = 0, 1, 2, 3
# Make sure the tokens are in order of their indices to properly insert them in vocab
special_symbols = ['<unk>', '<pad>', '<sos>', '<eos>']

vocab = build_vocab_from_iterator(tokenized_dataset['train']['tokens'], min_freq=3, specials=special_symbols)   

vocab.set_default_index(vocab['<unk>'])   
print(len(vocab))                         
print(vocab.get_itos()[:10])

9805
['<unk>', '<pad>', '<sos>', '<eos>', '.', ',', 'the', 'and', 'to', "'"]


In [63]:
# Save the vocab
with open("vocab.pkl", "wb") as f:
    pickle.dump(vocab, f)

# 3. Preparing batch loader

In [64]:
def get_data(dataset, vocab, batch_size):
    data = []                                                   
    for example in dataset:
        if example['tokens']:         
            #appends eos so we know it ends....so model learn how to end...                             
            tokens = example['tokens'].append('<eos>')   
            #numericalize          
            tokens = [vocab[token] for token in example['tokens']] 
            data.extend(tokens)                                    
    data = torch.LongTensor(data)                                 
    num_batches = data.shape[0] // batch_size #get the int number of batches...
    data = data[:num_batches * batch_size] #make the batch evenly, and cut out any remaining                      
    data = data.view(batch_size, num_batches)     
    return data #[batch size, bunch of tokens]

In [65]:
batch_size = 128
train_data = get_data(tokenized_dataset['train'], vocab, batch_size)
valid_data = get_data(tokenized_dataset['validation'], vocab, batch_size)
test_data  = get_data(tokenized_dataset['test'], vocab, batch_size)

# 4. Modeling

In [66]:
class LSTMLanguageModel(nn.Module):
    def __init__(self, vocab_size, emb_dim, hid_dim, num_layers, dropout_rate):
                
        super().__init__()
        self.num_layers = num_layers
        self.hid_dim = hid_dim
        self.emb_dim = emb_dim

        self.embedding = nn.Embedding(vocab_size, emb_dim)
        self.lstm = nn.LSTM(emb_dim, hid_dim, num_layers=num_layers, 
                    dropout=dropout_rate, batch_first=True)
        self.dropout = nn.Dropout(dropout_rate)
        self.fc = nn.Linear(hid_dim, vocab_size)
        
        self.init_weights()
        
    def init_weights(self):
        init_range_emb = 0.1
        init_range_other = 1/math.sqrt(self.hid_dim)
        self.embedding.weight.data.uniform_(-init_range_emb, init_range_emb)
        self.fc.weight.data.uniform_(-init_range_other, init_range_other)
        self.fc.bias.data.zero_()
        for i in range(self.num_layers):
            self.lstm.all_weights[i][0] = torch.FloatTensor(self.emb_dim,
                    self.hid_dim).uniform_(-init_range_other, init_range_other) 
            self.lstm.all_weights[i][1] = torch.FloatTensor(self.hid_dim, 
                    self.hid_dim).uniform_(-init_range_other, init_range_other) 

    def init_hidden(self, batch_size, device):
        hidden = torch.zeros(self.num_layers, batch_size, self.hid_dim).to(device)
        cell   = torch.zeros(self.num_layers, batch_size, self.hid_dim).to(device)
        return hidden, cell
    
    def detach_hidden(self, hidden):
        hidden, cell = hidden
        hidden = hidden.detach()
        cell = cell.detach()
        return hidden, cell

    def forward(self, src, hidden):
        #src: [batch size, seq len]
        embedding = self.dropout(self.embedding(src))
        #embedding: [batch size, seq len, emb_dim]
        output, hidden = self.lstm(embedding, hidden)      
        #output: [batch size, seq len, hid_dim]
        #hidden = h, c = [num_layers * direction, seq len, hid_dim)
        output = self.dropout(output) 
        prediction = self.fc(output)
        #prediction: [batch size, seq_len, vocab size]
        return prediction, hidden

# 5. Training

In [67]:
vocab_size = len(vocab)
emb_dim = 1024                # 400 in the paper
hid_dim = 1024                # 1150 in the paper
num_layers = 2                # 3 in the paper
dropout_rate = 0.65              
lr = 1e-3    

In [68]:
model = LSTMLanguageModel(vocab_size, emb_dim, hid_dim, num_layers, dropout_rate).to(device)
optimizer = optim.Adam(model.parameters(), lr=lr)
criterion = nn.CrossEntropyLoss()
num_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f'The model has {num_params:,} trainable parameters')

The model has 36,884,045 trainable parameters


In [69]:
def get_batch(data, seq_len, idx):
    #data #[batch size, bunch of tokens]
    src    = data[:, idx:idx+seq_len]                   
    target = data[:, idx+1:idx+seq_len+1]  #target simply is ahead of src by 1            
    return src, target

In [70]:
def train(model, data, optimizer, criterion, batch_size, seq_len, clip, device):
    
    epoch_loss = 0
    model.train()
    # drop all batches that are not a multiple of seq_len
    # data #[batch size, bunch of tokens]
    num_batches = data.shape[-1]
    data = data[:, :num_batches - (num_batches -1) % seq_len]  #we need to -1 because we start at 0
    num_batches = data.shape[-1]
    
    #reset the hidden every epoch
    hidden = model.init_hidden(batch_size, device)
    
    for idx in tqdm(range(0, num_batches - 1, seq_len), desc='Training: ',leave=False):
        optimizer.zero_grad()
        
        #hidden does not need to be in the computational graph for efficiency
        hidden = model.detach_hidden(hidden)

        src, target = get_batch(data, seq_len, idx) #src, target: [batch size, seq len]
        src, target = src.to(device), target.to(device)
        batch_size = src.shape[0]
        prediction, hidden = model(src, hidden)               

        #need to reshape because criterion expects pred to be 2d and target to be 1d
        prediction = prediction.reshape(batch_size * seq_len, -1)  #prediction: [batch size * seq len, vocab size]  
        target = target.reshape(-1)
        loss = criterion(prediction, target)
        
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        epoch_loss += loss.item() * seq_len
    return epoch_loss / num_batches

In [71]:
def evaluate(model, data, criterion, batch_size, seq_len, device):

    epoch_loss = 0
    model.eval()
    num_batches = data.shape[-1]
    data = data[:, :num_batches - (num_batches -1) % seq_len]
    num_batches = data.shape[-1]

    hidden = model.init_hidden(batch_size, device)

    with torch.no_grad():
        for idx in range(0, num_batches - 1, seq_len):
            hidden = model.detach_hidden(hidden)
            src, target = get_batch(data, seq_len, idx)
            src, target = src.to(device), target.to(device)
            batch_size= src.shape[0]

            prediction, hidden = model(src, hidden)
            prediction = prediction.reshape(batch_size * seq_len, -1)
            target = target.reshape(-1)

            loss = criterion(prediction, target)
            epoch_loss += loss.item() * seq_len
    return epoch_loss / num_batches

In [17]:
n_epochs = 50
seq_len  = 50 #<----decoding length
clip    = 0.25

lr_scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.5, patience=0)

best_valid_loss = float('inf')

for epoch in range(n_epochs):
    train_loss = train(model, train_data, optimizer, criterion, 
                batch_size, seq_len, clip, device)
    valid_loss = evaluate(model, valid_data, criterion, batch_size, 
                seq_len, device)

    lr_scheduler.step(valid_loss)

    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'best-val-lstm_lm.pt')

    print(f'\tTrain Perplexity: {math.exp(train_loss):.3f}')
    print(f'\tValid Perplexity: {math.exp(valid_loss):.3f}')

                                                           

	Train Perplexity: 440.298
	Valid Perplexity: 249.668


                                                           

	Train Perplexity: 190.563
	Valid Perplexity: 129.291


                                                           

	Train Perplexity: 134.131
	Valid Perplexity: 106.986


                                                           

	Train Perplexity: 112.372
	Valid Perplexity: 96.674


                                                           

	Train Perplexity: 100.041
	Valid Perplexity: 89.816


                                                           

	Train Perplexity: 91.626
	Valid Perplexity: 85.362


                                                           

	Train Perplexity: 85.084
	Valid Perplexity: 81.959


                                                           

	Train Perplexity: 79.897
	Valid Perplexity: 79.037


                                                           

	Train Perplexity: 75.535
	Valid Perplexity: 77.038


                                                           

	Train Perplexity: 71.722
	Valid Perplexity: 75.215


                                                           

	Train Perplexity: 68.495
	Valid Perplexity: 73.852


                                                           

	Train Perplexity: 65.534
	Valid Perplexity: 72.683


                                                           

	Train Perplexity: 62.779
	Valid Perplexity: 71.791


                                                           

	Train Perplexity: 60.509
	Valid Perplexity: 71.060


                                                           

	Train Perplexity: 58.365
	Valid Perplexity: 70.553


                                                           

	Train Perplexity: 56.333
	Valid Perplexity: 70.237


                                                           

	Train Perplexity: 54.415
	Valid Perplexity: 70.106


                                                           

	Train Perplexity: 52.790
	Valid Perplexity: 69.793


                                                           

	Train Perplexity: 51.259
	Valid Perplexity: 69.885


                                                           

	Train Perplexity: 49.023
	Valid Perplexity: 69.070


                                                           

	Train Perplexity: 47.924
	Valid Perplexity: 68.788


                                                           

	Train Perplexity: 47.060
	Valid Perplexity: 68.700


                                                           

	Train Perplexity: 46.364
	Valid Perplexity: 68.602


                                                           

	Train Perplexity: 45.546
	Valid Perplexity: 68.679


                                                           

	Train Perplexity: 44.534
	Valid Perplexity: 68.283


                                                           

	Train Perplexity: 44.017
	Valid Perplexity: 68.330


                                                           

	Train Perplexity: 43.510
	Valid Perplexity: 68.056


                                                           

	Train Perplexity: 43.247
	Valid Perplexity: 68.099


                                                           

	Train Perplexity: 42.896
	Valid Perplexity: 67.956


                                                           

	Train Perplexity: 42.760
	Valid Perplexity: 67.957


                                                           

	Train Perplexity: 42.609
	Valid Perplexity: 67.979


                                                           

	Train Perplexity: 42.398
	Valid Perplexity: 67.995


                                                           

	Train Perplexity: 42.326
	Valid Perplexity: 68.014


                                                           

	Train Perplexity: 42.376
	Valid Perplexity: 68.012


                                                           

	Train Perplexity: 42.304
	Valid Perplexity: 68.007


                                                           

	Train Perplexity: 42.346
	Valid Perplexity: 68.006


                                                           

	Train Perplexity: 42.355
	Valid Perplexity: 68.005


                                                           

	Train Perplexity: 42.274
	Valid Perplexity: 68.006


                                                           

	Train Perplexity: 42.238
	Valid Perplexity: 68.006


                                                           

	Train Perplexity: 42.334
	Valid Perplexity: 68.006


                                                           

	Train Perplexity: 42.308
	Valid Perplexity: 68.006


                                                           

	Train Perplexity: 42.340
	Valid Perplexity: 68.006


                                                           

	Train Perplexity: 42.343
	Valid Perplexity: 68.006


                                                           

	Train Perplexity: 42.321
	Valid Perplexity: 68.006


                                                           

	Train Perplexity: 42.267
	Valid Perplexity: 68.006


                                                           

	Train Perplexity: 42.304
	Valid Perplexity: 68.006


                                                           

	Train Perplexity: 42.289
	Valid Perplexity: 68.006


                                                           

	Train Perplexity: 42.307
	Valid Perplexity: 68.006


                                                           

	Train Perplexity: 42.334
	Valid Perplexity: 68.006


                                                           

	Train Perplexity: 42.323
	Valid Perplexity: 68.006


# 6.Testing

In [72]:
model.load_state_dict(torch.load('best-val-lstm_lm.pt',  map_location=device))
test_loss = evaluate(model, test_data, criterion, batch_size, seq_len, device)
print(f'Test Perplexity: {math.exp(test_loss):.3f}')

Test Perplexity: 85.020


# 7. Real-World Inference

In [73]:
with open("vocab.pkl", "rb") as f:
    vocab_test = pickle.load(f)

In [74]:
def generate(prompt, max_seq_len, temperature, model, tokenizer, vocab, device, seed=None):
    if seed is not None:
        torch.manual_seed(seed)
    model.eval()
    tokens = tokenizer(prompt)
    indices = [vocab[t] for t in tokens]
    batch_size = 1
    hidden = model.init_hidden(batch_size, device)
    with torch.no_grad():
        for i in range(max_seq_len):
            src = torch.LongTensor([indices]).to(device)
            prediction, hidden = model(src, hidden)
            
            #prediction: [batch size, seq len, vocab size]
            #prediction[:, -1]: [batch size, vocab size] #probability of last vocab
            
            probs = torch.softmax(prediction[:, -1] / temperature, dim=-1)  
            prediction = torch.multinomial(probs, num_samples=1).item()    
            
            while prediction == vocab['<unk>']: #if it is unk, we sample again
                prediction = torch.multinomial(probs, num_samples=1).item()

            if prediction == vocab['<eos>']:    #if it is eos, we stop
                break

            indices.append(prediction) #autoregressive, thus output becomes input

    itos = vocab.get_itos()
    tokens = [itos[i] for i in indices]
    return tokens

In [75]:
prompt = 'Harry Potter is'
max_seq_len = 30
seed = 0

#smaller the temperature, more diverse tokens but comes 
#with a tradeoff of less-make-sense sentence
temperatures = [0.5, 0.7, 0.75, 0.8, 1.0]
for temperature in temperatures:
    generation = generate(prompt, max_seq_len, temperature, model, tokenizer, 
                          vocab_test, device, seed)
    print(str(temperature)+'\n'+' '.join(generation)+'\n')

0.5
harry potter is as sane as he ' d been at hogwarts .

0.7
harry potter is as sane as he has been to tell me . yes , i am going to tell you what he was .

0.75
harry potter is as sane as he slept . i was off to all sorts of things in your cupboard . should he have to run on the doorstep in the pub ?

0.8
harry potter is as sane as he slept . i was off to all sorts of things in your cupboard . should he go to bed ? he said quietly .

1.0
harry potter is as sane as august . i cared what off to all in there , because he found it albus lupin potter .

