In [1]:
import torch
import torch.nn as nn
import torch.optim as optim

import torchtext, datasets, math
from tqdm import tqdm


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.2.2 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "C:\Users\Admin\AppData\Local\Programs\Python\Python310\lib\runpy.py", line 196, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "C:\Users\Admin\AppData\Local\Programs\Python\Python310\lib\runpy.py", line 86, in _run_code
    exec(code, run_globals)
  File "g:\My Drive\AT82.05 NLU\Assignments\NLP Assignments\.venv\lib\site-packages\ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "g:\My Drive\AT82.05 NLU\Assignments\NLP Assignments\.venv\lib\site-packages

In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cpu


## Task 1. Dataset Acquisition - Harry Potter

In [3]:
from datasets import load_dataset

dataset = load_dataset("KaungHtetCho/Harry_Potter_LSTM")

print(dataset)

# Load Dataset: The datasets library loads the Harry Potter dataset via load_dataset.

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 57435
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 5897
    })
    test: Dataset({
        features: ['text'],
        num_rows: 6589
    })
})


In [29]:
print(dataset['train'].shape)

(57435, 1)


In [5]:
print(dataset['train'][:100])

{'text': ["Harry Potter and the Sorcerer's Stone ", '', 'CHAPTER ONE ', '', 'THE BOY WHO LIVED ', '', "Mr. and Mrs. Dursley, of number four, Privet Drive, were proud to say that they were perfectly normal, thank you very much. They were the last people you'd expect to be involved in anything strange or mysterious, because they just didn't hold with such nonsense. ", '', 'Mr. Dursley was the director of a firm called Grunnings, which made drills. He was a big, beefy man with hardly any neck, although he did have a very large mustache. Mrs. Dursley was thin and blonde and had nearly twice the usual amount of neck, which came in very useful as she spent so much of her time craning over garden fences, spying on the neighbors. The Dursleys had a small son called Dudley and in their opinion there was no finer boy anywhere. ', '', "The Dursleys had everything they wanted, but they also had a secret, and their greatest fear was that somebody would discover it. They didn't think they could bear

 ## Task 2. Model Training

### 1) Preprocessing the Text Data
#### a) Tokenization

In [30]:
import torchtext

# The basic_english tokenizer breaks down the raw text into individual tokens.
tokenizer = torchtext.data.utils.get_tokenizer('basic_english')


# Tokenization is applied to all text examples in the dataset using:
tokenize_data = lambda example: {'tokens': tokenizer(example['text'])}
tokenized_dataset = dataset.map(tokenize_data, remove_columns=['text'])

print(tokenized_dataset['train'][0])

Map: 100%|██████████| 57435/57435 [00:03<00:00, 16922.83 examples/s]
Map: 100%|██████████| 5897/5897 [00:00<00:00, 13717.27 examples/s]
Map: 100%|██████████| 6589/6589 [00:00<00:00, 13099.43 examples/s]

{'tokens': ['harry', 'potter', 'and', 'the', 'sorcerer', "'", 's', 'stone']}





#### b) Building the Vocabulary

In [31]:
# Build Vocabulary: Using torchtext.vocab.build_vocab_from_iterator, a vocabulary is created from tokenized data.
# Words with a frequency less than 3 are excluded (min_freq=3).
# Special tokens like <unk> (unknown) and <eos> (end of sequence) are added

vocab = torchtext.vocab.build_vocab_from_iterator(tokenized_dataset['train']['tokens'], min_freq=3)
vocab.insert_token('<unk>', 0)
vocab.insert_token('<eos>', 1)
vocab.set_default_index(vocab['<unk>'])  # Default token for unknown words

#### c) Preparing the Data for Training

In [32]:
# Numericalize Data: Each token is mapped to its corresponding index in the vocabulary, creating sequences of integers. 
# These sequences are reshaped into batches:

def get_data(dataset, vocab, batch_size):
    data = []
    for example in dataset:
        if example['tokens']:
            tokens = example['tokens'] + ['<eos>']  # Add end-of-sequence token
            tokens = [vocab[token] for token in tokens]
            data.extend(tokens)
    data = torch.LongTensor(data)
    num_batches = data.shape[0] // batch_size
    data = data[:num_batches * batch_size]
    data = data.view(batch_size, num_batches)  # [batch size, seq len]
    return data

# Split Data: The dataset is divided into train, validation, and test sets for model evaluation.
batch_size = 128
train_data = get_data(tokenized_dataset['train'], vocab, batch_size)
valid_data = get_data(tokenized_dataset['validation'], vocab, batch_size)
test_data  = get_data(tokenized_dataset['test'], vocab, batch_size)


### 2) Model Architecture and Training Process
#### a) Model Architecture:

In [None]:
# Model Architecture: The language model is an LSTM-based recurrent neural network with the following key components:

# Embedding Layer:
    # Maps each token (integer) to a dense vector of size emb_dim.
    # Provides semantic meaning for tokens.
# LSTM Layers:
    # A stack of 2 LSTM layers processes the input sequence (num_layers=2).
    # Each LSTM cell has a hidden state dimension of hid_dim=1024.
    # Dropout (dropout_rate=0.65) reduces overfitting.
# Fully Connected Layer:
    # Maps the output of the LSTM layers to the vocabulary size.
    # Predicts the next token in the sequence.

In [33]:
import torch
import torch.nn as nn
import math

class LSTMLanguageModel(nn.Module):
    def __init__(self, vocab_size, emb_dim, hid_dim, num_layers, dropout_rate):
        super().__init__()
        self.num_layers = num_layers
        self.hid_dim = hid_dim
        self.emb_dim = emb_dim

        self.embedding = nn.Embedding(vocab_size, emb_dim)
        self.lstm = nn.LSTM(emb_dim, hid_dim, num_layers=num_layers, dropout=dropout_rate, batch_first=True)
        self.dropout = nn.Dropout(dropout_rate)
        self.fc = nn.Linear(hid_dim, vocab_size)

        self.init_weights()

# Initialization: All weights are initialized uniformly to stabilize training
    def init_weights(self):
        init_range_emb = 0.1
        init_range_other = 1 / math.sqrt(self.hid_dim)
        self.embedding.weight.data.uniform_(-init_range_emb, init_range_other)
        self.fc.weight.data.uniform_(-init_range_other, init_range_other)
        self.fc.bias.data.zero_()
        for i in range(self.num_layers):
            self.lstm.all_weights[i][0] = torch.FloatTensor(self.emb_dim, self.hid_dim).uniform_(-init_range_other, init_range_other)
            self.lstm.all_weights[i][1] = torch.FloatTensor(self.hid_dim, self.hid_dim).uniform_(-init_range_other, init_range_other)

    def init_hidden(self, batch_size, device):
        hidden = torch.zeros(self.num_layers, batch_size, self.hid_dim).to(device)
        cell = torch.zeros(self.num_layers, batch_size, self.hid_dim).to(device)
        return hidden, cell

    def detach_hidden(self, hidden):
        hidden, cell = hidden
        hidden = hidden.detach()
        cell = cell.detach()
        return hidden, cell

    def forward(self, src, hidden):
        embedding = self.dropout(self.embedding(src))  # [batch-size, seq len, emb dim]
        output, hidden = self.lstm(embedding, hidden)  # [batch size, seq len, hid dim]
        output = self.dropout(output)
        prediction = self.fc(output)  # [batch_size, seq_len, vocab_size]
        return prediction, hidden


#### b) Training Process:

In [38]:
# Training Loop: For each batch, the input sequence (src) and target sequence (target) are generated using:

def get_batch(data, seq_len, idx):
    #data #[batch size, bunch of tokens]
    src    = data[:, idx:idx+seq_len]                   
    target = data[:, idx+1:idx+seq_len+1]  #target simply is ahead of src by 1            
    return src, target

In [39]:
def train(model, data, optimizer, criterion, batch_size, seq_len, clip, device):
    
    epoch_loss = 0
    model.train()
    # drop all batches that are not a multiple of seq_len
    # data #[batch size, seq len]
    num_batches = data.shape[-1]
    data = data[:, :num_batches - (num_batches -1) % seq_len]  #we need to -1 because we start at 0
    num_batches = data.shape[-1]
    
    #reset the hidden every epoch
    hidden = model.init_hidden(batch_size, device)
    
    for idx in tqdm(range(0, num_batches - 1, seq_len), desc='Training: ',leave=False):
        optimizer.zero_grad()
        
        #hidden does not need to be in the computational graph for efficiency
        hidden = model.detach_hidden(hidden)

        src, target = get_batch(data, seq_len, idx) #src, target: [batch size, seq len]
        src, target = src.to(device), target.to(device)
        batch_size = src.shape[0]
        # The model predicts the next token using:
        prediction, hidden = model(src, hidden)               

        #need to reshape because criterion expects pred to be 2d and target to be 1d
        prediction = prediction.reshape(batch_size * seq_len, -1)  #prediction: [batch size * seq len, vocab size]  
        target = target.reshape(-1)
        # The loss is computed using CrossEntropyLoss
        loss = criterion(prediction, target)
        
        # Gradients are backpropagated, and weights are updated using the Adam optimizer:
        loss.backward()
        # Gradient clipping (clip=0.25) ensures stable training:
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        epoch_loss += loss.item() * seq_len
    return epoch_loss / num_batches

In [40]:
def evaluate(model, data, criterion, batch_size, seq_len, device):

    epoch_loss = 0
    model.eval()
    num_batches = data.shape[-1]
    data = data[:, :num_batches - (num_batches -1) % seq_len]
    num_batches = data.shape[-1]

    hidden = model.init_hidden(batch_size, device)

    with torch.no_grad():
        for idx in range(0, num_batches - 1, seq_len):
            hidden = model.detach_hidden(hidden)
            src, target = get_batch(data, seq_len, idx)
            src, target = src.to(device), target.to(device)
            batch_size= src.shape[0]

            prediction, hidden = model(src, hidden)
            prediction = prediction.reshape(batch_size * seq_len, -1)
            target = target.reshape(-1)

            loss = criterion(prediction, target)
            epoch_loss += loss.item() * seq_len
    return epoch_loss / num_batches

In [41]:
import torch.optim as optim

# Initialize model
model = LSTMLanguageModel(len(vocab), emb_dim=1024, hid_dim=1024, num_layers=2, dropout_rate=0.65).to(device)
optimizer = optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss()

# Training and evaluation loop
n_epochs = 50
seq_len = 50  # Decoding length
clip = 0.25

for epoch in range(n_epochs):
    train_loss = train(model, train_data, optimizer, criterion, batch_size, seq_len, clip, device)
    valid_loss = evaluate(model, valid_data, criterion, batch_size, seq_len, device)

    print(f'Epoch {epoch+1}:')
#     Validation:
        # After each epoch, the model is evaluated on the validation set to monitor performance.
        # Perplexity, a measure of language model quality, is reported:
    print(f'\tTrain Perplexity: {math.exp(train_loss):.3f}')
    print(f'\tValid Perplexity: {math.exp(valid_loss):.3f}')


                                                           

Epoch 1:
	Train Perplexity: 516.267
	Valid Perplexity: 312.083


                                                           

Epoch 2:
	Train Perplexity: 247.631
	Valid Perplexity: 152.466


                                                           

Epoch 3:
	Train Perplexity: 152.147
	Valid Perplexity: 113.978


                                                           

Epoch 4:
	Train Perplexity: 120.241
	Valid Perplexity: 98.783


                                                           

Epoch 5:
	Train Perplexity: 103.616
	Valid Perplexity: 90.179


                                                           

Epoch 6:
	Train Perplexity: 92.929
	Valid Perplexity: 84.597


                                                           

Epoch 7:
	Train Perplexity: 84.814
	Valid Perplexity: 80.463


                                                           

Epoch 8:
	Train Perplexity: 78.236
	Valid Perplexity: 77.509


                                                           

Epoch 9:
	Train Perplexity: 73.009
	Valid Perplexity: 75.296


                                                           

Epoch 10:
	Train Perplexity: 68.483
	Valid Perplexity: 73.751


                                                           

Epoch 11:
	Train Perplexity: 64.871
	Valid Perplexity: 72.207


                                                           

Epoch 12:
	Train Perplexity: 61.609
	Valid Perplexity: 70.873


                                                           

Epoch 13:
	Train Perplexity: 58.663
	Valid Perplexity: 70.029


                                                           

Epoch 14:
	Train Perplexity: 56.337
	Valid Perplexity: 69.555


                                                           

Epoch 15:
	Train Perplexity: 54.002
	Valid Perplexity: 68.965


                                                           

Epoch 16:
	Train Perplexity: 51.981
	Valid Perplexity: 68.296


                                                           

Epoch 17:
	Train Perplexity: 50.236
	Valid Perplexity: 68.405


                                                           

Epoch 18:
	Train Perplexity: 48.575
	Valid Perplexity: 67.821


                                                           

Epoch 19:
	Train Perplexity: 47.057
	Valid Perplexity: 67.467


                                                           

Epoch 20:
	Train Perplexity: 45.619
	Valid Perplexity: 67.472


                                                           

Epoch 21:
	Train Perplexity: 44.257
	Valid Perplexity: 67.253


                                                           

Epoch 22:
	Train Perplexity: 42.947
	Valid Perplexity: 67.177


                                                           

Epoch 23:
	Train Perplexity: 41.883
	Valid Perplexity: 66.773


                                                           

Epoch 24:
	Train Perplexity: 40.779
	Valid Perplexity: 66.995


                                                           

Epoch 25:
	Train Perplexity: 39.789
	Valid Perplexity: 67.002


                                                           

Epoch 26:
	Train Perplexity: 38.783
	Valid Perplexity: 66.821


                                                           

Epoch 27:
	Train Perplexity: 37.921
	Valid Perplexity: 67.074


                                                           

Epoch 28:
	Train Perplexity: 37.106
	Valid Perplexity: 67.289


                                                           

Epoch 29:
	Train Perplexity: 36.248
	Valid Perplexity: 67.871


                                                           

Epoch 30:
	Train Perplexity: 35.565
	Valid Perplexity: 68.523


                                                           

Epoch 31:
	Train Perplexity: 34.879
	Valid Perplexity: 68.372


                                                           

Epoch 32:
	Train Perplexity: 34.235
	Valid Perplexity: 68.812


                                                           

Epoch 33:
	Train Perplexity: 33.652
	Valid Perplexity: 68.290


                                                           

Epoch 34:
	Train Perplexity: 32.945
	Valid Perplexity: 69.306


                                                           

Epoch 35:
	Train Perplexity: 32.414
	Valid Perplexity: 69.373


                                                           

Epoch 36:
	Train Perplexity: 31.867
	Valid Perplexity: 69.967


                                                           

Epoch 37:
	Train Perplexity: 31.379
	Valid Perplexity: 69.882


                                                           

Epoch 38:
	Train Perplexity: 30.743
	Valid Perplexity: 70.405


                                                           

Epoch 39:
	Train Perplexity: 30.341
	Valid Perplexity: 70.952


                                                           

Epoch 40:
	Train Perplexity: 29.889
	Valid Perplexity: 70.735


                                                           

Epoch 41:
	Train Perplexity: 29.487
	Valid Perplexity: 71.093


                                                           

Epoch 42:
	Train Perplexity: 29.069
	Valid Perplexity: 71.712


                                                           

Epoch 43:
	Train Perplexity: 28.747
	Valid Perplexity: 71.507


                                                           

Epoch 44:
	Train Perplexity: 28.284
	Valid Perplexity: 71.744


                                                           

Epoch 45:
	Train Perplexity: 27.972
	Valid Perplexity: 72.116


                                                           

Epoch 46:
	Train Perplexity: 27.620
	Valid Perplexity: 71.767


                                                           

Epoch 47:
	Train Perplexity: 27.291
	Valid Perplexity: 72.441


                                                           

Epoch 48:
	Train Perplexity: 26.946
	Valid Perplexity: 72.581


                                                           

Epoch 49:
	Train Perplexity: 26.712
	Valid Perplexity: 73.117


                                                           

Epoch 50:
	Train Perplexity: 26.325
	Valid Perplexity: 73.195


In [45]:
torch.save(model.state_dict(), 'best-val-lstm_lm.pt')

## 6. Testing

In [46]:
model.load_state_dict(torch.load('best-val-lstm_lm.pt',  map_location=device))
test_loss = evaluate(model, test_data, criterion, batch_size, seq_len, device)
print(f'Test Perplexity: {math.exp(test_loss):.3f}')

Test Perplexity: 96.453


## 7. Real-world inference

Here we take the prompt, tokenize, encode and feed it into the model to get the predictions.  We then apply softmax while specifying that we want the output due to the last word in the sequence which represents the prediction for the next word.  We divide the logits by a temperature value to alter the model’s confidence by adjusting the softmax probability distribution.

Once we have the Softmax distribution, we randomly sample it to make our prediction on the next word. If we get <unk> then we give that another try.  Once we get <eos> we stop predicting.
    
We decode the prediction back to strings last lines.

In [47]:
def generate(prompt, max_seq_len, temperature, model, tokenizer, vocab, device, seed=None):
    if seed is not None:
        torch.manual_seed(seed)
    model.eval()
    tokens = tokenizer(prompt)
    indices = [vocab[t] for t in tokens]
    batch_size = 1
    hidden = model.init_hidden(batch_size, device)
    with torch.no_grad():
        for i in range(max_seq_len):
            src = torch.LongTensor([indices]).to(device)
            prediction, hidden = model(src, hidden)
            
            #prediction: [batch size, seq len, vocab size]
            #prediction[:, -1]: [batch size, vocab size] #probability of last vocab
            
            probs = torch.softmax(prediction[:, -1] / temperature, dim=-1)  
            prediction = torch.multinomial(probs, num_samples=1).item()    
            
            while prediction == vocab['<unk>']: #if it is unk, we sample again
                prediction = torch.multinomial(probs, num_samples=1).item()

            if prediction == vocab['<eos>']:    #if it is eos, we stop
                break

            indices.append(prediction) #autoregressive, thus output becomes input

    itos = vocab.get_itos()
    tokens = [itos[i] for i in indices]
    return tokens

In [48]:
prompt = 'Harry Potter is '
max_seq_len = 30
seed = 0

#smaller the temperature, more diverse tokens but comes 
#with a tradeoff of less-make-sense sentence
temperatures = [0.5, 0.7, 0.75, 0.8, 1.0]
for temperature in temperatures:
    generation = generate(prompt, max_seq_len, temperature, model, tokenizer, 
                          vocab, device, seed)
    print(str(temperature)+'\n'+' '.join(generation)+'\n')

0.5
harry potter is a bad man , i reckon you ' d better go back to bed , and i ' m going to go to the hospital wing .

0.7
harry potter is a bad man , i reckon you ' d go to bed .

0.75
harry potter is . he didn ' t need him , but he didn ' t like a single idea of a very good time quidditch , it wasn ' t a attack

0.8
harry potter is . he didn ' t need him , but he didn ' t like a single family .

1.0
harry potter is . he didn ' t need him , but he didn ' t like good for his hand to eat the seeing .

