In [1]:
!pip install torchtext datasets --no-cache-dir

Collecting torchtext
  Downloading torchtext-0.18.0-cp311-cp311-manylinux1_x86_64.whl.metadata (7.9 kB)
Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading torchtext-0.18.0-cp311-cp311-manylinux1_x86_64.whl (2.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m247.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
!pip uninstall torchtext torch -y

Found existing installation: torchtext 0.18.0
Uninstalling torchtext-0.18.0:
  Successfully uninstalled torchtext-0.18.0
Found existing installation: torch 2.5.1+cu121
Uninstalling torch-2.5.1+cu121:
  Successfully uninstalled torch-2.5.1+cu121


In [3]:
!pip install torch==2.1.0+cu121 torchtext==0.16.0 -f https://download.pytorch.org/whl/cu121/torch_stable.html

Looking in links: https://download.pytorch.org/whl/cu121/torch_stable.html
Collecting torch==2.1.0+cu121
  Downloading https://download.pytorch.org/whl/cu121/torch-2.1.0%2Bcu121-cp311-cp311-linux_x86_64.whl (2200.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 GB[0m [31m522.1 kB/s[0m eta [36m0:00:00[0m
[?25hCollecting torchtext==0.16.0
  Downloading https://download.pytorch.org/whl/torchtext-0.16.0%2Bcpu-cp311-cp311-linux_x86_64.whl (2.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m56.6 MB/s[0m eta [36m0:00:00[0m
Collecting triton==2.1.0 (from torch==2.1.0+cu121)
  Downloading triton-2.1.0-0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (1.3 kB)
Collecting torchdata==0.7.0 (from torchtext==0.16.0)
  Downloading torchdata-0.7.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Downloading torchdata-0.7.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.

In [4]:
import torch
import torch.nn as nn
import torch.optim as optim

import torchtext, datasets, math
from tqdm import tqdm

In [5]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cuda


In [6]:
SEED = 1234
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

## 1. Load data - Wiki Text

We will be using wikitext which contains a large corpus of text, perfect for language modeling task.  we will use the `datasets` library from HuggingFace to load.

# **sleeping-ai/TEKGEN-Wiki dataset**

In [7]:
dataset = datasets.load_dataset("sleeping-ai/TEKGEN-Wiki")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/614 [00:00<?, ?B/s]

quadruples-train.txt:   0%|          | 0.00/840M [00:00<?, ?B/s]

quadruples-validation.txt:   0%|          | 0.00/105M [00:00<?, ?B/s]

quadruples-test.txt:   0%|          | 0.00/106M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/6310060 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/788745 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/796981 [00:00<?, ? examples/s]

In [8]:
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 6310060
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 788745
    })
    test: Dataset({
        features: ['text'],
        num_rows: 796981
    })
})


In [9]:
# Take only the first 50000 examples from the training set
dataset['train'] = dataset['train'].select(range(50000))
dataset['validation'] = dataset['validation'].select(range(50000))
dataset['test'] = dataset['test'].select(range(50000))
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 50000
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 50000
    })
    test: Dataset({
        features: ['text'],
        num_rows: 50000
    })
})


In [10]:
print(dataset['train'].shape)

(50000, 1)


# 2. Preprocessing
### Tokenizing

Simply tokenize the given text to tokens.

In [11]:
tokenizer = torchtext.data.utils.get_tokenizer('basic_english')

tokenize_data = lambda example, tokenizer: {'tokens': tokenizer(example['text'])}

tokenized_dataset = dataset.map(tokenize_data, remove_columns=['text'], fn_kwargs={'tokenizer': tokenizer})

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [12]:
print(tokenized_dataset['train'][223]['tokens'])

['roger', 'gaillard', '(', 'port-au-prince', ',', '10', 'april', '1923', '-', '2000', ')', 'was', 'a', 'haitian', 'historian', 'and', 'novelist', '.']


### Numericalizing

In [13]:
vocab = torchtext.vocab.build_vocab_from_iterator(tokenized_dataset['train']['tokens'], min_freq=3)
vocab.insert_token('<unk>', 0)
vocab.insert_token('<eos>', 1)
vocab.set_default_index(vocab['<unk>'])

In [14]:
print(len(vocab))

23085


In [15]:
print(vocab.get_itos()[:10])

['<unk>', '<eos>', 'the', ',', '.', 'in', 'of', 'a', 'and', 'is']


## 3. Prepare the batch loader

### Prepare data

In [16]:
def get_data(dataset, vocab, batch_size):
    data = []
    for example in dataset:
        if example['tokens']:
            tokens = example['tokens'].append('<eos>')
            tokens = [vocab[token] for token in example['tokens']]
            data.extend(tokens)
    data = torch.LongTensor(data)
    num_batches = data.shape[0] // batch_size
    data = data[:num_batches * batch_size]
    data = data.view(batch_size, num_batches) #view vs. reshape (whether data is contiguous)
    return data #[batch size, seq len]

In [17]:
batch_size = 128
train_data = get_data(tokenized_dataset['train'], vocab, batch_size)
valid_data = get_data(tokenized_dataset['validation'], vocab, batch_size)
test_data  = get_data(tokenized_dataset['test'],  vocab, batch_size)

In [18]:
train_data.shape

torch.Size([128, 10229])

## 4. Modeling

In [19]:
class LSTMLanguageModel(nn.Module):
    def __init__(self, vocab_size, emb_dim, hid_dim, num_layers, dropout_rate):
        super().__init__()
        self.num_layers = num_layers
        self.hid_dim    = hid_dim
        self.emb_dim    = emb_dim

        self.embedding  = nn.Embedding(vocab_size, emb_dim)
        self.lstm       = nn.LSTM(emb_dim, hid_dim, num_layers=num_layers, dropout=dropout_rate, batch_first=True)
        self.dropout    = nn.Dropout(dropout_rate)
        self.fc         = nn.Linear(hid_dim, vocab_size)

        self.init_weights()

    def init_weights(self):
        init_range_emb = 0.1
        init_range_other = 1/math.sqrt(self.hid_dim)
        self.embedding.weight.data.uniform_(-init_range_emb, init_range_other)
        self.fc.weight.data.uniform_(-init_range_other, init_range_other)
        self.fc.bias.data.zero_()
        for i in range(self.num_layers):
            self.lstm.all_weights[i][0] = torch.FloatTensor(self.emb_dim,
                self.hid_dim).uniform_(-init_range_other, init_range_other) #We
            self.lstm.all_weights[i][1] = torch.FloatTensor(self.hid_dim,
                self.hid_dim).uniform_(-init_range_other, init_range_other) #Wh

    def init_hidden(self, batch_size, device):
        hidden = torch.zeros(self.num_layers, batch_size, self.hid_dim).to(device)
        cell   = torch.zeros(self.num_layers, batch_size, self.hid_dim).to(device)
        return hidden, cell

    def detach_hidden(self, hidden):
        hidden, cell = hidden
        hidden = hidden.detach() #not to be used for gradient computation
        cell   = cell.detach()
        return hidden, cell

    def forward(self, src, hidden):
        #src: [batch_size, seq len]
        embedding = self.dropout(self.embedding(src)) #harry potter is
        #embedding: [batch-size, seq len, emb dim]
        output, hidden = self.lstm(embedding, hidden)
        #ouput: [batch size, seq len, hid dim]
        #hidden: [num_layers * direction, seq len, hid_dim]
        output = self.dropout(output)
        prediction =self.fc(output)
        #prediction: [batch_size, seq_len, vocab_size]
        return prediction, hidden

## 5. Training

In [20]:
vocab_size = len(vocab)
emb_dim = 1024                # 400 in the paper
hid_dim = 1024                # 1150 in the paper
num_layers = 2                # 3 in the paper
dropout_rate = 0.65
lr = 1e-3

In [21]:
model      = LSTMLanguageModel(vocab_size, emb_dim, hid_dim, num_layers, dropout_rate).to(device)
optimizer  = optim.Adam(model.parameters(), lr=lr)
criterion  = nn.CrossEntropyLoss()
num_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f'The model has {num_params:,} trainable parameters')

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

The model has 64,094,765 trainable parameters


In [22]:
def get_batch(data, seq_len, idx):
    #data #[batch size, bunch of tokens]
    src    = data[:, idx:idx+seq_len]
    target = data[:, idx+1:idx+seq_len+1]  #target simply is ahead of src by 1
    return src, target

In [23]:
def train(model, data, optimizer, criterion, batch_size, seq_len, clip, device):

    epoch_loss = 0
    model.train()
    # drop all batches that are not a multiple of seq_len
    # data #[batch size, seq len]
    num_batches = data.shape[-1]
    data = data[:, :num_batches - (num_batches -1) % seq_len]  #we need to -1 because we start at 0
    num_batches = data.shape[-1]

    #reset the hidden every epoch
    hidden = model.init_hidden(batch_size, device)

    for idx in tqdm(range(0, num_batches - 1, seq_len), desc='Training: ',leave=False):
        optimizer.zero_grad()

        #hidden does not need to be in the computational graph for efficiency
        hidden = model.detach_hidden(hidden)

        src, target = get_batch(data, seq_len, idx) #src, target: [batch size, seq len]
        src, target = src.to(device), target.to(device)
        batch_size = src.shape[0]
        prediction, hidden = model(src, hidden)

        #need to reshape because criterion expects pred to be 2d and target to be 1d
        prediction = prediction.reshape(batch_size * seq_len, -1)  #prediction: [batch size * seq len, vocab size]
        target = target.reshape(-1)
        loss = criterion(prediction, target)

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        epoch_loss += loss.item() * seq_len
    return epoch_loss / num_batches

In [24]:
def evaluate(model, data, criterion, batch_size, seq_len, device):

    epoch_loss = 0
    model.eval()
    num_batches = data.shape[-1]
    data = data[:, :num_batches - (num_batches -1) % seq_len]
    num_batches = data.shape[-1]

    hidden = model.init_hidden(batch_size, device)

    with torch.no_grad():
        for idx in range(0, num_batches - 1, seq_len):
            hidden = model.detach_hidden(hidden)
            src, target = get_batch(data, seq_len, idx)
            src, target = src.to(device), target.to(device)
            batch_size= src.shape[0]

            prediction, hidden = model(src, hidden)
            prediction = prediction.reshape(batch_size * seq_len, -1)
            target = target.reshape(-1)

            loss = criterion(prediction, target)
            epoch_loss += loss.item() * seq_len
    return epoch_loss / num_batches

In [25]:
n_epochs = 50
seq_len  = 50 #<----decoding length
clip    = 0.25

lr_scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.5, patience=0)

best_valid_loss = float('inf')

for epoch in range(n_epochs):
    train_loss = train(model, train_data, optimizer, criterion,
                batch_size, seq_len, clip, device)
    valid_loss = evaluate(model, valid_data, criterion, batch_size,
                seq_len, device)

    lr_scheduler.step(valid_loss)

    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'best-val-lstm_lm.pt')

    print(f'\tTrain Perplexity: {math.exp(train_loss):.3f}')
    print(f'\tValid Perplexity: {math.exp(valid_loss):.3f}')



	Train Perplexity: 586.265
	Valid Perplexity: 226.085




	Train Perplexity: 203.382
	Valid Perplexity: 116.897




	Train Perplexity: 129.521
	Valid Perplexity: 89.948




	Train Perplexity: 101.742
	Valid Perplexity: 76.331




	Train Perplexity: 85.330
	Valid Perplexity: 68.425




	Train Perplexity: 74.517
	Valid Perplexity: 63.025




	Train Perplexity: 66.551
	Valid Perplexity: 59.198




	Train Perplexity: 60.525
	Valid Perplexity: 56.672




	Train Perplexity: 55.699
	Valid Perplexity: 54.684




	Train Perplexity: 51.693
	Valid Perplexity: 53.228




	Train Perplexity: 48.280
	Valid Perplexity: 52.021




	Train Perplexity: 45.436
	Valid Perplexity: 51.120




	Train Perplexity: 42.862
	Valid Perplexity: 50.579




	Train Perplexity: 40.682
	Valid Perplexity: 50.574




	Train Perplexity: 37.835
	Valid Perplexity: 49.864




	Train Perplexity: 36.436
	Valid Perplexity: 49.619




	Train Perplexity: 35.310
	Valid Perplexity: 49.628




	Train Perplexity: 34.084
	Valid Perplexity: 49.006




	Train Perplexity: 33.424
	Valid Perplexity: 48.925




	Train Perplexity: 32.817
	Valid Perplexity: 48.874




	Train Perplexity: 32.294
	Valid Perplexity: 48.761




	Train Perplexity: 31.774
	Valid Perplexity: 48.819




	Train Perplexity: 31.256
	Valid Perplexity: 48.895




	Train Perplexity: 30.933
	Valid Perplexity: 48.557




	Train Perplexity: 30.850
	Valid Perplexity: 48.554




	Train Perplexity: 30.717
	Valid Perplexity: 48.430




	Train Perplexity: 30.694
	Valid Perplexity: 48.446




	Train Perplexity: 30.736
	Valid Perplexity: 48.303




	Train Perplexity: 30.692
	Valid Perplexity: 48.295




	Train Perplexity: 30.714
	Valid Perplexity: 48.250




	Train Perplexity: 30.723
	Valid Perplexity: 48.241




	Train Perplexity: 30.711
	Valid Perplexity: 48.226




	Train Perplexity: 30.707
	Valid Perplexity: 48.228




	Train Perplexity: 30.725
	Valid Perplexity: 48.228




	Train Perplexity: 30.742
	Valid Perplexity: 48.225




	Train Perplexity: 30.649
	Valid Perplexity: 48.227




	Train Perplexity: 30.731
	Valid Perplexity: 48.227




	Train Perplexity: 30.703
	Valid Perplexity: 48.226




	Train Perplexity: 30.728
	Valid Perplexity: 48.227




	Train Perplexity: 30.687
	Valid Perplexity: 48.227




	Train Perplexity: 30.679
	Valid Perplexity: 48.227




	Train Perplexity: 30.675
	Valid Perplexity: 48.227




	Train Perplexity: 30.690
	Valid Perplexity: 48.227




	Train Perplexity: 30.732
	Valid Perplexity: 48.226




	Train Perplexity: 30.680
	Valid Perplexity: 48.226




	Train Perplexity: 30.696
	Valid Perplexity: 48.226




	Train Perplexity: 30.680
	Valid Perplexity: 48.226




	Train Perplexity: 30.649
	Valid Perplexity: 48.226




	Train Perplexity: 30.682
	Valid Perplexity: 48.227




	Train Perplexity: 30.705
	Valid Perplexity: 48.226


## 6. **Testing**

In [26]:
model.load_state_dict(torch.load('best-val-lstm_lm.pt',  map_location=device))
test_loss = evaluate(model, test_data, criterion, batch_size, seq_len, device)
print(f'Test Perplexity: {math.exp(test_loss):.3f}')

Test Perplexity: 46.286


In [27]:
def generate(prompt, max_seq_len, temperature, model, tokenizer, vocab, device, seed=None):
    if seed is not None:
        torch.manual_seed(seed)
    model.eval()
    tokens = tokenizer(prompt)
    indices = [vocab[t] for t in tokens]
    batch_size = 1
    hidden = model.init_hidden(batch_size, device)
    with torch.no_grad():
        for i in range(max_seq_len):
            src = torch.LongTensor([indices]).to(device)
            prediction, hidden = model(src, hidden)

            #prediction: [batch size, seq len, vocab size]
            #prediction[:, -1]: [batch size, vocab size] #probability of last vocab

            probs = torch.softmax(prediction[:, -1] / temperature, dim=-1)
            prediction = torch.multinomial(probs, num_samples=1).item()

            while prediction == vocab['<unk>']: #if it is unk, we sample again
                prediction = torch.multinomial(probs, num_samples=1).item()

            if prediction == vocab['<eos>']:    #if it is eos, we stop
                break

            indices.append(prediction) #autoregressive, thus output becomes input

    itos = vocab.get_itos()
    tokens = [itos[i] for i in indices]
    return tokens

In [28]:
prompt = 'Harry Potter is '
max_seq_len = 30
seed = 0

#smaller the temperature, more diverse tokens but comes
#with a tradeoff of less-make-sense sentence
temperatures = [0.5, 0.7, 0.75, 0.8, 1.0]
for temperature in temperatures:
    generation = generate(prompt, max_seq_len, temperature, model, tokenizer,
                          vocab, device, seed)
    print(str(temperature)+'\n'+' '.join(generation)+'\n')

0.5
harry potter is a director of the american television series , and the first time in the american film series .

0.7
harry potter is a director of the de la salle , dean of burgundy and to the english throne .

0.75
harry potter is best known as a producer and actor .

0.8
harry potter is best known as a producer and actor .

1.0
harry potter is best known as a producer and musical singer .

