In [1]:
# Install datasets library Hugging Face
!pip install datasets

Collecting datasets
  Downloading datasets-3.5.1-py3-none-any.whl.metadata (19 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-20.0.0-cp310-cp310-win_amd64.whl.metadata (3.4 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-win_amd64.whl.metadata (13 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting aiohttp (from datasets)
  Downloading aiohttp-3.11.18-cp310-cp310-win_amd64.whl.metadata (8.0 kB)
Collecting huggingface-hub>=0.24.0 (from datasets)
  Downloading huggingface_hub-0.30.2-py3-none-any.whl.metadata (13 kB)
Collecting pyyaml>=5.1 (from datasets)
  Downloading PyYAML-6.0.2-cp310-cp310-win_amd64.whl.metadata (2.1 kB)
Collecting aiohappyeyeballs>=2.3.0 (from aiohttp->datasets)
  Downloading aiohappyeyeballs-2.6.1-py3-none-any.whl.metadata (5.9 kB)

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
from datasets import load_dataset
from collections import Counter # dictionary for counting hashable objects
from itertools import chain # takes multiple iterables and flattens them into a single sequence

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
import warnings
warnings.simplefilter(action='ignore')

In [4]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [5]:
# Load the WikiText-2 dataset from Hugging Face

# dataset will be a dictionary-like object with splits: "train", "validation", and "test"
dataset = load_dataset("wikitext", "wikitext-2-raw-v1") # (dataset group, specific version)
train_texts = dataset["train"]["text"] # extract the text column from the training set

Generating test split: 100%|██████████| 4358/4358 [00:00<00:00, 726241.68 examples/s]
Generating train split: 100%|██████████| 36718/36718 [00:00<00:00, 2447966.27 examples/s]
Generating validation split: 100%|██████████| 3760/3760 [00:00<00:00, 1250938.61 examples/s]


In [6]:
# Preprocess: basic tokenization and cleaning
tokenized = []
for line in train_texts:
    if len(line.strip()) > 0: # strip() to remove whitespaces
        tokenized.append(line.lower().split()) # convert text to lowercase and split on whitespace

flat_tokens = list(chain.from_iterable(tokenized)) # use chain to flatten one list of tokens

```
line = "Hello World!"
line.lower().split()
# Output: ['hello', 'world!']
```

```
tokenized = [['hello', 'world'], ['this', 'is', 'fine']]
flat_tokens = ['hello', 'world', 'this', 'is', 'fine']
```

In [7]:
# Build vocab
vocab_counter = Counter(flat_tokens) # count the frequency of each token in the dataset
vocab = {word: i+2 for i, (word, _) in enumerate(vocab_counter.items())} # build a dictionary mapping each word to a unique index
vocab["<pad>"] = 0 # padding: help when batching sequences of different lengths
vocab["<unk>"] = 1 # unknown words
inv_vocab = {i: w for w, i in vocab.items()} # create a reverse search dictionary so you can convert indices back to words

```
Counter(['hello', 'world', 'hello'])
# Output: {'hello': 2, 'world': 1}
```

```
flat_tokens = ["hello", "world", "hello"]

vocab = {"hello": 2, "world": 3, "<pad>": 0, "<unk>": 1}
inv_vocab = {2: "hello", 3: "world", 0: "<pad>", 1: "<unk>"}
```

In [8]:
# map each token (word) to its corresponding index from the vocab dictionary
data = [vocab.get(word, vocab["<unk>"]) for word in flat_tokens]

```
vocab = {'hello': 2, 'world': 3, '<pad>': 0, '<unk>': 1}
flat_tokens = ['hello', 'world', 'unknownword']

data = [2, 3, 1]
```

In [23]:
# Create sequences
seq_len = 7 # length of input sequences

X, y = [], []
for i in range(len(data) - seq_len):
    X.append(data[i:i+seq_len]) # input: 5-token sequence
    y.append(data[i+seq_len]) # target: next token
X = torch.tensor(X) # convert x list into PyTorch tensors
y = torch.tensor(y) # convert y list into PyTorch tensors

```
data = [2, 3, 5, 4, 6, 7], seq_len = 5

X = [[2, 3, 5, 4, 6]]
y = [7]
```

In [24]:
# DataLoader
# TensorDataset(X, y): Wraps the input/output tensors into a dataset --> (X[i], y[i])
train_loader = DataLoader(TensorDataset(X, y), batch_size=64, shuffle=True)

In [25]:
# Model
class TransformerLM(nn.Module):
    def __init__(self, vocab_size, embed_size, num_heads, num_layers):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size) # convert token indices into embedding vectors
        encoder_layer = nn.TransformerEncoderLayer(d_model=embed_size, nhead=num_heads) # create encoder layers in Transformer
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)

        self.fc = nn.Linear(embed_size, vocab_size)

    def forward(self, x):
        x = self.embedding(x) # Output shape: [batch_size, seq_len, embed_size]
        x = x.permute(1, 0, 2)  # reshaping the dimensions --> Output shape: [seq_len, batch_size, embed_size]
        x = self.transformer(x)
        x = x[-1]  # last token output (the output corresponding to the last token in the input sequence)
        x = self.fc(x)
        return x

In [26]:
# Instantiate and train
embed_size = 64
num_heads = 2
num_layers = 1

model = TransformerLM(len(vocab), embed_size, num_heads, num_layers).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
loss_fn = nn.CrossEntropyLoss()

In [27]:
epochs = 10

model.train()
for epoch in range(epochs):
    total_loss = 0
    for xb, yb in train_loader:
        xb, yb = xb.to(device), yb.to(device)

        optimizer.zero_grad()
        output = model(xb)
        loss = loss_fn(output, yb)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss / len(train_loader):.4f}")

print('Training complete!')

Epoch 1/10, Loss: 6.5609
Epoch 2/10, Loss: 6.1916
Epoch 3/10, Loss: 6.0593
Epoch 4/10, Loss: 5.9681
Epoch 5/10, Loss: 5.9032
Epoch 6/10, Loss: 5.8497
Epoch 7/10, Loss: 5.8027
Epoch 8/10, Loss: 5.7658
Epoch 9/10, Loss: 5.7289
Epoch 10/10, Loss: 5.6946
Training complete!


In [28]:
def predict_next(text, model, vocab, inv_vocab, top_k=10):
    model.eval()
    tokens = [vocab.get(w.lower(), vocab["<unk>"]) for w in text.split()]
    tokens = tokens[-seq_len:]  # keep only last `seq_len` tokens
    if len(tokens) < seq_len:
        tokens = [vocab["<pad>"]] * (seq_len - len(tokens)) + tokens
    input_tensor = torch.tensor(tokens).unsqueeze(0).to(device)

    with torch.no_grad():
        logits = model(input_tensor)
        probs = F.softmax(logits, dim=-1)
        top_probs, top_idxs = torch.topk(probs, top_k)

        print(f"\nInput: '{text}'")
        print("Top predictions:")
        for i in range(top_k):
            word = inv_vocab[top_idxs[0][i].item()]
            prob_percent = top_probs[0][i].item() * 100
            print(f"{word}: {prob_percent:.2f}%")

        total_predictions = probs.shape[-1]
        print(f"\nTotal number of possible predictions: {total_predictions}")

In [29]:
print("seq_len = 7")
predict_next("The book is very ", model, vocab, inv_vocab)

seq_len = 7

Input: 'The book is very '
Top predictions:
rich: 3.80%
similar: 3.32%
famous: 3.20%
popular: 2.68%
low: 2.63%
different: 2.36%
rare: 2.11%
large: 1.75%
difficult: 1.73%
successful: 1.64%

Total number of possible predictions: 66651
