Language modeling task  
Assign a probablity for the likelihood of a given word (or a sequence of words)  
to follow a sequence of words. A sequence of tokens are passed to the embedding  
layer first, followed by a positional encoding layer to account for the order of  
the word.  
  
The nn.TransformerEncoder consists of multiple layers of nn.TransformerEncoderLayer.   
Along with the input sequence, a square attention mask is required because the self-  
attention layers in nn.TransformerDecoder are only allowed to attend the earlier  
positions in the sequence. For the language modeling task, any tokens on the future  
positions should be masked. This masking, combined with fact that the output embeddings  
are offset with later positions ensures that the predicttions for position I can depend  
only on the known outputs at positions less than i. To produce a probability  
distribution over output words, the output of the nn.TransformerEncoder model is passed  
through a linear layer to output unnormalized logits. The log-softmax function isn't  
applied here due to th elater use of CrossEntropyLoss, which requires the inputs to be  
unnormalized logits.


In [7]:
# Define the model
import math
import os
from tempfile import TemporaryDirectory
from typing import Tuple

import torch
from torch import nn, Tensor
from torch.nn import TransformerEncoder, TransformerEncoderLayer
from torch.utils.data import dataset
from torchtext.datasets import WikiText2
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

device = torch.device( 'cuda' if torch.cuda.is_available() else 'cpu' )

class PositionalEncoding( nn.Module ):

  def __init__( self, d_model: int, dropout: float = 0.1, max_len: int = 5000 ):
    super().__init__()
    self.dropout = nn.Dropout( p = dropout )

    position = torch.arange( max_len ).unsqueeze( 1 )
    div_term = torch.exp( torch.arange( 0, d_model, 2 ) * ( -math.log( 10000.0 ) / d_model ) )
    pe = torch.zeros( max_len, 1, d_model )
    pe[ :, 0, 0::2 ] = torch.sin( position * div_term )
    pe[ :, 0, 1::2 ] = torch.cos( position * div_term )
    self.register_buffer( 'pe', pe )

  def forward( self, x: Tensor ) -> Tensor:
    """
    Arguments: 
      x: Tensor, shape ``[seq_len, batch_size, embedding_dim]``
    """
    x = x + self.pe[ :x.size(0) ]
    return self.dropout( x )

class TransformerModel(nn.Module):

  def __init__( self, ntoken: int, d_model: int, nhead: int, d_hid: int, 
                nlayers: int, dropout: float = 0.5 ):
    super().__init__()
    self.model_type = 'Transformer'
    self.pos_encoder = PositionalEncoding( d_model, dropout )
    encoder_layers = TransformerEncoderLayer( d_model, nhead, d_hid, dropout )
    self.transformer_encoder = TransformerEncoder( encoder_layers, nlayers )
    self.embedding = nn.Embedding( ntoken, d_model )
    self.d_model = d_model
    self.linear = nn.Linear( d_model, ntoken )

    self.init_weights()

  def init_weights( self ) -> None:
    initrange = 0.1
    self.embedding.weight.data.uniform_( -initrange, initrange )
    self.linear.bias.data.zero_()
    self.linear.weight.data.uniform_( -initrange, initrange )

  def forward( self, src: Tensor, src_mask: Tensor = None ) -> Tensor:
    """
    Arguments:
      src: Tensor, shape ``[seq_len, batch_size]``
      src_mask: Tensor, shape ``[seq_len, seq_len]``

    Returns:
      output Tensor of shape ``[seq_len, batch_size, ntoken]``
    """
    src = self.embedding( src ) * math.sqrt( self.d_model )
    src = self.pos_encoder( src )
    if src_mask is None:
      """
      Generate a square causal mask for the sequence. The masked positions are filled with float('-inf').
      Unmasked positions are filled with float(0.0).
      """
      src_mask = nn.Transformer.generate_square_subsequent_mask( len(src)).to(device)

    output = self.transformer_encoder( src, src_mask )
    output = self.linear( output )
    return output

In [None]:
#class PositionalEncoding( nn.Module ):
#
#  def __init__( self, d_model: int, dropout: float = 0.1, max_len: int = 5000 ):
#    super().__init__()
#    self.dropout = nn.Dropout( p = dropout )
#
#    position = torch.arange( max_len ).unsqueeze( 1 )
#    div_term = torch.exp( torch.arange( 0, d_model, 2 ) * ( -math.log( 10000.0 ) / d_model ) )
#    pe = torch.zeros( max_len, 1, d_model )
#    pe[ :, 0, 0::2 ] = torch.sin( position * div_term )
#    pe[ :, 0, 1::2 ] = torch.cos( position * div_term )
#    self.register_buffer( 'pe', pe )
#
#  def forward( self, x: Tensor ) -> Tensor:
#    """
#    Arguments: 
#      x: Tensor, shape ``[seq_len, batch_size, embedding_dim]``
#    """
#    x = x + self.pe[ :x.size(0) ]
#    return self.dropout( x )

In [8]:
train_iter = WikiText2( split = 'train' )
tokenizer = get_tokenizer( 'basic_english' )
vocab = build_vocab_from_iterator( 
  map( 
    tokenizer, train_iter 
  ), 
  specials = [   '<unk>' ] 
)
vocab.set_default_index( vocab[ '<unk>' ] )

def data_process( raw_text_iter: dataset.IterableDataset ) -> Tensor: 
  """Converts raw text into a flat Tensor."""
  data = [ torch.tensor( vocab( tokenizer( item ) ), dtype = torch.long ) for item in raw_text_iter ]
  return torch.cat( tuple( filter( lambda t: t.numel() > 0, data ) ) )

# ``train_iter`` was "consumed" by the process of building the vocab, 
# so we have to create it again
train_iter, val_iter, test_iter = WikiText2()
train_data = data_process( train_iter )
val_data = data_process( val_iter )
test_data = data_process( test_iter )

def batchify( data: Tensor, bsz: int ) -> Tensor:
  """Divides the data into ``bsz``separate sequences, removing extra elements that wouldn't cleanly fit.

  Arguments:
    data: Tensor, shape ``[N]``
    bsz: int, batch size

  Returns:
    Tensor of shape ``[N // bsz, bsz]``
  """
  seq_len = data.size( 0 ) // bsz
  data = data[ :seq_len * bsz ]
  data = data.view( bsz, seq_len ).t().contiguous()
  return data.to(device)

batch_size = 20
eval_batch_size = 10
train_data = batchify( train_data, batch_size ) # shape ``[seq_len, batch_size]``
val_data = batchify( val_data, eval_batch_size )
test_data = batchify( test_data, eval_batch_size )

In [9]:
bptt = 35
def get_batch( source: Tensor, i: int ) -> Tuple[ Tensor, Tensor ]:
  """
  Args:
    source: Tensor, shape ``[seq_len, batch_size]``
    i: int

  Returns: 
    tuple (data, target), where data has shape ``[seq_len, batch_size]`` and target has shape ``[seq_len * batch_size]``
  """
  seq_len = min( bptt, len( source ) -1 - i )
  data = source[ i:i+seq_len ]
  target = source[ i+1:i+1+seq_len ].reshape( -1 )
  return data, target

In [10]:
ntokens = len(vocab) # size of vocabulary
emsize = 200 # embedding dimension
d_hid = 200 # dimension of the feedforward network model in nn.TransformerEncoder
nlayers = 2 # number of nn.TransformerEncoderLayer in nn.TransformerEncoder
nhead = 2 # number of heads in nn.MultiheadAttention
dropout = 0.2 # dropout probability
model = TransformerModel( ntokens, emsize, nhead, d_hid, nlayers, dropout ).to(device)



In [15]:
import time 

criterion = nn.CrossEntropyLoss()
lr = 5.0 # learning rate
optimizer = torch.optim.SGD( model.parameters(), lr = lr )
scheduler = torch.optim.lr_scheduler.StepLR( optimizer, 1.0, gamma = 0.95 )

def train( model: nn.Module ) -> None:
  model.train() # Turn on the train mode
  total_loss = 0.
  log_interval = 200
  start_time = time.time()

  num_batches = len( train_data ) // bptt
  for batch, i in enumerate( range( 0, train_data.size( 0 ) - 1, bptt ) ):
    data, targets = get_batch( train_data, i )
    output = model( data )
    output_flat = output.view( -1, ntokens )
    loss = criterion( output_flat, targets )

    optimizer.zero_grad()
    loss.backward()
    torch.nn.utils.clip_grad_norm_( model.parameters(), 0.5 )
    optimizer.step()

    total_loss += loss.item()
    if batch % log_interval == 0 and batch > 0:
      lr = scheduler.get_last_lr()[0]
      ms_per_batch = ( time.time() - start_time ) * 1000 / log_interval
      cur_loss = total_loss / log_interval
      ppl = math.exp( cur_loss )
      print(f'| epoch {epoch:3d} | {batch:5d}/{num_batches:5d} batches | '
            f'lr {lr:02.2f} | ms/batch {ms_per_batch:5.2f} | '
            f'loss {cur_loss:5.2f} | ppl {ppl:8.2f}')
      total_loss = 0
      start_time = time.time()

def evaluate( model: nn.Module, eval_data: Tensor ) -> float:
  model.eval() # Turn on the evaluation mode
  total_loss = 0
  with torch.no_grad():
    for i in range( 0, eval_data.size(0) -1, bptt ):
      data, targets = get_batch( eval_data, i )
      seq_len = data.size( 0 )
      output = model( data )
      output_flat = output.view( -1, ntokens )
      total_loss += seq_len * criterion( output_flat, targets ).item()
  return total_loss / ( len( eval_data ) - 1 )



In [16]:
best_val_loss = float('inf')
epochs = 3

with TemporaryDirectory() as tempdir:
  best_model_params_path = os.path.join( tempdir, 'best_model_params.pt' )

  for epoch in range( 1, epochs + 1 ):
    epoch_start_time = time.time()
    train( model )
    val_loss = evaluate( model, val_data )
    val_ppl = math.exp( val_loss )
    elapsed = time.time() - epoch_start_time
    print( '-' * 89 )
    print( f'| end of epoch {epoch:3d} | time: {elapsed:5.2f}s | '
          f'valid loss {val_loss:5.2f} | valid ppl {val_ppl:8.2f}' )
    print( '-' * 89 )

    if val_loss < best_val_loss:
      best_val_loss = val_loss
      torch.save( model.state_dict(), best_model_params_path )

    scheduler.step()

model.load_state_dict( torch.load( best_model_params_path ) )

| epoch   1 |   200/ 2928 batches | lr 5.00 | ms/batch  7.93 | loss  6.76 | ppl   859.52
| epoch   1 |   400/ 2928 batches | lr 5.00 | ms/batch  7.75 | loss  6.53 | ppl   688.61
| epoch   1 |   600/ 2928 batches | lr 5.00 | ms/batch  7.73 | loss  6.27 | ppl   527.78
| epoch   1 |   800/ 2928 batches | lr 5.00 | ms/batch  7.74 | loss  6.20 | ppl   493.28
| epoch   1 |  1000/ 2928 batches | lr 5.00 | ms/batch  7.71 | loss  6.11 | ppl   451.24
| epoch   1 |  1200/ 2928 batches | lr 5.00 | ms/batch  7.76 | loss  6.10 | ppl   445.94
| epoch   1 |  1400/ 2928 batches | lr 5.00 | ms/batch  7.72 | loss  6.07 | ppl   433.94
| epoch   1 |  1600/ 2928 batches | lr 5.00 | ms/batch  7.67 | loss  6.07 | ppl   431.96
| epoch   1 |  1800/ 2928 batches | lr 5.00 | ms/batch  7.61 | loss  5.99 | ppl   399.64
| epoch   1 |  2000/ 2928 batches | lr 5.00 | ms/batch  7.67 | loss  5.99 | ppl   400.09
| epoch   1 |  2200/ 2928 batches | lr 5.00 | ms/batch  7.76 | loss  5.87 | ppl   355.75
| epoch   1 |  2400/ 

FileNotFoundError: [Errno 2] No such file or directory: '/tmp/tmpz7bnhj2d/best_model_params.pt'

In [None]:
test_loss = evaluate( model, test_data )
test_ppl = math.exp( test_loss )
print( '=' * 89 )
print( f'| End of training | test loss {test_loss:5.2f} | '
      f'test ppl {test_ppl:8.2f}' )
print( '=' * 89 )