In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
# Read the jokes CSV from a hard-coded absolute path.
# NOTE(review): the path looks environment-specific; the cell fails with
# FileNotFoundError when the file is not present at exactly this location.
csv_file = pd.read_csv('/content/sample_data/shortjokes.csv')

# Number of rows (taken from the top of the file) used for the experiment.
NUM_JOKES = 10000
# First NUM_JOKES entries of the 'Body' column, as a NumPy array.
jokes = csv_file['Body'].head(NUM_JOKES).to_numpy()
# 80/20 train/validation split; random_state=42 makes the shuffle repeatable.
train_jokes, val_jokes = train_test_split(jokes, test_size=0.2, random_state=42)
# Prints the jokes array itself (NumPy abbreviates long arrays when printing).
print(jokes)

FileNotFoundError: [Errno 2] No such file or directory: '/content/sample_data/shortjokes.csv'

In [None]:
import spacy
#from spacy.lang.fr.examples import sentences
from typing import List, Tuple
import locale
import torch
import torch.nn as nn
from torch import Tensor
def getpreferredencoding(do_setlocale = True):
    """Return the string "UTF-8" unconditionally; ``do_setlocale`` is accepted but ignored."""
    return "UTF-8"
# Replace the stdlib function so that any later call to
# locale.getpreferredencoding() reports "UTF-8".
# NOTE(review): presumably a workaround for a hosted-notebook locale problem — confirm it is still needed.
locale.getpreferredencoding = getpreferredencoding
# Uncomment to download the spaCy model loaded on the next line:
# !python -m spacy download en_core_web_md
# NOTE(review): `nlp` is not referenced anywhere else in the visible notebook — confirm it is required.
nlp = spacy.load("en_core_web_md")

In [None]:
import math
class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product self-attention.

    Queries, keys and values are each produced by a single ``d_model -> d_model``
    linear projection and then split into ``num_heads`` heads of size
    ``head_dim = d_model // num_heads``.

    :param d_model: model (embedding) dimension; must be divisible by ``num_heads``
    :param num_heads: number of attention heads (positive integer)
    :param dropout: dropout probability applied to the attention weights and
        to the projected output
    :raises ValueError: if ``num_heads`` is not positive or does not divide ``d_model``
    """

    def __init__(self, d_model, num_heads, dropout=0.1):
        super().__init__()
        # Fail early with a clear message instead of a cryptic shape error
        # inside ``view`` during the first forward pass.
        if num_heads <= 0 or d_model % num_heads != 0:
            raise ValueError(
                "d_model (%s) must be divisible by a positive num_heads (%s)"
                % (d_model, num_heads)
            )
        self.d_model = d_model
        self.num_heads = num_heads
        # d_model = head_dim * num_heads, so one linear layer per q/k/v
        # computes the projections of all heads in a single pass.
        self.head_dim = d_model // num_heads
        self.q_proj = nn.Linear(d_model, d_model)
        self.k_proj = nn.Linear(d_model, d_model)
        self.v_proj = nn.Linear(d_model, d_model)
        # output projection that mixes the concatenated heads
        self.o_proj = nn.Linear(d_model, d_model)
        self.softmax = nn.Softmax(dim=-1)
        self.out_dropout = nn.Dropout(dropout)
        self.score_dropout = nn.Dropout(dropout)

    def _split_heads(self, t, bs, seq_len):
        """Reshape bs x seq_len x d_model --> bs x num_heads x seq_len x head_dim."""
        return t.view(bs, seq_len, self.num_heads, self.head_dim).transpose(1, 2)

    def forward(self, x, mask=None):
        """
        :param x: input of size batch_size x seq_len x d_model
        :param mask: optional additive attn mask of size seq_len x seq_len
            (e.g. 0 for visible positions, -inf for hidden ones); it is
            added to the scaled scores before the softmax
        :returns: output z of size batch_size x seq_len x d_model
                attn_scores of size batch_size x num heads x seq_len x seq_len
                (the attention weights after softmax and dropout)
        """
        bs, seq_len, _ = x.size()

        q = self._split_heads(self.q_proj(x), bs, seq_len)
        k = self._split_heads(self.k_proj(x), bs, seq_len)
        v = self._split_heads(self.v_proj(x), bs, seq_len)

        # scaled dot-product scores: bs x num_heads x seq_len x seq_len
        attn_scores = (q @ k.transpose(-2, -1)) / math.sqrt(self.head_dim)
        if mask is not None:
            # Out-of-place add (no in-place mutation of a tensor in the autograd
            # graph); the mask is moved to the scores' device if necessary.
            attn_scores = attn_scores + mask.to(attn_scores.device)
        attn_scores = self.softmax(attn_scores)
        attn_scores = self.score_dropout(attn_scores)

        output = attn_scores @ v

        # combine heads
        # bs x num_heads x seq_len x head_dim  --> bs x seq_len x d_model
        output = output.transpose(1, 2).contiguous()
        output = output.view(bs, seq_len, self.d_model)

        # output projection (W_O) followed by dropout
        output = self.out_dropout(self.o_proj(output))

        return output, attn_scores

In [None]:
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal positional encodings to a batch of embeddings.

    Even feature columns hold ``sin(position * freq)`` and odd columns hold
    ``cos(position * freq)``. The table is precomputed for ``max_len``
    positions and stored as the (non-trainable) buffer ``pe`` of shape
    ``1 x max_len x d_model``.

    :param d_model: embedding dimension (even or odd)
    :param dropout: dropout probability applied after adding the encodings
    :param max_len: number of positions precomputed; inputs passed to
        ``forward`` must not be longer than this
    """

    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len).unsqueeze(1).float()
        div_term = torch.exp(
            torch.arange(0, d_model, 2).float() * -(math.log(10000.0) / d_model)
        )
        angles = position * div_term  # max_len x ceil(d_model / 2)
        pe[:, 0::2] = torch.sin(angles)
        # For odd d_model there is one fewer odd column than even columns,
        # so only the first d_model // 2 frequencies are used for the cosines.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])
        pe = pe.unsqueeze(0)
        self.register_buffer("pe", pe)

    def forward(self, x):
        """
        :param x: embeddings of size batch_size x seq_len x d_model
        :returns: embeddings + positional encodings (then dropout), same shape
        """
        # embeddings + positional embeddings for the first seq_len positions
        x = x + self.pe[:, :x.size(1)]
        return self.dropout(x)

In [None]:
class FeedForward(nn.Module):
    """Position-wise feed-forward block: Linear -> ReLU -> Linear -> Dropout.

    :param d_model: input and output feature size
    :param d_ff: hidden size; any falsy value (None, 0) selects 4 * d_model
    :param dropout: dropout probability applied to the block's output
    """

    def __init__(self, d_model, d_ff=None, dropout=0.1):
        super().__init__()
        hidden_size = d_ff
        if not hidden_size:
            # default hidden width
            hidden_size = 4 * d_model
        self.d_ff = hidden_size
        self.w_1 = nn.Linear(d_model, hidden_size)
        self.w_2 = nn.Linear(hidden_size, d_model)
        self.activation = nn.ReLU()
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """
        :param x: input of size batch_size x seq_len x d_model
        :returns: FFN representations, same shape as input
        """
        hidden = self.w_1(x)
        hidden = self.activation(hidden)
        projected = self.w_2(hidden)
        return self.dropout(projected)

In [None]:
class TransformerLayer(nn.Module):
    """One transformer block: self-attention and feed-forward sub-layers.

    Each sub-layer is wrapped in a residual connection followed by LayerNorm
    (normalisation is applied after the residual sum).

    :param d_model: feature size of the layer's input and output
    :param num_heads: number of attention heads passed to MultiHeadAttention
    :param d_ff: hidden size passed to FeedForward
    :param dropout: dropout probability passed to both sub-layers
    """

    def __init__(self, d_model, num_heads, d_ff=None, dropout=0.1):
        super().__init__()
        self.d_model = d_model
        self.mha = MultiHeadAttention(d_model=d_model, num_heads=num_heads, dropout=dropout)
        self.ffn = FeedForward(d_model=d_model, d_ff=d_ff, dropout=dropout)
        # one LayerNorm per sub-layer
        self.norm_attn = nn.LayerNorm(d_model)
        self.norm_ffn = nn.LayerNorm(d_model)

    def forward(self, x, mask=None):
        """
        :param x: input of size batch_size x seq_len x d_model
        :param mask: optional attention mask forwarded to the attention module
        :returns: (layer output with the same shape as x, attention scores
            returned by the attention module)
        """
        attn_output, attn_scores = self.mha(x, mask)
        # residual + norm around attention
        attended = self.norm_attn(x + attn_output)
        # residual + norm around the feed-forward network
        ffn_output = self.ffn(attended)
        layer_output = self.norm_ffn(attended + ffn_output)
        return layer_output, attn_scores

In [None]:
# Passed as `max_length` to get_sequences and stored as "max_len" in the
# hyperparameter dict; also the sequence length of the random smoke-test input.
MAX_LENGTH = 250
# Batch size used by both DataLoaders.
BATCH_SIZE = 32

# Pretrained tokenizer used by `tokenize` below to split text into tokens.
# NOTE(review): from_pretrained presumably fetches the files on first use — confirm availability offline.
from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def tokenize(text):
    """Split ``text`` into tokens by delegating to the module-level ``tokenizer``."""
    tokens = tokenizer.tokenize(text)
    return tokens

class Vocabulary:
    """
    Helper class that maps tokens to unique indices and the other way around.

    The vocabulary is the set of distinct tokens produced by ``tokenize`` over
    all ``samples``. Tokens are sorted before indices are assigned, so the
    same samples always yield the same token <-> index mapping (iterating a
    ``set`` of strings directly gives an order that can change between
    interpreter runs).

    Attributes:
        token_to_idx: dict mapping each token to its integer index
        idx_to_token: dict mapping each integer index back to its token
    """
    def __init__(self, samples: List[str]):
        vocabulary = set()
        for sample in samples:
            vocabulary.update(tokenize(sample))

        # Deterministic ordering: index i is the i-th token in sorted order.
        ordered_tokens = sorted(vocabulary)
        self.token_to_idx = {token: idx for (idx, token)
                             in enumerate(ordered_tokens)}
        self.idx_to_token = dict(enumerate(ordered_tokens))

    def size(self):
        """Return the number of distinct tokens in the vocabulary."""
        return len(self.token_to_idx)

    def __str__(self):
        """Return the string form of the token -> index dictionary."""
        return str(self.token_to_idx)

# Build the vocabulary from the full `jokes` array (i.e. before the
# train/validation split), so every token of both splits has an index.
vocab = Vocabulary(jokes)
print("Vocabulary size: ", vocab.size())
# NOTE(review): this prints the whole token -> index dictionary, which can be very long.
print("Vocabulary: \n", vocab)

In [None]:
def text_to_tensor(text: str, vocab: "Vocabulary") -> torch.LongTensor:
    """
    Convert a string to a 1-D Tensor with the corresponding token indices
    e.g. "We have" -> [48, 13]

    :param text: raw text; it is split with ``tokenize``
    :param vocab: vocabulary providing the ``token_to_idx`` mapping
    :returns: int64 tensor with one index per token (empty if there are no tokens)
    :raises KeyError: if a token is not present in ``vocab``
    """
    token_ids = [vocab.token_to_idx[token] for token in tokenize(text)]
    # Explicit dtype: torch.tensor([]) would otherwise default to float32,
    # which breaks the integer-index contract for texts with no tokens.
    return torch.tensor(token_ids, dtype=torch.long)

def tensor_to_text(x: torch.LongTensor, vocab: "Vocabulary") -> str:
    """
    Convert a Tensor of token indices to its string representation
    e.g. [48, 13] -> "We have"

    Tokens are joined with single spaces. A token starting with "##" (the
    WordPiece continuation marker) is glued onto the preceding token without
    the marker, so word pieces are merged back into one word.

    :param x: 1-D tensor of token indices
    :param vocab: vocabulary providing the ``idx_to_token`` mapping
    :returns: the reconstructed text ("" for an empty tensor)
    :raises KeyError: if an index is not present in ``vocab``
    """
    words = []
    for idx in x.tolist():
        token = vocab.idx_to_token[idx]
        if token.startswith("##") and words:
            # continuation piece: append to the previous word
            words[-1] += token[2:]
        else:
            words.append(token)
    return " ".join(words)

In [None]:
def get_sequences(jokes, vocab, max_length=250):
    """
    Convert each joke to a tensor of token indices.

    :param jokes: iterable of strings
    :param vocab: vocabulary passed to ``text_to_tensor``
    :param max_length: maximum number of tokens kept per joke; longer
        sequences are truncated (``None`` keeps every token)
    :returns: list with one 1-D index tensor per joke, in input order
    """
    sequences = []
    for joke in jokes:
        tokens = text_to_tensor(joke, vocab)
        # Honour max_length (it was previously accepted but never applied).
        sequences.append(tokens[:max_length])
    return sequences

# Convert the train and validation jokes to lists of token-index tensors.
train_sequences = get_sequences(train_jokes, vocab, max_length=MAX_LENGTH)
val_sequences = get_sequences(val_jokes, vocab, max_length=MAX_LENGTH)

# Report how many sequences each split contains.
print("Train sequences:", len(train_sequences))
print("Val sequences:", len(val_sequences))


In [None]:
from torch.utils.data import Dataset, DataLoader
class StoriesDataset(Dataset):
    """Dataset of token-index sequences for next-token prediction.

    Item ``i`` is the pair ``(sequence[:-1], sequence[1:])``: the label at
    each position is the token that follows the corresponding input token.
    """

    def __init__(self, sequences):
        # sequences: indexable collection of token-index sequences
        self.data = sequences

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        tokens = torch.LongTensor(self.data[idx])
        # inputs drop the last token; labels are the same tokens shifted by one
        return tokens[:-1], tokens[1:]

from torch.nn.utils.rnn import pad_sequence
def collate_fn(batch, pad_token_id=0, label_pad_id=-100):
    """
    Pad a list of (input, label) pairs to the length of the longest sequence.

    :param batch: list of (input, label) pairs of 1-D index tensors
    :param pad_token_id: value used to right-pad the inputs
    :param label_pad_id: value used to right-pad the labels. The default -100
        is the default ``ignore_index`` of ``nn.CrossEntropyLoss``, so padded
        positions do not contribute to the loss (padding labels with 0 would
        make them indistinguishable from the real token with index 0).
    :returns: (inputs_padded, labels_padded), both batch_size x max_seq_len
    """
    inputs, labels = zip(*batch)  # list of (input, label)
    inputs_padded = pad_sequence(inputs, batch_first=True, padding_value=pad_token_id)
    labels_padded = pad_sequence(labels, batch_first=True, padding_value=label_pad_id)
    return inputs_padded, labels_padded


In [None]:
from torch.utils.data import DataLoader

# Wrap the token-index sequences in Dataset objects. These names were used
# below without ever being defined, which raised NameError on a fresh kernel.
train_dataset = StoriesDataset(train_sequences)
val_dataset = StoriesDataset(val_sequences)

# Training batches are reshuffled every epoch; validation order is fixed.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, collate_fn=collate_fn, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, collate_fn=collate_fn)


In [None]:
# Sanity check: pull one batch from the training loader and show its shapes.
x, y = next(iter(train_loader))
print('Batch input shape:', x.shape)
print('Batch label shape:', y.shape)


In [None]:
# Hyperparameters shared by the model construction and the Trainer.
# In the visible notebook, "embedding_size" and "max_len" are read when the
# model is built, and "learning_rate", "num_epochs", "max_len", "batch_size"
# and "max_grad_norm" are read by Trainer.
# NOTE(review): "learning_algo" and "num_layers" are not read anywhere in the
# visible code (the model is built with a literal 6 layers) — confirm intent.
_hyperparameters_dict = {
    "batch_size": BATCH_SIZE, # =32
    "num_epochs": 3,
    "max_len": MAX_LENGTH, # =250
    "embedding_size": 256,
    "learning_algo": "adam",
    "learning_rate": 1e-4,
    "max_grad_norm": 5.0,
    "num_layers": 6
}
# Use the GPU when one is available, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [None]:
class TransformerLM(nn.Module):
    """Causal (left-to-right) transformer language model.

    Pipeline: token embedding scaled by sqrt(d_model) -> positional encoding
    -> ``num_layers`` TransformerLayer blocks with a causal mask -> LayerNorm
    -> linear projection to vocabulary logits. The embedding matrix and the
    output projection share one weight tensor (weight tying).

    :param vocab_size: number of tokens in the vocabulary
    :param d_model: embedding / hidden dimension
    :param dropout_p: dropout probability used throughout the model
    :param seq_len: stored on the instance as ``self.seq_len``; the forward
        pass uses the actual length of its input instead
    :param num_layers: number of stacked transformer layers
    :param num_heads: attention heads per layer (default 8, the value that
        was previously hard-coded); must divide ``d_model``
    """

    def __init__(self, vocab_size: int, d_model: int,
                 dropout_p: float, seq_len: int, num_layers: int,
                 num_heads: int = 8):
        super().__init__()
        self.name = 'transformer'
        self.vocab_size = vocab_size
        self.d_model = d_model
        self.dropout = nn.Dropout(dropout_p)
        self.seq_len = seq_len
        self.num_layers = num_layers
        self.num_heads = num_heads
        self.norm = nn.LayerNorm(d_model)

        # token embeddings: vocab_size x d_model
        self.embedding = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=d_model
        )

        self.positional_encoding = PositionalEncoding(
            d_model=d_model,
            dropout=dropout_p
        )

        # stack num_layers transformer layers
        self.transformer = nn.ModuleList([
            TransformerLayer(d_model=d_model,
                             num_heads=num_heads,
                             dropout=dropout_p)
            for _ in range(num_layers)
        ])

        # output layer: d_model --> vocab_size logits
        self.output_layer = nn.Linear(
            in_features=d_model,
            out_features=vocab_size
        )

        # Weight tying: both weights have shape vocab_size x d_model, so the
        # embedding reuses the output layer's parameter.
        # https://paperswithcode.com/method/weight-tying
        self.embedding.weight = self.output_layer.weight

    def forward(self, x: torch.LongTensor) -> torch.FloatTensor:
        """
        :param x: token indices of size batch_size x seq_len
        :return: logits of size batch_size x seq_len x vocab_size
        """
        seq_len = x.size(1)
        # compute embeddings, then multiply them by sqrt(d_model)
        x = self.embedding(x) * math.sqrt(self.d_model)
        # add positional_encodings
        x = self.positional_encoding(x)

        # Causal mask for the sequence (masked positions are filled with -inf).
        # It is placed on the same device as the activations rather than on a
        # notebook-level global, so the model works wherever its input lives.
        mask = nn.Transformer.generate_square_subsequent_mask(seq_len).to(x.device)

        for layer in self.transformer:
            # per-layer attention scores are not used by the language model
            x, _ = layer(x, mask=mask)

        # apply layernorm to final transformer output
        x = self.norm(x)

        # project to vocabulary logits
        logits = self.output_layer(x)

        return logits

In [None]:
# Smoke test: one random sequence of MAX_LENGTH token ids, shape (1, MAX_LENGTH).
x = torch.randint(0, vocab.size(), (1, MAX_LENGTH)).to(device)
# Build the language model and move it to the selected device.
model = TransformerLM(
    d_model = _hyperparameters_dict["embedding_size"],
    seq_len = _hyperparameters_dict["max_len"],
    num_layers = 6,
    dropout_p = 0.1,
    vocab_size = vocab.size()
).to(device)
# Run a single forward pass and print the shape of the resulting logits.
output = model(x)
print(output.shape)

In [None]:
# Total number of scalar parameters that require gradients.
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print('Trainable params', trainable_params)

In [None]:
from torch.optim import SGD, Adam

class Trainer:
    """Train and evaluate a next-token-prediction model.

    The trainer iterates over its own ``train_loader`` / ``val_loader`` and
    moves every batch to the device of the model's parameters, so it does not
    depend on notebook-level globals.

    :param model: module mapping batch x seq_len token ids to
        batch x seq_len x vocab_size logits
    :param train_loader: iterable of (inputs, labels) training batches
    :param val_loader: iterable of (inputs, labels) validation batches
    :param vocab: vocabulary object (stored on the instance)
    :param hyperparams: dict with keys 'learning_rate', 'num_epochs',
        'max_len', 'batch_size' and 'max_grad_norm'
    :param num_train_examples: number of training examples (progress logging)
    :param num_val_examples: number of validation examples (stored only)
    """

    def __init__(self, model: nn.Module,
                 train_loader: DataLoader,
                 val_loader: DataLoader,
                 vocab: "Vocabulary",
                 hyperparams: dict,
                 num_train_examples: int,
                 num_val_examples: int):
        self.model = model
        self.train_loader = train_loader
        self.val_loader = val_loader
        self.vocab = vocab
        self.optimizer = Adam(params = self.model.parameters(),
                              lr = hyperparams['learning_rate'])
        self.num_epochs = hyperparams['num_epochs']
        self.max_len = hyperparams['max_len']
        self.batch_size = hyperparams['batch_size']
        self.max_grad_norm = hyperparams['max_grad_norm']
        self.num_train_examples = num_train_examples
        self.num_val_examples = num_val_examples
        # Loss for the output layer. With PyTorch's default ignore_index
        # (-100), targets equal to -100 are excluded from the loss.
        self.loss_fn = nn.CrossEntropyLoss()

    def _model_device(self):
        """Return the device of the model's first parameter (CPU if it has none)."""
        first_param = next(self.model.parameters(), None)
        return first_param.device if first_param is not None else torch.device("cpu")

    def _batch_loss(self, x, y):
        """Run the model on one batch and return the scalar cross-entropy loss."""
        target_device = self._model_device()
        x, y = x.to(target_device), y.to(target_device)
        # unnormalized probabilities: batch x seq_len x vocab_size
        logits = self.model(x)
        # Next-token predictions are made for every position of every
        # sequence, so the first two dimensions are flattened:
        #   logits: batch x seq_len x vocab_size --> (batch*seq_len) x vocab_size
        #   labels: batch x seq_len              --> batch*seq_len
        return self.loss_fn(
            logits.reshape(-1, logits.size(-1)),
            y.reshape(-1)
        )

    def train_epoch(self, epoch_num: int) -> float:
        """
        Run one optimisation pass over the training set.
        :param epoch_num: number of current epoch (used for logging only)
        :returns: mean batch loss over the epoch (0.0 if the loader is empty)
        """
        self.model.train()
        epoch_loss = 0.0
        num_batches = 0
        for batch_num, (x, y) in enumerate(self.train_loader):
            # reset gradients
            self.optimizer.zero_grad()

            batch_loss = self._batch_loss(x, y)
            epoch_loss += batch_loss.item()
            num_batches += 1

            # backpropagation (gradient of loss wrt parameters)
            batch_loss.backward()

            # clip gradients if they get too large
            torch.nn.utils.clip_grad_norm_(list(self.model.parameters()),
                                           self.max_grad_norm)

            # update parameters
            self.optimizer.step()

            if batch_num % 100 == 0:
                # cap the counter so the last (partial) batch cannot overshoot
                examples_seen = min((batch_num + 1) * self.batch_size,
                                    self.num_train_examples)
                print("epoch %d, %d/%d examples, batch loss = %f"
                      % (epoch_num, examples_seen,
                         self.num_train_examples, batch_loss.item()))

        # guard against an empty loader (no batches were seen)
        return epoch_loss / num_batches if num_batches else 0.0

    def eval_epoch(self, epoch_num: int) -> float:
        """
        Compute the loss on the validation set (no parameter updates).
        :param epoch_num: number of current epoch (unused; kept for symmetry)
        :returns: mean batch loss over the validation set (0.0 if the loader is empty)
        """
        epoch_loss = 0.0
        num_batches = 0
        self.model.eval()
        with torch.no_grad():
            for x, y in self.val_loader:
                epoch_loss += self._batch_loss(x, y).item()
                num_batches += 1

        return epoch_loss / num_batches if num_batches else 0.0

    def train(self) -> dict:
        """
        Train for ``num_epochs`` epochs, evaluating after each one.
        :returns: dict with per-epoch lists "train_losses" and "val_losses"
        """
        train_losses, val_losses = [], []
        for epoch in range(self.num_epochs):
            epoch_train_loss = self.train_epoch(epoch)
            epoch_val_loss = self.eval_epoch(epoch)
            train_losses.append(epoch_train_loss)
            val_losses.append(epoch_val_loss)
        return {"train_losses": train_losses,
                "val_losses": val_losses}

def plot_losses(metrics: dict):
    """
    Plot training and validation losses on one figure.

    :param metrics: dictionary with the keys 'train_losses' and 'val_losses',
        each a sequence with one loss value per epoch
    """
    # Imported locally: the notebook used `plt` without ever importing
    # matplotlib, so calling this function raised NameError.
    import matplotlib.pyplot as plt

    plt.figure()
    plt.plot(metrics['train_losses'], c='b', label='Train')
    plt.plot(metrics['val_losses'], c='g', label='Valid')
    plt.ylabel('Loss')
    # one point per epoch, so the x axis counts epochs
    plt.xlabel('Epoch')
    plt.legend()
    plt.show()

In [None]:
# Train the transformer for the number of epochs configured in
# _hyperparameters_dict ("num_epochs", 3 here); should take about 10-15 mins.
trainer = Trainer(
    model=model,
    train_loader=train_loader,
    val_loader=val_loader,
    vocab=vocab,
    hyperparams= _hyperparameters_dict,
    num_train_examples=len(train_sequences),
    num_val_examples=len(val_sequences)
)
# `metrics` holds the per-epoch "train_losses" and "val_losses" lists.
metrics = trainer.train()
plot_losses(metrics)