Transformer model with pytorch

In [13]:
import torch
import torch.nn as nn
import torch.optim as optim                         # Adam optimizer
import torch.nn.functional as F                     # Softmax function
from torch.utils.data import DataLoader, Dataset    # Loading batches
import torch.nn.utils.rnn as rnn_utils              # Padding the sequence
from torch.optim.lr_scheduler import OneCycleLR                  # Learning rate scheduler
from transformers import AutoTokenizer              # BPE Tokenizer
import pandas as pd
import numpy as np
import math

In [14]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [15]:
all_slogans = pd.read_csv('all_slogans.csv', sep=';')
slogans = all_slogans['slogan']
slogans = slogans.str.lower()

# reducing invaluable tokens
to_remove = ['\n', '\r', '>', '\x80', '\x93', '\x94', '\x99', '\x9d', '\xa0',
             '¦', '®', '°', 'º', '¼', '½','×', 'â', 'ã', 'è', 'é', 'ï', 'ñ', 'ú', 'ü',
             '⁄', '（', '）', '，', '·']

dict_to_remove = {"’" : "'", "‘" : "'", "“" : '"', "”" : '"',
                  "…" : '...', '—': '-', '–': '-'}


# removing useless toknes
for char in to_remove:
    slogans = slogans.str.replace(char, ' ')

# replacing tokens with normalised versions
for key, value in dict_to_remove.items():
    slogans = slogans.str.replace(key, value)


In [None]:
# BPE tokenizer for bert
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Tokenizing the dataset
encoded_slogans = tokenizer.batch_encode_plus(
    slogans.tolist(),
    add_special_tokens=True, # <BoS> and <EoS>
    padding=True,            # Pad for same seq_length
    truncation=True,         # Truncate to max length
    return_tensors='pt'      # Torch datatype
)

# Focusing only on tokens
encoded_slogans = encoded_slogans['input_ids']

# test example
encoded_slogans.shape
tokenizer.decode(encoded_slogans[2])

'[CLS] the way i like to travel [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]'

In [18]:
# define hyperparameters
vocab_size = tokenizer.vocab_size
d_model = 384 # dim of the embedding vector               # TO CHANGE
nhead = 8 # number of attention heads
num_decoder_layers = 3 # number of decoder layers
dim_feedforward = 2048 # feed-forward network dimension
max_seq_length = 20                                       # TO CHANGE 
batch_size = 128
dropout = 0.1
PAD_TOKEN = tokenizer.pad_token_id


In [29]:
class SloganDataset(Dataset):
    def __init__(self, encoded_slogans, max_seq_length=20):
        self.encoded_slogans = encoded_slogans ### CHANGE TO ENCODED SLOGANS
        self.max_seq_length = max_seq_length
        
    def __len__(self):
        return len(self.encoded_slogans)
    
    def __getitem__(self, idx):
        slogan = self.encoded_slogans[idx]
        
        # Truncate if slogan is too long
        if len(slogan) > self.max_seq_length:
            slogan = slogan[:self.max_seq_length]     

        input_sequence = slogan[:-1]
        target_sequence = slogan[1:]
        return input_sequence, target_sequence
    


# Test with subset of slogans
subset_encoded_slogans = encoded_slogans
dataset = SloganDataset(subset_encoded_slogans)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)


### Building Positional Encoding and masks

In [60]:
# Sinusoidal positional encoding
def positional_encoding(seq_len, embed_dim):
    pe = torch.zeros(seq_len, embed_dim)
    for pos in range(seq_len):
        for i in range(0, embed_dim, 2):
            pe[pos, i] = math.sin(pos / (10000 ** (2 * i / embed_dim)))
            pe[pos, i + 1] = math.cos(pos / (10000 ** (2 * i / embed_dim)))
    return pe.unsqueeze(0) # Output for batch_dim propagation

# Generate padding mask to prevent looking at not used tokens
def generate_padding_mask(sequence, pad_token=0):
    mask = (sequence == pad_token).float()
    mask = mask.masked_fill(mask == 1, float('-inf')).masked_fill(mask == 0, float(0.0))
    return mask

# Generate look ahead mask to prevent looking at future tokens
def generate_look_ahead_mask(size):
    mask = torch.triu(torch.ones(size, size) * float('-inf'), diagonal=1)
    return mask



In [66]:
class TransformerModel(nn.Module):
    def __init__(self, vocab_size, d_model, nhead, num_decoder_layers, 
                 dim_feedforward, max_seq_length):
        super(TransformerModel, self).__init__()
        
        # Create the token embedding
        self.embedding = nn.Embedding(vocab_size, d_model)

        # Initialize weights with Xavier normal for stability
        nn.init.xavier_normal_(self.embedding.weight) 

        # Unsqueeze to add batch dimension
        self.pos_encoder = positional_encoding(max_seq_length, d_model).to(device)

        # Transformer Decoder layers
        self.transformer_decoder_layer = nn.TransformerDecoderLayer(
            d_model=d_model, nhead=nhead, dim_feedforward=dim_feedforward, batch_first=True)
        self.transformer_decoder = nn.TransformerDecoder(
            self.transformer_decoder_layer, num_layers=num_decoder_layers
        )

        self.fc_out = nn.Linear(d_model, vocab_size)
        self.dropout = nn.Dropout(0.2)

    def forward(self, src):
        # Generate look ahead mask to prevent looking at future tokens
        tgt_mask = generate_look_ahead_mask(src.size(1)).to(device) # check the change to 1
        # Use padding mask to prevent looking at not used tokens
        src_pad_mask = generate_padding_mask(src).to(device)
        # sqrt for stabilization
        src = self.embedding(src) * math.sqrt(d_model) # (batch_size, seq_len, d_model)
        # add positional encoding 
        src = src + self.pos_encoder[:, :src.size(1), :] # src.size(1) = seq_len
        output = self.transformer_decoder(tgt=src, memory=src, tgt_mask=tgt_mask,
                                          memory_mask=tgt_mask, tgt_key_padding_mask=src_pad_mask) # Change the memory mask
        output = self.dropout(output)
        output = self.fc_out(output)
        
        return output
    
model = TransformerModel(vocab_size, d_model, nhead, 
                          num_decoder_layers, dim_feedforward, max_seq_length).to(device) # Watch out



criterion = nn.CrossEntropyLoss(ignore_index=PAD_TOKEN)
optimizer = optim.Adam(model.parameters(), lr=0.0005)

# Warmup with LR scheduling (Cosine annealing)
scheduler = OneCycleLR(optimizer, max_lr=0.0001, epochs=20, steps_per_epoch=batch_size)

In [67]:
# Example training loop with dataloader
num_epochs = 20
for epoch in range(num_epochs):
    print(f'Epoch {epoch}')
    for batch in dataloader:
        # Move to GPU
        input_sequences, target_sequences = batch
        input_sequences = input_sequences.to(device)   # To GPU
        target_sequences = target_sequences.to(device) # To GPU
        optimizer.zero_grad()
        output = model(input_sequences)
        loss = criterion(output.view(-1, vocab_size), target_sequences.view(-1))
        loss.backward()

        optimizer.step()
        scheduler.step()

        
    print(f'Epoch: {epoch}, Loss: {loss.item()}, LR: {scheduler.get_last_lr()[0]:.6f}')

Epoch 0
Epoch: 0, Loss: 8.952378273010254, LR: 0.000007
Epoch 1
Epoch: 1, Loss: 8.133126258850098, LR: 0.000017
Epoch 2
Epoch: 2, Loss: 7.200924396514893, LR: 0.000031
Epoch 3
Epoch: 3, Loss: 6.444535255432129, LR: 0.000048
Epoch 4
Epoch: 4, Loss: 5.773065090179443, LR: 0.000066
Epoch 5
Epoch: 5, Loss: 5.479922771453857, LR: 0.000082
Epoch 6
Epoch: 6, Loss: 5.400160789489746, LR: 0.000093
Epoch 7
Epoch: 7, Loss: 4.892806053161621, LR: 0.000099
Epoch 8
Epoch: 8, Loss: 4.753689289093018, LR: 0.000100
Epoch 9
Epoch: 9, Loss: 4.50779390335083, LR: 0.000098
Epoch 10
Epoch: 10, Loss: 4.216386318206787, LR: 0.000096
Epoch 11
Epoch: 11, Loss: 4.13455867767334, LR: 0.000092
Epoch 12
Epoch: 12, Loss: 3.9435667991638184, LR: 0.000087
Epoch 13
Epoch: 13, Loss: 3.761566162109375, LR: 0.000082
Epoch 14
Epoch: 14, Loss: 3.7107436656951904, LR: 0.000075
Epoch 15
Epoch: 15, Loss: 3.506303071975708, LR: 0.000068
Epoch 16
Epoch: 16, Loss: 3.4615390300750732, LR: 0.000060
Epoch 17
Epoch: 17, Loss: 3.30448

In [154]:
def generate_slogan(model, start_sequence, max_lenght=20):
    model.eval()
    input_sequence = torch.tensor(tokenizer.encode(start_sequence), dtype=torch.long).unsqueeze(0)
    generated_sequence = input_sequence.tolist()[0]

    for _ in range(max_lenght - len(start_sequence)):   # Watch out
        input_tensor = torch.tensor(generated_sequence[-max_lenght:], dtype=torch.long).unsqueeze(0).to(device)
        with torch.no_grad():
            output = model(input_tensor)
        next_token = torch.argmax(F.softmax(output[0, -1, :], dim=0)).item()
        generated_sequence.append(next_token)
        if next_token == 102:
            break
    
    return ' '.join([tokenizer.decode(idx, skip_special_tokens=True) for idx in generated_sequence])

start_sequence = "better"
generated_slogan = generate_slogan(model, start_sequence)
print(f"Generated slogan: {generated_slogan}")

Generated slogan:  better  . better than the best . 
