In [2]:
import json
import torch
import torch.nn as nn
import math
from torch.utils.data import Dataset, DataLoader
import collections
from tqdm.notebook import tqdm
import uuid

In [3]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using {device}")

Using cuda


#### MultiHadAttention

Attention($Q$, $K$, $V$) = softmax($\frac {QK^T} {\sqrt {d_k}}$)$V$

![attention](./images/attention.jpg)

In [4]:
class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, num_heads):
        super().__init__()

        self.d_model = d_model
        self.num_heads = num_heads
        self.d_k = d_model // num_heads

        self.W_Q = nn.Linear(d_model, d_model)
        self.W_K = nn.Linear(d_model, d_model)
        self.W_V = nn.Linear(d_model, d_model)

        self.output_linear = nn.Linear(d_model, d_model)

    def scaled_dot_product_attention(self, Q, K, V, mask=None):
        # matmul Q and K and scale
        scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.d_k)

        # apply mask
        if mask is not None:
            scores = scores.masked_fill(mask == 0, -1e9)

        # softmax layer
        attention = torch.softmax(scores, dim=-1)

        # matmul attention and V
        context = torch.matmul(attention, V)

        return context
    
    def split_heads(self, x):
        batch_size, seq_len, _ = x.size()

        return x.view(batch_size, seq_len, self.num_heads, self.d_k).transpose(1, 2)
    
    def combine_heads(self, x):
        batch_size, _, seq_len, _ = x.size()

        return x.transpose(1, 2).contiguous().view(batch_size, seq_len, -1)
    
    def forward(self, Q, K, V, mask=None):
        # batch_size x seq_len x d_model
        Q = self.W_Q(Q)
        K = self.W_K(K)
        V = self.W_V(V)

        # batch_size x num_heads x seq_len x d_k
        Q = self.split_heads(Q)
        K = self.split_heads(K)
        V = self.split_heads(V)

        attention = self.scaled_dot_product_attention(Q, K, V, mask)

        # batch_size x seq_len x d_model
        attention = self.combine_heads(attention)

        output = self.output_linear(attention)

        return output


#### Positional Encoding

$PE_{(pos, 2i)} = sin(pos / 10000^{2i / d_{model}})$, for even

$PE_{(pos, 2i+1)} = cos(pos / 10000^{2i / d_{model}})$, for odd

In [5]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_seq_length):
        super(PositionalEncoding, self).__init__()
        
        pe = torch.zeros(max_seq_length, d_model)
        position = torch.arange(0, max_seq_length, 
                                dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2)
                             .float() * -(math.log(10000.0) / d_model))
        
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        
        self.register_buffer('pe', pe.unsqueeze(0))
        
    def forward(self, x):
        return x + self.pe[:, :x.size(1)]

#### Position wise feed forward network

In [6]:
class PositionWiseFeedForward(nn.Module):
    def __init__(self, d_model, d_ff):
        super().__init__()

        self.linear1 = nn.Linear(d_model, d_ff)
        self.linear2 = nn.Linear(d_ff, d_model)

    def forward(self, x):
        x = self.linear1(x) # d_model x d_ff
        x = torch.relu(x)
        x = self.linear2(x) # d_ff x d_model

        return x

#### Encoder

![encoder](./images/encoder.jpg)

In [7]:
class EncoderLayer(nn.Module):
    def __init__(self, d_model, num_heads, d_ff, dropout=0.1):
        super().__init__()

        self.multi_head_attention = MultiHeadAttention(d_model, num_heads)
        self.position_wise_feed_forward = PositionWiseFeedForward(d_model, d_ff)

        self.layer_norm1 = nn.LayerNorm(d_model)
        self.layer_norm2 = nn.LayerNorm(d_model)

        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        # multi-head attention
        attention = self.multi_head_attention(x, x, x, mask)

        # add and norm
        x = self.layer_norm1(x + attention)
        x = self.dropout1(x)

        # position-wise feed forward
        feed_forward = self.position_wise_feed_forward(x)

        # add and norm
        x = self.layer_norm2(x + feed_forward)
        x = self.dropout2(x)

        return x

#### Decoder

![decoder](./images/decoder.jpg)

In [8]:
class DecoderLayer(nn.Module):
    def __init__(self, d_model, num_heads, d_ff, dropout):
        super().__init__()

        self.multi_head_attention1 = MultiHeadAttention(d_model, num_heads)
        self.multi_head_attention2 = MultiHeadAttention(d_model, num_heads)

        self.position_wise_feed_forward = PositionWiseFeedForward(d_model, d_ff)

        self.layer_norm1 = nn.LayerNorm(d_model)
        self.layer_norm2 = nn.LayerNorm(d_model)
        self.layer_norm3 = nn.LayerNorm(d_model)

        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)
        self.dropout3 = nn.Dropout(dropout)

    def forward(self, x, encoder_output, src_mask=None, trg_mask=None):
        # masked multi-head attention
        masked_attention = self.multi_head_attention1(x, x, x, trg_mask)

        # add and norm
        x = self.layer_norm1(x + masked_attention)
        x = self.dropout1(x)

        # multi-head attention
        attention = self.multi_head_attention2(x, encoder_output, encoder_output, src_mask)

        # add and norm
        x = self.layer_norm2(x + attention)
        x = self.dropout2(x)

        # position-wise feed forward
        feed_forward = self.position_wise_feed_forward(x)

        # add and norm
        x = self.layer_norm3(x + feed_forward)
        x = self.dropout3(x)

        return x
        

#### Transformer

In [9]:
class Transformer(nn.Module):
    def __init__(self, src_vocab_size, tgt_vocab_size, d_model, num_heads, num_layers, d_ff, max_seq_length, dropout):
        super().__init__()

        self.src_embedding = nn.Embedding(src_vocab_size, d_model)
        self.tgt_embedding = nn.Embedding(tgt_vocab_size, d_model)

        self.positional_encoding = PositionalEncoding(d_model, max_seq_length)

        self.encoder_layers = nn.ModuleList([EncoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])
        self.decoder_layers = nn.ModuleList([DecoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])

        self.linear = nn.Linear(d_model, tgt_vocab_size)
    
    def generate_mask(self, src, tgt):
        src_mask = (src != 0).unsqueeze(1).unsqueeze(2).to(device)
        tgt_mask = (tgt != 0).unsqueeze(1).unsqueeze(3).to(device)
        seq_length = tgt.size(1)
        nopeak_mask = (1 - torch.triu(torch.ones(1, seq_length, seq_length), diagonal=1)).bool().to(device)
        tgt_mask = tgt_mask & nopeak_mask
        
        return src_mask, tgt_mask
    
    def forward(self, src, tgt):
        src_mask, tgt_mask = self.generate_mask(src, tgt)

        src = self.src_embedding(src)
        tgt = self.tgt_embedding(tgt)

        src = self.positional_encoding(src)
        tgt = self.positional_encoding(tgt)

        for layer in self.encoder_layers:
            src = layer(src, src_mask)
        
        for layer in self.decoder_layers:
            tgt = layer(tgt, src, src_mask, tgt_mask)
        
        output = self.linear(tgt)

        return output

#### Loading data

In [10]:
en = []
hr = []

with open('palesamnasvijetu.txt', 'r', encoding = 'utf-8') as file:
    lines = file.readlines()

    i = 0

    for line in lines:
        if i == 0:
            hr.append(line)
            i = 1
        elif i == 1:
            en.append(line)
            i = 2
        else:
            i = 0

In [11]:
# Character tokenization
hr_tokens = [list(sentence) for sentence in hr]
en_tokens = [list(sentence) for sentence in en]

# Flatten character tokens
# hr_flat_tokens = [token.lower() for sentence_tokens in hr_tokens for token in sentence_tokens]
# en_flat_tokens = [token.lower() for sentence_tokens in en_tokens for token in sentence_tokens]

hr_flat_tokens = [' ', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', 
                        '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
                        ':', '<', '=', '>', '?', '@',
                        '[', '\\', ']', '^', '_', '`', 
                        'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l',
                        'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 
                        'y', 'z', 
                        '{', '|', '}', '~', 'č', 'ć', 'đ', 'š', 'ž']
en_flat_tokens = [' ', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', 
                        '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
                        ':', '<', '=', '>', '?', '@',
                        '[', '\\', ']', '^', '_', '`', 
                        'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l',
                        'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 
                        'y', 'z', 
                        '{', '|', '}', '~']

# Count character frequencies
hr_vocab_counter = collections.Counter(hr_flat_tokens)
en_vocab_counter = collections.Counter(en_flat_tokens)

# Special tokens and vocabulary
special_tokens = ["<PAD>", "<UNK>", "<START>", "<END>"]
hr_vocab = special_tokens + sorted(hr_vocab_counter.keys())
en_vocab = special_tokens + sorted(en_vocab_counter.keys())

# Create token-to-index and index-to-token dictionaries
hr_token2index = {token: idx for idx, token in enumerate(hr_vocab)}
en_token2index = {token: idx for idx, token in enumerate(en_vocab)}
hr_index2token = {idx: token for idx, token in enumerate(hr_vocab)}
en_index2token = {idx: token for idx, token in enumerate(en_vocab)}

In [12]:
# Function for sentence to index conversion
def sentence2index(src, sentence_tokens, sentence_length, start_token=False, end_token=False):
    token2index = hr_token2index if src == "hr" else en_token2index

    if start_token:
        sentence_index = [token2index["<START>"]]
    else:
        sentence_index = []

    sentence_index += [token2index[token.lower()] if token.lower() in token2index else token2index["<UNK>"] for token in list(sentence_tokens)]

    if end_token:
        sentence_index.append(token2index["<END>"])

    sentence_index += [token2index["<PAD>"] for _ in range(sentence_length - len(sentence_index))]

    return sentence_index

# Example usage
hr_sentence = "pale sam na svijetu"
hr_indexed_sentence = sentence2index("hr", hr_sentence, 20, True, True)
print(hr_indexed_sentence)

[2, 57, 42, 53, 46, 4, 60, 42, 54, 4, 55, 42, 4, 60, 63, 50, 51, 46, 61, 62, 3]


In [13]:
class TranslationDataset(torch.utils.data.Dataset):
    def __init__(self, hr_dataset, en_dataset, max_seq_length, take):
        hr, en = self.remove_long_sentences(hr_dataset, en_dataset, max_seq_length)

        train_data = []
        test_data = []

        for tokens in hr[:take]:
            train_data.append(sentence2index("hr", tokens, max_seq_length))

        for tokens in en[:take]:
            test_data.append(sentence2index("en", tokens, max_seq_length, True, True))

        train_data = torch.tensor(train_data)
        test_data = torch.tensor(test_data)

        self.X = train_data
        self.y = test_data

    def __len__(self):
        return len(self.X)
    
    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]
    
    def remove_long_sentences(self, hr, en, max_length):
        hr_short = []
        en_short = []

        print(hr[0])

        for c, e in zip(hr, en):
            if len(c) < max_length - 2 and len(e) < max_length - 2:
                hr_short.append(c)
                en_short.append(e)

        return hr_short, en_short

#### Initialization

In [14]:
src_vocab_size = len(hr_vocab)
tgt_vocab_size = len(en_vocab)
d_model = 512
num_heads = 8
num_layers = 6
d_ff = 2048
max_seq_length = 200
dropout = 0.1

batch_size = 8
num_epochs = 200

In [15]:
translation_dataset = TranslationDataset(hr, en, max_seq_length, 3000)
lengths = [int(0.8 * len(translation_dataset)),
         int(0.1 * len(translation_dataset)), 
         len(translation_dataset) - int(0.8 * len(translation_dataset)) - int(0.1 * len(translation_dataset))]
train_dataset, dev_dataset, test_dataset = torch.utils.data.dataset.random_split(translation_dataset, lengths=lengths)

print(f'Veličina skupa za treniranje: {len(train_dataset)}')
print(f'Veličina skupa za validaciju: {len(dev_dataset)}')
print(f'Veličina skupa za testiranje: {len(test_dataset)}')


train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=False)
dev_loader = DataLoader(dev_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

Jednog je jutra Pale ustao vrlo rano.

Veličina skupa za treniranje: 81
Veličina skupa za validaciju: 10
Veličina skupa za testiranje: 11


In [16]:
X_batch, y_batch = next(iter(train_loader))
X_batch = X_batch.to(device)
y_batch = y_batch.to(device)

print(X_batch.size(), y_batch.size())

torch.Size([8, 200]) torch.Size([8, 200])


In [17]:
transformer = Transformer(src_vocab_size, tgt_vocab_size, d_model, num_heads, num_layers, d_ff, max_seq_length, dropout)
transformer = transformer.to(device)

#### Training

In [25]:
def train_one_batch(transformer, X_batch, y_batch, num_epochs, tgt_vocab_size, leraning_rate=0.0001):
    criterion = nn.CrossEntropyLoss(ignore_index=0)
    optimizer = torch.optim.Adam(transformer.parameters(), lr=leraning_rate, betas=(0.9, 0.98), eps=1e-9)
    progress_bar = tqdm(range(num_epochs), position=0, leave=True)

    transformer.train()

    for epoch in progress_bar:
        optimizer.zero_grad()

        output = transformer(X_batch, y_batch[:, :-1])

        o = output.contiguous().view(-1, tgt_vocab_size)
        y = y_batch[:, 1:].contiguous().view(-1)

        loss = criterion(o, y)
        loss.backward()
        optimizer.step()
        
        progress_bar.set_description(f"Epoch {epoch + 1} | Train Loss: {loss.item()}")

train_one_batch(transformer, X_batch, y_batch, num_epochs, tgt_vocab_size)

  0%|          | 0/200 [00:00<?, ?it/s]

In [19]:
def save_transformer(transformer):
    path = "./models"

    model_id = str(uuid.uuid4())[:8]

    model_args = { "src_vocab_size": len(hr_vocab), 
                   "tgt_vocab_size": len(en_vocab), 
                   "d_model": d_model, 
                   "num_heads": num_heads, 
                   "num_layers": num_layers, 
                   "d_ff": d_ff, 
                   "max_seq_length": max_seq_length, 
                   "dropout": dropout }
    
    json_object = json.dumps(model_args, indent=4)

    torch.save(transformer.state_dict(), path + f'/{model_id}.pt')

    with open(path + f'/{model_id}.json', "w") as outfile:
        outfile.write(json_object)

    return model_id

save_transformer(transformer)

'df8c8944'

In [26]:
sent = X_batch[2]
for s in sent:
    print(hr_index2token[s.item()], end="")

ali u trgovini nema nikoga kome bi platio. uvidi da mu novci ništa ne vrijede.<UNK><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD>

In [33]:
def translate(sentance, model):
    result = ""
    src = torch.tensor([sentence2index("hr", sentance, max_seq_length)]).to(device)
    
    for i in range(max_seq_length//2):
        tgt = torch.tensor([sentence2index("en", result, max_seq_length, start_token = True)]).to(device)

        output = model(src, tgt)

        next_word_prob = output[0][i]
        next_word_index = torch.argmax(next_word_prob).item()
        next_word = en_index2token[next_word_index]

        if next_word == "<END>":
            break

        result += next_word

    return result

translate("ali u trgovini nema nikoga kome bi platio. uvidi da mu novci ništa ne vrijede.", transformer)

'but there was no one in the store to pay. he realized that his money was worthless.<UNK>al eoser    ewrw'

In [22]:
def train(transformer, train_loader, dev_loader, num_epochs, tgt_vocab_size, leraning_rate=0.0001):
    criterion = nn.CrossEntropyLoss(ignore_index=0)
    optimizer = torch.optim.Adam(transformer.parameters(), lr=leraning_rate, betas=(0.9, 0.98), eps=1e-9)

    progress_bar = tqdm(range(num_epochs), position=0, leave=True)

    train_epoch_loss = 0
    dev_epoch_loss = 0

    for epoch in progress_bar:
        transformer.train()

        for i, (X_batch, y_batch) in enumerate(train_loader):
            X_batch = X_batch.to(device)
            y_batch = y_batch.to(device)
            output = transformer(X_batch, y_batch[:, :-1])

            o = output.contiguous().view(-1, tgt_vocab_size)
            y = y_batch[:, 1:].contiguous().view(-1)

            loss = criterion(o, y)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            train_epoch_loss += loss.item()

        transformer.eval()
        
        for i, (X_batch, y_batch) in enumerate(dev_loader):
            X_batch = X_batch.to(device)
            y_batch = y_batch.to(device)
            output = transformer(X_batch, y_batch[:, :-1])

            o = output.contiguous().view(-1, tgt_vocab_size)
            y = y_batch[:, 1:].contiguous().view(-1)

            loss = criterion(o, y)
            dev_epoch_loss += loss.item()

        progress_bar.set_description(f"Epoch {epoch + 1} | Batch {i + 1} | Train loss: {train_epoch_loss / len(train_loader)} | Dev Loss: {dev_epoch_loss / len(dev_loader)}")

# train(transformer, train_loader, dev_loader, num_epochs, tgt_vocab_size)

#### Loading model

In [23]:
def load_transformer(model_id):
    with open(f"./models/{model_id}.json", "r") as file:
        model_args = json.load(file)

    transformer = Transformer(**model_args)
    transformer.load_state_dict(torch.load(f"./models/{model_id}.pt"))
    transformer = transformer.to(device)
    transformer.eval()

    return transformer

In [34]:
saved_transformer = load_transformer("df8c8944")

translate("ali u trgovini nema nikoga kome bi platio. uvidi da mu novci ništa ne vrijede.", transformer)

'but there was no one in the store to pay. he realized that his money was worthless.<UNK><UNK>l eosloe.er   y'