In [9]:
import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
import random
import numpy as np
import time

In [10]:
data_file = open("dataset.txt", "r")
lines = data_file.readlines()

letter_to_idx = {}
idx_to_letter = {}
cnt = 0
for i in range(10):
    letter_to_idx[str(i)] = len(letter_to_idx)
for line in lines:
    if "sin" in line or "cos" in line or "tan" in line:
        line = line.replace("sin", "!")
        line = line.replace("cos", "@")
        line = line.replace("tan", "#")
        cnt += 1
    
    for letter in line:
        if letter.isalpha(): letter = "$"
        if letter not in letter_to_idx:
            letter_to_idx[letter] = len(letter_to_idx)
letter_to_idx["SOS"] = len(letter_to_idx); letter_to_idx["EOS"] = len(letter_to_idx);
for k,v in letter_to_idx.items():
    idx_to_letter[v] = k
random.shuffle(lines)
train_lines = lines[: int(len(lines)*0.99) ]
test_lines = lines[int(len(lines)*0.99): ]
print(len(train_lines),len(test_lines))
print(cnt)
print(letter_to_idx)
print(idx_to_letter)
print(len(lines))

990000 10000
30095
{'0': 0, '1': 1, '2': 2, '3': 3, '4': 4, '5': 5, '6': 6, '7': 7, '8': 8, '9': 9, '(': 10, '-': 11, '*': 12, '$': 13, ')': 14, '=': 15, '\n': 16, '+': 17, '@': 18, '#': 19, '!': 20, 'SOS': 21, 'EOS': 22}
{0: '0', 1: '1', 2: '2', 3: '3', 4: '4', 5: '5', 6: '6', 7: '7', 8: '8', 9: '9', 10: '(', 11: '-', 12: '*', 13: '$', 14: ')', 15: '=', 16: '\n', 17: '+', 18: '@', 19: '#', 20: '!', 21: 'SOS', 22: 'EOS'}
1000000


In [11]:
def line_2_tokens(line):
    tokens = [ letter_to_idx[letter] if not letter.isalpha() else letter_to_idx["$"] for letter in line]
    return tokens

def tokens_2_lines(line):
    tokens = [ idx_to_letter[letter] for letter in line]
    return tokens

In [12]:
class Transformer(nn.Module):
    def __init__(
        self,
        embedding_size,
        src_vocab_size,
        trg_vocab_size,
        src_pad_idx,
        num_heads,
        num_encoder_layers,
        num_decoder_layers,
        forward_expansion,
        dropout,
        max_len,
        device,
    ):
        super(Transformer, self).__init__()
        self.src_word_embedding = nn.Embedding(src_vocab_size, embedding_size)
        self.src_position_embedding = nn.Embedding(max_len, embedding_size)
        self.trg_word_embedding = nn.Embedding(trg_vocab_size, embedding_size)
        self.trg_position_embedding = nn.Embedding(max_len, embedding_size)

        self.device = device
        self.transformer = nn.Transformer(
            embedding_size,
            num_heads,
            num_encoder_layers,
            num_decoder_layers,
            forward_expansion,
            dropout,
        )
        self.fc_out = nn.Linear(embedding_size, trg_vocab_size)
        self.dropout = nn.Dropout(dropout)
        self.src_pad_idx = src_pad_idx

    def make_src_mask(self, src):
        src_mask = src.transpose(0, 1) == self.src_pad_idx
        return src_mask.to(self.device)

    def forward(self, src, trg):
        src_seq_length, N = src.shape
        trg_seq_length, N = trg.shape

        src_positions = (
            torch.arange(0, src_seq_length)
            .unsqueeze(1)
            .expand(src_seq_length, N)
            .to(self.device)
        )

        trg_positions = (
            torch.arange(0, trg_seq_length)
            .unsqueeze(1)
            .expand(trg_seq_length, N)
            .to(self.device)
        )

        embed_src = self.dropout(
            (self.src_word_embedding(src) + self.src_position_embedding(src_positions))
        )
        embed_trg = self.dropout(
            (self.trg_word_embedding(trg) + self.trg_position_embedding(trg_positions))
        )

        src_padding_mask = self.make_src_mask(src)
        trg_mask = self.transformer.generate_square_subsequent_mask(trg_seq_length).to(
            self.device
        )

        out = self.transformer(
            embed_src,
            embed_trg,
            src_key_padding_mask=src_padding_mask,
            tgt_mask=trg_mask,
        )
        out = self.fc_out(out)
        return out

In [13]:
embedding_size = 384
src_vocab_size = len(letter_to_idx)
trg_vocab_size = len(letter_to_idx)
num_heads = 8
num_encoder_layers = 3
num_decoder_layers = 3
dropout = 0.10
max_len = 29 + 2 # SOS and EOS
forward_expansion = 4
src_pad_idx = letter_to_idx["="] #use "=" as padding
device = torch.device("cuda")

model = Transformer(
    embedding_size,
    src_vocab_size,
    trg_vocab_size,
    src_pad_idx,
    num_heads,
    num_encoder_layers,
    num_decoder_layers,
    forward_expansion,
    dropout,
    max_len,
    device,
)
model.to(device)

Transformer(
  (src_word_embedding): Embedding(23, 384)
  (src_position_embedding): Embedding(31, 384)
  (trg_word_embedding): Embedding(23, 384)
  (trg_position_embedding): Embedding(31, 384)
  (transformer): Transformer(
    (encoder): TransformerEncoder(
      (layers): ModuleList(
        (0): TransformerEncoderLayer(
          (self_attn): MultiheadAttention(
            (out_proj): _LinearWithBias(in_features=384, out_features=384, bias=True)
          )
          (linear1): Linear(in_features=384, out_features=4, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
          (linear2): Linear(in_features=4, out_features=384, bias=True)
          (norm1): LayerNorm((384,), eps=1e-05, elementwise_affine=True)
          (norm2): LayerNorm((384,), eps=1e-05, elementwise_affine=True)
          (dropout1): Dropout(p=0.1, inplace=False)
          (dropout2): Dropout(p=0.1, inplace=False)
        )
        (1): TransformerEncoderLayer(
          (self_attn): MultiheadAttention(

In [14]:

learning_rate = 3e-4
criterion = nn.CrossEntropyLoss(ignore_index=src_pad_idx)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

In [15]:
def evaluate():
    token_correct = total_tokens = sentence_correct = total_sentence = 0
    for i, line in enumerate(test_lines): #use "=" as EOS
        left = line.split("=")[0]; right = line.split("=")[1]
        left = [letter_to_idx["SOS"]] + line_2_tokens(left) + [letter_to_idx["EOS"]]
        right = [letter_to_idx["SOS"]] + line_2_tokens(right) + [letter_to_idx["EOS"]]
        left = torch.tensor(left).reshape(-1,1).to(device); right = torch.tensor(right).reshape(-1,1).to(device)
        
        outputs = model(left, right[:-1,:])
        outputs = outputs.reshape(-1, outputs.shape[2])
        preds = np.argmax(outputs.cpu().detach().numpy(), axis=-1)

        ground_truth = right[1:].reshape(-1)

        sentence_is_correct = True
        for p,g in zip(preds,ground_truth):
            if p == g: token_correct += 1
            else: sentence_is_correct = False
            total_tokens += 1
        if sentence_is_correct: sentence_correct += 1
        total_sentence += 1
    return token_correct, total_tokens, sentence_correct, total_sentence

In [None]:
batch_size = 128
one_batch_left = one_batch_right = []
losses_batch = []
for i, line in enumerate(train_lines): #use "=" as EOS
    left = line.split("=")[0]; right = line.split("=")[1]
    left = [letter_to_idx["SOS"]] + line_2_tokens(left) + [letter_to_idx["EOS"]]
    right = [letter_to_idx["SOS"]] + line_2_tokens(right) + [letter_to_idx["EOS"]]
    left = torch.tensor(left).reshape(-1,1).to(device); right = torch.tensor(right).reshape(-1,1).to(device)
    
    outputs = model(left, right[:-1,:])
    outputs = outputs.reshape(-1, outputs.shape[2])
    ground_truth = right[1:].reshape(-1)
    test = np.argmax(outputs.cpu().detach().numpy(), axis=-1)
    loss = criterion(outputs, ground_truth)
    # loss /= batch_size
    losses_batch.append(loss)
    if len(losses_batch) == batch_size:
        optimizer.zero_grad()
        losses_to_backprop = sum(losses_batch)
        losses_to_backprop.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)
        optimizer.step()
        losses_batch = []
    if i % 40000 == 0 and i != 0: 
        token_correct, total_tokens, sentence_correct, total_sentence = evaluate()
        print(token_correct, total_tokens, token_correct/total_tokens, sentence_correct, total_sentence, sentence_correct/total_sentence)



131704 163402 0.8060121663137538 1234 10000 0.1234
142888 163402 0.8744568609931335 2318 10000 0.2318
146891 163402 0.8989547251563629 3034 10000 0.3034
148508 163402 0.9088505648645672 3456 10000 0.3456
150126 163402 0.9187525244489052 3653 10000 0.3653
151048 163402 0.924395050244183 3715 10000 0.3715
152113 163402 0.9309127183265811 3888 10000 0.3888
152723 163402 0.9346458427681423 3988 10000 0.3988
153092 163402 0.9369040770614803 4058 10000 0.4058
153646 163402 0.940294488439554 4155 10000 0.4155
153976 163402 0.9423140475636773 4230 10000 0.423
154224 163402 0.9438317768448367 4323 10000 0.4323
154500 163402 0.9455208626577398 4417 10000 0.4417
154894 163402 0.9479320938544203 4556 10000 0.4556
155080 163402 0.9490703908152899 4578 10000 0.4578
155188 163402 0.9497313374377303 4634 10000 0.4634
155508 163402 0.9516896978005165 4704 10000 0.4704
155456 163402 0.9513714642415637 4731 10000 0.4731


In [1]:
batch_size = 128
one_batch_left = one_batch_right = []
losses_batch = []
for i, line in enumerate(train_lines): #use "=" as EOS
    left = line.split("=")[0]; right = line.split("=")[1]
    left = [letter_to_idx["SOS"]] + line_2_tokens(left) + [letter_to_idx["EOS"]]
    right = [letter_to_idx["SOS"]] + line_2_tokens(right) + [letter_to_idx["EOS"]]
    left = torch.tensor(left).reshape(-1,1).to(device); right = torch.tensor(right).reshape(-1,1).to(device)
    
    outputs = model(left, right[:-1,:])
    outputs = outputs.reshape(-1, outputs.shape[2])
    ground_truth = right[1:].reshape(-1)
    test = np.argmax(outputs.cpu().detach().numpy(), axis=-1)
    loss = criterion(outputs, ground_truth)
    # loss /= batch_size
    losses_batch.append(loss)
    if len(losses_batch) == batch_size:
        optimizer.zero_grad()
        losses_to_backprop = sum(losses_batch)
        losses_to_backprop.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)
        optimizer.step()
        losses_batch = []
    if i % 40000 == 0 and i != 0: 
        token_correct, total_tokens, sentence_correct, total_sentence = evaluate()
        print(token_correct, total_tokens, token_correct/total_tokens, sentence_correct, total_sentence, sentence_correct/total_sentence)



NameError: ignored

In [None]:
src = torch.rand(3,2).long()
trg = torch.rand(14,2).long()
print(src,trg)


In [None]:
output = model(src, trg[:-1, :])
output = output.reshape(-1, output.shape[2])
print(output.shape)
target = trg[1:].reshape(-1)
print(target.shape)

torch.Size([26, 20])
torch.Size([26])


In [None]:
pytorch_total_params = sum(p.numel() for p in model.parameters() ) #if p.requires_grad)
print(pytorch_total_params)

4810027
