In [None]:
!pip install torch==1.8.0+cu111 -f https://download.pytorch.org/whl/torch_stable.html

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in links: https://download.pytorch.org/whl/torch_stable.html
Collecting torch==1.8.0+cu111
  Downloading https://download.pytorch.org/whl/cu111/torch-1.8.0%2Bcu111-cp38-cp38-linux_x86_64.whl (1982.2 MB)
[K     |█████████████▌                  | 834.1 MB 1.2 MB/s eta 0:15:44tcmalloc: large alloc 1147494400 bytes == 0x39b1c000 @  0x7f63ddcd9615 0x5d6f4c 0x51edd1 0x51ef5b 0x4f750a 0x4997a2 0x4fd8b5 0x4997c7 0x4fd8b5 0x49abe4 0x4f5fe9 0x55e146 0x4f5fe9 0x55e146 0x4f5fe9 0x55e146 0x5d8868 0x5da092 0x587116 0x5d8d8c 0x55dc1e 0x55cd91 0x5d8941 0x49abe4 0x55cd91 0x5d8941 0x4990ca 0x5d8868 0x4997a2 0x4fd8b5 0x49abe4
[K     |█████████████████               | 1055.7 MB 1.2 MB/s eta 0:12:25tcmalloc: large alloc 1434370048 bytes == 0x7e172000 @  0x7f63ddcd9615 0x5d6f4c 0x51edd1 0x51ef5b 0x4f750a 0x4997a2 0x4fd8b5 0x4997c7 0x4fd8b5 0x49abe4 0x4f5fe9 0x55e146 0x4f5fe9 0x55e146 0x4f5fe9 0x55e14

In [None]:
!pip install torchtext==0.9.0 -f https://download.pytorch.org/whl/torch_stable.html

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in links: https://download.pytorch.org/whl/torch_stable.html
Collecting torchtext==0.9.0
  Downloading torchtext-0.9.0-cp38-cp38-manylinux1_x86_64.whl (7.0 MB)
[K     |████████████████████████████████| 7.0 MB 39.2 MB/s 
Installing collected packages: torchtext
  Attempting uninstall: torchtext
    Found existing installation: torchtext 0.14.0
    Uninstalling torchtext-0.14.0:
      Successfully uninstalled torchtext-0.14.0
Successfully installed torchtext-0.9.0


In [None]:
import torch
import torch.nn as nn

import random

# Single seed value reused by every RNG that this notebook seeds explicitly.
SEED = 1039

# Seed Python's `random` module (Seq2Seq.forward draws from it to decide on
# teacher forcing) and PyTorch's global RNG.
random.seed(SEED)
torch.manual_seed(SEED)
# Request deterministic cuDNN kernels.
torch.backends.cudnn.deterministic = True


class Encoder(nn.Module):
    """Embed a batch of source token ids and run them through a stacked LSTM.

    Only the final LSTM state is exposed; the per-step outputs are discarded.

    Args:
        input_dim: Size of the source vocabulary (number of embedding rows).
        emb_dim: Width of each token embedding.
        hid_dim: Hidden size of every LSTM layer.
        n_layers: Number of stacked LSTM layers.
        dropout: Dropout probability, used both on the embeddings and between
            LSTM layers.
    """

    def __init__(self, input_dim, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()

        # Hyper-parameters are kept as attributes; Seq2Seq reads hid_dim and
        # n_layers to check that encoder and decoder are compatible.
        self.input_dim, self.emb_dim = input_dim, emb_dim
        self.hid_dim, self.n_layers = hid_dim, n_layers

        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.rnn = nn.LSTM(input_size=emb_dim, hidden_size=hid_dim,
                           num_layers=n_layers, dropout=dropout)
        self.dropout = nn.Dropout(dropout)

    def forward(self, src):
        """Encode `src` and return the LSTM's final `(hidden, cell)` state.

        The LSTM is built without `batch_first`, so `src` is expected to be
        sequence-first token ids.
        """
        embedded = self.dropout(self.embedding(src))
        _, final_state = self.rnn(embedded)
        hidden, cell = final_state
        return hidden, cell


class Decoder(nn.Module):
    """Single-step LSTM decoder that scores the next target token.

    Args:
        output_dim: Size of the target vocabulary (embedding rows and number
            of output scores).
        emb_dim: Width of each token embedding.
        hid_dim: Hidden size of every LSTM layer.
        n_layers: Number of stacked LSTM layers.
        dropout: Dropout probability, used both on the embeddings and between
            LSTM layers.
    """

    def __init__(self, output_dim, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()

        # Seq2Seq reads hid_dim, n_layers and output_dim from these attributes.
        self.emb_dim, self.hid_dim = emb_dim, hid_dim
        self.output_dim, self.n_layers = output_dim, n_layers

        self.embedding = nn.Embedding(output_dim, emb_dim)
        self.rnn = nn.LSTM(input_size=emb_dim, hidden_size=hid_dim,
                           num_layers=n_layers, dropout=dropout)
        self.out = nn.Linear(hid_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input, hidden, cell):
        """Advance the decoder by one step.

        Args:
            input: Token ids for the current step (one id per batch element).
            hidden: LSTM hidden state from the previous step (or the encoder).
            cell: LSTM cell state from the previous step (or the encoder).

        Returns:
            `(prediction, hidden, cell)`: unnormalised vocabulary scores for
            this step together with the updated LSTM state.
        """
        # The LSTM wants a leading sequence axis, so add one of length 1.
        step_tokens = input.unsqueeze(0)
        embedded = self.embedding(step_tokens)
        embedded = self.dropout(embedded)

        output, new_state = self.rnn(embedded, (hidden, cell))
        hidden, cell = new_state

        # Remove the length-1 sequence axis again before projecting to scores.
        prediction = self.out(output.squeeze(0))
        return prediction, hidden, cell


class Seq2Seq(nn.Module):
    """Glue an encoder and a decoder into one sequence-to-sequence model.

    Args:
        encoder: Module whose call returns the initial `(hidden, cell)` state.
        decoder: Module called as `decoder(input, hidden, cell)` returning
            `(scores, hidden, cell)`; must expose `output_dim`.
        device: Device on which the tensor of collected scores is placed.

    The constructor asserts that both parts share `hid_dim` and `n_layers`.
    """

    def __init__(self, encoder, decoder, device):
        super().__init__()

        self.encoder = encoder
        self.decoder = decoder
        self.device = device

        assert encoder.hid_dim == decoder.hid_dim, \
            "Hidden dimensions of encoder and decoder must be equal!"
        assert encoder.n_layers == decoder.n_layers, \
            "Encoder and decoder must have equal number of layers!"

    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        """Decode step by step and return the scores for every target position.

        Args:
            src: Source batch handed to the encoder.
            trg: Target batch indexed as `[step, batch]`; row 0 is fed to the
                decoder as its first input.
            teacher_forcing_ratio: Probability, per step, of feeding the true
                token `trg[t]` as the next input instead of the model's own
                highest-scoring token.

        Returns:
            Tensor of shape `(trg_len, batch, decoder.output_dim)`. Row 0 is
            never written and therefore stays all zeros.
        """
        n_steps, n_batch = trg.shape[0], trg.shape[1]
        vocab_size = self.decoder.output_dim

        outputs = torch.zeros(n_steps, n_batch, vocab_size).to(self.device)

        hidden, cell = self.encoder(src)

        step_input = trg[0, :]
        for t in range(1, n_steps):
            step_scores, hidden, cell = self.decoder(step_input, hidden, cell)
            outputs[t] = step_scores

            # One RNG draw per step decides between ground truth and prediction.
            use_truth = random.random() < teacher_forcing_ratio
            _, best_token = step_scores.max(1)
            if use_truth:
                step_input = trg[t]
            else:
                step_input = best_token

        return outputs


In [None]:
import os
import io

from torchtext.legacy.data import Field, Dataset
from torchtext.legacy.datasets import TranslationDataset
from torchtext.legacy import data


class PunctuationDataset(Dataset):
    """Parallel-text dataset read from two line-aligned files.

    Each example has a `src` and a `trg` attribute built from the matching
    lines of `path + exts[0]` and `path + exts[1]`.
    """

    urls = []  # insert our link from google drive
    name = ''
    dirname = 'punc'

    @staticmethod
    def sort_key(ex):
        """Key combining source and target lengths of an example."""
        return data.interleave_keys(len(ex.src), len(ex.trg))

    def __init__(self, path, exts, fields, **kwargs):
        """Read the two files and build one example per non-empty line pair.

        Args:
            path: Common prefix of the two files.
            exts: Pair of suffixes appended to `path` (source first).
            fields: Either two field objects (named 'src' and 'trg' here) or
                an already-named list of `(name, field)` pairs.
            Remaining keyword arguments: Passed to the Dataset constructor.
        """
        if not isinstance(fields[0], (tuple, list)):
            fields = [('src', fields[0]), ('trg', fields[1])]

        src_path, trg_path = (os.path.expanduser(path + ext) for ext in exts)

        with io.open(src_path, mode='r', encoding='utf-8') as src_file, \
                io.open(trg_path, mode='r', encoding='utf-8') as trg_file:
            stripped_pairs = (
                (src_line.strip(), trg_line.strip())
                for src_line, trg_line in zip(src_file, trg_file)
            )
            # A pair is skipped when either side is empty after stripping.
            examples = [
                data.Example.fromlist([src_text, trg_text], fields)
                for src_text, trg_text in stripped_pairs
                if src_text != '' and trg_text != ''
            ]

        super(PunctuationDataset, self).__init__(examples, fields, **kwargs)

    @classmethod
    def splits(cls, exts, fields, path=None, root='data',
               train='train', validation='val', test='test', **kwargs):
        """Create dataset objects for the train/validation/test splits.

        Args:
            exts: A tuple containing the extension to path for each side
                (source first, then target).
            fields: A tuple containing the fields that will be used for data
                on each side.
            path (str): Common prefix of the splits' file paths, or None to use
                the result of cls.download(root).
            root: Root dataset storage directory. Default is 'data'.
            train: The prefix of the train data. Default: 'train'.
            validation: The prefix of the validation data. Default: 'val'.
            test: The prefix of the test data. Default: 'test'.
            Remaining keyword arguments: Passed to the constructor of this
                class for every split.

        Returns:
            Tuple with one dataset per split whose prefix is not None, in the
            order train, validation, test.
        """
        if path is None:
            path = cls.download(root)

        built = []
        for prefix in (train, validation, test):
            if prefix is None:
                continue
            built.append(cls(os.path.join(path, prefix), exts, fields, **kwargs))
        return tuple(built)

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim

from torchtext.legacy.data import Field, BucketIterator, Dataset

import random
import math
import numpy as np
import time

# Re-seed every RNG this notebook draws from. NumPy is included because the
# example-picking code uses np.random.choice; without seeding it, the printed
# examples differ from run to run.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
# Request deterministic cuDNN kernels.
torch.backends.cudnn.deterministic = True


def tokenize(text):
    """Split `text` on runs of whitespace and return the list of pieces."""
    tokens = text.split()
    return tokens


# Source-side field: whitespace tokenisation, lower-casing, and <sos>/<eos>
# markers around every sequence.
SRC = Field(tokenize=tokenize,
            init_token='<sos>',
            eos_token='<eos>',
            lower=True)

# Target-side field, configured identically to the source side.
TRG = Field(tokenize=tokenize,
            init_token='<sos>',
            eos_token='<eos>',
            lower=True)

# No `path` is given, so splits() falls back to cls.download(root).
# '.um' files feed SRC and '.m' files feed TRG (the prints below call them the
# "unmarked" and "marked" sides).
train_data, valid_data, test_data = PunctuationDataset.splits(fields=(SRC, TRG),
                                                    exts=('.um', '.m'))

print(f"Number of training examples: {len(train_data.examples)}")
print(f"Number of validation examples: {len(valid_data.examples)}")
print(f"Number of testing examples: {len(test_data.examples)}")

# Vocabularies are built from the training split only, with min_freq=10.
SRC.build_vocab(train_data, min_freq=10)
TRG.build_vocab(train_data, min_freq=10)

print(f"Unique tokens in source (unmarked) vocabulary: {len(SRC.vocab)}")
print(f"Unique tokens in target (marked) vocabulary: {len(TRG.vocab)}")

Number of training examples: 349143
Number of validation examples: 116381
Number of testing examples: 116382
Unique tokens in source (unmarked) vocabulary: 14113
Unique tokens in target (marked) vocabulary: 14136


In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

print(device.type)

BATCH_SIZE = 32

# One iterator per split, all using BATCH_SIZE and placing batches on `device`.
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=BATCH_SIZE,
    device=device
)

# Model hyper-parameters. Vocabulary sizes come from the fields built earlier;
# encoder and decoder share HID_DIM and N_LAYERS, which Seq2Seq asserts on.
INPUT_DIM = len(SRC.vocab)
OUTPUT_DIM = len(TRG.vocab)
ENC_EMB_DIM = 256
DEC_EMB_DIM = 256
HID_DIM = 512
N_LAYERS = 2
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5

enc = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, N_LAYERS, ENC_DROPOUT)
dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM, N_LAYERS, DEC_DROPOUT)

# Assemble the full model and move its parameters to `device`.
model = Seq2Seq(enc, dec, device).to(device)


def init_weights(m):
    """Re-initialise every parameter of `m` in place from U(-0.05, 0.05).

    Intended to be passed to `nn.Module.apply`. All parameters reachable from
    `m` (its own and those of its sub-modules) are overwritten.

    Args:
        m: The module whose parameters are re-initialised.
    """
    for param in m.parameters():
        # `uniform_` is the supported in-place initialiser; the un-suffixed
        # `nn.init.uniform` is a deprecated alias that emits a warning.
        nn.init.uniform_(param, -0.05, 0.05)


# Replace the default parameter initialisation with init_weights' uniform draw.
model.apply(init_weights)


def count_parameters(model):
    """Return the total number of scalar entries in `model`'s trainable parameters.

    Parameters with `requires_grad == False` are not counted.
    """
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total


print(f'The model has {count_parameters(model):,} trainable parameters')

# Adam with its default hyper-parameters.
optimizer = optim.Adam(model.parameters())

# Padding positions in the target must not contribute to the loss.
PAD_IDX = TRG.vocab.stoi['<pad>']

criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX)

# Show one random training example. The index is drawn from the size of
# train_data, the split that is actually indexed below (it was previously
# drawn from len(test_data), so only a prefix of the training set was reachable).
example_idx = np.random.choice(np.arange(len(train_data)))
print(vars(train_data.examples[example_idx])['src'])
print(type(vars(train_data.examples[example_idx])['src']))

cuda
The model has 21,839,928 trainable parameters
['что', 'с', 'вами', 'вчера', 'было']
<class 'list'>


  nn.init.uniform(param, -0.05, 0.05)


In [None]:
def train(model, iterator, optimizer, criterion, clip, train_history=None, valid_history=None, plot_local=False):
    """Run one optimisation pass over `iterator` and return the mean batch loss.

    Args:
        model: Module called as `model(src, trg)`; its result is indexed as
            `[step, batch, vocab]`.
        iterator: Sized iterable of batches exposing `.src` and `.trg`.
        optimizer: Optimizer stepping `model`'s parameters.
        criterion: Loss called as `criterion(scores, targets)`.
        clip: Maximum gradient norm passed to `clip_grad_norm_`.
        train_history: Accepted for interface compatibility; not used.
        valid_history: Accepted for interface compatibility; not used.
        plot_local: Accepted for interface compatibility; not used.

    Returns:
        Sum of the per-batch losses divided by `len(iterator)`.
    """
    model.train()
    epoch_loss = 0

    for batch in iterator:
        src = batch.src
        trg = batch.trg

        optimizer.zero_grad()

        output = model(src, trg)

        # Drop step 0 on both sides, then flatten step and batch axes so the
        # criterion sees one row of scores per target token.
        output = output[1:].view(-1, output.shape[-1])
        trg = trg[1:].view(-1)

        loss = criterion(output, trg)

        loss.backward()

        # Rescale gradients whose total norm exceeds `clip` before stepping.
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)

        optimizer.step()

        # .item() is the only host copy needed; no per-batch history is kept
        # because nothing ever read it.
        epoch_loss += loss.item()

    return epoch_loss / len(iterator)

In [None]:
def evaluate(model, iterator, criterion):
    """Return the mean batch loss of `model` over `iterator` without training.

    The model is switched to eval mode, gradients are disabled, and the model
    is called with a teacher-forcing ratio of 0 (third positional argument).

    Args:
        model: Module called as `model(src, trg, 0)`; its result is indexed as
            `[step, batch, vocab]`.
        iterator: Sized iterable of batches exposing `.src` and `.trg`.
        criterion: Loss called as `criterion(scores, targets)`.

    Returns:
        Sum of the per-batch losses divided by `len(iterator)`.
    """
    model.eval()
    running_loss = 0

    with torch.no_grad():
        for batch in iterator:
            sources, targets = batch.src, batch.trg

            scores = model(sources, targets, 0)

            # Step 0 is skipped on both sides; the remaining steps are
            # flattened into one row of scores per target token.
            vocab_size = scores.shape[-1]
            flat_scores = scores[1:].view(-1, vocab_size)
            flat_targets = targets[1:].view(-1)

            running_loss += criterion(flat_scores, flat_targets).item()

    return running_loss / len(iterator)

In [None]:
def translate_sentence(sentence, src_field, trg_field, model, device, max_len=50):
    """Greedily decode `sentence` with `model` and return the predicted tokens.

    Args:
        sentence: List of already-tokenised source tokens.
        src_field: Source field; provides `init_token`, `eos_token` and
            `vocab.stoi`.
        trg_field: Target field; provides `init_token`, `eos_token`,
            `vocab.stoi` and `vocab.itos`.
        model: Object exposing `.eval()`, `.encoder(src)` and
            `.decoder(token, hidden, cell)`.
        device: Device the index tensors are moved to.
        max_len: Upper bound on the decoded length including the start token,
            so at most `max_len - 1` tokens are produced.

    Returns:
        The predicted target tokens without the leading start token. The
        end-of-sequence token is included when the model produced it.
    """
    model.eval()

    tokens = [src_field.init_token] + sentence + [src_field.eos_token]
    src_indexes = [src_field.vocab.stoi[token] for token in tokens]
    # Shape (src_len, 1): a batch containing this single sentence.
    src_tensor = torch.LongTensor(src_indexes).unsqueeze(1).to(device)

    # The whole inference pass runs without autograd. Previously only the
    # encoder call was covered, so every decoder step still recorded a graph.
    with torch.no_grad():
        hidden, cell = model.encoder(src_tensor)

        trg_indexes = [trg_field.vocab.stoi[trg_field.init_token]]
        eos_index = trg_field.vocab.stoi[trg_field.eos_token]

        for _ in range(1, max_len):
            # Feed back the most recent token (greedy decoding).
            trg_tensor = torch.LongTensor([trg_indexes[-1]]).to(device)

            output, hidden, cell = model.decoder(trg_tensor, hidden, cell)

            pred_token = output.argmax(1).item()
            trg_indexes.append(pred_token)

            if pred_token == eos_index:
                break

    trg_tokens = [trg_field.vocab.itos[i] for i in trg_indexes]

    return trg_tokens[1:]

In [None]:
def punct(sentence):
    """Turn an alternating `[tag, word, tag, word, ...]` list into plain text.

    Tags are compared in lower case. A tag is either

    * ``'dash'``, which emits a stand-alone ``-`` (the element after it is
      ignored),
    * ``'unk'``, which emits the following word unchanged, or
    * ``'<case>_<mark>'`` with case in ``digit`` / ``upper`` / ``lower`` and
      mark in ``o``, ``comma``, ``period``, ``question``, ``exclamation``,
      ``colon``, ``colon_comma``, ``dash``, ``unk``. ``upper`` capitalises the
      word; the mark decides which punctuation character (if any) is appended.

    Elements at even positions that are not a known tag are skipped together
    with the element after them. A known tag at the very end of the list,
    with no word after it, is skipped as well instead of raising IndexError.

    Args:
        sentence: Sequence of lower-cased tags and words as described above.

    Returns:
        The rebuilt words joined by single spaces.
    """
    # Punctuation appended for the part of the tag after the first underscore.
    marks = {
        'o': '',
        'comma': ',',
        'period': '.',
        'question': '?',
        'exclamation': '!',
        'colon': ':',
        'colon_comma': ';',
        'dash': '-',
        'unk': '',
    }
    # Whether the word is capitalised for the part before the first underscore.
    capitalise = {'digit': False, 'upper': True, 'lower': False}

    pieces = []
    total = len(sentence)
    for pos in range(0, total, 2):
        tag = sentence[pos]

        if tag == 'dash':
            pieces.append('-')
            continue

        if tag == 'unk':
            case_part, mark_part = 'lower', 'unk'
        else:
            # 'lower_colon_comma' -> ('lower', 'colon_comma')
            case_part, _, mark_part = tag.partition('_')

        if case_part not in capitalise or mark_part not in marks:
            continue  # not a tag this function knows about
        if pos + 1 >= total:
            continue  # dangling tag without a word

        word = sentence[pos + 1]
        if capitalise[case_part]:
            word = word.capitalize()
        pieces.append(word + marks[mark_part])

    return " ".join(pieces)

In [None]:
def epoch_time(start_time, end_time):
    """Split the span between two timestamps (in seconds) into minutes and seconds.

    Returns:
        `(minutes, seconds)` as integers, both truncated with `int()`.
    """
    span = end_time - start_time
    whole_minutes = int(span / 60)
    leftover_seconds = int(span - (whole_minutes * 60))
    return whole_minutes, leftover_seconds


def get_example_translation():
    """Format one random validation example with its reference and prediction.

    Uses the notebook globals `valid_data`, `SRC`, `TRG`, `model` and `device`.

    Returns:
        A string with three blank-line-separated entries: the source tokens,
        the reference rendered by `punct`, and the prediction rendered by
        `punct`.
    """
    example_idx = np.random.choice(np.arange(len(valid_data)))

    example = vars(valid_data.examples[example_idx])
    src = example['src']
    trg = example['trg']

    src_string = f'src = {" ".join(src)}'
    trg_string = f'trg = {punct(trg)}'

    translation = translate_sentence(src, SRC, TRG, model, device)

    # Odd positions are overwritten with the reference tokens at the same
    # positions, so only the even-position tokens come from the model. The
    # range is capped at the shorter sequence: a prediction that stops early
    # used to raise IndexError here.
    for i in range(1, min(len(trg), len(translation)), 2):
        translation[i] = trg[i]

    translation_string = f'predicted trg = {punct(translation)}'
    return ('\n\n'.join([src_string, trg_string, translation_string]))

In [None]:
def find_metric():
    """Mean per-example token accuracy of `model` on `valid_data`.

    Uses the notebook globals `valid_data`, `SRC`, `TRG`, `model` and `device`.
    For every example the prediction (minus its last token) is compared
    position by position with the reference; `zip` stops at the shorter of the
    two, so only the overlapping prefix is scored.

    Returns:
        The per-example accuracies averaged over `len(valid_data)`.
    """
    accuracy = 0
    for i in range(len(valid_data)):
        example = vars(valid_data[i])
        src = example['src']
        trg = example['trg']
        trans = translate_sentence(src, SRC, TRG, model, device)[:-1]
        matches = [1 if a == b else 0 for a, b in zip(trans, trg)]
        # An empty comparison scores 0 for this example. Dividing 0 by 0 here
        # used to yield NaN, which then turned the whole average into NaN.
        if matches:
            accuracy += np.sum(matches) / len(matches)
    return accuracy / len(valid_data)

In [None]:
# Handed to train() below. The train() defined in this notebook does not write
# to them, so both lists stay empty.
train_history = []
valid_history = []

N_EPOCHS = 1
# Maximum gradient norm handed to train().
CLIP = 1

# NOTE(review): never updated or read in this cell — presumably meant for
# best-checkpoint tracking; confirm or remove.
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    start_time = time.time()

    train_loss = train(model, train_iterator, optimizer, criterion, CLIP, train_history, valid_history)
    valid_loss = evaluate(model, valid_iterator, criterion)

    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    # train_history.append(train_loss)
    # valid_history.append(valid_loss)

    # NOTE(review): `val_example_data` and `to_print` are assigned but not used
    # anywhere in this cell.
    val_example_data = next(iter(valid_iterator))
    to_print = []
    # Print ten random validation examples with their predictions.
    print('-----------------------------------------------------------------------------')
    for i in range(10):
        print(get_example_translation())
        print('-----------------------------------------------------------------------------')

    # Perplexity is reported as exp(loss).
    print(f'Epoch: {epoch + 1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')

-----------------------------------------------------------------------------
src = может быть лучше было не ворошить прошлое

trg = Может быть, лучше было не ворошить прошлое.

predicted trg = Может быть, лучше было не ворошить прошлое.
-----------------------------------------------------------------------------
src = кто-то вырвал из книги страницу

trg = Кто-то вырвал из книги страницу.

predicted trg = Кто-то вырвал из книги страницу.
-----------------------------------------------------------------------------
src = территорию страны на девяносто процентов составляет пустыня

trg = Территорию страны на девяносто процентов составляет пустыня.

predicted trg = Территорию страны на девяносто процентов составляет пустыня.
-----------------------------------------------------------------------------
src = она говорит я приехала сюда вчера

trg = Она говорит: Я приехала сюда вчера.

predicted trg = Она говорит я приехала сюда вчера.
-----------------------------------------------------

TypeError: ignored

In [None]:
# Token-level accuracy over the validation split, as computed by find_metric().
print(f'\t Accuracy: {find_metric()}')

	 Accuracy: 0.8327190114684226


In [None]:
# Print ten more random validation examples with their predictions.
for i in range(10):
  print(get_example_translation())
  print('-----------------------------------------------------------')

src = чем ждать тома лучше поедим сейчас пока еда не остыла

trg = Чем ждать Тома, лучше поедим сейчас, пока еда не остыла.

predicted trg = Чем ждать Тома лучше поедим сейчас, пока еда не остыла.
-----------------------------------------------------------
src = никто не имеет права указывать мне

trg = Никто не имеет права указывать мне!

predicted trg = Никто не имеет права указывать мне.
-----------------------------------------------------------
src = у меня не было достаточно наличных денег поэтому я расплатился кредитной картой

trg = У меня не было достаточно наличных денег, поэтому я расплатился кредитной картой.

predicted trg = У меня не было достаточно наличных, денег поэтому, я расплатился кредитной картой.
-----------------------------------------------------------
src = мой номер у вас есть позвоните мне на днях

trg = Мой номер у вас есть. Позвоните мне на днях.

predicted trg = Мой номер у вас есть позвоните мне на днях.
-------------------------------------------------