## Task agnostic data augmentation

* take random russian texts
* lemmatize
* make random word order

In [1]:
#!g1.1
with open('lenta.txt', 'r', encoding='utf-8') as file:
    lenta_data = file.read()

In [2]:
#!g1.1
lenta_data[:30]

'Бои у Сопоцкина и Друскеник за'

In [3]:
#!g1.1
len(lenta_data)

11536552

In [7]:
#!g1.1
from razdel import sentenize
from pymystem3 import Mystem
import pandas as pd
import numpy as np
import random
from pprint import pprint
from tqdm import tqdm

- Split into separate sentences
- tokenize and lemmatize this sentences and keep this in the dataframe

In [8]:
#!g1.1
russian_sents = []
lemma_sents = []
fake_rsl_sents = []

mystem = Mystem(entire_input=False)

for sentence in tqdm(list(sentenize(lenta_data))):  # take all data
    russian_sents.append(sentence.text)
    
    lemma_sent = mystem.lemmatize(sentence.text)
    lemma_sents.append(lemma_sent)
    fake_rsl_sents.append(random.sample(lemma_sent, len(lemma_sent)))

pprint(fake_rsl_sents[1])
print(lemma_sents[1])

100%|██████████| 75659/75659 [01:14<00:00, 1019.49it/s]


['неприятель',
 'приближаться',
 'артиллерийский',
 'крепость',
 'осовец',
 'начинать',
 'север',
 'к',
 'борьба',
 'с',
 'с']
['неприятель', 'приближаться', 'с', 'север', 'к', 'осовец', 'начинать', 'артиллерийский', 'борьба', 'с', 'крепость']


In [9]:
#!g1.1
generated_data = pd.DataFrame()
generated_data['rsl'] = fake_rsl_sents
generated_data['rus'] = russian_sents
generated_data['lemmatized'] = lemma_sents

In [10]:
#!g1.1
generated_data.head()

Unnamed: 0,rsl,rus,lemmatized
0,"[бой, заканчиваться, друскеник, сопоцкин, у, о...",Бои у Сопоцкина и Друскеник закончились отступ...,"[бой, у, сопоцкин, и, друскеник, заканчиваться..."
1,"[неприятель, приближаться, артиллерийский, кре...","Неприятель, приблизившись с севера к Осовцу на...","[неприятель, приближаться, с, север, к, осовец..."
2,"[принимать, участие, тяжелый, в, калибр, бой, ...",В артиллерийском бою принимают участие тяжелые...,"[в, артиллерийский, бой, принимать, участие, т..."
3,"[утро, достигать, значительный, огонь, напряже...",С раннего утра 14 сентября огонь достиг значит...,"[с, ранний, утро, сентябрь, огонь, достигать, ..."
4,"[пехота, крепость, пробиваться, германский, бл...",Попытка германской пехоты пробиться ближе к кр...,"[попытка, германский, пехота, пробиваться, бли..."


In [11]:
#!g1.1
import os, sys
sys.path.append('../')
os.environ['KMP_DUPLICATE_LIB_OK']='True'
from argparse import Namespace
from pyeaf.text import VocabularyVectorizer, TextStemmer, RSLStemmer, GramBinarizer


st = TextStemmer()

stem_sentences_rus, gram_sentences_rus = st.stem(generated_data['rus'], gram=True)

In [12]:
#!g1.1
stem_sentences_rus[:2]

[['бой',
  'у',
  'сопоцкина',
  'и',
  'друскеник',
  'заканчиваться',
  'отступление',
  'германец'],
 ['неприятель',
  'приближаться',
  'с',
  'север',
  'к',
  'осовец',
  'начинать',
  'артиллерийский',
  'борьба',
  'с',
  'крепость']]

In [13]:
#!g1.1
import json

test_data = pd.read_csv('test_data.csv', encoding='utf-8')

test_data['test_stem_rus'] = test_data['test_stem_rus'].apply(lambda sent: sent.strip('[]\'').split('\', \''))
test_data['test_rsl'] = test_data['test_rsl'].apply(lambda sent: sent.strip('[]\'').split('\', \''))
test_data['test_gram_rus'] = test_data['test_gram_rus'].apply(lambda sent: json.loads(sent.replace('\'', '\"')))

### Train model

In [14]:
#!g1.1
voc_rus = VocabularyVectorizer(phrase_border=True)
bin_gram = GramBinarizer(phrase_border=True)
voc_rsl = VocabularyVectorizer(phrase_border=True)

In [15]:
#!g1.1
voc_rus = voc_rus.fit(stem_sentences_rus)
bin_gram = bin_gram.fit(gram_sentences_rus)
voc_rsl = voc_rsl.fit(list(generated_data['rsl']))

In [16]:
#!g1.1
# Save\load vocabs
import pickle

def save_vocab(vocab, path):
    output = open(path, 'wb')
    pickle.dump(vocab, output)
    output.close()
    
def load_vocab(path):
    output = open(path, 'rb')
    vocab = pickle.load(output)
    output.close()
    return vocab

In [17]:
#!g1.1
save_vocab(voc_rus, 'voc_rus4.pkl')
save_vocab(bin_gram, 'bin_gram4.pkl')
save_vocab(voc_rsl, 'voc_rsl4.pkl')

In [None]:
#!g1.1
# Load vocabs
#voc_rus = load_vocab('voc_rus2.pkl')
#bin_gram = load_vocab('bin_gram2.pkl')
#voc_rsl = load_vocab('voc_rsl2.pkl')

In [18]:
#!g1.1
voc_rus.index_to_text([[5]])

[['бой']]

In [19]:
#!g1.1
vec_sentences_rus_train = voc_rus.text_to_index(stem_sentences_rus)
vec_gram_train = bin_gram.transform(gram_sentences_rus)
vec_sentences_rsl_train = voc_rsl.text_to_index(list(generated_data['rsl']))

In [20]:
#!g1.1
words = [w for s in list(generated_data['rsl']) for w in s]

### Net

In [21]:
#!g1.1
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.nn import functional as F

from model.model import Translator

In [22]:
#!g1.1
def set_seed_everywhere(seed, cuda):
    np.random.seed(seed)
    torch.manual_seed(seed)
    if cuda:
        torch.cuda.manual_seed_all(seed)

def handle_dirs(dirpath):
    if not os.path.exists(dirpath):
        os.makedirs(dirpath)

def make_train_state(args):
    return {'stop_early': False,
            'early_stopping_step': 0,
            'early_stopping_best_val': 1e8,
            'learning_rate': args.learning_rate,
            'epoch_index': 0,
            'train_loss': [],
            'train_acc': [],
            'val_loss': [],
            'val_acc': [],
            'test_loss': -1,
            'test_acc': -1,
            'model_filename': args.model_state_file}

def update_train_state(args, model, train_state):
    """Handle the training state updates.
    Components:
     - Early Stopping: Prevent overfitting.
     - Model Checkpoint: Model is saved if the model is better
    
    :param args: main arguments
    :param model: model to train
    :param train_state: a dictionary representing the training state values
    :returns:
        a new train_state
    """

    # Save one model at least
    if train_state['epoch_index'] == 0:
        torch.save(model.state_dict(), train_state['model_filename'])
        train_state['stop_early'] = False

    # Save model if performance improved
    elif train_state['epoch_index'] >= 1:
        loss_tm1, loss_t = train_state['val_loss'][-2:]
         
        # If loss worsened
        if loss_t >= loss_tm1:
            # Update step
            train_state['early_stopping_step'] += 1
        # Loss decreased
        else:
            # Save the best model
            if loss_t < train_state['early_stopping_best_val']:
                torch.save(model.state_dict(), train_state['model_filename'])
                train_state['early_stopping_best_val'] = loss_t

            # Reset early stopping step
            train_state['early_stopping_step'] = 0

        # Stop early ?
        train_state['stop_early'] = \
            train_state['early_stopping_step'] >= args.early_stopping_criteria

    return train_state

def normalize_sizes(y_pred, y_true):
    """Normalize tensor sizes
    
    Args:
        y_pred (torch.Tensor): the output of the model
            If a 3-dimensional tensor, reshapes to a matrix
        y_true (torch.Tensor): the target predictions
            If a matrix, reshapes to be a vector
    """
    if len(y_pred.size()) == 3:
        y_pred = y_pred.contiguous().view(-1, y_pred.size(2))
    if len(y_true.size()) == 2:
        y_true = y_true.contiguous().view(-1)
    return y_pred, y_true

def compute_accuracy(y_pred, y_true, mask_index):
    y_pred, y_true = normalize_sizes(y_pred, y_true)

    _, y_pred_indices = y_pred.max(dim=1)
    
    correct_indices = torch.eq(y_pred_indices, y_true).float()
    valid_indices = torch.ne(y_true, mask_index).float()
    
    n_correct = (correct_indices * valid_indices).sum().item()
    n_valid = valid_indices.sum().item()

    return n_correct / n_valid * 100

def sequence_loss(y_pred, y_true, mask_index):
    y_pred, y_true = normalize_sizes(y_pred, y_true)
    return F.cross_entropy(y_pred, y_true, ignore_index=mask_index)

In [23]:
#!g1.1
import random 

def batch_generator(rus_data, rsl_data, batch_size=32):
    rus_data = np.array(rus_data)
    rsl_data = np.array(rsl_data)
    
    data_length = len(rus_data)
    tail_length = batch_size - data_length % batch_size
    index = list(range(data_length))
    random.shuffle(index)
    
    index = np.array(index + random.choices(index, k=tail_length))
    num_batches = len(index) // batch_size
    index = index.reshape((num_batches, batch_size))
    
    for batch_ind, inds in enumerate(tqdm(index)):
        yield batch_ind, torch.tensor(rus_data[inds]), torch.tensor(rsl_data[inds])

In [31]:
#!g1.1
from argparse import Namespace
from model.model import device

args = Namespace(
    seed = 1337,
    learning_rate = 5e-4, 
    batch_size = 64,  # 64
    num_epochs = 30,  # 30
    rus_emb_size = 16,  # 16
    rsl_emb_size = 16,  # 16
    rnn_size = 64,  # 64
    early_stopping_criteria = 5,
    mask_index = voc_rsl.mask_ind,
    max_norm = 2.0,
    norm_type = 2
)

set_seed_everywhere(args.seed, torch.cuda.is_available())

model_zero = Translator(voc_rus.word_count, args.rus_emb_size, voc_rsl.word_count, args.rsl_emb_size, args.rnn_size, voc_rsl.bos_ind)

optimizer = optim.Adam(model_zero.parameters(), lr=args.learning_rate)

In [32]:
#!g1.1
print(device)

cuda


In [33]:
#!g1.1
model_zero.to(device)

Translator(
  (encoder): TranslatorEncoder(
    (rus_embeddings): Embedding(50350, 16, max_norm=1.0)
    (rus_birnn): GRU(16, 64, batch_first=True, bidirectional=True)
  )
  (decoder): TranslatorDecoder(
    (rsl_embedding): Embedding(50757, 16, max_norm=1.0)
    (gru_cell): GRUCell(144, 128)
    (linear_map): Linear(in_features=128, out_features=128, bias=True)
    (classifier): Linear(in_features=256, out_features=50757, bias=True)
  )
)

In [34]:
#!g1.1
# train

for epoch_index in range(args.num_epochs):
    # sample_probability = (10 + epoch_index) / args.num_epochs
    if epoch_index < 0.5 * args.num_epochs:
        sample_probability = 0.05
    else:
        sample_probability = ( 2 * (epoch_index+1) - args.num_epochs) / args.num_epochs
    
    running_loss = 0.0
    running_acc = 0.0
    model_zero.train()
    
    for batch_ind, rus_batch, rsl_batch in batch_generator(vec_sentences_rus_train, vec_sentences_rsl_train, args.batch_size):
        optimizer.zero_grad()
        
        rus_batch = rus_batch.to(device)
        rsl_batch = rsl_batch.to(device)
        
        y_pred = model_zero(rus_batch, rsl_batch, sample_probability)  # 0.0
        y_pred = y_pred.to(device)
        
        loss = sequence_loss(y_pred, rsl_batch, args.mask_index)
        
        loss.backward()
        
        #nn.utils.clip_grad_value_(model_zero.parameters(), clip_value=1.0)
        nn.utils.clip_grad_norm_(model_zero.parameters(), args.max_norm, args.norm_type)
        
        optimizer.step()
        
        running_loss += (loss.item() - running_loss) / (batch_ind + 1)
        acc_t = compute_accuracy(y_pred, rsl_batch, args.mask_index)
        
    print('Epoch: ', epoch_index, 'Loss: ', running_loss, '\tacc: ', acc_t, '\tsample_prob: ', sample_probability)

100%|██████████| 1183/1183 [09:43<00:00,  2.03it/s]


Epoch:  0 Loss:  5.989664720461114 	acc:  55.86264656616415 	sample_prob:  0.05


100%|██████████| 1183/1183 [09:44<00:00,  2.03it/s]


Epoch:  1 Loss:  2.8882254588835736 	acc:  81.5068493150685 	sample_prob:  0.05


100%|██████████| 1183/1183 [09:43<00:00,  2.03it/s]


Epoch:  2 Loss:  1.7003645707970434 	acc:  82.72921108742004 	sample_prob:  0.05


100%|██████████| 1183/1183 [09:43<00:00,  2.03it/s]


Epoch:  3 Loss:  1.2858160284748557 	acc:  94.59053343350864 	sample_prob:  0.05


100%|██████████| 1183/1183 [09:42<00:00,  2.03it/s]


Epoch:  4 Loss:  1.0655821849255886 	acc:  86.29518072289156 	sample_prob:  0.05


100%|██████████| 1183/1183 [09:44<00:00,  2.03it/s]


Epoch:  5 Loss:  0.9350797632742213 	acc:  92.58987527512839 	sample_prob:  0.05


100%|██████████| 1183/1183 [09:42<00:00,  2.03it/s]


Epoch:  6 Loss:  0.8421056348695054 	acc:  93.93258426966293 	sample_prob:  0.05


100%|██████████| 1183/1183 [09:43<00:00,  2.03it/s]


Epoch:  7 Loss:  0.7732615658189914 	acc:  92.62711864406779 	sample_prob:  0.05


100%|██████████| 1183/1183 [09:44<00:00,  2.03it/s]


Epoch:  8 Loss:  0.7029077553789386 	acc:  90.04707464694015 	sample_prob:  0.05


100%|██████████| 1183/1183 [09:45<00:00,  2.02it/s]


Epoch:  9 Loss:  0.680064434747725 	acc:  96.79888656924147 	sample_prob:  0.05


100%|██████████| 1183/1183 [09:44<00:00,  2.02it/s]


Epoch:  10 Loss:  0.5969116804036463 	acc:  93.61856417693981 	sample_prob:  0.05


100%|██████████| 1183/1183 [09:45<00:00,  2.02it/s]


Epoch:  11 Loss:  0.5850044311113205 	acc:  90.73668854850474 	sample_prob:  0.05


100%|██████████| 1183/1183 [09:43<00:00,  2.03it/s]


Epoch:  12 Loss:  0.5316171836759815 	acc:  99.57924263674614 	sample_prob:  0.05


100%|██████████| 1183/1183 [09:42<00:00,  2.03it/s]


Epoch:  13 Loss:  0.5124693227426438 	acc:  99.16142557651992 	sample_prob:  0.05


100%|██████████| 1183/1183 [09:43<00:00,  2.03it/s]


Epoch:  14 Loss:  0.5237196948558771 	acc:  98.93148962916405 	sample_prob:  0.05


100%|██████████| 1183/1183 [09:46<00:00,  2.02it/s]


Epoch:  15 Loss:  0.6017869923984914 	acc:  92.32522796352583 	sample_prob:  0.06666666666666667


100%|██████████| 1183/1183 [09:58<00:00,  1.98it/s]


Epoch:  16 Loss:  1.153186984530263 	acc:  87.1111111111111 	sample_prob:  0.13333333333333333


100%|██████████| 1183/1183 [10:08<00:00,  1.94it/s]


Epoch:  17 Loss:  1.6358763048408018 	acc:  77.86946736684172 	sample_prob:  0.2


100%|██████████| 1183/1183 [10:18<00:00,  1.91it/s]


Epoch:  18 Loss:  2.182811187317285 	acc:  71.83641975308642 	sample_prob:  0.26666666666666666


100%|██████████| 1183/1183 [10:29<00:00,  1.88it/s]


Epoch:  19 Loss:  2.5814182330064614 	acc:  68.82591093117408 	sample_prob:  0.3333333333333333


100%|██████████| 1183/1183 [10:41<00:00,  1.84it/s]


Epoch:  20 Loss:  3.0426389950871524 	acc:  76.70850767085076 	sample_prob:  0.4


100%|██████████| 1183/1183 [10:51<00:00,  1.82it/s]


Epoch:  21 Loss:  3.4505207538604754 	acc:  65.21136521136522 	sample_prob:  0.4666666666666667


100%|██████████| 1183/1183 [11:02<00:00,  1.78it/s]


Epoch:  22 Loss:  3.8756305834850027 	acc:  37.23978411719352 	sample_prob:  0.5333333333333333


100%|██████████| 1183/1183 [11:12<00:00,  1.76it/s]


Epoch:  23 Loss:  4.251237797616202 	acc:  44.868735083532215 	sample_prob:  0.6


100%|██████████| 1183/1183 [11:21<00:00,  1.73it/s]


Epoch:  24 Loss:  4.631076355053021 	acc:  41.356184798807746 	sample_prob:  0.6666666666666666


100%|██████████| 1183/1183 [11:32<00:00,  1.71it/s]


Epoch:  25 Loss:  4.979520787249559 	acc:  31.978931527464262 	sample_prob:  0.7333333333333333


100%|██████████| 1183/1183 [11:42<00:00,  1.68it/s]


Epoch:  26 Loss:  5.2878783111943095 	acc:  38.8927820602663 	sample_prob:  0.8


100%|██████████| 1183/1183 [11:53<00:00,  1.66it/s]


Epoch:  27 Loss:  5.575650962046015 	acc:  18.550106609808104 	sample_prob:  0.8666666666666667


100%|██████████| 1183/1183 [12:02<00:00,  1.64it/s]


Epoch:  28 Loss:  5.837788695112089 	acc:  14.50381679389313 	sample_prob:  0.9333333333333333


100%|██████████| 1183/1183 [12:13<00:00,  1.61it/s]


Epoch:  29 Loss:  5.921336669389817 	acc:  15.438324282389448 	sample_prob:  1.0


When sample probability started to grow, the model accuracy started to drop dramatically. Probably it's because this word order is totally random, and when the sample probability was super low, it overfit quickly, as soon as it started to rise, the model got confused with the random word order.

In [35]:
#!g1.1
torch.save(model_zero.state_dict(), "model30_gen_task_agnostic.pth")

In [36]:
#!g1.1 
model_zero.eval()

Translator(
  (encoder): TranslatorEncoder(
    (rus_embeddings): Embedding(50350, 16, max_norm=1.0)
    (rus_birnn): GRU(16, 64, batch_first=True, bidirectional=True)
  )
  (decoder): TranslatorDecoder(
    (rsl_embedding): Embedding(50757, 16, max_norm=1.0)
    (gru_cell): GRUCell(144, 128)
    (linear_map): Linear(in_features=128, out_features=128, bias=True)
    (classifier): Linear(in_features=256, out_features=50757, bias=True)
  )
)

In [37]:
#!g1.1
import re


regex = re.compile(r'(?:<bos>|<eos>.*)')
def pred_vs_rsl(tensor, true_rsl, rus): 
    
    tensor = tensor.argmax(2).tolist()
    
    pred_rsl = voc_rsl.index_to_text(tensor)
    true_rsl = voc_rsl.index_to_text(true_rsl)
    rus = voc_rus.index_to_text(rus)
                   
    f = lambda x: regex.sub('', " ".join(x))
                   
    pred_rsl = [f(sentence) for sentence in pred_rsl]
    true_rsl = [f(sentence) for sentence in true_rsl]
    true_rus = [f(sentence) for sentence in rus]
    
    return pred_rsl, true_rsl, true_rus

In [38]:
#!g1.1
vec_sentences_rus_test = voc_rus.text_to_index(list(test_data['test_stem_rus']))
vec_gram_test = bin_gram.transform(list(test_data['test_gram_rus']))
vec_sentences_rsl_test = voc_rsl.text_to_index(list(test_data['test_rsl']))

In [None]:
#!g1.1
rus = torch.tensor(np.array(vec_sentences_rus_test)).to(device)
rsl = torch.tensor(np.array(vec_sentences_rsl_test)).to(device)
y_pred = model_zero(rus,
                    rsl, 1).to_device  #0
trans, truth_rsl, rus = pred_vs_rsl(y_pred,
                                    rsl,
                                    rus)

RuntimeError: CUDA out of memory. Tried to allocate 140.00 MiB (GPU 0; 31.75 GiB total capacity; 30.23 GiB already allocated; 93.50 MiB free; 30.42 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [42]:
#!g1.1
voc_rus.word_count, voc_rsl.word_count

(455, 460)

In [43]:
#!g1.1
for i in range(10):
    print("RSL: ", truth_rsl[i])
    print("TRANS: ", trans[i])
    print("RUS: ", rus[i])
    print('\n')

RSL:   <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> приходить <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> 
TRANS:   и 
RUS:   а когда <unk> <unk> <unk> <unk> <unk> <unk> <unk> приходить к <unk> и сказать а <unk> то <unk> в <unk> <unk> <unk> <unk> 


RSL:   <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> 
TRANS:   и 
RUS:   а <unk> <unk> <unk> весь <unk> 


RSL:    
TRANS:   
RUS:   <unk> <unk> <unk> что <unk> быть в <unk> год у мы <unk> быть <unk> <unk> <unk> и это <unk> <unk> <unk> в себя в то <unk> и <unk> <unk> <unk> <unk> в другой <unk> 


RSL:    
TRANS:    
RUS:   <unk> <unk> <unk> <unk> <unk> свой <unk> по <unk> <unk> <unk> <unk> 


RSL:   <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> свой <unk> <unk> и <unk> <unk> 
TRANS:    
RUS:   <unk> <unk> хотеть <unk> свой <unk> <unk> и начинать <unk> 


RSL:   <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> 
TRANS:    
RUS:   <unk> <unk> это <unk> в <unk> 


RSL:   <unk> <unk

In [44]:
#!g1.1
import sacrebleu

rus_rsl_bleu = sacrebleu.corpus_bleu(trans, [truth_rsl])
print("--------------------------")
print("Russian to RSL: ", rus_rsl_bleu.score)

--------------------------
Russian to RSL:  0.0


In [45]:
#!g1.1
# save results to file
result_data = pd.DataFrame.from_dict({"truth RSL": truth_rsl, "TRANS": trans, "RUS": rus})
result_data.to_csv('test_result_task_agnostic_seq2seq.csv', index=False)  