In [None]:
import random
import numpy as np
import torch
import pandas as pd
from tqdm.notebook import tqdm
import nltk
from torch.utils.data import Dataset, random_split
import re
# from spellchecker import SpellChecker
from nltk import word_tokenize, sent_tokenize
import string
from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split, TimeSeriesSplit
from collections import Counter, defaultdict
import torch.nn as nn
import copy

In [None]:
# Seed every random number generator used in this notebook with the same fixed
# value: Python's `random`, NumPy's global RNG, PyTorch on CPU and, when CUDA is
# available, PyTorch on all CUDA devices.
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(42)

In [None]:
def decontracted(phrase):
    """Expand common English contractions inside ``phrase``.

    The substitutions are applied one after another, in the listed order, to
    the whole string. The two irregular forms ("won't", "can't") come first so
    that the generic "n't" rule does not split them incorrectly.

    Args:
        phrase: text to process.

    Returns:
        The text with every matched contraction replaced by its expansion.
    """
    expansions = (
        # irregular forms
        (r"won\'t", "will not"),
        (r"can\'t", "can not"),
        # generic suffixes
        (r"n\'t", " not"),
        (r"\'re", " are"),
        (r"\'s", " is"),
        (r"\'d", " would"),
        (r"\'ll", " will"),
        (r"\'t", " not"),
        (r"\'ve", " have"),
        (r"\'m", " am"),
    )
    for pattern, replacement in expansions:
        phrase = re.sub(pattern, replacement, phrase)
    return phrase

In [None]:
def correct_spellings(text):
    """Replace words the spell checker does not know with its suggested correction.

    The text is split on whitespace; every word reported by
    ``SpellChecker.unknown`` is passed to ``SpellChecker.correction``. When the
    checker has no suggestion (``correction`` returns ``None``) the original
    word is kept, so the result can always be joined back into a string.

    NOTE: ``SpellChecker`` must be in scope when this function is called; the
    notebook's import of it is currently commented out.

    Args:
        text: whitespace-separated text.

    Returns:
        The words joined by single spaces, with corrections applied.
    """
    spell = SpellChecker()
    words = text.split()
    misspelled_words = spell.unknown(words)
    corrected_text = []
    for word in words:
        if word in misspelled_words:
            suggestion = spell.correction(word)
            # No candidate found: fall back to the word as written.
            corrected_text.append(word if suggestion is None else suggestion)
        else:
            corrected_text.append(word)
    return " ".join(corrected_text)

In [None]:
# Special vocabulary tokens: PAD fills n-gram contexts and padded batches,
# EOS is appended to every tokenized sentence, UNK stands in for tokens that
# are missing from the vocabulary.
PAD = "<PAD>"
EOS = "<EOS>"
UNK = "<UNK>"
# Download the NLTK 'punkt' data package (see the download log below).
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
def tokenize_adv(sent):
  """Normalize a sentence and split it into word tokens.

  Steps, in order: lower-case, expand contractions with ``decontracted``,
  delete every character that is neither a word character nor whitespace,
  delete the characters of ``string.punctuation`` (after the regex step this
  can only still affect the underscore, which ``\\w`` keeps), then tokenize
  with NLTK's ``word_tokenize``. Spell correction is intentionally not applied.

  Args:
      sent: raw sentence text.

  Returns:
      List of token strings.
  """
  expanded = decontracted(sent.lower())
  without_symbols = re.sub(r'[^\w\s]', '', expanded)
  punctuation_eraser = str.maketrans('', '', string.punctuation)
  cleaned = without_symbols.translate(punctuation_eraser)
  return word_tokenize(cleaned)

In [None]:
# Download shakespeare_input.txt (from Karpathy's char-rnn page) into the current working directory.
!wget https://cs.stanford.edu/people/karpathy/char-rnn/shakespeare_input.txt

--2021-02-09 10:22:12--  https://cs.stanford.edu/people/karpathy/char-rnn/shakespeare_input.txt
Resolving cs.stanford.edu (cs.stanford.edu)... 171.64.64.64
Connecting to cs.stanford.edu (cs.stanford.edu)|171.64.64.64|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4573338 (4.4M) [text/plain]
Saving to: ‘shakespeare_input.txt’


2021-02-09 10:22:12 (11.3 MB/s) - ‘shakespeare_input.txt’ saved [4573338/4573338]



In [None]:
# Read the whole downloaded corpus into memory as one lower-cased string.
with open('shakespeare_input.txt', 'r') as f:
    data = f.read().lower()

In [None]:
# Split the corpus into chunks on blank lines.
# NOTE: this value is not used as-is; the next cell rebuilds `sonnets` from `data` with extra cleaning.
sonnets = data.split('\n\n')

In [None]:
# Rebuild `sonnets` from the blank-line-separated chunks of the corpus.
# For each chunk: drop any run of word/whitespace characters that ends in
# ":" + newline (presumably the speaker labels of the play text), then turn
# the remaining newlines into spaces so each chunk is a single line.
sonnets = [
  re.sub(r'\n', ' ', re.sub(r'[\s\w]*:\n', '', chunk.lower()))
  for chunk in data.split('\n\n')
]

In [None]:
# Split every chunk into sentences and collect them in one flat list.
# `extend` appends in place; the previous `sentences = sentences + sent`
# re-copied the whole accumulated list on every iteration (quadratic time).
sentences = []
for son in sonnets:
  sentences.extend(sent_tokenize(son))

In [None]:
# Tokenize every sentence and terminate it with the EOS marker.
# No length filter is applied here, so one- and two-word sentences are kept.
tokens = [tokenize_adv(sentence) + [EOS] for sentence in sentences]

In [None]:
# train,test = tokens[:np.ceil(len(tokens) * 0.8).astype(int)],tokens[np.ceil(len(tokens) * 0.8).astype(int):]
# Hold out 20% of the tokenized sentences as the test set. shuffle=False keeps
# the sentences in corpus order, so the split is a head/tail cut.
trainval, test = train_test_split(tokens, test_size=0.2, random_state=42, shuffle=False)

In [None]:
# Split the remaining data again, 80/20 and unshuffled, into train and validation sets.
train, val = train_test_split(trainval, test_size=0.2, random_state=42, shuffle=False)

In [None]:
# Remember the split sizes of the unfiltered data for the later comparison.
len_tokens = {'train':len(train), 'val':len(val), 'test':len(test)}

In [None]:
# Report the split sizes for the data that still contains one- and two-word sentences.
print('Lenght of train with 1/2-words sentences {0} \nLenght of val  with 1/2-words sentences {1} \nLenght of test  with 1/2-words sentences {2} \n'.format(len(train), len(val), len(test)))

Lenght of train with 1/2-words sentences 34038 
Lenght of val  with 1/2-words sentences 8510 
Lenght of test  with 1/2-words sentences 10638 



In [None]:
# Rebuild the token lists WITHOUT very short sentences and redo both splits.
# NOTE: this overwrites `tokens`, `trainval`, `train`, `val` and `test`; every
# later cell works on the filtered data.
tokens=[]
for sent in sentences:
  words = tokenize_adv(sent) + [EOS]
  # `words` includes the EOS marker, so this keeps sentences with at least 3 real tokens.
  if len(words)>3:
    tokens.append(words)
trainval, test = train_test_split(tokens, test_size=0.2, random_state=42, shuffle=False)
train, val = train_test_split(trainval, test_size=0.2, random_state=42, shuffle=False)
# Label corrected: these sizes describe the data without 1/2-word sentences
# (the message previously said "with", the same text as the unfiltered report).
print('Lenght of train without 1/2-words sentences {0} \nLenght of val  without 1/2-words sentences {1} \nLenght of test  without 1/2-words sentences {2} \n'.format(len(train), len(val), len(test)))

Lenght of train with 1/2-words sentences 31254 
Lenght of val  with 1/2-words sentences 7814 
Lenght of test  with 1/2-words sentences 9767 



In [None]:
# Split sizes after the short-sentence filter.
len_tokens_wo_1_2 = {'train':len(train), 'val':len(val), 'test':len(test)}

In [None]:
# Number of sentences removed by the short-sentence filter, summed over the three splits.
diff = sum(len_tokens[split] - len_tokens_wo_1_2[split] for split in ('train', 'val', 'test'))

In [None]:
# Fraction of all sentences (train + val + test, before filtering) that the filter removed.
diff/(sum(len_tokens.values()))

0.08180724250742677

Таким образом, потери данных при отбрасывании одно- и двухсловных предложений составляют 4351 предложение, или примерно 8%.

В принципе, в обоих случаях можно обойтись только тестом и трейном. Но для обеспечения более непредвзятой оценки выделим ещё и валидационный датасет. Для n-gram модели он практически не имеет смысла, но всё равно посмотрим оценки и для него.

Также следует сказать, что для сравнения моделей обучать их нужно на одинаковых датасетах, поэтому n-gram модель тоже следует обучать только на трейне.

In [None]:
# create integer-to-token mapping
# Ids 0 and 1 are reserved for PAD and UNK; the remaining ids go to the tokens
# seen in `train`. The vocabulary is sorted before numbering: iterating a bare
# `set` of strings yields a different order in every Python process (string
# hash randomization), which made token ids differ between kernel restarts.
train_vocab = sorted(set(tok for token in train for tok in token))
int2token = dict(enumerate([PAD, UNK] + train_vocab))
cnt = len(int2token)

# create token-to-integer mapping
token2int = {t: i for i, t in int2token.items()}

Количественная величина, которая позволяет сравнивать языковые модели (LM), — перплексия. Для её вычисления используется следующая формула:

$$
\text{Ppr} = \exp\left(-\frac{1}{|D|} \sum_{t \in D}\sum_{w \in t} \log p(w)\right),
$$
где $D$ – валидационный датасет, $|D|$ – общая длина текстов.

# NGram

In [None]:
class NGramModel(object):
  """Count-based n-gram language model with add-k smoothing and a unigram fallback.

  ``self.ngrams`` maps a context (a tuple of the ``n - 1`` preceding tokens)
  to a Counter of the tokens that followed it. Overall token counts are kept
  in the same mapping under the plain string key ``UNK`` (the original
  ``(UNK)`` spelling was just that string, not a tuple) and serve as the
  fallback distribution for contexts never seen in training.

  Args:
      n: order of the model; each token is predicted from ``n - 1`` predecessors.
      k: constant added to every vocabulary token's count when the context was seen.
  """

  def __init__(self,n=2,k=0.5):
    self.ngrams = defaultdict(Counter)
    self.n = n
    # Counter {token: k} over the training vocabulary; built by compute_ngrams.
    self.vocab_ksmooth = None
    self.k = k

  def _context(self, tokens):
    """Return the last ``n - 1`` tokens as a tuple, left-padded with PAD."""
    size = self.n - 1
    if size <= 0:
      # A unigram model has an empty context (a `[-0:]` slice would keep everything).
      return ()
    recent = list(tokens)[-size:]
    return tuple([PAD] * (size - len(recent)) + recent)

  def compute_ngrams(self,dataset):
    """Count n-grams over ``dataset`` (an iterable of token lists), replacing any previous counts."""
    self.ngrams = defaultdict(Counter)
    for sent in tqdm(dataset):
      # Sliding window; starts as all-PAD so the first tokens get padded contexts.
      ngram = [PAD] * self.n
      for token in sent:
        ngram[:-1] = ngram[1:]
        ngram[-1] = token
        self.ngrams[tuple(ngram[:-1])][token] += 1
        self.ngrams[UNK][token] += 1
    self.vocab_ksmooth = Counter(dict.fromkeys(self.ngrams[UNK], self.k))

  def get_probs(self,tokens):
    """Return ``{token: probability}`` for the token following ``tokens``.

    If the context was seen in training, its counts plus ``k`` for every
    vocabulary token are normalized; otherwise the overall token counts are
    normalized. Lookups use ``.get`` so that querying an unseen context does
    not insert an empty entry into ``self.ngrams`` (plain indexing of the
    defaultdict grew the table on every miss).
    """
    seen = self.ngrams.get(self._context(tokens))
    if seen:
      possible_predicts = seen + self.vocab_ksmooth
    else:
      possible_predicts = self.ngrams.get(UNK, Counter())
    sum_freq = sum(possible_predicts.values())
    return {tok: possible_predicts[tok] / sum_freq for tok in possible_predicts}

  def sample(self, prefix):
    """Tokenize the text ``prefix`` and draw one next token; returns EOS if the model has no counts."""
    tokens = tokenize_adv(prefix)
    possible_predicts = self.get_probs(tokens)
    if len(possible_predicts) > 0:
        end = np.random.choice(list(possible_predicts.keys()), p=(list(possible_predicts.values())))
        return str(end)
    return EOS

  def generate_text(self, prefix, length=100):
    """Extend ``prefix`` token by token until EOS is drawn.

    Note that ``length`` is compared with ``len(text)``, i.e. it bounds the
    number of characters of the generated string, not the number of tokens.
    """
    text = "" + prefix
    while len(text) < length:
        token = self.sample(text)
        text += " " + token
        if token == EOS:
            break
    return text

  def perpelexity_ngram(self, data, unseen_prob=0.0001):
    """Compute perplexity of the model on ``data`` (an iterable of token lists).

    Args:
        data: tokenized sentences.
        unseen_prob: probability assigned to a token that is absent from the
            predicted distribution (previously the hard-coded 0.0001).

    Returns:
        ``exp(-sum(log p) / number_of_tokens)``.

    Raises:
        ValueError: if ``data`` contains no tokens.
    """
    lengths = 0
    log_prob = 0
    for row in tqdm(data):
        lengths += len(row)
        ngram = [PAD] * self.n
        for token in row:
            ngram[:-1] = ngram[1:]
            ngram[-1] = token
            log_prob += np.log(self.get_probs(ngram[:-1]).get(ngram[-1], unseen_prob))
    if lengths == 0:
        raise ValueError("cannot compute perplexity on data without tokens")
    return np.exp(-log_prob / lengths)

In [None]:
# n=4: every token is predicted from the 3 tokens that precede it.
treegram = NGramModel(4)

In [None]:
# Count the n-grams on the training split only.
treegram.compute_ngrams(train)

HBox(children=(FloatProgress(value=0.0, max=31254.0), HTML(value='')))





In [None]:
# Sample a continuation of the given prefix from the n-gram model.
treegram.generate_text('how ill agrees')

'how ill agrees an scrupulous boy from the the <EOS>'

In [None]:
# The model order is read from the model itself; the label used to hard-code
# "3-gram" although `treegram` is built with n=4.
print('Perpelexity for {}-gram model on validation set - {}'.format(treegram.n, treegram.perpelexity_ngram(val)))

HBox(children=(FloatProgress(value=0.0, max=7814.0), HTML(value='')))


Perpelexity for 3-gram model on validation set - 1247.3379081405071


In [None]:
# The model order is read from the model itself; the label used to hard-code
# "3-gram" although `treegram` is built with n=4.
print('Perpelexity for {}-gram model on test set -  {}'.format(treegram.n, treegram.perpelexity_ngram(test)))

HBox(children=(FloatProgress(value=0.0, max=9767.0), HTML(value='')))


Perpelexity for 3-gram model on test set -  1294.9876736380197


In [None]:
# Same evaluation for a higher-order model (n=5, i.e. 4 tokens of context).
# The label now reports the real order; it used to say "3-gram" here as well.
model = NGramModel(5)
model.compute_ngrams(train)
print('Perpelexity for {}-gram model on validation set - {}'.format(model.n, model.perpelexity_ngram(val)))
print('Perpelexity for {}-gram model on test set - {}'.format(model.n, model.perpelexity_ngram(test)))

HBox(children=(FloatProgress(value=0.0, max=31254.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=7814.0), HTML(value='')))


Perpelexity for 3-gram model on validation set - 852.6655317730408


HBox(children=(FloatProgress(value=0.0, max=9767.0), HTML(value='')))


Perpelexity for 3-gram model on test set - 887.1047522316857


# NN

In [None]:
# create integer-to-token mapping
# Ids 0 and 1 are reserved for PAD and UNK; the remaining ids go to the tokens
# seen in `train`. The vocabulary is sorted before numbering: iterating a bare
# `set` of strings yields a different order in every Python process (string
# hash randomization), so ids - and therefore saved weights and results -
# would not be reproducible across kernel restarts.
train_vocab = sorted(set(tok for token in train for tok in token))
int2token = dict(enumerate([PAD, UNK] + train_vocab))
cnt = len(int2token)

# create token-to-integer mapping
token2int = {t: i for i, t in int2token.items()}

In [None]:
def get_integer_seq(seq):
  """Convert a sequence of tokens into a list of vocabulary ids.

  Tokens missing from ``token2int`` are mapped to the id of ``UNK``.
  """
  return [token2int[w] if w in token2int else token2int[UNK] for w in seq]

In [None]:
def batch_generator(train_x, train_y, batch_size, shuffle=True, seed=42):
    """Yield zero-padded ``(x, y)`` batches of exactly ``batch_size`` sequences.

    Args:
        train_x: indexable collection (list or array) of id sequences.
        train_y: target sequences aligned with ``train_x``; each row is
            expected to be no longer than the longest ``x`` row of its batch.
        batch_size: number of sequences per batch.
        shuffle: if True, visit samples in a pseudo-random order.
        seed: seed of that order; the same seed always gives the same order.

    Yields:
        Two ``torch.LongTensor`` of shape ``(batch_size, max_len)`` where
        ``max_len`` is the longest ``x`` row of the batch; shorter rows are
        right-padded with 0.

    Only full batches are produced: a trailing remainder smaller than
    ``batch_size`` is dropped.
    """
    n_samples = len(train_x)
    if shuffle:
        # A private RandomState gives the same permutation that
        # np.random.seed(seed) + np.random.permutation did, without resetting
        # the notebook-wide NumPy RNG as a side effect.
        perm = np.random.RandomState(seed).permutation(n_samples)
    else:
        perm = np.arange(n_samples)
    # `+ 1` keeps the last full batch when n_samples is an exact multiple of
    # batch_size; the previous range(batch_size, n_samples, batch_size) lost it.
    for start in range(0, n_samples - batch_size + 1, batch_size):
        batch_idx = perm[start:start + batch_size]
        # Element-wise lookup works for plain lists as well as object arrays.
        x = [train_x[j] for j in batch_idx]
        y = [train_y[j] for j in batch_idx]
        max_len = max(len(row) for row in x)
        input_embeds_x = np.zeros((len(x), max_len), dtype=np.int64)
        input_embeds_y = np.zeros((len(y), max_len), dtype=np.int64)
        for idx, row in enumerate(x):
            input_embeds_x[idx, :len(row)] = row
        for idx, row in enumerate(y):
            input_embeds_y[idx, :len(row)] = row
        yield torch.from_numpy(input_embeds_x), torch.from_numpy(input_embeds_y)

In [None]:
# Build next-token prediction pairs: the input is a sentence without its last
# token, the target is the same sentence shifted left by one token.
# The previous try/except was removed: its bare `except` printed an undefined
# name `i` (a NameError inside the handler) and `break` would have silently
# truncated the dataset. Any real error now surfaces immediately.
x_train = [get_integer_seq(sent[:-1]) for sent in train]
y_train = [get_integer_seq(sent[1:]) for sent in train]
x_val = [get_integer_seq(sent[:-1]) for sent in val]
y_val = [get_integer_seq(sent[1:]) for sent in val]

In [None]:
# The sentences have different lengths, so these are arrays of Python lists.
# dtype=object must be requested explicitly: without it NumPy emits the
# ragged-sequence deprecation warning shown below and, in newer releases,
# raises a ValueError instead.
x_train_int = np.array(x_train, dtype=object)
y_train_int = np.array(y_train, dtype=object)
x_val_int = np.array(x_val, dtype=object)
y_val_int = np.array(y_val, dtype=object)

  """Entry point for launching an IPython kernel.
  
  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.


In [None]:
class WordLSTM(nn.Module):
  """Word-level LSTM language model: embedding -> stacked LSTM -> dropout -> linear layer over the vocabulary.

  Args:
      vocab_size: number of rows of the embedding and of output logits
          (defaults to the size of ``token2int`` at class-definition time).
      n_hidden: hidden size of every LSTM layer.
      n_layers: number of stacked LSTM layers.
      drop_prob: dropout probability between LSTM layers and before the
          output layer.
      lr: stored on the module as ``self.lr``; not used inside the class.
  """

  def __init__(self,vocab_size=len(token2int), n_hidden=100, n_layers=4, drop_prob=0.3, lr=0.001):
    super().__init__()

    self.drop_prob = drop_prob
    self.n_layers = n_layers
    self.n_hidden = n_hidden
    self.lr = lr

    # 300-dimensional token embeddings feed the LSTM.
    self.emb_layer = nn.Embedding(vocab_size, 300)

    ## define the LSTM (inputs are batch-first: (batch, seq, feature))
    self.lstm = nn.LSTM(300, n_hidden, n_layers,
                        dropout=drop_prob, batch_first=True)

    ## define a dropout layer
    self.dropout = nn.Dropout(drop_prob)

    ## define the fully-connected layer
    self.fc = nn.Linear(n_hidden, vocab_size)

  def forward(self, x, hidden):
    """Run one forward pass.

    Args:
        x: LongTensor of token ids, batch-first.
        hidden: ``(h, c)`` LSTM state tuple.

    Returns:
        ``(logits, hidden)`` where ``logits`` has one row per input position
        (batch and time dimensions flattened together) and ``hidden`` is the
        updated LSTM state.
    """
    embedded = self.emb_layer(x)
    lstm_output, hidden = self.lstm(embedded, hidden)
    out = self.dropout(lstm_output)
    # Flatten (batch, seq, n_hidden) to (batch * seq, n_hidden) for the linear layer.
    out = out.contiguous().view(-1, self.n_hidden)
    out = self.fc(out)
    return out, hidden

  def init_hidden(self, batch_size):
    """Return a zero ``(h, c)`` state of shape ``(n_layers, batch_size, n_hidden)`` each.

    The tensors are created on the same device and with the same dtype as the
    model's parameters. The previous version called ``.cuda()`` whenever CUDA
    was available, which put the state on the GPU even for a CPU model.
    """
    weight = next(self.parameters())
    shape = (self.n_layers, batch_size, self.n_hidden)
    return (weight.new_zeros(shape), weight.new_zeros(shape))

  def sample(self, prefix, h=None, max_length=100):
    """Generate text that continues ``prefix``.

    Args:
        prefix: starting text; it is tokenized with ``tokenize_adv``.
        h: optional initial ``(h, c)`` state for batch size 1; zeros if None.
        max_length: generation stops once more than this many tokens exist.

    Returns:
        The prefix tokens and the generated tokens joined by spaces.

    Raises:
        ValueError: if ``prefix`` yields no tokens.
    """
    tokens = tokenize_adv(prefix)
    if not tokens:
      raise ValueError("prefix must contain at least one token")
    # Follow the model's own device instead of relying on a global `device`.
    model_device = next(self.parameters()).device
    step_input = torch.LongTensor(get_integer_seq(tokens)).unsqueeze(0).to(model_device)
    # Sample without dropout, then restore whatever mode the model was in.
    was_training = self.training
    self.eval()
    try:
      with torch.no_grad():
        if h is None:
          h = self.init_hidden(1)
        while True:
          output, h = self.forward(step_input, h)
          # float64 + renormalization: float32 softmax output can miss
          # np.random.choice's "probabilities sum to 1" tolerance.
          probs = torch.softmax(output[-1, :].cpu(), -1).numpy().astype(np.float64)
          probs /= probs.sum()
          next_id = int(np.random.choice(len(probs), p=probs))
          tokens += [int2token[next_id]]
          if tokens[-1] == EOS or len(tokens) > max_length:
            break
          # `h` already encodes everything fed so far, so only the new token
          # is fed next. Re-feeding the whole sequence together with the
          # carried state (as before) processed the context twice.
          step_input = torch.LongTensor([[next_id]]).to(model_device)
    finally:
      self.train(was_training)
    return " ".join(t.split("_")[0] for t in tokens)

In [None]:
def train_model(net, epochs=10, batch_size=32, lr=0.001, clip=1, print_every=32, shuffle=True, seed=42):
  """Train ``net`` on the notebook's training arrays and keep the best epoch.

  Reads ``x_train_int``/``y_train_int`` and ``x_val_int``/``y_val_int`` from
  the notebook namespace. After every epoch the model is evaluated on the
  validation arrays; validation perplexity is ``exp`` of the mean of the
  per-batch mean losses (PAD targets are ignored by the loss).

  Args:
      net: model exposing ``init_hidden(batch_size)`` and ``net(inputs, hidden)``.
      epochs: number of passes over the training data.
      batch_size: sequences per batch (training and validation).
      lr: Adam learning rate.
      clip: maximum gradient norm.
      print_every: print a progress line every this many training steps.
      shuffle, seed: forwarded to ``batch_generator`` for the training data.

  Returns:
      ``(best_model, hidden)``: a deep copy of the ``state_dict`` with the
      lowest validation perplexity and the hidden state at the end of that
      epoch's validation pass. If no epoch ran, the initial weights and a
      zero state are returned.

  Raises:
      ValueError: if the validation data yields no batch.
  """
  # optimizer
  opt = torch.optim.Adam(net.parameters(), lr=lr)
  # loss
  criterion = nn.CrossEntropyLoss(ignore_index=token2int[PAD])
  # Batches go to wherever the model lives, not to a global `device`.
  model_device = next(net.parameters()).device

  # Start with "nothing beaten yet" and a valid fallback checkpoint. The old
  # fixed threshold of 10000 left `best_model` unbound when no epoch got below it.
  val_pp = float("inf")
  best_model = copy.deepcopy(net.state_dict())
  hidden = net.init_hidden(batch_size)

  counter = 0

  for e in range(epochs):
    h = net.init_hidden(batch_size)
    net.train()
    for x, y in batch_generator(x_train_int, y_train_int, batch_size, shuffle=shuffle, seed=seed):
      counter+= 1
      inputs, targets = x.to(model_device), y.to(model_device)
      # Detach the carried state so backpropagation stops at the batch boundary.
      h = tuple([each.data for each in h])
      output, h = net(inputs, h)
      loss = criterion(output, targets.view(-1))
      net.zero_grad()
      loss.backward()
      nn.utils.clip_grad_norm_(net.parameters(), clip)
      opt.step()

      if counter % print_every == 0:
        print("Epoch: {}/{}...".format(e+1, epochs),
              "Step: {}...".format(counter))

    net.eval()
    valid_loss = 0
    n_iter = 0
    with torch.no_grad():
      # The hidden state left by the last training batch is carried into validation.
      for x, y in batch_generator(x_val_int, y_val_int, batch_size):
        inputs, targets = x.to(model_device), y.to(model_device)
        n_iter += 1
        prediction, h = net(inputs,h)
        valid_loss += criterion(prediction, targets.view(-1))
    if n_iter == 0:
      # Previously this surfaced as an undefined `valid_perpelexity`.
      raise ValueError("validation data yields no batch for batch_size={}".format(batch_size))
    valid_perpelexity = torch.exp(valid_loss / n_iter)
    if valid_perpelexity<=val_pp:
      val_pp=valid_perpelexity
      best_model, hidden = copy.deepcopy(net.state_dict()), copy.deepcopy(h)
    print(f"Valid Loss: {valid_loss / n_iter}, Valid Peprplexity: {valid_perpelexity}")
  return best_model, hidden

In [None]:
# Pick the compute device once; `device` is read as a global by later cells.
device = "cuda" if torch.cuda.is_available() else "cpu"
# Model with the default hyper-parameters of WordLSTM.
net = WordLSTM()

# push the model to GPU (avoid it if you are not using the GPU)
net.to(device)

# Show the layer structure.
print(net)

WordLSTM(
  (emb_layer): Embedding(22613, 300)
  (lstm): LSTM(300, 100, num_layers=4, batch_first=True, dropout=0.3)
  (dropout): Dropout(p=0.3, inplace=False)
  (fc): Linear(in_features=100, out_features=22613, bias=True)
)


In [None]:
# Baseline run: training batches are taken in corpus order (shuffle=False).
# `h` has batch dimension 64, so later evaluation cells must also use batches of 64.
best_model, h = train_model(net, batch_size = 64, epochs=20, print_every=256, shuffle=False)

Epoch: 1/20... Step: 256...
Valid Loss: 6.656827926635742, Valid Peprplexity: 778.0789184570312
Epoch: 2/20... Step: 512...
Epoch: 2/20... Step: 768...
Valid Loss: 6.461060523986816, Valid Peprplexity: 639.7391967773438
Epoch: 3/20... Step: 1024...
Epoch: 3/20... Step: 1280...
Valid Loss: 6.3397393226623535, Valid Peprplexity: 566.6485595703125
Epoch: 4/20... Step: 1536...
Epoch: 4/20... Step: 1792...
Valid Loss: 6.290966033935547, Valid Peprplexity: 539.6744384765625
Epoch: 5/20... Step: 2048...
Epoch: 5/20... Step: 2304...
Valid Loss: 6.247931480407715, Valid Peprplexity: 516.9424438476562
Epoch: 6/20... Step: 2560...
Epoch: 6/20... Step: 2816...
Valid Loss: 6.1983819007873535, Valid Peprplexity: 491.95233154296875
Epoch: 7/20... Step: 3072...
Epoch: 7/20... Step: 3328...
Valid Loss: 6.179967403411865, Valid Peprplexity: 482.9762268066406
Epoch: 8/20... Step: 3584...
Epoch: 8/20... Step: 3840...
Valid Loss: 6.173185348510742, Valid Peprplexity: 479.7117004394531
Epoch: 9/20... Step: 

In [None]:
# Build (input, target) id sequences for the test split, the same way as for
# train/val: the target is the input shifted left by one token.
# The bare `except: ... break` was removed so that a failure cannot silently
# truncate the test set.
x_test = [get_integer_seq(sent[:-1]) for sent in test]
y_test = [get_integer_seq(sent[1:]) for sent in test]

# Ragged rows need an explicit object dtype; without it NumPy warns (see the
# output below) and newer releases raise a ValueError.
x_test_int = np.array(x_test, dtype=object)
y_test_int = np.array(y_test, dtype=object)

  # This is added back by InteractiveShellApp.init_path()
  if sys.path[0] == '':


In [None]:
# Evaluate the best checkpoint of the unshuffled run on the test split.
net = WordLSTM()
net.load_state_dict(best_model)
net.to(device)
net.eval()
# Accumulates the per-batch mean losses; despite the name this is the TEST loss.
valid_loss = 0
n_iter = 0
criterion = nn.CrossEntropyLoss(ignore_index=token2int[PAD])
with torch.no_grad():
  # batch_generator is called with its defaults here (shuffle=True, seed=42).
  for x, y in batch_generator(x_test_int, y_test_int, 64):
    inputs, targets = x.to(device), y.to(device)
    n_iter += 1
    # `h` is the state returned by train_model and is carried from batch to batch.
    prediction, h = net(inputs,h)
    valid_loss += criterion(prediction, targets.view(-1))
# Perplexity = exp(mean of the per-batch mean losses).
print(f"Valid Loss: {valid_loss / n_iter}, Peprplexity nn for test set for case without permutation: {torch.exp(valid_loss / n_iter)}")

Valid Loss: 6.221892833709717, Peprplexity nn for test set for case without permutation: 503.6556701660156


In [None]:
# Train a fresh model with shuffled training batches (permutation seed 42),
# reload its best checkpoint and evaluate it on the test split.
net = WordLSTM()
net.to(device)
best_model, h = train_model(net, batch_size = 64, epochs=20, print_every=256, shuffle=True, seed=42)
net.load_state_dict(best_model)
net.to(device)
net.eval()
# Accumulates the per-batch mean losses; despite the name this is the TEST loss.
valid_loss = 0
n_iter = 0
criterion = nn.CrossEntropyLoss(ignore_index=token2int[PAD])
with torch.no_grad():
  for x, y in batch_generator(x_test_int, y_test_int, 64):
    inputs, targets = x.to(device), y.to(device)
    n_iter += 1
    # `h` comes from train_model and is carried from batch to batch.
    prediction, h = net(inputs,h)
    valid_loss += criterion(prediction, targets.view(-1))
print(f"Valid Loss: {valid_loss / n_iter}, Peprplexity nn for test set for case with permutation and seed=42: {torch.exp(valid_loss / n_iter)}")

Epoch: 1/20... Step: 256...
Valid Loss: 6.626049041748047, Valid Peprplexity: 754.495361328125
Epoch: 2/20... Step: 512...
Epoch: 2/20... Step: 768...
Valid Loss: 6.456684112548828, Valid Peprplexity: 636.9454956054688
Epoch: 3/20... Step: 1024...
Epoch: 3/20... Step: 1280...
Valid Loss: 6.35479736328125, Valid Peprplexity: 575.2457275390625
Epoch: 4/20... Step: 1536...
Epoch: 4/20... Step: 1792...
Valid Loss: 6.296908378601074, Valid Peprplexity: 542.8909301757812
Epoch: 5/20... Step: 2048...
Epoch: 5/20... Step: 2304...
Valid Loss: 6.238011837005615, Valid Peprplexity: 511.8398742675781
Epoch: 6/20... Step: 2560...
Epoch: 6/20... Step: 2816...
Valid Loss: 6.1901044845581055, Valid Peprplexity: 487.8970947265625
Epoch: 7/20... Step: 3072...
Epoch: 7/20... Step: 3328...
Valid Loss: 6.16194486618042, Valid Peprplexity: 474.3497009277344
Epoch: 8/20... Step: 3584...
Epoch: 8/20... Step: 3840...
Valid Loss: 6.145797252655029, Valid Peprplexity: 466.7515869140625
Epoch: 9/20... Step: 4096.

In [None]:
# Generate a continuation of the prefix with the model currently held in `net`.
net.sample('how ill agrees')

'how ill agrees the lands of what got every thing <EOS>'

In [None]:
# Train a fresh model with permutation seed 101, then evaluate its best
# checkpoint on the test split. The three training lines were commented out,
# so on a clean run this cell re-evaluated the weights of the previous cell
# while its label claimed seed=101.
net = WordLSTM()
net.to(device)
best_model, h = train_model(net, batch_size = 64, epochs=20, print_every=256, shuffle=True, seed=101)
net.load_state_dict(best_model)
net.to(device)
net.eval()
# Accumulates the per-batch mean losses; despite the name this is the TEST loss.
valid_loss = 0
n_iter = 0
criterion = nn.CrossEntropyLoss(ignore_index=token2int[PAD])
with torch.no_grad():
  for x, y in batch_generator(x_test_int, y_test_int, 64):
    inputs, targets = x.to(device), y.to(device)
    n_iter += 1
    # `h` comes from train_model and is carried from batch to batch.
    prediction, h = net(inputs,h)
    valid_loss += criterion(prediction, targets.view(-1))
print(f"Valid Loss: {valid_loss / n_iter}, Peprplexity nn for test set for case with permutation and seed=101: {torch.exp(valid_loss / n_iter)}")
net.sample('how ill agrees')

Valid Loss: 6.198301792144775, Peprplexity nn for test set for case with permutation and seed=101: 491.9129638671875


'how ill agrees up just the king <EOS>'

In [None]:
# Train a fresh model with shuffled training batches (permutation seed 4),
# reload its best checkpoint, evaluate it on the test split and sample from it.
net = WordLSTM()
net.to(device)
best_model, h = train_model(net, batch_size = 64, epochs=20, print_every=256, shuffle=True, seed=4)
net.load_state_dict(best_model)
net.to(device)
net.eval()
# Accumulates the per-batch mean losses; despite the name this is the TEST loss.
valid_loss = 0
n_iter = 0
criterion = nn.CrossEntropyLoss(ignore_index=token2int[PAD])
with torch.no_grad():
  for x, y in batch_generator(x_test_int, y_test_int, 64):
    inputs, targets = x.to(device), y.to(device)
    n_iter += 1
    # `h` comes from train_model and is carried from batch to batch.
    prediction, h = net(inputs,h)
    valid_loss += criterion(prediction, targets.view(-1))
print(f"Valid Loss: {valid_loss / n_iter}, Peprplexity nn for test set for case with permutation and seed=4: {torch.exp(valid_loss / n_iter)}")
net.sample('how ill agrees')

Epoch: 1/20... Step: 256...
Valid Loss: 6.6598124504089355, Valid Peprplexity: 780.404541015625
Epoch: 2/20... Step: 512...
Epoch: 2/20... Step: 768...
Valid Loss: 6.502918720245361, Valid Peprplexity: 667.0858154296875
Epoch: 3/20... Step: 1024...
Epoch: 3/20... Step: 1280...
Valid Loss: 6.359167098999023, Valid Peprplexity: 577.7649536132812
Epoch: 4/20... Step: 1536...
Epoch: 4/20... Step: 1792...
Valid Loss: 6.299463748931885, Valid Peprplexity: 544.280029296875
Epoch: 5/20... Step: 2048...
Epoch: 5/20... Step: 2304...
Valid Loss: 6.2424187660217285, Valid Peprplexity: 514.1005249023438
Epoch: 6/20... Step: 2560...
Epoch: 6/20... Step: 2816...
Valid Loss: 6.194631576538086, Valid Peprplexity: 490.11083984375
Epoch: 7/20... Step: 3072...
Epoch: 7/20... Step: 3328...
Valid Loss: 6.159933567047119, Valid Peprplexity: 473.3966064453125
Epoch: 8/20... Step: 3584...
Epoch: 8/20... Step: 3840...
Valid Loss: 6.144692897796631, Valid Peprplexity: 466.2364196777344
Epoch: 9/20... Step: 4096.

'how ill agrees we hast forgot the weather whom the hire softly out of her as drive my pleasure in dull to so stray to steal <EOS>'

In [None]:
# Train a fresh model with shuffled training batches (permutation seed 63),
# reload its best checkpoint, evaluate it on the test split and sample from it.
net = WordLSTM()
net.to(device)
best_model, h = train_model(net, batch_size = 64, epochs=20, print_every=256, shuffle=True, seed=63)
net.load_state_dict(best_model)
net.to(device)
net.eval()
# Accumulates the per-batch mean losses; despite the name this is the TEST loss.
valid_loss = 0
n_iter = 0
criterion = nn.CrossEntropyLoss(ignore_index=token2int[PAD])
with torch.no_grad():
  for x, y in batch_generator(x_test_int, y_test_int, 64):
    inputs, targets = x.to(device), y.to(device)
    n_iter += 1
    # `h` comes from train_model and is carried from batch to batch.
    prediction, h = net(inputs,h)
    valid_loss += criterion(prediction, targets.view(-1))
print(f"Valid Loss: {valid_loss / n_iter}, Peprplexity nn for test set for case with permutation and seed=63: {torch.exp(valid_loss / n_iter)}")
net.sample('how ill agrees')

Epoch: 1/20... Step: 256...
Valid Loss: 6.682488441467285, Valid Peprplexity: 798.30322265625
Epoch: 2/20... Step: 512...
Epoch: 2/20... Step: 768...
Valid Loss: 6.476461887359619, Valid Peprplexity: 649.6682739257812
Epoch: 3/20... Step: 1024...
Epoch: 3/20... Step: 1280...
Valid Loss: 6.362300872802734, Valid Peprplexity: 579.5783081054688
Epoch: 4/20... Step: 1536...
Epoch: 4/20... Step: 1792...
Valid Loss: 6.306774139404297, Valid Peprplexity: 548.2734375
Epoch: 5/20... Step: 2048...
Epoch: 5/20... Step: 2304...
Valid Loss: 6.2605462074279785, Valid Peprplexity: 523.5048217773438
Epoch: 6/20... Step: 2560...
Epoch: 6/20... Step: 2816...
Valid Loss: 6.2092742919921875, Valid Peprplexity: 497.34014892578125
Epoch: 7/20... Step: 3072...
Epoch: 7/20... Step: 3328...
Valid Loss: 6.177935600280762, Valid Peprplexity: 481.9958801269531
Epoch: 8/20... Step: 3584...
Epoch: 8/20... Step: 3840...
Valid Loss: 6.16451358795166, Valid Peprplexity: 475.5697326660156
Epoch: 9/20... Step: 4096...
E

'how ill agrees sparkling like nero which hurry from upright self late unto some addition and at my merry honour live a sucking barbermonger standing hath companyof unmake against him with that emmence and beggar for a war that this hours flies <EOS>'

Как видим, получается разный результат в зависимости от параметра seed. Также заметим, что в случае без перемешивания порядка предложений модель показала худший результат. 

In [None]:
# memory footprint support libraries/code
# Make nvidia-smi reachable on PATH and install the helper packages used by printm().
!ln -sf /opt/bin/nvidia-smi /usr/bin/nvidia-smi
!pip install gputil
!pip install psutil
!pip install humanize
import psutil
import humanize
import os
import GPUtil as GPU
GPUs = GPU.getGPUs()
# XXX: only one GPU on Colab and isnt guaranteed
# NOTE(review): indexing [0] raises IndexError when getGPUs() returns an empty list.
gpu = GPUs[0]
def printm():
 """Print free system RAM, the resident size of this process and the memory stats of the first GPU.

 If GPUtil reports no GPU, a short notice is printed instead of the GPU line
 (indexing the empty list used to raise IndexError).
 """
 process = psutil.Process(os.getpid())
 print("Gen RAM Free: " + humanize.naturalsize( psutil.virtual_memory().available ), " | Proc size: " + humanize.naturalsize( process.memory_info().rss))
 GPUs = GPU.getGPUs()
 # XXX: only one GPU on Colab and isnt guaranteed
 if not GPUs:
  print("GPU: none detected")
  return
 gpu = GPUs[0]
 print("GPU RAM Free: {0:.0f}MB | Used: {1:.0f}MB | Util {2:3.0f}% | Total {3:.0f}MB".format(gpu.memoryFree, gpu.memoryUsed, gpu.memoryUtil*100, gpu.memoryTotal))
printm()

Gen RAM Free: 12.4 GB  | Proc size: 667.0 MB
GPU RAM Free: 15069MB | Used: 10MB | Util   0% | Total 15079MB


In [None]:
# seqs_train = [create_seq(i,8) for i in train]

# # merge list-of-lists into a single list
# seqs_train = sum(seqs_train, [])

# # count of sequences
# len(seqs_train)

In [None]:
# seqs_val = [create_seq(i,8) for i in val]

# # merge list-of-lists into a single list
# seqs_val = sum(seqs_val, [])

# # count of sequences
# len(seqs_val)