In [1]:
%pip install wandb

Note: you may need to restart the kernel to use updated packages.


In [2]:
import torch
import torchvision
import torchvision.transforms as transforms
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

import wandb
import random

import math 

import numpy as np
import pandas as pd
import re

import matplotlib.pyplot as plt
from tqdm.notebook import tqdm

from nltk.translate.bleu_score import sentence_bleu ,SmoothingFunction 

from tokenizers import Tokenizer
from transformers import get_cosine_with_hard_restarts_schedule_with_warmup

In [3]:
!mkdir /kaggle/working/model

In [4]:
wandb.login(key="2448fa0c4f7d7dcf19f87fbb3ce5a9743515b380")

[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [5]:
wandb.init(
    project="G2P",
    config={
    "BATCH_SIZE":  512,
    "NUM_EPOHS":  40,
    "LR": 3e-4,
    "D_MODEL":  512,
    "D_FF": 2048,
    "NUM": 3,
    "NUM_HEADS" : 4,
    "RANDOM_STATE": 42,
    "MAX_LEN": 32,
    "SAVE_DIR": "/kaggle/working/model",
    "BOS_TOKEN": "<bos>",
    "EOS_TOKEN": "<eos>",
    "PAD_TOKEN": "<pad>",
    'UNK_TOKEN': "<unk>"
    },
    name="обучение на 512_2048_33_44 без стресса",
)

[34m[1mwandb[0m: Currently logged in as: [33mnicitacom2018[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [6]:
device  = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

In [7]:
def pd_get_list(path):
    df = pd.read_csv(path)
    df = df.dropna()
    word_list = df['word'].to_list()
    phoneme_list = df['phoneme'].to_list()
    return word_list, phoneme_list

In [8]:
word_list_test , phoneme_list_test = pd_get_list("/kaggle/input/lexicin200k/test.csv")
word_list_train , phoneme_list_train = pd_get_list("/kaggle/input/lexicin200k/train.csv")
word_list_val , phoneme_list_val = pd_get_list("/kaggle/input/lexicin200k/val.csv")

In [9]:
class CreateDataset:
    def __init__(self, word , phoneme, stress=True):
        self.word = word
        self.phoneme = phoneme
        self.stress = stress
        
    def __getitem__(self, idx: int):
        if  not(self.stress):
            if any( phonem[-1] in '0123' for phonem in  self.phoneme[idx].split() ):
                self.phoneme_ = re.sub(r'\d+', '', self.phoneme[idx]).strip()
                
            return {"input": self.word[idx],
                    "label": self.phoneme_}
        
        else:
             return {"input": self.word[idx].strip(),
                    "label": self.phoneme[idx]}
            
    
    def __len__(self):
        return len(self.word)

In [10]:
def crate_dataset(word_list ,phoneme_list):
    data_with_stress = CreateDataset(word_list ,phoneme_list , stress = True)
    data_without_stress = CreateDataset(word_list ,phoneme_list , stress = False)
    return data_with_stress , data_without_stress

In [11]:
data_train_with , data_train_without = crate_dataset(word_list_train ,phoneme_list_train)
data_test_with , data_test_without = crate_dataset(word_list_test ,phoneme_list_test)
data_val_with , data_val_without = crate_dataset(word_list_val ,phoneme_list_val)

In [12]:
data_bpe_train_l =  ['Ġ'.join(i["label"].split()) for i in data_train_without ]
data_bpe_train_s =  [i["input"].upper() for i in  data_test_without  ]
data_bpe_test_l =  ['Ġ'.join(i["label"].split()) for i in data_test_without ]
data_bpe_test_s =  [i["input"].upper() for i in  data_test_without ]
data_bpe_val_l = ['Ġ'.join(i["label"].split()) for i in data_val_without ]
data_bpe_val_s = [i["input"].upper() for i in data_val_without ]

In [13]:
from tokenizers import CharBPETokenizer
tokenizer = CharBPETokenizer()
data_for_bpe = data_bpe_train_l+data_bpe_train_s+data_bpe_test_l+data_bpe_test_s+data_bpe_val_l+data_bpe_val_s
tokenizer.train_from_iterator(data_for_bpe ,vocab_size=256)
tokenizer.add_special_tokens([wandb.config.PAD_TOKEN,
                              wandb.config.BOS_TOKEN,
                              wandb.config.EOS_TOKEN,
                              wandb.config.UNK_TOKEN])






3

In [14]:
tokenizer.save("/kaggle/working/bpe_256_lex")

In [15]:
def causal_mask(size):
    mask = torch.triu(torch.ones((1, size, size)), diagonal=1).type(torch.int)
    return mask == 0

In [16]:
class Dataset_WithPudding:
    def __init__(self, sentences , max_len: int = 32, debag=False):
        self.ds = sentences
        self.seq_len = max_len 
        self.debag = debag
        self.pad_id = torch.tensor(tokenizer.encode(wandb.config.PAD_TOKEN).ids)

    def __getitem__(self, idx: int):
        
        src_target_pair = self.ds[idx]
        
        src_text = src_target_pair["input"]
        tgt_text = 'Ġ'.join(src_target_pair['label'].split())
        
        enc_input_tokens = torch.tensor(tokenizer.encode(src_text).ids)
        dec_input_tokens = tokenizer.encode(tgt_text).ids 

    
        enc_num_padding_tokens = self.seq_len - len(enc_input_tokens) - 2
        # We will only add <s>, and </s> only on the label
        dec_num_padding_tokens = self.seq_len - len(dec_input_tokens) - 1
        
        #print(enc_num_padding_tokens , dec_num_padding_tokens)
        
        encoder_input = torch.cat(
            [
                torch.tensor(tokenizer.encode(wandb.config.BOS_TOKEN).ids),
                enc_input_tokens,
                torch.tensor(tokenizer.encode(wandb.config.EOS_TOKEN).ids),
                self.pad_id.repeat(enc_num_padding_tokens)
            ],
            dim=0,
        )
        
        
        # Add only <s> token
        decoder_input = torch.cat(
            [
                torch.tensor(tokenizer.encode(wandb.config.BOS_TOKEN).ids),
                torch.tensor(dec_input_tokens),
                self.pad_id.repeat(dec_num_padding_tokens)
            ],
            dim=0,
        )

        # Add only </s> token
        label = torch.cat(
            [
                torch.tensor(dec_input_tokens, dtype=torch.int64),
                torch.tensor(tokenizer.encode(wandb.config.EOS_TOKEN).ids),
                self.pad_id.repeat(dec_num_padding_tokens),
            ],
            dim=0,
        )
        
        assert encoder_input.size(0) == self.seq_len
        assert decoder_input.size(0) == self.seq_len
        assert label.size(0) == self.seq_len

        return {
            "encoder_input": encoder_input,  # (seq_len)
            "decoder_input": decoder_input,  # (seq_len)
            "encoder_mask": (encoder_input != self.pad_id).unsqueeze(0).unsqueeze(0).int(), # (1, 1, seq_len)
            "decoder_mask": (decoder_input != self.pad_id).unsqueeze(0).int() & causal_mask(decoder_input.size(0)), # (1, seq_len) & (1, seq_len, seq_len),
            "label": label,  # (seq_len)
            "src_text": src_text.upper(), # (seq_len)
            "tgt_text": src_target_pair['label'] # (seq_len)
        }


    def __len__(self) -> int:
        return len(self.ds)
    


In [17]:
def collate_fn(bath):
    encoder_input = torch.stack([i["encoder_input"].to(device) for i in bath])
    decoder_input =  torch.stack([i["decoder_input"].to(device) for i in bath])
    encoder_mask =  torch.stack([i["encoder_mask"].to(device) for i in bath])
    decoder_mask = torch.stack([i["decoder_mask"].to(device) for i in bath])
    label = torch.stack( [i["label"] for i in bath] )
    src_text = [i["src_text"] for i in bath]
    tgt_text = [i["tgt_text"] for i in bath]
    
    return {
            "encoder_input": encoder_input,  # (seq_len)
            "decoder_input": decoder_input,  # (seq_len)
            "encoder_mask": encoder_mask, # (1, 1, seq_len)
            "decoder_mask": decoder_mask, # (1, seq_len) & (1, seq_len, seq_len),
            "label": label,  # (seq_len)
            "src_text": src_text, # (seq_len)
            "tgt_text": tgt_text, # (seq_len) 
           }

In [18]:
dataset_train = Dataset_WithPudding(data_train_without)
dataset_test = Dataset_WithPudding(data_test_without)
dataset_val = Dataset_WithPudding(data_val_without)

In [19]:
def decode_form_G(tokens):
    return ''.join(tokens).split('Ġ')

In [20]:
batch_size = wandb.config.BATCH_SIZE

train_dataloader = DataLoader(
        dataset_train,
        shuffle=True,
        batch_size=batch_size,
        drop_last=True,
        collate_fn=collate_fn)

val_dataloader = DataLoader(
        dataset_val,
        batch_size=batch_size,
        collate_fn=collate_fn)

test_dataloader =DataLoader(
        dataset_test,
        batch_size=batch_size,
        collate_fn=collate_fn)

In [21]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_seq_length):
        super(PositionalEncoding, self).__init__()
        
        pe = torch.zeros(max_seq_length, d_model)
        position = torch.arange(0, max_seq_length, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * -(math.log(10000.0) / d_model))
        
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        
        self.register_buffer('pe', pe.unsqueeze(0))
        
    def forward(self, x):
        return x + self.pe[:, :x.size(1)]

In [22]:
class MultiHeadSelfAttention(nn.Module):
    def __init__(self, d_model, num_heads):
        super(MultiHeadSelfAttention, self).__init__()
        assert d_model % num_heads == 0, "d_model must be divisible by num_heads"
        
        self.d_model = d_model
        self.num_heads = num_heads
        self.depth = d_model // num_heads
        
        self.wq = nn.Linear(d_model, d_model)
        self.wk = nn.Linear(d_model, d_model)
        self.wv = nn.Linear(d_model, d_model)
        
        self.fc = nn.Linear(d_model, d_model)
        
    def split_heads(self, x, batch_size):
        x = x.view(batch_size, -1, self.num_heads, self.depth)
        return x.permute(0, 2, 1, 3)
    
    def forward(self, q, k, v, mask=None):
        batch_size = q.size(0)
        
        q = self.split_heads(self.wq(q), batch_size)
        k = self.split_heads(self.wk(k), batch_size)
        v = self.split_heads(self.wv(v), batch_size)
        
        scores = torch.matmul(q, k.transpose(-2, -1)) / torch.sqrt(torch.tensor(self.depth, dtype=torch.float32))
        if mask is not None:
            scores = scores.masked_fill(mask == 0, -1e9)
        attn = F.softmax(scores, dim=-1)
        
        out = torch.matmul(attn, v)
        out = out.permute(0, 2, 1, 3).contiguous()
        out = out.view(batch_size, -1, self.d_model)
        
        out = self.fc(out)
        return out

In [23]:
class FeedForwardNetwork(nn.Module):
    def __init__(self, d_model, d_ff, dropout=0.1):
        super(FeedForwardNetwork, self).__init__()
        self.fc1 = nn.Linear(d_model, d_ff)
        self.fc2 = nn.Linear(d_ff, d_model)
        self.dropout = nn.Dropout(dropout)
    
    def forward(self, x):
        x = self.fc1(x)
        x = F.relu(x)
        x = self.dropout(x)
        x = self.fc2(x)
        return x

In [24]:
class EncoderLayer(nn.Module):
    def __init__(self, d_model, num_heads, d_ff, dropout=0.1):
        super(EncoderLayer, self).__init__()
        self.self_attn = MultiHeadSelfAttention(d_model, num_heads)
        self.ffn = FeedForwardNetwork(d_model, d_ff, dropout)
        
        self.layernorm1 = nn.LayerNorm(d_model)
        self.layernorm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)
    
    def forward(self, x, mask=None):
        attn_output = self.self_attn(x, x, x, mask)
        x = self.layernorm1(x + self.dropout(attn_output))
        
        ffn_output = self.ffn(x)
        x = self.layernorm2(x + self.dropout(ffn_output))
        return x

In [25]:
class DecoderLayer(nn.Module):
    def __init__(self, d_model, num_heads, d_ff, dropout=0.1):
        super(DecoderLayer, self).__init__()
        self.self_attn = MultiHeadSelfAttention(d_model, num_heads)
        self.cross_attn = MultiHeadSelfAttention(d_model, num_heads)
        self.ffn = FeedForwardNetwork(d_model, d_ff, dropout)
        
        self.layernorm1 = nn.LayerNorm(d_model)
        self.layernorm2 = nn.LayerNorm(d_model)
        self.layernorm3 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)
    
    def forward(self, x, enc_output, src_mask=None, tgt_mask=None):
        self_attn_output = self.self_attn(q=x, k=x, v=x, mask=tgt_mask)
        x = self.layernorm1(x + self.dropout(self_attn_output))
        
        cross_attn_output = self.cross_attn(q=x, k=enc_output, v=enc_output, mask=src_mask)
        x = self.layernorm2(x + self.dropout(cross_attn_output))
        
        ffn_output = self.ffn(x)
        x = self.layernorm3(x + self.dropout(ffn_output))
        return x

In [26]:
class TransformerBlock(nn.Module):
    def __init__(self, input_vocab_size, target_vocab_size, d_model=512, num_heads=8, num_encoder_layers=3, num_decoder_layers=3, d_ff=2048, dropout=0.1):
        super(TransformerBlock, self).__init__()
        self.encoder_embedding = nn.Embedding(input_vocab_size, d_model)
        self.decoder_embedding = nn.Embedding(target_vocab_size, d_model)
        
        self.pos_embedding = PositionalEncoding(d_model , wandb.config.MAX_LEN)
        
        self.encoder_layers = nn.ModuleList([EncoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_encoder_layers)])
        self.decoder_layers = nn.ModuleList([DecoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_decoder_layers)])
        
        self.fc_out = nn.Linear(d_model, target_vocab_size)
    
    def encode(self, src, src_mask):
        src = self.pos_embedding(self.encoder_embedding(src))
        for layer in self.encoder_layers:
            src = layer(src, src_mask)
        return src
    
    def decode(self, memory, src_mask, tgt, tgt_mask):
        tgt = self.pos_embedding(self.decoder_embedding(tgt))
        for layer in self.decoder_layers:
            tgt = layer(tgt, memory, src_mask, tgt_mask)
        return tgt
    
    def forward(self, src, tgt, src_mask, tgt_mask):
        memory = self.encode(src, src_mask)
        output = self.decode(memory, src_mask, tgt, tgt_mask)
        output = self.fc_out(output)
        return output

    def decode_from_G_out(self ,seq):
        return ''.join(seq).split('Ġ')
    
    
    def pred(self, srs):
        
        model.eval()
        with torch.no_grad():
            seq = ''.join(list(srs.upper()))

            enc_input_tokens = tokenizer.encode(seq).ids
            pad_id =torch.tensor(tokenizer.encode(wandb.config.PAD_TOKEN).ids)
            enc_num_padding_tokens = 32 - len(enc_input_tokens) - 2
            encoder_input = torch.cat(
                        [
                            torch.tensor(tokenizer.encode(wandb.config.BOS_TOKEN).ids),
                            torch.tensor(enc_input_tokens),
                            torch.tensor(tokenizer.encode(wandb.config.EOS_TOKEN).ids),
                            pad_id.repeat(enc_num_padding_tokens)
                        ],
                        dim=0,
                    )

            encoder_mask = (encoder_input != pad_id).unsqueeze(0).unsqueeze(0).int()

            label   = beam_search(model=model.to(device),
                                                     src=encoder_input.to(device),
                                                     src_mask=encoder_mask.to(device),
                                                     max_len=wandb.config.MAX_LEN,
                                                     start_symbol=tokenizer.encode(wandb.config.BOS_TOKEN).ids[0],
                                                     trg=None ,
                                                     metricks=False)
            model.train()
        return self.decode_from_G_out( tokenizer.decode(label[0].tolist()) )
        

In [27]:
def bleu_score(word,test_pronunciation):
    smooth = SmoothingFunction().method1
    return sentence_bleu(word, test_pronunciation, smoothing_function=smooth)

In [28]:
def phoneme_error_rate(predicted, target):
    m, n = len(predicted), len(target)
    if m == 0:
        return float(n)
    if n == 0:
        return float(m)

    d = np.zeros((m + 1, n + 1), dtype=int)

    for i in range(m + 1):
        d[i][0] = i
    for j in range(n + 1):
        d[0][j] = j

    for i in range(1, m + 1):
        for j in range(1, n + 1):
            cost = 0 if predicted[i - 1] == target[j - 1] else 1
            d[i][j] = min(d[i - 1][j] + 1,    # Deletion
                          d[i][j - 1] + 1,    # Insertion
                          d[i - 1][j - 1] + cost)  # Substitution
            if i > 1 and j > 1 and predicted[i - 1] == target[j - 2] and predicted[i - 2] == target[j - 1]:
                d[i][j] = min(d[i][j], d[i - 2][j - 2] + cost)  

    num_errors = d[m][n]
    num_phonemes = len(target)

    return num_errors / num_phonemes


In [29]:
def word_error_rate(predicted, target):
    return 0 if predicted == target else 1

In [30]:
def beam_search(model, src, src_mask, max_len, start_symbol, trg, metricks:False ,beam_size=5):
    src = src.unsqueeze(0)  
    src_mask = src_mask.unsqueeze(0) 
    memory = model.encode(src, src_mask)
    
    beams = [(torch.ones(1, 1).fill_(start_symbol).type_as(src.data), 0.0)]  # (последовательность, score)

    for _ in range(max_len - 1):
        new_beams = []
        for seq, score in beams:
            tgt_mask = causal_mask(seq.size(1)).unsqueeze(0).to(device)
            out = model.decode(memory, src_mask, seq, tgt_mask)
            prob = model.fc_out(out[:, -1])
            log_prob = F.log_softmax(prob, dim=-1)
            
            top_log_prob, top_indices = log_prob.topk(beam_size)

            for i in range(beam_size):
                next_seq = torch.cat([seq, torch.ones(1, 1).type_as(src.data).fill_(top_indices[0, i])], dim=1)
                new_score = score + top_log_prob[0, i].item()
                new_beams.append((next_seq, new_score))


        beams = sorted(new_beams, key=lambda x: x[1], reverse=True)[:beam_size]
        
        if beams[0][0][0][-1].item() == tokenizer.encode(wandb.config.EOS_TOKEN).ids[0]:
            break
    
    best_seq, best_score = beams[0]
    
    if metricks:
        pred = decode_form_G(tokenizer.decode(best_seq[0].tolist()))    
        per = phoneme_error_rate( pred  , trg.split() )
        wer = word_error_rate( pred  , trg.split() )
        bleu = bleu_score(pred  , trg.split())
        
        return best_seq , per , wer ,bleu
    return best_seq


In [31]:
def model_eval(model , dataloader , beam=True):
    model.eval()
    with torch.no_grad():
        per_list = []
        wer_list = []
        bleu_list = []
        for batch in tqdm( dataloader ):
            per_batch=0.0
            wer_batch=0.0
            bleu_batch=0.0
            for idx in range(len(batch)):
                encoder_input = batch['encoder_input'][idx].to(device)
                encoder_mask = batch['encoder_mask'][idx].to(device)
                trg = batch['tgt_text'][idx]
                if beam:
                    label ,per ,wer,bleu  = beam_search( model=model,
                                                         src=encoder_input,
                                                         src_mask=encoder_mask,
                                                         max_len=wandb.config.MAX_LEN,
                                                         start_symbol=tokenizer.encode(wandb.config.BOS_TOKEN).ids[0],
                                                         trg=trg ,
                                                         metricks=True)
                per_batch+=per
                wer_batch+=wer
                bleu_batch+=bleu

            per_list.append(per_batch / len(batch))
            wer_list.append(wer_batch / len(batch))
            bleu_list.append(bleu_batch / len(batch))

    return sum(per_list)/len(dataloader) , sum(wer_list)/len(dataloader),  sum(bleu_list)/len(dataloader)

In [32]:
best_per = 10
best_per_test = 10
num_cycles = 4

In [33]:
model = TransformerBlock(input_vocab_size=tokenizer.get_vocab_size(),
                        target_vocab_size=tokenizer.get_vocab_size(),
                         num_encoder_layers=wandb.config.NUM,
                        num_decoder_layers=wandb.config.NUM,
                         num_heads=wandb.config.NUM_HEADS,
                         d_model = wandb.config.D_MODEL,
                         d_ff = wandb.config.D_FF).to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=wandb.config.LR , betas=(0.9, 0.98))
num_epohs = wandb.config.NUM_EPOHS
num_training_steps = len(train_dataloader) * num_epohs
num_warmup_steps = int(0.1 * num_training_steps)
loss_fn = nn.CrossEntropyLoss(ignore_index = tokenizer.encode(wandb.config.PAD_TOKEN).ids[0] , label_smoothing=0.1 ).to(device)


In [34]:
scheduler = get_cosine_with_hard_restarts_schedule_with_warmup(optimizer=optimizer ,
                                                               num_warmup_steps=num_warmup_steps,
                                                               num_training_steps=num_training_steps,
                                                               num_cycles=num_cycles)

In [35]:
def masked(seq):
    pad_list = torch.tensor( [(tokenizer.encode(wandb.config.PAD_TOKEN).ids[0]) for _ in range(len(seq))]).to(device)
    count_pad = (pad_list == seq).int().sum().item()
    total_length = wandb.config.MAX_LEN - count_pad - 2
    num_zeros = int((total_length*0.15)//1)
    # Задаем случайные индексы для установки 1 вместо 0
    indices = np.random.choice(np.arange(1, total_length+1), num_zeros, replace=False)
    seq[indices] = tokenizer.encode(wandb.config.UNK_TOKEN).ids[0]
    return seq

In [36]:
def masked_batch(batch):
    return  torch.stack([masked(seq) for seq in batch])

In [None]:
for epoch in range(wandb.config.NUM_EPOHS):
    model.train()
    loss_epoch = 0.0
    for batch_idx, batch in tqdm(enumerate(train_dataloader), desc=f"Processing Epoch {epoch:02d}"):
        encoder_input = batch['encoder_input'].to(device)  # (B, seq_len) with unk 
        decoder_input = batch['decoder_input'].to(device)  # (B, seq_len)
        encoder_mask = batch['encoder_mask'].to(device)  # (B, 1, 1, seq_len)
        decoder_mask = batch['decoder_mask'].to(device)  # (B, 1, seq_len, seq_len)
        label = batch['label'].to(device)  # (B, seq_len)

        output = model(encoder_input, decoder_input, encoder_mask, decoder_mask)
        loss = loss_fn(output.view(-1, tokenizer.get_vocab_size()), label.view(-1))
        loss_epoch += loss.item()
        
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()  
        scheduler.step()


    # Evaluate the model
    per, wer, bleu = model_eval(model, val_dataloader)

    if per < best_per:
        best_per = per
        per_test, wer_test, bleu_test = model_eval(model, test_dataloader)
        
        if per_test < best_per_test:
            best_per_test = per_test
            torch.save(model.state_dict(), f"/kaggle/working/model/model{per:.4f}.pt")

    print(f"PER: {per*100:.2f} | Loss: {loss_epoch/len(train_dataloader):.4f} | WER: {wer*100:.2f} | BLEU: {bleu:.4f} | LR: {scheduler.get_last_lr()[0]:.6f}")
    wandb.log({ "PER": per*100, 
                "loss": loss_epoch/len(train_dataloader), 
                "WER": wer*100, 
                "BLEU": bleu, 
                "LR": scheduler.get_last_lr()[0]})

Processing Epoch 00: 0it [00:00, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/32 [00:00<?, ?it/s]

PER: 28.84 | Loss: 3.4655 | WER: 87.50 | BLEU: 0.0551 | LR: 0.000075


Processing Epoch 01: 0it [00:00, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/32 [00:00<?, ?it/s]

PER: 16.72 | Loss: 1.5216 | WER: 67.86 | BLEU: 0.0558 | LR: 0.000150


Processing Epoch 02: 0it [00:00, ?it/s]

In [None]:
per, wer , bleu = model_eval(model , test_dataloader)
per*100, wer*100

In [None]:
torch.save(model, "/kaggle/working/model/modellast.pt")