In [None]:
#!/usr/bin/env python3
"""
Hindi Meeting Summarizer + Semantic Extractor
Copy-paste this file and run.

Requirements:
pip install torch sentencepiece scikit-learn nltk tqdm

Notes:
- Training seq2seq well requires (transcript, summary) pairs. If you don't have many labeled pairs,
  consider creating a small seed dataset or using an extractive pseudo-summary as targets to bootstrap.
- Training on GPU is strongly recommended.
"""

import os
import math
import random
from pathlib import Path
from typing import List, Tuple

import sentencepiece as spm
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import nltk
from tqdm import tqdm

# Download the Punkt sentence-tokenizer data used by nltk.tokenize.sent_tokenize
# (Hindi works reasonably). Newer NLTK releases load the "punkt_tab" resource
# instead of the pickled "punkt" models, so fetch both; a package that is
# already installed is simply reported as up to date.
nltk.download('punkt')
nltk.download('punkt_tab')

# ----------------------------
# Config / Hyperparameters
# ----------------------------
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
TOKENIZER_MODEL = "hindi_spm.model"   # default SentencePiece model file for load_tokenizer
VOCAB_SIZE = 8000         # subword vocab size
MAX_SRC_LEN = 1024        # max source length in tokens, including BOS/EOS
MAX_TGT_LEN = 120         # max summary length in tokens, including BOS/EOS
BATCH_SIZE = 8
EMB_SIZE = 512            # transformer d_model / embedding width
NHEAD = 8                 # attention heads
FFN_HID_DIM = 1024        # feed-forward width inside each transformer layer
NUM_ENCODER_LAYERS = 3
NUM_DECODER_LAYERS = 3
LR = 1e-4                 # Adam learning rate
EPOCHS = 10
# Special-token piece strings. The code visible in this file works with the
# numeric ids (pad=0, unk=1, bos=2, eos=3) rather than with these strings.
PAD_TOKEN = "<pad>"
BOS_TOKEN = "<s>"
EOS_TOKEN = "</s>"
UNK_TOKEN = "<unk>"

# ----------------------------
# Utilities: Tokenizer (SentencePiece)
# ----------------------------
def train_sentencepiece(input_txt_path: str, model_prefix: str = "hindi_spm", vocab_size: int = VOCAB_SIZE):
    """
    Fit a SentencePiece unigram tokenizer on a plain-text corpus.

    input_txt_path: text file with many Hindi sentences (one per line preferred).
    model_prefix:   prefix of the output files; the model is reported as
                    "<model_prefix>.model".
    vocab_size:     number of subword pieces to learn.
    """
    # The special-token ids are pinned because the rest of the pipeline
    # hard-codes 0 as the padding id (collate_fn, embeddings, loss).
    special_ids = {"pad_id": 0, "unk_id": 1, "bos_id": 2, "eos_id": 3}
    trainer_options = dict(
        input=input_txt_path,
        model_prefix=model_prefix,
        vocab_size=vocab_size,
        character_coverage=0.9995,   # high coverage for Hindi
        model_type='unigram',        # or 'bpe'
        **special_ids,
    )
    spm.SentencePieceTrainer.train(**trainer_options)

    model_file = model_prefix + ".model"
    print("Trained SentencePiece model:", model_file)

def load_tokenizer(model_path: str = TOKENIZER_MODEL):
    """
    Create a SentencePieceProcessor and load the model stored at `model_path`.

    Returns the loaded processor.
    """
    processor = spm.SentencePieceProcessor()
    processor.load(model_path)
    return processor

# ----------------------------
# Dataset
# ----------------------------
class MeetingSummaryDataset(Dataset):
    """
    Supervised dataset over (transcript_text, summary_text) pairs.

    Each item is a pair of 1-D long tensors: the tokenized transcript and the
    tokenized summary, both wrapped in BOS ... EOS and truncated so the total
    length does not exceed max_src_len / max_tgt_len respectively.
    If you don't have summaries, extractive heuristics can provide pseudo summaries.
    """
    def __init__(self, pairs: List[Tuple[str,str]], sp: spm.SentencePieceProcessor,
                 max_src_len=MAX_SRC_LEN, max_tgt_len=MAX_TGT_LEN):
        self.pairs = pairs
        self.sp = sp
        self.max_src_len = max_src_len
        self.max_tgt_len = max_tgt_len

    def __len__(self):
        return len(self.pairs)

    def _encode_with_markers(self, text, budget):
        """Tokenize `text`, keep at most budget - 2 pieces, and add BOS/EOS."""
        # Two slots of the budget are reserved for the BOS and EOS ids.
        body = self.sp.encode(text, out_type=int)[: budget - 2]
        wrapped = [self.sp.bos_id(), *body, self.sp.eos_id()]
        return torch.tensor(wrapped, dtype=torch.long)

    def __getitem__(self, idx):
        transcript, summary = self.pairs[idx]
        src_tensor = self._encode_with_markers(transcript, self.max_src_len)
        tgt_tensor = self._encode_with_markers(summary, self.max_tgt_len)
        return src_tensor, tgt_tensor

def collate_fn(batch):
    """
    Collate (src, tgt) tensor pairs into two right-padded long tensors.

    batch: iterable of (src_ids, tgt_ids) 1-D tensors.
    Returns (src_batch, tgt_batch), each of shape (batch, longest_sequence),
    padded with id 0 (the pad id assigned in SentencePiece training).
    """
    def _pad_stack(seqs):
        width = max(len(seq) for seq in seqs)
        # zeros == pad id 0, so the buffer starts out fully padded
        padded = torch.zeros((len(seqs), width), dtype=torch.long)
        for row, seq in zip(padded, seqs):
            row[:len(seq)] = seq   # `row` is a view, so this writes into `padded`
        return padded

    srcs, tgts = zip(*batch)
    return _pad_stack(srcs), _pad_stack(tgts)

# ----------------------------
# Model: Transformer Seq2Seq using PyTorch's nn.Transformer
# ----------------------------
class Seq2SeqTransformer(nn.Module):
    """
    Encoder-decoder transformer built on nn.Transformer with separate source
    and target embeddings (padding_idx=0), a shared positional encoding and a
    linear projection to target-vocabulary logits.

    All token tensors are sequence-first: (seq_len, batch).
    """
    def __init__(self, num_encoder_layers, num_decoder_layers, emb_size,
                 nhead, src_vocab_size, tgt_vocab_size, dim_feedforward=FFN_HID_DIM, dropout=0.1):
        super().__init__()
        # Submodules are created in a fixed order so parameter ordering and
        # random initialisation stay reproducible.
        self.transformer = nn.Transformer(
            d_model=emb_size,
            nhead=nhead,
            num_encoder_layers=num_encoder_layers,
            num_decoder_layers=num_decoder_layers,
            dim_feedforward=dim_feedforward,
            dropout=dropout,
        )
        self.generator = nn.Linear(emb_size, tgt_vocab_size)
        self.src_tok_emb = nn.Embedding(src_vocab_size, emb_size, padding_idx=0)
        self.tgt_tok_emb = nn.Embedding(tgt_vocab_size, emb_size, padding_idx=0)
        self.positional_encoding = PositionalEncoding(emb_size, dropout=dropout)

        self.emb_size = emb_size

    def _embed(self, table, tokens):
        """Look up `tokens` in `table`, scale by sqrt(emb_size), add positions."""
        scaled = table(tokens) * math.sqrt(self.emb_size)
        return self.positional_encoding(scaled)

    def forward(self, src, tgt, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, memory_key_padding_mask):
        """
        Return logits of shape (tgt_len, batch, tgt_vocab_size).

        `src_mask` is accepted for interface compatibility but is not passed
        to the encoder; only the key-padding mask is applied on the source side.
        """
        # Embed the source before the target (same order as dropout is applied).
        src_emb = self._embed(self.src_tok_emb, src)
        tgt_emb = self._embed(self.tgt_tok_emb, tgt)
        memory = self.transformer.encoder(src_emb, src_key_padding_mask=src_padding_mask)
        decoded = self.transformer.decoder(
            tgt_emb,
            memory,
            tgt_mask=tgt_mask,
            tgt_key_padding_mask=tgt_padding_mask,
            memory_key_padding_mask=memory_key_padding_mask,
        )
        return self.generator(decoded)

    def encode(self, src, src_mask, src_padding_mask):
        """Run only the encoder; `src_mask` is accepted but unused."""
        return self.transformer.encoder(self._embed(self.src_tok_emb, src),
                                        src_key_padding_mask=src_padding_mask)

    def decode(self, tgt, memory, tgt_mask):
        """Run only the decoder over `tgt` against `memory` (no padding masks)."""
        return self.transformer.decoder(self._embed(self.tgt_tok_emb, tgt), memory, tgt_mask=tgt_mask)

class PositionalEncoding(nn.Module):
    """
    Add fixed sinusoidal position information to token embeddings, then apply dropout.

    emb_size: embedding width; both even and odd widths are supported.
    dropout:  dropout probability applied to (embedding + position).
    maxlen:   number of positions precomputed at construction time.
    """
    def __init__(self, emb_size, dropout, maxlen=5000):
        super().__init__()
        # One frequency per sin/cos column pair: 10000^(-2i / emb_size).
        den = torch.exp(- torch.arange(0, emb_size, 2) * math.log(10000) / emb_size)
        pos = torch.arange(0, maxlen).unsqueeze(1)
        angles = pos * den  # (maxlen, ceil(emb_size / 2))
        pos_embedding = torch.zeros((maxlen, emb_size))
        pos_embedding[:, 0::2] = torch.sin(angles)
        # With an odd emb_size there is one fewer odd column than even columns,
        # so drop the last frequency for the cosine half (no-op for even sizes).
        pos_embedding[:, 1::2] = torch.cos(angles[:, : emb_size // 2])
        pos_embedding = pos_embedding.unsqueeze(-2)  # (maxlen, 1, emb_size)
        self.dropout = nn.Dropout(p=dropout)
        # Buffer (not a parameter): moves with .to(device) and is saved in the state dict.
        self.register_buffer('pos_embedding', pos_embedding)

    def forward(self, token_embedding):
        # token_embedding shape: seq_len, batch_size, emb_size
        return self.dropout(token_embedding + self.pos_embedding[:token_embedding.size(0), :])

# ----------------------------
# Mask helpers
# ----------------------------
def generate_square_subsequent_mask(sz):
    """
    Build the (sz, sz) additive causal attention mask on DEVICE.

    Entry [i, j] is 0.0 when j <= i (position i may attend to j) and -inf
    when j > i (future positions are blocked).
    """
    blocked = torch.full((sz, sz), float('-inf'), dtype=torch.float, device=DEVICE)
    # Keep -inf strictly above the diagonal; triu zero-fills everything else.
    return torch.triu(blocked, diagonal=1)

def create_mask(src, tgt):
    """
    Build the masks for one training batch of sequence-first token tensors.

    src, tgt: (seq_len, batch) token-id tensors where id 0 is padding.
    Returns (src_mask, tgt_mask, src_padding_mask, tgt_padding_mask):
      - src_mask: all-False (src_len, src_len) bool mask (nothing blocked)
      - tgt_mask: causal float mask of shape (tgt_len, tgt_len)
      - *_padding_mask: (batch, seq_len) bool masks, True at padding positions
    """
    def _pad_positions(tokens):
        # transpose to batch-first, which is what key_padding_mask expects
        return (tokens == 0).transpose(0, 1).to(DEVICE)

    src_len, tgt_len = src.size(0), tgt.size(0)
    causal = generate_square_subsequent_mask(tgt_len)
    open_src_mask = torch.zeros((src_len, src_len), device=DEVICE).type(torch.bool)
    return open_src_mask, causal.to(DEVICE), _pad_positions(src), _pad_positions(tgt)

# ----------------------------
# Training loop
# ----------------------------
def train_epoch(model, optimizer, dataloader, sp):
    """
    Run one teacher-forced training pass over `dataloader`.

    model:      Seq2SeqTransformer being trained (switched to train mode here).
    optimizer:  optimizer over the model's parameters.
    dataloader: yields batch-first (src_batch, tgt_batch) padded id tensors.
    sp:         tokenizer; accepted for interface symmetry, not used here.
    Returns the mean per-batch cross-entropy loss (padding id 0 is ignored).
    """
    model.train()
    loss_fn = nn.CrossEntropyLoss(ignore_index=0)
    running_loss = 0
    for src_batch, tgt_batch in tqdm(dataloader, desc="Training batches"):
        # The model works sequence-first: (seq_len, batch)
        src = src_batch.transpose(0, 1).to(DEVICE)
        tgt = tgt_batch.transpose(0, 1).to(DEVICE)
        # Teacher forcing: feed tokens [0, T-1), predict tokens [1, T)
        decoder_in = tgt[:-1, :]
        expected = tgt[1:, :]
        src_mask, tgt_mask, src_pad, tgt_pad = create_mask(src, decoder_in)

        optimizer.zero_grad()
        # The source padding mask doubles as the memory key-padding mask.
        logits = model(src, decoder_in, src_mask, tgt_mask, src_pad, tgt_pad, src_pad)
        # logits: seq_len x batch x vocab -> flatten to (seq_len * batch, vocab)
        vocab = logits.shape[-1]
        loss = loss_fn(logits.reshape(-1, vocab), expected.reshape(-1))
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        running_loss += loss.item()
    return running_loss / len(dataloader)

# ----------------------------
# Inference / Greedy decode
# ----------------------------
@torch.no_grad()
def summarize(model, src_text: str, sp: spm.SentencePieceProcessor, max_len=MAX_TGT_LEN):
    """
    Greedy-decode a summary for `src_text`.

    The text is tokenized, truncated to MAX_SRC_LEN (including BOS/EOS),
    encoded once, and then decoded one token at a time (argmax) until EOS is
    produced or `max_len` tokens have been generated.
    Returns the decoded summary string without BOS/EOS.
    """
    model.eval()
    bos, eos = sp.bos_id(), sp.eos_id()

    pieces = sp.encode(src_text, out_type=int)[:MAX_SRC_LEN-2]
    src = torch.tensor([bos, *pieces, eos], dtype=torch.long).unsqueeze(1).to(DEVICE)  # seq_len x 1
    src_len = src.size(0)
    src_mask = torch.zeros((src_len, src_len), device=DEVICE).type(torch.bool)
    src_padding_mask = (src == 0).transpose(0,1)
    memory = model.encode(src, src_mask, src_padding_mask)

    # Generated sequence so far, seq_len x 1, seeded with BOS.
    ys = torch.tensor([bos], dtype=torch.long).unsqueeze(1).to(DEVICE)
    for _ in range(max_len):
        tgt_mask = generate_square_subsequent_mask(ys.size(0)).to(DEVICE)
        hidden = model.decode(ys, memory, tgt_mask).transpose(0,1)  # batch x seq x emb
        scores = model.generator(hidden[:, -1, :])  # batch x vocab, last position only
        next_token = torch.argmax(scores, dim=1).item()
        step = torch.tensor([[next_token]], dtype=torch.long).to(DEVICE)
        ys = torch.cat([ys, step], dim=0)
        if next_token == eos:
            break

    out_ids = ys.squeeze(1).tolist()
    # remove BOS and everything after EOS
    if out_ids and out_ids[0] == bos:
        out_ids = out_ids[1:]
    if eos in out_ids:
        out_ids = out_ids[:out_ids.index(eos)]
    return sp.decode(out_ids)

# ----------------------------
# Semantic extraction: sentence embeddings and cluster-based topic extraction
# ----------------------------
def extract_semantic_insights(transcript_text: str, sp: spm.SentencePieceProcessor, model: Seq2SeqTransformer,
                              n_topics=3, top_n_sentences=3):
    """
    Pick representative sentences and keywords from a transcript.

    transcript_text: raw transcript; split into sentences with NLTK.
    sp:              tokenizer used to encode each sentence.
    model:           trained Seq2SeqTransformer; only its encoder is used.
    n_topics:        number of KMeans clusters (capped at the sentence count).
    top_n_sentences: maximum number of representative sentences returned.

    Returns:
      - representative_sentences: for each cluster, the sentence whose
        embedding is closest to the cluster centre (at most top_n_sentences)
      - keywords: up to 20 uni/bi-gram features ranked by mean TF-IDF score;
        empty when the vectorizer cannot build a vocabulary
    """
    # 1) Split transcript into sentences (nltk)
    sentences = nltk.tokenize.sent_tokenize(transcript_text)
    if not sentences:
        return [], []
    # 2) Get sentence embeddings by encoding sentences and averaging token embeddings from encoder
    model.eval()
    sent_embeddings = []
    with torch.no_grad():
        for s in sentences:
            ids = sp.encode(s, out_type=int)[:MAX_SRC_LEN-2]
            ids = [sp.bos_id()] + ids + [sp.eos_id()]
            src = torch.tensor(ids, dtype=torch.long).unsqueeze(1).to(DEVICE)
            src_padding_mask = (src == 0).transpose(0,1)
            src_mask = torch.zeros((src.size(0), src.size(0)), device=DEVICE).type(torch.bool)
            memory = model.encode(src, src_mask, src_padding_mask)  # seq_len x batch x emb
            # average over sequence dimension
            emb = memory.mean(dim=0).squeeze(0).cpu().numpy()  # emb_size
            sent_embeddings.append(emb)
    # 3) Cluster sentence embeddings to find representative sentences
    n_clusters = min(n_topics, len(sentences))
    kmeans = KMeans(n_clusters=n_clusters, random_state=0).fit(sent_embeddings)
    clusters = kmeans.labels_
    representative_sentences = []
    for c in range(n_clusters):
        idxs = [i for i, lab in enumerate(clusters) if lab == c]
        if not idxs:
            continue
        # pick the sentence closest to cluster center
        center = kmeans.cluster_centers_[c]
        best_idx = min(idxs, key=lambda i: np_l2(sent_embeddings[i], center))
        representative_sentences.append(sentences[best_idx])
    # 4) TF-IDF keywords
    vectorizer = TfidfVectorizer(max_features=50, ngram_range=(1,2))
    try:
        tfidf = vectorizer.fit_transform(sentences)
    except ValueError:
        # Best effort: the vectorizer raises ValueError when no usable
        # vocabulary can be built from the sentences. Anything else is a real
        # error and is allowed to propagate instead of being hidden.
        keywords = []
    else:
        feature_names = vectorizer.get_feature_names_out()
        # average tfidf across sentences
        scores = tfidf.mean(axis=0).A1
        top_k_idx = scores.argsort()[::-1][:20]
        keywords = [feature_names[i] for i in top_k_idx]

    return representative_sentences[:top_n_sentences], keywords

def np_l2(a, b):
    """Return the squared Euclidean distance between `a` and `b` as a float."""
    import numpy as np
    delta = np.array(a) - np.array(b)
    squared = delta ** 2
    return float(np.sum(squared))

# ----------------------------
# Saving / Loading helpers
# ----------------------------
def save_model(model, path):
    """Write the model's state dict (weights only, not the module) to `path`."""
    weights = model.state_dict()
    torch.save(weights, path)
    print("Saved model to", path)

def load_model(path, src_vocab_size, tgt_vocab_size):
    """
    Rebuild a Seq2SeqTransformer with the module-level hyperparameters and
    load the weights stored at `path`.

    The vocabulary sizes must match the ones the checkpoint was trained with.
    Returns the model on DEVICE, in eval mode.
    """
    net = Seq2SeqTransformer(
        NUM_ENCODER_LAYERS,
        NUM_DECODER_LAYERS,
        EMB_SIZE,
        NHEAD,
        src_vocab_size,
        tgt_vocab_size,
    ).to(DEVICE)
    # NOTE(review): torch.load unpickles the file; only load trusted checkpoints.
    state = torch.load(path, map_location=DEVICE)
    net.load_state_dict(state)
    net.eval()
    return net

# ----------------------------
# Example usage & CLI-like helper
# ----------------------------
def build_dummy_pairs_from_transcript(transcript_file: str, sp: spm.SentencePieceProcessor, max_pairs=200):
    """
    Build pseudo-supervised (chunk, summary) pairs from an unlabeled transcript.

    The transcript is split into sentences and grouped into chunks of 8
    sentences; each chunk's "summary" is simply its first 2 sentences
    (a lead-2 extractive baseline). Not ideal, but enough to bootstrap training.

    transcript_file: path to a UTF-8 text file.
    sp:              tokenizer; kept for interface compatibility, not used here.
    max_pairs:       stop once this many pairs have been collected.
    Returns a list of (chunk_text, summary_text) tuples.
    """
    with open(transcript_file, "r", encoding="utf-8") as f:
        text = f.read()
    sentences = nltk.tokenize.sent_tokenize(text)

    pairs = []
    chunk_size = 8
    summary_size = 2
    for start in range(0, len(sentences), chunk_size):
        chunk = " ".join(sentences[start:start + chunk_size])
        if not chunk.strip():
            continue
        # The previous version fitted a TfidfVectorizer here but never used the
        # result: both code paths produced this same lead-2 summary.
        summary = " ".join(sentences[start:start + summary_size])
        pairs.append((chunk, summary))
        if len(pairs) >= max_pairs:
            break
    return pairs

def main_train_flow(transcript_file: str, sp_model: str = TOKENIZER_MODEL, train_new_tokenizer=False):
    """
    High-level training flow:
      - optionally train a tokenizer on the transcript
      - create pseudo pairs (used when no gold summaries exist)
      - train the transformer seq2seq, saving a checkpoint after every epoch

    transcript_file:     path to the Hindi transcript text file.
    sp_model:            SentencePiece model to load when no new one is trained.
    train_new_tokenizer: when True, train "hindi_spm.model" on the transcript first.
    Returns (model, sp): the trained model and the tokenizer that was used.
    Raises ValueError if no training pairs can be built from the transcript.
    """
    # 1) Train tokenizer if requested
    if train_new_tokenizer:
        print("Training SentencePiece tokenizer... (this will create hindi_spm.model)")
        train_sentencepiece(transcript_file, model_prefix="hindi_spm", vocab_size=VOCAB_SIZE)
        sp_path = "hindi_spm.model"
    else:
        sp_path = sp_model

    sp = load_tokenizer(sp_path)
    # 2) Build pseudo supervised pairs
    pairs = build_dummy_pairs_from_transcript(transcript_file, sp, max_pairs=200)
    if not pairs:
        # Fail early with a clear message; an empty dataset would otherwise
        # surface later as a division by zero when averaging the epoch loss.
        raise ValueError("No training pairs could be built from transcript: " + transcript_file)
    dataset = MeetingSummaryDataset(pairs, sp)
    dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)
    # Source and target share one tokenizer, hence one vocabulary size.
    src_vocab_size = sp.get_piece_size()
    tgt_vocab_size = sp.get_piece_size()

    # 3) Build model
    model = Seq2SeqTransformer(NUM_ENCODER_LAYERS, NUM_DECODER_LAYERS, EMB_SIZE,
                               NHEAD, src_vocab_size, tgt_vocab_size).to(DEVICE)
    optimizer = optim.Adam(model.parameters(), lr=LR)
    # 4) Train
    print("Starting training on", DEVICE)
    for epoch in range(1, EPOCHS+1):
        loss = train_epoch(model, optimizer, dataloader, sp)
        print(f"Epoch {epoch}/{EPOCHS} average loss: {loss:.4f}")
        save_model(model, f"transformer_epoch{epoch}.pth")
    # f-string: the message previously printed the literal text "{EPOCHS}".
    print(f"Training complete. Last model saved as transformer_epoch{EPOCHS}.pth")
    return model, sp

# ----------------------------
# Quick inference demonstration (after you trained or load model)
# ----------------------------
if __name__ == "__main__":
    # Command-line / notebook entry point: optionally train the tokenizer and
    # the model, then optionally summarize the transcript and print insights.
    import argparse
    import re
    import sys

    parser = argparse.ArgumentParser(description="Hindi Meeting Summarizer - train or infer")
    # Deliberately not `required=True`: when run inside a notebook no flag is
    # passed, and the placeholder fallback below must be reachable.
    parser.add_argument("--transcript", type=str, default="", help="Path to Hindi transcript text file")
    parser.add_argument("--train-tokenizer", action="store_true", help="Train new SentencePiece tokenizer on transcript")
    parser.add_argument("--train", action="store_true", help="Train model (uses pseudo summaries if no labels)")
    parser.add_argument("--infer", action="store_true", help="Run summarization + semantic extraction on transcript")
    parser.add_argument("--model-path", type=str, default="", help="Path to saved model .pth for inference")

    if not hasattr(sys, 'argv'):
        sys.argv = ['']  # Provide a dummy list if sys.argv doesn't exist (as in some embedded environments)

    # parse_known_args: notebook kernels put their own entries into sys.argv
    # (e.g. "-f <connection file>"); ignore anything this parser does not know.
    args, _unknown_args = parser.parse_known_args()

    # Provide a placeholder transcript path if none was given (notebook execution)
    if not args.transcript:
        print("No transcript path provided. Using a placeholder path for notebook execution.")
        args.transcript = "/content/dummy_transcript.txt"  # Replace with a valid path or method to get the transcript

    # 1) ensure transcript exists
    transcript_file = args.transcript
    if not os.path.exists(transcript_file):
        # Create a dummy file if it doesn't exist for the placeholder case
        if args.transcript == "/content/dummy_transcript.txt":
            print("Creating a dummy transcript file for demonstration.")
            # The placeholder directory only exists on Colab; create it elsewhere.
            os.makedirs(os.path.dirname(transcript_file), exist_ok=True)
            with open(transcript_file, "w", encoding="utf-8") as f:
                f.write("यह एक डमी हिंदी ट्रांसक्रिप्ट है। इसमें कुछ वाक्य हैं।")  # Dummy Hindi text
        else:
            raise FileNotFoundError("Transcript file not found: " + transcript_file)

    # Optionally train tokenizer & model
    if args.train:
        model, sp = main_train_flow(transcript_file, train_new_tokenizer=args.train_tokenizer)
        # Tokenizer is in hindi_spm.model (if train_new_tokenizer), otherwise the existing TOKENIZER_MODEL
        sp_save_path = TOKENIZER_MODEL if not args.train_tokenizer else "hindi_spm.model"
        print("Tokenizer model:", sp_save_path)
        print("Model trained and available in current dir.")
    else:
        # load tokenizer
        if args.train_tokenizer:
            train_sentencepiece(transcript_file, model_prefix="hindi_spm", vocab_size=VOCAB_SIZE)
            sp_path = "hindi_spm.model"
        else:
            sp_path = TOKENIZER_MODEL
        if not os.path.exists(sp_path):
            raise FileNotFoundError("SentencePiece model not found. Use --train-tokenizer to create one.")
        sp = load_tokenizer(sp_path)

    # Inference
    if args.infer:
        # either load the provided model or the latest "transformer_epochX.pth" in the current directory
        if args.model_path:
            # Explicit check instead of `assert`, which is stripped under `python -O`.
            if not os.path.exists(args.model_path):
                raise FileNotFoundError("Model file not found: " + args.model_path)
            model_file = args.model_path
        else:
            def _epoch_key(name):
                # Order by the numeric epoch so epoch10 sorts after epoch2;
                # a plain string sort would put "epoch10" before "epoch2".
                match = re.fullmatch(r"transformer_epoch(\d+)\.pth", name)
                return (int(match.group(1)) if match else -1, name)

            pths = sorted(
                (p for p in os.listdir('.') if p.startswith("transformer_epoch") and p.endswith(".pth")),
                key=_epoch_key,
            )
            if not pths:
                raise FileNotFoundError("No model .pth found. Train first or provide --model-path.")
            model_file = pths[-1]
        model = load_model(model_file, sp.get_piece_size(), sp.get_piece_size()).to(DEVICE)

        with open(transcript_file, "r", encoding="utf-8") as f:
            text = f.read()

        print("\n=== Generating summary ===")
        summary = summarize(model, text, sp, max_len=120)
        print("\nSUMMARY (Hindi):\n", summary)

        print("\n=== Extracting semantic insights ===")
        reps, keywords = extract_semantic_insights(text, sp, model, n_topics=4, top_n_sentences=4)
        print("\nRepresentative sentences (topics):")
        for s in reps:
            print("-", s)
        print("\nTop keywords (TF-IDF):")
        print(", ".join(keywords[:30]))

    print("\nDone.")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
usage: colab_kernel_launcher.py [-h] --transcript TRANSCRIPT
                                [--train-tokenizer] [--train] [--infer]
                                [--model-path MODEL_PATH]
colab_kernel_launcher.py: error: the following arguments are required: --transcript
ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/usr/lib/python3.12/argparse.py", line 1943, in _parse_known_args2
    namespace, args = self._parse_known_args(args, namespace, intermixed)
                      ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/lib/python3.12/argparse.py", line 2230, in _parse_known_args
    raise ArgumentError(None, _('the following arguments are required: %s') %
argparse.ArgumentError: the following arguments are required: --transcript

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/IPython/core/interactiveshell.py", line 3553, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "/tmp/ipython-input-4119399577.py", line 409, in <cell line: 0>
    args = parser.parse_args()
           ^^^^^^^^^^^^^^^^^^^
  File "/usr/lib/python3.12/argparse.py", line 1904, in parse_args
    args, argv = self.parse_known_args(args, names

TypeError: object of type 'NoneType' has no len()

In [None]:
# Notebook shell command: install the packages imported by the cells below
# (scikit-learn is the PyPI package that provides the `sklearn` module).
!pip install torch sentencepiece scikit-learn nltk tqdm




In [None]:
# Colab-only helper: prompts for local files to upload into the runtime.
from google.colab import files
uploaded = files.upload()  # upload your meeting_hindi.txt file


Saving meeting_hindi.txt to meeting_hindi.txt


In [None]:
import sentencepiece as spm

# Train a small BPE tokenizer on the uploaded transcript; this writes hindi_spm.model.
# The special-token ids are set explicitly because the training and inference code
# hard-codes 0 as the padding id (collate_fn padding, the embeddings' padding_idx,
# the loss' ignore_index). SentencePiece's defaults are unk=0, bos=1, eos=2 and
# no pad token, which would make id 0 mean <unk> instead of padding.
spm.SentencePieceTrainer.train(
    input="/content/meeting_hindi.txt",
    model_prefix="hindi_spm",
    vocab_size=1000,          # 👈 reduced
    character_coverage=1.0,
    model_type='bpe',
    pad_id=0,
    unk_id=1,
    bos_id=2,
    eos_id=3,
)


In [None]:
#!/usr/bin/env python3
"""
Hindi Meeting Summarizer + Semantic Extractor
Copy-paste this file and run.

Requirements:
pip install torch sentencepiece scikit-learn nltk tqdm

Notes:
- Training seq2seq well requires (transcript, summary) pairs. If you don't have many labeled pairs,
  consider creating a small seed dataset or using an extractive pseudo-summary as targets to bootstrap.
- Training on GPU is strongly recommended.
"""

import os
import math
import random
from pathlib import Path
from typing import List, Tuple

import sentencepiece as spm
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import nltk
from tqdm import tqdm

# Download the Punkt sentence-tokenizer data used by nltk.tokenize.sent_tokenize
# (Hindi works reasonably). Newer NLTK releases load the "punkt_tab" resource
# instead of the pickled "punkt" models, so fetch both; a package that is
# already installed is simply reported as up to date.
nltk.download('punkt')
nltk.download('punkt_tab')

# ----------------------------
# Config / Hyperparameters
# ----------------------------
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
TOKENIZER_MODEL = "hindi_spm.model"   # default SentencePiece model file for load_tokenizer
VOCAB_SIZE = 8000         # subword vocab size
MAX_SRC_LEN = 1024        # max source length in tokens, including BOS/EOS
MAX_TGT_LEN = 120         # max summary length in tokens, including BOS/EOS
BATCH_SIZE = 8
EMB_SIZE = 512            # transformer d_model / embedding width
NHEAD = 8                 # attention heads
FFN_HID_DIM = 1024        # feed-forward width inside each transformer layer
NUM_ENCODER_LAYERS = 3
NUM_DECODER_LAYERS = 3
LR = 1e-4                 # Adam learning rate
EPOCHS = 1000
# Special-token piece strings. The code visible in this file works with the
# numeric ids (pad=0, unk=1, bos=2, eos=3) rather than with these strings.
PAD_TOKEN = "<pad>"
BOS_TOKEN = "<s>"
EOS_TOKEN = "</s>"
UNK_TOKEN = "<unk>"

# ----------------------------
# Utilities: Tokenizer (SentencePiece)
# ----------------------------
def train_sentencepiece(input_txt_path: str, model_prefix: str = "hindi_spm", vocab_size: int = VOCAB_SIZE):
    """
    Fit a SentencePiece unigram tokenizer on a plain-text corpus.

    input_txt_path: text file with many Hindi sentences (one per line preferred).
    model_prefix:   prefix of the output files; the model is reported as
                    "<model_prefix>.model".
    vocab_size:     number of subword pieces to learn.
    """
    # The special-token ids are pinned because the rest of the pipeline
    # hard-codes 0 as the padding id (collate_fn, embeddings, loss).
    special_ids = {"pad_id": 0, "unk_id": 1, "bos_id": 2, "eos_id": 3}
    trainer_options = dict(
        input=input_txt_path,
        model_prefix=model_prefix,
        vocab_size=vocab_size,
        character_coverage=0.9995,   # high coverage for Hindi
        model_type='unigram',        # or 'bpe'
        **special_ids,
    )
    spm.SentencePieceTrainer.train(**trainer_options)

    model_file = model_prefix + ".model"
    print("Trained SentencePiece model:", model_file)

def load_tokenizer(model_path: str = TOKENIZER_MODEL):
    """
    Create a SentencePieceProcessor and load the model stored at `model_path`.

    Returns the loaded processor.
    """
    processor = spm.SentencePieceProcessor()
    processor.load(model_path)
    return processor

# ----------------------------
# Dataset
# ----------------------------
class MeetingSummaryDataset(Dataset):
    """
    Supervised dataset over (transcript_text, summary_text) pairs.

    Each item is a pair of 1-D long tensors: the tokenized transcript and the
    tokenized summary, both wrapped in BOS ... EOS and truncated so the total
    length does not exceed max_src_len / max_tgt_len respectively.
    If you don't have summaries, extractive heuristics can provide pseudo summaries.
    """
    def __init__(self, pairs: List[Tuple[str,str]], sp: spm.SentencePieceProcessor,
                 max_src_len=MAX_SRC_LEN, max_tgt_len=MAX_TGT_LEN):
        self.pairs = pairs
        self.sp = sp
        self.max_src_len = max_src_len
        self.max_tgt_len = max_tgt_len

    def __len__(self):
        return len(self.pairs)

    def _encode_with_markers(self, text, budget):
        """Tokenize `text`, keep at most budget - 2 pieces, and add BOS/EOS."""
        # Two slots of the budget are reserved for the BOS and EOS ids.
        body = self.sp.encode(text, out_type=int)[: budget - 2]
        wrapped = [self.sp.bos_id(), *body, self.sp.eos_id()]
        return torch.tensor(wrapped, dtype=torch.long)

    def __getitem__(self, idx):
        transcript, summary = self.pairs[idx]
        src_tensor = self._encode_with_markers(transcript, self.max_src_len)
        tgt_tensor = self._encode_with_markers(summary, self.max_tgt_len)
        return src_tensor, tgt_tensor

def collate_fn(batch):
    """
    Collate (src, tgt) tensor pairs into two right-padded long tensors.

    batch: iterable of (src_ids, tgt_ids) 1-D tensors.
    Returns (src_batch, tgt_batch), each of shape (batch, longest_sequence),
    padded with id 0 (the pad id assigned in SentencePiece training).
    """
    def _pad_stack(seqs):
        width = max(len(seq) for seq in seqs)
        # zeros == pad id 0, so the buffer starts out fully padded
        padded = torch.zeros((len(seqs), width), dtype=torch.long)
        for row, seq in zip(padded, seqs):
            row[:len(seq)] = seq   # `row` is a view, so this writes into `padded`
        return padded

    srcs, tgts = zip(*batch)
    return _pad_stack(srcs), _pad_stack(tgts)

# ----------------------------
# Model: Transformer Seq2Seq using PyTorch's nn.Transformer
# ----------------------------
class Seq2SeqTransformer(nn.Module):
    """
    Encoder-decoder transformer built on nn.Transformer with separate source
    and target embeddings (padding_idx=0), a shared positional encoding and a
    linear projection to target-vocabulary logits.

    All token tensors are sequence-first: (seq_len, batch).
    """
    def __init__(self, num_encoder_layers, num_decoder_layers, emb_size,
                 nhead, src_vocab_size, tgt_vocab_size, dim_feedforward=FFN_HID_DIM, dropout=0.1):
        super().__init__()
        # Submodules are created in a fixed order so parameter ordering and
        # random initialisation stay reproducible.
        self.transformer = nn.Transformer(
            d_model=emb_size,
            nhead=nhead,
            num_encoder_layers=num_encoder_layers,
            num_decoder_layers=num_decoder_layers,
            dim_feedforward=dim_feedforward,
            dropout=dropout,
        )
        self.generator = nn.Linear(emb_size, tgt_vocab_size)
        self.src_tok_emb = nn.Embedding(src_vocab_size, emb_size, padding_idx=0)
        self.tgt_tok_emb = nn.Embedding(tgt_vocab_size, emb_size, padding_idx=0)
        self.positional_encoding = PositionalEncoding(emb_size, dropout=dropout)

        self.emb_size = emb_size

    def _embed(self, table, tokens):
        """Look up `tokens` in `table`, scale by sqrt(emb_size), add positions."""
        scaled = table(tokens) * math.sqrt(self.emb_size)
        return self.positional_encoding(scaled)

    def forward(self, src, tgt, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, memory_key_padding_mask):
        """
        Return logits of shape (tgt_len, batch, tgt_vocab_size).

        `src_mask` is accepted for interface compatibility but is not passed
        to the encoder; only the key-padding mask is applied on the source side.
        """
        # Embed the source before the target (same order as dropout is applied).
        src_emb = self._embed(self.src_tok_emb, src)
        tgt_emb = self._embed(self.tgt_tok_emb, tgt)
        memory = self.transformer.encoder(src_emb, src_key_padding_mask=src_padding_mask)
        decoded = self.transformer.decoder(
            tgt_emb,
            memory,
            tgt_mask=tgt_mask,
            tgt_key_padding_mask=tgt_padding_mask,
            memory_key_padding_mask=memory_key_padding_mask,
        )
        return self.generator(decoded)

    def encode(self, src, src_mask, src_padding_mask):
        """Run only the encoder; `src_mask` is accepted but unused."""
        return self.transformer.encoder(self._embed(self.src_tok_emb, src),
                                        src_key_padding_mask=src_padding_mask)

    def decode(self, tgt, memory, tgt_mask):
        """Run only the decoder over `tgt` against `memory` (no padding masks)."""
        return self.transformer.decoder(self._embed(self.tgt_tok_emb, tgt), memory, tgt_mask=tgt_mask)

class PositionalEncoding(nn.Module):
    """
    Add fixed sinusoidal position information to token embeddings, then apply dropout.

    emb_size: embedding width; both even and odd widths are supported.
    dropout:  dropout probability applied to (embedding + position).
    maxlen:   number of positions precomputed at construction time.
    """
    def __init__(self, emb_size, dropout, maxlen=5000):
        super().__init__()
        # One frequency per sin/cos column pair: 10000^(-2i / emb_size).
        den = torch.exp(- torch.arange(0, emb_size, 2) * math.log(10000) / emb_size)
        pos = torch.arange(0, maxlen).unsqueeze(1)
        angles = pos * den  # (maxlen, ceil(emb_size / 2))
        pos_embedding = torch.zeros((maxlen, emb_size))
        pos_embedding[:, 0::2] = torch.sin(angles)
        # With an odd emb_size there is one fewer odd column than even columns,
        # so drop the last frequency for the cosine half (no-op for even sizes).
        pos_embedding[:, 1::2] = torch.cos(angles[:, : emb_size // 2])
        pos_embedding = pos_embedding.unsqueeze(-2)  # (maxlen, 1, emb_size)
        self.dropout = nn.Dropout(p=dropout)
        # Buffer (not a parameter): moves with .to(device) and is saved in the state dict.
        self.register_buffer('pos_embedding', pos_embedding)

    def forward(self, token_embedding):
        # token_embedding shape: seq_len, batch_size, emb_size
        return self.dropout(token_embedding + self.pos_embedding[:token_embedding.size(0), :])

# ----------------------------
# Mask helpers
# ----------------------------
def generate_square_subsequent_mask(sz):
    """
    Build the (sz, sz) additive causal attention mask on DEVICE.

    Entry [i, j] is 0.0 when j <= i (position i may attend to j) and -inf
    when j > i (future positions are blocked).
    """
    blocked = torch.full((sz, sz), float('-inf'), dtype=torch.float, device=DEVICE)
    # Keep -inf strictly above the diagonal; triu zero-fills everything else.
    return torch.triu(blocked, diagonal=1)

def create_mask(src, tgt):
    """
    Build the masks for one training batch of sequence-first token tensors.

    src, tgt: (seq_len, batch) token-id tensors where id 0 is padding.
    Returns (src_mask, tgt_mask, src_padding_mask, tgt_padding_mask):
      - src_mask: all-False (src_len, src_len) bool mask (nothing blocked)
      - tgt_mask: causal float mask of shape (tgt_len, tgt_len)
      - *_padding_mask: (batch, seq_len) bool masks, True at padding positions
    """
    def _pad_positions(tokens):
        # transpose to batch-first, which is what key_padding_mask expects
        return (tokens == 0).transpose(0, 1).to(DEVICE)

    src_len, tgt_len = src.size(0), tgt.size(0)
    causal = generate_square_subsequent_mask(tgt_len)
    open_src_mask = torch.zeros((src_len, src_len), device=DEVICE).type(torch.bool)
    return open_src_mask, causal.to(DEVICE), _pad_positions(src), _pad_positions(tgt)

# ----------------------------
# Training loop
# ----------------------------
def train_epoch(model, optimizer, dataloader, sp):
    """
    Run one teacher-forced training pass over `dataloader`.

    model:      Seq2SeqTransformer being trained (switched to train mode here).
    optimizer:  optimizer over the model's parameters.
    dataloader: yields batch-first (src_batch, tgt_batch) padded id tensors.
    sp:         tokenizer; accepted for interface symmetry, not used here.
    Returns the mean per-batch cross-entropy loss (padding id 0 is ignored).
    """
    model.train()
    loss_fn = nn.CrossEntropyLoss(ignore_index=0)
    running_loss = 0
    for src_batch, tgt_batch in tqdm(dataloader, desc="Training batches"):
        # The model works sequence-first: (seq_len, batch)
        src = src_batch.transpose(0, 1).to(DEVICE)
        tgt = tgt_batch.transpose(0, 1).to(DEVICE)
        # Teacher forcing: feed tokens [0, T-1), predict tokens [1, T)
        decoder_in = tgt[:-1, :]
        expected = tgt[1:, :]
        src_mask, tgt_mask, src_pad, tgt_pad = create_mask(src, decoder_in)

        optimizer.zero_grad()
        # The source padding mask doubles as the memory key-padding mask.
        logits = model(src, decoder_in, src_mask, tgt_mask, src_pad, tgt_pad, src_pad)
        # logits: seq_len x batch x vocab -> flatten to (seq_len * batch, vocab)
        vocab = logits.shape[-1]
        loss = loss_fn(logits.reshape(-1, vocab), expected.reshape(-1))
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        running_loss += loss.item()
    return running_loss / len(dataloader)

# ----------------------------
# Inference / Greedy decode
# ----------------------------
@torch.no_grad()
def summarize(model, src_text: str, sp: spm.SentencePieceProcessor, max_len=MAX_TGT_LEN):
    """Greedy-decode a summary of ``src_text`` with the seq2seq ``model``.

    The text is tokenized with ``sp``, truncated to ``MAX_SRC_LEN - 2`` pieces
    and wrapped in BOS/EOS. Decoding starts from BOS and appends the arg-max
    token at each step, stopping after ``max_len`` steps or as soon as EOS is
    produced. The leading BOS and everything from the first EOS onward are
    stripped before the ids are decoded back to text.

    Returns the decoded summary string.
    """
    model.eval()
    bos, eos = sp.bos_id(), sp.eos_id()

    # Encode the source once; it is reused at every decoding step.
    pieces = sp.encode(src_text, out_type=int)[:MAX_SRC_LEN-2]
    src = torch.tensor([bos] + pieces + [eos], dtype=torch.long).unsqueeze(1).to(DEVICE)  # seq_len x 1
    src_len = src.size(0)
    src_mask = torch.zeros((src_len, src_len), device=DEVICE).type(torch.bool)
    src_padding_mask = (src == 0).transpose(0, 1)
    memory = model.encode(src, src_mask, src_padding_mask)

    ys = torch.tensor([bos], dtype=torch.long).unsqueeze(1).to(DEVICE)
    for _ in range(max_len):
        causal = generate_square_subsequent_mask(ys.size(0)).to(DEVICE)
        decoded = model.decode(ys, memory, causal)      # seq x batch x emb
        last_step = decoded[-1, :, :]                   # batch x emb
        scores = model.generator(last_step)             # batch x vocab
        next_token = scores.argmax(dim=1).item()
        step = torch.tensor([[next_token]], dtype=torch.long).to(DEVICE)
        ys = torch.cat([ys, step], dim=0)
        if next_token == eos:
            break

    token_ids = ys.squeeze(1).tolist()
    # Drop the leading BOS, then cut at the first EOS (if one was generated).
    if token_ids and token_ids[0] == bos:
        token_ids = token_ids[1:]
    if eos in token_ids:
        token_ids = token_ids[:token_ids.index(eos)]
    return sp.decode(token_ids)

# ----------------------------
# Semantic extraction: sentence embeddings and cluster-based topic extraction
# ----------------------------
def extract_semantic_insights(transcript_text: str, sp: spm.SentencePieceProcessor, model: Seq2SeqTransformer,
                              n_topics=3, top_n_sentences=3):
    """
    Pick representative sentences and keywords from a transcript.

    Sentences are embedded by mean-pooling the encoder output of ``model``,
    clustered with KMeans into ``min(n_topics, number of sentences)`` groups,
    and the sentence nearest to each cluster centre is kept. Keywords are the
    highest mean TF-IDF unigrams/bigrams over the sentences.

    Returns:
      - representative_sentences: at most top_n_sentences sentences, one per
        non-empty cluster, in cluster-index order
      - keywords: up to 20 TF-IDF terms, best first ([] if TF-IDF fails)

    An empty transcript yields ([], []).
    """
    # 1) Sentence split (nltk)
    sentences = nltk.tokenize.sent_tokenize(transcript_text)
    if not sentences:
        return [], []

    # 2) One embedding per sentence: mean of the encoder states over the sequence
    model.eval()
    bos, eos = sp.bos_id(), sp.eos_id()
    sent_embeddings = []
    with torch.no_grad():
        for sentence in sentences:
            piece_ids = [bos] + sp.encode(sentence, out_type=int)[:MAX_SRC_LEN-2] + [eos]
            src = torch.tensor(piece_ids, dtype=torch.long).unsqueeze(1).to(DEVICE)
            pad_mask = (src == 0).transpose(0, 1)
            attn_mask = torch.zeros((src.size(0), src.size(0)), device=DEVICE).type(torch.bool)
            memory = model.encode(src, attn_mask, pad_mask)  # seq_len x batch x emb
            pooled = memory.mean(dim=0).squeeze(0)            # emb_size
            sent_embeddings.append(pooled.cpu().numpy())

    # 3) Cluster the embeddings and keep the sentence closest to each centre
    n_clusters = min(n_topics, len(sentences))
    kmeans = KMeans(n_clusters=n_clusters, random_state=0).fit(sent_embeddings)
    members_by_cluster = {}
    for sent_idx, label in enumerate(kmeans.labels_):
        members_by_cluster.setdefault(int(label), []).append(sent_idx)

    representative_sentences = []
    for cluster_id in range(n_clusters):
        members = members_by_cluster.get(cluster_id)
        if not members:
            continue
        center = kmeans.cluster_centers_[cluster_id]
        nearest = min(members, key=lambda i: np_l2(sent_embeddings[i], center))
        representative_sentences.append(sentences[nearest])

    # 4) TF-IDF keywords (best effort: any failure yields no keywords)
    vectorizer = TfidfVectorizer(max_features=50, ngram_range=(1,2))
    try:
        tfidf = vectorizer.fit_transform(sentences)
        feature_names = vectorizer.get_feature_names_out()
        # rank terms by their mean tf-idf weight across all sentences
        mean_scores = tfidf.mean(axis=0).A1
        ranked = mean_scores.argsort()[::-1][:20]
        keywords = [feature_names[term_idx] for term_idx in ranked]
    except Exception:
        keywords = []

    return representative_sentences[:top_n_sentences], keywords

def np_l2(a, b):
    """Return the squared Euclidean distance between ``a`` and ``b`` as a float.

    Note that no square root is taken; the value is only used for ranking, so
    the ordering matches that of the true L2 distance.
    """
    import numpy as np
    delta = np.array(a) - np.array(b)
    return float(np.square(delta).sum())

# ----------------------------
# Saving / Loading helpers
# ----------------------------
def save_model(model, path):
    """Write ``model``'s ``state_dict`` to ``path`` and print a confirmation."""
    weights = model.state_dict()
    torch.save(weights, path)
    print("Saved model to", path)

def load_model(path, src_vocab_size, tgt_vocab_size):
    """Rebuild a ``Seq2SeqTransformer`` and load weights saved at ``path``.

    The architecture is taken from the module-level hyperparameters
    (``NUM_ENCODER_LAYERS``, ``NUM_DECODER_LAYERS``, ``EMB_SIZE``, ``NHEAD``),
    so they must match the values used when the checkpoint was written.

    Returns the model on ``DEVICE``, switched to eval mode.
    """
    net = Seq2SeqTransformer(NUM_ENCODER_LAYERS, NUM_DECODER_LAYERS, EMB_SIZE,
                             NHEAD, src_vocab_size, tgt_vocab_size)
    net = net.to(DEVICE)
    # NOTE(review): torch.load deserializes the file; only load checkpoints
    # that come from a trusted source.
    state = torch.load(path, map_location=DEVICE)
    net.load_state_dict(state)
    net.eval()
    return net

# ----------------------------
# Example usage & CLI-like helper
# ----------------------------
def build_dummy_pairs_from_transcript(transcript_file: str, sp: spm.SentencePieceProcessor, max_pairs=200):
    """
    If you don't have (transcript, summary) labeled pairs, this helper creates
    pseudo-supervised pairs from a raw transcript file.
    Not ideal but can bootstrap training.

    The transcript is split into sentences with NLTK and grouped into
    consecutive chunks of up to 8 sentences. The pseudo-summary of a chunk is
    simply its first 2 sentences (a "lead-2" extractive baseline); no TF-IDF
    ranking is applied.

    Args:
      - transcript_file: path to a UTF-8 text file holding the transcript
      - sp: tokenizer; not used here, kept so existing callers keep working
      - max_pairs: upper bound on the number of pairs returned

    Returns:
      - pairs: list of (chunk_text, summary_text) tuples in transcript order
    """
    with open(transcript_file, "r", encoding="utf-8") as f:
        text = f.read()
    sentences = nltk.tokenize.sent_tokenize(text)

    chunk_size = 8
    summary_size = 2
    pairs = []
    for start in range(0, len(sentences), chunk_size):
        # Check the cap before appending so max_pairs <= 0 yields no pairs.
        if len(pairs) >= max_pairs:
            break
        chunk_sentences = sentences[start:start + chunk_size]
        chunk = " ".join(chunk_sentences)
        if not chunk.strip():
            continue
        summary = " ".join(chunk_sentences[:summary_size])
        pairs.append((chunk, summary))
    return pairs

def main_train_flow(transcript_file: str, sp_model: str = TOKENIZER_MODEL, train_new_tokenizer=False):
    """
    High-level flow:
      - optionally train tokenizer on transcript
      - create pseudo pairs (if no gold summaries)
      - train transformer seq2seq

    Args:
      - transcript_file: path to the transcript text file; used both for
        (optional) tokenizer training and for building the training pairs
      - sp_model: path of an existing SentencePiece model, used when
        train_new_tokenizer is False
      - train_new_tokenizer: if True, train a new tokenizer and use
        "hindi_spm.model" instead of sp_model

    Returns:
      - (model, sp): the trained model and the loaded tokenizer

    Side effects: writes one checkpoint file "transformer_epoch<N>.pth" per
    epoch into the current working directory.
    """
    # 1) Train tokenizer if requested
    if train_new_tokenizer:
        print("Training SentencePiece tokenizer... (this will create hindi_spm.model)")
        train_sentencepiece(transcript_file, model_prefix="hindi_spm", vocab_size=VOCAB_SIZE)
        sp_path = "hindi_spm.model"
    else:
        sp_path = sp_model

    sp = load_tokenizer(sp_path)
    # 2) Build pseudo supervised pairs
    pairs = build_dummy_pairs_from_transcript(transcript_file, sp, max_pairs=200)
    dataset = MeetingSummaryDataset(pairs, sp)
    dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)
    # Source and target share one tokenizer, hence one vocabulary size.
    src_vocab_size = sp.get_piece_size()
    tgt_vocab_size = sp.get_piece_size()

    # 3) Build model
    model = Seq2SeqTransformer(NUM_ENCODER_LAYERS, NUM_DECODER_LAYERS, EMB_SIZE,
                               NHEAD, src_vocab_size, tgt_vocab_size).to(DEVICE)
    optimizer = optim.Adam(model.parameters(), lr=LR)
    # 4) Train
    print("Starting training on", DEVICE)
    for epoch in range(1, EPOCHS+1):
        loss = train_epoch(model, optimizer, dataloader, sp)
        print(f"Epoch {epoch}/{EPOCHS} average loss: {loss:.4f}")
        save_model(model, f"transformer_epoch{epoch}.pth")
    # f-string so the final checkpoint name is interpolated instead of
    # printing the literal "{EPOCHS}".
    print(f"Training complete. Last model saved as transformer_epoch{EPOCHS}.pth")
    return model, sp



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
import nltk
# Fetch the NLTK data packages for this notebook session.
# 'punkt' is the sentence-tokenizer data used by nltk.tokenize.sent_tokenize.
nltk.download('punkt')
# NOTE(review): 'punkt_tab' is presumably the tokenizer data that newer NLTK
# releases look up instead of 'punkt' — confirm against the installed version.
nltk.download('punkt_tab')
# NOTE(review): 'stopwords' is not referenced by the code visible in this
# section — confirm whether it is still needed.
nltk.download('stopwords')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Train on the transcript with the existing tokenizer model (sp_model keeps its
# default); returns the trained model and the loaded tokenizer.
# NOTE(review): the recorded output below reports 1000 epochs, so EPOCHS was
# presumably changed from the value in the config cell before this run — confirm.
model, sp = main_train_flow("/content/meeting_hindi.txt", train_new_tokenizer=False)


Starting training on cpu


Training batches: 100%|██████████| 1/1 [00:16<00:00, 16.18s/it]


Epoch 1/1000 average loss: 7.0056
Saved model to transformer_epoch1.pth


Training batches: 100%|██████████| 1/1 [00:13<00:00, 13.55s/it]


Epoch 2/1000 average loss: 6.4280
Saved model to transformer_epoch2.pth


Training batches: 100%|██████████| 1/1 [00:07<00:00,  7.09s/it]


Epoch 3/1000 average loss: 6.0456
Saved model to transformer_epoch3.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.20s/it]


Epoch 4/1000 average loss: 5.7786
Saved model to transformer_epoch4.pth


Training batches: 100%|██████████| 1/1 [00:07<00:00,  7.27s/it]


Epoch 5/1000 average loss: 5.5712
Saved model to transformer_epoch5.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.28s/it]


Epoch 6/1000 average loss: 5.4285
Saved model to transformer_epoch6.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.97s/it]


Epoch 7/1000 average loss: 5.3454
Saved model to transformer_epoch7.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.16s/it]


Epoch 8/1000 average loss: 5.2210
Saved model to transformer_epoch8.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.95s/it]


Epoch 9/1000 average loss: 5.1173
Saved model to transformer_epoch9.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.48s/it]


Epoch 10/1000 average loss: 4.9961
Saved model to transformer_epoch10.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.48s/it]


Epoch 11/1000 average loss: 4.8922
Saved model to transformer_epoch11.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.85s/it]


Epoch 12/1000 average loss: 4.7646
Saved model to transformer_epoch12.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.05s/it]


Epoch 13/1000 average loss: 4.6271
Saved model to transformer_epoch13.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.88s/it]


Epoch 14/1000 average loss: 4.5302
Saved model to transformer_epoch14.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.74s/it]


Epoch 15/1000 average loss: 4.3844
Saved model to transformer_epoch15.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.75s/it]


Epoch 16/1000 average loss: 4.2655
Saved model to transformer_epoch16.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.05s/it]


Epoch 17/1000 average loss: 4.1378
Saved model to transformer_epoch17.pth


Training batches: 100%|██████████| 1/1 [00:08<00:00,  8.47s/it]


Epoch 18/1000 average loss: 3.9772
Saved model to transformer_epoch18.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.12s/it]


Epoch 19/1000 average loss: 3.8631
Saved model to transformer_epoch19.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.98s/it]


Epoch 20/1000 average loss: 3.7480
Saved model to transformer_epoch20.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.15s/it]


Epoch 21/1000 average loss: 3.6568
Saved model to transformer_epoch21.pth


Training batches: 100%|██████████| 1/1 [00:07<00:00,  7.09s/it]


Epoch 22/1000 average loss: 3.5400
Saved model to transformer_epoch22.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.35s/it]


Epoch 23/1000 average loss: 3.4239
Saved model to transformer_epoch23.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.78s/it]


Epoch 24/1000 average loss: 3.3215
Saved model to transformer_epoch24.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.31s/it]


Epoch 25/1000 average loss: 3.2194
Saved model to transformer_epoch25.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.32s/it]


Epoch 26/1000 average loss: 3.1282
Saved model to transformer_epoch26.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.74s/it]


Epoch 27/1000 average loss: 3.0047
Saved model to transformer_epoch27.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.96s/it]


Epoch 28/1000 average loss: 2.8858
Saved model to transformer_epoch28.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.73s/it]


Epoch 29/1000 average loss: 2.8139
Saved model to transformer_epoch29.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.97s/it]


Epoch 30/1000 average loss: 2.7247
Saved model to transformer_epoch30.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.55s/it]


Epoch 31/1000 average loss: 2.6160
Saved model to transformer_epoch31.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.92s/it]


Epoch 32/1000 average loss: 2.5538
Saved model to transformer_epoch32.pth


Training batches: 100%|██████████| 1/1 [00:07<00:00,  7.28s/it]


Epoch 33/1000 average loss: 2.5001
Saved model to transformer_epoch33.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.02s/it]


Epoch 34/1000 average loss: 2.3909
Saved model to transformer_epoch34.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.73s/it]


Epoch 35/1000 average loss: 2.2674
Saved model to transformer_epoch35.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.76s/it]


Epoch 36/1000 average loss: 2.1838
Saved model to transformer_epoch36.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.83s/it]


Epoch 37/1000 average loss: 2.1546
Saved model to transformer_epoch37.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.21s/it]


Epoch 38/1000 average loss: 2.0858
Saved model to transformer_epoch38.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  7.00s/it]


Epoch 39/1000 average loss: 1.9921
Saved model to transformer_epoch39.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.97s/it]


Epoch 40/1000 average loss: 1.9160
Saved model to transformer_epoch40.pth


Training batches: 100%|██████████| 1/1 [00:07<00:00,  7.04s/it]


Epoch 41/1000 average loss: 1.8342
Saved model to transformer_epoch41.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.13s/it]


Epoch 42/1000 average loss: 1.8047
Saved model to transformer_epoch42.pth


Training batches: 100%|██████████| 1/1 [00:07<00:00,  7.08s/it]


Epoch 43/1000 average loss: 1.7494
Saved model to transformer_epoch43.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.09s/it]


Epoch 44/1000 average loss: 1.6833
Saved model to transformer_epoch44.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.87s/it]


Epoch 45/1000 average loss: 1.6151
Saved model to transformer_epoch45.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.80s/it]


Epoch 46/1000 average loss: 1.5604
Saved model to transformer_epoch46.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.79s/it]


Epoch 47/1000 average loss: 1.5125
Saved model to transformer_epoch47.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.98s/it]


Epoch 48/1000 average loss: 1.4235
Saved model to transformer_epoch48.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.85s/it]


Epoch 49/1000 average loss: 1.4091
Saved model to transformer_epoch49.pth


Training batches: 100%|██████████| 1/1 [00:09<00:00,  9.70s/it]


Epoch 50/1000 average loss: 1.3711
Saved model to transformer_epoch50.pth


Training batches: 100%|██████████| 1/1 [00:08<00:00,  8.01s/it]


Epoch 51/1000 average loss: 1.3175
Saved model to transformer_epoch51.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.83s/it]


Epoch 52/1000 average loss: 1.2501
Saved model to transformer_epoch52.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.67s/it]


Epoch 53/1000 average loss: 1.1946
Saved model to transformer_epoch53.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.60s/it]


Epoch 54/1000 average loss: 1.1476
Saved model to transformer_epoch54.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.13s/it]


Epoch 55/1000 average loss: 1.1202
Saved model to transformer_epoch55.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.98s/it]


Epoch 56/1000 average loss: 1.0808
Saved model to transformer_epoch56.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.24s/it]


Epoch 57/1000 average loss: 1.0252
Saved model to transformer_epoch57.pth


Training batches: 100%|██████████| 1/1 [00:07<00:00,  7.12s/it]


Epoch 58/1000 average loss: 1.0007
Saved model to transformer_epoch58.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.11s/it]


Epoch 59/1000 average loss: 0.9552
Saved model to transformer_epoch59.pth


Training batches: 100%|██████████| 1/1 [00:07<00:00,  7.04s/it]


Epoch 60/1000 average loss: 0.9064
Saved model to transformer_epoch60.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.06s/it]


Epoch 61/1000 average loss: 0.8914
Saved model to transformer_epoch61.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.94s/it]


Epoch 62/1000 average loss: 0.8594
Saved model to transformer_epoch62.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.82s/it]


Epoch 63/1000 average loss: 0.8418
Saved model to transformer_epoch63.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.79s/it]


Epoch 64/1000 average loss: 0.8004
Saved model to transformer_epoch64.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.83s/it]


Epoch 65/1000 average loss: 0.7440
Saved model to transformer_epoch65.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.69s/it]


Epoch 66/1000 average loss: 0.7380
Saved model to transformer_epoch66.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.81s/it]


Epoch 67/1000 average loss: 0.7062
Saved model to transformer_epoch67.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.74s/it]


Epoch 68/1000 average loss: 0.6980
Saved model to transformer_epoch68.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.75s/it]


Epoch 69/1000 average loss: 0.6745
Saved model to transformer_epoch69.pth


Training batches: 100%|██████████| 1/1 [00:09<00:00,  9.21s/it]


Epoch 70/1000 average loss: 0.6442
Saved model to transformer_epoch70.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.75s/it]


Epoch 71/1000 average loss: 0.6295
Saved model to transformer_epoch71.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.74s/it]


Epoch 72/1000 average loss: 0.5979
Saved model to transformer_epoch72.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.69s/it]


Epoch 73/1000 average loss: 0.5878
Saved model to transformer_epoch73.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.74s/it]


Epoch 74/1000 average loss: 0.5501
Saved model to transformer_epoch74.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.71s/it]


Epoch 75/1000 average loss: 0.5690
Saved model to transformer_epoch75.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.64s/it]


Epoch 76/1000 average loss: 0.5671
Saved model to transformer_epoch76.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.78s/it]


Epoch 77/1000 average loss: 0.5158
Saved model to transformer_epoch77.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.76s/it]


Epoch 78/1000 average loss: 0.5109
Saved model to transformer_epoch78.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.75s/it]


Epoch 79/1000 average loss: 0.4968
Saved model to transformer_epoch79.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.75s/it]


Epoch 80/1000 average loss: 0.4926
Saved model to transformer_epoch80.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.81s/it]


Epoch 81/1000 average loss: 0.4772
Saved model to transformer_epoch81.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.61s/it]


Epoch 82/1000 average loss: 0.4656
Saved model to transformer_epoch82.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.79s/it]


Epoch 83/1000 average loss: 0.4667
Saved model to transformer_epoch83.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.72s/it]


Epoch 84/1000 average loss: 0.4253
Saved model to transformer_epoch84.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.79s/it]


Epoch 85/1000 average loss: 0.4012
Saved model to transformer_epoch85.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.75s/it]


Epoch 86/1000 average loss: 0.4306
Saved model to transformer_epoch86.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.77s/it]


Epoch 87/1000 average loss: 0.4188
Saved model to transformer_epoch87.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.71s/it]


Epoch 88/1000 average loss: 0.3829
Saved model to transformer_epoch88.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.90s/it]


Epoch 89/1000 average loss: 0.4044
Saved model to transformer_epoch89.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.60s/it]


Epoch 90/1000 average loss: 0.3812
Saved model to transformer_epoch90.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.76s/it]


Epoch 91/1000 average loss: 0.3637
Saved model to transformer_epoch91.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.72s/it]


Epoch 92/1000 average loss: 0.3793
Saved model to transformer_epoch92.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  6.00s/it]


Epoch 93/1000 average loss: 0.3774
Saved model to transformer_epoch93.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.74s/it]


Epoch 94/1000 average loss: 0.3442
Saved model to transformer_epoch94.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.86s/it]


Epoch 95/1000 average loss: 0.3301
Saved model to transformer_epoch95.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.74s/it]


Epoch 96/1000 average loss: 0.3158
Saved model to transformer_epoch96.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.86s/it]


Epoch 97/1000 average loss: 0.3136
Saved model to transformer_epoch97.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.73s/it]


Epoch 98/1000 average loss: 0.3128
Saved model to transformer_epoch98.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.74s/it]


Epoch 99/1000 average loss: 0.2964
Saved model to transformer_epoch99.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.78s/it]


Epoch 100/1000 average loss: 0.2976
Saved model to transformer_epoch100.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.79s/it]


Epoch 101/1000 average loss: 0.2889
Saved model to transformer_epoch101.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.83s/it]


Epoch 102/1000 average loss: 0.2753
Saved model to transformer_epoch102.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.84s/it]


Epoch 103/1000 average loss: 0.2684
Saved model to transformer_epoch103.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.79s/it]


Epoch 104/1000 average loss: 0.2812
Saved model to transformer_epoch104.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.85s/it]


Epoch 105/1000 average loss: 0.2708
Saved model to transformer_epoch105.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.78s/it]


Epoch 106/1000 average loss: 0.2673
Saved model to transformer_epoch106.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.95s/it]


Epoch 107/1000 average loss: 0.2626
Saved model to transformer_epoch107.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.72s/it]


Epoch 108/1000 average loss: 0.2595
Saved model to transformer_epoch108.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.85s/it]


Epoch 109/1000 average loss: 0.2465
Saved model to transformer_epoch109.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.45s/it]


Epoch 110/1000 average loss: 0.2253
Saved model to transformer_epoch110.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.83s/it]


Epoch 111/1000 average loss: 0.2410
Saved model to transformer_epoch111.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.49s/it]


Epoch 112/1000 average loss: 0.2255
Saved model to transformer_epoch112.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.85s/it]


Epoch 113/1000 average loss: 0.2388
Saved model to transformer_epoch113.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.53s/it]


Epoch 114/1000 average loss: 0.1977
Saved model to transformer_epoch114.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.88s/it]


Epoch 115/1000 average loss: 0.2217
Saved model to transformer_epoch115.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.61s/it]


Epoch 116/1000 average loss: 0.2124
Saved model to transformer_epoch116.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.86s/it]


Epoch 117/1000 average loss: 0.2053
Saved model to transformer_epoch117.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.65s/it]


Epoch 118/1000 average loss: 0.1982
Saved model to transformer_epoch118.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.78s/it]


Epoch 119/1000 average loss: 0.2411
Saved model to transformer_epoch119.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.64s/it]


Epoch 120/1000 average loss: 0.1907
Saved model to transformer_epoch120.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.83s/it]


Epoch 121/1000 average loss: 0.2033
Saved model to transformer_epoch121.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.65s/it]


Epoch 122/1000 average loss: 0.1926
Saved model to transformer_epoch122.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.86s/it]


Epoch 123/1000 average loss: 0.1710
Saved model to transformer_epoch123.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.73s/it]


Epoch 124/1000 average loss: 0.1661
Saved model to transformer_epoch124.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.92s/it]


Epoch 125/1000 average loss: 0.1801
Saved model to transformer_epoch125.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.78s/it]


Epoch 126/1000 average loss: 0.2050
Saved model to transformer_epoch126.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.89s/it]


Epoch 127/1000 average loss: 0.1908
Saved model to transformer_epoch127.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.67s/it]


Epoch 128/1000 average loss: 0.1815
Saved model to transformer_epoch128.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.80s/it]


Epoch 129/1000 average loss: 0.1890
Saved model to transformer_epoch129.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.73s/it]


Epoch 130/1000 average loss: 0.1642
Saved model to transformer_epoch130.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.80s/it]


Epoch 131/1000 average loss: 0.1592
Saved model to transformer_epoch131.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.72s/it]


Epoch 132/1000 average loss: 0.1806
Saved model to transformer_epoch132.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.75s/it]


Epoch 133/1000 average loss: 0.1674
Saved model to transformer_epoch133.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.72s/it]


Epoch 134/1000 average loss: 0.1901
Saved model to transformer_epoch134.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.82s/it]


Epoch 135/1000 average loss: 0.1601
Saved model to transformer_epoch135.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.65s/it]


Epoch 136/1000 average loss: 0.1709
Saved model to transformer_epoch136.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.80s/it]


Epoch 137/1000 average loss: 0.1730
Saved model to transformer_epoch137.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.67s/it]


Epoch 138/1000 average loss: 0.1610
Saved model to transformer_epoch138.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.71s/it]


Epoch 139/1000 average loss: 0.1786
Saved model to transformer_epoch139.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.74s/it]


Epoch 140/1000 average loss: 0.1706
Saved model to transformer_epoch140.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.76s/it]


Epoch 141/1000 average loss: 0.1566
Saved model to transformer_epoch141.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.76s/it]


Epoch 142/1000 average loss: 0.1649
Saved model to transformer_epoch142.pth


Training batches: 100%|██████████| 1/1 [00:07<00:00,  7.26s/it]


Epoch 143/1000 average loss: 0.1500
Saved model to transformer_epoch143.pth


Training batches: 100%|██████████| 1/1 [00:11<00:00, 11.60s/it]


Epoch 144/1000 average loss: 0.1384
Saved model to transformer_epoch144.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.80s/it]


Epoch 145/1000 average loss: 0.1623
Saved model to transformer_epoch145.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.73s/it]


Epoch 146/1000 average loss: 0.1334
Saved model to transformer_epoch146.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.28s/it]


Epoch 147/1000 average loss: 0.1850
Saved model to transformer_epoch147.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.41s/it]


Epoch 148/1000 average loss: 0.1620
Saved model to transformer_epoch148.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.34s/it]


Epoch 149/1000 average loss: 0.1612
Saved model to transformer_epoch149.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.37s/it]


Epoch 150/1000 average loss: 0.1470
Saved model to transformer_epoch150.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.25s/it]


Epoch 151/1000 average loss: 0.1346
Saved model to transformer_epoch151.pth


Training batches: 100%|██████████| 1/1 [00:07<00:00,  7.92s/it]


Epoch 152/1000 average loss: 0.1482
Saved model to transformer_epoch152.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.67s/it]


Epoch 153/1000 average loss: 0.1528
Saved model to transformer_epoch153.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.31s/it]


Epoch 154/1000 average loss: 0.1475
Saved model to transformer_epoch154.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.81s/it]


Epoch 155/1000 average loss: 0.1405
Saved model to transformer_epoch155.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.34s/it]


Epoch 156/1000 average loss: 0.1176
Saved model to transformer_epoch156.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.84s/it]


Epoch 157/1000 average loss: 0.1501
Saved model to transformer_epoch157.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.26s/it]


Epoch 158/1000 average loss: 0.1171
Saved model to transformer_epoch158.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.79s/it]


Epoch 159/1000 average loss: 0.1380
Saved model to transformer_epoch159.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.07s/it]


Epoch 160/1000 average loss: 0.1276
Saved model to transformer_epoch160.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.75s/it]


Epoch 161/1000 average loss: 0.1271
Saved model to transformer_epoch161.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.86s/it]


Epoch 162/1000 average loss: 0.1198
Saved model to transformer_epoch162.pth


Training batches: 100%|██████████| 1/1 [00:07<00:00,  7.91s/it]


Epoch 163/1000 average loss: 0.1230
Saved model to transformer_epoch163.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.93s/it]


Epoch 164/1000 average loss: 0.1113
Saved model to transformer_epoch164.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.71s/it]


Epoch 165/1000 average loss: 0.1204
Saved model to transformer_epoch165.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.89s/it]


Epoch 166/1000 average loss: 0.1324
Saved model to transformer_epoch166.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.68s/it]


Epoch 167/1000 average loss: 0.1331
Saved model to transformer_epoch167.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.85s/it]


Epoch 168/1000 average loss: 0.1177
Saved model to transformer_epoch168.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.67s/it]


Epoch 169/1000 average loss: 0.1140
Saved model to transformer_epoch169.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.87s/it]


Epoch 170/1000 average loss: 0.0955
Saved model to transformer_epoch170.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.58s/it]


Epoch 171/1000 average loss: 0.1094
Saved model to transformer_epoch171.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.78s/it]


Epoch 172/1000 average loss: 0.1135
Saved model to transformer_epoch172.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.59s/it]


Epoch 173/1000 average loss: 0.1045
Saved model to transformer_epoch173.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.77s/it]


Epoch 174/1000 average loss: 0.0946
Saved model to transformer_epoch174.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.67s/it]


Epoch 175/1000 average loss: 0.1014
Saved model to transformer_epoch175.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.84s/it]


Epoch 176/1000 average loss: 0.1164
Saved model to transformer_epoch176.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.72s/it]


Epoch 177/1000 average loss: 0.0987
Saved model to transformer_epoch177.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.89s/it]


Epoch 178/1000 average loss: 0.1157
Saved model to transformer_epoch178.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.66s/it]


Epoch 179/1000 average loss: 0.0940
Saved model to transformer_epoch179.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.82s/it]


Epoch 180/1000 average loss: 0.0961
Saved model to transformer_epoch180.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.61s/it]


Epoch 181/1000 average loss: 0.0832
Saved model to transformer_epoch181.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.04s/it]


Epoch 182/1000 average loss: 0.0943
Saved model to transformer_epoch182.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.54s/it]


Epoch 183/1000 average loss: 0.0866
Saved model to transformer_epoch183.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.70s/it]


Epoch 184/1000 average loss: 0.0861
Saved model to transformer_epoch184.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.63s/it]


Epoch 185/1000 average loss: 0.0714
Saved model to transformer_epoch185.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.80s/it]


Epoch 186/1000 average loss: 0.0827
Saved model to transformer_epoch186.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.49s/it]


Epoch 187/1000 average loss: 0.0812
Saved model to transformer_epoch187.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.75s/it]


Epoch 188/1000 average loss: 0.1076
Saved model to transformer_epoch188.pth


Training batches: 100%|██████████| 1/1 [00:07<00:00,  7.89s/it]


Epoch 189/1000 average loss: 0.0946
Saved model to transformer_epoch189.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.34s/it]


Epoch 190/1000 average loss: 0.0880
Saved model to transformer_epoch190.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.64s/it]


Epoch 191/1000 average loss: 0.1012
Saved model to transformer_epoch191.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.66s/it]


Epoch 192/1000 average loss: 0.0827
Saved model to transformer_epoch192.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.61s/it]


Epoch 193/1000 average loss: 0.0830
Saved model to transformer_epoch193.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.76s/it]


Epoch 194/1000 average loss: 0.0749
Saved model to transformer_epoch194.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.67s/it]


Epoch 195/1000 average loss: 0.0805
Saved model to transformer_epoch195.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.79s/it]


Epoch 196/1000 average loss: 0.0700
Saved model to transformer_epoch196.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.73s/it]


Epoch 197/1000 average loss: 0.0664
Saved model to transformer_epoch197.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.70s/it]


Epoch 198/1000 average loss: 0.0735
Saved model to transformer_epoch198.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.69s/it]


Epoch 199/1000 average loss: 0.0604
Saved model to transformer_epoch199.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.67s/it]


Epoch 200/1000 average loss: 0.0691
Saved model to transformer_epoch200.pth


Training batches: 100%|██████████| 1/1 [00:09<00:00,  9.63s/it]


Epoch 201/1000 average loss: 0.0501
Saved model to transformer_epoch201.pth


Training batches: 100%|██████████| 1/1 [00:09<00:00,  9.57s/it]


Epoch 202/1000 average loss: 0.0649
Saved model to transformer_epoch202.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.46s/it]


Epoch 203/1000 average loss: 0.0585
Saved model to transformer_epoch203.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.10s/it]


Epoch 204/1000 average loss: 0.0648
Saved model to transformer_epoch204.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.46s/it]


Epoch 205/1000 average loss: 0.0622
Saved model to transformer_epoch205.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.09s/it]


Epoch 206/1000 average loss: 0.0517
Saved model to transformer_epoch206.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.33s/it]


Epoch 207/1000 average loss: 0.0695
Saved model to transformer_epoch207.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.92s/it]


Epoch 208/1000 average loss: 0.0658
Saved model to transformer_epoch208.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.42s/it]


Epoch 209/1000 average loss: 0.0915
Saved model to transformer_epoch209.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.94s/it]


Epoch 210/1000 average loss: 0.0587
Saved model to transformer_epoch210.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.48s/it]


Epoch 211/1000 average loss: 0.0526
Saved model to transformer_epoch211.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.02s/it]


Epoch 212/1000 average loss: 0.0586
Saved model to transformer_epoch212.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.46s/it]


Epoch 213/1000 average loss: 0.0457
Saved model to transformer_epoch213.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.91s/it]


Epoch 214/1000 average loss: 0.0603
Saved model to transformer_epoch214.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.71s/it]


Epoch 215/1000 average loss: 0.0623
Saved model to transformer_epoch215.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.15s/it]


Epoch 216/1000 average loss: 0.0476
Saved model to transformer_epoch216.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.40s/it]


Epoch 217/1000 average loss: 0.0383
Saved model to transformer_epoch217.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.03s/it]


Epoch 218/1000 average loss: 0.0485
Saved model to transformer_epoch218.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.48s/it]


Epoch 219/1000 average loss: 0.0464
Saved model to transformer_epoch219.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.25s/it]


Epoch 220/1000 average loss: 0.0465
Saved model to transformer_epoch220.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.37s/it]


Epoch 221/1000 average loss: 0.0473
Saved model to transformer_epoch221.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.14s/it]


Epoch 222/1000 average loss: 0.0933
Saved model to transformer_epoch222.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.46s/it]


Epoch 223/1000 average loss: 0.0536
Saved model to transformer_epoch223.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.29s/it]


Epoch 224/1000 average loss: 0.0391
Saved model to transformer_epoch224.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.38s/it]


Epoch 225/1000 average loss: 0.0558
Saved model to transformer_epoch225.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.35s/it]


Epoch 226/1000 average loss: 0.0453
Saved model to transformer_epoch226.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.12s/it]


Epoch 227/1000 average loss: 0.0728
Saved model to transformer_epoch227.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.28s/it]


Epoch 228/1000 average loss: 0.0410
Saved model to transformer_epoch228.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.04s/it]


Epoch 229/1000 average loss: 0.0746
Saved model to transformer_epoch229.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.35s/it]


Epoch 230/1000 average loss: 0.0722
Saved model to transformer_epoch230.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.07s/it]


Epoch 231/1000 average loss: 0.0400
Saved model to transformer_epoch231.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.23s/it]


Epoch 232/1000 average loss: 0.0448
Saved model to transformer_epoch232.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.15s/it]


Epoch 233/1000 average loss: 0.0424
Saved model to transformer_epoch233.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.22s/it]


Epoch 234/1000 average loss: 0.0600
Saved model to transformer_epoch234.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.10s/it]


Epoch 235/1000 average loss: 0.0429
Saved model to transformer_epoch235.pth


Training batches: 100%|██████████| 1/1 [00:14<00:00, 14.67s/it]


Epoch 236/1000 average loss: 0.0705
Saved model to transformer_epoch236.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.82s/it]


Epoch 237/1000 average loss: 0.0458
Saved model to transformer_epoch237.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.99s/it]


Epoch 238/1000 average loss: 0.0371
Saved model to transformer_epoch238.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.92s/it]


Epoch 239/1000 average loss: 0.0437
Saved model to transformer_epoch239.pth


Training batches: 100%|██████████| 1/1 [00:07<00:00,  7.05s/it]


Epoch 240/1000 average loss: 0.0274
Saved model to transformer_epoch240.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.96s/it]


Epoch 241/1000 average loss: 0.0378
Saved model to transformer_epoch241.pth


Training batches: 100%|██████████| 1/1 [00:07<00:00,  7.17s/it]


Epoch 242/1000 average loss: 0.0262
Saved model to transformer_epoch242.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.89s/it]


Epoch 243/1000 average loss: 0.0301
Saved model to transformer_epoch243.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.98s/it]


Epoch 244/1000 average loss: 0.0417
Saved model to transformer_epoch244.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  6.00s/it]


Epoch 245/1000 average loss: 0.0270
Saved model to transformer_epoch245.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.97s/it]


Epoch 246/1000 average loss: 0.0443
Saved model to transformer_epoch246.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.06s/it]


Epoch 247/1000 average loss: 0.0371
Saved model to transformer_epoch247.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  7.00s/it]


Epoch 248/1000 average loss: 0.0424
Saved model to transformer_epoch248.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.99s/it]


Epoch 249/1000 average loss: 0.0297
Saved model to transformer_epoch249.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.76s/it]


Epoch 250/1000 average loss: 0.0483
Saved model to transformer_epoch250.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  6.00s/it]


Epoch 251/1000 average loss: 0.0300
Saved model to transformer_epoch251.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.49s/it]


Epoch 252/1000 average loss: 0.0368
Saved model to transformer_epoch252.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.91s/it]


Epoch 253/1000 average loss: 0.0261
Saved model to transformer_epoch253.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.72s/it]


Epoch 254/1000 average loss: 0.0208
Saved model to transformer_epoch254.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.38s/it]


Epoch 255/1000 average loss: 0.0284
Saved model to transformer_epoch255.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.19s/it]


Epoch 256/1000 average loss: 0.0303
Saved model to transformer_epoch256.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.90s/it]


Epoch 257/1000 average loss: 0.0162
Saved model to transformer_epoch257.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.41s/it]


Epoch 258/1000 average loss: 0.0215
Saved model to transformer_epoch258.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.84s/it]


Epoch 259/1000 average loss: 0.0148
Saved model to transformer_epoch259.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.53s/it]


Epoch 260/1000 average loss: 0.0205
Saved model to transformer_epoch260.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.70s/it]


Epoch 261/1000 average loss: 0.0266
Saved model to transformer_epoch261.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.57s/it]


Epoch 262/1000 average loss: 0.0358
Saved model to transformer_epoch262.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.68s/it]


Epoch 263/1000 average loss: 0.0152
Saved model to transformer_epoch263.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.62s/it]


Epoch 264/1000 average loss: 0.0133
Saved model to transformer_epoch264.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.69s/it]


Epoch 265/1000 average loss: 0.0280
Saved model to transformer_epoch265.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.63s/it]


Epoch 266/1000 average loss: 0.0322
Saved model to transformer_epoch266.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.83s/it]


Epoch 267/1000 average loss: 0.0242
Saved model to transformer_epoch267.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.74s/it]


Epoch 268/1000 average loss: 0.0209
Saved model to transformer_epoch268.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.78s/it]


Epoch 269/1000 average loss: 0.0156
Saved model to transformer_epoch269.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.78s/it]


Epoch 270/1000 average loss: 0.0348
Saved model to transformer_epoch270.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.69s/it]


Epoch 271/1000 average loss: 0.0134
Saved model to transformer_epoch271.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.63s/it]


Epoch 272/1000 average loss: 0.0269
Saved model to transformer_epoch272.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.76s/it]


Epoch 273/1000 average loss: 0.0191
Saved model to transformer_epoch273.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.65s/it]


Epoch 274/1000 average loss: 0.0262
Saved model to transformer_epoch274.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.71s/it]


Epoch 275/1000 average loss: 0.0231
Saved model to transformer_epoch275.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.80s/it]


Epoch 276/1000 average loss: 0.0165
Saved model to transformer_epoch276.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.74s/it]


Epoch 277/1000 average loss: 0.0222
Saved model to transformer_epoch277.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.67s/it]


Epoch 278/1000 average loss: 0.0194
Saved model to transformer_epoch278.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.81s/it]


Epoch 279/1000 average loss: 0.0117
Saved model to transformer_epoch279.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.60s/it]


Epoch 280/1000 average loss: 0.0129
Saved model to transformer_epoch280.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.74s/it]


Epoch 281/1000 average loss: 0.0222
Saved model to transformer_epoch281.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.62s/it]


Epoch 282/1000 average loss: 0.0300
Saved model to transformer_epoch282.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.71s/it]


Epoch 283/1000 average loss: 0.0130
Saved model to transformer_epoch283.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.60s/it]


Epoch 284/1000 average loss: 0.0245
Saved model to transformer_epoch284.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.72s/it]


Epoch 285/1000 average loss: 0.0139
Saved model to transformer_epoch285.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.62s/it]


Epoch 286/1000 average loss: 0.0114
Saved model to transformer_epoch286.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.78s/it]


Epoch 287/1000 average loss: 0.0282
Saved model to transformer_epoch287.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.58s/it]


Epoch 288/1000 average loss: 0.0189
Saved model to transformer_epoch288.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.79s/it]


Epoch 289/1000 average loss: 0.0200
Saved model to transformer_epoch289.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.65s/it]


Epoch 290/1000 average loss: 0.0316
Saved model to transformer_epoch290.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.60s/it]


Epoch 291/1000 average loss: 0.0170
Saved model to transformer_epoch291.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.61s/it]


Epoch 292/1000 average loss: 0.0125
Saved model to transformer_epoch292.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.70s/it]


Epoch 293/1000 average loss: 0.0148
Saved model to transformer_epoch293.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.44s/it]


Epoch 294/1000 average loss: 0.0233
Saved model to transformer_epoch294.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.76s/it]


Epoch 295/1000 average loss: 0.0153
Saved model to transformer_epoch295.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.44s/it]


Epoch 296/1000 average loss: 0.0147
Saved model to transformer_epoch296.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.65s/it]


Epoch 297/1000 average loss: 0.0121
Saved model to transformer_epoch297.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.32s/it]


Epoch 298/1000 average loss: 0.0120
Saved model to transformer_epoch298.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.73s/it]


Epoch 299/1000 average loss: 0.0135
Saved model to transformer_epoch299.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.11s/it]


Epoch 300/1000 average loss: 0.0244
Saved model to transformer_epoch300.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.84s/it]


Epoch 301/1000 average loss: 0.0100
Saved model to transformer_epoch301.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.04s/it]


Epoch 302/1000 average loss: 0.0105
Saved model to transformer_epoch302.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.11s/it]


Epoch 303/1000 average loss: 0.0086
Saved model to transformer_epoch303.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.89s/it]


Epoch 304/1000 average loss: 0.0107
Saved model to transformer_epoch304.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.31s/it]


Epoch 305/1000 average loss: 0.0120
Saved model to transformer_epoch305.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.84s/it]


Epoch 306/1000 average loss: 0.0104
Saved model to transformer_epoch306.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.36s/it]


Epoch 307/1000 average loss: 0.0129
Saved model to transformer_epoch307.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.03s/it]


Epoch 308/1000 average loss: 0.0114
Saved model to transformer_epoch308.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.42s/it]


Epoch 309/1000 average loss: 0.0199
Saved model to transformer_epoch309.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.79s/it]


Epoch 310/1000 average loss: 0.0147
Saved model to transformer_epoch310.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.56s/it]


Epoch 311/1000 average loss: 0.0255
Saved model to transformer_epoch311.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.72s/it]


Epoch 312/1000 average loss: 0.0168
Saved model to transformer_epoch312.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.63s/it]


Epoch 313/1000 average loss: 0.0091
Saved model to transformer_epoch313.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.68s/it]


Epoch 314/1000 average loss: 0.0112
Saved model to transformer_epoch314.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.79s/it]


Epoch 315/1000 average loss: 0.0130
Saved model to transformer_epoch315.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.73s/it]


Epoch 316/1000 average loss: 0.0123
Saved model to transformer_epoch316.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.74s/it]


Epoch 317/1000 average loss: 0.0280
Saved model to transformer_epoch317.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.67s/it]


Epoch 318/1000 average loss: 0.0107
Saved model to transformer_epoch318.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.70s/it]


Epoch 319/1000 average loss: 0.0210
Saved model to transformer_epoch319.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.68s/it]


Epoch 320/1000 average loss: 0.0265
Saved model to transformer_epoch320.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.57s/it]


Epoch 321/1000 average loss: 0.0106
Saved model to transformer_epoch321.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.71s/it]


Epoch 322/1000 average loss: 0.0092
Saved model to transformer_epoch322.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.62s/it]


Epoch 323/1000 average loss: 0.0117
Saved model to transformer_epoch323.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.78s/it]


Epoch 324/1000 average loss: 0.0184
Saved model to transformer_epoch324.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.63s/it]


Epoch 325/1000 average loss: 0.0255
Saved model to transformer_epoch325.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.72s/it]


Epoch 326/1000 average loss: 0.0148
Saved model to transformer_epoch326.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.68s/it]


Epoch 327/1000 average loss: 0.0098
Saved model to transformer_epoch327.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.80s/it]


Epoch 328/1000 average loss: 0.0108
Saved model to transformer_epoch328.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.69s/it]


Epoch 329/1000 average loss: 0.0119
Saved model to transformer_epoch329.pth


Training batches: 100%|██████████| 1/1 [00:09<00:00,  9.03s/it]


Epoch 330/1000 average loss: 0.0224
Saved model to transformer_epoch330.pth


Training batches: 100%|██████████| 1/1 [00:10<00:00, 10.56s/it]


Epoch 331/1000 average loss: 0.0148
Saved model to transformer_epoch331.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.72s/it]


Epoch 332/1000 average loss: 0.0067
Saved model to transformer_epoch332.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.42s/it]


Epoch 333/1000 average loss: 0.0078
Saved model to transformer_epoch333.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.54s/it]


Epoch 334/1000 average loss: 0.0075
Saved model to transformer_epoch334.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.96s/it]


Epoch 335/1000 average loss: 0.0112
Saved model to transformer_epoch335.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.37s/it]


Epoch 336/1000 average loss: 0.0496
Saved model to transformer_epoch336.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.02s/it]


Epoch 337/1000 average loss: 0.0116
Saved model to transformer_epoch337.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.96s/it]


Epoch 338/1000 average loss: 0.0060
Saved model to transformer_epoch338.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.97s/it]


Epoch 339/1000 average loss: 0.0064
Saved model to transformer_epoch339.pth


Training batches: 100%|██████████| 1/1 [00:07<00:00,  7.12s/it]


Epoch 340/1000 average loss: 0.0058
Saved model to transformer_epoch340.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.96s/it]


Epoch 341/1000 average loss: 0.0119
Saved model to transformer_epoch341.pth


Training batches: 100%|██████████| 1/1 [00:07<00:00,  7.16s/it]


Epoch 342/1000 average loss: 0.0068
Saved model to transformer_epoch342.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.00s/it]


Epoch 343/1000 average loss: 0.0058
Saved model to transformer_epoch343.pth


Training batches: 100%|██████████| 1/1 [00:07<00:00,  7.18s/it]


Epoch 344/1000 average loss: 0.0094
Saved model to transformer_epoch344.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.78s/it]


Epoch 345/1000 average loss: 0.0066
Saved model to transformer_epoch345.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.93s/it]


Epoch 346/1000 average loss: 0.0204
Saved model to transformer_epoch346.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.69s/it]


Epoch 347/1000 average loss: 0.0081
Saved model to transformer_epoch347.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.69s/it]


Epoch 348/1000 average loss: 0.0158
Saved model to transformer_epoch348.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.66s/it]


Epoch 349/1000 average loss: 0.0051
Saved model to transformer_epoch349.pth


Training batches: 100%|██████████| 1/1 [00:08<00:00,  8.00s/it]


Epoch 350/1000 average loss: 0.0053
Saved model to transformer_epoch350.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.78s/it]


Epoch 351/1000 average loss: 0.0071
Saved model to transformer_epoch351.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.72s/it]


Epoch 352/1000 average loss: 0.0050
Saved model to transformer_epoch352.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.88s/it]


Epoch 353/1000 average loss: 0.0083
Saved model to transformer_epoch353.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.54s/it]


Epoch 354/1000 average loss: 0.0064
Saved model to transformer_epoch354.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.82s/it]


Epoch 355/1000 average loss: 0.0064
Saved model to transformer_epoch355.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.50s/it]


Epoch 356/1000 average loss: 0.0176
Saved model to transformer_epoch356.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.76s/it]


Epoch 357/1000 average loss: 0.0051
Saved model to transformer_epoch357.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.46s/it]


Epoch 358/1000 average loss: 0.0054
Saved model to transformer_epoch358.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.78s/it]


Epoch 359/1000 average loss: 0.0050
Saved model to transformer_epoch359.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.41s/it]


Epoch 360/1000 average loss: 0.0043
Saved model to transformer_epoch360.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.76s/it]


Epoch 361/1000 average loss: 0.0088
Saved model to transformer_epoch361.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.45s/it]


Epoch 362/1000 average loss: 0.0068
Saved model to transformer_epoch362.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.70s/it]


Epoch 363/1000 average loss: 0.0120
Saved model to transformer_epoch363.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.45s/it]


Epoch 364/1000 average loss: 0.0077
Saved model to transformer_epoch364.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.70s/it]


Epoch 365/1000 average loss: 0.0063
Saved model to transformer_epoch365.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.22s/it]


Epoch 366/1000 average loss: 0.0059
Saved model to transformer_epoch366.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.87s/it]


Epoch 367/1000 average loss: 0.0074
Saved model to transformer_epoch367.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.14s/it]


Epoch 368/1000 average loss: 0.0049
Saved model to transformer_epoch368.pth


Training batches: 100%|██████████| 1/1 [00:08<00:00,  8.64s/it]


Epoch 369/1000 average loss: 0.0143
Saved model to transformer_epoch369.pth


Training batches: 100%|██████████| 1/1 [00:09<00:00,  9.69s/it]


Epoch 370/1000 average loss: 0.0052
Saved model to transformer_epoch370.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.78s/it]


Epoch 371/1000 average loss: 0.0125
Saved model to transformer_epoch371.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.70s/it]


Epoch 372/1000 average loss: 0.0053
Saved model to transformer_epoch372.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.71s/it]


Epoch 373/1000 average loss: 0.0044
Saved model to transformer_epoch373.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.98s/it]


Epoch 374/1000 average loss: 0.0063
Saved model to transformer_epoch374.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.81s/it]


Epoch 375/1000 average loss: 0.0047
Saved model to transformer_epoch375.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.85s/it]


Epoch 376/1000 average loss: 0.0051
Saved model to transformer_epoch376.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.79s/it]


Epoch 377/1000 average loss: 0.0044
Saved model to transformer_epoch377.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.96s/it]


Epoch 378/1000 average loss: 0.0050
Saved model to transformer_epoch378.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.94s/it]


Epoch 379/1000 average loss: 0.0048
Saved model to transformer_epoch379.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.96s/it]


Epoch 380/1000 average loss: 0.0042
Saved model to transformer_epoch380.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.76s/it]


Epoch 381/1000 average loss: 0.0229
Saved model to transformer_epoch381.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.93s/it]


Epoch 382/1000 average loss: 0.0047
Saved model to transformer_epoch382.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.83s/it]


Epoch 383/1000 average loss: 0.0050
Saved model to transformer_epoch383.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.74s/it]


Epoch 384/1000 average loss: 0.0041
Saved model to transformer_epoch384.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.77s/it]


Epoch 385/1000 average loss: 0.0083
Saved model to transformer_epoch385.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.60s/it]


Epoch 386/1000 average loss: 0.0041
Saved model to transformer_epoch386.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.86s/it]


Epoch 387/1000 average loss: 0.0048
Saved model to transformer_epoch387.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.77s/it]


Epoch 388/1000 average loss: 0.0047
Saved model to transformer_epoch388.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.82s/it]


Epoch 389/1000 average loss: 0.0145
Saved model to transformer_epoch389.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.72s/it]


Epoch 390/1000 average loss: 0.0038
Saved model to transformer_epoch390.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.85s/it]


Epoch 391/1000 average loss: 0.0041
Saved model to transformer_epoch391.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.68s/it]


Epoch 392/1000 average loss: 0.0033
Saved model to transformer_epoch392.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.76s/it]


Epoch 393/1000 average loss: 0.0043
Saved model to transformer_epoch393.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.65s/it]


Epoch 394/1000 average loss: 0.0034
Saved model to transformer_epoch394.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.77s/it]


Epoch 395/1000 average loss: 0.0036
Saved model to transformer_epoch395.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.66s/it]


Epoch 396/1000 average loss: 0.0062
Saved model to transformer_epoch396.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.77s/it]


Epoch 397/1000 average loss: 0.0046
Saved model to transformer_epoch397.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.75s/it]


Epoch 398/1000 average loss: 0.0033
Saved model to transformer_epoch398.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.81s/it]


Epoch 399/1000 average loss: 0.0039
Saved model to transformer_epoch399.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.86s/it]


Epoch 400/1000 average loss: 0.0038
Saved model to transformer_epoch400.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.97s/it]


Epoch 401/1000 average loss: 0.0037
Saved model to transformer_epoch401.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.64s/it]


Epoch 402/1000 average loss: 0.0059
Saved model to transformer_epoch402.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.75s/it]


Epoch 403/1000 average loss: 0.0040
Saved model to transformer_epoch403.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.71s/it]


Epoch 404/1000 average loss: 0.0033
Saved model to transformer_epoch404.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.84s/it]


Epoch 405/1000 average loss: 0.0033
Saved model to transformer_epoch405.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.69s/it]


Epoch 406/1000 average loss: 0.0033
Saved model to transformer_epoch406.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.71s/it]


Epoch 407/1000 average loss: 0.0034
Saved model to transformer_epoch407.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.76s/it]


Epoch 408/1000 average loss: 0.0036
Saved model to transformer_epoch408.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.63s/it]


Epoch 409/1000 average loss: 0.0133
Saved model to transformer_epoch409.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.71s/it]


Epoch 410/1000 average loss: 0.0036
Saved model to transformer_epoch410.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.79s/it]


Epoch 411/1000 average loss: 0.0038
Saved model to transformer_epoch411.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.54s/it]


Epoch 412/1000 average loss: 0.0059
Saved model to transformer_epoch412.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.66s/it]


Epoch 413/1000 average loss: 0.0202
Saved model to transformer_epoch413.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.58s/it]


Epoch 414/1000 average loss: 0.0030
Saved model to transformer_epoch414.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.70s/it]


Epoch 415/1000 average loss: 0.0057
Saved model to transformer_epoch415.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.69s/it]


Epoch 416/1000 average loss: 0.0031
Saved model to transformer_epoch416.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.74s/it]


Epoch 417/1000 average loss: 0.0037
Saved model to transformer_epoch417.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.67s/it]


Epoch 418/1000 average loss: 0.0038
Saved model to transformer_epoch418.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.68s/it]


Epoch 419/1000 average loss: 0.0034
Saved model to transformer_epoch419.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.62s/it]


Epoch 420/1000 average loss: 0.0031
Saved model to transformer_epoch420.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.72s/it]


Epoch 421/1000 average loss: 0.0032
Saved model to transformer_epoch421.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.53s/it]


Epoch 422/1000 average loss: 0.0033
Saved model to transformer_epoch422.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.44s/it]


Epoch 423/1000 average loss: 0.0036
Saved model to transformer_epoch423.pth


Training batches: 100%|██████████| 1/1 [00:13<00:00, 13.74s/it]


Epoch 424/1000 average loss: 0.0031
Saved model to transformer_epoch424.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.65s/it]


Epoch 425/1000 average loss: 0.0033
Saved model to transformer_epoch425.pth


Training batches: 100%|██████████| 1/1 [00:07<00:00,  7.18s/it]


Epoch 426/1000 average loss: 0.0032
Saved model to transformer_epoch426.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.89s/it]


Epoch 427/1000 average loss: 0.0031
Saved model to transformer_epoch427.pth


Training batches: 100%|██████████| 1/1 [00:07<00:00,  7.08s/it]


Epoch 428/1000 average loss: 0.0031
Saved model to transformer_epoch428.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.90s/it]


Epoch 429/1000 average loss: 0.0075
Saved model to transformer_epoch429.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.87s/it]


Epoch 430/1000 average loss: 0.0030
Saved model to transformer_epoch430.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.30s/it]


Epoch 431/1000 average loss: 0.0045
Saved model to transformer_epoch431.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.19s/it]


Epoch 432/1000 average loss: 0.0026
Saved model to transformer_epoch432.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.68s/it]


Epoch 433/1000 average loss: 0.0026
Saved model to transformer_epoch433.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.17s/it]


Epoch 434/1000 average loss: 0.0080
Saved model to transformer_epoch434.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.45s/it]


Epoch 435/1000 average loss: 0.0028
Saved model to transformer_epoch435.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.02s/it]


Epoch 436/1000 average loss: 0.0063
Saved model to transformer_epoch436.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.59s/it]


Epoch 437/1000 average loss: 0.0043
Saved model to transformer_epoch437.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.08s/it]


Epoch 438/1000 average loss: 0.0044
Saved model to transformer_epoch438.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.89s/it]


Epoch 439/1000 average loss: 0.0039
Saved model to transformer_epoch439.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.99s/it]


Epoch 440/1000 average loss: 0.0041
Saved model to transformer_epoch440.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.95s/it]


Epoch 441/1000 average loss: 0.0026
Saved model to transformer_epoch441.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.85s/it]


Epoch 442/1000 average loss: 0.0034
Saved model to transformer_epoch442.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.59s/it]


Epoch 443/1000 average loss: 0.0028
Saved model to transformer_epoch443.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.70s/it]


Epoch 444/1000 average loss: 0.0034
Saved model to transformer_epoch444.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.55s/it]


Epoch 445/1000 average loss: 0.0032
Saved model to transformer_epoch445.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.69s/it]


Epoch 446/1000 average loss: 0.0031
Saved model to transformer_epoch446.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.45s/it]


Epoch 447/1000 average loss: 0.0033
Saved model to transformer_epoch447.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.70s/it]


Epoch 448/1000 average loss: 0.0032
Saved model to transformer_epoch448.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.41s/it]


Epoch 449/1000 average loss: 0.0032
Saved model to transformer_epoch449.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.71s/it]


Epoch 450/1000 average loss: 0.0040
Saved model to transformer_epoch450.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.38s/it]


Epoch 451/1000 average loss: 0.0030
Saved model to transformer_epoch451.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.81s/it]


Epoch 452/1000 average loss: 0.0029
Saved model to transformer_epoch452.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.22s/it]


Epoch 453/1000 average loss: 0.0033
Saved model to transformer_epoch453.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.05s/it]


Epoch 454/1000 average loss: 0.0026
Saved model to transformer_epoch454.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.10s/it]


Epoch 455/1000 average loss: 0.0026
Saved model to transformer_epoch455.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.14s/it]


Epoch 456/1000 average loss: 0.0027
Saved model to transformer_epoch456.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.16s/it]


Epoch 457/1000 average loss: 0.0028
Saved model to transformer_epoch457.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.17s/it]


Epoch 458/1000 average loss: 0.0029
Saved model to transformer_epoch458.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.01s/it]


Epoch 459/1000 average loss: 0.0028
Saved model to transformer_epoch459.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.26s/it]


Epoch 460/1000 average loss: 0.0024
Saved model to transformer_epoch460.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.96s/it]


Epoch 461/1000 average loss: 0.0027
Saved model to transformer_epoch461.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.33s/it]


Epoch 462/1000 average loss: 0.0029
Saved model to transformer_epoch462.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.25s/it]


Epoch 463/1000 average loss: 0.0026
Saved model to transformer_epoch463.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.35s/it]


Epoch 464/1000 average loss: 0.0024
Saved model to transformer_epoch464.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.97s/it]


Epoch 465/1000 average loss: 0.0032
Saved model to transformer_epoch465.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.40s/it]


Epoch 466/1000 average loss: 0.0023
Saved model to transformer_epoch466.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.78s/it]


Epoch 467/1000 average loss: 0.0022
Saved model to transformer_epoch467.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.40s/it]


Epoch 468/1000 average loss: 0.0022
Saved model to transformer_epoch468.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.74s/it]


Epoch 469/1000 average loss: 0.0023
Saved model to transformer_epoch469.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.72s/it]


Epoch 470/1000 average loss: 0.0023
Saved model to transformer_epoch470.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.80s/it]


Epoch 471/1000 average loss: 0.0221
Saved model to transformer_epoch471.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.80s/it]


Epoch 472/1000 average loss: 0.0023
Saved model to transformer_epoch472.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.86s/it]


Epoch 473/1000 average loss: 0.0061
Saved model to transformer_epoch473.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.80s/it]


Epoch 474/1000 average loss: 0.0024
Saved model to transformer_epoch474.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.70s/it]


Epoch 475/1000 average loss: 0.0052
Saved model to transformer_epoch475.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.88s/it]


Epoch 476/1000 average loss: 0.0033
Saved model to transformer_epoch476.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.78s/it]


Epoch 477/1000 average loss: 0.0027
Saved model to transformer_epoch477.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.70s/it]


Epoch 478/1000 average loss: 0.0105
Saved model to transformer_epoch478.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.77s/it]


Epoch 479/1000 average loss: 0.0150
Saved model to transformer_epoch479.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.77s/it]


Epoch 480/1000 average loss: 0.0023
Saved model to transformer_epoch480.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.75s/it]


Epoch 481/1000 average loss: 0.0023
Saved model to transformer_epoch481.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.74s/it]


Epoch 482/1000 average loss: 0.0044
Saved model to transformer_epoch482.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.72s/it]


Epoch 483/1000 average loss: 0.0025
Saved model to transformer_epoch483.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.68s/it]


Epoch 484/1000 average loss: 0.0081
Saved model to transformer_epoch484.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.69s/it]


Epoch 485/1000 average loss: 0.0043
Saved model to transformer_epoch485.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.71s/it]


Epoch 486/1000 average loss: 0.0051
Saved model to transformer_epoch486.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.70s/it]


Epoch 487/1000 average loss: 0.0025
Saved model to transformer_epoch487.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.66s/it]


Epoch 488/1000 average loss: 0.0030
Saved model to transformer_epoch488.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.73s/it]


Epoch 489/1000 average loss: 0.0031
Saved model to transformer_epoch489.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.73s/it]


Epoch 490/1000 average loss: 0.0050
Saved model to transformer_epoch490.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.71s/it]


Epoch 491/1000 average loss: 0.0045
Saved model to transformer_epoch491.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.74s/it]


Epoch 492/1000 average loss: 0.0053
Saved model to transformer_epoch492.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.83s/it]


Epoch 493/1000 average loss: 0.0034
Saved model to transformer_epoch493.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.83s/it]


Epoch 494/1000 average loss: 0.0032
Saved model to transformer_epoch494.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.70s/it]


Epoch 495/1000 average loss: 0.0034
Saved model to transformer_epoch495.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.75s/it]


Epoch 496/1000 average loss: 0.0040
Saved model to transformer_epoch496.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.80s/it]


Epoch 497/1000 average loss: 0.0025
Saved model to transformer_epoch497.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.68s/it]


Epoch 498/1000 average loss: 0.0032
Saved model to transformer_epoch498.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.75s/it]


Epoch 499/1000 average loss: 0.0027
Saved model to transformer_epoch499.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.79s/it]


Epoch 500/1000 average loss: 0.0100
Saved model to transformer_epoch500.pth


Training batches: 100%|██████████| 1/1 [00:09<00:00,  9.63s/it]


Epoch 501/1000 average loss: 0.0034
Saved model to transformer_epoch501.pth


Training batches: 100%|██████████| 1/1 [00:09<00:00,  9.14s/it]


Epoch 502/1000 average loss: 0.0029
Saved model to transformer_epoch502.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.96s/it]


Epoch 503/1000 average loss: 0.0026
Saved model to transformer_epoch503.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.54s/it]


Epoch 504/1000 average loss: 0.0028
Saved model to transformer_epoch504.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.99s/it]


Epoch 505/1000 average loss: 0.0090
Saved model to transformer_epoch505.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.42s/it]


Epoch 506/1000 average loss: 0.0026
Saved model to transformer_epoch506.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.11s/it]


Epoch 507/1000 average loss: 0.0030
Saved model to transformer_epoch507.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.44s/it]


Epoch 508/1000 average loss: 0.0051
Saved model to transformer_epoch508.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.16s/it]


Epoch 509/1000 average loss: 0.0030
Saved model to transformer_epoch509.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.38s/it]


Epoch 510/1000 average loss: 0.0025
Saved model to transformer_epoch510.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.06s/it]


Epoch 511/1000 average loss: 0.0104
Saved model to transformer_epoch511.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.40s/it]


Epoch 512/1000 average loss: 0.0057
Saved model to transformer_epoch512.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.06s/it]


Epoch 513/1000 average loss: 0.0286
Saved model to transformer_epoch513.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.33s/it]


Epoch 514/1000 average loss: 0.0026
Saved model to transformer_epoch514.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.94s/it]


Epoch 515/1000 average loss: 0.0026
Saved model to transformer_epoch515.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.45s/it]


Epoch 516/1000 average loss: 0.0125
Saved model to transformer_epoch516.pth


Training batches: 100%|██████████| 1/1 [00:14<00:00, 14.65s/it]


Epoch 517/1000 average loss: 0.0283
Saved model to transformer_epoch517.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.80s/it]


Epoch 518/1000 average loss: 0.0098
Saved model to transformer_epoch518.pth


Training batches: 100%|██████████| 1/1 [00:07<00:00,  7.06s/it]


Epoch 519/1000 average loss: 0.0038
Saved model to transformer_epoch519.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.09s/it]


Epoch 520/1000 average loss: 0.0029
Saved model to transformer_epoch520.pth


Training batches: 100%|██████████| 1/1 [00:07<00:00,  7.00s/it]


Epoch 521/1000 average loss: 0.0040
Saved model to transformer_epoch521.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.74s/it]


Epoch 522/1000 average loss: 0.0113
Saved model to transformer_epoch522.pth


Training batches: 100%|██████████| 1/1 [00:07<00:00,  7.03s/it]


Epoch 523/1000 average loss: 0.0041
Saved model to transformer_epoch523.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.84s/it]


Epoch 524/1000 average loss: 0.0032
Saved model to transformer_epoch524.pth


Training batches: 100%|██████████| 1/1 [00:07<00:00,  7.21s/it]


Epoch 525/1000 average loss: 0.0032
Saved model to transformer_epoch525.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.09s/it]


Epoch 526/1000 average loss: 0.0048
Saved model to transformer_epoch526.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.82s/it]


Epoch 527/1000 average loss: 0.0046
Saved model to transformer_epoch527.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.17s/it]


Epoch 528/1000 average loss: 0.0030
Saved model to transformer_epoch528.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.37s/it]


Epoch 529/1000 average loss: 0.0030
Saved model to transformer_epoch529.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.86s/it]


Epoch 530/1000 average loss: 0.0029
Saved model to transformer_epoch530.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.43s/it]


Epoch 531/1000 average loss: 0.0036
Saved model to transformer_epoch531.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.28s/it]


Epoch 532/1000 average loss: 0.0037
Saved model to transformer_epoch532.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.53s/it]


Epoch 533/1000 average loss: 0.0053
Saved model to transformer_epoch533.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.37s/it]


Epoch 534/1000 average loss: 0.0032
Saved model to transformer_epoch534.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.70s/it]


Epoch 535/1000 average loss: 0.0034
Saved model to transformer_epoch535.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.22s/it]


Epoch 536/1000 average loss: 0.0028
Saved model to transformer_epoch536.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.37s/it]


Epoch 537/1000 average loss: 0.0047
Saved model to transformer_epoch537.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.99s/it]


Epoch 538/1000 average loss: 0.0032
Saved model to transformer_epoch538.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.19s/it]


Epoch 539/1000 average loss: 0.0046
Saved model to transformer_epoch539.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.97s/it]


Epoch 540/1000 average loss: 0.0026
Saved model to transformer_epoch540.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.27s/it]


Epoch 541/1000 average loss: 0.0028
Saved model to transformer_epoch541.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.87s/it]


Epoch 542/1000 average loss: 0.0068
Saved model to transformer_epoch542.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.47s/it]


Epoch 543/1000 average loss: 0.0020
Saved model to transformer_epoch543.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.77s/it]


Epoch 544/1000 average loss: 0.0023
Saved model to transformer_epoch544.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.69s/it]


Epoch 545/1000 average loss: 0.0022
Saved model to transformer_epoch545.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.91s/it]


Epoch 546/1000 average loss: 0.0146
Saved model to transformer_epoch546.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.59s/it]


Epoch 547/1000 average loss: 0.0084
Saved model to transformer_epoch547.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.74s/it]


Epoch 548/1000 average loss: 0.0024
Saved model to transformer_epoch548.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.54s/it]


Epoch 549/1000 average loss: 0.0074
Saved model to transformer_epoch549.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.77s/it]


Epoch 550/1000 average loss: 0.0265
Saved model to transformer_epoch550.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.73s/it]


Epoch 551/1000 average loss: 0.0062
Saved model to transformer_epoch551.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.84s/it]


Epoch 552/1000 average loss: 0.0032
Saved model to transformer_epoch552.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.69s/it]


Epoch 553/1000 average loss: 0.0029
Saved model to transformer_epoch553.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.82s/it]


Epoch 554/1000 average loss: 0.0037
Saved model to transformer_epoch554.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.71s/it]


Epoch 555/1000 average loss: 0.0036
Saved model to transformer_epoch555.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.75s/it]


Epoch 556/1000 average loss: 0.0057
Saved model to transformer_epoch556.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.65s/it]


Epoch 557/1000 average loss: 0.0133
Saved model to transformer_epoch557.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.72s/it]


Epoch 558/1000 average loss: 0.0033
Saved model to transformer_epoch558.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.67s/it]


Epoch 559/1000 average loss: 0.0033
Saved model to transformer_epoch559.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.74s/it]


Epoch 560/1000 average loss: 0.0037
Saved model to transformer_epoch560.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.68s/it]


Epoch 561/1000 average loss: 0.0034
Saved model to transformer_epoch561.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.76s/it]


Epoch 562/1000 average loss: 0.0035
Saved model to transformer_epoch562.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.72s/it]


Epoch 563/1000 average loss: 0.0047
Saved model to transformer_epoch563.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.79s/it]


Epoch 564/1000 average loss: 0.0037
Saved model to transformer_epoch564.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.68s/it]


Epoch 565/1000 average loss: 0.0032
Saved model to transformer_epoch565.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.81s/it]


Epoch 566/1000 average loss: 0.0025
Saved model to transformer_epoch566.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.70s/it]


Epoch 567/1000 average loss: 0.0027
Saved model to transformer_epoch567.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.75s/it]


Epoch 568/1000 average loss: 0.0030
Saved model to transformer_epoch568.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.57s/it]


Epoch 569/1000 average loss: 0.0026
Saved model to transformer_epoch569.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.76s/it]


Epoch 570/1000 average loss: 0.0026
Saved model to transformer_epoch570.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.55s/it]


Epoch 571/1000 average loss: 0.0023
Saved model to transformer_epoch571.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.78s/it]


Epoch 572/1000 average loss: 0.0021
Saved model to transformer_epoch572.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.66s/it]


Epoch 573/1000 average loss: 0.0020
Saved model to transformer_epoch573.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.77s/it]


Epoch 574/1000 average loss: 0.0018
Saved model to transformer_epoch574.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.61s/it]


Epoch 575/1000 average loss: 0.0020
Saved model to transformer_epoch575.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.78s/it]


Epoch 576/1000 average loss: 0.0020
Saved model to transformer_epoch576.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.57s/it]


Epoch 577/1000 average loss: 0.0022
Saved model to transformer_epoch577.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.80s/it]


Epoch 578/1000 average loss: 0.0019
Saved model to transformer_epoch578.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.59s/it]


Epoch 579/1000 average loss: 0.0018
Saved model to transformer_epoch579.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.75s/it]


Epoch 580/1000 average loss: 0.0018
Saved model to transformer_epoch580.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.47s/it]


Epoch 581/1000 average loss: 0.0018
Saved model to transformer_epoch581.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.66s/it]


Epoch 582/1000 average loss: 0.0018
Saved model to transformer_epoch582.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.44s/it]


Epoch 583/1000 average loss: 0.0026
Saved model to transformer_epoch583.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.92s/it]


Epoch 584/1000 average loss: 0.0018
Saved model to transformer_epoch584.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.50s/it]


Epoch 585/1000 average loss: 0.0021
Saved model to transformer_epoch585.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.75s/it]


Epoch 586/1000 average loss: 0.0054
Saved model to transformer_epoch586.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.39s/it]


Epoch 587/1000 average loss: 0.0016
Saved model to transformer_epoch587.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.74s/it]


Epoch 588/1000 average loss: 0.0016
Saved model to transformer_epoch588.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.09s/it]


Epoch 589/1000 average loss: 0.0019
Saved model to transformer_epoch589.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.92s/it]


Epoch 590/1000 average loss: 0.0018
Saved model to transformer_epoch590.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.92s/it]


Epoch 591/1000 average loss: 0.0042
Saved model to transformer_epoch591.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.15s/it]


Epoch 592/1000 average loss: 0.0018
Saved model to transformer_epoch592.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.78s/it]


Epoch 593/1000 average loss: 0.0387
Saved model to transformer_epoch593.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.39s/it]


Epoch 594/1000 average loss: 0.0016
Saved model to transformer_epoch594.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.79s/it]


Epoch 595/1000 average loss: 0.0017
Saved model to transformer_epoch595.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.39s/it]


Epoch 596/1000 average loss: 0.0020
Saved model to transformer_epoch596.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.82s/it]


Epoch 597/1000 average loss: 0.0043
Saved model to transformer_epoch597.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.52s/it]


Epoch 598/1000 average loss: 0.0026
Saved model to transformer_epoch598.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.73s/it]


Epoch 599/1000 average loss: 0.0049
Saved model to transformer_epoch599.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.70s/it]


Epoch 600/1000 average loss: 0.0036
Saved model to transformer_epoch600.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.77s/it]


Epoch 601/1000 average loss: 0.0060
Saved model to transformer_epoch601.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.72s/it]


Epoch 602/1000 average loss: 0.0030
Saved model to transformer_epoch602.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.77s/it]


Epoch 603/1000 average loss: 0.0024
Saved model to transformer_epoch603.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.67s/it]


Epoch 604/1000 average loss: 0.0086
Saved model to transformer_epoch604.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.76s/it]


Epoch 605/1000 average loss: 0.0038
Saved model to transformer_epoch605.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.72s/it]


Epoch 606/1000 average loss: 0.0029
Saved model to transformer_epoch606.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.80s/it]


Epoch 607/1000 average loss: 0.0067
Saved model to transformer_epoch607.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.66s/it]


Epoch 608/1000 average loss: 0.0064
Saved model to transformer_epoch608.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.83s/it]


Epoch 609/1000 average loss: 0.0044
Saved model to transformer_epoch609.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.72s/it]


Epoch 610/1000 average loss: 0.0031
Saved model to transformer_epoch610.pth


Training batches: 100%|██████████| 1/1 [00:11<00:00, 11.51s/it]


Epoch 611/1000 average loss: 0.0045
Saved model to transformer_epoch611.pth


Training batches: 100%|██████████| 1/1 [00:08<00:00,  8.46s/it]


Epoch 612/1000 average loss: 0.0023
Saved model to transformer_epoch612.pth


Training batches: 100%|██████████| 1/1 [00:07<00:00,  7.12s/it]


Epoch 613/1000 average loss: 0.0028
Saved model to transformer_epoch613.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.81s/it]


Epoch 614/1000 average loss: 0.0027
Saved model to transformer_epoch614.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.43s/it]


Epoch 615/1000 average loss: 0.0038
Saved model to transformer_epoch615.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.88s/it]


Epoch 616/1000 average loss: 0.0035
Saved model to transformer_epoch616.pth


Training batches: 100%|██████████| 1/1 [00:07<00:00,  7.16s/it]


Epoch 617/1000 average loss: 0.0041
Saved model to transformer_epoch617.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.06s/it]


Epoch 618/1000 average loss: 0.0030
Saved model to transformer_epoch618.pth


Training batches: 100%|██████████| 1/1 [00:07<00:00,  7.14s/it]


Epoch 619/1000 average loss: 0.0028
Saved model to transformer_epoch619.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.98s/it]


Epoch 620/1000 average loss: 0.0022
Saved model to transformer_epoch620.pth


Training batches: 100%|██████████| 1/1 [00:07<00:00,  7.24s/it]


Epoch 621/1000 average loss: 0.0021
Saved model to transformer_epoch621.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.86s/it]


Epoch 622/1000 average loss: 0.0019
Saved model to transformer_epoch622.pth


Training batches: 100%|██████████| 1/1 [00:07<00:00,  7.17s/it]


Epoch 623/1000 average loss: 0.0019
Saved model to transformer_epoch623.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.92s/it]


Epoch 624/1000 average loss: 0.0019
Saved model to transformer_epoch624.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.82s/it]


Epoch 625/1000 average loss: 0.0019
Saved model to transformer_epoch625.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.76s/it]


Epoch 626/1000 average loss: 0.0018
Saved model to transformer_epoch626.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.64s/it]


Epoch 627/1000 average loss: 0.0024
Saved model to transformer_epoch627.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.72s/it]


Epoch 628/1000 average loss: 0.0017
Saved model to transformer_epoch628.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.67s/it]


Epoch 629/1000 average loss: 0.0017
Saved model to transformer_epoch629.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.77s/it]


Epoch 630/1000 average loss: 0.0016
Saved model to transformer_epoch630.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.60s/it]


Epoch 631/1000 average loss: 0.0021
Saved model to transformer_epoch631.pth


Training batches: 100%|██████████| 1/1 [00:12<00:00, 12.88s/it]


Epoch 632/1000 average loss: 0.0015
Saved model to transformer_epoch632.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.05s/it]


Epoch 633/1000 average loss: 0.0015
Saved model to transformer_epoch633.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.94s/it]


Epoch 634/1000 average loss: 0.0016
Saved model to transformer_epoch634.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.24s/it]


Epoch 635/1000 average loss: 0.0018
Saved model to transformer_epoch635.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.88s/it]


Epoch 636/1000 average loss: 0.0016
Saved model to transformer_epoch636.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.29s/it]


Epoch 637/1000 average loss: 0.0016
Saved model to transformer_epoch637.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.88s/it]


Epoch 638/1000 average loss: 0.0014
Saved model to transformer_epoch638.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.36s/it]


Epoch 639/1000 average loss: 0.0021
Saved model to transformer_epoch639.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.80s/it]


Epoch 640/1000 average loss: 0.0018
Saved model to transformer_epoch640.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.52s/it]


Epoch 641/1000 average loss: 0.0017
Saved model to transformer_epoch641.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.78s/it]


Epoch 642/1000 average loss: 0.0014
Saved model to transformer_epoch642.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.66s/it]


Epoch 643/1000 average loss: 0.0013
Saved model to transformer_epoch643.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.78s/it]


Epoch 644/1000 average loss: 0.0015
Saved model to transformer_epoch644.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.71s/it]


Epoch 645/1000 average loss: 0.0013
Saved model to transformer_epoch645.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.81s/it]


Epoch 646/1000 average loss: 0.0013
Saved model to transformer_epoch646.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.73s/it]


Epoch 647/1000 average loss: 0.0015
Saved model to transformer_epoch647.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.94s/it]


Epoch 648/1000 average loss: 0.0013
Saved model to transformer_epoch648.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.74s/it]


Epoch 649/1000 average loss: 0.0014
Saved model to transformer_epoch649.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.91s/it]


Epoch 650/1000 average loss: 0.0013
Saved model to transformer_epoch650.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.71s/it]


Epoch 651/1000 average loss: 0.0014
Saved model to transformer_epoch651.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.86s/it]


Epoch 652/1000 average loss: 0.0013
Saved model to transformer_epoch652.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.67s/it]


Epoch 653/1000 average loss: 0.0037
Saved model to transformer_epoch653.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.96s/it]


Epoch 654/1000 average loss: 0.0016
Saved model to transformer_epoch654.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.69s/it]


Epoch 655/1000 average loss: 0.0012
Saved model to transformer_epoch655.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.06s/it]


Epoch 656/1000 average loss: 0.0014
Saved model to transformer_epoch656.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.77s/it]


Epoch 657/1000 average loss: 0.0012
Saved model to transformer_epoch657.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.80s/it]


Epoch 658/1000 average loss: 0.0013
Saved model to transformer_epoch658.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.79s/it]


Epoch 659/1000 average loss: 0.0012
Saved model to transformer_epoch659.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.72s/it]


Epoch 660/1000 average loss: 0.0011
Saved model to transformer_epoch660.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.82s/it]


Epoch 661/1000 average loss: 0.0012
Saved model to transformer_epoch661.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.69s/it]


Epoch 662/1000 average loss: 0.0012
Saved model to transformer_epoch662.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.82s/it]


Epoch 663/1000 average loss: 0.0014
Saved model to transformer_epoch663.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.76s/it]


Epoch 664/1000 average loss: 0.0038
Saved model to transformer_epoch664.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.76s/it]


Epoch 665/1000 average loss: 0.0016
Saved model to transformer_epoch665.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.63s/it]


Epoch 666/1000 average loss: 0.0035
Saved model to transformer_epoch666.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.80s/it]


Epoch 667/1000 average loss: 0.0012
Saved model to transformer_epoch667.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.75s/it]


Epoch 668/1000 average loss: 0.0013
Saved model to transformer_epoch668.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.74s/it]


Epoch 669/1000 average loss: 0.0013
Saved model to transformer_epoch669.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.79s/it]


Epoch 670/1000 average loss: 0.0014
Saved model to transformer_epoch670.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.71s/it]


Epoch 671/1000 average loss: 0.0047
Saved model to transformer_epoch671.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.76s/it]


Epoch 672/1000 average loss: 0.0015
Saved model to transformer_epoch672.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.73s/it]


Epoch 673/1000 average loss: 0.0028
Saved model to transformer_epoch673.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.78s/it]


Epoch 674/1000 average loss: 0.0018
Saved model to transformer_epoch674.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.86s/it]


Epoch 675/1000 average loss: 0.0025
Saved model to transformer_epoch675.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.90s/it]


Epoch 676/1000 average loss: 0.0014
Saved model to transformer_epoch676.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.82s/it]


Epoch 677/1000 average loss: 0.0018
Saved model to transformer_epoch677.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.77s/it]


Epoch 678/1000 average loss: 0.0014
Saved model to transformer_epoch678.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.79s/it]


Epoch 679/1000 average loss: 0.0015
Saved model to transformer_epoch679.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.71s/it]


Epoch 680/1000 average loss: 0.0015
Saved model to transformer_epoch680.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.83s/it]


Epoch 681/1000 average loss: 0.0016
Saved model to transformer_epoch681.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.80s/it]


Epoch 682/1000 average loss: 0.0017
Saved model to transformer_epoch682.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.86s/it]


Epoch 683/1000 average loss: 0.0017
Saved model to transformer_epoch683.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.86s/it]


Epoch 684/1000 average loss: 0.0015
Saved model to transformer_epoch684.pth


Training batches: 100%|██████████| 1/1 [00:07<00:00,  7.02s/it]


Epoch 685/1000 average loss: 0.0016
Saved model to transformer_epoch685.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.91s/it]


Epoch 686/1000 average loss: 0.0015
Saved model to transformer_epoch686.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.95s/it]


Epoch 687/1000 average loss: 0.0014
Saved model to transformer_epoch687.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.98s/it]


Epoch 688/1000 average loss: 0.0013
Saved model to transformer_epoch688.pth


Training batches: 100%|██████████| 1/1 [00:07<00:00,  7.36s/it]


Epoch 689/1000 average loss: 0.0018
Saved model to transformer_epoch689.pth


Training batches: 100%|██████████| 1/1 [00:07<00:00,  7.08s/it]


Epoch 690/1000 average loss: 0.0014
Saved model to transformer_epoch690.pth


Training batches: 100%|██████████| 1/1 [00:07<00:00,  7.03s/it]


Epoch 691/1000 average loss: 0.0014
Saved model to transformer_epoch691.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.62s/it]


Epoch 692/1000 average loss: 0.0013
Saved model to transformer_epoch692.pth


Training batches: 100%|██████████| 1/1 [00:07<00:00,  7.65s/it]


Epoch 693/1000 average loss: 0.0013
Saved model to transformer_epoch693.pth


Training batches: 100%|██████████| 1/1 [00:08<00:00,  8.02s/it]


Epoch 694/1000 average loss: 0.0013
Saved model to transformer_epoch694.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.27s/it]


Epoch 695/1000 average loss: 0.0013
Saved model to transformer_epoch695.pth


Training batches: 100%|██████████| 1/1 [00:07<00:00,  7.16s/it]


Epoch 696/1000 average loss: 0.0012
Saved model to transformer_epoch696.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.29s/it]


Epoch 697/1000 average loss: 0.0014
Saved model to transformer_epoch697.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.96s/it]


Epoch 698/1000 average loss: 0.0015
Saved model to transformer_epoch698.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.15s/it]


Epoch 699/1000 average loss: 0.0011
Saved model to transformer_epoch699.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.81s/it]


Epoch 700/1000 average loss: 0.0013
Saved model to transformer_epoch700.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.87s/it]


Epoch 701/1000 average loss: 0.0012
Saved model to transformer_epoch701.pth


Training batches: 100%|██████████| 1/1 [00:07<00:00,  7.32s/it]


Epoch 702/1000 average loss: 0.0011
Saved model to transformer_epoch702.pth


Training batches: 100%|██████████| 1/1 [00:13<00:00, 13.95s/it]


Epoch 703/1000 average loss: 0.0011
Saved model to transformer_epoch703.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.73s/it]


Epoch 704/1000 average loss: 0.0010
Saved model to transformer_epoch704.pth


Training batches: 100%|██████████| 1/1 [00:07<00:00,  7.25s/it]


Epoch 705/1000 average loss: 0.0010
Saved model to transformer_epoch705.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.90s/it]


Epoch 706/1000 average loss: 0.0010
Saved model to transformer_epoch706.pth


Training batches: 100%|██████████| 1/1 [00:07<00:00,  7.14s/it]


Epoch 707/1000 average loss: 0.0011
Saved model to transformer_epoch707.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.04s/it]


Epoch 708/1000 average loss: 0.0011
Saved model to transformer_epoch708.pth


Training batches: 100%|██████████| 1/1 [00:07<00:00,  7.16s/it]


Epoch 709/1000 average loss: 0.0022
Saved model to transformer_epoch709.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.13s/it]


Epoch 710/1000 average loss: 0.0012
Saved model to transformer_epoch710.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.87s/it]


Epoch 711/1000 average loss: 0.0010
Saved model to transformer_epoch711.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.17s/it]


Epoch 712/1000 average loss: 0.0010
Saved model to transformer_epoch712.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.97s/it]


Epoch 713/1000 average loss: 0.0092
Saved model to transformer_epoch713.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.94s/it]


Epoch 714/1000 average loss: 0.0010
Saved model to transformer_epoch714.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.75s/it]


Epoch 715/1000 average loss: 0.0011
Saved model to transformer_epoch715.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.87s/it]


Epoch 716/1000 average loss: 0.0011
Saved model to transformer_epoch716.pth


Training batches: 100%|██████████| 1/1 [00:07<00:00,  7.12s/it]


Epoch 717/1000 average loss: 0.0013
Saved model to transformer_epoch717.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.41s/it]


Epoch 718/1000 average loss: 0.0016
Saved model to transformer_epoch718.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.75s/it]


Epoch 719/1000 average loss: 0.0017
Saved model to transformer_epoch719.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.56s/it]


Epoch 720/1000 average loss: 0.0019
Saved model to transformer_epoch720.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.43s/it]


Epoch 721/1000 average loss: 0.0024
Saved model to transformer_epoch721.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.71s/it]


Epoch 722/1000 average loss: 0.0013
Saved model to transformer_epoch722.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.83s/it]


Epoch 723/1000 average loss: 0.0015
Saved model to transformer_epoch723.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.24s/it]


Epoch 724/1000 average loss: 0.0038
Saved model to transformer_epoch724.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.04s/it]


Epoch 725/1000 average loss: 0.0013
Saved model to transformer_epoch725.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.17s/it]


Epoch 726/1000 average loss: 0.0011
Saved model to transformer_epoch726.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.27s/it]


Epoch 727/1000 average loss: 0.0011
Saved model to transformer_epoch727.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.21s/it]


Epoch 728/1000 average loss: 0.0014
Saved model to transformer_epoch728.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.25s/it]


Epoch 729/1000 average loss: 0.0011
Saved model to transformer_epoch729.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.07s/it]


Epoch 730/1000 average loss: 0.0012
Saved model to transformer_epoch730.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.21s/it]


Epoch 731/1000 average loss: 0.0012
Saved model to transformer_epoch731.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.99s/it]


Epoch 732/1000 average loss: 0.0013
Saved model to transformer_epoch732.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.37s/it]


Epoch 733/1000 average loss: 0.0011
Saved model to transformer_epoch733.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.96s/it]


Epoch 734/1000 average loss: 0.0014
Saved model to transformer_epoch734.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.30s/it]


Epoch 735/1000 average loss: 0.0011
Saved model to transformer_epoch735.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.80s/it]


Epoch 736/1000 average loss: 0.0012
Saved model to transformer_epoch736.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.52s/it]


Epoch 737/1000 average loss: 0.0011
Saved model to transformer_epoch737.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.01s/it]


Epoch 738/1000 average loss: 0.0011
Saved model to transformer_epoch738.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.59s/it]


Epoch 739/1000 average loss: 0.0011
Saved model to transformer_epoch739.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.93s/it]


Epoch 740/1000 average loss: 0.0011
Saved model to transformer_epoch740.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.62s/it]


Epoch 741/1000 average loss: 0.0009
Saved model to transformer_epoch741.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.77s/it]


Epoch 742/1000 average loss: 0.0040
Saved model to transformer_epoch742.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.71s/it]


Epoch 743/1000 average loss: 0.0010
Saved model to transformer_epoch743.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.80s/it]


Epoch 744/1000 average loss: 0.0010
Saved model to transformer_epoch744.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.68s/it]


Epoch 745/1000 average loss: 0.0011
Saved model to transformer_epoch745.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.87s/it]


Epoch 746/1000 average loss: 0.0011
Saved model to transformer_epoch746.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.79s/it]


Epoch 747/1000 average loss: 0.0011
Saved model to transformer_epoch747.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.70s/it]


Epoch 748/1000 average loss: 0.0016
Saved model to transformer_epoch748.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.61s/it]


Epoch 749/1000 average loss: 0.0014
Saved model to transformer_epoch749.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.63s/it]


Epoch 750/1000 average loss: 0.0014
Saved model to transformer_epoch750.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.73s/it]


Epoch 751/1000 average loss: 0.0021
Saved model to transformer_epoch751.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.67s/it]


Epoch 752/1000 average loss: 0.0017
Saved model to transformer_epoch752.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.73s/it]


Epoch 753/1000 average loss: 0.0013
Saved model to transformer_epoch753.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.71s/it]


Epoch 754/1000 average loss: 0.0012
Saved model to transformer_epoch754.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.76s/it]


Epoch 755/1000 average loss: 0.0117
Saved model to transformer_epoch755.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.75s/it]


Epoch 756/1000 average loss: 0.0011
Saved model to transformer_epoch756.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.82s/it]


Epoch 757/1000 average loss: 0.0010
Saved model to transformer_epoch757.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.73s/it]


Epoch 758/1000 average loss: 0.0010
Saved model to transformer_epoch758.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.83s/it]


Epoch 759/1000 average loss: 0.0013
Saved model to transformer_epoch759.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.77s/it]


Epoch 760/1000 average loss: 0.0013
Saved model to transformer_epoch760.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.81s/it]


Epoch 761/1000 average loss: 0.0023
Saved model to transformer_epoch761.pth


Training batches: 100%|██████████| 1/1 [00:14<00:00, 14.49s/it]


Epoch 762/1000 average loss: 0.0014
Saved model to transformer_epoch762.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.79s/it]


Epoch 763/1000 average loss: 0.0021
Saved model to transformer_epoch763.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.42s/it]


Epoch 764/1000 average loss: 0.0020
Saved model to transformer_epoch764.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.39s/it]


Epoch 765/1000 average loss: 0.0073
Saved model to transformer_epoch765.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.72s/it]


Epoch 766/1000 average loss: 0.0042
Saved model to transformer_epoch766.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.81s/it]


Epoch 767/1000 average loss: 0.0013
Saved model to transformer_epoch767.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.46s/it]


Epoch 768/1000 average loss: 0.0016
Saved model to transformer_epoch768.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.88s/it]


Epoch 769/1000 average loss: 0.0012
Saved model to transformer_epoch769.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.26s/it]


Epoch 770/1000 average loss: 0.0014
Saved model to transformer_epoch770.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.99s/it]


Epoch 771/1000 average loss: 0.0014
Saved model to transformer_epoch771.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.12s/it]


Epoch 772/1000 average loss: 0.0026
Saved model to transformer_epoch772.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.18s/it]


Epoch 773/1000 average loss: 0.0014
Saved model to transformer_epoch773.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.25s/it]


Epoch 774/1000 average loss: 0.0025
Saved model to transformer_epoch774.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.20s/it]


Epoch 775/1000 average loss: 0.0028
Saved model to transformer_epoch775.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.99s/it]


Epoch 776/1000 average loss: 0.0016
Saved model to transformer_epoch776.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.19s/it]


Epoch 777/1000 average loss: 0.0057
Saved model to transformer_epoch777.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.10s/it]


Epoch 778/1000 average loss: 0.0016
Saved model to transformer_epoch778.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.28s/it]


Epoch 779/1000 average loss: 0.0012
Saved model to transformer_epoch779.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.06s/it]


Epoch 780/1000 average loss: 0.0012
Saved model to transformer_epoch780.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.21s/it]


Epoch 781/1000 average loss: 0.0030
Saved model to transformer_epoch781.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.29s/it]


Epoch 782/1000 average loss: 0.0012
Saved model to transformer_epoch782.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.47s/it]


Epoch 783/1000 average loss: 0.0013
Saved model to transformer_epoch783.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.35s/it]


Epoch 784/1000 average loss: 0.0015
Saved model to transformer_epoch784.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.33s/it]


Epoch 785/1000 average loss: 0.0017
Saved model to transformer_epoch785.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.23s/it]


Epoch 786/1000 average loss: 0.0015
Saved model to transformer_epoch786.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.38s/it]


Epoch 787/1000 average loss: 0.0150
Saved model to transformer_epoch787.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.21s/it]


Epoch 788/1000 average loss: 0.0021
Saved model to transformer_epoch788.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.29s/it]


Epoch 789/1000 average loss: 0.0020
Saved model to transformer_epoch789.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.39s/it]


Epoch 790/1000 average loss: 0.0018
Saved model to transformer_epoch790.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.25s/it]


Epoch 791/1000 average loss: 0.0015
Saved model to transformer_epoch791.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.42s/it]


Epoch 792/1000 average loss: 0.0023
Saved model to transformer_epoch792.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.43s/it]


Epoch 793/1000 average loss: 0.0020
Saved model to transformer_epoch793.pth


Training batches: 100%|██████████| 1/1 [00:18<00:00, 18.89s/it]


Epoch 794/1000 average loss: 0.0014
Saved model to transformer_epoch794.pth


Training batches: 100%|██████████| 1/1 [00:12<00:00, 12.91s/it]


Epoch 795/1000 average loss: 0.0014
Saved model to transformer_epoch795.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.39s/it]


Epoch 796/1000 average loss: 0.0043
Saved model to transformer_epoch796.pth


Training batches: 100%|██████████| 1/1 [00:07<00:00,  7.16s/it]


Epoch 797/1000 average loss: 0.0014
Saved model to transformer_epoch797.pth


Training batches: 100%|██████████| 1/1 [00:07<00:00,  7.03s/it]


Epoch 798/1000 average loss: 0.0013
Saved model to transformer_epoch798.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.09s/it]


Epoch 799/1000 average loss: 0.0012
Saved model to transformer_epoch799.pth


Training batches: 100%|██████████| 1/1 [00:07<00:00,  7.10s/it]


Epoch 800/1000 average loss: 0.0012
Saved model to transformer_epoch800.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.95s/it]


Epoch 801/1000 average loss: 0.0013
Saved model to transformer_epoch801.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.60s/it]


Epoch 802/1000 average loss: 0.0011
Saved model to transformer_epoch802.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.71s/it]


Epoch 803/1000 average loss: 0.0012
Saved model to transformer_epoch803.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.62s/it]


Epoch 804/1000 average loss: 0.0012
Saved model to transformer_epoch804.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.88s/it]


Epoch 805/1000 average loss: 0.0011
Saved model to transformer_epoch805.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.75s/it]


Epoch 806/1000 average loss: 0.0011
Saved model to transformer_epoch806.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.02s/it]


Epoch 807/1000 average loss: 0.0012
Saved model to transformer_epoch807.pth


Training batches: 100%|██████████| 1/1 [00:08<00:00,  8.46s/it]


Epoch 808/1000 average loss: 0.0020
Saved model to transformer_epoch808.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.90s/it]


Epoch 809/1000 average loss: 0.0011
Saved model to transformer_epoch809.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.76s/it]


Epoch 810/1000 average loss: 0.0011
Saved model to transformer_epoch810.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.82s/it]


Epoch 811/1000 average loss: 0.0012
Saved model to transformer_epoch811.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.66s/it]


Epoch 812/1000 average loss: 0.0010
Saved model to transformer_epoch812.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.87s/it]


Epoch 813/1000 average loss: 0.0010
Saved model to transformer_epoch813.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.66s/it]


Epoch 814/1000 average loss: 0.0055
Saved model to transformer_epoch814.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.86s/it]


Epoch 815/1000 average loss: 0.0011
Saved model to transformer_epoch815.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.74s/it]


Epoch 816/1000 average loss: 0.0011
Saved model to transformer_epoch816.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.82s/it]


Epoch 817/1000 average loss: 0.0019
Saved model to transformer_epoch817.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.63s/it]


Epoch 818/1000 average loss: 0.0010
Saved model to transformer_epoch818.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.82s/it]


Epoch 819/1000 average loss: 0.0014
Saved model to transformer_epoch819.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.62s/it]


Epoch 820/1000 average loss: 0.0026
Saved model to transformer_epoch820.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.78s/it]


Epoch 821/1000 average loss: 0.0010
Saved model to transformer_epoch821.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.65s/it]


Epoch 822/1000 average loss: 0.0012
Saved model to transformer_epoch822.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.71s/it]


Epoch 823/1000 average loss: 0.0015
Saved model to transformer_epoch823.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.53s/it]


Epoch 824/1000 average loss: 0.0011
Saved model to transformer_epoch824.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.79s/it]


Epoch 825/1000 average loss: 0.0011
Saved model to transformer_epoch825.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.50s/it]


Epoch 826/1000 average loss: 0.0011
Saved model to transformer_epoch826.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.79s/it]


Epoch 827/1000 average loss: 0.0011
Saved model to transformer_epoch827.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.51s/it]


Epoch 828/1000 average loss: 0.0010
Saved model to transformer_epoch828.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.71s/it]


Epoch 829/1000 average loss: 0.0011
Saved model to transformer_epoch829.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.38s/it]


Epoch 830/1000 average loss: 0.0009
Saved model to transformer_epoch830.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.80s/it]


Epoch 831/1000 average loss: 0.0012
Saved model to transformer_epoch831.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.13s/it]


Epoch 832/1000 average loss: 0.0010
Saved model to transformer_epoch832.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.01s/it]


Epoch 833/1000 average loss: 0.0038
Saved model to transformer_epoch833.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.02s/it]


Epoch 834/1000 average loss: 0.0010
Saved model to transformer_epoch834.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.41s/it]


Epoch 835/1000 average loss: 0.0012
Saved model to transformer_epoch835.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.09s/it]


Epoch 836/1000 average loss: 0.0011
Saved model to transformer_epoch836.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.44s/it]


Epoch 837/1000 average loss: 0.0010
Saved model to transformer_epoch837.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.93s/it]


Epoch 838/1000 average loss: 0.0117
Saved model to transformer_epoch838.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.25s/it]


Epoch 839/1000 average loss: 0.0011
Saved model to transformer_epoch839.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.97s/it]


Epoch 840/1000 average loss: 0.0010
Saved model to transformer_epoch840.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.43s/it]


Epoch 841/1000 average loss: 0.0014
Saved model to transformer_epoch841.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.51s/it]


Epoch 842/1000 average loss: 0.0012
Saved model to transformer_epoch842.pth


Training batches: 100%|██████████| 1/1 [00:08<00:00,  8.71s/it]


Epoch 843/1000 average loss: 0.0255
Saved model to transformer_epoch843.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.46s/it]


Epoch 844/1000 average loss: 0.0016
Saved model to transformer_epoch844.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.80s/it]


Epoch 845/1000 average loss: 0.0011
Saved model to transformer_epoch845.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.44s/it]


Epoch 846/1000 average loss: 0.0014
Saved model to transformer_epoch846.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.69s/it]


Epoch 847/1000 average loss: 0.0073
Saved model to transformer_epoch847.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.20s/it]


Epoch 848/1000 average loss: 0.0029
Saved model to transformer_epoch848.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.94s/it]


Epoch 849/1000 average loss: 0.0024
Saved model to transformer_epoch849.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.05s/it]


Epoch 850/1000 average loss: 0.0013
Saved model to transformer_epoch850.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.11s/it]


Epoch 851/1000 average loss: 0.0013
Saved model to transformer_epoch851.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.86s/it]


Epoch 852/1000 average loss: 0.0015
Saved model to transformer_epoch852.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.35s/it]


Epoch 853/1000 average loss: 0.0011
Saved model to transformer_epoch853.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.93s/it]


Epoch 854/1000 average loss: 0.0080
Saved model to transformer_epoch854.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.58s/it]


Epoch 855/1000 average loss: 0.0012
Saved model to transformer_epoch855.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.93s/it]


Epoch 856/1000 average loss: 0.0013
Saved model to transformer_epoch856.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.43s/it]


Epoch 857/1000 average loss: 0.0025
Saved model to transformer_epoch857.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.74s/it]


Epoch 858/1000 average loss: 0.0013
Saved model to transformer_epoch858.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.65s/it]


Epoch 859/1000 average loss: 0.0013
Saved model to transformer_epoch859.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.85s/it]


Epoch 860/1000 average loss: 0.0014
Saved model to transformer_epoch860.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.86s/it]


Epoch 861/1000 average loss: 0.0011
Saved model to transformer_epoch861.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.74s/it]


Epoch 862/1000 average loss: 0.0011
Saved model to transformer_epoch862.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.78s/it]


Epoch 863/1000 average loss: 0.0016
Saved model to transformer_epoch863.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.78s/it]


Epoch 864/1000 average loss: 0.0012
Saved model to transformer_epoch864.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.76s/it]


Epoch 865/1000 average loss: 0.0018
Saved model to transformer_epoch865.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.79s/it]


Epoch 866/1000 average loss: 0.0010
Saved model to transformer_epoch866.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.62s/it]


Epoch 867/1000 average loss: 0.0009
Saved model to transformer_epoch867.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.88s/it]


Epoch 868/1000 average loss: 0.0015
Saved model to transformer_epoch868.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.78s/it]


Epoch 869/1000 average loss: 0.0010
Saved model to transformer_epoch869.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.75s/it]


Epoch 870/1000 average loss: 0.0010
Saved model to transformer_epoch870.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.80s/it]


Epoch 871/1000 average loss: 0.0009
Saved model to transformer_epoch871.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.83s/it]


Epoch 872/1000 average loss: 0.0012
Saved model to transformer_epoch872.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.81s/it]


Epoch 873/1000 average loss: 0.0010
Saved model to transformer_epoch873.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.86s/it]


Epoch 874/1000 average loss: 0.0008
Saved model to transformer_epoch874.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.78s/it]


Epoch 875/1000 average loss: 0.0009
Saved model to transformer_epoch875.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.80s/it]


Epoch 876/1000 average loss: 0.0008
Saved model to transformer_epoch876.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.76s/it]


Epoch 877/1000 average loss: 0.0008
Saved model to transformer_epoch877.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.74s/it]


Epoch 878/1000 average loss: 0.0008
Saved model to transformer_epoch878.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.66s/it]


Epoch 879/1000 average loss: 0.0008
Saved model to transformer_epoch879.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.74s/it]


Epoch 880/1000 average loss: 0.0009
Saved model to transformer_epoch880.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.69s/it]


Epoch 881/1000 average loss: 0.0008
Saved model to transformer_epoch881.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.87s/it]


Epoch 882/1000 average loss: 0.0009
Saved model to transformer_epoch882.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.77s/it]


Epoch 883/1000 average loss: 0.0008
Saved model to transformer_epoch883.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.77s/it]


Epoch 884/1000 average loss: 0.0011
Saved model to transformer_epoch884.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.66s/it]


Epoch 885/1000 average loss: 0.0008
Saved model to transformer_epoch885.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.48s/it]


Epoch 886/1000 average loss: 0.0007
Saved model to transformer_epoch886.pth


Training batches: 100%|██████████| 1/1 [00:14<00:00, 14.21s/it]


Epoch 887/1000 average loss: 0.0008
Saved model to transformer_epoch887.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.81s/it]


Epoch 888/1000 average loss: 0.0008
Saved model to transformer_epoch888.pth


Training batches: 100%|██████████| 1/1 [00:07<00:00,  7.13s/it]


Epoch 889/1000 average loss: 0.0007
Saved model to transformer_epoch889.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.40s/it]


Epoch 890/1000 average loss: 0.0007
Saved model to transformer_epoch890.pth


Training batches: 100%|██████████| 1/1 [00:09<00:00,  9.63s/it]


Epoch 891/1000 average loss: 0.0010
Saved model to transformer_epoch891.pth


Training batches: 100%|██████████| 1/1 [00:07<00:00,  7.02s/it]


Epoch 892/1000 average loss: 0.0007
Saved model to transformer_epoch892.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.03s/it]


Epoch 893/1000 average loss: 0.0008
Saved model to transformer_epoch893.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.73s/it]


Epoch 894/1000 average loss: 0.0008
Saved model to transformer_epoch894.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.84s/it]


Epoch 895/1000 average loss: 0.0007
Saved model to transformer_epoch895.pth


Training batches: 100%|██████████| 1/1 [00:09<00:00,  9.33s/it]


Epoch 896/1000 average loss: 0.0009
Saved model to transformer_epoch896.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.51s/it]


Epoch 897/1000 average loss: 0.0008
Saved model to transformer_epoch897.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.59s/it]


Epoch 898/1000 average loss: 0.0009
Saved model to transformer_epoch898.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.46s/it]


Epoch 899/1000 average loss: 0.0007
Saved model to transformer_epoch899.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.65s/it]


Epoch 900/1000 average loss: 0.0007
Saved model to transformer_epoch900.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.52s/it]


Epoch 901/1000 average loss: 0.0006
Saved model to transformer_epoch901.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.38s/it]


Epoch 902/1000 average loss: 0.0008
Saved model to transformer_epoch902.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.60s/it]


Epoch 903/1000 average loss: 0.0014
Saved model to transformer_epoch903.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.25s/it]


Epoch 904/1000 average loss: 0.0007
Saved model to transformer_epoch904.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.48s/it]


Epoch 905/1000 average loss: 0.0007
Saved model to transformer_epoch905.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.05s/it]


Epoch 906/1000 average loss: 0.0006
Saved model to transformer_epoch906.pth


Training batches: 100%|██████████| 1/1 [00:08<00:00,  8.34s/it]


Epoch 907/1000 average loss: 0.0007
Saved model to transformer_epoch907.pth


Training batches: 100%|██████████| 1/1 [00:11<00:00, 11.33s/it]


Epoch 908/1000 average loss: 0.0007
Saved model to transformer_epoch908.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.71s/it]


Epoch 909/1000 average loss: 0.0028
Saved model to transformer_epoch909.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.80s/it]


Epoch 910/1000 average loss: 0.0007
Saved model to transformer_epoch910.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.92s/it]


Epoch 911/1000 average loss: 0.0007
Saved model to transformer_epoch911.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.49s/it]


Epoch 912/1000 average loss: 0.0007
Saved model to transformer_epoch912.pth


Training batches: 100%|██████████| 1/1 [00:08<00:00,  8.43s/it]


Epoch 913/1000 average loss: 0.0007
Saved model to transformer_epoch913.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.96s/it]


Epoch 914/1000 average loss: 0.0007
Saved model to transformer_epoch914.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.78s/it]


Epoch 915/1000 average loss: 0.0009
Saved model to transformer_epoch915.pth


Training batches: 100%|██████████| 1/1 [00:07<00:00,  7.02s/it]


Epoch 916/1000 average loss: 0.0008
Saved model to transformer_epoch916.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.98s/it]


Epoch 917/1000 average loss: 0.0008
Saved model to transformer_epoch917.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.34s/it]


Epoch 918/1000 average loss: 0.0006
Saved model to transformer_epoch918.pth


Training batches: 100%|██████████| 1/1 [00:07<00:00,  7.01s/it]


Epoch 919/1000 average loss: 0.0009
Saved model to transformer_epoch919.pth


Training batches: 100%|██████████| 1/1 [00:06<00:00,  6.90s/it]


Epoch 920/1000 average loss: 0.0007
Saved model to transformer_epoch920.pth


Training batches: 100%|██████████| 1/1 [00:05<00:00,  5.91s/it]


Epoch 921/1000 average loss: 0.0008


RuntimeError: [enforce fail at inline_container.cc:664] . unexpected pos 50500224 vs 50500112

In [None]:
# Smoke test: run the summarizer on a short inline Hindi sentence and print the result.
# `summarize`, `model` and `sp` are not defined in this cell; they must already
# exist in the notebook session.
summary = summarize(model, "यह एक हिंदी बैठक की ट्रांसक्रिप्ट है जिसमें...", sp)
print(summary)


In [None]:
# Load the trained SentencePiece model from the current working directory and
# print the subword pieces (out_type=str) for one sample sentence, as a quick
# visual check of how the tokenizer segments Hindi text.
sp = spm.SentencePieceProcessor(model_file="hindi_spm.model")
print(sp.encode("यह एक हिंदी बैठक की ट्रांसक्रिप्ट है।", out_type=str))


['▁यह', '▁एक', '▁ह', 'ि', 'ंद', 'ी', '▁ब', 'ै', 'ठ', 'क', '▁की', '▁ट्रांसक्रिप्ट', '▁है', '।']


In [None]:
import torch
import sentencepiece as spm


In [None]:
# Mount Google Drive at /content/drive (Google Colab runtimes only).
# NOTE(review): the `google.colab` package is presumably unavailable outside
# Colab, so this cell is expected to fail elsewhere -- confirm before reuse.
from google.colab import drive
drive.mount('/content/drive')


ValueError: /root/.config/Google must be a directory if present

In [None]:
# Rebuild the inference setup: tokenizer, model architecture, then the trained
# weights from the epoch-901 checkpoint.

# Load the SentencePiece tokenizer
sp = spm.SentencePieceProcessor()
sp.load("/content/hindi_spm.model")

# Load model checkpoint
# Use the GPU when one is available, otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")


# Architecture hyperparameters. They must match the values used when the
# checkpoint was saved; load_state_dict() is strict by default and raises on
# missing/unexpected keys or mismatched tensor shapes.
d_model = 512  # same as checkpoint
nhead = 8
num_encoder_layers = 3
num_decoder_layers = 3

# NOTE(review): `Seq2SeqTransformer`, `src_vocab_size` and `tgt_vocab_size` are
# not defined in this cell; they must already exist in the notebook session.
# The vocabulary sizes presumably have to equal the tokenizer's vocabulary size
# used during training -- confirm against the training cell.
model = Seq2SeqTransformer(num_encoder_layers, num_decoder_layers, d_model, nhead,
                           src_vocab_size, tgt_vocab_size).to(DEVICE)

# map_location remaps the stored tensors onto DEVICE, so a checkpoint saved on
# a GPU can also be loaded on a CPU-only runtime.
# NOTE(review): torch.load() unpickles the file; only load checkpoints from a
# trusted source.
model.load_state_dict(torch.load("/content/transformer_epoch901.pth", map_location=DEVICE))
# Switch to evaluation mode (disables dropout). As the last expression of the
# cell, the returned module is also displayed by the notebook.
model.eval()

Seq2SeqTransformer(
  (transformer): Transformer(
    (encoder): TransformerEncoder(
      (layers): ModuleList(
        (0-2): 3 x TransformerEncoderLayer(
          (self_attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
          )
          (linear1): Linear(in_features=512, out_features=1024, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
          (linear2): Linear(in_features=1024, out_features=512, bias=True)
          (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (dropout1): Dropout(p=0.1, inplace=False)
          (dropout2): Dropout(p=0.1, inplace=False)
        )
      )
      (norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
    )
    (decoder): TransformerDecoder(
      (layers): ModuleList(
        (0-2): 3 x TransformerDecoderLayer(
          (self_attn): MultiheadAtte

In [None]:
# Read the whole meeting transcript into memory as UTF-8 text.
with open("/content/meeting_hindi.txt", "r", encoding="utf-8") as f:
    hindi_text = f.read()

print("📝 Transcript preview:\n", hindi_text[:1000])  # preview the first 1000 characters



📝 Transcript preview:
 स्मार्टडेस्क एआई - XYZ टेक्नोलॉजीज टीम मीटिंग ट्रांसक्रिप्ट
[10:00:00] रघु (नेतृत्व): आइए शुरुआती और मुख्य उपयोगकर्ताओं की बात करें। मुझे केंद्रित, व्यावहारिक सुझाव चाहिए।
[10:00:08] रघु (नेतृत्व): मुझे लगता है कि छात्र अध्ययन के एर्गोनॉमिक्स और लंबे
डेस्क सत्रों के लिए स्मार्टडेस्क एआई की सराहना करेंगे।
[10:00:20] टिया (ग्राहक सहायता): मुझे लगता है कि दूरस्थ कर्मचारी अध्ययन के एर्गोनॉमिक्स
और लंबे डेस्क सत्रों के लिए स्मार्टडेस्क एआई की सराहना करेंगे।
[10:00:42] राम (उत्पाद): हमें संकेत मिल रहे हैं कि छात्र लचीले सेटअप और सोशल शेयरिंग को महत्व देते हैं। साथ ही,
टिया ने एक अच्छी बात उठाई कि मुझे लगता है कि दूरस्थ कर्मचारी इसकी सराहना करेंगे।
[10:01:04] रविन (मार्केटिंग): स्टार्टअप एक अच्छा लक्ष्य हो सकते हैं क्योंकि मुद्रा सुधार और उत्पादकता बेहतर होती है।
[10:00:45] राजेश (वित्त): हमें संकेत मिल रहे हैं कि दूरस्थ कर्मचारी अध्ययन के एर्गोनॉमिक्स और लंबे डेस्क
सेशन को महत्व देते हैं।
[10:00:42] राम (उत्पाद): क्या होगा अगर हम लचीले सेटअप
और सोशल शेयरिंग पर केंद्रित

In [None]:
# Summarize the full transcript loaded into `hindi_text` with the restored model
# and print the result. `summarize` is not defined in this cell; it must already
# exist in the notebook session.
# NOTE(review): in the recorded run the printed "summary" reproduces the opening
# of the transcript verbatim rather than condensing it -- check the training
# targets and whether the model has simply memorised its training data.
summary = summarize(model, hindi_text, sp)
print("🧾 Generated Summary:\n", summary)


🧾 Generated Summary:
 स्मार्टडेस्क एआई - XYZ टेक्नोलॉजीज टीम मीटिंग ट्रांसक्रिप्ट [10:00:00] रघु (नेतृत्व): आइए शुरुआती और मुख्य उपयोगकर्ताओं की बात करें। मुझे केंद्रित, व्यावहारिक सुझाव चाहिए। [10:00:08] रघु (नेतृत्व): मुझे लगता है कि छात्र अध्ययन के एर्गोनॉमिक्स और लंबे डेस्क सत्रों के लिए स्मार्टडेस्क एआई की सराहना करेंगे। [10:00:20] टिया (ग्राहक सहायता): मुझे लगता है कि दूरस्थ कर्मचारी अध्ययन के एर्गोनॉमिक्स और लंबे डेस्क सत्रों के लिए स्मार्टडेस्क एआई की सराहना करेंगे। [10:00:42] राम (उत्पाद): हमें संकेत मिल रहे हैं कि छात्र लचीले सेटअप और सोशल शेयरिंग को महत्व देते हैं। साथ ही, टिया ने एक अच्छी बात उठाई कि मुझे लगता है कि दूरस्थ
