In [29]:
from dataclasses import dataclass


@dataclass
class Config:
    pretrain_raw_data = "dataset/tamil_pretrain.txt"
    context_length = 512
    filtered_data = "dataset/filtered_cl_tamil_pretrain.txt"
    vocab_size = 30000
    tokenizer_path = "artifacts/tokenizer.json"

## Data Preprocessing

In [14]:
import mmap
import re

def clean_whitespaces(text):
    return re.sub(r"\s+", " ", text).strip()


def filter_sentences(file_path, max_len, output_file):
    with open(file_path, "r", encoding="utf-8") as f:

        mmapped_file = mmap.mmap(
            f.fileno(),
            0,
            access=mmap.ACCESS_READ
        )
        with open(output_file, "w") as out_file:
            for line in iter(mmapped_file.readline, b""):
                sentence = line.decode("utf-8").strip()
                sentence = clean_whitespaces(sentence)
                if sentence:
                    if len(sentence) >= max_len:
                        out_file.write(sentence + "\n")
        mmapped_file.close()

In [15]:
filter_sentences(Config.pretrain_raw_data, Config.context_length, "dataset/filtered_cl_tamil_pretrain.txt")

## Training a custom tokenizer
This block helps to train a BPE tokenizer from scratch for the new filtered dataset.

You can learn how to build a tokenizer from scratch block by block here: [Link](https://huggingface.co/learn/nlp-course/en/chapter6/8)

In [19]:
from tokenizers import (
    decoders,
    models,
    normalizers,
    pre_tokenizers,
    processors,
    trainers,
    Tokenizer,
)

In [20]:
tamil_tokenizer = Tokenizer(models.BPE())

In [21]:
tamil_tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)

In [22]:
#tamil sample sentence
sample_text = "ро╡рогроХрпНроХроорпН роЙро▓роХроорпН"
tamil_tokenizer.pre_tokenizer.pre_tokenize_str(sample_text)

[('├а┬о┬╡├а┬о┬г├а┬о─╖', (0, 3)),
 ('├а┬п─п', (3, 4)),
 ('├а┬о─╖├а┬о┬о', (4, 6)),
 ('├а┬п─п', (6, 7)),
 ('─а├а┬о─л├а┬о┬▓├а┬о─╖├а┬о┬о', (7, 12)),
 ('├а┬п─п', (12, 13))]

In [24]:
trainer = trainers.BpeTrainer(vocab_size=Config.vocab_size, special_tokens=["<|endoftext|>"])
tamil_tokenizer.model = models.BPE()
tamil_tokenizer.train([Config.filtered_data], trainer=trainer)






In [25]:
tamil_tokenizer.post_processor = processors.ByteLevel(trim_offsets=False)

In [26]:
tamil_tokenizer.decoder = decoders.ByteLevel()

In [27]:
tamil_tokenizer.decode(
    tamil_tokenizer.encode("ро╡рогроХрпНроХроорпН роЙро▓роХроорпН").ids
)

'ро╡рогроХрпНроХроорпН роЙро▓роХроорпН'

In [30]:
tamil_tokenizer.save(Config.tokenizer_path)

## Training pipeline setup

In [32]:
from transformers import PreTrainedTokenizerFast
from datasets import load_dataset

In [33]:
#loading the splits
data_files ={
    "train": "dataset/v2/tamil_train.txt",
    "test": "dataset/v2/tamil_test.txt",
}

raw_dataset = load_dataset(
    "text",
    data_files=data_files
)

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [35]:
tokenizer = PreTrainedTokenizerFast(
    tokenizer_file=Config.tokenizer_path,
)

outputs = tokenizer(
    raw_dataset["train"][:2]['text'],
    truncation=True,
    max_length=Config.context_length,
    return_overflowing_tokens=True,
    return_length=True

)

print(f"Input IDs length: {len(outputs['input_ids'])}")
print(f"Input chunk lengths: {outputs['length']}")
print(f"Chunk mapping: {outputs['overflow_to_sample_mapping']}")

Input IDs length: 3
Input chunk lengths: [512, 390, 496]
Chunk mapping: [0, 0, 1]


In [41]:
raw_dataset['train'][:2]['text']

['роГрокро╛ропрпЖроЯрпНроЯрпЖ (1825) роЖроХро┐ропро╡ро▒рпНро▒ро╛ро▓рпН роЪро╛роорпБро╡рпЗро▓рпН роорпЛро░рпНро╕рпБроХрпНроХрпБ роЬройроиро╛ропроХ родрпЗроЪро┐роп роЙрогро░рпНро╡ро┐ройрпН роорпАродрпБ роХрпБро┤рокрпНрокроорпН роПро▒рпНрокроЯрпНроЯродрпБ. роЪрпЖропро▓рпН роорпБро▒рпИропро┐ро▓рпН роЕроорпЖро░ро┐роХрпНроХ роороХрпНроХро│ро╛роЯрпНроЪро┐ропрпИроХрпН роХро╛рогрпНрокро┐рокрпНрокродро▒рпНроХро╛роХ роЕроорпЖро░ро┐роХрпНроХ рокро┐ро░родро┐роиро┐родро┐роХро│рпН роЕро╡рпИропрпИ роЕро╡ро░рпН роУро╡ро┐ропрооро╛роХ ро╡ро░рпИропродрпН родрпЗро░рпНроирпНродрпЖроЯрпБродрпНродро╛ро░рпН. роЕро╡ро░рпН рокрпБродро┐роп роЕро▒рпИроХро│ро┐ройрпН роХроЯрпНроЯроорпИрокрпНрокрпИ ро╡ро░рпИро╡родро▒рпНроХрпБ ро╡ро╛ро╖ро┐роЩрпНроЯройрпН D.C.роХрпНроХрпБ рокропрогро┐родрпНродро╛ро░рпН. роЕроирпНрод роУро╡ро┐ропродрпНродро┐ро▓рпН роХро╡ройрооро╛роХ роОрогрпНрокродрпБ рокрпЗро░рпИ ро╡ро░рпИроирпНродро┐ро░рпБроирпНродро╛ро░рпН. роорпЗро▓рпБроорпН роЕродро┐ро▓рпН роЗроЯроорпН рокрпЖро▒рпНро▒ро┐ро░рпБроирп

In [44]:
def tokenize(element):
    outputs = tokenizer(
        element["text"],
        truncation=True,
        max_length=Config.context_length,
        return_overflowing_tokens=True,
        return_length=True
    )

    input_batch = []
    for length, input_ids in zip(
        outputs['length'], outputs['input_ids']
    ):
        if length == Config.context_length:
            input_batch.append(input_ids)
    return {"input_ids": input_batch}

tokenized_dataset = raw_dataset.map(
    tokenize,
    batched=True,
    remove_columns=["text"],
)

Map:   0%|          | 0/1184417 [00:00<?, ? examples/s]

Map:   0%|          | 0/345494 [00:00<?, ? examples/s]

In [45]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids'],
        num_rows: 1528958
    })
    test: Dataset({
        features: ['input_ids'],
        num_rows: 445316
    })
})

In [47]:
from transformers import AutoTokenizer, GPT2LMHeadModel, AutoConfig

config = AutoConfig.from_pretrained(
    "gpt2",
    vocab_size=len(tokenizer),
    n_ctx=Config.context_length,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
)

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

In [48]:
model = GPT2LMHeadModel(config)
model_size = sum(t.numel() for t in model.parameters())
print(f"GPT-2 size: {model_size/1000**2:.1f}M parameters")

GPT-2 size: 108.9M parameters


In [None]:
# 1. First, add the pad token more explicitly
if tokenizer.pad_token is None:
    # Method 1: Set pad_token to eos_token
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id
    
    # Method 2: If Method 1 doesn't work, try adding a special token
    # This is more reliable as it modifies the tokenizer's vocabulary
    special_tokens_dict = {'pad_token': '[PAD]'}
    num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)
    print(f"Added {num_added_toks} special tokens: {special_tokens_dict}")
    
    # If working with a model, resize embeddings to match new vocabulary size
    # model.resize_token_embeddings(len(tokenizer))

# 2. Verify that pad token is set
print(f"Pad token: '{tokenizer.pad_token}', ID: {tokenizer.pad_token_id}")

Added 1 special tokens: {'pad_token': '[PAD]'}
Pad token: '[PAD]', ID: 30000


In [64]:
# 2. Create data collator with explicit padding
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False, 
)

# 3. Try processing your batch
out = data_collator([tokenized_dataset["train"][i] for i in range(5)])
for key in out:
    print(f"{key} shape: {out[key].shape}")

input_ids shape: torch.Size([5, 512])
attention_mask shape: torch.Size([5, 512])
labels shape: torch.Size([5, 512])


In [65]:
from transformers import Trainer, TrainingArguments

args = TrainingArguments(
    output_dir="artifacts",
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    evaluation_strategy="steps",
    eval_steps=5_000,
    logging_steps=5_000,
    gradient_accumulation_steps=8,
    num_train_epochs=1,
    weight_decay=0.1,
    warmup_steps=1_000,
    lr_scheduler_type="cosine",
    learning_rate=5e-4,
    save_steps=5_000,
    fp16=True,
    push_to_hub=True,
)

trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
)

  trainer = Trainer(


In [None]:
trainer.train()

OutOfMemoryError: CUDA out of memory. Tried to allocate 192.00 MiB. GPU 0 has a total capacity of 7.60 GiB of which 139.06 MiB is free. Process 2120 has 102.55 MiB memory in use. Including non-PyTorch memory, this process has 7.10 GiB memory in use. Of the allocated memory 6.92 GiB is allocated by PyTorch, and 55.95 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

: 

## Transformer from scratch

In [4]:
1/8

0.125

In [3]:
64 ** -0.5

0.125

In [3]:
import math
import torch
import torch.nn as nn
import torch.nn.functional as F

In [None]:
#Block:
    # Masked Multi-head Attention
    # Residual connection
    # Layer Normalization
    # Feed Forward
    # Residual connection
    # Layer Normalization

#Output Head:
    # Linear layer
    # Softmax
class MultiHeadAttention(nn.Module):
    
    def __init__(self, num_heads: int, dim: int, dropout:float):
        super().__init__()
        
        self.dim = dim
        self.num_heads = num_heads
        self.head_dim = dim // num_heads

        self.qkv = nn.Linear(dim, dim * 3)
        self.proj = nn.Linear(dim, dim)
        self.attn_dropout = nn.Dropout(dropout)
        self.out_dropout = nn.Dropout(dropout)
        self.scale = self.head_dim ** -0.5
    
    def forward(self, x, mask=None):
        batch_size, seq_len, dim = x.size()
        qkv = self.qkv(x).chunk(3, dim=-1)
        q, k, v = map(lambda t: t.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2), qkv)

        #attention scores
        attention = (q @ k.transpose(-2, -1)) * self.scale

        if mask is not None:
            mask = torch.triu(torch.ones(seq_len, seq_len), diagonal=1).bool().to(x.device)
            attention.masked_fill_(mask[None, None, ...], float("-inf"))
        
        attention = F.softmax(attention, dim=-1)
        attention = self.attn_dropout(attention)

        out = (attention @ v).transpose(1, 2).reshape(batch_size, seq_len, dim)
        out = self.proj(out)
        out = self.out_dropout(out)

        return out

class FeedForward(nn.Module):
    def __init__(self, dim: int = 512) -> None:
        super().__init__()

        self.fc1 = nn.Linear(dim, dim * 4)
        self.fc2 = nn.Linear(dim * 4, dim)
        self.dropout = nn.Dropout(0.1)
    
    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = self.dropout(x)
        x = self.fc2(x)
        return x

class Transformer(nn.Module):
    def __init__(self, dim:int=512, num_heads:int=8, dropout:float=0.1):
        super().__init__()

        self.attn_norm = nn.LayerNorm(dim)
        self.attention = MultiHeadAttention(dim=dim, num_heads=num_heads, dropout=dropout)

        self.ff = FeedForward(dim)
        self.ff_norm = nn.LayerNorm(dim)

        self.dropout = nn.Dropout(dropout)
    
    def forward(self, x, mask=None):

        residual = x
        x = self.attn_norm(x)
        x = self.attention(x, mask=mask)
        x = residual + self.dropout(x)
        
        residual = x
        x = self.ff_norm(x)
        x = self.ff(x)
        x = residual + self.dropout(x)
        return x
    
class SinusoidalPositionalEncoding(nn.Module):
    def __init__(self, dim: int, max_seq_len: int = 2048):
        super().__init__()
        
        # Create positional encodings
        pe = torch.zeros(max_seq_len, dim)
        position = torch.arange(0, max_seq_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, dim, 2) * -(math.log(10000.0) / dim))
        
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)
        
        # Register as buffer (not a parameter)
        self.register_buffer('pe', pe)
    
    def forward(self, x):
        # x shape: [batch_size, seq_len, dim]
        return x + self.pe[:, :x.size(1)]


class GPT(nn.Module):
    def __init__(self, num_tokens: int, dim: int, num_layers:int = 6):
        super().__init__()
        self.num_tokens = num_tokens
        self.dim = dim
        self.token_emb = nn.Embedding(num_tokens, dim)
        self.pos_emb = SinusoidalPositionalEncoding(dim=dim)
        self.transformer = nn.ModuleList([Transformer(dim=dim) for _ in range(6)])
        self.final_norm = nn.LayerNorm(dim)
        self.head = nn.Linear(dim, num_tokens)
        self.dropout = nn.Dropout(0.1)
    
    def forward(self, x):
        x = self.token_emb(x)
        x = self.pos_emb(x)
        x = self.dropout(x)
        for layer in self.transformer:
            x = layer(x)
        x = self.final_norm(x)
        x = self.head(x)
        return x

In [18]:
data = [x for x in open("data.txt", "r").readlines() if x.strip()]

In [19]:
token_list = []
for line in data:
    for char in line:
        if char not in token_list:
            token_list.append(char)

In [20]:
tokens = "".join(sorted(token_list))
num_tokens = len(tokens)
char_to_token = {char: token for token, char in enumerate(tokens)}
token_to_char = {token: char for token, char in enumerate(tokens)}

In [21]:
model = GPT(num_tokens, 512).to("cuda")

In [22]:
print(f"{sum(p.numel() for p in model.parameters())} parameters in model")

18981953 parameters in model


In [23]:
#tokenize the data using the map function
def tokenize(line):
    return torch.tensor([char_to_token[char] for char in line])


def pad_truncate(tokens, target_length):
    if len(tokens) > target_length:
        return tokens[: target_length]
    elif len(tokens) < target_length:
        return F.pad(tokens, (0, target_length - len(tokens)))
    return tokens


In [24]:
stk = torch.stack([pad_truncate(tokenize(line), 50) for line in data[:32]])

In [25]:
stk.shape

torch.Size([32, 50])

In [41]:
def generate(seed_token, model, context_length, max_new_tokens, temperature=1.0):
    model.eval()
    tokens = [char_to_token[seed_token]]
    
    for _ in range(max_new_tokens):
        # Prepare input
        x = torch.tensor(tokens[-context_length:]).unsqueeze(0).to("cuda:0").long()
        
        # Get predictions
        logits = model(x)
        
        # Sample next token (from the last position)
        logits = logits[0, -1, :] / temperature
        probs = F.softmax(logits, dim=0)
        next_token = torch.multinomial(probs, num_samples=1).item()
        
        tokens.append(next_token)
    
    # Convert to text
    return ''.join([token_list[t] for t in tokens])

In [49]:
generate("v", model, 1, 100)

"&rqF.XIqHPWbA3YL?LURPjaSyiem!pu\n\nrRsU\ncI;aGSNkVTf$-&wFuwha'zN LRV-:&sQEVXkcFMIT,irF mb?quSTnVrfof:JjJ"

In [15]:
#find number of times a character appears in the dataset
from collections import Counter

char_counter = Counter()
for line in data:
    char_counter.update(line)

char_counter

Counter({' ': 169892,
         'e': 94611,
         't': 67009,
         'o': 65798,
         'a': 55507,
         'h': 51310,
         's': 49696,
         'r': 48889,
         'n': 48529,
         'i': 45537,
         'l': 33339,
         '\n': 32777,
         'd': 31358,
         'u': 26584,
         'm': 22243,
         'y': 20448,
         ',': 19846,
         'w': 17585,
         'f': 15770,
         'c': 15623,
         'g': 13356,
         'I': 11832,
         'b': 11321,
         'p': 10808,
         ':': 10316,
         '.': 7885,
         'A': 7819,
         'v': 7793,
         'k': 7088,
         'T': 7015,
         "'": 6187,
         'E': 6041,
         'O': 5481,
         'N': 5079,
         'R': 4869,
         'S': 4523,
         'L': 3876,
         'C': 3820,
         ';': 3628,
         'W': 3530,
         'U': 3313,
         'H': 3068,
         'M': 2840,
         'B': 2761,
         '?': 2462,
         'G': 2399,
         '!': 2172,
         'D': 2089,
         '-':

In [51]:
from tqdm import tqdm
import numpy as np


batch_size = 32
lr = 1e-6
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
loss_fn = nn.CrossEntropyLoss()
context = 65



model = GPT(num_tokens, 512).to("cuda")

for epoch in range(2):
    step_loss = 0.0
    for i in tqdm(range(0, len(data), batch_size)):
        x = data[i: i+batch_size]
        x = torch.stack([pad_truncate(tokenize(line), context) for line in x])
        y = x[:, 1:]
        x = x[:, :-1]
        x = x.to("cuda:0").long()
        y = y.to("cuda:0").long()

        y_pred = model(x)
        y_pred = y_pred.view(-1, num_tokens)
        y = y.view(-1)
        loss = loss_fn(y_pred, y)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        step_loss += loss.item()
        if i % 1000 == 0:
            model.eval()
            print(f"Epoch {epoch} Iteration {i} Loss: {loss.item()}")
            with torch.no_grad():
                model_gen = generate(np.random.choice(token_list), model, 65, 100)
            print(model_gen)  
            model.train()
    
    


  0%|          | 0/1025 [00:00<?, ?it/s]

Epoch 0 Iteration 0 Loss: 4.222636699676514


  0%|          | 4/1025 [00:00<01:37, 10.53it/s]

Lo&B.;;L&nlHP3y'IFlJsMZVlkORsepRq?YMihVBOQ:oYypo;usm' PEi;C;RBiA3UiGJO:FG Xup-nbJQCQpB:wr:K'yOlaO3-o



 12%|тЦИтЦП        | 124/1025 [00:05<00:39, 23.09it/s]

Epoch 0 Iteration 4000 Loss: 4.206521987915039


 13%|тЦИтЦО        | 130/1025 [00:06<00:56, 15.94it/s]

&QrxeHsv:ljFpKbVqL
WcC?tR,KcWinPwCaGyuz!U,:mgPrvamayIV&v?ooYjTJvtQJEBJ,OgGJMVQiz$VQgUTjEuTjp!k:,TjZV$


 24%|тЦИтЦИтЦН       | 250/1025 [00:11<00:34, 22.73it/s]

Epoch 0 Iteration 8000 Loss: 4.227103233337402


 25%|тЦИтЦИтЦН       | 253/1025 [00:11<00:54, 14.19it/s]

!axv&ODvjJuCV$f-gt:eJ
? taBVjZrJs
&
GURfD-gYKizsxuwiPetXH&?dYCcBlr:j ,niQV'zUVoF:rvy&V  ;yo
aQKfJVuKX


 36%|тЦИтЦИтЦИтЦЛ      | 373/1025 [00:17<00:28, 22.52it/s]

Epoch 0 Iteration 12000 Loss: 4.197488784790039


 37%|тЦИтЦИтЦИтЦЛ      | 379/1025 [00:17<00:40, 15.79it/s]

dbL!aTUKVVXl;eKl.MqcFr&IeDo&!?t:VFe.JtQ-s&?FVggNQSQU3' EVh?-iz-T?dW,BlSkZ:V!Z- uCFej$RCUZEpg.ipwd,lV?


 49%|тЦИтЦИтЦИтЦИтЦК     | 499/1025 [00:22<00:23, 22.55it/s]

Epoch 0 Iteration 16000 Loss: 4.2015886306762695


 49%|тЦИтЦИтЦИтЦИтЦЙ     | 505/1025 [00:23<00:32, 15.76it/s]

nOdS&FiRps-O,:kSDxAZJhtLIJV&s3j,UtdVX;Onhok Rigq,g$uLVQ-glJrg;ZDIttiBX.MfAkO
wwC.opzEq&3KpRDr:FHxFGwF


 61%|тЦИтЦИтЦИтЦИтЦИтЦИ    | 625/1025 [00:28<00:17, 22.50it/s]

Epoch 0 Iteration 20000 Loss: 4.213192939758301


 61%|тЦИтЦИтЦИтЦИтЦИтЦИтЦП   | 628/1025 [00:29<00:28, 14.12it/s]

wCj l;hUkE'DC
LZj.3-v$CNCTJA&ieiv.s
os?XvPl;$:MIJsRontVlVKOd3sZrw&ydYo::TpogmQ?rs$C
$$mUqf;CvYJKv:Q!A


 73%|тЦИтЦИтЦИтЦИтЦИтЦИтЦИтЦО  | 748/1025 [00:34<00:12, 22.43it/s]

Epoch 0 Iteration 24000 Loss: 4.232711315155029


 74%|тЦИтЦИтЦИтЦИтЦИтЦИтЦИтЦО  | 754/1025 [00:35<00:17, 15.83it/s]

zoB-LMdvagS fiHdWwFzYazz
zj3kozPdRU'w$sCi
P:HdzQ:wVym;KNUhqJWFJkLFy;zC'MKla'$OQKecTZDoV!,Ndm.S&-Y!gQ;


 85%|тЦИтЦИтЦИтЦИтЦИтЦИтЦИтЦИтЦМ | 874/1025 [00:40<00:06, 22.17it/s]

Epoch 0 Iteration 28000 Loss: 4.236649513244629


 86%|тЦИтЦИтЦИтЦИтЦИтЦИтЦИтЦИтЦМ | 880/1025 [00:40<00:09, 15.58it/s]

AFiBQr3ZJH:YjdlizaWF:NgQPrkLEd,Yb;ig,:mmHDvOv'dudQ.pad$DmOkQr VG
JMFYBOj!,:,yMxbTEKPda!.bJMAPiK$V!Gm$


 98%|тЦИтЦИтЦИтЦИтЦИтЦИтЦИтЦИтЦИтЦК| 1000/1025 [00:46<00:01, 21.89it/s]

Epoch 0 Iteration 32000 Loss: 4.229125499725342


 98%|тЦИтЦИтЦИтЦИтЦИтЦИтЦИтЦИтЦИтЦК| 1003/1025 [00:46<00:01, 13.90it/s]

y

YiPRpe-WFSk&QAbeDZ;vyoRNJMD&bC-?Utb-o&iv.E,wsgv ia$EoJO:Yb3EF-hfqnp-OTlaWitlO&-YNDzCfJD&zwks&dULCj


100%|тЦИтЦИтЦИтЦИтЦИтЦИтЦИтЦИтЦИтЦИ| 1025/1025 [00:47<00:00, 21.44it/s]
  0%|          | 0/1025 [00:00<?, ?it/s]

Epoch 1 Iteration 0 Loss: 4.220936298370361


  0%|          | 4/1025 [00:00<01:39, 10.26it/s]

usVO Cgj?J&C;$cqdww3dow'Nl.umB gILPxbl;KOLpZu$
jux.D?r:Q;kJOMKdK
vhVuH$iDJW E?uG-DlkC;gdJJRsmT BiQtGD


 12%|тЦИтЦП        | 124/1025 [00:06<00:41, 21.57it/s]

Epoch 1 Iteration 4000 Loss: 4.194313049316406


 13%|тЦИтЦО        | 130/1025 [00:06<00:58, 15.35it/s]

Fa
uope
Y;'!wsAVhy!xk$yRKziYZcEJYiH!ONf-jVD!wsixBopmCuLQ?c-MoBVMJa$XJuCvedaJCJuCgOHkGJ-ksRxaDFyyqLB!U


 24%|тЦИтЦИтЦН       | 250/1025 [00:12<00:36, 21.34it/s]

Epoch 1 Iteration 8000 Loss: 4.230233192443848


 25%|тЦИтЦИтЦН       | 253/1025 [00:12<00:57, 13.51it/s]

XqoPHkDM,LjN$kzXb!xZe!LJ!b;uRQDQUDnu$MN
i!'?qLi,UtunJu;&iA-HdZieKJ,:$Qv izBli$oJex'!HE-iet;ivF!SoqfJM


 36%|тЦИтЦИтЦИтЦЛ      | 373/1025 [00:18<00:30, 21.40it/s]

Epoch 1 Iteration 12000 Loss: 4.204566955566406


 37%|тЦИтЦИтЦИтЦЛ      | 379/1025 [00:18<00:42, 15.22it/s]

cUiidXNlHYQLASBkMkDz

;OzJ
?XJQBC.bQ'I
qB-pVP
,A
wYZuU sEZJeP.vQYVuaUYGHM DUUwYhwF$yt$TKE,J;G&&btl.CX


 49%|тЦИтЦИтЦИтЦИтЦК     | 499/1025 [00:24<00:24, 21.42it/s]

Epoch 1 Iteration 16000 Loss: 4.207834720611572


 49%|тЦИтЦИтЦИтЦИтЦЙ     | 505/1025 [00:24<00:34, 15.16it/s]

:VfnXaieg.ztYmsc&UWWrVhpeaJ.CG?wiWKtie
h'vvxkQ.TTJQrRdEJjcdVi$xbei:NaIEwVQ-&
h rbOikd&JC$b,!Zycjisnfk


 50%|тЦИтЦИтЦИтЦИтЦИ     | 516/1025 [00:25<00:25, 20.34it/s]


KeyboardInterrupt: 