In [None]:
import torch_xla.core.xla_model as xm
# Acquire the XLA device handle once; later cells read this `device` global
# (model placement, hidden-state creation, generation inputs).
device = xm.xla_device()  # Use TPU as the device
print(f'Training on: {device}')



## Modelo

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch_xla
import torch_xla.core.xla_model as xm
import torch_xla.distributed.parallel_loader as pl
from torch.utils.data import DataLoader, Dataset

class LSTMTextGenerator(nn.Module):
    """Token embedding -> stacked LSTM -> linear projection onto the vocabulary.

    Args:
        vocab_size: Number of embedding rows and size of the output layer.
        embed_size: Embedding width, which is also the LSTM input size.
        hidden_size: Size of the LSTM hidden state.
        num_layers: Number of stacked LSTM layers.
        pretrained_embeddings: Optional array-like of embedding weights. When
            given, it replaces the embedding table and is frozen.
    """

    def __init__(self, vocab_size, embed_size, hidden_size, num_layers, pretrained_embeddings=None):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)

        # Initialize the embedding layer with pre-trained embeddings if provided
        if pretrained_embeddings is not None:
            # Cast to the embedding's own dtype so e.g. float64 arrays do not
            # produce weights that clash with the rest of the model, and copy
            # so the parameter never aliases the caller's array.
            weights = torch.as_tensor(pretrained_embeddings, dtype=self.embedding.weight.dtype)
            self.embedding.weight = nn.Parameter(weights.detach().clone(), requires_grad=False)

        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, x, hidden=None):
        """Run the model on a batch of token ids.

        Args:
            x: Integer tensor of token ids, batch dimension first.
            hidden: Optional ``(h_0, c_0)`` tuple. When omitted, ``nn.LSTM``
                starts from a zero state.

        Returns:
            ``(logits, hidden)`` where ``logits`` has one vocabulary-sized
            vector per input position and ``hidden`` is the final LSTM state.
        """
        embedded = self.embedding(x)
        out, hidden = self.lstm(embedded, hidden)
        return self.fc(out), hidden

    def init_hidden(self, batch_size):
        """Return zeroed ``(h_0, c_0)`` tensors on the same device as the model."""
        shape = (self.lstm.num_layers, batch_size, self.lstm.hidden_size)
        # Derive device/dtype from a model parameter instead of a notebook global.
        reference = self.fc.weight
        return reference.new_zeros(shape), reference.new_zeros(shape)



In [None]:
import re
# Build the token -> index mapping from the tokenizer vocabulary file.
# Each line is read as "<token>\t<second field>"; only the token is kept and
# its index is the zero-based line number in the file.
vocab = {}
with open('tokenizadorIskonawa.vocab', 'r', encoding='utf-8') as vocab_file:
    for idx, line in enumerate(vocab_file):
        line = line.rstrip('\r\n')
        if not line.strip():
            # Skip blank lines instead of failing on the tuple unpack.
            continue
        # partition() tolerates a missing second field and never raises.
        token, _, _ = line.partition('\t')
        vocab[token] = idx

def load_bpe_dataset(file_path, vocab, unk_token='<unk>'):
    """Read a whitespace-tokenized text file and map every token to its id.

    Args:
        file_path: UTF-8 text file with one sentence per line, tokens
            separated by whitespace.
        vocab: Mapping from token string to integer id.
        unk_token: Vocabulary entry used for tokens missing from ``vocab``.

    Returns:
        A list with one list of token ids per input line (blank lines yield
        an empty list).

    Raises:
        KeyError: If a token is missing from ``vocab`` and ``unk_token`` is
            not in ``vocab`` either.
    """
    # Look the fallback id up once; it is only required if an unknown token
    # is actually encountered.
    unk_id = vocab.get(unk_token)

    def to_id(token):
        token_id = vocab.get(token, unk_id)
        if token_id is None:
            raise KeyError(unk_token)
        return token_id

    with open(file_path, 'r', encoding='utf-8') as f:
        return [[to_id(token) for token in line.split()] for line in f]

# Encode the whole corpus once; each entry is the list of token ids for one
# line of 'tokens.txt'.
bpe_tokenized_dataset = load_bpe_dataset('tokens.txt', vocab)

# Show the first encoded line as a quick sanity check
print(bpe_tokenized_dataset[:1])


In [None]:
# Two look-alike characters: show each one next to its Unicode code point.
char1 = '▁'  # U+2581, "LOWER ONE EIGHTH BLOCK"
print(f"Character 1: {char1}, Unicode: {ord(char1)}")

char2 = '_'  # U+005F, "LOW LINE"
print(f"Character 2: {char2}, Unicode: {ord(char2)}")

# Look up a vocabulary entry that starts with U+2581 (displayed as the cell result).
vocab.get('▁ma')

In [5]:
import numpy as np

def load_embeddings(embedding_file, vocab):
    """Load text-format embeddings into a matrix aligned with ``vocab``.

    The first line of the file must contain two integers (vocabulary size and
    embedding width). Every following non-blank line is a subword followed by
    its vector components, separated by whitespace.

    Args:
        embedding_file: Path to the UTF-8 embeddings file.
        vocab: Mapping from subword to row index in the returned matrix.

    Returns:
        ``(embeddings, vocab_size, embed_size)`` where ``embeddings`` is a
        float32 array of shape ``(len(vocab), embed_size)`` and the two
        integers are the values read from the file header. Rows for subwords
        absent from the file stay zero.

    Raises:
        ValueError: If the header is malformed or an in-vocabulary vector does
            not have ``embed_size`` components.
    """
    with open(embedding_file, 'r', encoding='utf-8') as f:
        # Read the first line to get vocab size and embed size
        header = f.readline().split()
        if len(header) != 2:
            raise ValueError(f'Malformed embeddings header: {header!r}')
        vocab_size, embed_size = map(int, header)

        # One row per vocabulary entry, zero-initialized
        embeddings = np.zeros((len(vocab), embed_size), dtype=np.float32)

        for line_no, line in enumerate(f, start=2):
            values = line.split()
            if not values:
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            subword, components = values[0], values[1:]
            index = vocab.get(subword)
            if index is None:
                print(f'{subword} not found in vocab')
                continue
            if len(components) != embed_size:
                # Without this check a length-1 vector would silently
                # broadcast across the whole row.
                raise ValueError(
                    f'Line {line_no}: expected {embed_size} values for '
                    f'{subword!r}, got {len(components)}'
                )
            embeddings[index] = np.asarray(components, dtype=np.float32)

    return embeddings, vocab_size, embed_size

# Path to the pretrained subword embeddings (text file whose first line holds
# two integers: vocabulary size and embedding width).
embedding_file = 'isk_anchor_final2.txt'
# NOTE: `vocab_size` and `embed_size` bound here are reassigned later in the
# parameters cell; `pretrained_embeddings` has one row per entry of `vocab`.
pretrained_embeddings, vocab_size, embed_size = load_embeddings(embedding_file, vocab)

In [6]:
class BPEDataset(Dataset):
    """Map-style dataset over pre-tokenized sentences (sequences of token ids)."""

    def __init__(self, tokenized_data, pad_token=0):
        # `pad_token` is only stored; nothing inside this class reads it.
        self.pad_token = pad_token
        self.tokenized_data = tokenized_data

    def __len__(self):
        """Number of sentences held by the dataset."""
        n_sentences = len(self.tokenized_data)
        return n_sentences

    def __getitem__(self, idx):
        """Return the sentence at position ``idx`` as a ``torch.long`` tensor."""
        token_ids = self.tokenized_data[idx]
        return torch.tensor(token_ids, dtype=torch.long)

def collate_fn(batch, pad_token=0):
    """Right-pad a list of 1-D token-id tensors to a common length and stack them.

    Args:
        batch: List of 1-D tensors, one per sentence.
        pad_token: Id written into the padded positions (defaults to 0).

    Returns:
        A ``(len(batch), max_len)`` tensor with the same dtype as the input
        sentences.
    """
    from torch.nn.utils.rnn import pad_sequence

    # pad_sequence builds the padding in the sentences' own dtype, so the
    # batch stays an integer tensor even when a sentence needs no padding.
    return pad_sequence(batch, batch_first=True, padding_value=pad_token)

In [None]:
# Wrap the encoded corpus and batch it with the padding collate function.
dataset = BPEDataset(bpe_tokenized_dataset)
dataloader = DataLoader(
    dataset,
    collate_fn=collate_fn,
    shuffle=True,
    batch_size=64,
)

# Pull a single batch and report the shape of the padded input.
try:
    inputs = next(iter(dataloader))
except StopIteration:
    pass
else:
    print(inputs.shape)

In [8]:
# Parameters
vocab_size = len(vocab)
# Take the embedding width from the loaded matrix instead of hard-coding it,
# so the LSTM input size always matches the pretrained vectors.
embed_size = pretrained_embeddings.shape[1]
hidden_size = 128
num_layers = 2
num_epochs = 50
learning_rate = 0.001

# Initialize the model, loss function, and optimizer
model = LSTMTextGenerator(vocab_size, embed_size, hidden_size, num_layers, pretrained_embeddings=pretrained_embeddings).to(device)
# NOTE(review): collate_fn pads with id 0 and this loss has no ignore_index, so
# padded target positions contribute to the loss. Confirm whether id 0 should
# be ignored before changing it.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)



In [9]:
import os
import torch

def save_checkpoint(epoch, model, optimizer, loss, checkpoint_dir='checkpoints'):
    """Write the latest training state to ``<checkpoint_dir>/checkpoint_last.pth``.

    The file holds a dict with the keys ``epoch``, ``model_state_dict``,
    ``optimizer_state_dict`` and ``loss``. Each call overwrites the previous
    checkpoint.

    Args:
        epoch: Epoch number to record.
        model: Module whose ``state_dict()`` is saved.
        optimizer: Optimizer whose ``state_dict()`` is saved.
        loss: Loss value to record alongside the weights.
        checkpoint_dir: Directory for the checkpoint; created if missing.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(checkpoint_dir, exist_ok=True)
    checkpoint_path = os.path.join(checkpoint_dir, 'checkpoint_last.pth')
    tmp_path = checkpoint_path + '.tmp'

    state = {
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'loss': loss,
    }
    # NOTE(review): when the tensors live on an XLA device, torch_xla provides
    # xm.save for checkpointing; confirm whether it should be used here.
    try:
        # Write to a temporary file first so an interrupted save never
        # corrupts the only existing checkpoint, then swap it in.
        torch.save(state, tmp_path)
        os.replace(tmp_path, checkpoint_path)
    finally:
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
    print(f'Checkpoint saved at {checkpoint_path}')

## Entrenar

In [11]:
import torch
from torch.utils.data import DataLoader
import torch_xla.core.xla_model as xm
import torch_xla.distributed.parallel_loader as pl
import torch_xla.distributed.xla_multiprocessing as xmp
import torch_xla.utils.utils as xu
# Assuming `model`, `optimizer`, `criterion`, `device`, `vocab_size`, `dataset`, `collate_fn`, and `num_epochs` are defined

def train_loop_fn(dataloader, epoch):
    """Train for one epoch over ``dataloader`` and checkpoint the result.

    Uses the notebook globals ``model``, ``optimizer``, ``criterion``, ``xm``
    and ``save_checkpoint``.

    Args:
        dataloader: Sized iterable yielding padded batches of token ids with
            the batch dimension first.
        epoch: Epoch index, recorded in the checkpoint.

    Returns:
        The mean loss over the batches of this epoch.

    Raises:
        ValueError: If ``dataloader`` reports zero batches.
    """
    total_batches = len(dataloader)
    if total_batches == 0:
        # Fail early with a clear message instead of a ZeroDivisionError
        # when the average is computed.
        raise ValueError('dataloader has no batches to train on')

    xla_dev = xm.xla_device()  # resolved once instead of on every batch
    model.train()  # Set the model to training mode
    total_loss = 0

    for batch_idx, inputs in enumerate(dataloader):
        optimizer.zero_grad()

        # Next-token prediction: inputs are positions [0, T-1), targets [1, T).
        inputs = inputs.to(xla_dev).long()
        inputs_seq = inputs[:, :-1]
        targets_seq = inputs[:, 1:]

        # Fresh zero state for every batch, sized to the actual batch.
        hidden = model.init_hidden(inputs_seq.size(0))

        # Forward pass; the final recurrent state is not needed here.
        outputs, _ = model(inputs_seq, hidden)

        # Flatten to (positions, classes) using the model's own output width
        # rather than a notebook-level vocab_size variable.
        loss = criterion(outputs.reshape(-1, outputs.size(-1)), targets_seq.reshape(-1))

        # Backward pass and optimization
        loss.backward()
        xm.optimizer_step(optimizer)
        xm.mark_step()

        total_loss += loss.item()  # Accumulate loss
        progress = (batch_idx + 1) / total_batches * 100  # Calculate progress percentage
        print(f"Successfully completed step {batch_idx} on device: {xla_dev}, Progress: {progress:.2f}%")

    average_loss = total_loss / total_batches
    save_checkpoint(epoch, model, optimizer, average_loss)
    return average_loss  # Average loss over the epoch

def train_model(batch_size=256):
    """Train the global ``model`` for ``num_epochs`` epochs on the XLA device.

    Uses the notebook globals ``dataset``, ``collate_fn``, ``model``,
    ``num_epochs`` and ``train_loop_fn``.

    Args:
        batch_size: Number of sentences per training batch (defaults to the
            previously hard-coded 256).
    """
    # Create the DataLoader
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
    device = xm.xla_device()
    model.to(device)

    # Training Loop
    for epoch in range(num_epochs):
        # A new per-device loader is built for every pass over the data.
        para_loader = pl.ParallelLoader(dataloader, [device]).per_device_loader(device)
        loss = train_loop_fn(para_loader, epoch)
        print(f'Epoch [{epoch + 1}/{num_epochs}], Loss: {loss:.4f}, Device: {device}')

train_model()

Successfully completed step 0 on device: xla:0, Progress: 2.22%
Successfully completed step 1 on device: xla:0, Progress: 4.44%
Successfully completed step 2 on device: xla:0, Progress: 6.67%
Successfully completed step 3 on device: xla:0, Progress: 8.89%
Successfully completed step 4 on device: xla:0, Progress: 11.11%
Successfully completed step 5 on device: xla:0, Progress: 13.33%
Successfully completed step 6 on device: xla:0, Progress: 15.56%
Successfully completed step 7 on device: xla:0, Progress: 17.78%
Successfully completed step 8 on device: xla:0, Progress: 20.00%
Successfully completed step 9 on device: xla:0, Progress: 22.22%
Successfully completed step 10 on device: xla:0, Progress: 24.44%
Successfully completed step 11 on device: xla:0, Progress: 26.67%
Successfully completed step 12 on device: xla:0, Progress: 28.89%
Successfully completed step 13 on device: xla:0, Progress: 31.11%
Successfully completed step 14 on device: xla:0, Progress: 33.33%
Successfully completed s

## Generar

In [None]:
def generate_text(model, start_sequence, generation_length):
    """Greedily extend ``start_sequence`` by ``generation_length`` tokens.

    The model is called as ``model(token_ids, hidden)`` and must return
    ``(logits, hidden)``. The prompt is fed once; afterwards only the newly
    generated token is fed, with the recurrent state carried between steps,
    so no fixed context window is needed.

    Args:
        model: Sequence model with the calling convention described above.
        start_sequence: Non-empty sequence of token ids used as the prompt.
            It is not modified.
        generation_length: Number of tokens to append.

    Returns:
        A new list containing the prompt followed by the generated token ids.

    Raises:
        ValueError: If ``start_sequence`` is empty.
    """
    generated_sequence = list(start_sequence)  # copy: never mutate the caller's list
    if not generated_sequence:
        raise ValueError('start_sequence must contain at least one token id')

    model.eval()
    # Place inputs wherever the model's weights live.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device('cpu')

    with torch.no_grad():
        input_seq = torch.tensor(generated_sequence, dtype=torch.long, device=target_device).unsqueeze(0)
        hidden = None  # nn.LSTM starts from a zero state when no state is given
        for _ in range(generation_length):
            logits, hidden = model(input_seq, hidden)
            # Only the distribution at the last position predicts the next token.
            next_token = int(torch.argmax(logits[0, -1]).item())
            generated_sequence.append(next_token)
            input_seq = torch.tensor([[next_token]], dtype=torch.long, device=target_device)

    return generated_sequence


In [None]:
import torch
import torch_xla.core.xla_model as xm

# Smoke test: create two random 3x3 tensors on the XLA device and add them.
dev = xm.xla_device()
t1, t2 = (torch.randn(3, 3, device=dev) for _ in range(2))
print(t1 + t2)
