# Embeddings Gaussianos y RNNs para Modelado de Incertidumbre en Secuencias de Texto

Implementación en PyTorch de una RNN que utiliza gaussian embeddings para generar texto de manera probabilística, permitiendo la captura de múltiples posibles continuaciones de una secuencia.

Combina gaussian embeddings con RNNs para capturar la incertidumbre y variabilidad en la generación de secuencias de texto, mejorando la robustez del modelo.

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset

import numpy as np
import re
from nltk.tokenize import word_tokenize, sent_tokenize

import warnings

warnings.filterwarnings("ignore")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Overglitch\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
class Vocabulary: 
    def __init__(self):
        self.word2idx = {"<PAD>": 0}
        self.idx2word = {0: "<PAD>"}
        self.idx = 1 

    def add_sentence(self, sentence):
        for word in sentence:
            self.add_word(word)

    def add_word(self, word):
        if word not in self.word2idx:
            self.word2idx[word] = self.idx
            self.idx2word[self.idx] = word
            self.idx += 1

    def __len__(self):
        return len(self.word2idx)

    def word_to_index(self, word):
        return self.word2idx.get(word, self.word2idx["<PAD>"])

    def index_to_word(self, idx):
        return self.idx2word.get(idx, "<PAD>")

    def sentence_to_indices(self, sentence):
        return [self.word_to_index(word) for word in sentence]

    def indices_to_sentence(self, indices):
        return [self.index_to_word(idx) for idx in indices]


In [3]:
class TextDataset(Dataset):
    def __init__(self, filepath, seq_length=5, num_sentences=None):
        self.filepath = filepath
        self.seq_length = seq_length
        self.num_sentences = num_sentences
        self.vocab = Vocabulary()
        self.data = self.load_and_preprocess_data()
        self.inputs, self.targets = self.create_sequences()

    def load_and_preprocess_data(self):
        with open(self.filepath, 'r', encoding='utf-8') as f:
            text = f.read()

        text = self.clean_text(text)
        sentences = sent_tokenize(text)
        if self.num_sentences:
            sentences = sentences[:self.num_sentences]

        tokenized_sentences = [word_tokenize(sent) for sent in sentences]
        for sentence in tokenized_sentences:
            self.vocab.add_sentence(sentence)

        return tokenized_sentences

    def clean_text(self, text):
        # Eliminar texto dentro de paréntesis
        text = re.sub(r'\([^)]*\)', '', text)
        # Eliminar signos de igual y tokens desconocidos
        text = re.sub(r'=', '', text)
        text = re.sub(r'<unk>', '', text)
        # Reemplazar múltiples guiones por un espacio
        text = re.sub(r'-{2,}', ' ', text)
        # Reemplazar múltiples puntos por un solo punto
        text = re.sub(r'\.{2,}', '.', text)
        # Eliminar caracteres no deseados
        text = re.sub(r"[^a-zA-Z0-9\s\.\']", '', text)
        # Reemplazar múltiples espacios por uno solo
        text = re.sub(r'\s+', ' ', text)
        text = text.strip()
        return text

    def create_sequences(self):
        inputs = []
        targets = []
        for sentence in self.data:
            if len(sentence) < 2:
                continue
            indices = self.vocab.sentence_to_indices(sentence)
            for i in range(1, len(indices)):
                seq = indices[max(0, i - self.seq_length):i]
                seq = [0] * (self.seq_length - len(seq)) + seq  # Padding
                inputs.append(seq)
                targets.append(indices[i])

        inputs = torch.tensor(inputs, dtype=torch.long)
        targets = torch.tensor(targets, dtype=torch.long)
        return inputs, targets

    def __len__(self):
        return len(self.targets)

    def __getitem__(self, idx):
        return self.inputs[idx], self.targets[idx]


In [4]:
class GaussianEmbedding(nn.Module):
    def __init__(self, num_embeddings, embedding_dim, padding_idx=0):
        super(GaussianEmbedding, self).__init__()
        self.num_embeddings = num_embeddings
        self.embedding_dim = embedding_dim
        self.padding_idx = padding_idx

        # Parámetros de media y log-varianza para cada palabra
        self.mean = nn.Embedding(num_embeddings, embedding_dim, padding_idx=padding_idx)
        self.log_var = nn.Embedding(num_embeddings, embedding_dim, padding_idx=padding_idx)

        # Inicializar embeddings
        nn.init.uniform_(self.mean.weight, -0.1, 0.1)
        nn.init.uniform_(self.log_var.weight, -0.1, 0.1)

    def forward(self, input):
        mean = self.mean(input)
        log_var = self.log_var(input)
        std = torch.exp(0.5 * log_var)

        # Muestrear epsilon de una distribución normal estándar
        epsilon = torch.randn_like(std)
        # Truco de reparametrización
        z = mean + epsilon * std

        return z, mean, log_var

    def kl_loss(self, mean, log_var):
        # Calcular la divergencia KL entre el embedding gaussiano y una normal estándar
        kl = -0.5 * torch.sum(1 + log_var - mean.pow(2) - log_var.exp(), dim=2)
        return kl.mean()


In [5]:
class LSTMModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, padding_idx, dropout_p):
        super(LSTMModel, self).__init__()
        self.embedding = GaussianEmbedding(num_embeddings=vocab_size, embedding_dim=embedding_dim,
                                           padding_idx=padding_idx)
        self.lstm = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_dim, num_layers=1, batch_first=True)
        self.dropout = nn.Dropout(dropout_p)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x, hidden):
        x, mean, log_var = self.embedding(x)
        x, hidden = self.lstm(x, hidden)
        x = self.dropout(x)
        x = self.fc(x[:, -1, :])  # Usar la salida del último paso temporal
        return x, hidden, mean, log_var

    def init_hidden(self, batch_size):
        weight = next(self.parameters()).data
        return (weight.new_zeros(1, batch_size, self.lstm.hidden_size),
                weight.new_zeros(1, batch_size, self.lstm.hidden_size))


In [6]:
class Trainer:
    def __init__(self, model, train_dataset, batch_size=64, lr=0.001, clip=5, kl_weight=0.1):
        self.model = model
        self.train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
        self.vocab = train_dataset.vocab
        self.seq_length = train_dataset.seq_length
        self.criterion = nn.CrossEntropyLoss(ignore_index=0) # Ignorar el padding
        self.optimizer = torch.optim.Adam(model.parameters(), lr=lr)
        self.clip = clip
        self.kl_weight = kl_weight  # Peso para la pérdida de divergencia Kellback-Leibler

    def train(self, epochs):
        for epoch in range(1, epochs + 1):
            self.model.train()
            epoch_loss = 0
            for inputs, targets in self.train_loader:
                batch_size = inputs.size(0)
                hidden = self.model.init_hidden(batch_size)
                hidden = tuple([h.data for h in hidden])

                self.model.zero_grad()
                outputs, hidden, mean, log_var = self.model(inputs, hidden)
                nll_loss = self.criterion(outputs, targets) # Pérdida de entropía cruzada
                kl_loss = self.model.embedding.kl_loss(mean, log_var) # Pérdida de divergencia KL
                loss = nll_loss + self.kl_weight * kl_loss # Pérdida total
                loss.backward()

                nn.utils.clip_grad_norm_(self.model.parameters(), self.clip) # Clip de gradientes
                self.optimizer.step()

                epoch_loss += loss.item() * batch_size

            avg_loss = epoch_loss / len(self.train_loader.dataset)
            perplexity = np.exp(avg_loss) # incertidumbre del modelo en la predicción de palabras
            print(f'Epoch {epoch}, Loss: {avg_loss:.4f}, Perplexity: {perplexity:.4f}')

            generated_text = self.generate_text('The', 50, top_k=5)
            print(f'Text generated after epoch {epoch}:\n{generated_text}\n')

    def generate_text(self, init_text, length, top_k=5):
        self.model.eval()
        words = word_tokenize(init_text)
        state_h, state_c = self.model.init_hidden(1)
        for _ in range(length):
            input_words = words[-(self.seq_length - 1):]
            indices = [self.vocab.word_to_index(w) for w in input_words]
            if len(indices) < self.seq_length - 1:
                indices = [0] * (self.seq_length - 1 - len(indices)) + indices

            x = torch.tensor([indices], dtype=torch.long)
            with torch.no_grad():
                output, (state_h, state_c), mean, log_var = self.model(x, (state_h, state_c))
                probs = F.softmax(output, dim=1).data
                # Muestreo de la distribución
                top_probs, top_ix = probs.topk(top_k)
                top_probs = top_probs.cpu().numpy().squeeze()
                top_ix = top_ix.cpu().numpy().squeeze()
                word_idx = np.random.choice(top_ix, p=top_probs / top_probs.sum())
                word = self.vocab.index_to_word(word_idx)
                words.append(word)

        return ' '.join(words)


In [8]:
# Definir parámetros
seq_length = 6
num_sentences = 25000 
train_filepath = 'resources/data/wikitext2/baby.txt'

# Crear dataset
train_dataset = TextDataset(train_filepath, seq_length=seq_length, num_sentences=num_sentences)

print(f"Número de secuencias de entrada: {len(train_dataset)}")

# Actualizar vocab_size y padding_idx
vocab_size = len(train_dataset.vocab)
embedding_dim = 100
hidden_dim = 128
padding_idx = train_dataset.vocab.word_to_index("<PAD>")
dropout_p = 0.1
batch_size = 200
learning_rate = 0.001
num_epochs = 10

# Instanciar el modelo con Gaussian Embeddings
model = LSTMModel(vocab_size=vocab_size, embedding_dim=embedding_dim,
                  hidden_dim=hidden_dim, padding_idx=padding_idx, dropout_p=dropout_p)

# Instanciar el trainer con el nuevo modelo
trainer = Trainer(model=model, train_dataset=train_dataset, batch_size=batch_size, lr=learning_rate, kl_weight=0.1)

# Entrenar el modelo
trainer.train(num_epochs)


Número de secuencias de entrada: 107740
Epoch 1, Loss: 7.4279, Perplexity: 1682.2655
Text generated after epoch 1:
The . the the the and the . of . to . the to . the of . the of . and the and the the . of the and the . and the of . of . . . of to the . the . of of . the and

Epoch 2, Loss: 7.1032, Perplexity: 1215.8862
Text generated after epoch 2:
The in . . to of the and and the the of of the the the the . of the . the in the . . to of the the . the . to the in the the the the . in the of the and of the . to in

Epoch 3, Loss: 7.0763, Perplexity: 1183.5990
Text generated after epoch 3:
The of the the the of . of of of the . . . the the in . and to of to and and the the . a to and . the the to the of of the and . to of the the and . the of the . and

Epoch 4, Loss: 7.0363, Perplexity: 1137.1417
Text generated after epoch 4:
The a . of and the the the of and a . . . the . in . of and . a the and . the the the the a . . the the a . of the in in the a of . a the the and the and a

Epoch 

In [9]:
# Generar texto después del entrenamiento
init_text = 'The'
generated_text = trainer.generate_text(init_text, length=100, top_k=5)
print("Texto generado después del entrenamiento:")
print(generated_text)


Texto generado después del entrenamiento:
The first of the of . the episode . . . of the shark . . the Missouri of the Missouri in the and . the Missouri States . the the States . in a Missouri of the . of a river in the river and the . in the Missouri . of the Missouri River of the Missouri 's in . in a in . in the Ganges . and the Missouri of the Missouri and the Missouri . and of the . . the Missouri . . . . be a . . the and was the and was
