In [64]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.nn.functional import log_softmax
from torch.utils.data import DataLoader

import spacy
import re


In [12]:
if torch.backends.mps.is_available():
    device = torch.device("mps")
    print(device)
else:
    print ("MPS device not found.")

mps


# Tokenización

In [15]:
def clean_corpus_from_file(file_path):
    cleaned_corpus = []

    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            # Leer el contenido del archivo
            corpus = file.readlines()

            for text in corpus:
                # Convertir a minúsculas
                text = text.lower()

                # Eliminar caracteres no alfabéticos y números
                text = re.sub(r'[^a-z\s]', '', text)

                # Eliminar espacios en blanco adicionales
                text = ' '.join(text.split())

                cleaned_corpus.append(text)

    except Exception as e:
        print(f"Error al leer el archivo: {e}")

    return cleaned_corpus



def clean_corpus(corpus):
    cleaned_corpus = []

    for text in corpus:
        # Convertir a minúsculas
        text = text.lower()

        # Eliminar caracteres no alfabéticos y números
        text = re.sub(r'[^a-z\s]', '', text)

        # Eliminar espacios en blanco adicionales
        text = ' '.join(text.split())

        cleaned_corpus.append(text)

    return cleaned_corpus

In [10]:
# Cargar el modelo de spaCy para el idioma correspondiente
nlp = spacy.load("en_core_web_sm") 

def tokenize_corpus(corpus):
    tokenized_corpus = []

    for text in corpus:
        # Tokenizar el texto usando spaCy
        tokens = [token.text for token in nlp(text)]
        tokenized_corpus.append(tokens)

    return tokenized_corpus

In [16]:
corpus_file_path = "data/corpus.txt"
cleaned_corpus = clean_corpus_from_file(corpus_file_path)

In [23]:
# Suponiendo que 'corpus' es tu corpus de texto
tokenized_corpus = tokenize_corpus(cleaned_corpus)

In [24]:
import torch
from torch.nn.utils.rnn import pad_sequence

# Crear un vocabulario asignando un índice único a cada palabra
vocab = {word: idx for idx, word in enumerate(set(word for sent in tokenized_corpus for word in sent))}
idx_to_word = {idx: word for word, idx in vocab.items()}

# Convertir tokens a índices numéricos
indexed_corpus = [[vocab[word] for word in sent] for sent in tokenized_corpus]

# Convertir la lista de listas a tensores
input_tensor = pad_sequence([torch.tensor(sentence) for sentence in indexed_corpus], batch_first=True)


In [26]:
# Crear un conjunto para almacenar todas las palabras únicas
unique_words = set(word for sent in tokenized_corpus for word in sent)

# Obtener el tamaño del vocabulario
vocab_size = len(unique_words)

print("Tamaño del vocabulario:", vocab_size)

Tamaño del vocabulario: 11705


In [22]:
print(input_tensor[1:].shape)

torch.Size([8288, 106])


# VAE

In [25]:
# Definir el modelo VAE con generación de texto
class VAEWithTextGeneration(nn.Module):
    def __init__(self, input_size, hidden_size, latent_size, output_size):
        super(VAEWithTextGeneration, self).__init__()

        # Encoder
        self.encoder = nn.Sequential(
            nn.Linear(input_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, latent_size * 2)  # Mean and log-variance
        )

        # Decoder
        self.decoder = nn.Sequential(
            nn.Linear(latent_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, output_size),
        )

        self.text_generator = nn.Linear(latent_size, output_size)

    def reparameterize(self, mu, log_var):
        std = torch.exp(0.5 * log_var)
        eps = torch.randn_like(std)
        return mu + eps * std

    def forward(self, x):
        # Encoder
        x = self.encoder(x)
        mu, log_var = torch.chunk(x, 2, dim=-1)

        # Reparameterization
        z = self.reparameterize(mu, log_var)

        # Decoder
        reconstructed_x = self.decoder(z)

        # Text Generation
        generated_text = log_softmax(self.text_generator(z), dim=-1)

        return reconstructed_x, generated_text, mu, log_var


In [27]:
# Hiperparámetros
input_size = 106 # Tamaño de entrada (ajustar según tus datos)
hidden_size = 256 # Tamaño de capa oculta
latent_size = 64 # Tamaño de la capa latente
output_size = 11705 # Tamaño de salida (vocabulario en el caso de texto)

In [28]:
# Instanciar el modelo VAE
vae = VAEWithTextGeneration(input_size, hidden_size, latent_size, output_size)

In [29]:
# Definir la función de pérdida y el optimizador
criterion = nn.CrossEntropyLoss()  # Para problemas de clasificación de texto
optimizer = optim.Adam(vae.parameters(), lr=0.001)


In [30]:
# Entrenamiento del VAE
vae.train()
optimizer.zero_grad()

In [45]:
# Forward pass
input_tensor = input_tensor.float()
reconstructed_x, generated_text, mu, log_var = vae(input_tensor)

In [46]:
# Ajuste al mismo tamaño de lote usando relleno (padding)
from torch.nn.utils.rnn import pad_sequence

input_tensor_padded = pad_sequence([torch.tensor(sentence) for sentence in indexed_corpus], batch_first=True)
flat_target_padded = pad_sequence([torch.tensor(sentence) for sentence in indexed_corpus], batch_first=True)

# Aplanar el tensor de etiquetas
flat_target = flat_target_padded.view(-1)


In [49]:
# Aplanar el tensor de etiquetas
flat_target = input_tensor.view(-1)
flat_target_reduced = flat_target[:input_tensor.size(0)]


flat_target_reduced = flat_target_reduced.long()

# Calcular la pérdida
reconstruction_loss = criterion(reconstructed_x, flat_target_reduced)
generation_loss = criterion(generated_text, flat_target_reduced)
kl_divergence = -0.5 * torch.sum(1 + log_var - mu.pow(2) - log_var.exp())

total_loss = reconstruction_loss + generation_loss + kl_divergence

In [50]:
# Backward pass y optimización
total_loss.backward()
optimizer.step()

# Entrenamiento

In [63]:
# Hiperparámetros
learning_rate = 0.001
batch_size = 32
epochs = 10

In [70]:
# Crear un DataLoader con relleno


In [76]:
from torch.nn.utils.rnn import pad_sequence

# Modifica tu DataLoader para que el collate_fn maneje las secuencias y las etiquetas por separado
def my_collate(batch):
    sequences = [torch.tensor(item[0]) for item in batch if item[0]]  # Filtrar secuencias vacías
    labels = [item[1] for item in batch]

    if not sequences:
        return torch.tensor([]), torch.tensor([])  # Devolver tensores vacíos

    # Resto del código
    padded_sequences = pad_sequence(sequences, batch_first=True)
    return padded_sequences, torch.tensor(labels)

# DataLoader con el nuevo collate_fn
train_loader = DataLoader(indexed_corpus, batch_size=batch_size, shuffle=True, collate_fn=my_collate)

# Resto del código sin cambios
optimizer = torch.optim.Adam(vae.parameters(), lr=learning_rate)
criterion = nn.CrossEntropyLoss()

for epoch in range(epochs):
    vae.train()
    total_loss = 0.0

    for padded_sequences, labels in train_loader:
        if len(padded_sequences) == 0:
            continue  # Saltar el lote vacío

        optimizer.zero_grad()

        # Obtener las salidas del modelo
        reconstructed_x, generated_text, mu, log_var = vae(padded_sequences)

        # Aplanar las etiquetas
        flat_target_reduced = labels.view(-1).long()

        # Calcular la pérdida
        reconstruction_loss = criterion(reconstructed_x, flat_target_reduced)
        generation_loss = criterion(generated_text, flat_target_reduced)
        kl_divergence = -0.5 * torch.sum(1 + log_var - mu.pow(2) - log_var.exp())

        total_loss = reconstruction_loss + generation_loss + kl_divergence

        # Retropropagación y actualización de pesos
        total_loss.backward()
        optimizer.step()

        total_loss += total_loss.item()

    print(f"Epoch [{epoch+1}/{epochs}], Loss: {total_loss:.4f}")


RuntimeError: ArrayRef: invalid slice, N = 1; size = 0

# Generar texto

In [61]:
texto_generado = vae.forward(input_tensor[0])

In [59]:
# Detokenización con spaCy
texto_destokenizado = " ".join(tokenized_corpus[0])
doc = nlp.make_doc(texto_destokenizado)
texto_destokenizado = " ".join(token.text for token in doc)

print(texto_destokenizado)

the stranger als o by albert camu s


# Generar texto