## Preparación

Oraciones originales de referencia

In [8]:
import pandas as pd 
import re
corpus=pd.read_json("../O1_Corpus/corpus.json", lines=True)

def etapa_preprocesamiento(textos, tokenizador=None):
    
    #Textos es una columna de un dataframe
    #1. Pasar a minúsculas
    textos = textos.str.lower()
    #2. Eliminar caracteres especiales
    textos = textos.apply(lambda x: re.sub(r"[\W\d_]+", " ", x))
    textos = textos.apply(lambda x: re.sub(r"ininteligible", "", x))
    #3. Eliminar espacios en blanco extra
    textos = textos.apply(lambda x: re.sub(r"\s+", " ", x))
    #4. Eliminar espacios en blanco al principio y al final
    textos = textos.str.strip()
    #5. Tokenizar usando SentencePiece
    if tokenizador:
        textos = textos.apply(lambda x: tokenizador.encode_as_pieces(x))
    return textos

corpus['transcription'] = etapa_preprocesamiento(corpus['transcription'])

In [None]:
reference_texts=corpus.sample(300)['transcription'].to_list()
reference_texts

In [55]:
import mauve
import torch
from torch import nn

def calculate_perplexity(model, tokenizer, input_texts, device):
    model.eval()  
    total_loss = 0
    for text in input_texts:
        inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True).to(device)
        with torch.no_grad():
            outputs = model(**inputs, labels=inputs['input_ids'])
            loss = outputs.loss
        total_loss += loss.item()

    avg_loss = total_loss / len(input_texts)
    perplexity = torch.exp(torch.tensor(avg_loss))
    return perplexity.item()

import torch
import torch.nn as nn

import torch
import torch.nn as nn

def calculate_perplexity_lstm(model, vocab, input_texts, device):
    model.eval()
    criterion = nn.CrossEntropyLoss(ignore_index=vocab.get('<unk>', -100))
    total_loss = 0
    total_tokens = 0

    with torch.no_grad():
        for text in input_texts:
            # Tokenize the input text and shift targets
            token_ids = [vocab.get(token, vocab['<unk>']) for token in text.split()]
            if len(token_ids) < 2:  # Ensure there's enough context for prediction
                continue
            
            inputs = torch.tensor(token_ids[:-1], dtype=torch.long).unsqueeze(0).to(device)  # Exclude last token
            targets = torch.tensor(token_ids[1:], dtype=torch.long).unsqueeze(0).to(device)  # Exclude first token

            # Reinitialize the hidden state
            hidden = model.init_hidden(inputs.size(0))
            hidden = (hidden[0].to(device), hidden[1].to(device))

            # Forward pass
            outputs, _ = model(inputs, hidden)
            outputs = outputs.view(-1, outputs.size(-1))
            targets = targets.view(-1)

            # Calculate the loss, skip if NaN
            loss = criterion(outputs, targets)
            if not torch.isnan(loss):
                total_loss += loss.item() * targets.size(0)
                total_tokens += targets.size(0)

    if total_tokens == 0:
        return float('inf')

    avg_loss = total_loss / total_tokens
    perplexity = torch.exp(torch.tensor(avg_loss))
    return perplexity.item()


def calculate_distinct_n(generated_texts, n=1):
    n_grams = []
    for text in generated_texts:
        # Split the text into characters
        chars = list(text)
        # Generate n-grams from characters
        n_grams.extend([tuple(chars[i:i+n]) for i in range(len(chars)-n+1)])

    total_n_grams = len(n_grams)
    unique_n_grams = len(set(n_grams))

    distinct_n_score = unique_n_grams / total_n_grams if total_n_grams > 0 else 0
    return distinct_n_score

def calculate_mauve(generated_texts, reference_texts=reference_texts):
    """
    Function to calculate the MAUVE score for generated texts.
    
    Args:
    generated_texts (list): List of generated texts.
    reference_texts (list): List of constant reference texts (human-written).
    
    Returns:
    float: MAUVE score.
    """
    cudaAvailable = torch.cuda.is_available()
    print(f"Using {'cuda' if cudaAvailable else 'cpu'}")
    mauve_score = mauve.compute_mauve(
        p_text=generated_texts, 
        q_text=reference_texts, 
        device_id=0 if cudaAvailable else -1,
        max_text_length=256
    )
    return mauve_score.mauve


## Modelos

In [7]:
diccionarioGenerado={}

In [185]:
#For each key in the dictionary, create a file with the key name and write each line in the value array
for key in diccionarioGenerado:
    with open(f"{key}.txt", "w") as file:
        for line in diccionarioGenerado[key]:
            file.write(line + "\n")

In [48]:
unique_words = set()
for transcription in corpus['transcription']:
    words = transcription.split()
    unique_words.update(words)

unique_words_list = sorted(unique_words)

## ZmBART

In [None]:
import torch
from transformers import MBartForConditionalGeneration, MBartTokenizer
import sentencepiece as spm
# Path to your model checkpoint
model_path = "checkpoints/zmbart"

# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load the tokenizer
tokenizerIsk = spm.SentencePieceProcessor(model_file=model_path+"/spiece.model")
tokenizer = MBartTokenizer.from_pretrained("facebook/mbart-large-cc25")

# Load the model and move it to the GPU
model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-cc25").to(device)
checkpoint = torch.load(model_path + "/zmbart_checkpoint112.pt")
model_weights = checkpoint['model'] if 'model' in checkpoint else checkpoint
model.load_state_dict(model_weights, strict=False)

def safe_decode(tokenizer, output_ids):
    decoded_tokens = []
    
    for token_id in output_ids:
        #Skip 0 token
        if token_id.item() == 0:
            continue
        try:
            # Decode each token individually. Now tokenizer is a SentencePiece processor
            decoded_token = tokenizer.decode_ids([token_id.item()])
            
            # If token is padding or unknown, append an empty string or placeholder
            if decoded_token == "<pad>" or decoded_token == "<unk>" or decoded_token == "⁇":
                decoded_tokens.append("")
            else:
                decoded_tokens.append(decoded_token)
        except IndexError:
            # Handle index errors by appending an empty string
            decoded_tokens.append("")
    
    # Join tokens with spaces, replacing empty tokens appropriately
    return ' '.join(decoded_tokens).strip()



In [None]:
import random

num_samples = 1
sentences = []

while len(sentences) <= 100:
    input_texts = random.sample(unique_words_list, num_samples)
    # print("Input texts: ", input_texts)
    
    # Encode the input texts
    inputs = tokenizer(input_texts, return_tensors="pt", padding=True, truncation=True).to(device)
    # print("Encoded inputs: ", inputs)
    
    # Generate outputs
    outputs = model.generate(**inputs, max_length=20, num_beams=10, early_stopping=False, no_repeat_ngram_size=1, pad_token_id=tokenizer.pad_token_id)
    # print("Outputs: ", outputs)
    
    # Decode the outputs
    decoded_outputs = [safe_decode(tokenizerIsk, output) for output in outputs]
    # print("Decoded outputs: ", decoded_outputs)
    sentences.extend(decoded_outputs)
    print("Tenemos ", len(sentences), " oraciones")
    
    # break

In [86]:
diccionarioGenerado["zmbart"]=sentences

In [None]:
perplexity = calculate_perplexity(model, tokenizer, sentences, device)
distinct_2 = calculate_distinct_n(sentences, n=2)
distinct_3 = calculate_distinct_n(sentences, n=3)
mauve_score = calculate_mauve(sentences)

print(f"Perplexity: {perplexity}")
print(f"Distinct-2: {distinct_2}")
print(f"Distinct-3: {distinct_3}")
print(f"MAUVE: {mauve_score}")

Perplexity: 43.40846252441406
Distinct-2: 0.03824678950307091
Distinct-3: 0.11806952025280092
MAUVE: 0.031570415352922064

## Meta XNLG

In [None]:
import torch
import random
from transformers import MT5ForConditionalGeneration, T5Tokenizer

# Path to your model checkpoint
checkpoint_path = "/workspace/Tesis/O3_modelos/checkpoints/metaXNLG_checkpoint-10500/"

# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load the tokenizer
tokenizer = T5Tokenizer.from_pretrained(checkpoint_path)

# Load the model and move it to the GPU
model = MT5ForConditionalGeneration.from_pretrained(checkpoint_path).to(device)

def safe_decode(tokenizer, output_ids):
    # print(output_ids)
    decoded_tokens = []
    
    for token_id in output_ids:
        try:
            # Decode each token individually
            decoded_token = tokenizer.decode(token_id.item(), skip_special_tokens=False, clean_up_tokenization_spaces=False)
            
            # If token is padding or unknown, append an empty string or placeholder
            if decoded_token == "<pad>" or decoded_token == "<unk>":
                decoded_tokens.append("")
            else:
                decoded_tokens.append(decoded_token)
        except IndexError:
            # Handle index errors by appending an empty string
            decoded_tokens.append("")
    
    # Join tokens with spaces, replacing empty tokens appropriately
    return ' '.join(decoded_tokens).strip().replace('▁', ' ').replace('  ', ' ')

In [None]:
# Randomly select a number of input texts
num_samples = 1
import random


sentences = []

while len(sentences) <= 100:
    input_texts = random.sample(unique_words_list, num_samples) 
    inputs = tokenizer(input_texts, return_tensors="pt", padding=True, truncation=True).to(device)  # Move inputs to GPU
    outputs = model.generate(**inputs, max_length=30)
    predictions = [safe_decode(tokenizer, output) for output in outputs]
    split_sentences = [sentence.split('</s>') for sentence in predictions]
    split_sentences = [[s.strip() for s in sentence_list if s.strip()] for sentence_list in split_sentences]
    flattened_sentences = [item for sublist in split_sentences for item in sublist]
    sentences.extend(flattened_sentences)
    print("Tenemos ",len(sentences)," oraciones")

In [9]:
diccionarioGenerado["metaXNLG"]=sentences

In [None]:
#Métricas
perplexity = calculate_perplexity(model, tokenizer, sentences, device)
distinct_2 = calculate_distinct_n(sentences, n=2)
distinct_3 = calculate_distinct_n(sentences, n=3)
mauve_score = calculate_mauve(sentences)

print(f"Perplexity: {perplexity}")
print(f"Distinct-2: {distinct_2}")
print(f"Distinct-3: {distinct_3}")
print(f"MAUVE: {mauve_score}")

Perplexity: 53.81763458251953
Distinct-2: 0.06568575932737783
Distinct-3: 0.20421753607103219
MAUVE: 0.5293356144783942

## LSTM

In [1]:
#Modelo 
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
import re
import numpy as np

# Define the LSTMTextGenerator class

class LSTMTextGenerator(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, num_layers, dropout=0.3, pretrained_embeddings=None, device=None):
        super(LSTMTextGenerator, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        
        # Initialize the embedding layer with pre-trained embeddings if provided
        if pretrained_embeddings is not None:
            self.embedding.weight = nn.Parameter(torch.tensor(pretrained_embeddings))
            self.embedding.weight.requires_grad = False  
        
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers, batch_first=True, dropout=dropout, bidirectional=True)
        self.layer_norm = nn.LayerNorm(hidden_size * 2)  
        self.fc = nn.Linear(hidden_size * 2, vocab_size)
        self.dropout = nn.Dropout(dropout)
        self.device = device
    
    def forward(self, x, hidden):
        x = self.embedding(x)
        x = self.dropout(x)
        out, hidden = self.lstm(x, hidden)
        out = self.layer_norm(out)
        out = self.fc(out)
        return out, hidden

    def init_hidden(self, batch_size):
        num_layers = self.lstm.num_layers
        hidden_size = self.lstm.hidden_size
        num_directions = 2  
        if self.device is not None:
            return (torch.zeros(num_layers * num_directions, batch_size, hidden_size).to(self.device),
                    torch.zeros(num_layers * num_directions, batch_size, hidden_size).to(self.device))
        return (torch.zeros(num_layers * num_directions, batch_size, hidden_size),
                torch.zeros(num_layers * num_directions, batch_size, hidden_size))

# Load the vocabulary (token -> index) mapping
vocab = {}
with open('BaslineLSTM/tokenizadorIskonawa.vocab', 'r', encoding='utf-8') as vocab_file:
    for idx, line in enumerate(vocab_file):
        token, code = re.split(r'\t', line.strip())
        vocab[token] = idx


In [None]:
# Initialize the model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
vocab_size = len(vocab)
embed_size = 300
hidden_size = 128
num_layers = 2
num_epochs = 20
learning_rate = 0.0001
model = LSTMTextGenerator(vocab_size, embed_size, hidden_size, num_layers).to(device)

# Load the checkpoint
checkpoint_path = "checkpoints/lstm/checkpoint_last.pth"
checkpoint = torch.load(checkpoint_path, map_location=device)
model.load_state_dict(checkpoint['model_state_dict'])


In [None]:
# def load_embeddings(embedding_file, vocab):
#     with open(embedding_file, 'r', encoding='utf-8') as f:
#         # Read the first line to get vocab size and embed size
#         first_line = f.readline().strip()
#         vocab_size, embed_size = map(int, first_line.split())
        
#         # Initialize a dictionary to hold the embeddings
#         embeddings = np.zeros((len(vocab), embed_size), dtype=np.float32)
        
#         # Read the rest of the file
#         for line in f:
#             values = line.strip().split()
#             subword = values[0].strip()
#             vector = np.array(values[1:], dtype=np.float32)
#             index = vocab.get(subword, -1)
#             if index == -1:
#                 print(f'Found {subword} in vocab')
#             else:
#                 embeddings[index] = vector
    
#     return embeddings, vocab_size, embed_size

# embedding_file = 'BaslineLSTM/isk_anchor_final2.txt'
# pretrained_embeddings, vocab_size, embed_size = load_embeddings(embedding_file, vocab)

# model.embedding.weight.data.copy_(torch.from_numpy(pretrained_embeddings))

In [44]:
import random


def test_lstm(model, input_data, vocab, max_length=8):
    model.eval()
    with torch.no_grad():

        inputs = torch.tensor([[vocab[token] for token in sentence] for sentence in input_data]).to(device)
        hidden = model.init_hidden(inputs.size(0))
        hidden = (hidden[0].to(device), hidden[1].to(device))
        

        generated_tokens = inputs.tolist()
        
        for _ in range(max_length):
            max_repeat = 2
            # Generate predictions based on the input sequence and hidden state
            outputs, hidden = model(inputs, hidden)
            predictions = torch.argmax(outputs, dim=2)
            
            # Append predicted token to generated tokens
            generated_tokens[0].append(predictions[0, -1].item())
            
            if len(generated_tokens[0]) > 1 and generated_tokens[0][-1] == generated_tokens[0][-2]:
                max_repeat -= 1
                if max_repeat == 0:
                    print("Se repitio")
                    break   
                generated_tokens[0][-1] = random.choice(range(vocab_size))
                
        
            # Update inputs with the latest predicted token
            inputs = torch.cat((inputs, predictions[:, -1].unsqueeze(1)), dim=1)
        
        # Convert indices back to words, excluding '<unk>' tokens
        generated_sentences = []
        for tokens in generated_tokens:
            sentence = [list(vocab.keys())[list(vocab.values()).index(token)] for token in tokens if token in vocab.values() and token != vocab['<unk>']]
            generated_sentences.append(sentence)
        
    return generated_sentences


def join_tokens(tokens):
    sentence=''.join(tokens)
    return sentence.replace('▁', ' ').replace('  ', ' ').strip()

# Generate sentences using the LSTM model
num_samples = 2
sentences = []
reverse_vocab = {idx: token for token, idx in vocab.items()}

while len(sentences) <= 100:
    input_texts = random.sample(list(vocab.keys()), num_samples)
    input_data = [input_texts]  # Wrap in a list to match the expected input format

    # print("Input texts: ", input_texts)
    predictions = test_lstm(model, input_data, vocab)
    # print("Predictions: ", predictions)
 
    flattened_sentences = [token for sentence in predictions for token in sentence]
    # print("Flattened sentences: ", flattened_sentences)
    joint_sentence=join_tokens(flattened_sentences)

    sentences.extend([joint_sentence])
    
    # print("Tenemos ", len(sentences), " oraciones")
    # break
    # Break after processing one batch (for testing, remove this in real runs)

In [46]:
diccionarioGenerado["LSTM"]=sentences

In [None]:
perplexity = calculate_perplexity_lstm(model, vocab, sentences, device)
# distinct_2 = calculate_distinct_n(sentences, n=2)
# distinct_3 = calculate_distinct_n(sentences, n=3)
# mauve_score = calculate_mauve(sentences)

print(f"Perplexity: {perplexity}")
print(f"Distinct-2: {distinct_2}")
print(f"Distinct-3: {distinct_3}")
print(f"MAUVE: {mauve_score}")

Perplexity: 2473.0986328125
Distinct-2: 0.0713372538992585
Distinct-3: 0.2671916010498688
MAUVE: 0.41212726326292703

## T5

In [91]:
import tensorflow as tf
from tensorflow.keras.layers import Dense, Dropout, LayerNormalization, MultiHeadAttention

import sentencepiece as spm


# Define paths to checkpoint and SentencePiece model
MODEL_DIR = "checkpoints/t5"
TF_CHECKPOINT_PATH = "checkpoints/t5/smallT5_model.ckpt-120000"
SP_MODEL_PATH = "checkpoints/t5/spiece.model"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

sp = spm.SentencePieceProcessor()
sp.Load(SP_MODEL_PATH)

class TransformerBlock(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads, d_ff, dropout_rate):
        super(TransformerBlock, self).__init__()
        self.attention = MultiHeadAttention(num_heads=num_heads, key_dim=d_model)
        self.dense_proj = tf.keras.Sequential([
            Dense(d_ff, activation='gelu'),
            Dense(d_model),
        ])
        self.layernorm1 = LayerNormalization(epsilon=1e-6)
        self.layernorm2 = LayerNormalization(epsilon=1e-6)
        self.dropout1 = Dropout(dropout_rate)
        self.dropout2 = Dropout(dropout_rate)

    def call(self, inputs, training):
        attn_output = self.attention(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        dense_output = self.dense_proj(out1)
        dense_output = self.dropout2(dense_output, training=training)
        return self.layernorm2(out1 + dense_output)

class SimpleT5Model(tf.keras.Model):
    def __init__(self, d_model, num_layers, num_heads, d_ff, dropout_rate):
        super(SimpleT5Model, self).__init__()
        self.d_model = d_model
        self.num_layers = num_layers
        self.embedding = Dense(d_model)
        self.transformer_blocks = [TransformerBlock(d_model, num_heads, d_ff, dropout_rate) for _ in range(num_layers)]
        self.final_layer = Dense(d_model)

    def call(self, inputs, training=False):
        x = self.embedding(inputs)
        for transformer_block in self.transformer_blocks:
            x = transformer_block(x, training=training)
        return self.final_layer(x)

d_model = 512
num_layers = 8
d_ff = 1024
num_heads = 6
dropout_rate = 0.0

model = SimpleT5Model(d_model, num_layers, num_heads, d_ff, dropout_rate)



# Restore the model weights from the checkpoint
checkpoint = tf.train.Checkpoint(model=model)
status = checkpoint.restore(TF_CHECKPOINT_PATH).expect_partial()

# Confirm if checkpoint was successfully restored
status.assert_existing_objects_matched()

# Encode input text using SentencePiece
def encode_text(text):
    return sp.EncodeAsIds(text)

# Decode output IDs to text using SentencePiece
def decode_text(ids):
    return sp.DecodeIds(ids)

In [182]:
import random 

num_samples = 1
sentences = []

while len(sentences) <= 100:
    input_texts = random.sample(unique_words_list, num_samples)
    inputs = [encode_text(text) for text in input_texts]
    inputs = tf.keras.preprocessing.sequence.pad_sequences(inputs, padding='post', maxlen=15)
    
    # Ensure the input tensor has the correct shape
    inputs = tf.convert_to_tensor(inputs, dtype=tf.int32)
    inputs = tf.expand_dims(inputs, axis=-1)  # Add a new dimension if needed
    inputs = tf.tile(inputs, [1, 1, 3])  # Tile to match the expected feature size
    
    outputs = model(inputs, training=False)
    # print(outputs)
    token_ids = tf.argmax(outputs, axis=-1).numpy().squeeze()
    
    # Ensure token_ids is a list of lists
    if token_ids.ndim == 1:
        token_ids = [token_ids]
    
    #Truncate token_ids to the second time the '454' token appears
    token_ids = [ids[:np.where(ids == 454)[0][1]] if np.sum(ids == 454) > 1 else ids for ids in token_ids]

    # print(token_ids)
    # Decode the token IDs to text
    predictions = [decode_text(ids.tolist()) for ids in token_ids]
    
    #Sentence is the input text and the prediction
    sentence = input_texts[0] + ' ' + predictions[0]
    
    sentences.extend([sentence])
    # print("Tenemos ",len(sentences)," oraciones")


In [183]:
sentences = [sentence.replace('▁', ' ').replace('  ', ' ').strip() for sentence in sentences]
diccionarioGenerado["T5"]=sentences



In [219]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.losses import SparseCategoricalCrossentropy

# Define the cross-entropy loss
loss_fn = SparseCategoricalCrossentropy(from_logits=True)

# Function to compute perplexity
def calculate_perplexity_t5(model, input_texts):
    total_loss = 0.0
    total_tokens = 0

    for text in input_texts:
        # Tokenize and encode the input text
        token_ids = encode_text(text)
        #For those token_ids values bigger than 512, assign them a random value
        token_ids = [token if token < 512 else random.choice(range(512)) for token in token_ids]
        token_ids = tf.convert_to_tensor(token_ids, dtype=tf.int32)
        
        
        if len(token_ids) < 2:  # Need at least 2 tokens to compute loss
            continue

        # Convert to TensorFlow constant and add batch dimension
        inputs = tf.constant(token_ids, dtype=tf.int32)
        inputs = tf.expand_dims(inputs, axis=0)

        # Adjust `inputs` shape to match model requirements
        if inputs.shape[-1] % 3 != 0:  # Check if padding is needed
            padding_needed = 3 - (inputs.shape[-1] % 3)
            inputs = tf.pad(inputs, [[0, 0], [0, padding_needed]])

        # Reshape for the model if required
        inputs = tf.reshape(inputs, (-1, inputs.shape[1] // 3, 3))

        # Get model predictions
        predictions = model(inputs, training=False)

        # Prepare targets for calculating loss
        targets = token_ids[1:]  # Shifted by one
        targets = tf.constant(targets, dtype=tf.int32)

        # If shapes don't align, pad `targets`
        if targets.shape[0] < predictions.shape[1]:
            targets = tf.pad(targets, [[0, predictions.shape[1] - targets.shape[0]]])
        elif targets.shape[0] > predictions.shape[1]:
            targets = targets[:predictions.shape[1]]  # Truncate to match

        targets = tf.reshape(targets, predictions.shape[:-1])  # Match dimensions

        # Calculate cross-entropy loss
        loss = loss_fn(targets, predictions)
        total_loss += loss.numpy() * len(targets)
        total_tokens += len(targets)

    # Calculate average loss per token
    avg_loss = total_loss / total_tokens if total_tokens > 0 else 0

    # Compute perplexity
    perplexity = np.exp(avg_loss) if avg_loss > 0 else float('inf')
    return perplexity


In [220]:
#Métricas
perplexity = calculate_perplexity_t5(model, sentences)
distinct_2 = calculate_distinct_n(sentences, n=2)
distinct_3 = calculate_distinct_n(sentences, n=3)
mauve_score = calculate_mauve(sentences)

print(f"Perplexity: {perplexity}")
print(f"Distinct-2: {distinct_2}")
print(f"Distinct-3: {distinct_3}")
print(f"MAUVE: {mauve_score}")



Using cuda


Featurizing p: 100%|██████████| 101/101 [00:04<00:00, 20.73it/s]
Featurizing q: 100%|██████████| 299/299 [00:12<00:00, 23.98it/s]


Perplexity: 169.55085834426123
Distinct-2: 0.07234479220112879
Distinct-3: 0.19155844155844157
MAUVE: 0.009045433531443093


Perplexity: 169.55085834426123
Distinct-2: 0.07234479220112879
Distinct-3: 0.19155844155844157
MAUVE: 0.009045433531443093