## Preparación

Oraciones originales de referencia

In [17]:
import pandas as pd 
import re
corpus=pd.read_json("../O1_Corpus/corpus.json", lines=True)

def etapa_preprocesamiento(textos, tokenizador=None):
    
    #Textos es una columna de un dataframe
    #1. Pasar a minúsculas
    textos = textos.str.lower()
    #2. Eliminar caracteres especiales
    textos = textos.apply(lambda x: re.sub(r"[\W\d_]+", " ", x))
    textos = textos.apply(lambda x: re.sub(r"ininteligible", "", x))
    #3. Eliminar espacios en blanco extra
    textos = textos.apply(lambda x: re.sub(r"\s+", " ", x))
    #4. Eliminar espacios en blanco al principio y al final
    textos = textos.str.strip()
    #5. Tokenizar usando SentencePiece
    if tokenizador:
        textos = textos.apply(lambda x: tokenizador.encode_as_pieces(x))
    return textos

corpus['transcription'] = etapa_preprocesamiento(corpus['transcription'])

In [None]:
reference_texts=corpus.sample(300)['transcription'].to_list()
reference_texts

In [78]:
import mauve
import torch


def calculate_perplexity(model, tokenizer, input_texts, device):
    model.eval()  # Set the model to evaluation mode
    total_loss = 0
    for text in input_texts:
        inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True).to(device)
        with torch.no_grad():
            outputs = model(**inputs, labels=inputs['input_ids'])
            loss = outputs.loss
        total_loss += loss.item()

    avg_loss = total_loss / len(input_texts)
    perplexity = torch.exp(torch.tensor(avg_loss))
    return perplexity.item()

def calculate_distinct_n(generated_texts, n=1):
    n_grams = []
    for text in generated_texts:
        # Split the text into characters
        chars = list(text)
        # Generate n-grams from characters
        n_grams.extend([tuple(chars[i:i+n]) for i in range(len(chars)-n+1)])

    total_n_grams = len(n_grams)
    unique_n_grams = len(set(n_grams))

    distinct_n_score = unique_n_grams / total_n_grams if total_n_grams > 0 else 0
    return distinct_n_score

def calculate_mauve(generated_texts, reference_texts=reference_texts):
    """
    Function to calculate the MAUVE score for generated texts.
    
    Args:
    generated_texts (list): List of generated texts.
    reference_texts (list): List of constant reference texts (human-written).
    
    Returns:
    float: MAUVE score.
    """
    cudaAvailable = torch.cuda.is_available()
    print(f"Using {'cuda' if cudaAvailable else 'cpu'}")
    mauve_score = mauve.compute_mauve(
        p_text=generated_texts, 
        q_text=reference_texts, 
        device_id=0 if cudaAvailable else -1,
        max_text_length=256
    )
    return mauve_score.mauve


## Modelos

In [22]:
diccionarioGenerado={}

In [27]:
unique_words = set()
for transcription in corpus['transcription']:
    words = transcription.split()
    unique_words.update(words)

unique_words_list = sorted(unique_words)

### ZmBART

In [93]:
import torch
from transformers import MBartForConditionalGeneration, MBartTokenizer
import sentencepiece as spm
# Path to your model checkpoint
model_path = "checkpoints/zmbart"

# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load the tokenizer
tokenizer = spm.SentencePieceProcessor(model_file=model_path+"/spiece.model")


# Load the model and move it to the GPU
model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-cc25").to(device)

# Load the fine-tuned weights
model.load_state_dict(torch.load(model_path + "/zmbart_checkpoint112.pt"))

def safe_decode(tokenizer, output_ids):
    decoded_tokens = []
    
    for token_id in output_ids:
        try:
            # Decode each token individually
            decoded_token = tokenizer.decode(token_id.item(), skip_special_tokens=False, clean_up_tokenization_spaces=False)
            
            # If token is padding or unknown, append an empty string or placeholder
            if decoded_token == "<pad>" or decoded_token == "<unk>":
                decoded_tokens.append("")
            else:
                decoded_tokens.append(decoded_token)
        except IndexError:
            # Handle index errors by appending an empty string
            decoded_tokens.append("")
    
    # Join tokens with spaces, replacing empty tokens appropriately
    return ' '.join(decoded_tokens).strip().replace('▁', ' ').replace('  ', ' ')



Using device: cuda


  model.load_state_dict(torch.load(model_path + "/zmbart_checkpoint112.pt"))


RuntimeError: Error(s) in loading state_dict for MBartForConditionalGeneration:
	Missing key(s) in state_dict: "final_logits_bias", "model.shared.weight", "model.encoder.embed_tokens.weight", "model.encoder.embed_positions.weight", "model.encoder.layers.0.self_attn.k_proj.weight", "model.encoder.layers.0.self_attn.k_proj.bias", "model.encoder.layers.0.self_attn.v_proj.weight", "model.encoder.layers.0.self_attn.v_proj.bias", "model.encoder.layers.0.self_attn.q_proj.weight", "model.encoder.layers.0.self_attn.q_proj.bias", "model.encoder.layers.0.self_attn.out_proj.weight", "model.encoder.layers.0.self_attn.out_proj.bias", "model.encoder.layers.0.self_attn_layer_norm.weight", "model.encoder.layers.0.self_attn_layer_norm.bias", "model.encoder.layers.0.fc1.weight", "model.encoder.layers.0.fc1.bias", "model.encoder.layers.0.fc2.weight", "model.encoder.layers.0.fc2.bias", "model.encoder.layers.0.final_layer_norm.weight", "model.encoder.layers.0.final_layer_norm.bias", "model.encoder.layers.1.self_attn.k_proj.weight", "model.encoder.layers.1.self_attn.k_proj.bias", "model.encoder.layers.1.self_attn.v_proj.weight", "model.encoder.layers.1.self_attn.v_proj.bias", "model.encoder.layers.1.self_attn.q_proj.weight", "model.encoder.layers.1.self_attn.q_proj.bias", "model.encoder.layers.1.self_attn.out_proj.weight", "model.encoder.layers.1.self_attn.out_proj.bias", "model.encoder.layers.1.self_attn_layer_norm.weight", "model.encoder.layers.1.self_attn_layer_norm.bias", "model.encoder.layers.1.fc1.weight", "model.encoder.layers.1.fc1.bias", "model.encoder.layers.1.fc2.weight", "model.encoder.layers.1.fc2.bias", "model.encoder.layers.1.final_layer_norm.weight", "model.encoder.layers.1.final_layer_norm.bias", "model.encoder.layers.2.self_attn.k_proj.weight", "model.encoder.layers.2.self_attn.k_proj.bias", "model.encoder.layers.2.self_attn.v_proj.weight", "model.encoder.layers.2.self_attn.v_proj.bias", "model.encoder.layers.2.self_attn.q_proj.weight", "model.encoder.layers.2.self_attn.q_proj.bias", "model.encoder.layers.2.self_attn.out_proj.weight", "model.encoder.layers.2.self_attn.out_proj.bias", "model.encoder.layers.2.self_attn_layer_norm.weight", "model.encoder.layers.2.self_attn_layer_norm.bias", "model.encoder.layers.2.fc1.weight", "model.encoder.layers.2.fc1.bias", "model.encoder.layers.2.fc2.weight", "model.encoder.layers.2.fc2.bias", "model.encoder.layers.2.final_layer_norm.weight", "model.encoder.layers.2.final_layer_norm.bias", "model.encoder.layers.3.self_attn.k_proj.weight", "model.encoder.layers.3.self_attn.k_proj.bias", "model.encoder.layers.3.self_attn.v_proj.weight", "model.encoder.layers.3.self_attn.v_proj.bias", "model.encoder.layers.3.self_attn.q_proj.weight", "model.encoder.layers.3.self_attn.q_proj.bias", "model.encoder.layers.3.self_attn.out_proj.weight", "model.encoder.layers.3.self_attn.out_proj.bias", "model.encoder.layers.3.self_attn_layer_norm.weight", "model.encoder.layers.3.self_attn_layer_norm.bias", "model.encoder.layers.3.fc1.weight", "model.encoder.layers.3.fc1.bias", "model.encoder.layers.3.fc2.weight", "model.encoder.layers.3.fc2.bias", "model.encoder.layers.3.final_layer_norm.weight", "model.encoder.layers.3.final_layer_norm.bias", "model.encoder.layers.4.self_attn.k_proj.weight", "model.encoder.layers.4.self_attn.k_proj.bias", "model.encoder.layers.4.self_attn.v_proj.weight", "model.encoder.layers.4.self_attn.v_proj.bias", "model.encoder.layers.4.self_attn.q_proj.weight", "model.encoder.layers.4.self_attn.q_proj.bias", "model.encoder.layers.4.self_attn.out_proj.weight", "model.encoder.layers.4.self_attn.out_proj.bias", "model.encoder.layers.4.self_attn_layer_norm.weight", "model.encoder.layers.4.self_attn_layer_norm.bias", "model.encoder.layers.4.fc1.weight", "model.encoder.layers.4.fc1.bias", "model.encoder.layers.4.fc2.weight", "model.encoder.layers.4.fc2.bias", "model.encoder.layers.4.final_layer_norm.weight", "model.encoder.layers.4.final_layer_norm.bias", "model.encoder.layers.5.self_attn.k_proj.weight", "model.encoder.layers.5.self_attn.k_proj.bias", "model.encoder.layers.5.self_attn.v_proj.weight", "model.encoder.layers.5.self_attn.v_proj.bias", "model.encoder.layers.5.self_attn.q_proj.weight", "model.encoder.layers.5.self_attn.q_proj.bias", "model.encoder.layers.5.self_attn.out_proj.weight", "model.encoder.layers.5.self_attn.out_proj.bias", "model.encoder.layers.5.self_attn_layer_norm.weight", "model.encoder.layers.5.self_attn_layer_norm.bias", "model.encoder.layers.5.fc1.weight", "model.encoder.layers.5.fc1.bias", "model.encoder.layers.5.fc2.weight", "model.encoder.layers.5.fc2.bias", "model.encoder.layers.5.final_layer_norm.weight", "model.encoder.layers.5.final_layer_norm.bias", "model.encoder.layers.6.self_attn.k_proj.weight", "model.encoder.layers.6.self_attn.k_proj.bias", "model.encoder.layers.6.self_attn.v_proj.weight", "model.encoder.layers.6.self_attn.v_proj.bias", "model.encoder.layers.6.self_attn.q_proj.weight", "model.encoder.layers.6.self_attn.q_proj.bias", "model.encoder.layers.6.self_attn.out_proj.weight", "model.encoder.layers.6.self_attn.out_proj.bias", "model.encoder.layers.6.self_attn_layer_norm.weight", "model.encoder.layers.6.self_attn_layer_norm.bias", "model.encoder.layers.6.fc1.weight", "model.encoder.layers.6.fc1.bias", "model.encoder.layers.6.fc2.weight", "model.encoder.layers.6.fc2.bias", "model.encoder.layers.6.final_layer_norm.weight", "model.encoder.layers.6.final_layer_norm.bias", "model.encoder.layers.7.self_attn.k_proj.weight", "model.encoder.layers.7.self_attn.k_proj.bias", "model.encoder.layers.7.self_attn.v_proj.weight", "model.encoder.layers.7.self_attn.v_proj.bias", "model.encoder.layers.7.self_attn.q_proj.weight", "model.encoder.layers.7.self_attn.q_proj.bias", "model.encoder.layers.7.self_attn.out_proj.weight", "model.encoder.layers.7.self_attn.out_proj.bias", "model.encoder.layers.7.self_attn_layer_norm.weight", "model.encoder.layers.7.self_attn_layer_norm.bias", "model.encoder.layers.7.fc1.weight", "model.encoder.layers.7.fc1.bias", "model.encoder.layers.7.fc2.weight", "model.encoder.layers.7.fc2.bias", "model.encoder.layers.7.final_layer_norm.weight", "model.encoder.layers.7.final_layer_norm.bias", "model.encoder.layers.8.self_attn.k_proj.weight", "model.encoder.layers.8.self_attn.k_proj.bias", "model.encoder.layers.8.self_attn.v_proj.weight", "model.encoder.layers.8.self_attn.v_proj.bias", "model.encoder.layers.8.self_attn.q_proj.weight", "model.encoder.layers.8.self_attn.q_proj.bias", "model.encoder.layers.8.self_attn.out_proj.weight", "model.encoder.layers.8.self_attn.out_proj.bias", "model.encoder.layers.8.self_attn_layer_norm.weight", "model.encoder.layers.8.self_attn_layer_norm.bias", "model.encoder.layers.8.fc1.weight", "model.encoder.layers.8.fc1.bias", "model.encoder.layers.8.fc2.weight", "model.encoder.layers.8.fc2.bias", "model.encoder.layers.8.final_layer_norm.weight", "model.encoder.layers.8.final_layer_norm.bias", "model.encoder.layers.9.self_attn.k_proj.weight", "model.encoder.layers.9.self_attn.k_proj.bias", "model.encoder.layers.9.self_attn.v_proj.weight", "model.encoder.layers.9.self_attn.v_proj.bias", "model.encoder.layers.9.self_attn.q_proj.weight", "model.encoder.layers.9.self_attn.q_proj.bias", "model.encoder.layers.9.self_attn.out_proj.weight", "model.encoder.layers.9.self_attn.out_proj.bias", "model.encoder.layers.9.self_attn_layer_norm.weight", "model.encoder.layers.9.self_attn_layer_norm.bias", "model.encoder.layers.9.fc1.weight", "model.encoder.layers.9.fc1.bias", "model.encoder.layers.9.fc2.weight", "model.encoder.layers.9.fc2.bias", "model.encoder.layers.9.final_layer_norm.weight", "model.encoder.layers.9.final_layer_norm.bias", "model.encoder.layers.10.self_attn.k_proj.weight", "model.encoder.layers.10.self_attn.k_proj.bias", "model.encoder.layers.10.self_attn.v_proj.weight", "model.encoder.layers.10.self_attn.v_proj.bias", "model.encoder.layers.10.self_attn.q_proj.weight", "model.encoder.layers.10.self_attn.q_proj.bias", "model.encoder.layers.10.self_attn.out_proj.weight", "model.encoder.layers.10.self_attn.out_proj.bias", "model.encoder.layers.10.self_attn_layer_norm.weight", "model.encoder.layers.10.self_attn_layer_norm.bias", "model.encoder.layers.10.fc1.weight", "model.encoder.layers.10.fc1.bias", "model.encoder.layers.10.fc2.weight", "model.encoder.layers.10.fc2.bias", "model.encoder.layers.10.final_layer_norm.weight", "model.encoder.layers.10.final_layer_norm.bias", "model.encoder.layers.11.self_attn.k_proj.weight", "model.encoder.layers.11.self_attn.k_proj.bias", "model.encoder.layers.11.self_attn.v_proj.weight", "model.encoder.layers.11.self_attn.v_proj.bias", "model.encoder.layers.11.self_attn.q_proj.weight", "model.encoder.layers.11.self_attn.q_proj.bias", "model.encoder.layers.11.self_attn.out_proj.weight", "model.encoder.layers.11.self_attn.out_proj.bias", "model.encoder.layers.11.self_attn_layer_norm.weight", "model.encoder.layers.11.self_attn_layer_norm.bias", "model.encoder.layers.11.fc1.weight", "model.encoder.layers.11.fc1.bias", "model.encoder.layers.11.fc2.weight", "model.encoder.layers.11.fc2.bias", "model.encoder.layers.11.final_layer_norm.weight", "model.encoder.layers.11.final_layer_norm.bias", "model.encoder.layernorm_embedding.weight", "model.encoder.layernorm_embedding.bias", "model.encoder.layer_norm.weight", "model.encoder.layer_norm.bias", "model.decoder.embed_tokens.weight", "model.decoder.embed_positions.weight", "model.decoder.layers.0.self_attn.k_proj.weight", "model.decoder.layers.0.self_attn.k_proj.bias", "model.decoder.layers.0.self_attn.v_proj.weight", "model.decoder.layers.0.self_attn.v_proj.bias", "model.decoder.layers.0.self_attn.q_proj.weight", "model.decoder.layers.0.self_attn.q_proj.bias", "model.decoder.layers.0.self_attn.out_proj.weight", "model.decoder.layers.0.self_attn.out_proj.bias", "model.decoder.layers.0.self_attn_layer_norm.weight", "model.decoder.layers.0.self_attn_layer_norm.bias", "model.decoder.layers.0.encoder_attn.k_proj.weight", "model.decoder.layers.0.encoder_attn.k_proj.bias", "model.decoder.layers.0.encoder_attn.v_proj.weight", "model.decoder.layers.0.encoder_attn.v_proj.bias", "model.decoder.layers.0.encoder_attn.q_proj.weight", "model.decoder.layers.0.encoder_attn.q_proj.bias", "model.decoder.layers.0.encoder_attn.out_proj.weight", "model.decoder.layers.0.encoder_attn.out_proj.bias", "model.decoder.layers.0.encoder_attn_layer_norm.weight", "model.decoder.layers.0.encoder_attn_layer_norm.bias", "model.decoder.layers.0.fc1.weight", "model.decoder.layers.0.fc1.bias", "model.decoder.layers.0.fc2.weight", "model.decoder.layers.0.fc2.bias", "model.decoder.layers.0.final_layer_norm.weight", "model.decoder.layers.0.final_layer_norm.bias", "model.decoder.layers.1.self_attn.k_proj.weight", "model.decoder.layers.1.self_attn.k_proj.bias", "model.decoder.layers.1.self_attn.v_proj.weight", "model.decoder.layers.1.self_attn.v_proj.bias", "model.decoder.layers.1.self_attn.q_proj.weight", "model.decoder.layers.1.self_attn.q_proj.bias", "model.decoder.layers.1.self_attn.out_proj.weight", "model.decoder.layers.1.self_attn.out_proj.bias", "model.decoder.layers.1.self_attn_layer_norm.weight", "model.decoder.layers.1.self_attn_layer_norm.bias", "model.decoder.layers.1.encoder_attn.k_proj.weight", "model.decoder.layers.1.encoder_attn.k_proj.bias", "model.decoder.layers.1.encoder_attn.v_proj.weight", "model.decoder.layers.1.encoder_attn.v_proj.bias", "model.decoder.layers.1.encoder_attn.q_proj.weight", "model.decoder.layers.1.encoder_attn.q_proj.bias", "model.decoder.layers.1.encoder_attn.out_proj.weight", "model.decoder.layers.1.encoder_attn.out_proj.bias", "model.decoder.layers.1.encoder_attn_layer_norm.weight", "model.decoder.layers.1.encoder_attn_layer_norm.bias", "model.decoder.layers.1.fc1.weight", "model.decoder.layers.1.fc1.bias", "model.decoder.layers.1.fc2.weight", "model.decoder.layers.1.fc2.bias", "model.decoder.layers.1.final_layer_norm.weight", "model.decoder.layers.1.final_layer_norm.bias", "model.decoder.layers.2.self_attn.k_proj.weight", "model.decoder.layers.2.self_attn.k_proj.bias", "model.decoder.layers.2.self_attn.v_proj.weight", "model.decoder.layers.2.self_attn.v_proj.bias", "model.decoder.layers.2.self_attn.q_proj.weight", "model.decoder.layers.2.self_attn.q_proj.bias", "model.decoder.layers.2.self_attn.out_proj.weight", "model.decoder.layers.2.self_attn.out_proj.bias", "model.decoder.layers.2.self_attn_layer_norm.weight", "model.decoder.layers.2.self_attn_layer_norm.bias", "model.decoder.layers.2.encoder_attn.k_proj.weight", "model.decoder.layers.2.encoder_attn.k_proj.bias", "model.decoder.layers.2.encoder_attn.v_proj.weight", "model.decoder.layers.2.encoder_attn.v_proj.bias", "model.decoder.layers.2.encoder_attn.q_proj.weight", "model.decoder.layers.2.encoder_attn.q_proj.bias", "model.decoder.layers.2.encoder_attn.out_proj.weight", "model.decoder.layers.2.encoder_attn.out_proj.bias", "model.decoder.layers.2.encoder_attn_layer_norm.weight", "model.decoder.layers.2.encoder_attn_layer_norm.bias", "model.decoder.layers.2.fc1.weight", "model.decoder.layers.2.fc1.bias", "model.decoder.layers.2.fc2.weight", "model.decoder.layers.2.fc2.bias", "model.decoder.layers.2.final_layer_norm.weight", "model.decoder.layers.2.final_layer_norm.bias", "model.decoder.layers.3.self_attn.k_proj.weight", "model.decoder.layers.3.self_attn.k_proj.bias", "model.decoder.layers.3.self_attn.v_proj.weight", "model.decoder.layers.3.self_attn.v_proj.bias", "model.decoder.layers.3.self_attn.q_proj.weight", "model.decoder.layers.3.self_attn.q_proj.bias", "model.decoder.layers.3.self_attn.out_proj.weight", "model.decoder.layers.3.self_attn.out_proj.bias", "model.decoder.layers.3.self_attn_layer_norm.weight", "model.decoder.layers.3.self_attn_layer_norm.bias", "model.decoder.layers.3.encoder_attn.k_proj.weight", "model.decoder.layers.3.encoder_attn.k_proj.bias", "model.decoder.layers.3.encoder_attn.v_proj.weight", "model.decoder.layers.3.encoder_attn.v_proj.bias", "model.decoder.layers.3.encoder_attn.q_proj.weight", "model.decoder.layers.3.encoder_attn.q_proj.bias", "model.decoder.layers.3.encoder_attn.out_proj.weight", "model.decoder.layers.3.encoder_attn.out_proj.bias", "model.decoder.layers.3.encoder_attn_layer_norm.weight", "model.decoder.layers.3.encoder_attn_layer_norm.bias", "model.decoder.layers.3.fc1.weight", "model.decoder.layers.3.fc1.bias", "model.decoder.layers.3.fc2.weight", "model.decoder.layers.3.fc2.bias", "model.decoder.layers.3.final_layer_norm.weight", "model.decoder.layers.3.final_layer_norm.bias", "model.decoder.layers.4.self_attn.k_proj.weight", "model.decoder.layers.4.self_attn.k_proj.bias", "model.decoder.layers.4.self_attn.v_proj.weight", "model.decoder.layers.4.self_attn.v_proj.bias", "model.decoder.layers.4.self_attn.q_proj.weight", "model.decoder.layers.4.self_attn.q_proj.bias", "model.decoder.layers.4.self_attn.out_proj.weight", "model.decoder.layers.4.self_attn.out_proj.bias", "model.decoder.layers.4.self_attn_layer_norm.weight", "model.decoder.layers.4.self_attn_layer_norm.bias", "model.decoder.layers.4.encoder_attn.k_proj.weight", "model.decoder.layers.4.encoder_attn.k_proj.bias", "model.decoder.layers.4.encoder_attn.v_proj.weight", "model.decoder.layers.4.encoder_attn.v_proj.bias", "model.decoder.layers.4.encoder_attn.q_proj.weight", "model.decoder.layers.4.encoder_attn.q_proj.bias", "model.decoder.layers.4.encoder_attn.out_proj.weight", "model.decoder.layers.4.encoder_attn.out_proj.bias", "model.decoder.layers.4.encoder_attn_layer_norm.weight", "model.decoder.layers.4.encoder_attn_layer_norm.bias", "model.decoder.layers.4.fc1.weight", "model.decoder.layers.4.fc1.bias", "model.decoder.layers.4.fc2.weight", "model.decoder.layers.4.fc2.bias", "model.decoder.layers.4.final_layer_norm.weight", "model.decoder.layers.4.final_layer_norm.bias", "model.decoder.layers.5.self_attn.k_proj.weight", "model.decoder.layers.5.self_attn.k_proj.bias", "model.decoder.layers.5.self_attn.v_proj.weight", "model.decoder.layers.5.self_attn.v_proj.bias", "model.decoder.layers.5.self_attn.q_proj.weight", "model.decoder.layers.5.self_attn.q_proj.bias", "model.decoder.layers.5.self_attn.out_proj.weight", "model.decoder.layers.5.self_attn.out_proj.bias", "model.decoder.layers.5.self_attn_layer_norm.weight", "model.decoder.layers.5.self_attn_layer_norm.bias", "model.decoder.layers.5.encoder_attn.k_proj.weight", "model.decoder.layers.5.encoder_attn.k_proj.bias", "model.decoder.layers.5.encoder_attn.v_proj.weight", "model.decoder.layers.5.encoder_attn.v_proj.bias", "model.decoder.layers.5.encoder_attn.q_proj.weight", "model.decoder.layers.5.encoder_attn.q_proj.bias", "model.decoder.layers.5.encoder_attn.out_proj.weight", "model.decoder.layers.5.encoder_attn.out_proj.bias", "model.decoder.layers.5.encoder_attn_layer_norm.weight", "model.decoder.layers.5.encoder_attn_layer_norm.bias", "model.decoder.layers.5.fc1.weight", "model.decoder.layers.5.fc1.bias", "model.decoder.layers.5.fc2.weight", "model.decoder.layers.5.fc2.bias", "model.decoder.layers.5.final_layer_norm.weight", "model.decoder.layers.5.final_layer_norm.bias", "model.decoder.layers.6.self_attn.k_proj.weight", "model.decoder.layers.6.self_attn.k_proj.bias", "model.decoder.layers.6.self_attn.v_proj.weight", "model.decoder.layers.6.self_attn.v_proj.bias", "model.decoder.layers.6.self_attn.q_proj.weight", "model.decoder.layers.6.self_attn.q_proj.bias", "model.decoder.layers.6.self_attn.out_proj.weight", "model.decoder.layers.6.self_attn.out_proj.bias", "model.decoder.layers.6.self_attn_layer_norm.weight", "model.decoder.layers.6.self_attn_layer_norm.bias", "model.decoder.layers.6.encoder_attn.k_proj.weight", "model.decoder.layers.6.encoder_attn.k_proj.bias", "model.decoder.layers.6.encoder_attn.v_proj.weight", "model.decoder.layers.6.encoder_attn.v_proj.bias", "model.decoder.layers.6.encoder_attn.q_proj.weight", "model.decoder.layers.6.encoder_attn.q_proj.bias", "model.decoder.layers.6.encoder_attn.out_proj.weight", "model.decoder.layers.6.encoder_attn.out_proj.bias", "model.decoder.layers.6.encoder_attn_layer_norm.weight", "model.decoder.layers.6.encoder_attn_layer_norm.bias", "model.decoder.layers.6.fc1.weight", "model.decoder.layers.6.fc1.bias", "model.decoder.layers.6.fc2.weight", "model.decoder.layers.6.fc2.bias", "model.decoder.layers.6.final_layer_norm.weight", "model.decoder.layers.6.final_layer_norm.bias", "model.decoder.layers.7.self_attn.k_proj.weight", "model.decoder.layers.7.self_attn.k_proj.bias", "model.decoder.layers.7.self_attn.v_proj.weight", "model.decoder.layers.7.self_attn.v_proj.bias", "model.decoder.layers.7.self_attn.q_proj.weight", "model.decoder.layers.7.self_attn.q_proj.bias", "model.decoder.layers.7.self_attn.out_proj.weight", "model.decoder.layers.7.self_attn.out_proj.bias", "model.decoder.layers.7.self_attn_layer_norm.weight", "model.decoder.layers.7.self_attn_layer_norm.bias", "model.decoder.layers.7.encoder_attn.k_proj.weight", "model.decoder.layers.7.encoder_attn.k_proj.bias", "model.decoder.layers.7.encoder_attn.v_proj.weight", "model.decoder.layers.7.encoder_attn.v_proj.bias", "model.decoder.layers.7.encoder_attn.q_proj.weight", "model.decoder.layers.7.encoder_attn.q_proj.bias", "model.decoder.layers.7.encoder_attn.out_proj.weight", "model.decoder.layers.7.encoder_attn.out_proj.bias", "model.decoder.layers.7.encoder_attn_layer_norm.weight", "model.decoder.layers.7.encoder_attn_layer_norm.bias", "model.decoder.layers.7.fc1.weight", "model.decoder.layers.7.fc1.bias", "model.decoder.layers.7.fc2.weight", "model.decoder.layers.7.fc2.bias", "model.decoder.layers.7.final_layer_norm.weight", "model.decoder.layers.7.final_layer_norm.bias", "model.decoder.layers.8.self_attn.k_proj.weight", "model.decoder.layers.8.self_attn.k_proj.bias", "model.decoder.layers.8.self_attn.v_proj.weight", "model.decoder.layers.8.self_attn.v_proj.bias", "model.decoder.layers.8.self_attn.q_proj.weight", "model.decoder.layers.8.self_attn.q_proj.bias", "model.decoder.layers.8.self_attn.out_proj.weight", "model.decoder.layers.8.self_attn.out_proj.bias", "model.decoder.layers.8.self_attn_layer_norm.weight", "model.decoder.layers.8.self_attn_layer_norm.bias", "model.decoder.layers.8.encoder_attn.k_proj.weight", "model.decoder.layers.8.encoder_attn.k_proj.bias", "model.decoder.layers.8.encoder_attn.v_proj.weight", "model.decoder.layers.8.encoder_attn.v_proj.bias", "model.decoder.layers.8.encoder_attn.q_proj.weight", "model.decoder.layers.8.encoder_attn.q_proj.bias", "model.decoder.layers.8.encoder_attn.out_proj.weight", "model.decoder.layers.8.encoder_attn.out_proj.bias", "model.decoder.layers.8.encoder_attn_layer_norm.weight", "model.decoder.layers.8.encoder_attn_layer_norm.bias", "model.decoder.layers.8.fc1.weight", "model.decoder.layers.8.fc1.bias", "model.decoder.layers.8.fc2.weight", "model.decoder.layers.8.fc2.bias", "model.decoder.layers.8.final_layer_norm.weight", "model.decoder.layers.8.final_layer_norm.bias", "model.decoder.layers.9.self_attn.k_proj.weight", "model.decoder.layers.9.self_attn.k_proj.bias", "model.decoder.layers.9.self_attn.v_proj.weight", "model.decoder.layers.9.self_attn.v_proj.bias", "model.decoder.layers.9.self_attn.q_proj.weight", "model.decoder.layers.9.self_attn.q_proj.bias", "model.decoder.layers.9.self_attn.out_proj.weight", "model.decoder.layers.9.self_attn.out_proj.bias", "model.decoder.layers.9.self_attn_layer_norm.weight", "model.decoder.layers.9.self_attn_layer_norm.bias", "model.decoder.layers.9.encoder_attn.k_proj.weight", "model.decoder.layers.9.encoder_attn.k_proj.bias", "model.decoder.layers.9.encoder_attn.v_proj.weight", "model.decoder.layers.9.encoder_attn.v_proj.bias", "model.decoder.layers.9.encoder_attn.q_proj.weight", "model.decoder.layers.9.encoder_attn.q_proj.bias", "model.decoder.layers.9.encoder_attn.out_proj.weight", "model.decoder.layers.9.encoder_attn.out_proj.bias", "model.decoder.layers.9.encoder_attn_layer_norm.weight", "model.decoder.layers.9.encoder_attn_layer_norm.bias", "model.decoder.layers.9.fc1.weight", "model.decoder.layers.9.fc1.bias", "model.decoder.layers.9.fc2.weight", "model.decoder.layers.9.fc2.bias", "model.decoder.layers.9.final_layer_norm.weight", "model.decoder.layers.9.final_layer_norm.bias", "model.decoder.layers.10.self_attn.k_proj.weight", "model.decoder.layers.10.self_attn.k_proj.bias", "model.decoder.layers.10.self_attn.v_proj.weight", "model.decoder.layers.10.self_attn.v_proj.bias", "model.decoder.layers.10.self_attn.q_proj.weight", "model.decoder.layers.10.self_attn.q_proj.bias", "model.decoder.layers.10.self_attn.out_proj.weight", "model.decoder.layers.10.self_attn.out_proj.bias", "model.decoder.layers.10.self_attn_layer_norm.weight", "model.decoder.layers.10.self_attn_layer_norm.bias", "model.decoder.layers.10.encoder_attn.k_proj.weight", "model.decoder.layers.10.encoder_attn.k_proj.bias", "model.decoder.layers.10.encoder_attn.v_proj.weight", "model.decoder.layers.10.encoder_attn.v_proj.bias", "model.decoder.layers.10.encoder_attn.q_proj.weight", "model.decoder.layers.10.encoder_attn.q_proj.bias", "model.decoder.layers.10.encoder_attn.out_proj.weight", "model.decoder.layers.10.encoder_attn.out_proj.bias", "model.decoder.layers.10.encoder_attn_layer_norm.weight", "model.decoder.layers.10.encoder_attn_layer_norm.bias", "model.decoder.layers.10.fc1.weight", "model.decoder.layers.10.fc1.bias", "model.decoder.layers.10.fc2.weight", "model.decoder.layers.10.fc2.bias", "model.decoder.layers.10.final_layer_norm.weight", "model.decoder.layers.10.final_layer_norm.bias", "model.decoder.layers.11.self_attn.k_proj.weight", "model.decoder.layers.11.self_attn.k_proj.bias", "model.decoder.layers.11.self_attn.v_proj.weight", "model.decoder.layers.11.self_attn.v_proj.bias", "model.decoder.layers.11.self_attn.q_proj.weight", "model.decoder.layers.11.self_attn.q_proj.bias", "model.decoder.layers.11.self_attn.out_proj.weight", "model.decoder.layers.11.self_attn.out_proj.bias", "model.decoder.layers.11.self_attn_layer_norm.weight", "model.decoder.layers.11.self_attn_layer_norm.bias", "model.decoder.layers.11.encoder_attn.k_proj.weight", "model.decoder.layers.11.encoder_attn.k_proj.bias", "model.decoder.layers.11.encoder_attn.v_proj.weight", "model.decoder.layers.11.encoder_attn.v_proj.bias", "model.decoder.layers.11.encoder_attn.q_proj.weight", "model.decoder.layers.11.encoder_attn.q_proj.bias", "model.decoder.layers.11.encoder_attn.out_proj.weight", "model.decoder.layers.11.encoder_attn.out_proj.bias", "model.decoder.layers.11.encoder_attn_layer_norm.weight", "model.decoder.layers.11.encoder_attn_layer_norm.bias", "model.decoder.layers.11.fc1.weight", "model.decoder.layers.11.fc1.bias", "model.decoder.layers.11.fc2.weight", "model.decoder.layers.11.fc2.bias", "model.decoder.layers.11.final_layer_norm.weight", "model.decoder.layers.11.final_layer_norm.bias", "model.decoder.layernorm_embedding.weight", "model.decoder.layernorm_embedding.bias", "model.decoder.layer_norm.weight", "model.decoder.layer_norm.bias", "lm_head.weight". 
	Unexpected key(s) in state_dict: "args", "model", "optimizer_history", "extra_state", "last_optimizer_state". 

## Meta XNLG

In [None]:
import torch
import random
from transformers import MT5ForConditionalGeneration, T5Tokenizer

# Path to your model checkpoint
checkpoint_path = "/workspace/Tesis/O3_modelos/checkpoints/metaXNLG_checkpoint-10500/"

# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load the tokenizer
tokenizer = T5Tokenizer.from_pretrained(checkpoint_path)

# Load the model and move it to the GPU
model = MT5ForConditionalGeneration.from_pretrained(checkpoint_path).to(device)

def safe_decode(tokenizer, output_ids):
    # print(output_ids)
    decoded_tokens = []
    
    for token_id in output_ids:
        try:
            # Decode each token individually
            decoded_token = tokenizer.decode(token_id.item(), skip_special_tokens=False, clean_up_tokenization_spaces=False)
            
            # If token is padding or unknown, append an empty string or placeholder
            if decoded_token == "<pad>" or decoded_token == "<unk>":
                decoded_tokens.append("")
            else:
                decoded_tokens.append(decoded_token)
        except IndexError:
            # Handle index errors by appending an empty string
            decoded_tokens.append("")
    
    # Join tokens with spaces, replacing empty tokens appropriately
    return ' '.join(decoded_tokens).strip().replace('▁', ' ').replace('  ', ' ')

In [None]:
# Randomly select a number of input texts
num_samples = 1
import random


sentences = []

while len(sentences) <= 100:
    input_texts = random.sample(unique_words_list, num_samples) 
    inputs = tokenizer(input_texts, return_tensors="pt", padding=True, truncation=True).to(device)  # Move inputs to GPU
    outputs = model.generate(**inputs, max_length=30)
    predictions = [safe_decode(tokenizer, output) for output in outputs]
    split_sentences = [sentence.split('</s>') for sentence in predictions]
    split_sentences = [[s.strip() for s in sentence_list if s.strip()] for sentence_list in split_sentences]
    flattened_sentences = [item for sublist in split_sentences for item in sublist]
    sentences.extend(flattened_sentences)
    print("Tenemos ",len(sentences)," oraciones")

In [72]:
diccionarioGenerado["metaXNLG"]=sentences

In [None]:
#Métricas
perplexity = calculate_perplexity(model, tokenizer, sentences, device)
distinct_2 = calculate_distinct_n(sentences, n=2)
distinct_3 = calculate_distinct_n(sentences, n=3)
mauve_score = calculate_mauve(sentences)

print(f"Perplexity: {perplexity}")
print(f"Distinct-2: {distinct_2}")
print(f"Distinct-3: {distinct_3}")
print(f"MAUVE: {mauve_score}")

## LSTM

In [None]:
import torch
import torch.nn as nn

class LSTMModel(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, output_size, pretrained_embeddings):
        super(LSTMModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.embedding.weight = nn.Parameter(pretrained_embeddings)
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers=2, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)
    
    def forward(self, x):
        x = self.embedding(x)
        lstm_out, _ = self.lstm(x)
        output = self.fc(lstm_out[:, -1, :])  # Use last output from LSTM
        return output

def test_lstm(model, input_data, vocab):
    model.eval()
    with torch.no_grad():
        inputs = torch.tensor([[vocab[token] for token in sentence] for sentence in input_data])
        outputs = model(inputs)
        predictions = torch.argmax(outputs, dim=1)
    return predictions

# Load model with trained weights
vocab_size = len(vocab)
embed_size = pretrained_embeddings.shape[1]
model = LSTMModel(vocab_size, embed_size, hidden_size=128, output_size=vocab_size, pretrained_embeddings=pretrained_embeddings)

input_data = [['your', 'input', 'sentence']]
lstm_predictions = test_lstm(model, input_data, vocab)
print(lstm_predictions)
