# Generación de Versos 

### Librerías Necearias

In [69]:
# Dependencias
import os
from argparse import Namespace
from collections import Counter
import json
import re
import string

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.nn import functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

import fasttext

import unicodedata

from tqdm import tqdm_notebook
from torchinfo import summary

### Código de Clases + Funciones Necesarias

Clase Vocabulary (no es extrictamente necesaria), ya que la que después se usa es la del vocabulary especializado (con los tokens \<UNK>, \<MASK>, ...)

In [34]:
class Vocabulary:
    def __init__(self, token_to_idx=None):
        if token_to_idx is None:
            token_to_idx = {}
        self._token_to_idx = dict(token_to_idx)
        self._idx_to_token = {idx: token for token, idx in self._token_to_idx.items()}

    def to_serializable(self):
        # función para serializar el diccionario token (label) - idx (int)
        return {"token_to_idx": self._token_to_idx}

    @classmethod
    def from_serializable(cls, contents):
        return cls(token_to_idx=contents["token_to_idx"])

    def add_token(self, token):
        # función para añadir token (nuevo) al diccionario
        if token in self._token_to_idx:
            return self._token_to_idx[token]

        index = len(self._token_to_idx)
        self._token_to_idx[token] = index
        self._idx_to_token[index] = token
        return index

    def add_many_tokens(self, tokens):
        # función para añadir N > 1 tokens al diccionario
        return [self.add_token(t) for t in tokens]

    def lookup_token(self, token):
        # función para obtener el idx del token introducido

        return self._token_to_idx[token]

    def lookup_index(self, index):
        # función para obtener el token del idx introducido
        if index not in self._idx_to_token:
            return "<UNK>"
        return self._idx_to_token[index]

    def __len__(self):
        # devuelve el tamaño del diccionario
        return len(self._token_to_idx)

    def __str__(self):
        # devuelve el tamaño del vocabulario
        return f"<Vocabulary(size={len(self)})>"

Vocabulary especial Corán con los tokens especiales \<eos>, \<bos>, ...

In [35]:
class VocabularyCoran(Vocabulary):
    def __init__(self, token_to_idx=None, unk_token="<UNK>",
                 mask_token="<MASK>", begin_seq_token="<BEGIN>",
                 end_seq_token="<END>"):

        super().__init__(token_to_idx)
        self._mask_token = mask_token
        self._unk_token = unk_token
        self._begin_seq_token = begin_seq_token
        self._end_seq_token = end_seq_token

        self.mask_index = self.add_token(self._mask_token)
        self.unk_index = self.add_token(self._unk_token)
        self.begin_seq_index = self.add_token(self._begin_seq_token)
        self.end_seq_index = self.add_token(self._end_seq_token)

    def to_serializable(self):
        # función para serializar el diccionario token (label) - idx (int)
        contents = super().to_serializable()
        contents.update({
            "unk_token": self._unk_token,
            "mask_token": self._mask_token,
            "begin_seq_token": self._begin_seq_token,
            "end_seq_token": self._end_seq_token
        })
        return contents

    @classmethod
    def from_serializable(cls, contents):
        vocab = cls(
            token_to_idx=contents["token_to_idx"],
            unk_token=contents["unk_token"],
            mask_token=contents["mask_token"],
            begin_seq_token=contents["begin_seq_token"],
            end_seq_token=contents["end_seq_token"],
        )
        return vocab

    def lookup_token(self, token):
        # función para obtener el idx del token introducido
        return self._token_to_idx.get(token, self.unk_index)

Vectorizer


In [36]:
class CoranVectorizer:
    def __init__(self, char_vocab: VocabularyCoran):
        self.char_vocab = char_vocab

    def vectorize(self, text: str, vector_length: int):
        indices = [self.char_vocab.begin_seq_index]
        indices.extend(self.char_vocab.lookup_token(ch) for ch in text)
        indices.append(self.char_vocab.end_seq_index)

        from_indices = indices[:-1]
        to_indices = indices[1:]

        # El from_vector será <bos> con los tokens de la secuencia (sin el <eos>)
        from_vector = np.full(vector_length, fill_value=self.char_vocab.mask_index, dtype=np.int64)
        # Y el to_vector será os tokens de la secuencia + <eos>
        to_vector = np.full(vector_length, fill_value=self.char_vocab.mask_index, dtype=np.int64)

        n = min(vector_length, len(from_indices))
        from_vector[:n] = from_indices[:n]

        n = min(vector_length, len(to_indices))
        to_vector[:n] = to_indices[:n]

        return from_vector, to_vector

    @classmethod
    def from_dataframe(cls, df: pd.DataFrame, text_col="text"):
        char_vocab = VocabularyCoran()
        for text in df[text_col].astype(str):
            for ch in text:
                char_vocab.add_token(ch)
        return cls(char_vocab)

    def to_serializable(self):
        return {"char_vocab": self.char_vocab.to_serializable()}

    @classmethod
    def from_serializable(cls, contents):
        char_vocab = VocabularyCoran.from_serializable(contents["char_vocab"])
        return cls(char_vocab)

Funciones para el entrenamiento (métricas de evaluación, argumentos de entrenamiento, ...)

In [37]:
def generate_batches(dataset, batch_size, device, shuffle=True, drop_last=True):
    # genera batches para mandarlos al cpu/gpu (si tenemos cuda)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=shuffle, drop_last=drop_last)
    for batch in dataloader:
        yield {k: v.to(device) for k, v in batch.items()}

def sequence_loss(y_pred, y_true, mask_index):
    # loss function, en nuestro caso el cross entropy loss. Ya que compararemos la distribución de predicciones con la ground truth
    B, T, V = y_pred.shape
    y_pred = y_pred.reshape(B * T, V)
    y_true = y_true.reshape(B * T)

    loss_fn = nn.CrossEntropyLoss(ignore_index=mask_index)
    return loss_fn(y_pred, y_true)

def compute_accuracy(y_pred, y_true, mask_index):
    # función para calcular la accuracy, comparando cada caracter predicho con el ground truth
    y_hat = y_pred.argmax(dim=-1)  
    valid = (y_true != mask_index)
    correct = (y_hat == y_true) & valid
    denom = valid.sum().item()
    if denom == 0:
        return 0.0
    return correct.sum().item() / denom

def make_train(args):
    # sacado del notebook de ALUD, 
    return {"stop_early": False,
            "early_stopping_step": 0,
            "early_stopping_best_val": 1e8,
            "epoch_index": 0,
            "train_loss": [],
            "train_acc": [],
            "val_loss": [],
            "val_acc": [],
            "model_filename": args.model_state_file}

def update_training_state(args, model, train_state):
    # función para tener en cuenta mejora/desmejora de rendimiento -> early_stopping
    if train_state["epoch_index"] == 0:
        torch.save(model.state_dict(), train_state["model_filename"])
        train_state["stop_early"] = False
        return train_state

    loss_t = train_state["val_loss"][-1]
    if loss_t < train_state["early_stopping_best_val"]:
        torch.save(model.state_dict(), train_state["model_filename"])
        train_state["early_stopping_best_val"] = loss_t
        train_state["early_stopping_step"] = 0
    else:
        train_state["early_stopping_step"] += 1

    train_state["stop_early"] = train_state["early_stopping_step"] >= args.early_stopping_criteria
    return train_state

Funciones para obtener y mostrar los nuevos versos una vez entrenados los modelos

In [38]:
def sample_from_model(model, vectorizer, num_samples=10, max_length=300, temperature=0.8, top_k=None):
    # Función para coger los nuevos versos generados y mostrarlos
    # En nuestro caso 10
    model.eval()
    vocab = vectorizer.char_vocab
    device = next(model.parameters()).device
    samples = []

    for _ in range(num_samples):
        indices = [vocab.begin_seq_index]

        for _ in range(max_length):
            x = torch.tensor(indices, dtype=torch.long, device=device).unsqueeze(0)

            with torch.no_grad():
                logits = model(x, apply_softmax=False)         
                next_logits = logits[0, -1] / max(temperature, 1e-8)

                if top_k is not None and top_k > 0:
                    v, ix = torch.topk(next_logits, k=top_k)
                    filtered = torch.full_like(next_logits, float("-inf"))
                    filtered[ix] = v
                    next_logits = filtered

                probs = torch.softmax(next_logits, dim=0)
                next_index = torch.multinomial(probs, 1).item()

            if next_index == vocab.end_seq_index:
                break
            indices.append(next_index)

        samples.append(indices)

    return samples

def decode_samples(sampled_indices, vectorizer):
    # Función para devoler los labels de los índices conseguidos en la función anterior
    char_vocab = vectorizer.char_vocab
    decoded = []

    for indices in sampled_indices:
        chars = [
            char_vocab.lookup_index(idx)
            for idx in indices
            if idx not in (
                char_vocab.begin_seq_index,
                char_vocab.end_seq_index,
                char_vocab.mask_index
            )
        ]
        decoded.append("".join(chars))

    return decoded


Como usaremos los pesos del modelo de embeddings usado anteriormente (`fastText`), los importaremos aquí:

In [39]:
def obtener_pesos(vectorizer, modelo_ft):
    vocab = vectorizer.char_vocab
    token_to_idx = vocab._token_to_idx
    tamaño_vocab = len(token_to_idx)
    embedding_dim = modelo_ft.get_dimension()
    pesos = np.zeros((tamaño_vocab, embedding_dim))

    for token, idx in token_to_idx.items():
        pesos[idx] = modelo_ft.get_word_vector(token)
    
    return torch.FloatTensor(pesos)

### Funciones para los entrenamientos: RNN y LSTM

Clase Dataset del Corán

In [None]:
class CoranDataset(Dataset):
    def __init__(self, df: pd.DataFrame, vectorizer: CoranVectorizer, text_col="text"):
        self.df = df.reset_index(drop=True)
        self._vectorizer = vectorizer
        self._text_col = text_col

        self._max_seq_length = int(self.df[text_col].astype(str).map(len).max()) + 2 # el +2 incluye los tokens del diccionario + <bos> y <eos>

        n = len(self.df)
        train_end = int(n * 0.70) # 70% de las instancias al train set
        val_end = int(n * .85) # 15 para el validation set, y el otro 15 para el test

        self.train_df = self.df.iloc[:train_end]
        self.val_df = self.df.iloc[train_end:val_end]
        self.test_df = self.df.iloc[val_end:]

        self._lookup_dict = {
            "train": (self.train_df, len(self.train_df)),
            "val": (self.val_df, len(self.val_df)),
            "test": (self.test_df, len(self.test_df)),
        }

        self.set_split("train")

    # a partir de aquí hay metodos necesarios para manipular nuestro dataset específico
    @classmethod
    def load_dataset_and_make_vectorizer(cls, coran_txt, sep="|"):
        df = pd.read_csv(coran_txt, sep=sep, names=["sura", "ayah", "text"])
        df["text"] = df["text"].astype(str).str.lower()
        vectorizer = CoranVectorizer.from_dataframe(df, text_col="text")
        return cls(df, vectorizer, text_col="text")

    @classmethod
    def load_dataset_and_load_vectorizer(cls, coran_txt, vectorizer_filepath, sep="|"):
        df = pd.read_csv(coran_txt, sep=sep, names=["sura", "ayah", "text"])
        df["text"] = df["text"].astype(str).str.lower()
        vectorizer = cls.load_vectorizer_only(vectorizer_filepath)
        return cls(df, vectorizer, text_col="text")

    @staticmethod
    def load_vectorizer_only(vectorizer_filepath):
        with open(vectorizer_filepath, "r", encoding="utf-8") as f:
            contents = json.load(f)
        return CoranVectorizer.from_serializable(contents)

    def save_vectorizer(self, vectorizer_filepath):
        with open(vectorizer_filepath, "w", encoding="utf-8") as f:
            json.dump(self._vectorizer.to_serializable(), f, ensure_ascii=False)

    def get_vectorizer(self):
        return self._vectorizer

    def set_split(self, split="train"):
        self._target_split = split
        self._target_df, self._target_size = self._lookup_dict[split]

    def __len__(self):
        return self._target_size

    def __getitem__(self, index):
        row = self._target_df.iloc[index]
        text = str(row[self._text_col])
        x, y = self._vectorizer.vectorize(text, vector_length=self._max_seq_length)
        return {"x_data": torch.tensor(x, dtype=torch.long),
                "y_target": torch.tensor(y, dtype=torch.long)}

Función de entrenamiento RNN:

In [56]:
def train_RNN(coran_path, output_path):
    args = Namespace(
        coran_txt=coran_path,
        vectorizer_file="vectorizer.json",
        model_state_file="model.pth",
        save_dir=output_path,

        char_embedding_size=300, # 300 porque los embeddings del ft son de 300, tienen que coincidir
        rnn_hidden_size=128,

        seed=1337,
        learning_rate=1e-3,
        batch_size=256,
        num_epochs=50,
        early_stopping_criteria=5,

        cuda=True,
        reload_from_files=False
    )

    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if args.cuda and torch.cuda.is_available():
        torch.cuda.manual_seed_all(args.seed)
        args.device = torch.device("cuda")
    else:
        args.device = torch.device("cpu")

    os.makedirs(args.save_dir, exist_ok=True)
    if args.vectorizer_file and not os.path.isabs(args.vectorizer_file):
        args.vectorizer_file = os.path.join(args.save_dir, args.vectorizer_file)
    if args.model_state_file and not os.path.isabs(args.model_state_file):
        args.model_state_file = os.path.join(args.save_dir, args.model_state_file)

    if args.reload_from_files and os.path.exists(args.vectorizer_file):
        dataset = CoranDataset.load_dataset_and_load_vectorizer(args.coran_txt, args.vectorizer_file)
    else:
        dataset = CoranDataset.load_dataset_and_make_vectorizer(args.coran_txt)
        dataset.save_vectorizer(args.vectorizer_file)

    vectorizer = dataset.get_vectorizer()
    mask_index = vectorizer.char_vocab.mask_index

    def obtener_pesos(vectorizer, modelo_ft):
        vocab = vectorizer.char_vocab
        token_to_idx = vocab._token_to_idx
        tamaño_vocab = len(token_to_idx)
        embedding_dim = modelo_ft.get_dimension()
        pesos = np.zeros((tamaño_vocab, embedding_dim))

        for token, idx in token_to_idx.items():
            pesos[idx] = modelo_ft.get_word_vector(token)
    
        return torch.FloatTensor(pesos)

    ft_ingles = fasttext.load_model("../src/modelos/fasttext_english_busqueda_seamantica.bin")
    pretrained_ft_pesos = obtener_pesos(vectorizer, ft_ingles)

    model = CoranRNN(
        vocab_size=len(vectorizer.char_vocab),
        embedding_size=args.char_embedding_size,
        rnn_hidden_size=args.rnn_hidden_size,
        padding_idx=mask_index,
        pretrained_embeddings_ft=pretrained_ft_pesos
    ).to(args.device)

    optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode="min", factor=0.5, patience=1)

    train_state = make_train(args)

    for epoch in range(args.num_epochs):
        train_state["epoch_index"] = epoch

        # Train
        dataset.set_split("train")
        model.train()
        running_loss, running_acc = 0.0, 0.0
        for bi, batch in enumerate(generate_batches(dataset, args.batch_size, args.device, shuffle=True)):
            optimizer.zero_grad()
            y_pred = model(batch["x_data"])
            loss = sequence_loss(y_pred, batch["y_target"], mask_index)
            loss.backward()
            optimizer.step()

            running_loss += (loss.item() - running_loss) / (bi + 1)
            acc = compute_accuracy(y_pred, batch["y_target"], mask_index)
            running_acc += (acc - running_acc) / (bi + 1)

        train_state["train_loss"].append(running_loss)
        train_state["train_acc"].append(running_acc)

        # Val
        dataset.set_split("val")
        model.eval()
        vloss, vacc = 0.0, 0.0
        with torch.no_grad():
            for bi, batch in enumerate(generate_batches(dataset, args.batch_size, args.device, shuffle=False)):
                y_pred = model(batch["x_data"])
                loss = sequence_loss(y_pred, batch["y_target"], mask_index)

                vloss += (loss.item() - vloss) / (bi + 1)
                acc = compute_accuracy(y_pred, batch["y_target"], mask_index)
                vacc += (acc - vacc) / (bi + 1)

        train_state["val_loss"].append(vloss)
        train_state["val_acc"].append(vacc)

        train_state = update_training_state(args, model, train_state)
        scheduler.step(vloss)

        print(f"Epoch {epoch+1:03d} | train_loss={running_loss:.4f} "
              f"| val_loss={vloss:.4f} | val_acc={vacc:.4f}")
        
        dataset.save_vectorizer(args.vectorizer_file)
        torch.save(model.state_dict(), args.model_state_file)

        if train_state["stop_early"]:
            print("Early stopping activado.")
            break

    return args, dataset, vectorizer, model

Función de entrenamiento LSTM:

In [None]:
def train_LSTM(coran_path, output_path):
    args = Namespace(
        coran_txt=coran_path,
        vectorizer_file="vectorizer.json",
        model_state_file="model.pth",
        save_dir=output_path,

        char_embedding_size=300, # lo mismo que ft 
        lstm_hidden_size=256,

        seed=1337,
        learning_rate=1e-3,
        batch_size=64,
        num_epochs=50,
        early_stopping_criteria=5,

        cuda=True,
        reload_from_files=False
    )

    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if args.cuda and torch.cuda.is_available():
        torch.cuda.manual_seed_all(args.seed)
        args.device = torch.device("cuda")
    else:
        args.device = torch.device("cpu")

    os.makedirs(args.save_dir, exist_ok=True)
    if args.vectorizer_file and not os.path.isabs(args.vectorizer_file):
        args.vectorizer_file = os.path.join(args.save_dir, args.vectorizer_file)
    if args.model_state_file and not os.path.isabs(args.model_state_file):
        args.model_state_file = os.path.join(args.save_dir, args.model_state_file)

    if args.reload_from_files and os.path.exists(args.vectorizer_file):
        dataset = CoranDataset.load_dataset_and_load_vectorizer(args.coran_txt, args.vectorizer_file)
    else:
        dataset = CoranDataset.load_dataset_and_make_vectorizer(args.coran_txt)
        dataset.save_vectorizer(args.vectorizer_file)

    vectorizer = dataset.get_vectorizer()
    mask_index = vectorizer.char_vocab.mask_index

    def obtener_pesos(vectorizer, modelo_ft):
        vocab = vectorizer.char_vocab
        token_to_idx = vocab._token_to_idx
        tamaño_vocab = len(token_to_idx)
        embedding_dim = modelo_ft.get_dimension()
        pesos = np.zeros((tamaño_vocab, embedding_dim))

        for token, idx in token_to_idx.items():
            pesos[idx] = modelo_ft.get_word_vector(token)
    
        return torch.FloatTensor(pesos)

    ft_ingles = fasttext.load_model("../src/modelos/fasttext_english_busqueda_seamantica.bin")
    pretrained_ft_pesos = obtener_pesos(vectorizer, ft_ingles)

    model = CoranLSTM(
        vocab_size=len(vectorizer.char_vocab),
        embedding_size=args.char_embedding_size,
        lstm_hidden_size=args.lstm_hidden_size,
        padding_idx=mask_index,
        pretrained_embeddings_ft=pretrained_ft_pesos
    ).to(args.device)

    optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode="min", factor=0.5, patience=1)

    train_state = make_train(args)

    for epoch in range(args.num_epochs):
        train_state["epoch_index"] = epoch

        # Train
        dataset.set_split("train")
        model.train()
        running_loss, running_acc = 0.0, 0.0
        for bi, batch in enumerate(generate_batches(dataset, args.batch_size, args.device, shuffle=True)):
            optimizer.zero_grad()
            y_pred = model(batch["x_data"])
            loss = sequence_loss(y_pred, batch["y_target"], mask_index)
            loss.backward()
            optimizer.step()

            running_loss += (loss.item() - running_loss) / (bi + 1)
            acc = compute_accuracy(y_pred, batch["y_target"], mask_index)
            running_acc += (acc - running_acc) / (bi + 1)

        train_state["train_loss"].append(running_loss)
        train_state["train_acc"].append(running_acc)

        # Val
        dataset.set_split("val")
        model.eval()
        vloss, vacc = 0.0, 0.0
        with torch.no_grad():
            for bi, batch in enumerate(generate_batches(dataset, args.batch_size, args.device, shuffle=False)):
                y_pred = model(batch["x_data"])
                loss = sequence_loss(y_pred, batch["y_target"], mask_index)

                vloss += (loss.item() - vloss) / (bi + 1)
                acc = compute_accuracy(y_pred, batch["y_target"], mask_index)
                vacc += (acc - vacc) / (bi + 1)

        train_state["val_loss"].append(vloss)
        train_state["val_acc"].append(vacc)

        train_state = update_training_state(args, model, train_state)
        scheduler.step(vloss)

        print(f"Epoch {epoch+1:03d} | train_loss={running_loss:.4f} "
              f"| val_loss={vloss:.4f} | val_acc={vacc:.4f}")
        
        dataset.save_vectorizer(args.vectorizer_file)
        torch.save(model.state_dict(), args.model_state_file)

        if train_state["stop_early"]:
            print("Early stopping activado.") 
            break

    return args, dataset, vectorizer, model


## Dataset del Corán

### RNN - Corán


Modelo RNN para el Corán

In [41]:
class CoranRNN(nn.Module):
    # nuestro modelo nn para el rnn
    def __init__(self, vocab_size, embedding_size, rnn_hidden_size, padding_idx, dropout_p=0.5,
                 pretrained_embeddings_ft = None):
        super().__init__()
        # arquitectura de nuestra rnn

        self.char_emb = nn.Embedding(vocab_size, embedding_size, padding_idx=padding_idx) # capa de inicio del tamaño del vocabulario
        # Aquí metemos los embeddings (pesos) del fasttext
        if pretrained_embeddings_ft is not None:
            self.char_emb.weight.data.copy_(pretrained_embeddings_ft)

        self.rnn = nn.RNN(embedding_size, rnn_hidden_size, batch_first=True, nonlinearity="tanh") # rnn
        self.fc = nn.Linear(rnn_hidden_size, vocab_size) # fully connected
        self.dropout_p = dropout_p # probabilidad de dropout de neuronas

    def forward(self, x_in, apply_softmax=False):
        x_emb = self.char_emb(x_in)             
        y_out, _ = self.rnn(x_emb)               
        y_out = F.dropout(y_out, p=self.dropout_p, training=self.training)
        logits = self.fc(y_out)                  
        if apply_softmax:
            return F.softmax(logits, dim=-1)
        return logits

Entrenamiento del RNN para el Corán árabe:

In [59]:
args, dataset, vectorizer, model_rnn = train_RNN(coran_path="../data/cleaned_data/cleaned_arab_quran.txt",
                                                 output_path="Unai/Models/RNN/arab/coran/coran_rnn_v1")



Epoch 001 | train_loss=3.3760 | val_loss=2.9685 | val_acc=0.1808
Epoch 002 | train_loss=2.9717 | val_loss=2.8730 | val_acc=0.2477
Epoch 003 | train_loss=2.8376 | val_loss=2.7529 | val_acc=0.2576
Epoch 004 | train_loss=2.6975 | val_loss=2.6355 | val_acc=0.2956
Epoch 005 | train_loss=2.5832 | val_loss=2.5418 | val_acc=0.3038
Epoch 006 | train_loss=2.5095 | val_loss=2.4807 | val_acc=0.3220
Epoch 007 | train_loss=2.4592 | val_loss=2.4383 | val_acc=0.3312
Epoch 008 | train_loss=2.4255 | val_loss=2.4048 | val_acc=0.3367
Epoch 009 | train_loss=2.3964 | val_loss=2.3779 | val_acc=0.3399
Epoch 010 | train_loss=2.3723 | val_loss=2.3530 | val_acc=0.3432
Epoch 011 | train_loss=2.3490 | val_loss=2.3308 | val_acc=0.3497
Epoch 012 | train_loss=2.3288 | val_loss=2.3108 | val_acc=0.3561
Epoch 013 | train_loss=2.3090 | val_loss=2.2922 | val_acc=0.3606
Epoch 014 | train_loss=2.2934 | val_loss=2.2755 | val_acc=0.3646
Epoch 015 | train_loss=2.2757 | val_loss=2.2587 | val_acc=0.3683
Epoch 016 | train_loss=2.

In [60]:
num_names = 10

model = model_rnn.cpu()

sampled_verses = decode_samples(
    sample_from_model(
        model,
        vectorizer,
        num_samples=num_names,
        max_length=300,
        temperature=0.8
    ),
    vectorizer
)

print("-" * 30)
for i in range(num_names):
    print(f"\n Verse {i+1}:\n{sampled_verses[i]}")

------------------------------

 Verse 1:
اقلوا يوك الشيطين

 Verse 2:
والذين الله وازلهه وصالا ولا يددكابها الره من اليه ۖ ومو عليا

 Verse 3:
والذي الاتتيما رجدوك مناي يرح هم من بست واال لنيعا ولو يكث والمورخن الصادهم فضي الادا كولت بعنزب ربي الاصر الها عرضون غشر الماسه ۖ والسلول

 Verse 4:
وان تفتر م شرك به الله ويوش يابع ني بان لهم تقولين

 Verse 5:
وا تنتم لهم لم يقول الي يوم اليه واسوات وامي الواتمن يحم وابري من ربت اخرقلله ان ارزينا لوما كانهم لناتم كعند الاكي الا تلفك فلم تكفينو ما يقول لا ان لا خشي اللم

 Verse 6:
من للكم ولوا يعذل لا يبا ولا ساء الذين تمبرون

 Verse 7:
والله السجات واتباه قالوا اعلم يلينا اليهم لتقوم من الذرا تعلاون باليه كنواۖ وقال كان الله لا ينصلون

 Verse 8:
وان تخفر من انا غرسب الله السماوا يما بلو قدور بالزي الي في الاستهم المواتي ان معذل برض منا قول الا تنكا ۖ وانهم من الستيجب عليهم كذلم لجمثرين

 Verse 9:
قنا ان المستئكم عليتا من اتمهم انصابه العلا تمعنان من دبع والله ان تنفت الصريا ۚ ومن بتله والشله ولا يطع ولكم احرنهم ۚ ونمس الله الم حصيري

 Verse 1

Entrenamiento RNN Corán en inglés:

In [None]:
args, dataset, vectorizer, model_rnn = train_RNN(coran_path="../data/cleaned_data/cleaned_english_quran.txt",
                                                 output_path="Unai/Models/RNN/english/coran/coran_rnn_v1")



Epoch 001 | train_loss=3.1010 | val_loss=2.8182 | val_acc=0.1842
Epoch 002 | train_loss=2.7748 | val_loss=2.6349 | val_acc=0.2455
Epoch 003 | train_loss=2.5712 | val_loss=2.4239 | val_acc=0.3362
Epoch 004 | train_loss=2.3842 | val_loss=2.2675 | val_acc=0.3646
Epoch 005 | train_loss=2.2653 | val_loss=2.1660 | val_acc=0.3861
Epoch 006 | train_loss=2.1874 | val_loss=2.0938 | val_acc=0.3942
Epoch 007 | train_loss=2.1297 | val_loss=2.0356 | val_acc=0.4103
Epoch 008 | train_loss=2.0815 | val_loss=1.9840 | val_acc=0.4233
Epoch 009 | train_loss=2.0389 | val_loss=1.9399 | val_acc=0.4314
Epoch 010 | train_loss=2.0018 | val_loss=1.8999 | val_acc=0.4411
Epoch 011 | train_loss=1.9663 | val_loss=1.8624 | val_acc=0.4495
Epoch 012 | train_loss=1.9353 | val_loss=1.8300 | val_acc=0.4600
Epoch 013 | train_loss=1.9069 | val_loss=1.8000 | val_acc=0.4724
Epoch 014 | train_loss=1.8811 | val_loss=1.7725 | val_acc=0.4787
Epoch 015 | train_loss=1.8579 | val_loss=1.7480 | val_acc=0.4854
Epoch 016 | train_loss=1.

Obtenemos los nuevos versos:

In [44]:
num_names = 10

model = model_rnn.cpu()

sampled_verses = decode_samples(
    sample_from_model(
        model,
        vectorizer,
        num_samples=num_names,
        max_length=300,
        temperature=0.8
    ),
    vectorizer
)

print("-" * 30)
for i in range(num_names):
    print(f"\n Verse {i+1}:\n{sampled_verses[i]}")

------------------------------

 Verse 1:
and have sereld

 Verse 2:
and allah has of themper forced indeed we hever and the rexalend becient believers

 Verse 3:
excett of her lerdente that os unould now wat on them will not and while that will thay allah and the lould be and they say neo them him of thoth the veristod and mending we to dissens of the sever is no avestalt of him your in whome that if they sour gay the relating will shid te the reveressseng o

 Verse 4:
allah then it entiled fom and josend the earth ask and deede un aplould they will he dirngnediting us proofting to wink lord

 Verse 5:
they mis of people your lord in mall and their lers at of the people if ih no day plowen on ie oa be treas is chated and dithared and the gory right as prealy sid to the signs in the angigs oo dhed ate soor abithor and heming pherlon whom you indeed o aboigs whatre slise will not thome the fiol

 Verse 6:
will and war their mellast memens and you and them allah indeed you have with the 

### LSTM - Corán

Modelo del LSTM para el Corán

In [45]:
class CoranLSTM(nn.Module):
    def __init__(self, vocab_size, embedding_size, lstm_hidden_size, padding_idx, dropout_p=0.5, pretrained_embeddings_ft=None):
        super().__init__()
        self.char_emb = nn.Embedding(vocab_size, embedding_size, padding_idx=padding_idx)
        if pretrained_embeddings_ft is not None:
            self.char_emb.weight.data.copy_(pretrained_embeddings_ft)
        self.lstm = nn.LSTM(embedding_size, lstm_hidden_size, batch_first=True)
        self.fc = nn.Linear(lstm_hidden_size, vocab_size)
        self.dropout_p = dropout_p

    def forward(self, x_in, apply_softmax=False):
        x_emb = self.char_emb(x_in)              
        y_out, _ = self.lstm(x_emb)              
        y_out = F.dropout(y_out, p=self.dropout_p, training=self.training)
        logits = self.fc(y_out)                 
        return F.softmax(logits, dim=-1) if apply_softmax else logits

Entrenamiento del LSTM para el Corán árabe:

In [64]:
args, dataset, vectorizer, model_lstm = train_LSTM(coran_path="../data/cleaned_data/cleaned_arab_quran.txt",
                                                 output_path="Unai/Models/LSTM/arab/coran/coran_lstm_v1")



Epoch 001 | train_loss=3.0064 | val_loss=2.7982 | val_acc=0.2474
Epoch 002 | train_loss=2.5930 | val_loss=2.4989 | val_acc=0.3116
Epoch 003 | train_loss=2.4154 | val_loss=2.3848 | val_acc=0.3378
Epoch 004 | train_loss=2.3163 | val_loss=2.2955 | val_acc=0.3590
Epoch 005 | train_loss=2.2297 | val_loss=2.2205 | val_acc=0.3758
Epoch 006 | train_loss=2.1569 | val_loss=2.1536 | val_acc=0.3923
Epoch 007 | train_loss=2.0903 | val_loss=2.0969 | val_acc=0.4116
Epoch 008 | train_loss=2.0346 | val_loss=2.0443 | val_acc=0.4306
Epoch 009 | train_loss=1.9884 | val_loss=2.0048 | val_acc=0.4441
Epoch 010 | train_loss=1.9468 | val_loss=1.9695 | val_acc=0.4507
Epoch 011 | train_loss=1.9100 | val_loss=1.9405 | val_acc=0.4590
Epoch 012 | train_loss=1.8760 | val_loss=1.9145 | val_acc=0.4656
Epoch 013 | train_loss=1.8482 | val_loss=1.8903 | val_acc=0.4716
Epoch 014 | train_loss=1.8240 | val_loss=1.8702 | val_acc=0.4778
Epoch 015 | train_loss=1.8006 | val_loss=1.8553 | val_acc=0.4814
Epoch 016 | train_loss=1.

In [65]:
num_names = 10

model = model_lstm.cpu()

sampled_verses = decode_samples(
    sample_from_model(
        model,
        vectorizer,
        num_samples=num_names,
        max_length=300,
        temperature=0.8
    ),
    vectorizer
)

print("-" * 30)
for i in range(num_names):
    print(f"\n Verse {i+1}:\n{sampled_verses[i]}")

------------------------------

 Verse 1:
وضلغ ايات الله وقال اني ربك ليولم فهم لا يسمعون

 Verse 2:
ومن الطير من الله ان حسبا شهيدا

 Verse 3:
واذ قاله قوانهم لا يؤمنون بما لهم هي يوم يقوم الحق بشرا وعليكم بكم نعم ۚ هل انسما للناس الهم اله وله ۚ والا بالغلام ۚ وكذلك تقري البين من الارض ۖ فاخذت من السماء والاخره ۗ والله اعم علي الله من المتقين

 Verse 4:
الله يوجن للاحسي ۖ وما قد بشر الناس بريحا للمتقين

 Verse 5:
والذين اتبعوا الي الله الا من الملائكه ۗ والذين كفروا لهم عذابا امثالهم والجنه استغعفا بينهم ۚ انه لا يريد الله الا علي الناس والاخره ۖ والله لا يريد الا ما تاكلون

 Verse 6:
وما انت الا علي الكافير ما لا يغوبنا ليبغوه عن الامر في الله ويقولون او يمزل عليها وخير نصيا

 Verse 7:
واذ تمدع كل رحمتم ۚ وان ترب واخذ من المنذرين

 Verse 8:
قل للناس بشرك علي الغيب والارض من لو شديد القوم الخاقرين

 Verse 9:
يولم في الصالحين فاذا نجظل عليكم بما قلون ان ياتوا به من قبل ۚ انه المغرب ما كانوا وهم المالمين

 Verse 10:
واذ يعلم اربهم الله الذين امنوا واخذوا الفين لا يحسرون الله الا قالوا ل

Lanzamos entrenamiento inglés de LSTM:

In [67]:
args, dataset, vectorizer, model_lstm = train_LSTM(coran_path="../data/cleaned_data/cleaned_english_quran.txt",
                                                 output_path="Unai/Models/LSTM/english/coran/coran_lstm_v1")



Epoch 001 | train_loss=2.8129 | val_loss=2.4565 | val_acc=0.2744
Epoch 002 | train_loss=2.2471 | val_loss=2.0381 | val_acc=0.4043
Epoch 003 | train_loss=1.9611 | val_loss=1.8035 | val_acc=0.4709
Epoch 004 | train_loss=1.7795 | val_loss=1.6456 | val_acc=0.5110
Epoch 005 | train_loss=1.6488 | val_loss=1.5310 | val_acc=0.5419
Epoch 006 | train_loss=1.5502 | val_loss=1.4528 | val_acc=0.5646
Epoch 007 | train_loss=1.4762 | val_loss=1.3882 | val_acc=0.5806
Epoch 008 | train_loss=1.4189 | val_loss=1.3384 | val_acc=0.5957
Epoch 009 | train_loss=1.3710 | val_loss=1.2991 | val_acc=0.6080
Epoch 010 | train_loss=1.3309 | val_loss=1.2682 | val_acc=0.6165
Epoch 011 | train_loss=1.2988 | val_loss=1.2377 | val_acc=0.6234
Epoch 012 | train_loss=1.2706 | val_loss=1.2174 | val_acc=0.6288
Epoch 013 | train_loss=1.2457 | val_loss=1.1987 | val_acc=0.6370
Epoch 014 | train_loss=1.2245 | val_loss=1.1845 | val_acc=0.6409
Epoch 015 | train_loss=1.2059 | val_loss=1.1653 | val_acc=0.6465
Epoch 016 | train_loss=1.

In [68]:
num_names = 10

model = model_lstm.cpu()

sampled_verses = decode_samples(
    sample_from_model(
        model,
        vectorizer,
        num_samples=num_names,
        max_length=300,
        temperature=0.8
    ),
    vectorizer
)

print("-" * 30)
for i in range(num_names):
    print(f"\n Verse {i+1}:\n{sampled_verses[i]}")

------------------------------

 Verse 1:
and indeed we to allah he will surely be overtioned to the earth and harms allah with madires after the wrongdoing people

 Verse 2:
the word of allah bring before you of our promise and masked him acceptable that i said o mankind each of them will intend to break they will have eat and you will find a party of being a helper

 Verse 3:
and allah called and to allah inspert is not because they will be returned word how not to in hell that they will be returned

 Verse 4:
and if they said they before them there is the his with the earth and those who bespor they will be grateful

 Verse 5:
and when he talken is allah who in the way of allah he has revealed to them about them indeed we will respond to them indeed allah is forgiving and mercifus and should and this is dust before them indeed allah is hearing and sent down and you are the scripture and his wife will deed and he do indeed

 Verse 6:
the people indeed allah is not the truth and we wi

## Dataset Hadith-s (Kaggle)

Visualizamos el la estructura del df, cogeremos las columnas (hadith-s) que nos interesan: `text_ar` y `text_en`. Como el archivo viene estructurado de una manera poco usual, realizaremos una limpieza exhaustiva.

In [70]:
hadith_df = pd.read_csv("../data/hadith_dataset/all_hadiths_clean.csv")
hadith_df.head(1)

Unnamed: 0,id,hadith_id,source,chapter_no,hadith_no,chapter,chain_indx,text_ar,text_en
0,0,1,Sahih Bukhari,1,1,Revelation - كتاب بدء الوحى,"30418, 20005, 11062, 11213, 11042, 3","حدثنا الحميدي عبد الله بن الزبير، قال حدثنا سفيان، قال حدثنا يحيى بن سعيد الأنصاري، قال أخبرني محمد بن إبراهيم التيمي، أنه سمع علقمة بن وقاص الليثي، يقول سمعت عمر بن الخطاب رضى الله عنه على المنبر قال سمعت رسول الله صلى الله عليه وسلم يقول ‏""‏ إنما الأعمال بالنيات، وإنما لكل امرئ ما نوى، فمن كانت هجرته إلى دنيا يصيبها أو إلى امرأة ينكحها فهجرته إلى ما هاجر إليه ‏""‏‏.‏","Narrated 'Umar bin Al-Khattab: I heard Allah's Apostle saying, ""The reward of deeds depends upon the intentions and every person will get the reward according to what he has intended. So whoever emigrated for worldly benefits or for a woman to marry, his emigration was for what he emigrated for."""


Árabe:

In [71]:
hadith_ar = hadith_df["text_ar"]
hadith_ar = pd.DataFrame(hadith_ar).dropna()
hadith_ar.to_csv("../data/hadith_dataset/hadith_ar/hadith_ar.csv", index=False, encoding="utf-8")
print(hadith_ar.count())
pd.set_option('display.max_colwidth', None)
hadith_ar.head(1)

text_ar    34433
dtype: int64


Unnamed: 0,text_ar
0,"حدثنا الحميدي عبد الله بن الزبير، قال حدثنا سفيان، قال حدثنا يحيى بن سعيد الأنصاري، قال أخبرني محمد بن إبراهيم التيمي، أنه سمع علقمة بن وقاص الليثي، يقول سمعت عمر بن الخطاب رضى الله عنه على المنبر قال سمعت رسول الله صلى الله عليه وسلم يقول ‏""‏ إنما الأعمال بالنيات، وإنما لكل امرئ ما نوى، فمن كانت هجرته إلى دنيا يصيبها أو إلى امرأة ينكحها فهجرته إلى ما هاجر إليه ‏""‏‏.‏"


Limpieza árabe:

In [72]:
QUOTE_CHARS = r"\"'“”„«»‹›`´"

# Diacríticos árabes (harakat) + marcas coránicas comunes
ARABIC_DIACRITICS = re.compile(r"[\u0610-\u061A\u064B-\u065F\u0670\u06D6-\u06ED]")

# Rangos Unicode típicos para árabe (básico + extendidos)
ARABIC_RANGES = r"\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF"

def _strip_wrapping_quotes(text: str, max_loops: int = 5) -> str:
    if not text:
        return text
    t = text.strip()
    for _ in range(max_loops):
        new_t = re.sub(rf'^\s*[{QUOTE_CHARS}]+\s*', '', t)
        new_t = re.sub(rf'\s*[{QUOTE_CHARS}]+\s*$', '', new_t)
        new_t = new_t.strip()
        if new_t == t:
            break
        t = new_t
    return t

def normalize_arabic(text: str, remove_diacritics: bool = True) -> str:
    # Normalización Unicode (unifica formas)
    text = unicodedata.normalize("NFKC", text)

    # Quitar tatweel (kashida)
    text = text.replace("\u0640", "")

    # Unificar algunas variantes comunes (opcional, útil en muchos corpus)
    text = text.replace("أ", "ا").replace("إ", "ا").replace("آ", "ا")
    text = text.replace("ى", "ي")
    text = text.replace("ة", "ه")  # si prefieres mantenerla, comenta esta línea

    if remove_diacritics:
        text = re.sub(ARABIC_DIACRITICS, "", text)

    return text

def clean_hadith_text_ar(text, remove_diacritics: bool = True):
    """
    Limpieza para árabe manteniendo el mismo formato que el inglés:
    - Quita comillas envolventes
    - Elimina prefijo 'narrated ...' si existe (en inglés)
    - Normaliza árabe (opcional quitar harakat)
    - Mantiene letras árabes + números + puntuación básica árabe/latina
    """
    if not isinstance(text, str):
        return ""

    # 1) Espacios/saltos de línea
    text = text.replace("\n", " ").replace("\r", " ").strip()

    # 2) Quitar comillas envolventes
    text = _strip_wrapping_quotes(text)

    # 3) Normalización árabe (sin lower)
    text = normalize_arabic(text, remove_diacritics=remove_diacritics)

    # 4) Eliminar narrador (si el encabezado está en inglés, como en tu caso)
    palabras_clave = (
        r"(said|asked|the|i\s+heard|i\s+was\s+told|i\s+informed|while|informed|abu|allah|"
        r"if|when|once|some|whenever|it|sometimes|thereupon|then|and|but)"
    )
    patron_narrador = r'^\s*narrated\s+.*?[:\-]?\s*(?=\b' + palabras_clave + r'\b)'
    text = re.sub(patron_narrador, "", text).strip()

    # 5) Quitar comillas residuales
    text = re.sub(rf'^\s*[{QUOTE_CHARS}]+\s*', "", text)
    text = re.sub(rf'\s*[{QUOTE_CHARS}]+\s*$', "", text)

    # 6) Mantener: letras árabes, números, espacios y puntuación básica.
    # Incluye puntuación árabe: ، ؛ ؟  (comma/semicolon/question mark)
    allowed = rf"[^0-9\s{ARABIC_RANGES}\.,!?'\-\(\)«»\"،؛؟]"
    text = re.sub(allowed, " ", text)

    # 7) Colapsar espacios
    text = re.sub(r"\s+", " ", text).strip()

    return text


In [75]:
hadith_ar = hadith_df[["text_ar"]].copy()

hadith_ar = hadith_ar.dropna(subset=["text_ar"])

hadith_ar = hadith_ar.iloc[1:].reset_index(drop=True)

hadith_ar["text_ar"] = hadith_ar["text_ar"].apply(clean_hadith_text_ar)
hadith_ar = hadith_ar.iloc[1:].reset_index(drop=True)

hadith_ar = hadith_ar[hadith_ar["text_ar"] != ""].reset_index(drop=True)

output_path = "../data/hadith_dataset/hadith_ar/hadith_ar_cleaned.csv"

hadith_ar.to_csv(output_path, index=False, encoding="utf-8")

Inglés + función de limpieza inglesa

In [76]:
QUOTE_CHARS = r"\"'“”„«»‹›`´"

def _strip_wrapping_quotes(text: str, max_loops: int = 5) -> str:
    """
    Elimina comillas envolventes repetidas (incluyendo tipográficas),
    tolerando espacios alrededor.
    Ej:
      '"hola"' -> hola
      '  “hola”  ' -> hola
      '""hola""' -> hola
    """
    if not text:
        return text

    t = text.strip()
    for _ in range(max_loops):
        # ^\s*["'“”...]+ (captura comillas al inicio) y ["'“”...]+\s*$ (al final)
        new_t = re.sub(rf'^\s*[{QUOTE_CHARS}]+\s*', '', t)
        new_t = re.sub(rf'\s*[{QUOTE_CHARS}]+\s*$', '', new_t)
        new_t = new_t.strip()
        if new_t == t:
            break
        t = new_t
    return t

def clean_hadith_text(text):
    if not isinstance(text, str):
        return ""

    text = text.replace('\n', ' ').replace('\r', ' ').strip()

    text = _strip_wrapping_quotes(text)

    text = text.replace('""', '"').lower()

    # Limpieza del formato original del .csv: narrated by (nommbre del narrador) + texto que queremos
    palabras_clave = (
        r"(said|asked|the|i\s+heard|i\s+was\s+told|i\s+informed|while|informed|abu|allah|"
        r"if|when|once|some|whenever|it|sometimes|thereupon|then|and|but)"
    )
    patron_narrador = r'^\s*narrated\s+.*?[:\-]?\s*(?=\b' + palabras_clave + r'\b)'
    text = re.sub(patron_narrador, '', text).strip()

    text = re.sub(rf'^\s*[{QUOTE_CHARS}]+\s*', '', text)
    text = re.sub(rf'\s*[{QUOTE_CHARS}]+\s*$', '', text)
    text = re.sub(r"[^a-z0-9\s.,!?'\-\(\)]", " ", text)

    text = re.sub(r"\s+", " ", text).strip()

    return text


In [77]:
hadith_en = hadith_df[["text_en"]].copy()

hadith_en = hadith_en.dropna(subset=["text_en"])

hadith_en = hadith_en.iloc[1:].reset_index(drop=True)

hadith_en["text_en"] = hadith_en["text_en"].apply(clean_hadith_text)
hadith_en = hadith_en.iloc[1:].reset_index(drop=True)

hadith_en = hadith_en[hadith_en["text_en"] != ""].reset_index(drop=True)

output_path = "../data/hadith_dataset/hadith_en/hadith_en_cleaned.csv"

hadith_en.to_csv(output_path, index=False, encoding="utf-8")

Clase Dataset del Hadith dataset

In [82]:
class HadithDataset(Dataset):
    def __init__(self, df: pd.DataFrame, vectorizer: CoranVectorizer, text_col):
        # text_col: text_en (hadith_en) y text_ar (hadith_ar)
        self.df = df.reset_index(drop=True)
        self._vectorizer = vectorizer
        self._text_col = text_col
        self._max_seq_length = min(int(self.df[text_col].astype(str).map(len).max()) + 2, 500)        
        n = len(self.df)
        train_end = int(n * 0.70) # 70% de las instancias al train set
        val_end = int(n * .85) # 15 para el validation set, y el otro 15 para el test

        self.train_df = self.df.iloc[:train_end]
        self.val_df = self.df.iloc[train_end:val_end]
        self.test_df = self.df.iloc[val_end:]

        self._lookup_dict = {
            "train": (self.train_df, len(self.train_df)),
            "val": (self.val_df, len(self.val_df)),
            "test": (self.test_df, len(self.test_df)),
        }

        self.set_split("train")

    @classmethod
    def load_dataset_and_make_vectorizer(cls, hadith_csv, text_col):
        df = pd.read_csv(hadith_csv)
        # FIX: Use text_col instead of "text"
        df[text_col] = df[text_col].astype(str).str.lower()
        vectorizer = CoranVectorizer.from_dataframe(df, text_col)
        return cls(df, vectorizer, text_col)

    @classmethod
    def load_dataset_and_load_vectorizer(cls, hadith_csv, vectorizer_filepath, text_col):
        df = pd.read_csv(hadith_csv)
        # FIX: Use text_col instead of "text"
        df[text_col] = df[text_col].astype(str).str.lower()
        vectorizer = cls.load_vectorizer_only(vectorizer_filepath)
        return cls(df, vectorizer, text_col)
        
    @staticmethod
    def load_vectorizer_only(vectorizer_filepath):
        with open(vectorizer_filepath, "r", encoding="utf-8") as f:
            contents = json.load(f)
        return CoranVectorizer.from_serializable(contents)

    def save_vectorizer(self, vectorizer_filepath):
        with open(vectorizer_filepath, "w", encoding="utf-8") as f:
            json.dump(self._vectorizer.to_serializable(), f, ensure_ascii=False)

    def get_vectorizer(self):
        return self._vectorizer

    def set_split(self, split="train"):
        self._target_split = split
        self._target_df, self._target_size = self._lookup_dict[split]

    def __len__(self):
        return self._target_size

    def __getitem__(self, index):
        row = self._target_df.iloc[index]
        text = str(row[self._text_col])
        x, y = self._vectorizer.vectorize(text, vector_length=self._max_seq_length)
        return {"x_data": torch.tensor(x, dtype=torch.long), "y_target": torch.tensor(y, dtype=torch.long)}

### RNN - Hadith

In [83]:
def train_RNN(hadith_path, output_path, text_col):
    args = Namespace(
        hadith_csv=hadith_path,
        vectorizer_file="vectorizer.json",
        model_state_file="model.pth",
        save_dir=output_path,

        char_embedding_size=300, # 300 porque los embeddings del ft son de 300, tienen que coincidir
        rnn_hidden_size=128, # 256-ekin peatau itenzatek

        seed=1337,
        learning_rate=1e-3,
        batch_size=256,
        num_epochs=50,
        early_stopping_criteria=5,

        cuda=True,
        reload_from_files=False
    )

    print(args.batch_size)

    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if args.cuda and torch.cuda.is_available():
        torch.cuda.manual_seed_all(args.seed)
        args.device = torch.device("cuda")
    else:
        args.device = torch.device("cpu")

    os.makedirs(args.save_dir, exist_ok=True)
    if args.vectorizer_file and not os.path.isabs(args.vectorizer_file):
        args.vectorizer_file = os.path.join(args.save_dir, args.vectorizer_file)
    if args.model_state_file and not os.path.isabs(args.model_state_file):
        args.model_state_file = os.path.join(args.save_dir, args.model_state_file)

    if args.reload_from_files and os.path.exists(args.vectorizer_file):
        dataset = HadithDataset.load_dataset_and_load_vectorizer(args.hadith_csv, args.vectorizer_file, text_col)
    else:
        dataset = HadithDataset.load_dataset_and_make_vectorizer(args.hadith_csv, text_col)
        dataset.save_vectorizer(args.vectorizer_file)

    vectorizer = dataset.get_vectorizer()
    mask_index = vectorizer.char_vocab.mask_index

    def obtener_pesos(vectorizer, modelo_ft):
        vocab = vectorizer.char_vocab
        token_to_idx = vocab._token_to_idx
        tamaño_vocab = len(token_to_idx)
        embedding_dim = modelo_ft.get_dimension()
        pesos = np.zeros((tamaño_vocab, embedding_dim))

        for token, idx in token_to_idx.items():
            pesos[idx] = modelo_ft.get_word_vector(token)
    
        return torch.FloatTensor(pesos)

    ft_ingles = fasttext.load_model("../src/modelos/fasttext_english_busqueda_seamantica.bin")
    pretrained_ft_pesos = obtener_pesos(vectorizer, ft_ingles)

    model = CoranRNN(
        vocab_size=len(vectorizer.char_vocab),
        embedding_size=args.char_embedding_size,
        rnn_hidden_size=args.rnn_hidden_size,
        padding_idx=mask_index,
        pretrained_embeddings_ft=pretrained_ft_pesos
    ).to(args.device)

    optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode="min", factor=0.5, patience=1)

    train_state = make_train(args)

    for epoch in range(args.num_epochs):
        train_state["epoch_index"] = epoch

        # Train
        dataset.set_split("train")
        model.train()
        running_loss, running_acc = 0.0, 0.0
        for bi, batch in enumerate(generate_batches(dataset, args.batch_size, args.device, shuffle=True)):
            optimizer.zero_grad()
            y_pred = model(batch["x_data"])
            loss = sequence_loss(y_pred, batch["y_target"], mask_index)
            loss.backward()
            optimizer.step()

            running_loss += (loss.item() - running_loss) / (bi + 1)
            acc = compute_accuracy(y_pred, batch["y_target"], mask_index)
            running_acc += (acc - running_acc) / (bi + 1)

        train_state["train_loss"].append(running_loss)
        train_state["train_acc"].append(running_acc)

        # Val
        dataset.set_split("val")
        model.eval()
        vloss, vacc = 0.0, 0.0
        with torch.no_grad():
            for bi, batch in enumerate(generate_batches(dataset, args.batch_size, args.device, shuffle=False)):
                y_pred = model(batch["x_data"])
                loss = sequence_loss(y_pred, batch["y_target"], mask_index)

                vloss += (loss.item() - vloss) / (bi + 1)
                acc = compute_accuracy(y_pred, batch["y_target"], mask_index)
                vacc += (acc - vacc) / (bi + 1)

        train_state["val_loss"].append(vloss)
        train_state["val_acc"].append(vacc)

        train_state = update_training_state(args, model, train_state)
        scheduler.step(vloss)

        print(f"Epoch {epoch+1:03d} | train_loss={running_loss:.4f} "
              f"| val_loss={vloss:.4f} | val_acc={vacc:.4f}")
        
        dataset.save_vectorizer(args.vectorizer_file)
        torch.save(model.state_dict(), args.model_state_file)

        if train_state["stop_early"]:
            print("Early stopping activado.")
            break

    return args, dataset, vectorizer, model

Entrenamiento RNN con dataset Hadith en árabe:

In [85]:
args, dataset, vectorizer, model_rnn = train_RNN(hadith_path="../data/hadith_dataset/hadith_ar/hadith_ar_cleaned.csv",
                                                 output_path="Unai/Models/RNN/arab/hadith/coran_rnn_v1",
                                                 text_col="text_ar")

256




Epoch 001 | train_loss=2.8292 | val_loss=2.2735 | val_acc=0.3922
Epoch 002 | train_loss=2.1674 | val_loss=1.9568 | val_acc=0.4754
Epoch 003 | train_loss=1.9599 | val_loss=1.7686 | val_acc=0.5213
Epoch 004 | train_loss=1.8397 | val_loss=1.6645 | val_acc=0.5467
Epoch 005 | train_loss=1.7722 | val_loss=1.6053 | val_acc=0.5620
Epoch 006 | train_loss=1.7289 | val_loss=1.5654 | val_acc=0.5700
Epoch 007 | train_loss=1.6981 | val_loss=1.5379 | val_acc=0.5776
Epoch 008 | train_loss=1.6748 | val_loss=1.5169 | val_acc=0.5788
Epoch 009 | train_loss=1.6563 | val_loss=1.4993 | val_acc=0.5833
Epoch 010 | train_loss=1.6424 | val_loss=1.4898 | val_acc=0.5850
Epoch 011 | train_loss=1.6301 | val_loss=1.4769 | val_acc=0.5914
Epoch 012 | train_loss=1.6198 | val_loss=1.4664 | val_acc=0.5935
Epoch 013 | train_loss=1.6099 | val_loss=1.4574 | val_acc=0.5956
Epoch 014 | train_loss=1.6030 | val_loss=1.4509 | val_acc=0.5990
Epoch 015 | train_loss=1.5953 | val_loss=1.4451 | val_acc=0.5988
Epoch 016 | train_loss=1.

In [86]:
num_names = 10

model = model_rnn.cpu()

sampled_verses = decode_samples(
    sample_from_model(
        model,
        vectorizer,
        num_samples=num_names,
        max_length=300,
        temperature=0.8
    ),
    vectorizer
)

print("-" * 30)
for i in range(num_names):
    print(f"\n Verse {i+1}:\n{sampled_verses[i]}")

------------------------------

 Verse 1:
حدثناه واعر، عن حبيد بن اليسير، عن ابي زحيبه بن عائشه - قال قال رسول الله صلي الله عليه وسلم فقال له عن يليقه النهم الله يعمر رالت انه لكوني وللا شعر حتي اوضي ورؤي التبال قال فقال اما قال " ما الي المشاوي يوقوا الا ان ربرارا . قال ابن عبي من صلي الذا بعدك وال بيده وجله فلاسم عل هذا حديث غضي الولسا

 Verse 2:
حدثنا عبد الله بن ابي اليسمت، قال احسم النحلم بن ابي عبيد الله، عن ابن ععم، ان النبي صلي الله عليه وسلم رقرك ولا يصلوضه تعوا الله الا علي العبد في ما انبشه " . قال ابو عيسو حتي اسلم تمول الولا ولقولي مه ابي جرير ان الناس الي الصواه من اللفف امذي ان فيها " .

 Verse 3:
حدثنا ابو بهركي، قري عن رزيد وفا حديث وحيد بن ابي برسوا ابي بعر - عن اسحعم بن ملين، عن عبيد الله بن عبر بن زهر بن الاحي، حدثنا مكري، عن ابي الابي، قال كن فرج الالون الباك " .

 Verse 4:
حدثنا يحيي بن قال البنبا بن حديث بن عبد الله بن ابي رسوي بن سعيد، عن ابي هريره، ين يصلا قال رسول الله صلي الله عليه وسلم فاليه عن النهام عن ابن عبد الله بن محمر بن سفمان، عن عبد الرحرد، ماو يحد

Entrenamiento Hadith RNN inglés:

In [87]:
args, dataset, vectorizer, model_rnn = train_RNN(hadith_path="../data/hadith_dataset/hadith_en/hadith_en_cleaned.csv",
                                                 output_path="Unai/Models/RNN/english/hadith/coran_rnn_v1",
                                                 text_col="text_en")

256




Epoch 001 | train_loss=2.8150 | val_loss=2.2923 | val_acc=0.3488
Epoch 002 | train_loss=2.2527 | val_loss=2.0397 | val_acc=0.4163
Epoch 003 | train_loss=2.0801 | val_loss=1.8875 | val_acc=0.4721
Epoch 004 | train_loss=1.9640 | val_loss=1.7762 | val_acc=0.4978
Epoch 005 | train_loss=1.8838 | val_loss=1.7031 | val_acc=0.5181
Epoch 006 | train_loss=1.8304 | val_loss=1.6532 | val_acc=0.5309
Epoch 007 | train_loss=1.7912 | val_loss=1.6177 | val_acc=0.5379
Epoch 008 | train_loss=1.7626 | val_loss=1.5924 | val_acc=0.5445
Epoch 009 | train_loss=1.7395 | val_loss=1.5729 | val_acc=0.5476
Epoch 010 | train_loss=1.7211 | val_loss=1.5560 | val_acc=0.5532
Epoch 011 | train_loss=1.7057 | val_loss=1.5416 | val_acc=0.5533
Epoch 012 | train_loss=1.6934 | val_loss=1.5302 | val_acc=0.5568
Epoch 013 | train_loss=1.6829 | val_loss=1.5210 | val_acc=0.5588
Epoch 014 | train_loss=1.6741 | val_loss=1.5118 | val_acc=0.5607
Epoch 015 | train_loss=1.6658 | val_loss=1.5059 | val_acc=0.5630
Epoch 016 | train_loss=1.

In [88]:
num_names = 10

model = model_rnn.cpu()

sampled_verses = decode_samples(
    sample_from_model(
        model,
        vectorizer,
        num_samples=num_names,
        max_length=300,
        temperature=0.8
    ),
    vectorizer
)

print("-" * 30)
for i in range(num_names):
    print(f"\n Verse {i+1}:\n{sampled_verses[i]}")

------------------------------

 Verse 1:
ighilger to not with one of obre'san me who trounge aforher to his norn mad to remall of face rish whine and of intony warces. she said asked the prophet said while il the lakin. on the son over allah's aponthing white one is the prophet what the messenger of allah ( ) was would nater of manting and

 Verse 2:
in the said and caner it in the promertidian angingas if the people of was a bont of the hant (it who it.

 Verse 3:
abu harie mircher and he go with entrong undore of allah's messenger (mal would, who were bouthees (the hir! the messenger of allah ( ) said to hand to tamal and and a toms and whos of the nafrat and it is thit i. i said the my somment soment on the inaning him to day is the bagt ase whee toan and c

 Verse 4:
the prophet said you the messenger of allah (may peace be upon him) tarat the mastted a fromt ho for his kand and come to the from the ant of bur asked by allah.

 Verse 5:
abu hurairah he fill in the vernaty of the f

### LSTM - Hadith

In [None]:
def train_LSTM(hadith_path, output_path, text_col):
    args = Namespace(
        hadith_csv=hadith_path,
        vectorizer_file="vectorizer.json",
        model_state_file="model.pth",
        save_dir=output_path,

        char_embedding_size=300, # lo mismo que ft 
        lstm_hidden_size=256,

        seed=1337,
        learning_rate=1e-3,
        batch_size=64,
        num_epochs=50,
        early_stopping_criteria=5,

        cuda=True,
        reload_from_files=False
    )

    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if args.cuda and torch.cuda.is_available():
        torch.cuda.manual_seed_all(args.seed)
        args.device = torch.device("cuda")
    else:
        args.device = torch.device("cpu")

    os.makedirs(args.save_dir, exist_ok=True)
    if args.vectorizer_file and not os.path.isabs(args.vectorizer_file):
        args.vectorizer_file = os.path.join(args.save_dir, args.vectorizer_file)
    if args.model_state_file and not os.path.isabs(args.model_state_file):
        args.model_state_file = os.path.join(args.save_dir, args.model_state_file)

    if args.reload_from_files and os.path.exists(args.vectorizer_file):
        dataset = HadithDataset.load_dataset_and_load_vectorizer(args.hadith_csv, args.vectorizer_file, text_col)
    else:
        dataset = HadithDataset.load_dataset_and_make_vectorizer(args.hadith_csv, text_col)
        dataset.save_vectorizer(args.vectorizer_file)

    vectorizer = dataset.get_vectorizer()
    mask_index = vectorizer.char_vocab.mask_index

    def obtener_pesos(vectorizer, modelo_ft):
        vocab = vectorizer.char_vocab
        token_to_idx = vocab._token_to_idx
        tamaño_vocab = len(token_to_idx)
        embedding_dim = modelo_ft.get_dimension()
        pesos = np.zeros((tamaño_vocab, embedding_dim))

        for token, idx in token_to_idx.items():
            pesos[idx] = modelo_ft.get_word_vector(token)
    
        return torch.FloatTensor(pesos)

    ft_ingles = fasttext.load_model("../src/modelos/fasttext_english_busqueda_seamantica.bin")
    pretrained_ft_pesos = obtener_pesos(vectorizer, ft_ingles)

    model = CoranLSTM(
        vocab_size=len(vectorizer.char_vocab),
        embedding_size=args.char_embedding_size,
        lstm_hidden_size=args.lstm_hidden_size,
        padding_idx=mask_index,
        pretrained_embeddings_ft=pretrained_ft_pesos
    ).to(args.device)

    optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode="min", factor=0.5, patience=1)

    train_state = make_train(args)

    for epoch in range(args.num_epochs):
        train_state["epoch_index"] = epoch

        # Train
        dataset.set_split("train")
        model.train()
        running_loss, running_acc = 0.0, 0.0
        for bi, batch in enumerate(generate_batches(dataset, args.batch_size, args.device, shuffle=True)):
            optimizer.zero_grad()
            y_pred = model(batch["x_data"])
            loss = sequence_loss(y_pred, batch["y_target"], mask_index)
            loss.backward()
            optimizer.step()

            running_loss += (loss.item() - running_loss) / (bi + 1)
            acc = compute_accuracy(y_pred, batch["y_target"], mask_index)
            running_acc += (acc - running_acc) / (bi + 1)

        train_state["train_loss"].append(running_loss)
        train_state["train_acc"].append(running_acc)

        # Val
        dataset.set_split("val")
        model.eval()
        vloss, vacc = 0.0, 0.0
        with torch.no_grad():
            for bi, batch in enumerate(generate_batches(dataset, args.batch_size, args.device, shuffle=False)):
                y_pred = model(batch["x_data"])
                loss = sequence_loss(y_pred, batch["y_target"], mask_index)

                vloss += (loss.item() - vloss) / (bi + 1)
                acc = compute_accuracy(y_pred, batch["y_target"], mask_index)
                vacc += (acc - vacc) / (bi + 1)

        train_state["val_loss"].append(vloss)
        train_state["val_acc"].append(vacc)

        train_state = update_training_state(args, model, train_state)
        scheduler.step(vloss)

        print(f"Epoch {epoch+1:03d} | train_loss={running_loss:.4f} "
              f"| val_loss={vloss:.4f} | val_acc={vacc:.4f}")
        
        dataset.save_vectorizer(args.vectorizer_file)
        torch.save(model.state_dict(), args.model_state_file)

        if train_state["stop_early"]:
            print("Early stopping activado.")
            break

    return args, dataset, vectorizer, model


Entrenamiento LSTM Hadith árabe:

In [90]:
args, dataset, vectorizer, model_lstm = train_LSTM(hadith_path="../data/hadith_dataset/hadith_ar/hadith_ar_cleaned.csv",
                                                 output_path="Unai/Models/LSTM/arab/hadith/hadith_lstm_v1",
                                                 text_col="text_ar")



Epoch 001 | train_loss=2.2244 | val_loss=1.6010 | val_acc=0.5521
Epoch 002 | train_loss=1.5442 | val_loss=1.3511 | val_acc=0.6187
Epoch 003 | train_loss=1.3814 | val_loss=1.2559 | val_acc=0.6392
Epoch 004 | train_loss=1.3001 | val_loss=1.2028 | val_acc=0.6529
Epoch 005 | train_loss=1.2489 | val_loss=1.1655 | val_acc=0.6629
Epoch 006 | train_loss=1.2133 | val_loss=1.1475 | val_acc=0.6667
Epoch 007 | train_loss=1.1867 | val_loss=1.1165 | val_acc=0.6774
Epoch 008 | train_loss=1.1655 | val_loss=1.0994 | val_acc=0.6821
Epoch 009 | train_loss=1.1486 | val_loss=1.0839 | val_acc=0.6860
Epoch 010 | train_loss=1.1338 | val_loss=1.0716 | val_acc=0.6887
Epoch 011 | train_loss=1.1217 | val_loss=1.0630 | val_acc=0.6925
Epoch 012 | train_loss=1.1106 | val_loss=1.0542 | val_acc=0.6942
Epoch 013 | train_loss=1.1013 | val_loss=1.0469 | val_acc=0.6959
Epoch 014 | train_loss=1.0932 | val_loss=1.0433 | val_acc=0.6960
Epoch 015 | train_loss=1.0864 | val_loss=1.0361 | val_acc=0.7006
Epoch 016 | train_loss=1.

In [91]:
num_names = 10

model = model_rnn.cpu()

sampled_verses = decode_samples(
    sample_from_model(
        model,
        vectorizer,
        num_samples=num_names,
        max_length=300,
        temperature=0.8
    ),
    vectorizer
)

print("-" * 30)
for i in range(num_names):
    print(f"\n Verse {i+1}:\n{sampled_verses[i]}")

------------------------------

 Verse 1:
حدثنمي مدثحنخكققنكزئجنكزئنحدثنمي مدثحنخنهنتكشنوكقق ي،حدنحدثزنكنشقثثزنحدثنش ميثثنحدثنشرثئنا شحندثانح ناثيشرحئنترحدنشكققثسكحن بنشكقثن بن نكب،ناثنب زنحدثنسرحدثيءنرندكعثنزثشحن زنحدثنو،اثنت يئنرزنحدثنشكشثشنش تثئكدرزن زندرانت نقكحثشنب ينكققكدنتدثنز كيثنرزن زنحدثنبيكشحدثينح نرحناكفنميكفثينحدثناثششثزسثين بن

 Verse 2:
حدثنمي مدثحنتكشنكشنكنح باثثعثئنحدثناثششثزسثين بنكققكدنخنهنشثكينمقكسكحر زئرزوثجن ،حنيثارئشنتدكحندثنت ،قئنكنمكشحرزسنترحدنز يثنحدثنييكرزوفنرحنخ بنحدثندكزئنرزنتدرودناكزثئنح نميثكحدحكحدنكزئنشكرئنثمنش تشثزنخشكفن بنكنب يحرزسنتدثزنحدثنشثكزئنحدثنم يشثئن،م زنحدثنز كيئن زنحدثنرشحكصنتدثزنترندكعثنئ ،زئءنحدثناكشح

 Verse 3:
حدثنمي مدثحنتدثنكلئندرشنب ينحدثنلثثزنحدثنمي مدثحنخشنرصشناكقن،مكزنكزئنتكحنرحنكزئنتدثقثنوكققنحدكشن،زب يثئندرشنكققكدؤشنكم شحقثنشكرئنحدثنمي مدثحنخنهنشكنترحدنرل،ندكئنحدثناثيزكحثئنحدكحنح نرزشنكليثكحشنكزئنوكححثثزن زنحدكحنت،ققنو ارزسنب ينحدثنرحدثيشرزسنحدثنا،زوثنشرسحققرحدن بنحدثنت،يين،»،اي يثن بنحدثناثششثزسث

 Verse 4:
كل،ندرئجندثنشكرئجنكم ز

Entrenamiento Hadith LSTM inglés:

In [92]:
args, dataset, vectorizer, model_lstm = train_LSTM(hadith_path="../data/hadith_dataset/hadith_en/hadith_en_cleaned.csv",
                                                 output_path="Unai/Models/LSTM/english/hadith/hadith_lstm_v1",
                                                 text_col="text_en")



Epoch 001 | train_loss=2.2798 | val_loss=1.6943 | val_acc=0.5154
Epoch 002 | train_loss=1.6412 | val_loss=1.4032 | val_acc=0.5868
Epoch 003 | train_loss=1.4417 | val_loss=1.2776 | val_acc=0.6206
Epoch 004 | train_loss=1.3376 | val_loss=1.2104 | val_acc=0.6362
Epoch 005 | train_loss=1.2768 | val_loss=1.1699 | val_acc=0.6464
Epoch 006 | train_loss=1.2358 | val_loss=1.1359 | val_acc=0.6560
Epoch 007 | train_loss=1.2066 | val_loss=1.1143 | val_acc=0.6614
Epoch 008 | train_loss=1.1836 | val_loss=1.1015 | val_acc=0.6649
Epoch 009 | train_loss=1.1653 | val_loss=1.0856 | val_acc=0.6679
Epoch 010 | train_loss=1.1503 | val_loss=1.0731 | val_acc=0.6736
Epoch 011 | train_loss=1.1376 | val_loss=1.0619 | val_acc=0.6735
Epoch 012 | train_loss=1.1265 | val_loss=1.0544 | val_acc=0.6783
Epoch 013 | train_loss=1.1177 | val_loss=1.0507 | val_acc=0.6787
Epoch 014 | train_loss=1.1095 | val_loss=1.0429 | val_acc=0.6796
Epoch 015 | train_loss=1.1026 | val_loss=1.0367 | val_acc=0.6816
Epoch 016 | train_loss=1.

In [93]:
num_names = 10

model = model_lstm.cpu()

sampled_verses = decode_samples(
    sample_from_model(
        model,
        vectorizer,
        num_samples=num_names,
        max_length=300,
        temperature=0.8
    ),
    vectorizer
)

print("-" * 30)
for i in range(num_names):
    print(f"\n Verse {i+1}:\n{sampled_verses[i]}")

------------------------------

 Verse 1:
abu wa'il reported the (mightirt) said no inspended anyone with me given in where his black man or seven persons who loves for allah's cause and faith with that and she said, may allah seins some of his grave in an intord and so she said i have been given to jow his face. i passed from it is thrown 

 Verse 2:
the messenger of allah ( ) said indeed it stose the state to for a person to me. and the prophet said, is had if they return the third hand, and the side commodouting is one far of magastible said, when the messenger of allah ( ) should say to him and his rediya a displisting between the two rak'ahs

 Verse 3:
the prophet ( ) said there is none good bevere that if you invoke allah as merent.

 Verse 4:
ibn umar (allah be pleased wint and the sont of nafr) are water by a would say this that you should find the first time while you will not know while called me from the heant of its liser a screat my use and so he died and forbade a man who