<a href="https://colab.research.google.com/github/SantiDrelewicz/TP2AA/blob/main/TP_AA_II.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Predicción de Puntuación y Capitalización en Texto Normalizado

In [None]:
!pip install transformers
from transformers import BertTokenizer, BertTokenizerFast, BertModel
import matplotlib.pyplot as plt
import torch
import re
import pandas as pd
import random
from tqdm import tqdm
from datasets import load_dataset
from torch import nn
from torch.utils.data import Dataset
from sklearn.model_selection import train_test_split
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader
from sklearn.metrics import f1_score, classification_report

from typing import Any, Optional

from google.colab import drive



In [None]:
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

# Constants

In [None]:
# Integer encodings applied to the "punt_inicial" / "punt_final" label columns.
# NOTE(review): "INCIAL" is a misspelling of "INICIAL"; the name is referenced
# by other cells, so any rename has to be done everywhere in one pass.
PUNT_INCIAL_MAP = {'': 0, '¿': 1}
PUNT_FINAL_MAP = {'': 0, ',': 1, '.': 2, '?': 3}
# Hugging Face checkpoint used for the default tokenizer and for the word-embedding table.
TOKENIZER_EMBEDDING_MODEL_NAME = "bert-base-multilingual-cased"

## Preprocesamiento


In [None]:
def capitalizacion_de_palabra(palabra: str) -> int:
    """Return the capitalization class of a word.

    Classes:
        0 -- no uppercase character at all (lowercase words, digits, empty string).
        1 -- only the first character is uppercase ("Hola", "A", "It's").
        2 -- mixed case ("iPhone", "McDonald's").
        3 -- every cased character is uppercase ("ONU", "3D").

    The check is done on the characters directly rather than with
    ``str.istitle()``, which treats an apostrophe as a word boundary
    ("It's" is not title-case for it) and accepts "3D" as title-case.
    Caseless tokens such as "2024" are reported as 0 instead of "mixed".
    """
    if not any(c.isupper() for c in palabra):
        return 0
    if palabra[0].isupper() and not any(c.isupper() for c in palabra[1:]):
        return 1
    if palabra.isupper():
        return 3
    return 2


def extraer_etiquetas(oracion: str, instancia_id: int, tokenizer) -> list[dict]:
    """Split a sentence into sub-word tokens and label each one.

    The sentence is cut into words and punctuation marks. Every word is
    lower-cased and tokenized with ``tokenizer``; each resulting sub-token
    becomes one row (a dict) with these keys:

    * ``instancia_id``   -- the id passed in, copied to every row.
    * ``token_id``       -- vocabulary id of the sub-token.
    * ``token``          -- sub-token text.
    * ``punt_inicial``   -- "¿" on the first sub-token of a word preceded by
      "¿", otherwise "".
    * ``punt_final``     -- "." / "," / "?" on the last sub-token of a word
      followed by that mark, otherwise "".
    * ``capitalización`` -- capitalization class of the original word, repeated
      on all of its sub-tokens.

    Exclamation marks have no label class, so they are dropped instead of
    being emitted as input tokens. A final punctuation mark with no preceding
    word is ignored.

    Args:
        oracion: raw sentence, with its original casing and punctuation.
        instancia_id: sentence identifier stored in every row.
        tokenizer: object exposing ``tokenize(str)`` and
            ``convert_tokens_to_ids(str)``.

    Returns:
        A list with one dict per sub-token (empty if the sentence has no words).
    """
    piezas = re.findall(r"\w+['’]?\w*|¿|\?|,|\.|!|¡", oracion)
    filas: list[dict] = []
    # True while a "¿" has been seen and not yet attached to a word. Tracking
    # it explicitly avoids looking at piezas[i-1], which wraps around to the
    # last element for the first piece.
    abre_pregunta = False

    for pieza in piezas:
        if pieza == '¿':
            abre_pregunta = True
            continue

        if pieza in ('¡', '!'):
            continue

        if pieza in ('.', ',', '?'):
            abre_pregunta = False
            if filas:  # nothing to attach to when the sentence starts with punctuation
                filas[-1]["punt_final"] = pieza
            continue

        capitalizacion = capitalizacion_de_palabra(pieza)
        for j, token in enumerate(tokenizer.tokenize(pieza.lower())):
            filas.append({
                "instancia_id": instancia_id,
                "token_id": tokenizer.convert_tokens_to_ids(token),
                "token": token,
                "punt_inicial": "¿" if (abre_pregunta and j == 0) else "",
                "punt_final": "",
                "capitalización": capitalizacion,
            })
        abre_pregunta = False

    return filas


def split_data_from_file(
    filepath: str,
    tokenizer: BertTokenizer | None = None,
    n_max_oraciones: int | None = None, fraccion: float = 1,
    shuffle: bool = False, random_seed: int = 0,
    train_size: float | None = None, test_size: float | None = 0.2
  ) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Read sentences from a text file and build labelled train/test frames.

    Each line of the file is treated as one sentence. The first
    ``test_size`` fraction of the (optionally shuffled) sentences goes to the
    test frame and the rest to the train frame. Both frames have one row per
    sub-token with the columns produced by ``extraer_etiquetas``; the two
    punctuation columns are encoded to integers with ``PUNT_INCIAL_MAP`` and
    ``PUNT_FINAL_MAP``.

    Within each frame ``instancia_id`` runs 1..n without gaps: sentences that
    yield no token rows (blank lines, punctuation only) are skipped and do
    not consume an id.

    Args:
        filepath: path of the UTF-8 text file.
        tokenizer: tokenizer passed to ``extraer_etiquetas``. When ``None``
            the one for ``TOKENIZER_EMBEDDING_MODEL_NAME`` is loaded on
            demand (not at function-definition time).
        n_max_oraciones: read at most this many lines. When given,
            ``fraccion`` is ignored.
        fraccion: leading fraction of the file to keep when
            ``n_max_oraciones`` is ``None``.
        shuffle: shuffle the sentences before splitting.
        random_seed: seed of the private RNG used for shuffling; the global
            ``random`` state is left untouched.
        train_size: if given, overrides ``test_size`` with ``1 - train_size``.
        test_size: fraction of sentences assigned to the test frame.

    Returns:
        ``(train_df, test_df)``.

    Raises:
        ValueError: if both ``train_size`` and ``test_size`` are ``None``.
    """
    if tokenizer is None:
        tokenizer = BertTokenizer.from_pretrained(TOKENIZER_EMBEDDING_MODEL_NAME)

    if train_size is not None:
        test_size = 1 - train_size
    elif test_size is None:
        raise ValueError("Either train_size or test_size must be provided")

    with open(filepath, "r", encoding='utf-8') as file:
        if n_max_oraciones is not None:
            # zip stops at EOF, so short files do not produce '' sentences.
            oraciones = [linea for _, linea in zip(range(n_max_oraciones), file)]
        else:
            oraciones = file.readlines()
            oraciones = oraciones[:int(len(oraciones) * fraccion)]

    if shuffle:
        random.Random(random_seed).shuffle(oraciones)

    n_test = int(len(oraciones) * test_size)
    oraciones_test = oraciones[:n_test]
    oraciones_train = oraciones[n_test:]

    columnas = ["instancia_id", "token_id", "token",
                "punt_inicial", "punt_final", "capitalización"]

    def _etiquetar(grupo: list[str]) -> pd.DataFrame:
        # Label every sentence, keeping instancia_id contiguous from 1.
        filas = []
        siguiente_id = 1
        for oracion in tqdm(grupo, total=len(grupo)):
            nuevas = extraer_etiquetas(oracion, siguiente_id, tokenizer)
            if nuevas:
                filas.extend(nuevas)
                siguiente_id += 1
        # Explicit columns keep the schema even when there are no rows.
        df = pd.DataFrame(filas, columns=columnas)
        df["punt_inicial"] = df["punt_inicial"].map(PUNT_INCIAL_MAP)
        df["punt_final"] = df["punt_final"].map(PUNT_FINAL_MAP)
        return df

    print("Cargando instancias de entrenamiento")
    train_df = _etiquetar(oraciones_train)

    print("Cargando instancias de test")
    test_df = _etiquetar(oraciones_test)

    return train_df, test_df

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

In [None]:
class PuntCapitalDataset(Dataset):
    """Sentence-level dataset over a token-level DataFrame.

    The frame must have the columns ``instancia_id``, ``token_id``,
    ``punt_inicial``, ``punt_final`` and ``capitalización``. Rows are grouped
    by ``instancia_id`` once at construction time; item ``i`` is the i-th
    sentence in ascending ``instancia_id`` order. The ids therefore do not
    need to be contiguous or start at 1, and item access does not rescan the
    frame.
    """

    _COLUMNAS = ("token_id", "punt_inicial", "punt_final", "capitalización")

    def __init__(self, dataset: pd.DataFrame):
        self.dataset = dataset
        if dataset.empty:
            self._instancias = []
        else:
            # groupby keeps the original row order inside each group.
            self._instancias = [
                tuple(
                    torch.tensor(grupo[columna].tolist(), dtype=torch.long)
                    for columna in self._COLUMNAS
                )
                for _, grupo in dataset.groupby("instancia_id", sort=True)
            ]

    def __len__(self):
        """Number of sentences (distinct ``instancia_id`` values)."""
        return len(self._instancias)

    def __getitem__(self, instancia_id: int):
        """Return ``(token_ids, punt_inicial, punt_final, capitalización)``.

        ``instancia_id`` is the zero-based position of the sentence. Each
        element is a 1-D ``torch.long`` tensor with one entry per sub-token.
        Raises ``IndexError`` when the position is out of range.
        """
        return self._instancias[instancia_id]


def collate_fn(batch):
    """Pad a list of per-sentence tensor tuples into batch tensors.

    ``batch`` holds 4-tuples ``(token_ids, punt_inicial, punt_final,
    capitalización)`` of 1-D tensors. Token ids are right-padded with 0 and
    the three label sequences with -100. Returns the four padded tensors,
    each laid out batch-first, in the same order.
    """
    tokens, iniciales, finales, capitalizaciones = zip(*batch)

    def _rellenar(secuencias, relleno):
        return pad_sequence(secuencias, batch_first=True, padding_value=relleno)

    return (
        _rellenar(tokens, 0),
        _rellenar(iniciales, -100),
        _rellenar(finales, -100),
        _rellenar(capitalizaciones, -100),
    )

# Modelo secuencial

In [None]:
class RNN(nn.Module):
    """Recurrent tagger with three per-token classification heads.

    Token ids are embedded with the word-embedding table taken from the
    pretrained BERT checkpoint named by ``TOKENIZER_EMBEDDING_MODEL_NAME``,
    passed through an ``nn.RNN`` or ``nn.LSTM`` and projected by three linear
    heads: initial punctuation (2 classes), final punctuation (4 classes)
    and capitalization (4 classes).

    Args:
        hidden_size: hidden size of the recurrent layer (per direction).
        bidirectional: use a bidirectional recurrent layer.
        lstm: use ``nn.LSTM`` when true, ``nn.RNN`` otherwise.
        num_layers: number of stacked recurrent layers.
        dropout: dropout between recurrent layers; only applied when
            ``num_layers > 1``.
        freeze_embedding_params: when true the embedding table is not trained.
    """

    def __init__(self,
                 hidden_size: int,
                 bidirectional: bool = False,
                 lstm: bool = False,
                 num_layers: int = 1,
                 dropout: float = 0.1,
                 freeze_embedding_params: bool = True):
        super().__init__()

        bert = BertModel.from_pretrained(TOKENIZER_EMBEDDING_MODEL_NAME)
        self.embedding = bert.embeddings.word_embeddings

        if freeze_embedding_params:
            self.embedding.requires_grad_(False)

        self.num_directions = 2 if bidirectional else 1

        # Both recurrent classes take the same constructor arguments.
        capa_recurrente = nn.LSTM if lstm else nn.RNN
        self.rnn = capa_recurrente(
            input_size=self.embedding.embedding_dim,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            bidirectional=bidirectional,
            dropout=dropout if num_layers > 1 else 0.0,
        )

        salida_dim = hidden_size * self.num_directions
        self.punt_inicial_head = nn.Linear(salida_dim, 2)
        self.punt_final_head = nn.Linear(salida_dim, 4)
        self.capitalizacion_head = nn.Linear(salida_dim, 4)

    def forward(self, x):
        """Return ``(init_logits, final_logits, cap_logits)`` for token ids ``x``.

        ``x`` is a batch-first tensor of token ids, (batch_size, seq_len).
        The logits are [B, T, 2], [B, T, 4] and [B, T, 4] respectively.
        """
        embebido = self.embedding(x)  # (batch_size, seq_len, embedding_dim)

        salida, _ = self.rnn(embebido)  # (batch_size, seq_len, num_directions * hidden_size)

        return (
            self.punt_inicial_head(salida),
            self.punt_final_head(salida),
            self.capitalizacion_head(salida),
        )

In [None]:
def plot_learning_curves(learning_curves: dict[str, dict[str, list[float]]]):
    """Plot per-epoch losses and macro-F1 scores in two figures.

    Args:
        learning_curves: dict with
            ``"losses"`` -> ``{"train": [...], "val": [...]}`` and
            ``"f1s_macro"`` -> ``{"punt_incial": [...], "punt_final": [...],
            "capitalizacion": [...]}``. The misspelled key ``"punt_incial"``
            is the one read first; ``"punt_inicial"`` is accepted as a
            fallback.
    """
    perdidas = learning_curves["losses"]
    f1s = learning_curves["f1s_macro"]

    train_losses = perdidas["train"]
    val_losses = perdidas["val"]
    f1s_punt_inicial = f1s["punt_incial"] if "punt_incial" in f1s else f1s["punt_inicial"]
    f1s_punt_final = f1s["punt_final"]
    f1s_capitalizacion = f1s["capitalizacion"]

    plt.figure(figsize=(10, 6))
    plt.plot(train_losses, label="Train loss")
    plt.plot(val_losses, label="Validation loss")
    plt.xlabel("Epoch")
    plt.ylabel("Loss")
    plt.legend()
    plt.show()

    plt.figure(figsize=(10, 6))
    plt.plot(f1s_punt_inicial, label="Punt. inicial")
    plt.plot(f1s_punt_final, label="Punt. final")
    plt.plot(f1s_capitalizacion, label="Capitalización")
    plt.xlabel("Epoch")
    plt.ylabel("F1 macro")
    plt.legend()
    plt.show()

# Junto todo

In [None]:
class RNNPuntCapitalModel():
    def __init__(
        self,
        file_path: str,
        n_max_oraciones: int = 1000,
        test_size: float = 0.2,
        batch_size: int = 32,
        hidden_dim: int = 128,
        lstm: bool = False,
        bidirectional: bool = False,
        n_layers: int = 1,
        dropout: float = 0.1,
        device: Optional[torch.device] = None
    ):
        super().__init__()

        self.file_path = file_path
        self.n_max_oraciones = n_max_oraciones
        self.test_size = test_size

        self.hidden_dim = hidden_dim
        self.bidirectional = bidirectional
        self.lstm = lstm
        self.n_layers = n_layers
        self.dropout = dropout

        self.device = device or torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.batch_size = batch_size

        self.tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-multilingual-cased")
        self.tokenizer_fast = BertTokenizerFast.from_pretrained("google-bert/bert-base-multilingual-cased")

        self.model = RNN(hidden_size=hidden_dim,
                         bidirectional=bidirectional,
                         lstm=lstm,
                         num_layers=n_layers,
                         dropout=dropout).to(self.device)

        self.criterion = nn.CrossEntropyLoss(ignore_index=-100)
        self.learing_rate = None
        self.optimizer = None

        self.is_fitted = False

        self.idx_map_init = {0: "", 1: "¿"}
        self.idx_map_final = {0: "", 1: ".", 2: "?", 3: ","}

        self.train_data, self.val_data = split_data_from_file(self.file_path,
                                                              tokenizer=self.tokenizer,
                                                              n_max_oraciones=self.n_max_oraciones,
                                                              test_size=self.test_size,
                                                              shuffle=True)


    def _create_data_loaders(self, train_data: list[dict], val_data: list[dict]) -> tuple[DataLoader, DataLoader]:
        """Create PyTorch data loaders"""
        train_loader = DataLoader(
            PuntCapitalDataset(train_data), batch_size=self.batch_size, collate_fn=collate_fn
        )
        val_loader = DataLoader(
            PuntCapitalDataset(val_data), batch_size=self.batch_size, collate_fn=collate_fn
        )
        return train_loader, val_loader


    def _train_step(self, train_loader: DataLoader, optimizer: torch.optim, criterion) -> float:
        """Entrena una época del modelo y devuelve la pérdida en la época"""

        self.model.train()
        running_loss = 0.0

        for input_ids, init_labs, final_labs, cap_labs in train_loader:
            input_ids = input_ids.to(self.device)
            init_labs = init_labs.to(self.device)
            final_labs = final_labs.to(self.device)
            cap_labs = cap_labs.to(self.device)

            init_logits, final_logits, cap_logits = self.model(input_ids)

            loss_init = criterion(init_logits.view(-1, 2), init_labs.view(-1))
            loss_final = criterion(final_logits.view(-1, 4), final_labs.view(-1))
            loss_cap = criterion(cap_logits.view(-1, 4), cap_labs.view(-1))
            loss = loss_init + loss_final + loss_cap

            loss.backward()
            optimizer.step()

            running_loss += loss.item()

        avg_train_loss = running_loss / len(train_loader)
        return avg_train_loss


    def _eval_step(self, val_loader: DataLoader, show_classif_report) -> dict[str, float | dict[str, float]]:
        """Evalúa el modelo en el conjunto de validación y devuelve la pérdida"""

        self.model.eval()
        val_loss = 0.0

        all_init_trues, all_init_preds = [], []
        all_final_trues, all_final_preds = [], []
        all_cap_trues, all_cap_preds = [], []

        with torch.no_grad():
            for input_ids, init_labs, final_labs, cap_labs in val_loader:
                input_ids = input_ids.to(self.device)
                init_labs = init_labs.to(self.device)
                final_labs = final_labs.to(self.device)
                cap_labs = cap_labs.to(self.device)

                # (B, T, 2)  (B, T, 4)     (B, T, 4)
                init_logits, final_logits, cap_logits = self.model(input_ids)

                loss_init = self.criterion(init_logits.view(-1, 2), init_labs.view(-1))
                loss_final =self. criterion(final_logits.view(-1, 4), final_labs.view(-1))
                loss_cap = self.criterion(cap_logits.view(-1, 4), cap_labs.view(-1))
                loss = loss_init + loss_final + loss_cap
                val_loss += loss.item()

                init_preds = init_logits.softmax(dim=-1).argmax(dim=-1) # (B, T)
                final_preds = final_logits.softmax(dim=-1).argmax(dim=-1) # (B, T)
                cap_preds = cap_logits.softmax(dim=-1).argmax(dim=-1) # (B, T)

                mask_init = init_labs.view(-1) != -100
                mask_final = final_labs.view(-1) != -100
                mask_cap = cap_labs.view(-1) != -100

                all_init_trues.extend(init_labs.view(-1)[mask_init].cpu().tolist())
                all_init_preds.extend(init_preds.view(-1)[mask_init].cpu().tolist())
                all_final_trues.extend(final_labs.view(-1)[mask_final].cpu().tolist())
                all_final_preds.extend(final_preds.view(-1)[mask_final].cpu().tolist())
                all_cap_trues.extend(cap_labs.view(-1)[mask_cap].cpu().tolist())
                all_cap_preds.extend(cap_preds.view(-1)[mask_cap].cpu().tolist())

        avg_val_loss = val_loss / len(val_loader)

        f1_init_macro = f1_score(all_init_trues, all_init_preds, average="macro", zero_division=0)
        f1_final_macro = f1_score(all_final_trues, all_final_preds, average="macro", zero_division=0)
        f1_cap_macro = f1_score(all_cap_trues, all_cap_preds, average="macro", zero_division=0)

        if show_classif_report:
            print("\nInitial puntuation per-class F1:")
            print(classification_report(
                all_init_trues, all_init_preds,
                labels=[0, 1], target_names=[' ', '¿'], zero_division=0,
            ))
            print("Final punctuation per-class F1:")
            print(classification_report(
                all_final_trues, all_final_preds,
                labels=[0, 1, 2, 3], target_names=[' ', ',', '.', '?'], zero_division=0,
            ))
            print("Capitalization per-class F1:")
            print(classification_report(
                all_cap_trues, all_cap_preds,
                labels=[0, 1, 2, 3], target_names=["Lower", "Initial", "Mixed", "ALLCAP"], zero_division=0,
            ))

        return {"loss": avg_val_loss,
                "f1_macro": {"punt_inicial": f1_init_macro,
                            "punt_final": f1_final_macro,
                            "capitalizacion": f1_cap_macro}}


    def train(self,
              optimizer: torch.optim = torch.optim.SGD,
              lr: float = 1e-3,
              epochs: int = 1,
              show_classification_report: bool = False) -> dict[str, dict[str, list[float]]]:

        """Entrena el modelo y devuelve las pérdidas y F1 por época"""
        train_loader, val_loader = self._create_data_loaders(self.train_data, self.val_data)

        self.learing_rate = lr
        self.optimizer = optimizer(self.model.parameters(), lr=self.learing_rate)

        train_losses, val_losses = [], []
        f1s_punt_inicial, f1s_punt_final, f1s_capitalizacion = [], [], []
        for epoch in range(epochs):
            train_loss = self._train_step(train_loader, self.optimizer, self.criterion)
            train_losses.append(train_loss)
            print(f"Epoch {epoch+1}/{epochs} - Train loss: {train_loss:.4f}")

            metrics = self._eval_step(val_loader, show_classification_report)

            val_loss = metrics["loss"]
            val_losses.append(val_loss)
            print(f"Epoch {epoch+1}/{epochs} - Val loss: {val_loss:.4f}")

            f1s = metrics["f1_macro"]
            f1s_punt_inicial.append(f1s["punt_inicial"])
            f1s_punt_final.append(f1s["punt_final"])
            f1s_capitalizacion.append(f1s["capitalizacion"])

            print(f"Epoch {epoch+1}/{epochs} - Punt. inicial F1: {f1s['punt_inicial']:.4f}")
            print(f"Epoch {epoch+1}/{epochs} - Punt. final F1: {f1s['punt_final']:.4f}")
            print(f"Epoch {epoch+1}/{epochs} - Capitalización F1: {f1s['capitalizacion']:.4f}")

            print("-----------------------------------------")

        self.is_fitted = True

        return {"losses": {"train": train_losses, "val": val_losses},
                "f1s_macro": {"punt_incial": f1s_punt_inicial,
                              "punt_final": f1s_punt_final,
                              "capitalizacion": f1s_capitalizacion}}


    def predict(self, text: str):
        """Predict punctuation and capitalization for input raw text"""
        if not self.is_fitted:
            raise ValueError("Model must be trained before prediction")

        return self._predict_and_reconstruct(text)


    def _predict_and_reconstruct(self, raw_sentence: str):
        """
        Runs the model on a single raw sentence and returns the
        reconstructed sentence with punctuation & capitalization.
        """
        if not self.is_fitted:
            raise ValueError("Model must be trained before prediction")

        # Fix: Removed the extra call to tokenizer_fast
        enc = self.tokenizer_fast(
            raw_sentence.lower().split(),  # split into words so word_ids works
            is_split_into_words=True,
            return_offsets_mapping=False,
            return_tensors="pt",
            padding=True,
            truncation=True,
        )

        input_ids = enc["input_ids"].to(self.device)
        word_ids = enc.word_ids(batch_index=0)  # list of length L

        # 2) Model forward
        self.model.eval()
        with torch.no_grad():
            init_logits, final_logits, cap_logits = self.model(input_ids)

        init_pred = init_logits.argmax(dim=-1).squeeze(0).cpu().tolist()
        final_pred = final_logits.argmax(dim=-1).squeeze(0).cpu().tolist()
        cap_pred = cap_logits.argmax(dim=-1).squeeze(0).cpu().tolist()

        # 3) Gather per-word predictions
        words: list[str] = []
        cur_word_idx = None
        cur_subtokens: list[str] = []
        cur_init = ""
        cur_cap = 0
        cur_final = ""

        for i, wid in enumerate(word_ids):
            if wid is None:
                continue
            token = self.tokenizer_fast.convert_ids_to_tokens(int(input_ids[0, i]))
            # start of a new word?
            if wid != cur_word_idx:
                # flush previous
                if cur_word_idx is not None:
                    # assemble the word text
                    word_text = "".join(cur_subtokens)
                    # apply capitalization
                    if cur_cap == 3:
                        word_text = word_text.upper()
                    elif cur_cap == 1:
                        word_text = word_text.capitalize()
                    elif cur_cap == 2:
                        if len(word_text) > 1:
                            word_text = word_text[0].upper() + word_text[1:]
                        else:
                            word_text = word_text.upper()
                    # attach final punctuation
                    word_text = word_text + self.idx_map_final[cur_final]
                    # prepend initial punctuation if any
                    word_text = cur_init + word_text
                    words.append(word_text)
                # reset for new word
                cur_word_idx = wid
                cur_subtokens = [token.replace("##", "")]  # start fresh
                cur_init = self.idx_map_init[init_pred[i]]
                cur_final = final_pred[i]
                cur_cap = cap_pred[i]
            else:
                # continuing same word
                cur_subtokens.append(token.replace("##", ""))
                # update final & cap to last sub-token's prediction
                cur_final = final_pred[i]
                # we keep init and cap from first subtoken

        # flush last word
        if cur_word_idx is not None:
            word_text = "".join(cur_subtokens)
            if cur_cap == 3:
                word_text = word_text.upper()
            elif cur_cap == 1:
                word_text = word_text.capitalize()
            elif cur_cap == 2:
                word_text = word_text[0].upper() + word_text[1:]
            word_text = word_text + self.idx_map_final[cur_final]
            word_text = cur_init + word_text
            words.append(word_text)

        # finally, join with spaces:
        return " ".join(words)


    def predict_and_fill_csv(self, input_df: pd.DataFrame, output_file: str = "predicted.csv") -> pd.DataFrame:
        """
        Takes a dataframe with columns: instancia_id, token_id, token
        Returns a new dataframe with added columns:
        punt_inicial, punt_final, capitalización
        One row per *input token* (same granularity).
        """

        results = []
        tokenizer_fast = self.tokenizer_fast
        device = self.device

        for instancia_id, group in input_df.groupby("instancia_id"):
            # 1. Get the *input tokens exactly as they appear* (these are already subword tokens)
            tokens = group["token"].tolist()

            # 2. Convert tokens to IDs using tokenizer's vocab
            input_ids = tokenizer_fast.convert_tokens_to_ids(tokens)
            input_ids_tensor = torch.tensor([input_ids], device=device)

            # 3. Predict
            self.model.eval()
            with torch.no_grad():
                init_logits, final_logits, cap_logits = self.model(input_ids_tensor)

            # 4. Get predictions per token
            init_pred = init_logits.argmax(dim=-1).squeeze(0).cpu().tolist()
            final_pred = final_logits.argmax(dim=-1).squeeze(0).cpu().tolist()
            cap_pred = cap_logits.argmax(dim=-1).squeeze(0).cpu().tolist()

            # 5. Decode label indices
            punt_inicial = [self.idx_map_init[idx] for idx in init_pred]
            punt_final = [self.idx_map_final[idx] for idx in final_pred]
            capitalizacion = cap_pred  # leave as integers or map if you want

            # 6. Build output dataframe for this group
            predicted_group = group.copy()
            predicted_group["punt_inicial"] = punt_inicial
            predicted_group["punt_final"] = punt_final
            predicted_group["capitalización"] = capitalizacion

            results.append(predicted_group)

        # Concatenate all
        final_df = pd.concat(results, ignore_index=True)

        if output_file:
            final_df.to_csv(output_file, index=False)

        return final_df


    def save_model(self, filepath: str):
        """Save trained model"""
        if not self.is_fitted:
            raise ValueError("Model must be trained before saving")

        model_data = {
            "model_state_dict": self.model.state_state(),
            "model_config": {
                "hidden_dim": self.hidden_dim,
                "n_layers": self.n_layers,
                "dropout": self.dropout,
                "bidirectional": self.bidirectional,
                "lstm": self.lstm,
            },
            "training_config": {
                "learning_rate": self.learing_rate,
                "epochs": self.epochs,
                "batch_size": self.batch_size,
            },
            "is_fitted": self.is_fitted,
        }

        torch.save(model_data, filepath)
        print(f"Model saved to {filepath}")


    def load_model(self, filepath: str):
        """Load trained model"""
        model_data = torch.load(filepath, map_location=self.device)

        # Update configs
        config = model_data["model_config"]
        self.hidden_dim = config["hidden_dim"]
        self.n_layers = config["n_layers"]
        self.dropout = config["dropout"]
        self.lstm = config["lstm"]
        self.bidirectional = config["bidirectional"]

        train_config = model_data["training_config"]
        self.learning_rate = train_config["learning_rate"]
        self.batch_size = train_config["batch_size"]

        # Recreate model
        self.model = RNN(
            hidden_size=self.hidden_dim,
            num_layers=self.n_layers,
            dropout=self.dropout,
            bidirectional=self.bidirectional,
            lstm=self.lstm
        ).to(self.device)


        # Load state
        self.model.load_state_dict(model_data["model_state_dict"])
        self.is_fitted = model_data["is_fitted"]

        print(f"Model loaded from {filepath}")

# Crear modelo

In [None]:
# Mount Google Drive at /content/drive (Colab only).
# `drive` is also imported in the first cell of the notebook.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Corpus location on the mounted Drive; it is read line by line, one sentence per line.
file_path = "/content/drive/MyDrive/Cs. de Datos/Aprendizaje Automático/Aprendizaje Automático II/TP implementativo/es_419_validas.txt"

In [None]:
# Build a 2-layer bidirectional LSTM tagger; the constructor also reads up to
# 10000 sentences from the corpus and holds out 20% of them for validation.
rnn_model = RNNPuntCapitalModel(
    file_path = file_path,
    n_max_oraciones=10000,
    batch_size=32,
    test_size=0.2,
    device=device,
    hidden_dim=256,
    n_layers=2,
    dropout=0.1,
    bidirectional=True,
    lstm=True,
)

Cargando instancias de entrenamiento


100%|██████████| 8000/8000 [00:01<00:00, 5223.19it/s]


Cargando instancias de test


100%|██████████| 2000/2000 [00:00<00:00, 5038.00it/s]


In [None]:
# Train for 5 epochs with Adam; `metricas` holds the per-epoch losses and macro-F1 scores.
metricas = rnn_model.train(torch.optim.Adam, lr=0.001, epochs=5)

Epoch 1/5 - Train loss: 1.5478
Epoch 1/5 - Val loss: 1.0858
Epoch 1/5 - Punt. inicial F1: 0.5225
Epoch 1/5 - Punt. final F1: 0.4333
Epoch 1/5 - Capitalización F1: 0.5332
-----------------------------------------
Epoch 2/5 - Train loss: 0.9364
Epoch 2/5 - Val loss: 0.9653
Epoch 2/5 - Punt. inicial F1: 0.4941
Epoch 2/5 - Punt. final F1: 0.4634
Epoch 2/5 - Capitalización F1: 0.5211
-----------------------------------------
Epoch 3/5 - Train loss: 1.0881
Epoch 3/5 - Val loss: 1.4736
Epoch 3/5 - Punt. inicial F1: 0.5574
Epoch 3/5 - Punt. final F1: 0.4620
Epoch 3/5 - Capitalización F1: 0.5538
-----------------------------------------
Epoch 4/5 - Train loss: 1.2055
Epoch 4/5 - Val loss: 1.7944
Epoch 4/5 - Punt. inicial F1: 0.6546
Epoch 4/5 - Punt. final F1: 0.4615
Epoch 4/5 - Capitalización F1: 0.6151
-----------------------------------------
Epoch 5/5 - Train loss: 1.3002
Epoch 5/5 - Val loss: 1.0384
Epoch 5/5 - Punt. inicial F1: 0.6532
Epoch 5/5 - Punt. final F1: 0.4617
Epoch 5/5 - Capitali

In [None]:
# Show the raw per-epoch metrics returned by train().
metricas

{'losses': {'train': [5.687512044429779,
   12.646207201004028,
   14.59199898147583,
   22.245486515045165,
   23.385197284698485],
  'val': [13.299078805106026,
   10.735164967794267,
   19.21682213980054,
   29.027669997442338,
   24.348436491830007]},
 'f1s_macro': {'punt_incial': [0.6234265045756511,
   0.6677124714501377,
   0.5196511929198409,
   0.639997571958326,
   0.6711484404281014],
  'punt_final': [0.2279048309497133,
   0.22829269897673807,
   0.22829269897673807,
   0.22829269897673807,
   0.24539606962265656],
  'capitalizacion': [0.40566929168212296,
   0.4070712631587785,
   0.39057531593235534,
   0.1949700065631227,
   0.5413095946494535]}}

In [None]:
# Sanity check on a lowercase, unpunctuated sentence.
rnn_model.predict("hola cómo estás")

'hola ¿cómo. estás'