# Predicción de Puntuación y Capitalización en Texto Normalizado

In [1]:
!pip install transformers
from transformers import BertTokenizer, BertModel
import torch
import re
import pandas as pd
import random
from datasets import load_dataset
from torch import nn

RANDOM_SEED = 0



In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cpu')

In [3]:
bert_model_name = "bert-base-multilingual-cased"
tokenizer = BertTokenizer.from_pretrained(bert_model_name)
bert_model = BertModel.from_pretrained(bert_model_name)

def get_multilingual_token_embedding(token: str):
  """
  Devuelve el embedding (estático) para el token.
  """
  token_id = tokenizer.convert_tokens_to_ids(token)
  if token_id is None or token_id == tokenizer.unk_token_id:
    print(f"❌ El token '{token}' no pertenece al vocabulario de multilingual BERT.")
    return None

  embedding_vector = bert_model.embeddings.word_embeddings.weight[token_id]

  print(f"✅ Token: '{token}' | ID: {token_id}")
  print(f"Embedding shape: {embedding_vector.shape}")
  return embedding_vector

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

In [4]:
def capitalizacion_de_palabra(palabra: str) -> int:
    if palabra.islower(): return 0
    elif palabra.istitle(): return 1
    elif palabra.isupper(): return 3
    else: return 2

def tokenizar_con_etiquetas(oracion: str, instancia_id: int) -> pd.DataFrame:
    tokens_reales = re.findall(r"\w+['’]?\w*|¿|\?|,|\.|!|¡", oracion)
    dataset = []
    i = 0
    while i < len(tokens_reales):

      if tokens_reales[i] == '¿':
        i += 1
        continue
      else:
        if tokens_reales[i-1] == '¿':
          punt_inicial = tokens_reales[i-1]
        else:
          punt_inicial = ""

      if tokens_reales[i] in ['.', ',', '?']:
        punt_final = tokens_reales[i]
        dataset[-1]["punt_final"] = punt_final
        i += 1
        continue

      tokens = tokenizer.tokenize(tokens_reales[i].lower())
      for j, token in enumerate(tokens):
        token_id = tokenizer.convert_tokens_to_ids(token)
        dataset.append({
          "instancia_id": instancia_id,
          "token_id": token_id,
          "token": token,
          "punt_inicial": punt_inicial if j == 0 else "",
          "punt_final": "",
          "capitalización": capitalizacion_de_palabra(tokens_reales[i])
        })

      i += 1

    return pd.DataFrame(dataset)

In [5]:
def cargar_dataset(path: str,
                   n_max_oraciones: int | None = None,
                   n_max_tokens: int | None = None,
                   fraccion: float | None = None,
                   shuffle: bool = False,
                   random_seed: int = 0) -> pd.DataFrame:

  with open(path, "r", encoding='utf-8') as file:
    if n_max_oraciones is not None:
      oraciones = [file.readline() for _ in range(n_max_oraciones)]
    else:
      oraciones = file.readlines()

  if shuffle:
    random.seed(random_seed)
    random.shuffle(oraciones)

  if n_max_tokens is not None:
    n_tokens = n_max_tokens
  elif fraccion is not None:
    n_tokens = int(fraccion * len(oraciones))
  else:
    n_tokens = len(oraciones)

  dataset = pd.DataFrame()
  if n_tokens == 0 or n_max_oraciones == 0:
    return dataset

  instancia_id = 0
  while (
      instancia_id < n_max_oraciones if n_max_oraciones is not None
      else len(dataset) < n_tokens
  ):
    dataset = pd.concat([
        dataset, tokenizar_con_etiquetas(oraciones[instancia_id], instancia_id + 1)
    ])
    instancia_id += 1

  return dataset

In [6]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [41]:
data_path = "/content/drive/MyDrive/Cs. de Datos/Aprendizaje Automático/Aprendizaje Automático II/TP implementativo/es_419_validas.txt"
dataset = cargar_dataset(data_path, n_max_oraciones=1000, random_seed=RANDOM_SEED)
dataset

Unnamed: 0,instancia_id,token_id,token,punt_inicial,punt_final,capitalización
0,1,10361,te,,,1
1,1,48535,mostrar,,,0
2,1,10333,##é,,,0
3,1,10182,los,,,0
4,1,27870,resultados,,.,0
...,...,...,...,...,...,...
0,1000,10189,that,,,2
1,1000,112,',,,2
2,1000,187,s,,,2
3,1000,13448,right,,",",0


In [162]:
dataset['punt_final'].value_counts()

Unnamed: 0_level_0,count
punt_final,Unnamed: 1_level_1
,7182
.,783
",",393
?,187


In [116]:
from torch.utils.data import Dataset

class CapitalizacionDataset(Dataset):
  def __init__(self, dataset: pd.DataFrame):
    self.dataset = dataset

  def __len__(self):
    return len(self.dataset)

  def __getitem__(self, instancia_id: int):
    palabras_oracion = self.dataset[self.dataset["instancia_id"] == instancia_id + 1]

    input_indexes = palabras_oracion["token_id"].tolist()
    target_indexes = palabras_oracion["capitalización"].tolist()

    return (torch.tensor(input_indexes, dtype=torch.long),
            torch.tensor(target_indexes, dtype=torch.long))



train_dataset = CapitalizacionDataset(dataset)

In [175]:
from torch.nn.utils.rnn import pad_sequence

PAD_token = 0
PAD_token_target = 4
MAX_LEN = dataset["instancia_id"].value_counts().max()

def collate_fn(batch):
    """
    batch: lista de tuplas (tokens_tensor, target_tensor)
    MAX_LEN: longitud fija a la que se debe paddear o truncar
    PAD_token: ID usado para el padding
    """
    tokens, targets = [], []
    for token_tensor, target_tensor in batch:
        tokens.append(token_tensor)
        targets.append(target_tensor)

    # Primero paddeamos hasta el más largo del batch
    tokens_padded = pad_sequence(tokens, batch_first=True, padding_value=PAD_token)
    targets_padded = pad_sequence(targets, batch_first=True, padding_value=PAD_token_target)

    # Luego truncamos o paddeamos a longitud fija MAX_LEN
    def pad_or_truncate(tensor, max_len, pad_token):
        if tensor.size(1) > max_len:
            return tensor[:, :max_len]
        elif tensor.size(1) < max_len:
            pad_size = max_len - tensor.size(1)
            padding = torch.full((tensor.size(0), pad_size), pad_token, dtype=torch.long)
            return torch.cat([tensor, padding], dim=1)
        else:
            return tensor

    tokens_fixed = pad_or_truncate(tokens_padded, MAX_LEN, PAD_token)
    targets_fixed = pad_or_truncate(targets_padded, MAX_LEN, PAD_token_target)

    return tokens_fixed, targets_fixed

In [176]:
train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=2, collate_fn=collate_fn)

In [177]:
for batch_idx, (input_seqs, target_seqs) in enumerate(train_dataloader):
  print(input_seqs)
  print(target_seqs)
  break

tensor([[10361, 48535, 10333, 10182, 27870,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0],
        [10911, 22889, 85941, 10183, 10182, 59432,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0]])
tensor([[1, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
         4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,

In [None]:
def evaluate_model(model, dataloader, criterion):
    """Evalúa el modelo en el conjunto de validación"""
    model.eval()
    total_loss = 0

    with torch.no_grad():
        for input_seq, target_seq in dataloader:
            output = model(input_seq=input_seq, max_len=MAX_LEN)

            output = output.reshape(-1, output.size(-1))
            target_seq = target_seq.reshape(-1)

            loss = criterion(output, target_seq)
            total_loss += loss.item()

    return total_loss / len(dataloader)


def train_epoch(model, dataloader, optimizer, criterion):
    """Entrena el modelo en un epoch"""
    model.train()
    total_loss = 0

    for batch_idx, (input_seq, target_seq) in enumerate(dataloader):
        optimizer.zero_grad()
        # Forward pass
        output = model(input_seq, target_seq, MAX_LEN, teacher_forcing_ratio=0.5)
        # Calcular pérdida
        # Reshape para calcular cross entropy
        output = output.reshape(-1, output.size(-1))
        target_seq = target_seq.reshape(-1)

        loss = criterion(output, target_seq)

        # Backward pass
        loss.backward()

        # Gradient clipping para evitar exploding gradients
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

        optimizer.step()
        total_loss += loss.item()

    return total_loss / len(dataloader)