In [1]:
# Install necessary libraries (run in the notebook environment)

import re
import random
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer
from sklearn.metrics import f1_score
import pandas as pd


  from .autonotebook import tqdm as notebook_tqdm


# Pre procesamiento

In [2]:
import re
import random
from typing import List, Optional
import pandas as pd
from transformers import BertTokenizer

# 1) Loader function
def read_raw_sentences(
    path: str,
    n_max_sentences: Optional[int] = None,
    shuffle: bool = False,
    random_seed: int = 0,
    report_every: int = 100_000
) -> List[str]:
    """
    Reads up to `n_max_sentences` lines from `path` (one sentence per line).
    Returns a list of the raw lines (with trailing newlines stripped).
    """
    sentences: List[str] = []
    with open(path, "r", encoding="utf-8") as f:
        if n_max_sentences is not None:
            for i in range(n_max_sentences):
                line = f.readline()
                if not line:
                    break
                sentences.append(line.rstrip("\n"))
                if (i + 1) % report_every == 0:
                    print(f"… loaded {i+1} sentences")
        else:
            for i, line in enumerate(f, start=1):
                sentences.append(line.rstrip("\n"))
                if i % report_every == 0:
                    print(f"… loaded {i} sentences")
    print(f"Done loading: {len(sentences)} sentences")
    if shuffle:
        random.seed(random_seed)
        random.shuffle(sentences)
        print("Shuffled sentences")
    return sentences

# 2) Your label extractor & pattern
pattern = re.compile(r"\w+|[^\w\s]", flags=re.UNICODE)

def extract_labels(sentence: str):
    tokens = pattern.findall(sentence)
    words, init_labels, final_labels, cap_labels = [], [], [], []
    for i, token in enumerate(tokens):
        if re.match(r"\w+", token, flags=re.UNICODE):
            # initial punctuation
            init = '¿' if i>0 and tokens[i-1]=='¿' else ''
            # final punctuation
            final = tokens[i+1] if i < len(tokens)-1 and tokens[i+1] in {'.',',','?'} else ''
            # capitalization
            if token.isupper():
                cap = 3
            elif token[0].isupper() and token[1:].islower():
                cap = 1
            elif token.islower():
                cap = 0
            else:
                cap = 2
            words.append(token)
            init_labels.append(init)
            final_labels.append(final)
            cap_labels.append(cap)
    return words, init_labels, final_labels, cap_labels

# 3) Load sentences from file
path = "es_419_validas.txt"
raw_sentences = read_raw_sentences(
    path=path,
    n_max_sentences=500000,   # or None to load all
    shuffle=True,
    random_seed=42,
    report_every=500000
)

# 4) Tokenize+label into DataFrame
tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased")
data = []
for inst_id, sentence in enumerate(raw_sentences, start=1):
    words, init_lbls, final_lbls, cap_lbls = extract_labels(sentence)
    token_idx = 0
    for word, init_lbl, final_lbl, cap_lbl in zip(words, init_lbls, final_lbls, cap_lbls):
        subtokens = tokenizer.tokenize(word.lower())
        for i, sub in enumerate(subtokens):
            # initial only on first subtoken
            punct_init = init_lbl if i == 0 else ''
            # final only on last subtoken
            punct_final = final_lbl if i == len(subtokens)-1 else ''
            data.append([
                inst_id,
                token_idx,
                sub,
                punct_init,
                punct_final,
                cap_lbl
            ])
            token_idx += 1
    if inst_id % 500_000 == 0:
        print(f"… processed {inst_id} sentences, {len(data)} tokens so far")

df = pd.DataFrame(
    data,
    columns=["inst_id", "token_id", "token", "punt_inicial", "punt_final", "capitalizacion"]
)
print(f"Final: {df.shape[0]} tokens from {inst_id} sentences")
print(df.head())


… loaded 500000 sentences
Done loading: 500000 sentences
Shuffled sentences
… processed 500000 sentences, 4096405 tokens so far
Final: 4096405 tokens from 500000 sentences
   inst_id  token_id  token punt_inicial punt_final  capitalizacion
0        1         0    qui                                       1
1        1         1  ##ero                                       1
2        1         2  vivir                                       0
3        2         0     es                                       1
4        2         1    una                                       0


In [3]:
# Convert token strings to BERT token IDs
df["token_id_bert"] = tokenizer.convert_tokens_to_ids(df["token"].tolist())

# Group by instance to form sequences
grouped = {}
for inst_id, group in df.groupby("inst_id"):
    grouped[inst_id] = {
        "input_ids": group["token_id_bert"].tolist(),
        "init_labels": [0 if lbl=='' else 1 for lbl in group["punt_inicial"]],
        "final_labels": [0 if lbl=='' else (1 if lbl=='.' else (2 if lbl=='?' else 3))
                         for lbl in group["punt_final"]],
        "cap_labels": group["capitalizacion"].tolist(),
        "tokens": group["token"].tolist()
    }

# Create a list of instances for splitting
instances = list(grouped.values())
random.shuffle(instances)
n = len(instances)
train_split = int(0.8 * n)
val_split = int(0.9 * n)
train_data = instances[:train_split]
val_data   = instances[train_split:val_split]
test_data  = instances[val_split:]


In [5]:
from torch.nn.utils.rnn import pad_sequence

class PunctCapitalDataset(Dataset):
    def __init__(self, instances):
        self.instances = instances
    def __len__(self):
        return len(self.instances)
    def __getitem__(self, idx):
        inst = self.instances[idx]
        return (
            torch.tensor(inst["input_ids"], dtype=torch.long),
            torch.tensor(inst["init_labels"], dtype=torch.long),
            torch.tensor(inst["final_labels"], dtype=torch.long),
            torch.tensor(inst["cap_labels"], dtype=torch.long)
        )

def collate_fn(batch):
    input_ids, init_labs, final_labs, cap_labs = zip(*batch)
    input_ids = pad_sequence(input_ids, batch_first=True, padding_value=0)
    init_labs  = pad_sequence(init_labs,  batch_first=True, padding_value=-100)
    final_labs = pad_sequence(final_labs, batch_first=True, padding_value=-100)
    cap_labs   = pad_sequence(cap_labs,   batch_first=True, padding_value=-100)
    return input_ids, init_labs, final_labs, cap_labs

train_loader = DataLoader(PunctCapitalDataset(train_data), batch_size=128, shuffle=True, collate_fn=collate_fn)
val_loader   = DataLoader(PunctCapitalDataset(val_data), batch_size=128, shuffle=False, collate_fn=collate_fn)


# Modelo

In [6]:
import torch.nn as nn

class JointPunctCapitalModel(nn.Module):
    def __init__(
        self,
        vocab_size: int,
        embed_dim: int,
        hidden_dim: int,
        num_init: int,
        num_final: int,
        num_cap: int,
        n_layers: int = 1,
        dropout: float = 0.3
    ):
        super().__init__()
        # Embedding + input dropout
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.input_dropout = nn.Dropout(dropout)

        # BiLSTM with inter-layer dropout (only applies if n_layers > 1)
        self.bilstm = nn.LSTM(
            input_size=embed_dim,
            hidden_size=hidden_dim // 2,
            num_layers=n_layers,
            dropout=dropout if n_layers > 1 else 0.0,
            batch_first=True,
            bidirectional=True
        )

        # Output dropout before heads
        self.output_dropout = nn.Dropout(dropout)

        # Three classification heads
        self.init_head  = nn.Linear(hidden_dim, num_init)
        self.final_head = nn.Linear(hidden_dim, num_final)
        self.cap_head   = nn.Linear(hidden_dim, num_cap)

    def forward(self, x):
        # x: [B, T]
        emb = self.embedding(x)          # [B, T, E]
        emb = self.input_dropout(emb)    # dropout on embeddings

        out, _ = self.bilstm(emb)        # [B, T, H]
        out = self.output_dropout(out)   # dropout on LSTM outputs

        init_logits  = self.init_head(out)    # [B, T, num_init]
        final_logits = self.final_head(out)   # [B, T, num_final]
        cap_logits   = self.cap_head(out)     # [B, T, num_cap]
        return init_logits, final_logits, cap_logits

# Example instantiation:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
vocab_size = tokenizer.vocab_size
model = JointPunctCapitalModel(
    vocab_size=vocab_size,
    embed_dim=128,
    hidden_dim=256,
    num_init=2,
    num_final=4,
    num_cap=4,
    n_layers=2,
    dropout=0.1
).to(device)


# Entrenamiento y evaluacion

In [7]:
from sklearn.metrics import f1_score, classification_report

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss(ignore_index=-100)

for epoch in range(1, 1+3):  # e.g. 5 epochs
    model.train()
    running_loss = 0.0
    n_batches = 0
    for input_ids, init_labs, final_labs, cap_labs in train_loader:
        input_ids = input_ids.to(device)
        init_labs  = init_labs.to(device)
        final_labs = final_labs.to(device)
        cap_labs   = cap_labs.to(device)

        optimizer.zero_grad()
        init_logits, final_logits, cap_logits = model(input_ids)

        loss_init  = criterion(init_logits.view(-1, 2),  init_labs.view(-1))
        loss_final = criterion(final_logits.view(-1, 4), final_labs.view(-1))
        loss_cap   = criterion(cap_logits.view(-1, 4),   cap_labs.view(-1))
        loss = loss_init + loss_final + loss_cap
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        n_batches += 1

    avg_train_loss = running_loss / n_batches
    print(f"Epoch {epoch} — Train loss: {avg_train_loss:.4f}")

    # --- Validation ---
    model.eval()
    val_loss = 0.0
    n_val_batches = 0

    all_init_trues,  all_init_preds  = [], []
    all_final_trues, all_final_preds = [], []
    all_cap_trues,   all_cap_preds   = [], []

    with torch.no_grad():
        for input_ids, init_labs, final_labs, cap_labs in val_loader:
            input_ids = input_ids.to(device)
            init_labs  = init_labs.to(device)
            final_labs = final_labs.to(device)
            cap_labs   = cap_labs.to(device)

            init_logits, final_logits, cap_logits = model(input_ids)

            # compute val loss
            loss_init  = criterion(init_logits.view(-1, 2),  init_labs.view(-1))
            loss_final = criterion(final_logits.view(-1, 4), final_labs.view(-1))
            loss_cap   = criterion(cap_logits.view(-1, 4),   cap_labs.view(-1))
            loss = loss_init + loss_final + loss_cap
            val_loss += loss.item()
            n_val_batches += 1

            # get predictions
            init_preds  = init_logits.argmax(dim=-1)
            final_preds = final_logits.argmax(dim=-1)
            cap_preds   = cap_logits.argmax(dim=-1)

            # mask out padding (-100)
            mask_init  = (init_labs.view(-1)  != -100)
            mask_final = (final_labs.view(-1) != -100)
            mask_cap   = (cap_labs.view(-1)   != -100)

            all_init_trues.extend(init_labs.view(-1)[mask_init].cpu().tolist())
            all_init_preds.extend(init_preds.view(-1)[mask_init].cpu().tolist())
            all_final_trues.extend(final_labs.view(-1)[mask_final].cpu().tolist())
            all_final_preds.extend(final_preds.view(-1)[mask_final].cpu().tolist())
            all_cap_trues.extend(cap_labs.view(-1)[mask_cap].cpu().tolist())
            all_cap_preds.extend(cap_preds.view(-1)[mask_cap].cpu().tolist())

    avg_val_loss = val_loss / n_val_batches
    print(f"Epoch {epoch} — Val loss:   {avg_val_loss:.4f}")

    # Compute macro-F1
    f1_init_macro  = f1_score(all_init_trues,  all_init_preds,  average='macro', zero_division=0)
    f1_final_macro = f1_score(all_final_trues, all_final_preds, average='macro', zero_division=0)
    f1_cap_macro   = f1_score(all_cap_trues,   all_cap_preds,   average='macro', zero_division=0)
    print(f"Epoch {epoch} — F1 (macro): init={f1_init_macro:.3f}, final={f1_final_macro:.3f}, cap={f1_cap_macro:.3f}")

    # Per-class F1 reports
    print("\nInitial punctuation per-class F1:")
    print(classification_report(all_init_trues, all_init_preds, labels=[0,1], target_names=['no-¿','¿'], zero_division=0))

    print("Final punctuation per-class F1:")
    print(classification_report(all_final_trues, all_final_preds,
                                labels=[0,1,2,3],
                                target_names=['none','.', '?', ','], zero_division=0))

    print("Capitalization per-class F1:")
    print(classification_report(all_cap_trues, all_cap_preds,
                                labels=[0,1,2,3],
                                target_names=['lower','Initial','Mixed','ALLCAP'], zero_division=0))

    print("-"*60) 


Epoch 1 — Train loss: 0.3600
Epoch 1 — Val loss:   0.2517
Epoch 1 — F1 (macro): init=0.853, final=0.817, cap=0.906

Initial punctuation per-class F1:
              precision    recall  f1-score   support

        no-¿       0.99      1.00      0.99    400149
           ¿       0.85      0.61      0.71      9561

    accuracy                           0.99    409710
   macro avg       0.92      0.80      0.85    409710
weighted avg       0.99      0.99      0.99    409710

Final punctuation per-class F1:
              precision    recall  f1-score   support

        none       0.98      0.98      0.98    344383
           .       0.84      0.96      0.90     37987
           ?       0.85      0.64      0.73      9719
           ,       0.77      0.58      0.66     17621

    accuracy                           0.95    409710
   macro avg       0.86      0.79      0.82    409710
weighted avg       0.95      0.95      0.95    409710

Capitalization per-class F1:
              precision    

# Inferencia

In [8]:
from sklearn.metrics import f1_score

model.eval()
output_rows = []

# For metric accumulation
all_init_trues,  all_init_preds  = [], [] 
all_final_trues, all_final_preds = [], []
all_cap_trues,   all_cap_preds   = [], []

idx_map_init    = {0:'', 1:'¿'}
idx_map_final   = {0:'', 1:'.', 2:'?', 3:','}

for inst_id, instance in enumerate(test_data):
    # prepare inputs
    input_ids = torch.tensor(instance["input_ids"], dtype=torch.long).unsqueeze(0).to(device)
    with torch.no_grad():
        init_logits, final_logits, cap_logits = model(input_ids)

    # get token-level preds
    init_pred  = init_logits.argmax(dim=-1).squeeze(0).cpu().tolist()
    final_pred = final_logits.argmax(dim=-1).squeeze(0).cpu().tolist()
    cap_pred   = cap_logits.argmax(dim=-1).squeeze(0).cpu().tolist()

    # retrieve true labels
    init_true  = instance["init_labels"]
    final_true = instance["final_labels"]
    cap_true   = instance["cap_labels"]
    tokens     = instance["tokens"]

    # sanity check
    assert len(init_pred)==len(init_true)==len(tokens)

    # accumulate and record
    for token_idx, token in enumerate(tokens):
        # append to CSV rows
        output_rows.append({
            "instancia_id": inst_id,
            "token_id":     token_idx,
            "token":        token,
            "punt_inicial": idx_map_init[init_pred[token_idx]],
            "punt_final":   idx_map_final[final_pred[token_idx]],
            "capitalizacion": cap_pred[token_idx]
        })
        # accumulate for metrics
        all_init_trues.append(init_true[token_idx])
        all_init_preds.append(init_pred[token_idx])
        all_final_trues.append(final_true[token_idx])
        all_final_preds.append(final_pred[token_idx])
        all_cap_trues.append(cap_true[token_idx])
        all_cap_preds.append(cap_pred[token_idx])

# build and save DataFrame
output_df = pd.DataFrame(output_rows)
output_df.to_csv("predictions.csv", index=False)
print("Wrote predictions.csv")

# compute and print macro-F1 for each task
f1_init  = f1_score(all_init_trues,  all_init_preds,  average="macro", zero_division=0)
f1_final = f1_score(all_final_trues, all_final_preds, average="macro", zero_division=0)
f1_cap   = f1_score(all_cap_trues,   all_cap_preds,   average="macro", zero_division=0)

print(f"Test set performance:")
print(f"  • Initial punctuation F1-macro: {f1_init:.4f}")
print(f"  • Final punctuation   F1-macro: {f1_final:.4f}")
print(f"  • Capitalization      F1-macro: {f1_cap:.4f}")


Wrote predictions.csv
Test set performance:
  • Initial punctuation F1-macro: 0.8626
  • Final punctuation   F1-macro: 0.8390
  • Capitalization      F1-macro: 0.9332


# Inferencia manual

In [None]:
from typing import List, Tuple, Dict
import torch
import pandas as pd
from transformers import BertTokenizer
import torch.nn as nn


def reconstruct_sentence_with_tokenizer(
    raw_sentence: str,
    model: nn.Module,
    tokenizer: BertTokenizer,
    device: torch.device,
    idx_map_init: Dict[int, str] = {0: '', 1: '¿'},
    idx_map_final: Dict[int, str] = {0: '', 1: '.', 2: '?', 3: ','}
) -> Tuple[str, List[Dict]]:
    """
    Runs the model on a single raw sentence and returns:
    1. The reconstructed sentence (with punctuation & capitalization)
    2. A list of predictions per word (token, init, final, cap labels)
    Tokenization replicates the same method used in training (manual subtokens).
    """
    words = raw_sentence.strip().split()
    all_subtokens = []
    subtoken_to_word = []

    for word_idx, word in enumerate(words):
        subtokens = tokenizer.tokenize(word.lower())
        all_subtokens.extend(subtokens)
        subtoken_to_word.extend([word_idx] * len(subtokens))

    input_ids = tokenizer.convert_tokens_to_ids(all_subtokens)
    input_tensor = torch.tensor([input_ids]).to(device)

    model.eval()
    with torch.no_grad():
        init_logits, final_logits, cap_logits = model(input_tensor)

    init_pred  = init_logits.argmax(dim=-1).squeeze(0).cpu().tolist()
    final_pred = final_logits.argmax(dim=-1).squeeze(0).cpu().tolist()
    cap_pred   = cap_logits.argmax(dim=-1).squeeze(0).cpu().tolist()

    predictions = []
    reconstructed_words = []
    word_pieces = []
    cur_word_idx = -1

    for i, (subtoken, wid) in enumerate(zip(all_subtokens, subtoken_to_word)):
        token_clean = subtoken.replace("##", "")

        if wid != cur_word_idx:
            # flush previous word
            if word_pieces:
                word_str = "".join(word_pieces)
                # Capitalization
                if cur_cap == 3:
                    word_str = word_str.upper()
                elif cur_cap == 1:
                    word_str = word_str.capitalize()
                elif cur_cap == 2:
                    word_str = word_str[0].upper() + word_str[1:]
                # Punctuation
                word_str = cur_init + word_str + idx_map_final[cur_final]
                reconstructed_words.append(word_str)
                predictions.append({
                    "token": word_str,
                    "punt_inicial": cur_init,
                    "punt_final": idx_map_final[cur_final],
                    "capitalizacion": cur_cap
                })

            # reset
            word_pieces = [token_clean]
            cur_word_idx = wid
            cur_init = idx_map_init[init_pred[i]]
            cur_final = final_pred[i]
            cur_cap = cap_pred[i]
        else:
            word_pieces.append(token_clean)
            cur_final = final_pred[i]  # update with latest subtoken prediction

    # flush last word
    if word_pieces:
        word_str = "".join(word_pieces)
        if cur_cap == 3:
            word_str = word_str.upper()
        elif cur_cap == 1:
            word_str = word_str.capitalize()
        elif cur_cap == 2:
            word_str = word_str[0].upper() + word_str[1:]
        word_str = cur_init + word_str + idx_map_final[cur_final]
        reconstructed_words.append(word_str)
        predictions.append({
            "token": word_str,
            "punt_inicial": cur_init,
            "punt_final": idx_map_final[cur_final],
            "capitalizacion": cur_cap
        })

    return " ".join(reconstructed_words), predictions


# Example usage
raw = "estara despierto"
tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased")
recon, preds = reconstruct_sentence_with_tokenizer(
    raw_sentence=raw,
    model=model,
    tokenizer=tokenizer,
    device=device
)

print("Input:", raw)
print("Reconstructed:", recon)
print("Predictions:")
for p in preds:
    print(p)


In [None]:
torch.save(model.state_dict(), "model_weights_bidireccional.pth")


In [2]:
torch.cuda.empty_cache()

NameError: name 'torch' is not defined