In [1]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForTokenClassification, BertTokenizerFast
from torch.optim import AdamW
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder


In [2]:
MODEL_NAME = 'google-bert/bert-base-multilingual-cased'
tokenizer = BertTokenizerFast.from_pretrained(MODEL_NAME)
label_encoder = LabelEncoder()
labels = [
    "O",  # Para tokens que no son parte de ninguna entidad nombrada
    "B-invoice_id", "I-invoice_id",
    "B-issue_date", "I-issue_date",
    "B-due_date", "I-due_date",
    "B-issuer_name", "I-issuer_name",
    "B-issuer_address", "I-issuer_address",
    "B-issuer_phone",
    "B-issuer_email",
    "B-issuer_tax_id",
    "B-recipient_name", "I-recipient_name",
    "B-recipient_address", "I-recipient_address",
    "B-recipient_phone",
    "B-recipient_email",
    "B-recipient_tax_id",
    "B-item_description", "I-item_description",
    "B-item_quantity",
    "B-item_unit_price",
    "B-item_total",
    "B-subtotal",
    "B-tax_description", "I-tax_description",
    "B-tax_percentage",
    "B-tax_amount",
    "B-total",
    "B-payment_method",
    "UNK"
]

label_encoder.fit(labels)



In [3]:
def align_tokens_and_labels(text,tags):
    encoded_input = tokenizer(text, is_split_into_words=True, padding="max_length", truncation=True, max_length=512)
    encoded_as_text = tokenizer.convert_ids_to_tokens(encoded_input["input_ids"])

    word_ids = encoded_input.word_ids()

    labels = []
    for i in range(len(word_ids)):
        if word_ids[i] is None:
            labels.append('UNK')
        else:
            labels.append(tags[word_ids[i]])
    
    labels = label_encoder.transform(labels)
    
    return {
        "encoded_input": encoded_input, 
        "encoded_labels": labels,
    }



class InvoiceDataset(Dataset):
    def __init__(self, texts, tags, max_len=512):
        self.texts = texts
        self.tags = tags
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        tags = self.tags[idx]


        # Tokenización y alineación de etiquetas
        result = align_tokens_and_labels(text, tags)
        encoding = result["encoded_input"]
        labels = result["encoded_labels"]

        return {
            'input_ids': torch.tensor(encoding['input_ids'], dtype=torch.long),
            'attention_mask': torch.tensor(encoding['attention_mask'], dtype=torch.long),
            'labels': torch.tensor(labels, dtype=torch.long)
        }

In [4]:


texts = []  # Lista de textos de factura
tags = []  # Lista de etiquetas (cada etiqueta es una lista de ids de etiquetas)

def load_tags(file_path):
    tags = []
    with open(file_path, 'r') as file:
        for line in file:
            # Dividir la línea por ' -> ' y tomar el segundo elemento, que es la etiqueta
            parts = line.strip().split(' -> ')
            if len(parts) > 1:
                tags.append(parts[1])  # Agrega la etiqueta a la lista
    return tags

def load_text(file_path):
    texts = []
    with open(file_path, 'r') as file:
        for line in file:
            # Dividir la línea por ' -> ' y tomar el segundo elemento, que es la etiqueta
            parts = line.strip().split(' -> ')
            if len(parts) > 1:
                texts.append(parts[0])  # Agrega la etiqueta a la lista
    return texts

tags = []
for i in range(10):  # Ajusta el rango según la cantidad de facturas
    tags.append(load_tags(f'facturas/factura{i}.tokens'))
    texts.append(load_text(f'facturas/factura{i}.tokens'))

# Crear el dataset y dataloader
dataset = InvoiceDataset(texts, tags)
loader = DataLoader(dataset, batch_size=4, shuffle=True)

print(tags[0])
print(label_encoder.transform(tags[0]))




['O', 'B-invoice_id', 'O', 'O', 'O', 'O', 'B-issue_date', 'O', 'O', 'O', 'O', 'B-due_date', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-issuer_name', 'I-issuer_name', 'B-recipient_name', 'I-recipient_name', 'B-issuer_address', 'I-issuer_address', 'I-issuer_address', 'I-issuer_address', 'I-issuer_address', 'I-issuer_address', 'I-issuer_address', 'I-issuer_address', 'I-issuer_address', 'B-recipient_address', 'I-recipient_address', 'I-recipient_address', 'I-recipient_address', 'I-recipient_address', 'I-recipient_address', 'B-issuer_phone', 'B-recipient_phone', 'B-issuer_email', 'B-recipient_email', 'B-issuer_tax_id', 'B-recipient_tax_id', 'O', 'O', 'O', 'O', 'O', 'B-item_description', 'I-item_description', 'I-item_description', 'B-item_quantity', 'B-item_unit_price', 'B-item_total', 'B-item_description', 'I-item_description', 'I-item_description', 'B-item_quantity', 'B-item_unit_price', 'B-item_total', 'O', 'O', 'B-subtotal', 'B-tax_description', 'B-tax_percentage', 'O', 'B-tax_amount', 'O

In [5]:
#Probar a tokenizar un texto para ver su longitud tokenizada

print(dataset[0]['input_ids'].tolist())
print(dataset[0]['labels'].tolist())

ids = tokenizer.convert_ids_to_tokens(dataset[0]['input_ids'])

for i in range(len(ids)):
    print(ids[i], label_encoder.inverse_transform([dataset[0]['labels'][i].item()]))

[101, 85245, 11465, 143, 32792, 22650, 11396, 11305, 20187, 12964, 10104, 83054, 131, 22171, 11011, 118, 10814, 118, 10907, 20187, 12964, 10104, 26044, 43053, 131, 22171, 11011, 118, 10831, 118, 10814, 84387, 10127, 11289, 105580, 131, 84387, 10127, 97200, 54973, 10667, 131, 73230, 15129, 26580, 43707, 13820, 27173, 67574, 15377, 34528, 138, 14971, 119, 56114, 11830, 10900, 117, 80821, 49517, 20305, 11383, 11396, 13759, 90847, 25675, 23044, 117, 25067, 10878, 89478, 70072, 118, 56267, 118, 40633, 10929, 10686, 21069, 49469, 11396, 17449, 118, 33195, 118, 69717, 118, 29718, 11305, 29467, 10874, 40762, 19029, 137, 77436, 119, 26978, 20169, 10237, 76977, 137, 29698, 85505, 119, 10212, 193, 38850, 10929, 103450, 13695, 11703, 99555, 10884, 39999, 10162, 32168, 11166, 10858, 12387, 11373, 68430, 13810, 51991, 14820, 26680, 10162, 35248, 13584, 19919, 16780, 25220, 11639, 118, 30798, 47543, 10870, 11356, 118, 13596, 29277, 10107, 122, 12074, 119, 12535, 12074, 119, 12535, 79326, 25470, 10407

) ['B-tax_percentage']
: ['O']
22 ['B-tax_amount']
. ['B-tax_amount']
27 ['B-tax_amount']
Total ['O']
: ['O']
161 ['B-total']
. ['B-total']
46 ['B-total']
M ['O']
##ét ['O']
##odo ['O']
de ['O']
pago ['O']
: ['O']
Credit ['B-payment_method']
Card ['B-payment_method']
[SEP] ['UNK']
[PAD] ['UNK']
[PAD] ['UNK']
[PAD] ['UNK']
[PAD] ['UNK']
[PAD] ['UNK']
[PAD] ['UNK']
[PAD] ['UNK']
[PAD] ['UNK']
[PAD] ['UNK']
[PAD] ['UNK']
[PAD] ['UNK']
[PAD] ['UNK']
[PAD] ['UNK']
[PAD] ['UNK']
[PAD] ['UNK']
[PAD] ['UNK']
[PAD] ['UNK']
[PAD] ['UNK']
[PAD] ['UNK']
[PAD] ['UNK']
[PAD] ['UNK']
[PAD] ['UNK']
[PAD] ['UNK']
[PAD] ['UNK']
[PAD] ['UNK']
[PAD] ['UNK']
[PAD] ['UNK']
[PAD] ['UNK']
[PAD] ['UNK']
[PAD] ['UNK']
[PAD] ['UNK']
[PAD] ['UNK']
[PAD] ['UNK']
[PAD] ['UNK']
[PAD] ['UNK']
[PAD] ['UNK']
[PAD] ['UNK']
[PAD] ['UNK']
[PAD] ['UNK']
[PAD] ['UNK']
[PAD] ['UNK']
[PAD] ['UNK']
[PAD] ['UNK']
[PAD] ['UNK']
[PAD] ['UNK']
[PAD] ['UNK']
[PAD] ['UNK']
[PAD] ['UNK']
[PAD] ['UNK']
[PAD] ['UNK']
[PAD] ['UNK']
[PAD

In [6]:
# Dividir datos
train_texts, val_texts, train_labels, val_labels = train_test_split(texts, tags, test_size=0.1)

# Crear DataLoaders
train_dataset = InvoiceDataset(train_texts, train_labels)
val_dataset = InvoiceDataset(val_texts, val_labels)
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False)

# Determinar el dispositivo a usar (GPU si está disponible, de lo contrario CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Modelo
model = BertForTokenClassification.from_pretrained(MODEL_NAME, num_labels=len(labels))
model.to(device)  # Mover el modelo a la GPU si está disponible

# Optimizador
optimizer = AdamW(model.parameters(), lr=5e-5)

# Función de entrenamiento
def train(model, dataloader, optimizer):
    model.train()
    total_loss = 0
    for batch in dataloader:
        # Mover los datos al dispositivo correcto
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(dataloader)



Some weights of BertForTokenClassification were not initialized from the model checkpoint at google-bert/bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
if torch.cuda.is_available():
    print("CUDA is available. GPU:", torch.cuda.get_device_name(0))
else:
    print("CUDA is not available.")

CUDA is available. GPU: NVIDIA GeForce RTX 3060


In [8]:
#Debugging


In [9]:
# Entrenamiento
for epoch in range(10000):
    train_loss = train(model, train_loader, optimizer)
    print(f'Epoch {epoch + 1}, Train Loss: {train_loss}')

Epoch 1, Train Loss: 3.2029229402542114
Epoch 2, Train Loss: 1.9260507225990295
Epoch 3, Train Loss: 1.7057088017463684
Epoch 4, Train Loss: 1.6786693930625916
Epoch 5, Train Loss: 1.312235713005066
Epoch 6, Train Loss: 1.0991789102554321
Epoch 7, Train Loss: 0.9233507812023163
Epoch 8, Train Loss: 0.925711452960968
Epoch 9, Train Loss: 0.8094072639942169
Epoch 10, Train Loss: 0.7061797082424164
Epoch 11, Train Loss: 0.5781725645065308
Epoch 12, Train Loss: 0.508179783821106
Epoch 13, Train Loss: 0.4347299784421921
Epoch 14, Train Loss: 0.37733136117458344
Epoch 15, Train Loss: 0.36994677782058716
Epoch 16, Train Loss: 0.3232783377170563
Epoch 17, Train Loss: 0.32145826518535614
Epoch 18, Train Loss: 0.2721906155347824
Epoch 19, Train Loss: 0.2419148087501526
Epoch 20, Train Loss: 0.200696662068367
Epoch 21, Train Loss: 0.21911227703094482
Epoch 22, Train Loss: 0.16867458075284958
Epoch 23, Train Loss: 0.16425105184316635
Epoch 24, Train Loss: 0.14910607039928436
Epoch 25, Train Loss: 

KeyboardInterrupt: 

In [10]:
model.eval()  # Pon el modelo en modo evaluación
model.to(device)  # Asegúrate de que el modelo esté en el dispositivo correcto

BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, e

In [11]:
from transformers import BertTokenizer

# Asumiendo que 'MODEL_NAME' es el nombre del modelo BERT que usaste
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)

text = """
Factura
H748950

Fecha de emisión: 2024-04-05
Fecha de vencimiento: 2024-04-11

Datos del Emisor:
Hall, Howard and Compton

Datos del Receptor:
John Martin

7780 Christine Underpass East Daniel, SD 10665 981 Perez Ports Cheryltown, VA 68274

546-443-6969
mathisnatalieOfitzgerald.com
YzN809qpY325

Cantidad Precio Unitario Total

morph scalable functionalities
harness viral eyeballs
enable one-to-one systems

iterate intuitive ROI

(913)675-2764x8171
tiffanys6O yahoo.com
IKQ896GSX614

4 34.04 136.16
9 88.77 798.93
5 60.5 302.5
10 50.02 500.2

Subtotal: 1737.79
VAT (16%): 278.05
Total: 2015.84

Método de pago: Bank Transfer
"""

encoded_input = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=512)
input_ids = encoded_input['input_ids'].to(device)
attention_mask = encoded_input['attention_mask'].to(device)


In [12]:
with torch.no_grad():  # No necesitas calcular gradientes aquí
    outputs = model(input_ids, attention_mask=attention_mask)
    logits = outputs.logits

In [13]:
import torch.nn.functional as F

# Aplicar softmax para obtener probabilidades
probabilities = F.softmax(logits, dim=-1)
predictions = torch.argmax(probabilities, dim=-1)
predicted_labels = [label_encoder.inverse_transform([label.item()])[0] for label in predictions[0]]

In [31]:
tokens = tokenizer.convert_ids_to_tokens(input_ids[0])


#Compactar la salida tokens y etiquetas

def compact_output(tokens, labels):
    compacted_tokens = []
    compacted_labels = []
    current_token = ''
    current_label = ''
    for token, label in zip(tokens, labels):
        if token.startswith('##'):
            current_token += token[2:]
        else:
            if current_token:
                compacted_tokens.append(current_token)
                compacted_labels.append(current_label)
            current_token = token
            current_label = label
    if current_token:
        compacted_tokens.append(current_token)
        compacted_labels.append(current_label)

    #Eliminat todos los tokens que no son parte de ninguna entidad nombrada
    for i in range(len(compacted_labels)):
        if compacted_labels[i] == 'O':
            compacted_tokens[i] = None
            compacted_labels[i] = None
        if compacted_labels[i] == 'UNK':
            compacted_tokens[i] = None
            compacted_labels[i] = None
    
    compacted_tokens = [token for token in compacted_tokens if token is not None]
    compacted_labels = [label for label in compacted_labels if label is not None]
            
    
    #Si tokens consecutivos tienen el mismo label, se concatenan
    for i in range(1, len(compacted_labels)):
        if compacted_labels[i] == compacted_labels[i - 1]:
            compacted_tokens[i] = compacted_tokens[i - 1] + " " + compacted_tokens[i]
            # Marcar el token anterior para eliminarlo
            compacted_tokens[i - 1] = None
            # Marcar el label anterior para eliminarlo
            compacted_labels[i - 1] = None

    compacted_tokens = [token for token in compacted_tokens if token is not None]
    compacted_labels = [label for label in compacted_labels if label is not None]
    
    return compacted_tokens, compacted_labels



tokens = tokenizer.convert_ids_to_tokens(input_ids[0])
tokens, labels = compact_output(tokens, predicted_labels)
for token, label in zip(tokens, labels):
    print(f'{token} -> {label}')


H748950 -> B-invoice_id
2024-04-05 -> B-issue_date
2024-04-11 -> B-due_date
Hall, -> B-issuer_name
HowardandCompton -> I-issuer_name
John -> B-recipient_name
Martin -> I-recipient_name
7780 -> B-issuer_address
ChristineUnderpassEastDaniel,SD10665 -> I-issuer_address
981 -> B-recipient_address
PerezPortsCheryltown,VA68274 -> I-recipient_address
546-443-6969 -> B-issuer_phone
mathisnatalieOfitzgerald.com -> B-issuer_email
YzN809qpY325 -> B-issuer_tax_id
morphscalablefunctionalitiesharnessviraleyeballs -> I-item_description
enable -> B-item_description
one-to-onesystems -> I-item_description
iterate -> B-item_description
intuitiveROI -> I-item_description
(913)675-2764x8171 -> B-issuer_phone
tiffanys6Oyahoo.com -> I-item_description
IKQ896GSX614 -> B-recipient_tax_id
434.04136.16988.77798.93560.5302 -> B-item_unit_price
. -> B-item_total
5 -> B-item_unit_price
1050.02500.2 -> B-item_total
1737.79 -> B-subtotal
VAT -> B-tax_description
(16%) -> B-tax_percentage
278.05 -> B-tax_amount
2015.