In [None]:
!pip install transformers torch pandas scikit-learn openpyxl tqdm



In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW   # ✅ use this instead of transformers.AdamW
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import pandas as pd
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import json

In [None]:
# Excel workbook with one example per row: the raw invoice text ("Input") and
# the target field string ("Final_Output") that the model should produce.
data_path = "/content/converted_invoice_dataset (1).xlsx"
df = pd.read_excel(io=data_path)

# Sanity check: show the loaded column names and the first five rows.
print("Columns:", df.columns)
print(df.head(n=5))

Columns: Index(['Input', 'Final_Output'], dtype='object')
                                               Input  \
0  Cream and White Simple Minimalist Catering Ser...   
1  Beige Elegant Professional Business Invoice\n\...   
2  Black and White Clean Modern Invoice\n\nConsul...   
3  Black and White Minimalist Business Invoice\n\...   
4  White Minimalist Business Invoice\n\nSUBTOTALN...   

                                        Final_Output  
0  {"TOTAL_AMOUNT": "$1000", "DUE_AMOUNT": "$550"...  
1  {"INVOICE_NUMBER": "#01234", "BILLED_TO": "Est...  
2  {"BILL_TO": "SALFORD & CO.", "BANK_NAME": "Bor...  
3  {"INVOICE_NUMBER": "12345", "BILLED_TO": "Marc...  
4  {"INVOICE_NUMBER": "#123456", "DATE_ISSUED": "...  


In [None]:
# Hold out 20% of the rows for validation; the fixed random_state makes the
# split identical on every run.
train_df, val_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
)
n_train, n_val = len(train_df), len(val_df)
print(f"Train size: {n_train}, Validation size: {n_val}")

Train size: 53, Validation size: 14


In [None]:
class InvoiceNERDataset(Dataset):
    """Seq2seq dataset pairing invoice text with its target field string.

    Each item tokenizes one row of the DataFrame: the "Input" column (prefixed
    with a fixed instruction) becomes the encoder input and the "Final_Output"
    column becomes the labels. Both are padded/truncated to a fixed length so
    the default DataLoader collation can stack them.

    Args:
        df: DataFrame with "Input" and "Final_Output" columns.
        tokenizer: Callable tokenizer returning a mapping with "input_ids" and
            "attention_mask" tensors of shape (1, length) when called with
            ``return_tensors="pt"``.
        max_len: Fixed token length of the encoder input.
        target_max_len: Fixed token length of the labels. Defaults to
            ``max_len`` (the original behavior) when None.

    Note:
        The returned labels still contain the tokenizer's pad ids in padded
        positions. Replace them with -100 before computing a loss if padding
        should not contribute to it.
    """

    # Instruction prepended to every invoice before tokenization.
    PROMPT_PREFIX = "Extract key invoice fields: "

    def __init__(self, df, tokenizer, max_len=512, target_max_len=None):
        self.inputs = df["Input"].tolist()
        self.outputs = df["Final_Output"].tolist()
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.target_max_len = max_len if target_max_len is None else target_max_len

    def __len__(self):
        """Return the number of examples."""
        return len(self.inputs)

    def _encode(self, text, max_length):
        """Tokenize ``text`` to exactly ``max_length`` tokens (pad or truncate)."""
        return self.tokenizer(
            text,
            max_length=max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )

    def __getitem__(self, idx):
        """Return the tokenized example at ``idx`` as 1-D tensors.

        Returns:
            dict with "input_ids" and "attention_mask" of shape (max_len,) and
            "labels" of shape (target_max_len,).
        """
        # str() guards against non-string cells (e.g. numbers or NaN).
        source = self._encode(self.PROMPT_PREFIX + str(self.inputs[idx]), self.max_len)
        target = self._encode(str(self.outputs[idx]), self.target_max_len)

        # squeeze(0) drops only the batch dimension added by return_tensors="pt".
        # A bare squeeze() would also collapse the sequence dimension when the
        # length is 1, yielding 0-d tensors.
        return {
            "input_ids": source["input_ids"].squeeze(0),
            "attention_mask": source["attention_mask"].squeeze(0),
            "labels": target["input_ids"].squeeze(0),
        }


In [None]:
model_name = "google/flan-t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# The targets are JSON strings, but this tokenizer's vocabulary has no pieces
# for "{" and "}": they encode to the unknown token and vanish on decode (the
# decoded ground truth in this notebook's validation output comes back without
# its braces). Register them as regular tokens so the model can learn to emit
# them. add_tokens() skips tokens that already exist and returns the number
# actually added.
num_added = tokenizer.add_tokens(["{", "}"])

# Only grow the embedding matrix when the new ids do not fit; never shrink it.
if num_added and len(tokenizer) > model.get_input_embeddings().num_embeddings:
    model.resize_token_embeddings(len(tokenizer))

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
# Wrap each split in the tokenizing dataset (default max_len of 512).
train_dataset, val_dataset = (
    InvoiceNERDataset(split_df, tokenizer) for split_df in (train_df, val_df)
)

# Only the training batches are shuffled; validation is iterated in dataset order.
train_loader = DataLoader(dataset=train_dataset, batch_size=2, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=2)

In [None]:
# Train on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

optimizer = AdamW(model.parameters(), lr=3e-5)
epochs = 3

# Targets are padded to a fixed length, so most label positions are padding.
# Setting them to -100 (the index the model's cross-entropy loss ignores)
# keeps the loss and gradients focused on real target tokens instead of on
# predicting pad after pad.
pad_token_id = tokenizer.pad_token_id

for epoch in range(epochs):
    model.train()
    total_loss = 0.0
    for batch in tqdm(train_loader, desc=f"Epoch {epoch+1} Training"):
        optimizer.zero_grad()
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)
        if pad_token_id is not None:
            # masked_fill returns a new tensor, leaving the batch untouched.
            labels = labels.masked_fill(labels == pad_token_id, -100)

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    # max(..., 1) avoids a ZeroDivisionError if the loader is empty.
    print(f"Epoch {epoch+1} | Avg Loss: {total_loss / max(len(train_loader), 1):.4f}")


Epoch 1 Training: 100%|██████████| 27/27 [00:08<00:00,  3.29it/s]


Epoch 1 | Avg Loss: 22.9465


Epoch 2 Training: 100%|██████████| 27/27 [00:06<00:00,  3.99it/s]


Epoch 2 | Avg Loss: 13.4980


Epoch 3 Training: 100%|██████████| 27/27 [00:07<00:00,  3.60it/s]

Epoch 3 | Avg Loss: 7.1881





In [None]:
# Generate predictions for the validation set and collect them next to the
# decoded ground-truth strings for a qualitative comparison.
model.eval()
preds, trues = [], []

for batch in tqdm(val_loader, desc="Validating"):
    input_ids = batch["input_ids"].to(device)
    attention_mask = batch["attention_mask"].to(device)
    labels = batch["labels"]
    # Labels that use the -100 ignore index cannot be decoded; map them back
    # to the pad id first (a no-op when the labels contain no -100).
    labels = labels.masked_fill(labels == -100, tokenizer.pad_token_id)

    with torch.no_grad():
        outputs = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_length=512
        )

    decoded_preds = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    preds.extend(decoded_preds)
    trues.extend(decoded_labels)

# Display few predictions (at most three; fewer if the validation set is smaller).
# str() mirrors the dataset's handling of non-string cells before slicing.
for i in range(min(3, len(preds))):
    print(f"\nInput:\n{str(val_df.iloc[i]['Input'])[:200]}...")
    print(f"Prediction:\n{preds[i]}")
    print(f"Ground Truth:\n{trues[i]}")


Validating: 100%|██████████| 7/7 [00:21<00:00,  3.02s/it]


Input:
Green IT Company Professional Service Invoice

DESCRIPTION QTY TOTAL
SUBTOTAL $500
Tax
TOTAL
10%
$550
UNIT PRICE
logo design 1 $100
brand consultation 1 $100
website design 1 $100
social media templat...
Prediction:
Extract key invoice fields: Green IT Company Professional Service Invoice DESCRIPTION QTY TOTAL SUBTOTAL $500 Tax TOTAL 10% $550 UNIT PRICE logo design 1 $100 brand consultation 1 $100 website design 1 $100 website design 1 $100 social media templates 1 $100 brand manual 1 $100 100 100 100 100 100 100 ISSUED TO: Jonathan Patterson Liceria & Co. 123 Anywhere St., Any City BANK DETAILS AveryDavis THANK YOU DATE: 11.02.2030 INVOICE NO: 01234 DUE DATE: 11.03.2030 Borcele Bank Account Name: Avery Davis Account No.: 0123 4567 8901
Ground Truth:
"INVOICE_NUMBER": "01234", "DATE": "11.02.2030", "DUE_DATE": "11.03.2030", "BILL_TO": "Jonathan Patterson", "ADDRESS": "123 Anywhere St. , Any City", "BANK_NAME": "Borcele Bank", "ACCOUNT_NAME": "Avery Davis", "ACCOUNT_NUMBER": "01




In [None]:
# Persist the fine-tuned weights and the tokenizer files side by side so the
# directory can later be loaded with from_pretrained().
output_dir = "/content/invoice_t5_model"
for artifact in (model, tokenizer):
    artifact.save_pretrained(output_dir)
print("Model saved to:", output_dir)

Model saved to: /content/invoice_t5_model


In [None]:
def predict_invoice(text, max_input_len=512, max_output_len=512):
    """Generate the extracted-fields string for one invoice.

    Uses the module-level ``tokenizer``, ``model`` and ``device``.

    Args:
        text: Raw invoice text. Non-string values are converted with str(),
            matching how the training dataset treats its inputs.
        max_input_len: Maximum number of input tokens; longer inputs are
            truncated.
        max_output_len: ``max_length`` passed to ``model.generate``.

    Returns:
        The decoded model output with special tokens removed.
    """
    # Same instruction prefix the model was fine-tuned with.
    input_text = "Extract key invoice fields: " + str(text)
    inputs = tokenizer(
        input_text,
        return_tensors="pt",
        truncation=True,
        max_length=max_input_len,
    ).to(device)

    # Inference only: make sure dropout is disabled and skip gradient tracking.
    model.eval()
    with torch.no_grad():
        outputs = model.generate(**inputs, max_length=max_output_len)

    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Example: run a tiny hand-written invoice through predict_invoice().
sample_invoice = "\n".join(
    [
        "Invoice No: #A1234",
        "Total Amount: $250.00",
        "Billed To: John Doe",
    ]
)
print(predict_invoice(sample_invoice))


Extract key invoice fields: Invoice No: #A1234 Total Amount: $250.00 Billed To: John Doe
