<a href="https://colab.research.google.com/github/Sakshi-S-S/Docusign-Document-Autotagging_Wealth-Hackathon/blob/main/Auto_Tagging_Roberta_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#Latest code
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the tagged form-field CSV and drop rows that lack either the field
# label (model input) or the field name (classification target).
data = pd.read_csv("/content/Demo-tagged-pdf.csv").dropna(subset=['label', 'name'])

# Model inputs and targets. Both are cast to str: the tokenizer cell asserts
# that every text is a string, and a uniform type keeps the sort below safe
# if pandas parsed some cells as numbers.
texts = data['label'].astype(str).tolist()
labels = data['name'].astype(str).tolist()

# Map each distinct tag to an integer class id. The tags are sorted so the
# mapping is identical on every run; iterating a bare set() of strings yields
# a different order per interpreter session, which would silently change what
# each class id means between kernel restarts.
unique_tags = sorted(set(labels))
label_to_int = {label: i for i, label in enumerate(unique_tags)}
int_labels = [label_to_int[label] for label in labels]

# Hold out 20% of the rows for validation (fixed seed for reproducibility).
train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts, int_labels, test_size=0.2, random_state=42
)


In [None]:
pip install pandas



# Tokenization, model training, and PDF tagging

In [None]:
from transformers import RobertaTokenizer

tokenizer = RobertaTokenizer.from_pretrained('roberta-base')


def _is_list_of_str(candidate):
    """Return True when `candidate` is a list whose elements are all str."""
    if not isinstance(candidate, list):
        return False
    return all(isinstance(entry, str) for entry in candidate)


# Fail early if either split is not a plain list of strings.
assert _is_list_of_str(train_texts), "train_texts should be a list of strings"
assert _is_list_of_str(val_texts), "val_texts should be a list of strings"

# Both splits are tokenized with the same options: pad each split to its
# longest sequence, truncate over-long inputs, and return PyTorch tensors.
_tokenizer_options = dict(padding=True, truncation=True, return_tensors="pt")
train_encodings = tokenizer(train_texts, **_tokenizer_options)
val_encodings = tokenizer(val_texts, **_tokenizer_options)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

In [None]:
#Preparing custom dataset
import torch
from torch.utils.data import Dataset, DataLoader

class TaggingDataset(Dataset):
    """Dataset pairing tokenizer encodings with integer class labels.

    Args:
        encodings: Mapping from field name to an indexable collection; each
            value is indexed with the sample index in `__getitem__`.
        labels: Indexable sequence with one label per sample. Its length
            defines the dataset length.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Number of samples, taken from the label sequence."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the idx-th sample: every encoding field plus a 'labels' tensor."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = values[idx]
        # Set last so the target always occupies the 'labels' key.
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Wrap each split's encodings and labels in a TaggingDataset.
train_dataset = TaggingDataset(encodings=train_encodings, labels=train_labels)
val_dataset = TaggingDataset(encodings=val_encodings, labels=val_labels)

# Batches of 16; only the training data is reshuffled each epoch.
train_loader = DataLoader(dataset=train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=16, shuffle=False)


In [None]:
from transformers import RobertaForSequenceClassification
from sklearn.metrics import accuracy_score
import torch.optim as optim

# Initialize the model with one output class per distinct tag.
NUM_LABELS = len(unique_tags)
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=NUM_LABELS)

# Training setup: use the GPU when one is available.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)
optimizer = optim.AdamW(model.parameters(), lr=5e-5)

# Training loop
for epoch in range(4):  # Number of epochs
    model.train()
    train_loss = 0.0
    for batch in train_loader:
        optimizer.zero_grad()
        inputs = {key: val.to(device) for key, val in batch.items() if key != 'labels'}
        # Named batch_labels so the notebook-level `labels` list built in the
        # data-loading cell is not overwritten by a tensor.
        batch_labels = batch['labels'].to(device)
        outputs = model(**inputs, labels=batch_labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
    # Report the mean loss over all batches (not just the last batch) so the
    # number is comparable with the validation loss below.
    print(f"Epoch {epoch+1}, Training Loss: {train_loss / max(len(train_loader), 1)}")

    model.eval()
    val_loss = 0.0
    true_labels = []
    predicted_labels = []
    with torch.no_grad():
        for batch in val_loader:
            inputs = {key: val.to(device) for key, val in batch.items() if key != 'labels'}
            batch_labels = batch['labels'].to(device)
            outputs = model(**inputs, labels=batch_labels)
            val_loss += outputs.loss.item()
            # Collect per-sample predictions for the accuracy metric.
            predicted_labels.extend(torch.argmax(outputs.logits, dim=1).tolist())
            true_labels.extend(batch['labels'].tolist())
    print(f"Epoch {epoch+1}, Validation Loss: {val_loss / max(len(val_loader), 1)}")
    if true_labels:
        print(f"Epoch {epoch+1}, Validation Accuracy: {accuracy_score(true_labels, predicted_labels)}")


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1, Training Loss: 5.565396785736084
Epoch 1, Validation Loss: 5.590958595275879
Epoch 2, Training Loss: 5.605140686035156
Epoch 2, Validation Loss: 5.666267037391663
Epoch 3, Training Loss: 5.4934258460998535
Epoch 3, Validation Loss: 5.858937501907349
Epoch 4, Training Loss: 5.403222560882568
Epoch 4, Validation Loss: 6.056890368461609


In [None]:
pip install pymupdf

Collecting pymupdf
  Downloading PyMuPDF-1.24.10-cp310-none-manylinux2014_x86_64.whl.metadata (3.4 kB)
Collecting PyMuPDFb==1.24.10 (from pymupdf)
  Downloading PyMuPDFb-1.24.10-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (1.4 kB)
Downloading PyMuPDF-1.24.10-cp310-none-manylinux2014_x86_64.whl (3.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.5/3.5 MB[0m [31m36.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading PyMuPDFb-1.24.10-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (15.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.9/15.9 MB[0m [31m80.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyMuPDFb, pymupdf
Successfully installed PyMuPDFb-1.24.10 pymupdf-1.24.10


In [None]:
#Extraction prior to feeding model
import fitz

# Function to get form field attributes
def get_field_attributes(field):
    """Collect the descriptive attributes of a form field into a dict.

    Args:
        field: Object exposing `field_name`, `field_type`, `field_label`,
            `field_value`, `field_flags` and `rect` attributes, and optionally
            `field_tooltip`.

    Returns:
        dict with keys 'name', 'type', 'label', 'value', 'tooltip', 'flags'
        and 'rect'. 'tooltip' is None when the object has no `field_tooltip`
        attribute.
    """
    return {
        'name': field.field_name,
        'type': field.field_type,
        'label': field.field_label,
        'value': field.field_value,
        # NOTE(review): PyMuPDF widgets appear to expose the tooltip text as
        # `field_label` and have no `field_tooltip` attribute; read it
        # defensively so such objects do not raise AttributeError.
        'tooltip': getattr(field, 'field_tooltip', None),
        'flags': field.field_flags,
        'rect': field.rect,
    }

def extract_editable_fields(pdf_path):
    """Collect information about every form-field widget in a PDF.

    Args:
        pdf_path: Path of the PDF file to open with `fitz.open`.

    Returns:
        A list with one dict per widget, in page order, holding the 1-based
        page number, the widget's name, type, value and label (each replaced
        by "N/A" when falsy), its `rect`, and the rect's x0/y0/x1/y1
        coordinates.
    """
    fields_info = []

    # The context manager closes the document even if a page fails to load;
    # previously the file handle was never released.
    with fitz.open(pdf_path) as pdf_document:
        for page_num in range(len(pdf_document)):
            page = pdf_document.load_page(page_num)

            # Iterating directly covers the "no widgets" case: the loop body
            # simply never runs for a page without form fields.
            for widget in page.widgets():
                rect = widget.rect
                fields_info.append({
                    "page": page_num + 1,
                    "field_name": widget.field_name or "N/A",
                    "field_type": widget.field_type or "N/A",
                    "field_value": widget.field_value or "N/A",
                    'field_label': widget.field_label or "N/A",
                    "rect": rect,  # This gives the coordinates
                    "x0": rect.x0,
                    "y0": rect.y0,
                    "x1": rect.x1,
                    "y1": rect.y1,
                })

    return fields_info

# PDF whose form fields will be tagged
pdf_path = "/content/demo-file-test.pdf"

# Gather every widget's details, then keep only the label text of each one;
# the labels are what the classifier receives as input.
editable_fields = extract_editable_fields(pdf_path)
field_input = [widget_info['field_label'] for widget_info in editable_fields]


In [None]:
# Predict tags
def predict_tags(texts):
    """Predict a tag name for each input text with the fine-tuned model.

    Relies on the notebook-level `tokenizer`, `model` and `unique_tags`.

    Args:
        texts: Text or list of texts to classify (passed to `tokenizer`).

    Returns:
        A list with one entry of `unique_tags` per input text; an empty list
        when `texts` is an empty list or tuple.
    """
    # Nothing to classify; skip tokenizing an empty batch.
    if isinstance(texts, (list, tuple)) and len(texts) == 0:
        return []

    tokenized_texts = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")

    # The tokenizer returns CPU tensors; move them to wherever the model's
    # weights live, otherwise the forward pass fails when the model is on GPU.
    model_device = next(model.parameters()).device
    model_inputs = {key: val.to(model_device) for key, val in tokenized_texts.items()}

    # Inference must run in eval mode so dropout is disabled.
    model.eval()
    with torch.no_grad():
        outputs = model(**model_inputs)

    # Convert to plain ints before indexing the tag list.
    predictions = torch.argmax(outputs.logits, dim=1).tolist()
    return [unique_tags[pred] for pred in predictions]

# Example with hand-written labels instead of labels extracted from a PDF:
#   new_texts = ["Zip Code", "Document Title 1"]
#   predicted_tags = predict_tags(new_texts)
#   print(predicted_tags)
# Predict one tag for each form-field label collected in `field_input`.
predicted_tags = predict_tags(field_input)
print(predicted_tags)

['undefined_25', 'Pension p', 'Pension p', 'Subsequent Purchase', 'undefined_25', '5E+13', 'undefined_25', '5E+13', 'undefined_25', '5E+13', 'undefined_25', '5E+13', 'undefined_25', '5E+13', 'undefined_25', 'Sole Propr', 'Note Check the LLC box above and in the entry space enter the appropriate code C S or P for the tax class', 'undefined_25', 'US TaxExempt Account Type', 'Pension p', 'undefined_25', 'undefined_25', 'undefined_25', 'undefined_25', 'undefined_25', 'undefined_25', 'US TaxExempt Account Type', 'undefined_25', 'Security Number  Tax ID', 'Security Number  Tax ID', 'undefined_25', 'Indiv_2', 'Zip Code_7', 'undefined_25', 'Zip Code_7', 'Date_3', 'Primary', 'undefined_25', 'Zip Code_7', 'Zip Code_7', 'undefined_25', 'Zip Code_7', 'undefined_25', 'undefined_25', 'Zip Code_7', 'Pension p', 'undefined_25', 'Date_3', 'nors State of', 'Subsequent Purchase', 'Zip Code_7', 'undefined_25', 'undefined_25', 'Zip Code_7', 'Security Number  Tax ID_2', 'Pension p', 'undefined_25', 'Pension

In [None]:
import fitz

def update_editable_fields(pdf_path, predicted_tags, updated_pdf_path):
    """Write predicted tags into a PDF's form fields and save a copy.

    Widgets are visited page by page in the order `page.widgets()` yields
    them, and the i-th widget receives the i-th predicted tag. Widgets beyond
    the end of `predicted_tags` receive "N/A".

    Args:
        pdf_path: Path of the PDF file to open with `fitz.open`.
        predicted_tags: Iterable of tags, one per widget, in widget order.
        updated_pdf_path: Path the modified document is saved to.

    Returns:
        A list with one dict per widget: the 1-based page number, the
        assigned tag under "field_name", the widget's type, its value as it
        was before being overwritten, its label, its `rect`, and the rect's
        x0/y0/x1/y1 coordinates.
    """
    fields_info = []
    # Consuming an iterator replaces the manual index/bounds bookkeeping.
    remaining_tags = iter(predicted_tags)

    # The context manager closes the document even if updating or saving
    # fails; previously the file handle was never released.
    with fitz.open(pdf_path) as pdf_document:
        for page_num in range(len(pdf_document)):
            page = pdf_document.load_page(page_num)

            for widget in page.widgets():
                # Fall back to "N/A" once the predicted tags run out.
                tag = next(remaining_tags, "N/A")

                # Record the widget's state before it is modified below.
                fields_info.append({
                    "page": page_num + 1,
                    "field_name": tag,
                    "field_type": widget.field_type,
                    "field_value": widget.field_value,
                    "field_label": widget.field_label,
                    "rect": widget.rect,  # This gives the coordinates
                    "x0": widget.rect.x0,
                    "y0": widget.rect.y0,
                    "x1": widget.rect.x1,
                    "y1": widget.rect.y1
                })

                # The tag is written into the widget's *value* (its name is
                # left untouched).
                # NOTE(review): the original comment said "field name" --
                # confirm which of the two is actually intended.
                widget.field_value = tag
                widget.update()

        # Save the updated PDF to a new file
        pdf_document.save(updated_pdf_path)

    return fields_info

# Source PDF and the destination for the tagged copy
pdf_path = "/content/demo-file-test.pdf"
updated_pdf_path = "/content/demo-file-test (1).pdf"

# Write the predicted tags into the form fields and save the result;
# the per-widget summary is kept for inspection.
updated_fields = update_editable_fields(
    pdf_path=pdf_path,
    predicted_tags=predicted_tags,
    updated_pdf_path=updated_pdf_path,
)

