## Token Classification using LayoutLM

In [1]:
# Import dependencies
import torch
from transformers import LayoutLMForTokenClassification, LayoutLMConfig, LayoutLMTokenizer
import pdfplumber
import base64

# Load LayoutLM model and tokenizer
model_name_or_path = "microsoft/layoutlm-base-uncased"

# Define heading labels
heading_labels = ['B-H', 'I-H']

# Register an explicit label mapping on the config so that predicted class
# indices can be translated back to the tag names used in `heading_labels`.
# Without it the head defaults to two anonymous classes (LABEL_0 / LABEL_1).
# NOTE(review): 'O' is assumed to be the tag for non-heading tokens (BIO
# scheme) -- confirm against the data the model is fine-tuned on. If
# `model_name_or_path` is later pointed at a fine-tuned checkpoint, this list
# must match that checkpoint's label set or loading will fail on a classifier
# size mismatch.
label_list = ['O'] + heading_labels
id2label = dict(enumerate(label_list))
label2id = {label: index for index, label in id2label.items()}

config = LayoutLMConfig.from_pretrained(
    model_name_or_path,
    id2label=id2label,
    label2id=label2id,
)
tokenizer = LayoutLMTokenizer.from_pretrained(model_name_or_path)

# NOTE: the base checkpoint ships without a trained token-classification head
# (see the "Some weights ... were not" warning emitted on load), so its
# predictions are not meaningful until the model has been fine-tuned.
model = LayoutLMForTokenClassification.from_pretrained(model_name_or_path, config=config)

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()  # inference only: make sure dropout is disabled

# Define dictionary to store text (heading -> body text), filled by later cells
text_dict = {}

  from .autonotebook import tqdm as notebook_tqdm
Some weights of the model checkpoint at microsoft/layoutlm-base-uncased were not used when initializing LayoutLMForTokenClassification: ['cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing LayoutLMForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LayoutLMForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of LayoutLMForTokenClassification were not 

In [None]:
# Load PDF file
pdf_file_path = "./Data/pdf/test_pdf_page_1.pdf" # Give a single page of the entire pdf as input and check


def _normalize_box(word, page_width, page_height):
    """Scale a pdfplumber word box to the 0-1000 integer grid LayoutLM expects.

    `word` is one dict from `page.extract_words()`; the result is
    `[x0, y0, x1, y1]`, with every coordinate clamped to `[0, 1000]`.
    """
    def scale(value, extent):
        return max(0, min(1000, int(1000 * float(value) / float(extent))))

    return [
        scale(word["x0"], page_width),
        scale(word["top"], page_height),
        scale(word["x1"], page_width),
        scale(word["bottom"], page_height),
    ]


def _predict_word_labels(words, boxes):
    """Return one predicted label name (or None) for every entry in `words`.

    Each word is split into word pieces that all share the word's box. The
    label of a word is the prediction for its first word piece. Pages longer
    than the model's position limit are processed in consecutive windows.
    A word that produces no word pieces keeps the label None.
    """
    token_ids, token_boxes, token_word_index = [], [], []
    for word_index, (word, box) in enumerate(zip(words, boxes)):
        piece_ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(word))
        for piece_position, piece_id in enumerate(piece_ids):
            token_ids.append(piece_id)
            token_boxes.append(box)
            # -1 marks continuation pieces, whose predictions are ignored.
            token_word_index.append(word_index if piece_position == 0 else -1)

    labels = [None] * len(words)
    # Reserve two positions per window for [CLS] and [SEP].
    window = model.config.max_position_embeddings - 2
    for start in range(0, len(token_ids), window):
        chunk_ids = [tokenizer.cls_token_id] + token_ids[start:start + window] + [tokenizer.sep_token_id]
        chunk_boxes = [[0, 0, 0, 0]] + token_boxes[start:start + window] + [[1000, 1000, 1000, 1000]]

        input_ids = torch.tensor([chunk_ids], device=device)
        bbox = torch.tensor([chunk_boxes], device=device)

        # Get model predictions
        with torch.no_grad():
            logits = model(
                input_ids=input_ids,
                bbox=bbox,
                attention_mask=torch.ones_like(input_ids),
            )[0]
        predictions = logits.argmax(dim=-1)[0].tolist()

        # Drop the [CLS] / [SEP] positions and map the rest back to words.
        for offset, class_index in enumerate(predictions[1:-1]):
            word_index = token_word_index[start + offset]
            if word_index >= 0:
                labels[word_index] = model.config.id2label[class_index]
    return labels


def _group_by_heading(words, labels, heading_tags):
    """Build a `{heading text: body text}` dict from per-word labels.

    A 'B-' heading tag, or any heading tag that follows body text, opens a new
    heading; further heading tags extend it. All other words are appended to
    the body of the most recent heading. Words that appear before the first
    heading are discarded, and a repeated heading overwrites the earlier entry.
    """
    sections = {}
    heading_words, body_words = [], []
    in_heading = False
    for word, label in zip(words, labels):
        if label in heading_tags:
            if label.startswith("B-") or not in_heading:
                # A new heading begins: store the section collected so far.
                if heading_words:
                    sections[" ".join(heading_words)] = " ".join(body_words)
                heading_words, body_words = [word], []
            else:
                heading_words.append(word)
            in_heading = True
        else:
            body_words.append(word)
            in_heading = False

    # Add the last heading and text to the dictionary
    if heading_words:
        sections[" ".join(heading_words)] = " ".join(body_words)
    return sections


# NOTE: headings are only recognised when `model.config.id2label` maps class
# indices to the names listed in `heading_labels`, i.e. when the model was
# configured / fine-tuned with that label set. Otherwise the result is empty.
model.eval()
with pdfplumber.open(pdf_file_path) as pdf:
    # Loop through pages
    for page in pdf.pages:
        # LayoutLM consumes words plus their bounding boxes, not a rendered
        # bitmap, so read both straight from the PDF text layer.
        page_words = page.extract_words()
        if not page_words:
            continue  # no text layer on this page (e.g. a scanned image)

        words = [entry["text"] for entry in page_words]
        boxes = [_normalize_box(entry, page.width, page.height) for entry in page_words]

        labels = _predict_word_labels(words, boxes)
        text_dict.update(_group_by_heading(words, labels, heading_labels))

# Print the dictionary
print(text_dict)
