In [17]:
# Install necessary libraries
!pip install transformers torch datasets pandas pillow numpy pdf2image pytesseract

# Install Tesseract OCR
!apt-get install -y tesseract-ocr
!apt-get install -y libtesseract-dev

# Install Poppler Utilities
!apt-get install -y poppler-utils

# Set the Tesseract command path
import pytesseract
pytesseract.pytesseract.tesseract_cmd = '/usr/bin/tesseract'

# Verify installation
!tesseract --version
!pdftoppm -h

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
tesseract-ocr is already the newest version (4.1.1-2.1build1).
0 upgraded, 0 newly installed, 0 to remove and 49 not upgraded.
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
libtesseract-dev is already the newest version (4.1.1-2.1build1).
0 upgraded, 0 newly installed, 0 to remove and 49 not upgraded.
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
poppler-utils is already the newest version (22.02.0-2ubuntu0.5).
0 upgraded, 0 newly installed, 0 to remove and 49 not upgraded.
tesseract 4.1.1
 leptonica-1.82.0
  libgif 5.1.9 : libjpeg 8d (libjpeg-turbo 2.1.1) : libpng 1.6.37 : libtiff 4.3.0 : zlib 1.2.11 : libwebp 1.2.2 : libopenjp2 2.4.0
 Found AVX2
 Found AVX
 Found FMA
 Found SSE
 Found libarchive 3.6.0 zlib/1.2.11 liblzma/5.2.5 bz2lib/1.0.8 liblz4/1.9.3 libzstd/1.4.8
pdftoppm version 22.02

In [18]:
!pip install pandas langchain-groq



In [19]:
# from Predictor import predict
import getpass
import pandas as pd
import os
# Read the Groq API key from the environment. When it is not set, prompt for it
# instead of assigning None to os.environ (which raises TypeError).
api_key = os.getenv("GROQ_API_KEY")
if not api_key:
    api_key = getpass.getpass("Enter your GROQ_API_KEY: ")
os.environ["GROQ_API_KEY"] = api_key
from langchain_groq import ChatGroq
# Chat model used by predict_llms() as the fallback document labeller.
llm = ChatGroq(model="llama3-8b-8192")

def predict_llms(text):
    """Ask the chat model `llm` for a short document-type label for OCR text.

    Args:
        text: Text extracted from a scanned document.

    Returns:
        The `content` of the model's reply. The system prompt asks for a label
        of at most three words, or "Nan" when nothing can be deduced.
    """
    from langchain_core.messages import HumanMessage, SystemMessage

    instruction = SystemMessage("the given text is extracted from a scanned PDF document using OCR. Based on the text, return what type of document label it is in maximum of 3 words only .Refrain from using any adjectives, be as straight forward and to the point as possible. For example: cards, credit cards, application form, etc. If nothing can be deduced directly, return Nan.")
    document = HumanMessage(text)

    reply = llm.invoke([instruction, document])
    return reply.content

In [20]:
import json
from transformers import LayoutLMForSequenceClassification, LayoutLMConfig

def initialize_and_save_model():
    """Build a LayoutLM sequence classifier from ``config.json`` and save it.

    The configuration is read from ``config.json`` in the working directory and
    the resulting model is written to ``./saved_model``.

    NOTE(review): the model is constructed from the configuration only; no
    pretrained checkpoint is loaded in this function. Confirm that the weights
    this produces are the ones intended for prediction.
    """
    with open('config.json', 'r') as config_file:
        settings = json.load(config_file)

    # Configuration -> model -> on-disk copy that later cells can load.
    classifier = LayoutLMForSequenceClassification(LayoutLMConfig.from_dict(settings))
    classifier.save_pretrained("./saved_model")

# Build the classifier from config.json and write it to ./saved_model;
# a later cell loads the model back from that directory.
initialize_and_save_model()

In [23]:
import os
import pandas as pd
from PIL import Image, ImageDraw, ImageFont
from transformers import LayoutLMForSequenceClassification, LayoutLMTokenizer
import torch
from torch.utils.data import Dataset, DataLoader
import pytesseract
from datasets import Features, Sequence, ClassLabel, Value, Array2D
import numpy as np
from datasets import Dataset
from pdf2image import convert_from_path
# Document labels. predict() indexes this list with the argmax of the model's
# logits, so the order here must match the label indices of the model.
classes = ['invoice', 'resume', 'passport', 'Tax_Statement', 'balance_sheet', 'Income_Statement', 'Driving_License', ]


def normalize_box(box, width, height):
    """Rescale a pixel box to a 0-1000 grid relative to the page size.

    Args:
        box: Sequence whose first four items are (x0, y0, x1, y1) in pixels.
        width: Page width in pixels (divides the x coordinates).
        height: Page height in pixels (divides the y coordinates).

    Returns:
        A list of four ints, each ``int(1000 * (coordinate / extent))``.
    """
    x0, y0, x1, y1 = (box[i] for i in range(4))
    # x coordinates scale by the width, y coordinates by the height.
    pairs = ((x0, width), (y0, height), (x1, width), (y1, height))
    return [int(1000 * (coordinate / extent)) for coordinate, extent in pairs]

def apply_ocr(example):
    """Run Tesseract OCR on the first page of a PDF and attach words and boxes.

    Args:
        example: Mapping with a 'file_path' key pointing at a PDF file.

    Returns:
        The same mapping with two extra keys:
        'words' - list of recognized words (as str), and
        'bbox'  - list of [x0, y0, x1, y1] boxes, one per word, scaled with
                  normalize_box() against the page size.

    Raises:
        ValueError: If the PDF yields no page image.
    """
    # Only the first page is used, so only that page is rasterized instead of
    # converting the whole document and discarding the rest.
    pages = convert_from_path(example['file_path'], first_page=1, last_page=1)
    if not pages:
        raise ValueError(f"no pages could be rendered from {example['file_path']!r}")
    image = pages[0]
    width, height = image.size

    # OCR the page into a DataFrame with one row per detected element.
    ocr_df = pytesseract.image_to_data(image, output_type='data.frame')
    float_cols = ocr_df.select_dtypes('float').columns
    ocr_df = ocr_df.dropna().reset_index(drop=True)
    ocr_df[float_cols] = ocr_df[float_cols].round(0).astype(int)
    # Treat whitespace-only cells as missing and drop those rows as well.
    ocr_df = ocr_df.replace(r'^\s*$', np.nan, regex=True)
    ocr_df = ocr_df.dropna().reset_index(drop=True)

    words = [str(word) for word in ocr_df.text]

    # Tesseract reports (left, top, width, height); convert each row to
    # (left, top, left + width, top + height) and normalize it.
    boxes = [
        normalize_box([left, top, left + box_w, top + box_h], width, height)
        for left, top, box_w, box_h in zip(
            ocr_df['left'], ocr_df['top'], ocr_df['width'], ocr_df['height']
        )
    ]

    # add as extra columns
    assert len(words) == len(boxes)
    example['words'] = words
    example['bbox'] = boxes
    return example
# Tokenizer used by encode_example() to split OCR words into tokens.
tokenizer = LayoutLMTokenizer.from_pretrained("microsoft/layoutlm-base-uncased")
# Label name -> index, in the same order as the `classes` list.
# NOTE(review): not referenced elsewhere in the visible notebook.
label2idx = {'invoice': 0, 'resume': 1, 'passport':2, 'Tax_Statement':3, 'balance_sheet':4, 'Income_Statement':5, 'Driving_License':6}

def encode_example(example, max_seq_length=512, pad_token_box=[0, 0, 0, 0]):
    """Tokenize an OCR'd example and align one bounding box with every token.

    Args:
        example: Mapping with 'words' (list of str) and 'bbox' (one box per word).
        max_seq_length: Length every returned sequence is padded/truncated to.
        pad_token_box: Box assigned to padding positions. It is copied for each
            padding position and never mutated.

    Returns:
        The tokenizer's encoding (input_ids, attention_mask, token_type_ids)
        with an added 'bbox' entry of exactly ``max_seq_length`` boxes.

    Raises:
        AssertionError: If words and boxes differ in length, or if any encoded
            sequence does not end up ``max_seq_length`` long.
    """
    words = example['words']
    normalized_word_boxes = example['bbox']

    assert len(words) == len(normalized_word_boxes)

    # A word may split into several tokens; each of them inherits the word's box.
    token_boxes = []
    for word, box in zip(words, normalized_word_boxes):
        token_boxes.extend([box] * len(tokenizer.tokenize(word)))

    # Leave room for the two special tokens added around the sequence.
    special_tokens_count = 2
    token_boxes = token_boxes[: max_seq_length - special_tokens_count]

    # add bounding boxes of cls + sep tokens
    token_boxes = [[0, 0, 0, 0]] + token_boxes + [[1000, 1000, 1000, 1000]]

    # Pass max_length explicitly so the tokenizer pads/truncates to the same
    # length the assertions below check, whatever max_seq_length is.
    encoding = tokenizer(
        ' '.join(words),
        padding='max_length',
        truncation=True,
        max_length=max_seq_length,
    )

    # The attention mask is 1 for every real (non-padding) token, so its sum is
    # the unpadded length; no second tokenizer call is needed to obtain it.
    real_token_count = sum(encoding['attention_mask'])
    padding_length = max_seq_length - real_token_count
    token_boxes += [list(pad_token_box) for _ in range(padding_length)]
    encoding['bbox'] = token_boxes

    assert len(encoding['input_ids']) == max_seq_length
    assert len(encoding['attention_mask']) == max_seq_length
    assert len(encoding['token_type_ids']) == max_seq_length
    assert len(encoding['bbox']) == max_seq_length

    return encoding

# The features are declared by hand because the LayoutLM 'bbox' column is an
# extra, fixed-shape (512 x 4) feature.
# NOTE(review): the 'label' names ('refuted', 'entailed') do not match the
# `classes` list defined above, and this schema has 'image_path' while the
# prediction input uses 'file_path'. `features` is not referenced anywhere else
# in the visible notebook - confirm whether it is still needed.
features = Features({
    'input_ids': Sequence(feature=Value(dtype='int64')),
    'bbox': Array2D(dtype="int64", shape=(512, 4)),
    'attention_mask': Sequence(Value(dtype='int64')),
    'token_type_ids': Sequence(Value(dtype='int64')),
    'label': ClassLabel(names=['refuted', 'entailed']),
    'image_path': Value(dtype='string'),
    'words': Sequence(feature=Value(dtype='string')),
})
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Load the classifier from ./saved_model (the directory written by
# initialize_and_save_model) and move it to the selected device.
model = LayoutLMForSequenceClassification.from_pretrained("./saved_model")
model.to(device)
import pytesseract
import numpy as np
import torch.nn.functional as F

def predict(test_data, confidence_threshold=90):
    """Classify the first PDF listed in ``test_data``.

    Every row is OCR'd and encoded, but only the first row is classified and
    returned. When no class reaches the confidence threshold, the label is
    obtained from predict_llms() instead.

    Args:
        test_data: pandas DataFrame with a 'file_path' column of PDF paths.
        confidence_threshold: Percentage (0-100) that a class probability,
            rounded to a whole percent, must exceed for the classifier's
            answer to be accepted.

    Returns:
        A tuple ``(text, prediction, human_needed)``:
        text         - the OCR'd words of the first document joined by spaces,
        prediction   - an entry of `classes`, or the string from predict_llms(),
        human_needed - False when the classifier's label was used, True when
                       the label came from predict_llms().

    Raises:
        ValueError: If ``test_data`` has no rows.
    """
    if len(test_data) == 0:
        raise ValueError("test_data must contain at least one row with a 'file_path'")

    human_needed = False
    pytesseract.pytesseract.tesseract_cmd = '/usr/bin/tesseract'
    test_dataset = Dataset.from_pandas(test_data)
    updated_test_dataset = test_dataset.map(apply_ocr)

    # Read the first document's words straight from the dataset.
    text = " ".join(updated_test_dataset[0]['words'])

    encoded_test_dataset = updated_test_dataset.map(lambda example: encode_example(example))
    encoded_test_dataset.set_format(type='torch', columns=['input_ids', 'bbox', 'attention_mask', 'token_type_ids'])

    test_dataloader = torch.utils.data.DataLoader(encoded_test_dataset, batch_size=1, shuffle=False)

    # Only the first document is classified, so take just the first batch.
    test_batch = next(iter(test_dataloader))
    input_ids = test_batch["input_ids"].to(device)
    bbox = test_batch["bbox"].to(device)
    attention_mask = test_batch["attention_mask"].to(device)
    token_type_ids = test_batch["token_type_ids"].to(device)

    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        outputs = model(input_ids=input_ids, bbox=bbox, attention_mask=attention_mask,
                        token_type_ids=token_type_ids)

    classification_results = torch.softmax(outputs.logits, dim=1).tolist()[0]
    # Whole-percent confidence per known class.
    percentages = [round(probability * 100) for probability in classification_results[:len(classes)]]

    if any(value > confidence_threshold for value in percentages):
        prediction = outputs.logits.argmax(-1).squeeze().tolist()
        return text, classes[prediction], human_needed

    # The classifier is not confident enough: fall back to the LLM labeller.
    prediction = predict_llms(text)
    human_needed = True
    return text, prediction, human_needed



# Demo: classify a single PDF. predict() takes a DataFrame whose 'file_path'
# column holds the PDF paths.
file_path = 'receipt1.pdf'
data = {'file_path': [file_path]}
df = pd.DataFrame(data)
text, prediction, human_needed = predict(df)
print(prediction)  # predicted document label
print(text)  # OCR text of the document
print(human_needed)  # True when the label came from the LLM fallback


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Restaurant receipt
GREEN FIELD 5305 E PACIFIC COAST H Long Beach, CA 90604 (S62) 597-0906 Server: Francis Order #: 699?3 lable: Bll | Coffee eC Lunch 1 Coke SUB TOTAL: Tax 1: IOTAL: 5/26/2016 12:53:10 PM NNN THANK YOU! WY Station: 3 Dine In Guests: e
True
