# Convert the Label Studio JSON export to the LayoutLMv3 training format

In [5]:

# Label names and the integer ids noted for them (reference only; nothing in
# this cell applies the mapping -- presumably it mirrors the labeling config,
# TODO confirm):
#   ignored_text: 0
#   parent_name:  1
#   parent_value: 2
#   child_key:    3
#   child_value:  4
# Path of the JSON export that the conversion code opens and parses.
label_studio_file_path = "project-5-at-2024-07-15-09-06-f9dee677.json"

In [4]:
import json

def convert_bounding_box(x, y, width, height):
    """Turn an origin-plus-size box into corner coordinates.

    Args:
        x: Horizontal coordinate of the box's top-left corner.
        y: Vertical coordinate of the box's top-left corner.
        width: Horizontal extent of the box.
        height: Vertical extent of the box.

    Returns:
        A four-element list ``[x1, y1, x2, y2]`` where ``(x1, y1)`` is the
        top-left corner and ``(x2, y2)`` is the bottom-right corner. Values
        stay in whatever units the inputs use; no scaling is applied.
    """
    right = x + width
    bottom = y + height
    return [x, y, right, bottom]


####################################### Loading json data ###################################
# Explicit UTF-8: JSON text is UTF-8 by specification, whereas the default
# encoding of open() depends on the platform locale.
with open(label_studio_file_path, encoding="utf-8") as f:
    data = json.load(f)

# One dict per converted image: file_name, height, width, annotations.
output = []

for annotated_image in data:
    # Entries with fewer than 8 top-level keys are skipped.
    # NOTE(review): presumably those are tasks that were never annotated; the
    # threshold of 8 is tied to the export schema -- confirm.
    if len(annotated_image) < 8:
        continue

    # A fresh dict per image. (The original rebound the name `data`, i.e. the
    # very list being iterated, which only worked because the for-loop keeps
    # its own reference to that list.)
    record = {}

    if 'ocr' in annotated_image:
        # Keep only the part of the image URL after the last '8080/'.
        image_name = annotated_image['ocr'].split('8080/')[-1]
        print(f'filename: {image_name}')

        # NOTE(review): the prefix here is the path of the JSON export file
        # itself, not an image directory -- confirm the images really live
        # under that path before training.
        record["file_name"] = f"{label_studio_file_path}/{image_name}"
        # Only entries that carry an 'ocr' key make it into the output.
        output.append(record)

    if 'bbox' in annotated_image:
        # The page size is read from the first box of the image.
        first_box = annotated_image['bbox'][0]
        record["height"] = first_box['original_height']
        record["width"] = first_box['original_width']

    # Boxes, transcriptions and labels are parallel lists; pair them up
    # positionally. Box values are passed through in the export's own units.
    # NOTE(review): LayoutLMv3 expects boxes on a 0-1000 scale -- confirm the
    # export already uses it or rescale before training.
    record["annotations"] = [
        {
            "box": convert_bounding_box(bb['x'], bb['y'], bb['width'], bb['height']),
            "text": text,
            # The last entry of the label list is used as the class.
            "label": label['labels'][-1],
        }
        for bb, text, label in zip(
            annotated_image['bbox'],
            annotated_image['transcription'],
            annotated_image['label'],
        )
    ]

with open("Training_layoutLMV3.json", "w") as f:
    json.dump(output, f, indent=4)


filename: standard-settlement-instructions.png


In [None]:
# engine code:
from tqdm import tqdm

def train_fn(data_loader, model, optimizer):
    """Run one optimisation pass over ``data_loader`` and return the mean loss.

    Args:
        data_loader: Sized iterable of batches; each batch is a mapping that
            is unpacked into ``model`` as keyword arguments.
        model: Callable module that returns a 2-item ``(output, loss)`` pair.
        optimizer: Optimizer stepped once per batch.

    Returns:
        Sum of the per-batch loss values divided by the number of batches.
    """
    model.train()
    num_batches = len(data_loader)
    running_loss = 0
    for batch in tqdm(data_loader, total=num_batches):
        # Clear gradients left over from the previous step.
        optimizer.zero_grad()
        _outputs, batch_loss = model(**batch)
        batch_loss.backward()
        optimizer.step()
        running_loss = running_loss + batch_loss.item()
    mean_loss = running_loss / num_batches
    return mean_loss

def eval_fn(data_loader, model):
    """Compute the mean loss of ``model`` over ``data_loader`` without training.

    Args:
        data_loader: Sized iterable of batches; each batch is a mapping that
            is unpacked into ``model`` as keyword arguments.
        model: Callable module that returns a 2-item ``(output, loss)`` pair.

    Returns:
        Sum of the per-batch loss values divided by the number of batches.
    """
    # Imported locally so the function does not depend on another cell having
    # imported torch first.
    import torch

    model.eval()
    final_loss = 0
    # No parameters are updated here, so skip autograd bookkeeping; the loss
    # values are unchanged but no graph is built or kept alive per batch.
    with torch.no_grad():
        for data in tqdm(data_loader, total=len(data_loader)):
            _, loss = model(**data)
            final_loss += loss.item()
    return final_loss / len(data_loader)

In [None]:
# utils code
# utils
import json

def read_json(json_path: str) -> "list | dict":
    """Load and return the parsed contents of a JSON file.

    Args:
        json_path: Path of the JSON file to read.

    Returns:
        The decoded top-level JSON value (a list or a dict for the usual
        array/object documents).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # Explicit UTF-8: JSON is UTF-8 by specification, while the default
    # encoding of open() depends on the platform locale.
    with open(json_path, 'r', encoding='utf-8') as fp:
        return json.load(fp)

def train_data_format(json_to_dict:list):
    """Reshape converted annotation records into per-image training samples.

    Args:
        json_to_dict: List of dicts, each with a ``'file_name'`` entry and an
            ``'annotations'`` list whose items carry ``'text'``, ``'box'`` and
            ``'label'``.

    Returns:
        A list with one dict per input record holding a 1-based ``"id"``,
        the parallel lists ``"tokens"``, ``"bboxes"`` and ``"ner_tag"``, and
        the record's ``"img_path"``.
    """
    samples = []
    for sample_id, record in enumerate(json_to_dict, start=1):
        image_path = record['file_name']
        annotations = record['annotations']
        samples.append({
            "id": sample_id,
            "tokens": [ann['text'] for ann in annotations],
            "bboxes": [ann['box'] for ann in annotations],
            "ner_tag": [ann['label'] for ann in annotations],
            "img_path": image_path,
        })
    return samples

In [None]:
# Data Loader

import torch
from tqdm import tqdm
from PIL import Image

class dataSet:
    """Indexable dataset that turns annotated images into model inputs.

    Each item is produced by opening the image, running the configured
    processor on the image together with its words, boxes and word labels,
    and returning the resulting tensors keyed by the argument names of
    ``ModelModule.forward``.
    """

    def __init__(self,json_path,processor=None) -> None:
        """Load the samples listed in ``json_path``.

        Args:
            json_path: Path of the converted annotation JSON file.
            processor: Callable applied in ``__getitem__`` to
                ``(images, words, boxes=..., word_labels=...)``; it must be
                set before items are requested.
        """
        self.json_data = train_data_format(read_json(json_path))

        self.processor = processor

    def __len__(self)->int:
        """Return the number of samples."""
        return len(self.json_data)

    def __getitem__(self,index)->dict:
        """Encode the sample at ``index``.

        Returns:
            Dict with ``"input_ids"``, ``"attention_mask"``, ``"bbox"``,
            ``"pixel_values"`` and ``"lables"`` tensors.
        """
        data = self.json_data[index]

        # The context manager closes the underlying file deterministically;
        # convert() returns an independent image that outlives the handle.
        with Image.open(data['img_path']) as img:
            image = img.convert('RGB')

        # The processor is called with a batch of one, hence the one-element
        # lists around every argument.
        # NOTE(review): word_labels are forwarded exactly as stored in the
        # JSON; the processor presumably needs integer ids -- confirm the
        # stored labels are ints or map them first.
        encoding = self.processor(
            [image],
            [data['tokens']],
            boxes = [data['bboxes']],
            word_labels = [data['ner_tag']],
            max_length=512,padding="max_length",truncation="longest_first",return_tensors='pt'
        )

        # return_tensors='pt' already yields tensors, so torch.as_tensor is
        # used instead of torch.tensor: it only casts when needed and avoids
        # the copy-construct UserWarning torch.tensor emits for tensor input.
        # flatten()/flatten(end_dim=1) remove the leading batch-of-one axis.
        # The key "lables" is spelled this way on purpose: it has to match the
        # keyword parameter of ModelModule.forward.
        return {
            "input_ids" : torch.as_tensor(encoding["input_ids"],dtype=torch.int64).flatten(),
            "attention_mask" : torch.as_tensor(encoding["attention_mask"],dtype=torch.int64).flatten(),
            "bbox" : torch.as_tensor(encoding["bbox"],dtype=torch.int64).flatten(end_dim=1),
            "pixel_values" : torch.as_tensor(encoding["pixel_values"],dtype=torch.float32).flatten(end_dim=1),
            "lables" : torch.as_tensor(encoding["labels"],dtype=torch.int64)
        }

In [None]:
# Trainer

import torch.nn as nn
import torch
# import torch.nn.functional as F
from transformers import LayoutLMv3ForTokenClassification
import torch.nn.functional as nnf

def loss_fn(pred,target):
    """Cross-entropy between per-token class scores and integer targets.

    Args:
        pred: Score tensor whose last dimension indexes the classes.
        target: Integer class ids, one per scored position; any shape with
            the same number of elements as ``pred`` has positions.

    Returns:
        Scalar loss tensor (mean over positions, using the defaults of
        ``nn.CrossEntropyLoss``).
    """
    # Read the class count from the scores instead of hard-coding 4, so the
    # loss stays correct for any number of classes.
    n_classes = pred.size(-1)
    # reshape() also accepts non-contiguous inputs, unlike view().
    return nn.CrossEntropyLoss()(pred.reshape(-1, n_classes), target.reshape(-1))

class ModelModule(nn.Module):
    """Pretrained LayoutLMv3 token classifier followed by a small MLP head.

    The head takes the backbone's per-token logits as input and maps them to
    ``n_classes`` scores per token.
    """

    def __init__(self,n_classes:int) -> None:
        """Load the backbone and build the classification head.

        Args:
            n_classes: Number of output classes per token.
        """
        super().__init__()
        self.model = LayoutLMv3ForTokenClassification.from_pretrained('../inputs/layoutlmv3Microsoft')
        # The head consumes the backbone logits, whose width is the backbone's
        # configured label count; read it from the config rather than
        # hard-coding 2 so a checkpoint with another label count still works.
        backbone_width = self.model.config.num_labels
        self.cls_layer = nn.Sequential(nn.Linear(in_features = backbone_width,
                                                out_features = 512),
                                      nn.ReLU(), nn.Linear(in_features = 512, out_features = n_classes))

    def forward(self,input_ids,attention_mask,bbox,pixel_values,lables=None):
        """Score every token and, when labels are given, compute the loss.

        Args:
            input_ids, attention_mask, bbox, pixel_values: Forwarded to the
                backbone unchanged.
            lables: Optional target class ids (parameter name kept as is
                because callers pass it by keyword).

        Returns:
            ``(scores, loss)``; ``loss`` is ``None`` when ``lables`` is not
            supplied.
        """
        output = self.model(input_ids,attention_mask=attention_mask,bbox=bbox,pixel_values=pixel_values)

        output = self.cls_layer(output.logits)

        # Diagnostics only: normalise over the class axis (the last one).
        # dim=1 would normalise across token positions for 3-D scores. Done
        # under no_grad so the printout does not extend the autograd graph.
        with torch.no_grad():
            prob = nnf.softmax(output, dim=-1)
            top_p, top_class = prob.topk(1, dim = -1)

        print("Probability score :", prob)
        print("top_p, top_class ",top_p, top_class)

        # `lables` defaults to None (inference); only compute a loss when
        # targets are actually available.
        loss = loss_fn(output,lables) if lables is not None else None

        return  output, loss

In [None]:
# Main

import torch
from transformers import LayoutLMv3FeatureExtractor, LayoutLMv3TokenizerFast, LayoutLMv3Processor, LayoutLMv3ForTokenClassification
from torch.optim import AdamW
import numpy as np

base_model_path = '../inputs/layoutlmv3Microsoft'
train_data_path = '../inputs/Training_layoutLMV3.json'
save_model_path = '../inputs/model.bin'

# Training-loop settings (previously inline literals).
num_epochs = 30
checkpoint_every = 10

# apply_ocr=False: words and boxes come from the annotation file, so the
# feature extractor must not run its own OCR.
featur_extractor = LayoutLMv3FeatureExtractor(apply_ocr=False)
# NOTE(review): ignore_mismatched_sizes is normally a model-loading option;
# presumably it has no effect on a tokenizer -- confirm and drop if so.
tokeniser = LayoutLMv3TokenizerFast.from_pretrained(base_model_path,ignore_mismatched_sizes=True)

processor = LayoutLMv3Processor(tokenizer=tokeniser,feature_extractor=featur_extractor)
# NOTE(review): this instance is replaced by ModelModule(4) below when the
# file runs as a script, so the load is only useful to importers of `model`.
model = LayoutLMv3ForTokenClassification.from_pretrained(base_model_path)

if __name__ == "__main__":
    ds = dataSet(train_data_path,processor)
    dataload = torch.utils.data.DataLoader(ds,batch_size=2)

    # creating model instance
    model = ModelModule(4)

    # optimizer and loss
    optimizer = AdamW(model.parameters(),lr=5e-5)
    best_loss = np.inf

    # Training the model
    loss_list = []
    for epoch in range(num_epochs):
        # Training
        train_loss = train_fn(dataload, model, optimizer)

        # Keep the weights of the epoch with the lowest training loss so far.
        if train_loss < best_loss:
            torch.save(model.state_dict(), save_model_path)
            best_loss = train_loss

        # Periodic checkpoint, independent of the best-loss snapshot.
        if epoch % checkpoint_every == 0:
            torch.save(model.state_dict(), f'./model_{epoch}.bin')
            print('i = {}'.format(epoch))

        # Printed once per epoch (it used to be printed twice on checkpoint
        # epochs).
        print(f"{epoch} with loss {train_loss}")

        loss_list.append(train_loss)

        # evaluation
        # NOTE(review): this evaluates on the training loader, so the number
        # is not a held-out validation loss.
        eval_loss = eval_fn(dataload, model)
        print("Evaluation loss :",  eval_loss)

    # np.save writes a real .npy file and closes it; ndarray.dump wrote a
    # pickle into a file named .npy through a handle that was never closed.
    np.save('loss_list.npy', np.array(loss_list))

