In [1]:
# Mount Google Drive at /content/drive; later cells read the project archive,
# dataset and checkpoints from paths under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!unzip /content/drive/MyDrive/DAN.zip

Archive:  /content/drive/MyDrive/DAN.zip
   creating: content/DAN/
  inflating: content/DAN/README.md   
  inflating: content/DAN/LICENSE_CECILL-C.md  
  inflating: content/DAN/visual_slanted_lines.png  
   creating: content/DAN/.git/
   creating: content/DAN/.git/branches/
  inflating: content/DAN/.git/packed-refs  
 extracting: content/DAN/.git/HEAD   
   creating: content/DAN/.git/hooks/
  inflating: content/DAN/.git/hooks/pre-merge-commit.sample  
  inflating: content/DAN/.git/hooks/update.sample  
  inflating: content/DAN/.git/hooks/post-update.sample  
  inflating: content/DAN/.git/hooks/pre-push.sample  
  inflating: content/DAN/.git/hooks/applypatch-msg.sample  
  inflating: content/DAN/.git/hooks/pre-applypatch.sample  
  inflating: content/DAN/.git/hooks/commit-msg.sample  
  inflating: content/DAN/.git/hooks/prepare-commit-msg.sample  
  inflating: content/DAN/.git/hooks/push-to-checkout.sample  
  inflating: content/DAN/.git/hooks/pre-receive.sample  
  inflating: content/D

# **Dependencies Installation**

In [1]:
!pip install scipy
!pip install skia-pathops
!pip install tiffile
!pip install srt
!pip install skia-pathops
!pip install torch
!pip install  torchvision
!pip install screeninfo==0.7
!pip install pyunpack
#!pip install pycairo
!pip install -r /content/content/DAN/requirements.txt

Collecting tiffile
  Using cached tiffile-2018.10.18-py2.py3-none-any.whl (2.7 kB)
Installing collected packages: tiffile
Successfully installed tiffile-2018.10.18
Collecting srt
  Using cached srt-3.5.3-py3-none-any.whl
Installing collected packages: srt
Successfully installed srt-3.5.3
Collecting screeninfo==0.7
  Downloading screeninfo-0.7.tar.gz (10 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: screeninfo
  Building wheel for screeninfo (setup.py) ... [?25l[?25hdone
  Created wheel for screeninfo: filename=screeninfo-0.7-py3-none-any.whl size=13624 sha256=0428bfdf649b382d27be2da899971dae27785b114f471cdb8edd780146199706
  Stored in directory: /root/.cache/pip/wheels/02/8a/a8/3bb3aa4fd31941124c5975d00412a63938fa5879c4b7b63458
Successfully built screeninfo
Installing collected packages: screeninfo
Successfully installed screeninfo-0.7
Collecting pyunpack
  Downloading pyunpack-0.3-py2.py3-none-any.whl (4.1 kB)
Collecting easyprocess

# **Dataset Creation**

In [None]:
# Run the READ 2016 formatter script from its own directory (it is invoked by
# bare file name). NOTE(review): presumably this produces the formatted dataset
# that the training cell reads -- confirm the output location.
%cd /content/content/DAN/Datasets/dataset_formatters
!python read2016_formatter.py

In [None]:
#!zip -r /content/drive/MyDrive/DAN.zip /content/content/DAN

In [None]:
# Switch to the repository root. NOTE(review): the project-local imports in the
# following cells (basic.*, OCR.*) presumably resolve from this working
# directory -- confirm.
%cd /content/content/DAN

/content/content/DAN


# **Training**

In [None]:
import os
import sys
#DOSSIER_COURRANT = os.path.dirname(os.path.abspath(__file__))
#DOSSIER_PARENT = os.path.dirname(DOSSIER_COURRANT)
from torch.optim import Adam
from basic.transforms import aug_config
from OCR.ocr_dataset_manager import OCRDataset, OCRDatasetManager
from OCR.document_OCR.dan.trainer_dan import Manager
from OCR.document_OCR.dan.models_dan import GlobalHTADecoder
from basic.models import FCN_Encoder
from basic.scheduler import exponential_dropout_scheduler, linear_scheduler
import torch
import numpy as np
import random
import torch.multiprocessing as mp

# function to train the model
def train_and_test(rank, params):
    """Train a DAN model, then evaluate its "best" checkpoint on every split.

    Args:
        rank: Process rank; stored in ``params["training_params"]["ddp_rank"]``.
        params: Full configuration dictionary handed to ``Manager``. It is
            mutated in place (the ``ddp_rank`` entry is written).
    """
    # Seed every random number generator in use and force deterministic cuDNN.
    seed = 0
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    training_params = params["training_params"]
    training_params["ddp_rank"] = rank

    model = Manager(params)
    model.load_model()
    model.train()

    # Switch to the checkpoint flagged as "best" before running the evaluation.
    model.params["training_params"]["load_epoch"] = "best"
    model.load_model()

    metrics = ["cer", "wer", "time", "map_cer",  "loer"]
    split_names = ("test", "valid", "train")
    for ds_name in params["dataset_params"]["datasets"]:
        for split in split_names:
            custom_name = "{}-{}".format(ds_name, split)
            model.predict(custom_name, [(ds_name, split)], metrics, output=True)


if __name__ == "__main__":
    # Entry point of the training cell: build the complete configuration
    # dictionary (dataset, model and training sections) and launch
    # train_and_test, either directly or through mp.spawn when DDP is enabled.

    dataset_name = "READ_2016"  # ["RIMES", "READ_2016"]
    dataset_level = "page"  # ["page", "double_page"]
    dataset_variant = "_sem"

    # max number of lines for synthetic documents
    max_nb_lines = {
        "RIMES": 40,
        "READ_2016": 30,
    }

    params = {
        "dataset_params": {
            "dataset_manager": OCRDatasetManager,
            "dataset_class": OCRDataset,
            "datasets": {
                # Resolves to /content/drive/MyDrive/READ_2016_page_sem with the values above
                dataset_name: "/content/drive/MyDrive/{}_{}{}".format(dataset_name, dataset_level, dataset_variant),
            },
            "train": {
                "name": "{}-train".format(dataset_name),
                "datasets": [(dataset_name, "train"), ],
            },
            "valid": {
                "{}-valid".format(dataset_name): [(dataset_name, "valid"), ],
            },
            "config": {
                "load_in_memory": True,  # Load all images in CPU memory
                "worker_per_gpu": 4,  # Num of parallel processes per gpu for data loading
                "width_divisor": 8,  # Image width will be divided by 8
                "height_divisor": 32,  # Image height will be divided by 32
                "padding_value": 0,  # Image padding value
                "padding_token": None,  # Label padding value
                "charset_mode": "seq2seq",  # add end-of-transcription and start-of-transcription tokens to charset
                "constraints": ["add_eot", "add_sot"],  # add end-of-transcription and start-of-transcription tokens in labels
                "normalize": True,  # Normalize with mean and variance of training dataset
                "preprocessings": [
                    {
                        "type": "to_RGB",
                        # if grayscaled image, produce RGB one (3 channels with same value) otherwise do nothing
                    },
                ],
                "augmentation": aug_config(0.9, 0.1),
                # "synthetic_data": None,
                "synthetic_data": {
                    "init_proba": 0.9,  # initial probability of generating a synthetic document
                    "end_proba": 0.2,  # final probability of generating a synthetic document
                    "num_steps_proba": 200000,  # linearly decrease the percent of synthetic document from 90% to 20% through 200000 samples
                    "proba_scheduler_function": linear_scheduler,  # decrease proba rate linearly
                    "start_scheduler_at_max_line": True,  # start decreasing proba only after curriculum reach max number of lines
                    "dataset_level": dataset_level,
                    "curriculum": True,  # use curriculum learning (slowly increase number of lines per synthetic samples)
                    "crop_curriculum": True,  # during curriculum learning, crop images under the last text line
                    "curr_start": 0,  # start curriculum at iteration
                    "curr_step": 10000,  # interval to increase the number of lines for curriculum learning
                    "min_nb_lines": 1,  # initial number of lines for curriculum learning
                    "max_nb_lines": max_nb_lines[dataset_name],  # maximum number of lines for curriculum learning
                    "padding_value": 255,
                    # config for synthetic line generation
                    "config": {
                        "background_color_default": (255, 255, 255),
                        "background_color_eps": 15,
                        "text_color_default": (0, 0, 0),
                        "text_color_eps": 15,
                        "font_size_min": 35,
                        "font_size_max": 45,
                        "color_mode": "RGB",
                        "padding_left_ratio_min": 0.00,
                        "padding_left_ratio_max": 0.05,
                        "padding_right_ratio_min": 0.02,
                        "padding_right_ratio_max": 0.2,
                        "padding_top_ratio_min": 0.02,
                        "padding_top_ratio_max": 0.1,
                        "padding_bottom_ratio_min": 0.02,
                        "padding_bottom_ratio_max": 0.1,
                    },
                }
            }
        },

        "model_params": {
            "models": {
                "encoder": FCN_Encoder,
                "decoder": GlobalHTADecoder,
            },
            # "transfer_learning": None,
            "transfer_learning": {
                # model_name: [state_dict_name, checkpoint_path, learnable, strict]
                "encoder": ["encoder", "/content/drive/MyDrive/READ_2016_page/checkpoints/best.pt", True, True],
                "decoder": ["decoder", "/content/drive/MyDrive/READ_2016_page/checkpoints/best.pt", True, False],
            },
            "transfered_charset": True,  # Transfer learning of the decision layer based on charset of the line HTR model
            "additional_tokens": 1,  # for decision layer = [<eot>, ], only for transfered charset

            "input_channels": 3,  # number of channels of input image
            "dropout": 0.5,  # dropout rate for encoder
            "enc_dim": 256,  # dimension of extracted features
            "nb_layers": 5,  # encoder
            "h_max": 500,  # maximum height for encoder output (for 2D positional embedding)
            "w_max": 1000,  # maximum width for encoder output (for 2D positional embedding)
            "l_max": 15000,  # max predicted sequence (for 1D positional embedding)
            "dec_num_layers": 8,  # number of transformer decoder layers
            "dec_num_heads": 4,  # number of heads in transformer decoder layers
            "dec_res_dropout": 0.1,  # dropout in transformer decoder layers
            "dec_pred_dropout": 0.1,  # dropout rate before decision layer
            "dec_att_dropout": 0.1,  # dropout rate in multi head attention
            "dec_dim_feedforward": 256,  # number of dimension for feedforward layer in transformer decoder layers
            "use_2d_pe": True,  # use 2D positional embedding
            "use_1d_pe": True,  # use 1D positional embedding
            "use_lstm": False,
            "attention_win": 100,  # length of attention window
            # Curriculum dropout
            "dropout_scheduler": {
                "function": exponential_dropout_scheduler,
                "T": 5e4,
            }

        },

        "training_params": {
            "output_folder": "dan_read_page",  # folder name for checkpoint and results
            "max_nb_epochs": 50000,  # maximum number of epochs before stopping
            "max_training_time": 3600 * 24 * 1.9,  # maximum training time before stopping, in seconds (1.9 days)
            "load_epoch": "last",  # ["best", "last"]: last to continue training, best to evaluate
            "interval_save_weights": None,  # None: keep best and last only
            "batch_size": 1,  # mini-batch size for training
            "valid_batch_size": 4,  # mini-batch size for validation
            "use_ddp": False,  # Use DistributedDataParallel
            "ddp_port": "20027",
            "use_amp": True,  # Enable automatic mix-precision
            "nb_gpu": torch.cuda.device_count(),
            "optimizers": {
                "all": {
                    "class": Adam,
                    "args": {
                        "lr": 0.0001,
                        "amsgrad": False,
                    }
                },
            },
            "lr_schedulers": None,  # Learning rate schedulers
            "eval_on_valid": True,  # Whether to evaluate and log metrics on the validation set during training
            "eval_on_valid_interval": 5,  # Interval (in epochs) to evaluate during training
            "focus_metric": "cer",  # Metrics to focus on to determine best epoch
            "expected_metric_value": "low",  # ["high", "low"] What is best for the focus metric value
            "set_name_focus_metric": "{}-valid".format(dataset_name),  # Which dataset to focus on to select best weights
            "train_metrics": ["loss_ce", "cer", "wer", "syn_max_lines"],  # Metrics name for training
            "eval_metrics": ["cer", "wer", "map_cer"],  # Metrics name for evaluation on validation set during training
            "force_cpu": False,  # True for debug purposes
            "max_char_prediction": 3000,  # max number of token prediction
            # Keep teacher forcing rate to 20% during whole training
            "teacher_forcing_scheduler": {
                "min_error_rate": 0.2,
                "max_error_rate": 0.2,
                "total_num_steps": 5e4
            },
        },
    }

    # With DDP enabled (and not forced onto CPU) spawn one process per visible GPU;
    # otherwise run a single process with rank 0.
    if params["training_params"]["use_ddp"] and not params["training_params"]["force_cpu"]:
        mp.spawn(train_and_test, args=(params,), nprocs=params["training_params"]["nb_gpu"])
    else:
        train_and_test(0, params)

## **Predictions**

*Single-language prediction*

In [None]:
# Copy the generic dataset formatter into the repository root.
# NOTE(review): presumably needed so a project module can import it from the working directory -- confirm.
!cp /content/content/DAN/Datasets/dataset_formatters/generic_dataset_formatter.py /content/content/DAN
# Import necessary libraries
import os.path

import torch
from torch.optim import Adam
from PIL import Image
import numpy as np
# Importing specific modules from custom packages
from basic.models import FCN_Encoder
from OCR.document_OCR.dan.models_dan import GlobalHTADecoder
from OCR.document_OCR.dan.trainer_dan import Manager
from basic.utils import pad_images
from basic.metric_manager import keep_all_but_tokens

# Define a FakeDataset class for placeholder dataset information
class FakeDataset:
    """Minimal dataset stand-in exposing only a charset and special-token ids.

    The special tokens are numbered right after the last charset index:
    "end" = len(charset), "start" = len(charset) + 1, "pad" = len(charset) + 2.
    """

    def __init__(self, charset):
        self.charset = charset

        first_special_id = len(charset)
        self.tokens = dict(
            end=first_special_id,
            start=first_special_id + 1,
            pad=first_special_id + 2,
        )

# Function to get model parameters
def get_params(weight_path):
    """Build the configuration dictionary that ``predict`` passes to ``Manager``.

    Args:
        weight_path: Checkpoint path used as the transfer-learning source for
            both the encoder and the decoder.

    Returns:
        A dict with three sections: "dataset_params", "model_params" and
        "training_params".
    """
    # Entry layout: model_name -> [state_dict_name, checkpoint_path, learnable, strict]
    transfer_learning = {
        "encoder": ["encoder", weight_path, True, True],
        "decoder": ["decoder", weight_path, True, False],
    }

    model_params = {
        "models": {
            "encoder": FCN_Encoder,
            "decoder": GlobalHTADecoder,
        },
        "transfer_learning": transfer_learning,
        # Transfer learning of the decision layer based on the charset of the line HTR model
        "transfered_charset": True,
        # Extra decision-layer tokens ([<eot>]); only used with a transferred charset
        "additional_tokens": 1,

        # Encoder
        "input_channels": 3,  # channels of the input image
        "dropout": 0.5,  # encoder dropout rate
        "enc_dim": 256,  # dimension of the extracted features
        "nb_layers": 5,
        # Positional-embedding limits
        "h_max": 500,  # max encoder output height (2D positional embedding)
        "w_max": 1000,  # max encoder output width (2D positional embedding)
        "l_max": 15000,  # max predicted sequence length (1D positional embedding)
        # Transformer decoder
        "dec_num_layers": 8,  # number of decoder layers
        "dec_num_heads": 4,  # attention heads per decoder layer
        "dec_res_dropout": 0.1,  # dropout inside decoder layers
        "dec_pred_dropout": 0.1,  # dropout before the decision layer
        "dec_att_dropout": 0.1,  # dropout in multi-head attention
        "dec_dim_feedforward": 256,  # feed-forward dimension in decoder layers
        "use_2d_pe": True,  # enable 2D positional embedding
        "use_1d_pe": True,  # enable 1D positional embedding
        "use_lstm": False,
        "attention_win": 100,  # attention window length
    }

    adam_config = {
        "class": Adam,
        "args": {
            "lr": 0.0001,
            "amsgrad": False,
        },
    }

    training_params = {
        "output_folder": "dan_rimes_page",  # folder name for checkpoints and results
        "max_nb_epochs": 50000,  # upper bound on the number of epochs
        "max_training_time": 3600 * 24 * 1.9,  # upper bound on training time, in seconds
        "load_epoch": "last",  # ["best", "last"]: last to continue training, best to evaluate
        "interval_save_weights": None,  # None: keep best and last only
        "batch_size": 1,  # training mini-batch size
        "valid_batch_size": 4,  # validation mini-batch size
        "use_ddp": False,  # DistributedDataParallel switch
        "ddp_port": "20027",
        "use_amp": True,  # automatic mixed precision switch
        "nb_gpu": torch.cuda.device_count(),
        "ddp_rank": 0,
        "lr_schedulers": None,  # learning-rate schedulers
        "eval_on_valid": True,  # evaluate and log metrics on the validation set during training
        "eval_on_valid_interval": 5,  # evaluation interval, in epochs
        "focus_metric": "cer",  # metric used to pick the best epoch
        "expected_metric_value": "low",  # ["high", "low"]: which direction is better for the focus metric
        "eval_metrics": ["cer", "wer", "map_cer"],  # metrics computed on the validation set
        "force_cpu": True,  # True for debug purposes
        "max_char_prediction": 3000,  # max number of predicted tokens
        # Teacher-forcing error rate held at 20% (min == max)
        "teacher_forcing_scheduler": {
            "min_error_rate": 0.2,
            "max_error_rate": 0.2,
            "total_num_steps": 5e4
        },
        "optimizers": {
            "all": adam_config,
        },
    }

    return {
        "dataset_params": {
            "charset": None,
        },
        "model_params": model_params,
        "training_params": training_params,
    }

#Function to make predictions using the trained model
def predict(model_path, img_paths, output_path='/content/content/DAN/prediction-single-lingual.txt'):
    """Transcribe page images with a trained DAN checkpoint and save the text.

    Args:
        model_path: Path to the checkpoint file. It must contain a "charset"
            entry and is also used as the weight source via ``get_params``.
        img_paths: Non-empty list of image file paths.
        output_path: Text file receiving the transcriptions (UTF-8), joined by
            newlines. Defaults to the location that used to be hard-coded.

    Returns:
        The list of predicted strings with layout tokens removed
        (presumably one entry per input image -- confirm against
        ``Manager.evaluate_batch``).

    Raises:
        ValueError: If ``img_paths`` is empty.
    """
    # Fail fast, before the (slow) checkpoint loading.
    if not img_paths:
        raise ValueError("img_paths must contain at least one image path")

    params = get_params(model_path)
    # NOTE(review): torch.load unpickles the file -- only load trusted checkpoints.
    checkpoint = torch.load(model_path, map_location="cpu")
    charset = checkpoint["charset"]

    manager = Manager(params)
    manager.params["model_params"]["vocab_size"] = len(charset)
    manager.load_model()
    # Set models to evaluation mode
    for model in manager.models.values():
        model.eval()
    manager.dataset = FakeDataset(charset)

    # Load every image as 3-channel RGB: grayscale is replicated over the
    # channels, alpha/palette images are converted so they match the model's
    # 3 input channels. The context manager closes the file handle.
    imgs = []
    for img_path in img_paths:
        with Image.open(img_path) as img:
            imgs.append(np.array(img.convert("RGB")))

    shapes = [img.shape[:2] for img in imgs]
    # Feature-map sizes: height divided by 32, width divided by 8.
    reduced_shapes = [[shape[0]//32, shape[1]//8] for shape in shapes]
    imgs_positions = [([0, shape[0]], [0, shape[1]]) for shape in shapes]
    imgs = pad_images(imgs, padding_value=0, padding_mode="br")
    imgs = torch.tensor(imgs).float().permute(0, 3, 1, 2)

    # Prepare batch data for evaluation
    batch_data = {
        "imgs": imgs,
        "imgs_reduced_shape": reduced_shapes,
        "imgs_position": imgs_positions,
        "raw_labels": None,
    }
    # Perform evaluation on the batch
    with torch.no_grad():
        res = manager.evaluate_batch(batch_data, metric_names = [])
    prediction = res["str_x"]

    # Strip the layout tokens from the raw predictions.
    layout_tokens = "".join(['Ⓑ', 'Ⓞ', 'Ⓟ', 'Ⓡ', 'Ⓢ', 'Ⓦ', 'Ⓨ', "Ⓐ", "Ⓝ", 'ⓑ', 'ⓞ', 'ⓟ', 'ⓡ', 'ⓢ', 'ⓦ', 'ⓨ', "ⓐ", "ⓝ"])
    prediction = [keep_all_but_tokens(x, layout_tokens) for x in prediction]
    print(prediction)

    # Write every prediction (not only the first); for a single image the file
    # content is exactly that prediction. The `with` block closes the file.
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write("\n".join(prediction))

    return prediction

if __name__ == "__main__":
    # Checkpoint of the trained model, stored on the mounted Drive
    model_path = "/content/drive/MyDrive/DAN Model/dan_read_page.pt"
    # Input image(s); replace with your own path(s), e.g.
    #img_paths = ["../../../test.png", "../../../test2.png"]
    img_paths = ["/content/drive/MyDrive/handwritting-to-text-with-ocr.jpg"]
    # Run the model on the image(s) listed above
    predict(model_path, img_paths)



##################
Available GPUS: 1
Rank 0: Tesla T4 _CudaDeviceProperties(name='Tesla T4', major=7, minor=5, total_memory=15101MB, multi_processor_count=40)
##################
Local GPU:
WORKING ON CPU !

##################
transfered weights for encoder
transfered weights for decoder
LOADED EPOCH: -1

['9h4Whenpast Calls,\nanswer.Itnew new ']


**Multilingual prediction**

In [None]:
# Copy the generic dataset formatter into the repository root.
# NOTE(review): presumably needed so a project module can import it from the working directory -- confirm.
!cp /content/content/DAN/Datasets/dataset_formatters/generic_dataset_formatter.py /content/content/DAN
# Import necessary libraries
import os.path

import torch
from torch.optim import Adam
from PIL import Image
import numpy as np
# Importing specific modules from custom packages
from basic.models import FCN_Encoder
from OCR.document_OCR.dan.models_dan import GlobalHTADecoder
from OCR.document_OCR.dan.trainer_dan import Manager
from basic.utils import pad_images
from basic.metric_manager import keep_all_but_tokens

# Define a FakeDataset class for placeholder dataset information
class FakeDataset:
    """Minimal dataset stand-in exposing only a charset and special-token ids.

    The special tokens are numbered right after the last charset index:
    "end" = len(charset), "start" = len(charset) + 1, "pad" = len(charset) + 2.
    """

    def __init__(self, charset):
        self.charset = charset

        first_special_id = len(charset)
        self.tokens = dict(
            end=first_special_id,
            start=first_special_id + 1,
            pad=first_special_id + 2,
        )

# Function to get model parameters
def get_params(weight_path):
    """Build the configuration dictionary that ``predict`` passes to ``Manager``.

    Args:
        weight_path: Checkpoint path used as the transfer-learning source for
            both the encoder and the decoder.

    Returns:
        A dict with three sections: "dataset_params", "model_params" and
        "training_params".
    """
    # Entry layout: model_name -> [state_dict_name, checkpoint_path, learnable, strict]
    transfer_learning = {
        "encoder": ["encoder", weight_path, True, True],
        "decoder": ["decoder", weight_path, True, False],
    }

    model_params = {
        "models": {
            "encoder": FCN_Encoder,
            "decoder": GlobalHTADecoder,
        },
        "transfer_learning": transfer_learning,
        # Transfer learning of the decision layer based on the charset of the line HTR model
        "transfered_charset": True,
        # Extra decision-layer tokens ([<eot>]); only used with a transferred charset
        "additional_tokens": 1,

        # Encoder
        "input_channels": 3,  # channels of the input image
        "dropout": 0.5,  # encoder dropout rate
        "enc_dim": 256,  # dimension of the extracted features
        "nb_layers": 5,
        # Positional-embedding limits
        "h_max": 500,  # max encoder output height (2D positional embedding)
        "w_max": 1000,  # max encoder output width (2D positional embedding)
        "l_max": 15000,  # max predicted sequence length (1D positional embedding)
        # Transformer decoder
        "dec_num_layers": 8,  # number of decoder layers
        "dec_num_heads": 4,  # attention heads per decoder layer
        "dec_res_dropout": 0.1,  # dropout inside decoder layers
        "dec_pred_dropout": 0.1,  # dropout before the decision layer
        "dec_att_dropout": 0.1,  # dropout in multi-head attention
        "dec_dim_feedforward": 256,  # feed-forward dimension in decoder layers
        "use_2d_pe": True,  # enable 2D positional embedding
        "use_1d_pe": True,  # enable 1D positional embedding
        "use_lstm": False,
        "attention_win": 100,  # attention window length
    }

    adam_config = {
        "class": Adam,
        "args": {
            "lr": 0.0001,
            "amsgrad": False,
        },
    }

    training_params = {
        "output_folder": "dan_rimes_page",  # folder name for checkpoints and results
        "max_nb_epochs": 50000,  # upper bound on the number of epochs
        "max_training_time": 3600 * 24 * 1.9,  # upper bound on training time, in seconds
        "load_epoch": "last",  # ["best", "last"]: last to continue training, best to evaluate
        "interval_save_weights": None,  # None: keep best and last only
        "batch_size": 1,  # training mini-batch size
        "valid_batch_size": 4,  # validation mini-batch size
        "use_ddp": False,  # DistributedDataParallel switch
        "ddp_port": "20027",
        "use_amp": True,  # automatic mixed precision switch
        "nb_gpu": torch.cuda.device_count(),
        "ddp_rank": 0,
        "lr_schedulers": None,  # learning-rate schedulers
        "eval_on_valid": True,  # evaluate and log metrics on the validation set during training
        "eval_on_valid_interval": 5,  # evaluation interval, in epochs
        "focus_metric": "cer",  # metric used to pick the best epoch
        "expected_metric_value": "low",  # ["high", "low"]: which direction is better for the focus metric
        "eval_metrics": ["cer", "wer", "map_cer"],  # metrics computed on the validation set
        "force_cpu": True,  # True for debug purposes
        "max_char_prediction": 3000,  # max number of predicted tokens
        # Teacher-forcing error rate held at 20% (min == max)
        "teacher_forcing_scheduler": {
            "min_error_rate": 0.2,
            "max_error_rate": 0.2,
            "total_num_steps": 5e4
        },
        "optimizers": {
            "all": adam_config,
        },
    }

    return {
        "dataset_params": {
            "charset": None,
        },
        "model_params": model_params,
        "training_params": training_params,
    }

#Function to make predictions using the trained model
def predict(model_path, img_paths, output_path='/content/content/DAN/prediction-multilingual.txt'):
    """Transcribe page images with a trained DAN checkpoint and save the text.

    Args:
        model_path: Path to the checkpoint file. It must contain a "charset"
            entry and is also used as the weight source via ``get_params``.
        img_paths: Non-empty list of image file paths.
        output_path: Text file receiving the transcriptions (UTF-8), joined by
            newlines. The multilingual cell previously wrote to
            "prediction-single-lingual.txt" (a copy-paste leftover), which
            overwrote the result of the single-language cell; it now defaults
            to its own file.

    Returns:
        The list of predicted strings with layout tokens removed
        (presumably one entry per input image -- confirm against
        ``Manager.evaluate_batch``).

    Raises:
        ValueError: If ``img_paths`` is empty.
    """
    # Fail fast, before the (slow) checkpoint loading.
    if not img_paths:
        raise ValueError("img_paths must contain at least one image path")

    params = get_params(model_path)
    # NOTE(review): torch.load unpickles the file -- only load trusted checkpoints.
    checkpoint = torch.load(model_path, map_location="cpu")
    charset = checkpoint["charset"]

    manager = Manager(params)
    manager.params["model_params"]["vocab_size"] = len(charset)
    manager.load_model()
    # Set models to evaluation mode
    for model in manager.models.values():
        model.eval()
    manager.dataset = FakeDataset(charset)

    # Load every image as 3-channel RGB: grayscale is replicated over the
    # channels, alpha/palette images are converted so they match the model's
    # 3 input channels. The context manager closes the file handle.
    imgs = []
    for img_path in img_paths:
        with Image.open(img_path) as img:
            imgs.append(np.array(img.convert("RGB")))

    shapes = [img.shape[:2] for img in imgs]
    # Feature-map sizes: height divided by 32, width divided by 8.
    reduced_shapes = [[shape[0]//32, shape[1]//8] for shape in shapes]
    imgs_positions = [([0, shape[0]], [0, shape[1]]) for shape in shapes]
    imgs = pad_images(imgs, padding_value=0, padding_mode="br")
    imgs = torch.tensor(imgs).float().permute(0, 3, 1, 2)

    # Prepare batch data for evaluation
    batch_data = {
        "imgs": imgs,
        "imgs_reduced_shape": reduced_shapes,
        "imgs_position": imgs_positions,
        "raw_labels": None,
    }
    # Perform evaluation on the batch
    with torch.no_grad():
        res = manager.evaluate_batch(batch_data, metric_names = [])
    prediction = res["str_x"]

    # Strip the layout tokens from the raw predictions.
    layout_tokens = "".join(['Ⓑ', 'Ⓞ', 'Ⓟ', 'Ⓡ', 'Ⓢ', 'Ⓦ', 'Ⓨ', "Ⓐ", "Ⓝ", 'ⓑ', 'ⓞ', 'ⓟ', 'ⓡ', 'ⓢ', 'ⓦ', 'ⓨ', "ⓐ", "ⓝ"])
    prediction = [keep_all_but_tokens(x, layout_tokens) for x in prediction]
    print(prediction)

    # Write every prediction (not only the first); for a single image the file
    # content is exactly that prediction. The `with` block closes the file.
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write("\n".join(prediction))

    return prediction

if __name__ == "__main__":
    # Checkpoint of the trained model, stored on the mounted Drive
    model_path = "/content/drive/MyDrive/DAN Model/dan_read_page.pt"
    # Input image(s); replace with your own path(s), e.g.
    #img_paths = ["../../../test.png", "../../../test2.png"]
    img_paths = ["/content/drive/MyDrive/multilingual.jpg"]
    # Run the model on the image(s) listed above
    predict(model_path, img_paths)


##################
Available GPUS: 1
Rank 0: Tesla T4 _CudaDeviceProperties(name='Tesla T4', major=7, minor=5, total_memory=15101MB, multi_processor_count=40)
##################
Local GPU:
WORKING ON CPU !

##################
transfered weights for encoder
transfered weights for decoder
LOADED EPOCH: -1

['816Ir Eeie Catūat obt Ach Aūf Caber\n\nKax Er Bei Et 1 fr/ sti\nCrisen.\nD W E:\nAzt .8']
