## Config

In [None]:
import matplotlib.pyplot as plt
from PIL import Image

from vietocr.tool.config import Cfg
from vietocr.model.trainer import Trainer

In [None]:
# Start from the stock 'vgg_transformer' configuration and override it below.
config = Cfg.load_config_from_name('vgg_transformer')

# Dataset section overrides: dataset name, image root and annotation files.
dataset_params = dict(
    name='imgVietocr',
    data_root='/mnt/disk1/mbbank/OCR/DATA/data_quangnd/new_train',
    train_annotation='/mnt/disk1/mbbank/OCR/DATA/team/train.txt',
    valid_annotation='/mnt/disk1/mbbank/OCR/DATA/team/val.txt',
)

# Trainer section overrides; the meaning of each key is defined by vietocr's
# Trainer (NOTE(review): confirm units of 'metrics' against the library).
params = dict(
    print_every=200,
    valid_every=2 * 200,
    iters=2000000,
    checkpoint='/mnt/disk1/mbbank/OCR/CODE/VietOcr/weight/vietocr_V1.pth',
    export='/mnt/disk1/mbbank/OCR/CODE/VietOcr/weight/vietocr_V2.pth',
    metrics=150,
)

config['dataset'].update(dataset_params)
config['trainer'].update(params)

# This line is optional; keep it only when extra characters must be appended
# to the vocabulary.
config['vocab'] = config['vocab'] + ''.join(('–', 'ü', 'ā', 'ö'))
config['optimizer']['max_lr'] = 0.00005
config['device'] = 'cuda:0'

# Echo the final configuration as the cell output.
config

## Train

In [None]:
# Build the trainer from the configuration assembled above, with
# pretrained=False passed to the Trainer constructor.
trainer = Trainer(config, pretrained=False)
# Persist the configuration held by the trainer to a YAML file so the same
# settings can be reloaded later.
trainer.config.save('/mnt/disk1/mbbank/OCR/CODE/VietOcr/vietocr/config/config.yml')

In [None]:
# Optional sanity check before training (left disabled):
# trainer.visualize_dataset()
# Run the training loop with the settings stored in the trainer's config.
trainer.train()

In [None]:
# Evaluate the trained model; the return value is echoed as the cell output.
# NOTE(review): presumably reports accuracy on the validation annotations -
# confirm against vietocr's Trainer.precision.
trainer.precision()

In [None]:
# Visual inspection of model predictions.
# NOTE(review): presumably plots sample images with their predicted text -
# confirm against vietocr's Trainer.visualize_prediction.
trainer.visualize_prediction()

In [None]:
# torch is imported here because this cell precedes the notebook's main
# import cell; without it, running the notebook top to bottom raises NameError.
import torch

# Release cached GPU memory that PyTorch's allocator is holding but not using,
# so it becomes available again after training. No gradient context is needed:
# empty_cache() performs no tensor operations.
torch.cuda.empty_cache()

## Infer

In [None]:
# Enable IPython's autoreload extension so edits to imported project modules
# are picked up without restarting the kernel (mode 2: reload all modules
# before executing each cell).
%load_ext autoreload
%autoreload 2

In [26]:
import  sys
sys.path.append('/mnt/disk1/mbbank/OCR/CODE/VietOcr')

from vietocr.tool.predictor import Predictor
from vietocr.tool.translate import translate_full_prob
from vietocr.tool.config import Cfg
sys.path.append('../')
from Preprocessing.skew import SkewCorrection
from Preprocessing.perspective import PerspectiveCorrection

import os
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt
from tqdm import tqdm
import torch

class RECOGNIZE():
    """Single-model text recognizer wrapping a VietOCR ``Predictor``.

    The configuration is loaded from a YAML file, pointed at the given
    weights, and used to build one ``Predictor`` stored in ``self.detector``.
    """

    def __init__(self, weight_path='./weight/vietocr_v1.pth',
                 config_path = '/mnt/disk1/mbbank/OCR/CODE/VietOcr/config/config_V5.yml',
                 device='cpu') -> None:
        """Load the config and build the predictor.

        Args:
            weight_path: Path to the model weights, stored in
                ``config['weights']``.
            config_path: Path to the YAML configuration file.
            device: Device string stored in ``config['device']``
                (for example ``'cpu'`` or ``'cuda:0'``).
        """
        config = Cfg.load_config_from_file(config_path)
        config['weights'] = weight_path
        # The recognition weights are loaded from `weight_path`, so the CNN
        # backbone's own pretrained flag is switched off.
        config['cnn']['pretrained'] = False
        config['device'] = device
        config['predictor']['beamsearch'] = False
        self.config = config
        self.detector = Predictor(config)

    def predict_image(self, img_path, preprocess=False):
        """Recognize the text in one image file.

        Args:
            img_path: Path of the image to read.
            preprocess: When true, the image is passed through
                ``PerspectiveCorrection`` before recognition.

        Returns:
            The first element returned by ``Predictor.predict`` (the
            predicted text).
        """
        # Open inside a context manager so the underlying file handle is
        # always released; convert() yields an independent in-memory image.
        with Image.open(img_path) as source:
            img_pil = source.convert('RGB')

        if preprocess:
            img_corrected = PerspectiveCorrection(np.array(img_pil))
            img_pil = Image.fromarray(img_corrected)

        # With full_seq=True the predictor also returns the full probability
        # output; it is not needed for single-model prediction.
        words_predicted, _ = self.detector.predict(img_pil, full_seq=True)
        return words_predicted
    
class RECOGNIZE_ENSEMBLE():
    """Ensemble text recognizer averaging several VietOCR ``Predictor`` outputs.

    One ``Predictor`` is built per weight file from the same configuration.
    At prediction time the per-model full-sequence outputs are averaged and
    decoded with the first model's vocabulary.
    """

    def __init__(self, weight_paths=None,
                 config_path = '/mnt/disk1/mbbank/OCR/CODE/VietOcr/config/config_V3.yml',
                 device='cpu') -> None:
        """Load the config and build one predictor per weight file.

        Args:
            weight_paths: Iterable of weight file paths, or a single path
                string. Defaults to ``['./weight/vietocr_v3.pth']``.
            config_path: Path to the YAML configuration shared by all models.
            device: Device string stored in ``config['device']``.

        Raises:
            ValueError: If ``weight_paths`` is empty.
        """
        # A None sentinel avoids sharing one mutable default list across calls.
        if weight_paths is None:
            weight_paths = ['./weight/vietocr_v3.pth']
        # A bare string would otherwise be iterated character by character.
        if isinstance(weight_paths, str):
            weight_paths = [weight_paths]
        if not weight_paths:
            raise ValueError('weight_paths must contain at least one weight file')

        config = Cfg.load_config_from_file(config_path)
        config['cnn']['pretrained'] = False
        config['device'] = device
        config['predictor']['beamsearch'] = False
        print('LEN VOCAB' ,len(config['vocab']))
        self.config = config
        self.detectors = []
        for weight_path in weight_paths:
            # The same config object is reused; only the weights entry changes
            # before each Predictor is constructed.
            config['weights'] = weight_path
            self.detectors.append(Predictor(config))

    def predict_image(self, img_path, preprocess=False):
        """Recognize the text in one image with every model and the ensemble.

        Args:
            img_path: Path of the image to read.
            preprocess: When true, the image is passed through
                ``PerspectiveCorrection`` before recognition.

        Returns:
            A tuple ``(words_predicted_list, words_predicted_ensemble)``: the
            per-model predictions in detector order, and the prediction
            decoded from the averaged outputs.
        """
        # Open inside a context manager so the underlying file handle is
        # always released; convert() yields an independent in-memory image.
        with Image.open(img_path) as source:
            img_pil = source.convert('RGB')

        if preprocess:
            img_corrected = PerspectiveCorrection(np.array(img_pil))
            img_pil = Image.fromarray(img_corrected)

        n = len(self.detectors)
        words_predicted_list = []
        full_prob = None
        for detector in self.detectors:
            words_predicted, output = detector.predict(img_pil, full_seq=True)
            words_predicted_list.append(words_predicted)
            # Running mean: each model contributes output / n.
            # NOTE(review): assumes every model returns an output of the same
            # shape over the same vocabulary - confirm for mixed checkpoints.
            if full_prob is None:
                full_prob = output / n
            else:
                full_prob += output / n

        s, probs = translate_full_prob(full_prob)
        # All detectors share one config, so the first vocabulary is used.
        words_predicted_ensemble = self.detectors[0].vocab.decode(s.tolist())
        return words_predicted_list, words_predicted_ensemble

# Ensemble of two checkpoints built from the same configuration file.
weight1 = '/mnt/disk1/mbbank/OCR/CODE/VietOcr/weight/vietocr_V3.pth'
weight2 = '/mnt/disk1/mbbank/OCR/CODE/VietOcr/weight/vietocr_V2.pth'
config_path = '/mnt/disk1/mbbank/OCR/CODE/VietOcr/config/config_V3.yml'
model_v1 = RECOGNIZE_ENSEMBLE(
    weight_paths=[weight1, weight2],
    config_path=config_path,
    device='cuda:2',
)

org_test = '/mnt/disk1/mbbank/OCR/DATA/data_quangnd/test'
v1_predict = {}
# Quick smoke test on two files only. os.listdir returns entries in arbitrary
# order, so which two files are picked is not guaranteed to be stable.
bar = tqdm(os.listdir(org_test)[1000:1002])
for img_path in bar:
    # Map file name -> (per-model predictions, ensemble prediction).
    v1_predict[img_path] = model_v1.predict_image(
        f'{org_test}/{img_path}', preprocess=False
    )

# Echo the collected predictions as the cell output.
v1_predict

LEN VOCAB 233


100%|██████████| 2/2 [00:04<00:00,  2.41s/it]


{'public_test_img_11348.jpg': (['quị', 'quị'], 'quị'),
 'public_test_img_22272.png': (['phấp', 'nhấp'], 'phấp')}

In [None]:
# Single-model inference with the V3 checkpoint.
# NOTE(review): config_path is left at the class default (config_V5.yml) while
# the ensemble cell pairs the V3 weights with config_V3.yml - confirm intended.
weight = '/mnt/disk1/mbbank/OCR/CODE/VietOcr/weight/vietocr_V3.pth'
model_v1 = RECOGNIZE(device='cuda:2', weight_path=weight)

org_test = '/mnt/disk1/mbbank/OCR/DATA/data_quangnd/test'
v1_predict = {}
# Only the first two directory entries are processed; os.listdir order is
# arbitrary, so the selection is not guaranteed to be stable.
bar = tqdm(os.listdir(org_test)[0:2])
for img_path in bar:
    # Map file name -> predicted text.
    v1_predict[img_path] = model_v1.predict_image(
        f'{org_test}/{img_path}', preprocess=False
    )

# Disabled export of the predictions as "<file name>\t<prediction>" lines:
# with open('/mnt/disk1/mbbank/OCR/CODE/VietOcr/puplictest_infer/vietocr_V3_beamsearch_perspective.txt', 'w') as f:
#     for key, value in v1_predict.items():
#         f.write('%s\t%s\n' % (key, value))

In [None]:
# Show one test image with its prediction as the plot title.
img_name = 'public_test_img_10774.jpg'
img_path = org_test + '/' + img_name

# v1_predict only holds the few files processed in the cell above, so this
# image may be missing; predict it on demand instead of raising KeyError.
if img_name not in v1_predict:
    v1_predict[img_name] = model_v1.predict_image(img_path, preprocess=False)

image = Image.open(img_path).convert('RGB')
plt.imshow(image)
plt.title(v1_predict[img_name])

In [None]:

# Read back a saved prediction file and print each record.
# The encoding is explicit because the predictions contain Vietnamese text and
# the platform default encoding is not guaranteed to be UTF-8.
with open('/mnt/disk1/mbbank/OCR/CODE/VietOcr/puplictest_infer/vietocr_V3_beamsearch_perspective.txt', 'r', encoding='utf-8') as f:
    lines = f.readlines()
for cnt, line in enumerate(lines):
    print(cnt, line.strip())
    # First whitespace-separated token, then the remainder of the line.
    fields = line.strip().split(None, 1)
    if not fields:
        # Blank line: nothing to parse (previously raised ValueError).
        continue
    command = fields[0]
    # A record with no second field yields an empty string instead of
    # failing on tuple unpacking.
    description = fields[1] if len(fields) > 1 else ''
    print(cnt, command, description)
     