# Train/Fine Tune Tesseract model

### Import libraries

In [1]:
import os
import py7zr
import tarfile
import subprocess
from evaluate_model import *
from generate_output_text import process_images
from download_benchmarks import download_benchmark

### Install Tesseract

#### Install needed libraries

In [None]:
%pip install Pillow>=6.2.1 python-bidi>=0.4 matplotlib pandas

#### Install Tesseract and needed packages

In [None]:
!sudo apt install tesseract-ocr
!sudo apt install libtesseract-dev
!sudo apt install bc

#### Clone Tesseract

In [None]:
# Clone the Tesseract source repository (creates ./tesseract).
!git clone https://github.com/tesseract-ocr/tesseract.git

#### Build and install Tesseract with make

In [None]:
cd tessract

In [None]:
# Run autogen.sh from the current directory (expected to be the Tesseract checkout).
!./autogen.sh

In [None]:
# Run the configure script in the current directory.
!./configure

In [None]:
# Build the default make target in the current directory.
!make

In [None]:
# Run the `install` make target with root privileges.
!sudo make install

In [None]:
# Run ldconfig as root; presumably so the newly installed shared libraries
# are picked up by the dynamic linker — confirm.
!sudo ldconfig

In [None]:
# Build the `training` make target.
!make training

In [None]:
# Run the `training-install` make target with root privileges.
!sudo make training-install

In [None]:
# Go back up to the parent directory.
cd ..

### Clone tesstrain and tessdata_best

In [None]:
# Clone the tesstrain repository (creates ./tesstrain).
!git clone https://github.com/tesseract-ocr/tesstrain.git

In [5]:
# Make git use HTTP/1.1 for all repositories of this user (global setting).
# NOTE(review): presumably a workaround for transfer errors when cloning the
# large tessdata_best repository over HTTP/2 — confirm.
!git config --global http.version HTTP/1.1

In [None]:
# Clone the tessdata_best repository (creates ./tessdata_best).
!git clone https://github.com/tesseract-ocr/tessdata_best.git

### Create langdata folder

In [None]:
# Enter the tesstrain checkout; the following cells use paths relative to it.
cd tesstrain

#### Install required data

In [None]:
# Install the Python packages listed in the requirements.txt of the current directory.
%pip install -r requirements.txt
# Run the `tesseract-langdata` make target.
!make tesseract-langdata

### Create the needed folders for the model

In [28]:
!mkdir ./data/ara
!mkdir ./data/Tesseract_F_JSTOR_TRDG_Shaden
!mkdir ./data/Tesseract_F_JSTOR_TRDG_Shaden-ground-truth

In [None]:
# Go back up to the parent directory.
cd ..

### Copy all files from the needed_files folders into the model folder in tesstrain

In [30]:
import shutil

# Copy every regular file from needed_files/ara into the model's data folder.
folder_cp_path = './needed_files/ara'
folder_paste_path = './tesstrain/data/Tesseract_F_JSTOR_TRDG_Shaden'
for file in os.listdir(folder_cp_path):
    file_path = os.path.join(folder_cp_path, file)
    # The previous plain `cp` (no -r) skipped directories; keep that behaviour.
    if not os.path.isfile(file_path):
        continue
    # shutil.copy copes with spaces/shell metacharacters in file names and
    # raises on failure instead of discarding the error output.
    shutil.copy(file_path, folder_paste_path)

In [31]:
import shutil

# Copy every regular file from needed_files/rest_data into tesstrain/data.
folder_cp_path = './needed_files/rest_data'
folder_paste_path = './tesstrain/data'
for file in os.listdir(folder_cp_path):
    file_path = os.path.join(folder_cp_path, file)
    # The previous plain `cp` (no -r) skipped directories; keep that behaviour.
    if not os.path.isfile(file_path):
        continue
    # shutil.copy copes with spaces/shell metacharacters in file names and
    # raises on failure instead of discarding the error output.
    shutil.copy(file_path, folder_paste_path)

### Rename files in the model folder

In [32]:
# Rename every file in the model folder from `<prefix>.<suffix>` to
# `<model_name>.<suffix>`.
model_name = 'Tesseract_F_JSTOR_TRDG_Shaden'
folder_path = './tesstrain/data/Tesseract_F_JSTOR_TRDG_Shaden'
for file in os.listdir(folder_path):
    # Split at the FIRST dot and keep everything after it. Taking only the
    # second dot-separated component would map e.g. `ara.training_text` and
    # `ara.training_text.bigram_freqs` onto the same target name, so one
    # file would overwrite the other.
    prefix, dot, suffix = file.partition('.')
    if not prefix or not dot or not suffix:
        # Dotfiles (e.g. `.ipynb_checkpoints`) and names without an
        # extension have nothing to rename.
        continue
    file_path = os.path.join(folder_path, file)
    new_name_path = os.path.join(folder_path, model_name + '.' + suffix)
    # os.rename raises on failure instead of silently discarding the error.
    os.rename(file_path, new_name_path)

### Unzip all datasets inside the ground truth

In [33]:
# Extract every 7z archive found in ./datasets into the ground-truth folder.
unzip_folder_path = './tesstrain/data/Tesseract_F_JSTOR_TRDG_Shaden-ground-truth'
datasets_path = './datasets'
for ds in os.listdir(datasets_path):
    ds_path = os.path.join(datasets_path, ds)
    # Skip anything that is not a 7z archive (sub-folders such as
    # `.ipynb_checkpoints`, stray files); opening those would abort the loop.
    if not (os.path.isfile(ds_path) and py7zr.is_7zfile(ds_path)):
        print(f'Skipping non-7z entry: {ds_path}')
        continue
    with py7zr.SevenZipFile(ds_path, mode='r') as z:
        z.extractall(path=unzip_folder_path)

#### Copy all files from every dataset folder to the ground truth

In [None]:
# Move the contents of every extracted dataset folder directly into the
# ground-truth folder, then delete the emptied dataset folder.
ground_truth_path = './tesstrain/data/Tesseract_F_JSTOR_TRDG_Shaden-ground-truth'
datasets_folders = ['Blur_1p8', 'Distortion_T2', 'Distortion_T_3', 'Dist_Type2', 'Dist_Type_0', 'JSTORArabic-ground-truth', 'num_dataset_FINAL', 'outputbase', 'Rand_Skew_Ang3', 'rand_dataset_txt2img']

# `dataset_dir` rather than `dir`: the loop variable stays defined after the
# cell has run, and `dir` would shadow the builtin for the rest of the session.
for dataset_dir in datasets_folders:
    dir_path = os.path.join(ground_truth_path, dataset_dir)
    if not os.path.isdir(dir_path):
        # Nothing was extracted under this name; nothing to copy or delete.
        continue

    # `cp -a <dir>/. <dest>/` copies the folder's contents (including hidden
    # files) while preserving attributes.
    copy_result = subprocess.run(['cp', '-a', f'{dir_path}/.', f'{ground_truth_path}/'],
                                 stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    if copy_result.returncode != 0:
        # Never delete the source when the copy failed, or the data is lost.
        error_text = copy_result.stderr.decode(errors='replace').strip()
        print(f'Copy failed for {dir_path}; folder kept. {error_text}')
        continue

    subprocess.run(['rm', '-fr', dir_path], stdout=subprocess.PIPE, stderr=subprocess.PIPE)

# Remove the notebook checkpoint folder from the ground-truth folder, if present.
subprocess.run(['rm', '-fr', './tesstrain/data/Tesseract_F_JSTOR_TRDG_Shaden-ground-truth/.ipynb_checkpoints'],
               stdout=subprocess.PIPE, stderr=subprocess.PIPE)

#### Train/Fine Tune the model

In [None]:
# Enter the tesstrain checkout; the training command below is run from here.
cd tesstrain

**Note:** adjust the `LEARNING_RATE`, `MAX_ITERATIONS` and `EPOCHS` values in the command below as needed before running it.

In [None]:
# Run tesstrain's `training` make target with 12 parallel jobs (-j12).
# Variables passed: model name Tesseract_F_JSTOR_TRDG_Shaden, start model `ara`
# taken from ../tessdata_best, LANG_TYPE=RTL, PSM=13, FINETUNE_TYPE=Plus,
# learning rate 0.0001 and an 80% training split (RATIO_TRAIN=0.80).
# NOTE(review): both MAX_ITERATIONS and EPOCHS are passed — check the tesstrain
# documentation for which of the two takes effect when both are set.
!make LANG_TYPE=RTL MODEL_NAME=Tesseract_F_JSTOR_TRDG_Shaden PSM=13 START_MODEL=ara TESSDATA=../tessdata_best FINETUNE_TYPE=Plus LEARNING_RATE=0.0001 MAX_ITERATIONS=10000 RATIO_TRAIN=0.80 EPOCHS=50 training -j12

In [None]:
# Go back up to the parent directory.
cd ..

# Evaluate the model

#### Install ocreval tool

##### Install for macOS

In [None]:
# Install utf8proc with Homebrew.
# NOTE(review): presumably a build dependency of ocreval — confirm.
!brew install utf8proc

In [None]:
# Clone the ocreval repository (creates ./ocreval).
!git clone https://github.com/eddieantonio/ocreval.git

In [None]:
# Enter the ocreval checkout.
cd ocreval

In [None]:
# Build the default make target in the current directory.
!make

In [None]:
!sudo make install

#### Create folders

In [1]:
!mkdir ./benchmarks
!mkdir ./benchmarks/csv_benchmarks
!mkdir ./benchmarks/zipped_benchmarks
!mkdir ./benchmarks/output_benchmarks

#### Download Hegghammer's benchmarks

In [None]:
# Every archive is stored in the zipped_benchmarks folder.
zipped_benchmarks_dir = './benchmarks/zipped_benchmarks'

# Fetch the ground-truth archive first.
download_benchmark('https://zenodo.org/records/5068735/files/ground_truth.tar.lzma?download=1', zipped_benchmarks_dir)

# Archive names of the 44 Yarmouk benchmark variants available in the record.
benchmarks_names = [
    'yarmouk_01_col.tar.lzma',
    'yarmouk_02_bin.tar.lzma',
    'yarmouk_03_col_blur.tar.lzma',
    'yarmouk_04_col_weak.tar.lzma',
    'yarmouk_05_col_snp.tar.lzma',
    'yarmouk_06_col_wm.tar.lzma',
    'yarmouk_07_col_scrib.tar.lzma',
    'yarmouk_08_col_ink.tar.lzma',
    'yarmouk_09_bin_blur.tar.lzma',
    'yarmouk_10_bin_weak.tar.lzma',
    'yarmouk_11_bin_snp.tar.lzma',
    'yarmouk_12_bin_wm.tar.lzma',
    'yarmouk_13_bin_scrib.tar.lzma',
    'yarmouk_14_bin_ink.tar.lzma',
    'yarmouk_15_col_blur_weak.tar.lzma',
    'yarmouk_16_col_blur_snp.tar.lzma',
    'yarmouk_17_col_blur_wm.tar.lzma',
    'yarmouk_18_col_blur_scrib.tar.lzma',
    'yarmouk_19_col_blur_ink.tar.lzma',
    'yarmouk_20_col_weak_snp.tar.lzma',
    'yarmouk_21_col_weak_wm.tar.lzma',
    'yarmouk_22_col_weak_scrib.tar.lzma',
    'yarmouk_23_col_weak_ink.tar.lzma',
    'yarmouk_24_col_snp_wm.tar.lzma',
    'yarmouk_25_col_snp_scrib.tar.lzma',
    'yarmouk_26_col_snp_ink.tar.lzma',
    'yarmouk_27_col_wm_scrib.tar.lzma',
    'yarmouk_28_col_wm_ink.tar.lzma',
    'yarmouk_29_col_scrib_ink.tar.lzma',
    'yarmouk_30_bin_blur_weak.tar.lzma',
    'yarmouk_31_bin_blur_snp.tar.lzma',
    'yarmouk_32_bin_blur_wm.tar.lzma',
    'yarmouk_33_bin_blur_scrib.tar.lzma',
    'yarmouk_34_bin_blur_ink.tar.lzma',
    'yarmouk_35_bin_weak_snp.tar.lzma',
    'yarmouk_36_bin_weak_wm.tar.lzma',
    'yarmouk_37_bin_weak_scrib.tar.lzma',
    'yarmouk_38_bin_weak_ink.tar.lzma',
    'yarmouk_39_bin_snp_wm.tar.lzma',
    'yarmouk_40_bin_snp_scrib.tar.lzma',
    'yarmouk_41_bin_snp_ink.tar.lzma',
    'yarmouk_42_bin_wm_scrib.tar.lzma',
    'yarmouk_43_bin_wm_ink.tar.lzma',
    'yarmouk_44_bin_scrib_ink.tar.lzma',
]

# Only the first `num_to_download` entries of the list are fetched.
num_to_download = 3
for archive_name in benchmarks_names[:num_to_download]:
    print(archive_name)
    download_benchmark(f'https://zenodo.org/records/5068735/files/{archive_name}?download=1', zipped_benchmarks_dir)
    

#### Unzip benchmarks

In [None]:
# Extract every archive found in zipped_benchmarks into ./benchmarks.
unzip_benchmark_folder_path = './benchmarks'
benchmarks_path = './benchmarks/zipped_benchmarks'
for archive_name in os.listdir(benchmarks_path):
    print(archive_name)
    archive_path = os.path.join(benchmarks_path, archive_name)
    # NOTE(review): the archives are opened with mode 'r:xz'; this relies on
    # Python's lzma reader also accepting the legacy .lzma container — confirm.
    with tarfile.open(archive_path, mode='r:xz') as archive:
        archive.extractall(path=unzip_benchmark_folder_path)

# Give the extracted folders their final names:
# yarmouk -> yarmouk_benchmarks, ground_truth -> benchmark_ground_truth.
folder_renames = (
    ('./benchmarks/yarmouk', './benchmarks/yarmouk_benchmarks'),
    ('./benchmarks/ground_truth', './benchmarks/benchmark_ground_truth'),
)
for old_path, new_path in folder_renames:
    subprocess.run(f'mv {old_path} {new_path}', shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

#### Generate output text files
The output text files are in `./benchmarks/output_benchmarks`.

In [None]:
# Call process_images once for every entry of the extracted benchmarks folder.
benchmarks_folder_path = './benchmarks/yarmouk_benchmarks/'

# NOTE(review): only the bare entry name is passed as folder_path (it is not
# joined with benchmarks_folder_path) — confirm process_images resolves it.
# NOTE(review): model='ara' is used here, while the evaluation cell labels the
# results 'Tesseract_F_JSTOR_TRDG_Shaden' — confirm this is intended.
for benchmark_folder in os.listdir(benchmarks_folder_path):
    print(benchmark_folder)
    process_images(folder_path=benchmark_folder, model='ara', is_colab=True)

#### Use the ISRI tool (ocreval) to evaluate the model

In [None]:
# Accumulators for the per-file results of all benchmark output folders.
dataset_name_lst, file_name_lst, engine_name_lst = [], [], []
char_acc_lst, word_acc_lst = [], []

gt_path = './benchmarks/benchmark_ground_truth/yarmouk_gt'
output_benchmarks_path = './benchmarks/output_benchmarks'

for folder in os.listdir(output_benchmarks_path):
    print(folder)
    output_folder_path = os.path.join(output_benchmarks_path, folder)

    # generate_accuracy returns five sequences for the given output folder.
    (dataset_name, file_name, engine_name,
     char_acc, word_acc) = generate_accuracy(
        ground_truth_path=gt_path,
        output_folder_path=output_folder_path,
        model_name='Tesseract_F_JSTOR_TRDG_Shaden',
    )

    # Append this folder's results to the matching accumulator.
    result_columns = (
        (dataset_name_lst, dataset_name),
        (file_name_lst, file_name),
        (engine_name_lst, engine_name),
        (char_acc_lst, char_acc),
        (word_acc_lst, word_acc),
    )
    for accumulator, folder_values in result_columns:
        accumulator.extend(folder_values)

# Hand the collected columns to create_df under the model's name.
create_df(
    csv_name='Tesseract_F_JSTOR_TRDG_Shaden',
    dataset_name_lst=dataset_name_lst,
    file_name_lst=file_name_lst,
    engine_name_lst=engine_name_lst,
    char_acc_lst=char_acc_lst,
    word_acc_lst=word_acc_lst,
)