# Train/Fine Tune Tesseract model

### Import libraries

In [1]:
import os
import py7zr
import subprocess
from utilities import *
from evaluate_model import *
from generate_output_text import process_images
from generate_word_char_error import generate_errors

### Install Tesseract

#### Install needed libraries

In [None]:
%pip install Pillow>=6.2.1 python-bidi>=0.4 matplotlib pandas

#### Install Tesseract and needed packages

In [None]:
!sudo apt install tesseract-ocr
!sudo apt install libtesseract-dev
!sudo apt install bc

#### Clone Tesseract

In [None]:
# Clone the Tesseract source repository into ./tesseract (relative to the current directory).
!git clone https://github.com/tesseract-ocr/tesseract.git

#### Build and install Tesseract from source

In [None]:
cd tessract

In [None]:
# Run the autogen.sh script found in the current directory (expected to be the Tesseract checkout).
!./autogen.sh

In [None]:
# Run the configure script in the current directory with its default options.
!./configure

In [None]:
# Build the default make target in the current directory.
!make

In [None]:
# Run the `install` make target with root privileges.
!sudo make install

In [None]:
# Run ldconfig as root after the install step.
# NOTE(review): presumably so the freshly installed shared library is found by the dynamic linker — confirm.
!sudo ldconfig

In [None]:
# Build the `training` make target.
!make training

In [None]:
# Run the `training-install` make target with root privileges.
!sudo make training-install

In [None]:
cd ..

### Clone tesstrain and tessdata_best

In [1]:
# Clone the tesstrain repository into ./tesstrain (relative to the current directory).
!git clone https://github.com/tesseract-ocr/tesstrain.git

Cloning into 'tesstrain'...
remote: Enumerating objects: 997, done.[K
remote: Counting objects: 100% (222/222), done.[K
remote: Compressing objects: 100% (91/91), done.[K
remote: Total 997 (delta 130), reused 208 (delta 127), pack-reused 775[K
Receiving objects: 100% (997/997), 13.41 MiB | 3.18 MiB/s, done.
Resolving deltas: 100% (572/572), done.


In [5]:
# Set git's global `http.version` option to HTTP/1.1. This changes the user's global git config.
# NOTE(review): presumably a workaround for failures when cloning the large tessdata_best repo — confirm.
!git config --global http.version HTTP/1.1

In [2]:
# Clone the tessdata_best repository into ./tessdata_best; the recorded output shows a ~1.3 GiB download.
# NOTE(review): `-j12` is passed through to `git clone` — confirm it has the intended effect for this repository.
!git clone https://github.com/tesseract-ocr/tessdata_best.git -j12

Cloning into 'tessdata_best'...
remote: Enumerating objects: 257, done.[K
remote: Counting objects: 100% (7/7), done.[K
remote: Compressing objects: 100% (7/7), done.[K
remote: Total 257 (delta 1), reused 4 (delta 0), pack-reused 250[K
Receiving objects: 100% (257/257), 1.30 GiB | 2.05 MiB/s, done.
Resolving deltas: 100% (42/42), done.
Updating files: 100% (168/168), done.


### Create langdata folder

In [3]:
cd tesstrain

/Users/joe/Documents/ocr/tesstrain


  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


#### Install required data

In [5]:
# Install the Python requirements listed in requirements.txt of the current directory
# into the kernel's environment, then run the `tesseract-langdata` make target.
%pip install -r requirements.txt
!make tesseract-langdata

Collecting Pillow>=6.2.1 (from -r requirements.txt (line 1))
  Downloading pillow-10.2.0-cp311-cp311-macosx_11_0_arm64.whl.metadata (9.7 kB)
Collecting python-bidi>=0.4 (from -r requirements.txt (line 2))
  Downloading python_bidi-0.4.2-py2.py3-none-any.whl (30 kB)
Collecting matplotlib (from -r requirements.txt (line 3))
  Downloading matplotlib-3.8.3-cp311-cp311-macosx_11_0_arm64.whl.metadata (5.8 kB)
Collecting contourpy>=1.0.1 (from matplotlib->-r requirements.txt (line 3))
  Downloading contourpy-1.2.0-cp311-cp311-macosx_11_0_arm64.whl.metadata (5.8 kB)
Collecting cycler>=0.10 (from matplotlib->-r requirements.txt (line 3))
  Downloading cycler-0.12.1-py3-none-any.whl.metadata (3.8 kB)
Collecting fonttools>=4.22.0 (from matplotlib->-r requirements.txt (line 3))
  Downloading fonttools-4.49.0-cp311-cp311-macosx_10_9_universal2.whl.metadata (159 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m159.1/159.1 kB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01

### Create the needed folders for the model

In [None]:
!mkdir ./data/ara
!mkdir ./data/Tesseract_F_JSTOR_TRDG
!mkdir ./data/Tesseract_F_JSTOR_TRDG-ground-truth

In [7]:
cd ..

/Users/joe/Documents/ocr


  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


### Copy all files from the needed_files folders into the tesstrain data folders

In [None]:
# Copy every regular file from needed_files/ara into the model's data folder.
folder_cp_path = './needed_files/ara'
folder_paste_path = './tesstrain/data/Tesseract_F_JSTOR_TRDG'
for file in os.listdir(folder_cp_path):
    file_path = os.path.join(folder_cp_path, file)
    # `cp` without -r cannot copy directories (e.g. .ipynb_checkpoints), so skip them explicitly.
    if not os.path.isfile(file_path):
        continue
    # Passing an argument list (no shell) keeps names containing spaces or shell metacharacters intact.
    result = subprocess.run(['cp', file_path, folder_paste_path], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    # Report failures instead of silently discarding them; keep going with the remaining files.
    if result.returncode != 0:
        print(f'Failed to copy {file_path}: {result.stderr.decode(errors="replace").strip()}')

In [12]:
# Copy every regular file from needed_files/rest_data into tesstrain's data folder.
folder_cp_path = './needed_files/rest_data'
folder_paste_path = './tesstrain/data'
for file in os.listdir(folder_cp_path):
    file_path = os.path.join(folder_cp_path, file)
    # `cp` without -r cannot copy directories (e.g. .ipynb_checkpoints), so skip them explicitly.
    if not os.path.isfile(file_path):
        continue
    # Passing an argument list (no shell) keeps names containing spaces or shell metacharacters intact.
    result = subprocess.run(['cp', file_path, folder_paste_path], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    # Report failures instead of silently discarding them; keep going with the remaining files.
    if result.returncode != 0:
        print(f'Failed to copy {file_path}: {result.stderr.decode(errors="replace").strip()}')

### Rename files in the model folder

In [None]:
# Rename every file in the model folder from `<prefix>.<suffix>` to `<model_name>.<suffix>`.
model_name = 'Tesseract_F_JSTOR_TRDG'
folder_path = './tesstrain/data/Tesseract_F_JSTOR_TRDG'
for file in os.listdir(folder_path):
    file_path = os.path.join(folder_path, file)
    # Split on the first dot only so multi-part suffixes are kept whole.
    file_name_split = file.split('.', 1)
    # Skip names without a suffix, hidden entries (empty prefix, e.g. .ipynb_checkpoints)
    # and anything that is not a regular file.
    if len(file_name_split) < 2 or not file_name_split[0] or not os.path.isfile(file_path):
        continue
    new_name_path = os.path.join(folder_path, model_name + '.' + file_name_split[1])
    # Already carries the model name: nothing to do.
    if new_name_path == file_path:
        continue
    try:
        # Like `mv`, os.replace overwrites an existing destination file.
        os.replace(file_path, new_name_path)
    except OSError as err:
        # Report the failure instead of silently discarding it; continue with the other files.
        print(f'Failed to rename {file_path} to {new_name_path}: {err}')

### Unzip all datasets inside the ground truth

In [None]:
# Extract every 7z archive found in ./datasets into the ground-truth folder.
unzip_folder_path = './tesstrain/data/Tesseract_F_JSTOR_TRDG-ground-truth'
datasets_path = './datasets'
# sorted() makes the extraction order deterministic (os.listdir order is arbitrary).
for ds in sorted(os.listdir(datasets_path)):
    ds_path = os.path.join(datasets_path, ds)
    # Stray entries such as .DS_Store or .ipynb_checkpoints would make SevenZipFile raise
    # and abort the whole loop, so skip anything that is not a 7z archive.
    if not (os.path.isfile(ds_path) and py7zr.is_7zfile(ds_path)):
        print(f'Skipping non-7z entry: {ds_path}')
        continue
    with py7zr.SevenZipFile(ds_path, mode='r') as z:
        z.extractall(path=unzip_folder_path)

#### Copy all files from every dataset folder to the ground truth

In [None]:
# Flatten the extracted dataset folders: move the contents of each one directly into
# the ground-truth folder, then delete the now-redundant dataset folder.
ground_truth_path = './tesstrain/data/Tesseract_F_JSTOR_TRDG-ground-truth'
datasets_folders = ['Blur_1p8', 'Distortion_T2', 'Distortion_T_3', 'Dist_Type2', 'Dist_Type_0', 'JSTORArabic-ground-truth', 'num_dataset_FINAL', 'outputbase', 'Rand_Skew_Ang3', 'rand_dataset_txt2img']

# `folder_name` rather than `dir`, so the builtin dir() is not shadowed for the rest of the notebook.
for folder_name in datasets_folders:
    dir_path = os.path.join(ground_truth_path, folder_name)
    # Nothing to flatten if this dataset was not extracted.
    if not os.path.isdir(dir_path):
        continue
    # `cp -a <dir>/. <dest>/` copies the folder's contents (including dotfiles), preserving attributes.
    copy_result = subprocess.run(['cp', '-a', f'{dir_path}/.', f'{ground_truth_path}/'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    if copy_result.returncode != 0:
        # Keep the source folder when the copy failed; deleting it here would lose data.
        print(f'Failed to copy {dir_path}: {copy_result.stderr.decode(errors="replace").strip()}')
        continue
    subprocess.run(['rm', '-fr', dir_path], stdout=subprocess.PIPE, stderr=subprocess.PIPE)

# Remove the Jupyter checkpoint folder from the ground-truth folder, if there is one.
subprocess.run(['rm', '-fr', os.path.join(ground_truth_path, '.ipynb_checkpoints')], stdout=subprocess.PIPE, stderr=subprocess.PIPE)

#### Train/Fine Tune the model

In [None]:
cd tesstrain

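**Note:** adjust the `MAX_ITERATIONS` and `EPOCHS` values in the command below to suit your training run. `LEARNING_RATE` is not passed in the command, so add it there if you need to override tesstrain's default.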

In [None]:
# Run tesstrain's `training` target for the model Tesseract_F_JSTOR_TRDG, starting from the
# `ara` model, with TESSDATA pointing at ../tessdata_best and 12 parallel make jobs (-j12).
# NOTE(review): both MAX_ITERATIONS and EPOCHS are passed — confirm in tesstrain's Makefile which one takes effect.
!make LANG_TYPE=RTL MODEL_NAME=Tesseract_F_JSTOR_TRDG PSM=13 START_MODEL=ara TESSDATA=../tessdata_best MAX_ITERATIONS=500 RATIO_TRAIN=0.80 EPOCHS=500 training -j12

In [None]:
cd ..