In [None]:
# OCR Text Recognition: Keras-OCR vs. Pytesseract vs. EasyOCR

In [None]:
! pip install -q kaggle
! kaggle datasets download -d robikscube/textocr-text-extraction-from-images-dataset
!unzip textocr-text-extraction-from-images-dataset.zip

In [None]:
import numpy as np
import pandas as pd
from glob import glob
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
from PIL import Image

In [None]:
# Load the annotation and image metadata tables and list the training images.
# DATA_DIR is the directory the dataset archive was unpacked into.
DATA_DIR = "/content"
annot = pd.read_parquet(f"{DATA_DIR}/annot.parquet")
images = pd.read_parquet(f"{DATA_DIR}/img.parquet")
# glob() yields files in filesystem order, which is not stable across machines;
# sort so that positional lookups such as images_fns[17] always pick the same image.
images_fns = sorted(glob(f"{DATA_DIR}/train_val_images/train_images/*"))

In [None]:
# Preview one training image on a 10x10 inch figure with the axes hidden.
fig, ax = plt.subplots(figsize=(10, 10))
sample_image = plt.imread(images_fns[6])
ax.imshow(sample_image)
ax.set_axis_off()
plt.show()

In [None]:
# Show the annotation rows that belong to the first image.
# str.rstrip('.jpg') treats its argument as a *set* of characters and strips any
# trailing '.', 'j', 'p' or 'g', so it can eat the end of the id itself.
# Cut at the last dot instead so only the file extension is removed.
image_id = images_fns[0].split('/')[-1].rsplit('.', 1)[0]
annot.query('image_id == @image_id')

In [None]:
# Show up to 25 images in a 5x5 grid; each title is "<image id> - <number of annotations>".
fig, axs = plt.subplots(5, 5, figsize=(20, 20))
axs = axs.flatten()
# zip() stops at the shorter sequence, so fewer than 25 files no longer raises IndexError.
for ax, img_fn in zip(axs, images_fns):
    ax.imshow(plt.imread(img_fn))
    ax.axis('off')
    # rsplit at the last dot removes only the extension; rstrip('.jpg') would strip
    # any trailing '.', 'j', 'p' or 'g' characters, including ones that are part of the id.
    image_id = img_fn.split('/')[-1].rsplit('.', 1)[0]
    n_annot = len(annot.query('image_id == @image_id'))
    ax.set_title(f'{image_id} - {n_annot}')
plt.show()

In [None]:
#PYTESSERACT
!apt-get install tesseract-ocr
!pip install pytesseract

In [None]:
# Import in this cell so it runs on a fresh kernel: the notebook otherwise imports
# pytesseract only in a later cell, which makes the assignment below a NameError.
import pytesseract

# Tell pytesseract which Tesseract executable to invoke.
pytesseract.pytesseract.tesseract_cmd = r'/usr/bin/tesseract'

In [None]:
import pytesseract

# OCR one sample image with lang='eng' and print the extracted text.
sample_text = pytesseract.image_to_string(images_fns[17], lang='eng')
print(sample_text)

In [None]:
# Display the sample image (index 17) for visual comparison with the OCR text.
sample_image = plt.imread(images_fns[17])
plt.imshow(sample_image)

In [None]:
#EASY OCR!
pip install easyocr
import easyocr
reader = easyocr.Reader(['en'])

In [None]:
# Run EasyOCR on one sample image.
sample_path = images_fns[11]
results = reader.readtext(sample_path)

In [None]:
# Tabulate the raw EasyOCR output: one row per entry of `results`.
easyocr_columns = ['bbox', 'text', 'conf']
pd.DataFrame(results, columns=easyocr_columns)

In [None]:
# Run EasyOCR over the first 25 images and stack the per-image result tables,
# tagging every row with the id (file name without extension) of its image.
reader = easyocr.Reader(['en'], gpu = True)
dfs = []
for img in tqdm(images_fns[:25]):
    img_id = img.split('/')[-1].split('.')[0]
    result = reader.readtext(img)
    img_df = pd.DataFrame(result, columns=['bbox','text','conf']).assign(img_id=img_id)
    dfs.append(img_df)
easyocr_df = pd.concat(dfs)

In [None]:
import pytesseract
from PIL import Image
import cv2

# Run Tesseract over the first 25 images; each image contributes one row holding
# its full extracted text and its id (file name without extension).
dfs = []
for img_path in tqdm(images_fns[:25]):
    # The original call was missing its closing parenthesis (SyntaxError).
    # The context manager also releases the file handle once OCR is done.
    with Image.open(img_path) as img:
        text = pytesseract.image_to_string(img)
    img_id = img_path.split('/')[-1].split('.')[0]
    img_df = pd.DataFrame({'text': [text], 'img_id': [img_id]})
    dfs.append(img_df)

pytesseract_df = pd.concat(dfs)


In [None]:
def _draw_text_lines(img, texts, origin=(50, 50), line_height=40):
    """Write every non-blank line found in `texts` onto `img` (in place), one row per line."""
    x, y = origin
    for text in texts:
        # Split multi-line OCR output so each line is drawn as its own row.
        for line in str(text).splitlines():
            line = line.strip()
            if not line:
                continue
            cv2.putText(img, line, (x, y), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)
            # Advance the baseline so the next line does not overprint this one.
            y += line_height


def plot_compare(img_fn, easyocr_df, pytesseract_df):
    """Show the EasyOCR and pytesseract text for one image side by side.

    Parameters
    ----------
    img_fn : str
        Path of the image file; its id is the file name without extension.
    easyocr_df, pytesseract_df : pandas.DataFrame
        OCR results with at least the columns 'img_id' and 'text'.

    The recognised text is written onto a copy of the image, one line per row
    starting at (50, 50). Previously every string was drawn at the same origin,
    so multiple results overlapped each other and were unreadable.
    """
    img_id = img_fn.split('/')[-1].split('.')[0]
    fig, axs = plt.subplots(1, 2, figsize=(15, 10))

    panels = [
        (axs[0], easyocr_df, 'EasyOCR Results'),
        (axs[1], pytesseract_df, 'Pytesseract Results'),
    ]
    for ax, ocr_df, title in panels:
        # Keep the query in this function's frame so '@img_id' resolves.
        texts = ocr_df.query('img_id == @img_id')['text'].tolist()
        # Re-read the image for each panel so the overlays do not mix.
        img = cv2.imread(img_fn)
        _draw_text_lines(img, texts)
        # OpenCV loads BGR; matplotlib expects RGB.
        ax.imshow(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
        ax.set_title(title, fontsize=24)
        ax.axis('off')

    plt.show()


In [None]:
# EASY OCR vs PYTESSERACT: one side-by-side figure per image, first 25 images.
sample_fns = images_fns[:25]
for img_fn in sample_fns:
    plot_compare(img_fn, easyocr_df, pytesseract_df)


In [None]:
# KERAS OCR: install keras-ocr, clear the cell output produced so far (the pip
# log), then install poppler-utils via apt.
from IPython.display import clear_output
!pip install keras-ocr
clear_output()
!apt-get install -y poppler-utils

In [None]:
import keras_ocr
# Build the keras-ocr pipeline with its default arguments; it is reused by the
# recognition cells below.
pipeline = keras_ocr.pipeline.Pipeline()

In [None]:
# Run the keras-ocr pipeline on one sample image; recognize() is given a
# one-element list here.
sample_paths = [images_fns[17]]
results = pipeline.recognize(sample_paths)

In [None]:
# Draw the keras-ocr predictions for the sample image on top of the image.
fig, ax = plt.subplots(figsize=(10, 10))
sample_image = plt.imread(images_fns[17])
sample_predictions = results[0]
keras_ocr.tools.drawAnnotations(sample_image, sample_predictions, ax=ax)
ax.set_title('Keras OCR Result Example')
plt.show()

In [None]:
# Run keras-ocr over the first 25 images, one image per recognize() call, and
# stack the per-image prediction tables tagged with the image id.
dfs = []
for img in tqdm(images_fns[:25]):
    img_id = img.split('/')[-1].split('.')[0]
    results = pipeline.recognize([img])
    result = results[0]
    img_df = pd.DataFrame(result, columns=['text', 'bbox']).assign(img_id=img_id)
    dfs.append(img_df)
kerasocr_df = pd.concat(dfs)

In [None]:
# Easy OCR vs Keras OCR test results comparison
def plot_compare(img_fn, easyocr_df, kerasocr_df):
    """Draw the EasyOCR and keras-ocr annotations for one image side by side.

    Both frames are filtered to the rows whose 'img_id' equals the file name
    (without extension) of `img_fn`; their ('text', 'bbox') pairs are passed to
    keras_ocr.tools.drawAnnotations, EasyOCR on the left and keras-ocr on the right.

    NOTE: this definition reuses the name of the earlier EasyOCR-vs-pytesseract
    helper and replaces it once this cell has run.
    """
    img_id = img_fn.split('/')[-1].split('.')[0]
    fig, axs = plt.subplots(1, 2, figsize=(15, 10))

    panels = (
        (axs[0], easyocr_df, 'easyocr results'),
        (axs[1], kerasocr_df, 'keras_ocr results'),
    )
    for ax, ocr_df, title in panels:
        # The query stays in this function's frame so '@img_id' resolves.
        rows = ocr_df.query('img_id == @img_id')[['text','bbox']].values.tolist()
        predictions = [(text, np.array(bbox)) for text, bbox in rows]
        keras_ocr.tools.drawAnnotations(plt.imread(img_fn), predictions, ax=ax)
        ax.set_title(title, fontsize=24)
    plt.show()

In [None]:
 # Plot the EasyOCR vs keras-ocr comparison for each of the first 25 images
 sample_fns = images_fns[:25]
 for img_fn in sample_fns:
     plot_compare(img_fn, easyocr_df, kerasocr_df)
