# Extracting Text from Images in Python

<img src="https://media.arxiv-vanity.com/render-output/6158804/images/fig_2_no_trademarks.png" alt="text to images" width="600" height="600">


In [1]:
from glob import glob
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from PIL import Image
from tqdm.notebook import tqdm

# Switch matplotlib to the 'ggplot' style sheet for the figures drawn after this point.
plt.style.use('ggplot')

# Outline
1. Take a look at the data
2. Extract text from images:
    - pytesseract
    - easyocr
    - keras_ocr
3. Run on a few examples and compare the results

In [2]:
!pip install pyarrow

Collecting pyarrow
  Downloading pyarrow-9.0.0-cp38-cp38-win_amd64.whl (19.6 MB)
Installing collected packages: pyarrow
Successfully installed pyarrow-9.0.0


In [3]:
!pip install fastparquet

Collecting fastparquet
  Downloading fastparquet-0.8.1-cp38-cp38-win_amd64.whl (617 kB)
Collecting cramjam>=2.3.0
  Downloading cramjam-2.5.0-cp38-none-win_amd64.whl (993 kB)
Installing collected packages: cramjam, fastparquet
Successfully installed cramjam-2.5.0 fastparquet-0.8.1


In [4]:
# Single root for every dataset file; change DATA_DIR if the dataset is stored elsewhere.
DATA_DIR = '../input/textocr-text-extraction-from-images-dataset'

# Load the two parquet tables shipped with the dataset.
annot = pd.read_parquet(f'{DATA_DIR}/annot.parquet')
imgs = pd.read_parquet(f'{DATA_DIR}/img.parquet')

# glob() makes no ordering guarantee, so sort the paths to keep positional
# lookups such as img_fns[1] stable across machines and re-runs.
img_fns = sorted(glob(f'{DATA_DIR}/train_val_images/train_images/*'))

FileNotFoundError: [Errno 2] No such file or directory: '../input/textocr-text-extraction-from-images-dataset/annot.parquet'

# Plot Example Images

In [None]:
# Read the second image path from the list and show it on a single borderless panel.
example_pixels = plt.imread(img_fns[1])

fig, ax = plt.subplots(figsize=(10, 10))
ax.imshow(example_pixels)
ax.axis('off')
plt.show()

In [None]:
# Path.stem yields the file name without its directory or final extension using
# the platform's own separator rules; splitting on '/' by hand breaks on Windows,
# where glob() joins the matched file name with a backslash.
image_id = Path(img_fns[0]).stem

# Show the annotation rows whose image_id matches this file.
annot.query('image_id == @image_id')

## Display the first 25 images

In [None]:
# 5x5 grid of the first images, each titled "<image_id> - <number of annotation rows>".
fig, axs = plt.subplots(5, 5, figsize=(20, 20))
axs = axs.flatten()

# Hide the axes on every panel up front so unused panels stay blank when
# fewer than 25 images were found.
for ax in axs:
    ax.axis('off')

# zip() stops at whichever runs out first: the 25 panels or the image list,
# so a short img_fns no longer raises IndexError.
for ax, img_fn in zip(axs, img_fns):
    ax.imshow(plt.imread(img_fn))
    # Path.stem removes only the final extension. str.rstrip('.jpg') strips any
    # trailing run of the characters '.', 'j', 'p', 'g', which can eat into the
    # id itself, and splitting on '/' by hand fails for Windows-style paths.
    image_id = Path(img_fn).stem
    n_annot = len(annot.query('image_id == @image_id'))
    ax.set_title(f'{image_id} - {n_annot}')
plt.show()

# Keras OCR

In [None]:
!pip install keras-ocr -q

In [None]:
import keras_ocr

# Build a keras-ocr Pipeline with its default arguments; the cells below call
# pipeline.recognize() on it.
pipeline = keras_ocr.pipeline.Pipeline()

In [None]:
# recognize() is given a one-element list here, so results[0] (used in the cells
# below) is presumably the prediction list for img_fns[11] — TODO confirm against
# the keras-ocr documentation.
results = pipeline.recognize([img_fns[11]])

In [None]:
# Tabulate the predictions for the single recognised image under the column
# labels 'text' and 'bbox' (assumes each prediction is a two-item pair — TODO confirm).
pd.DataFrame(results[0], columns=['text', 'bbox'])

In [None]:
fig, ax = plt.subplots(figsize=(10, 10))
# Draw on the same image that was passed to pipeline.recognize() (img_fns[11]);
# overlaying its predictions on a different image would misplace every box.
keras_ocr.tools.drawAnnotations(plt.imread(img_fns[11]), results[0], ax=ax)
ax.set_title('Keras OCR Result Example')
plt.show()

# The End