# Text Recognition through the use of Tesseract-OCR

#### This script recognizes text in images using Optical Character Recognition (Tesseract). It uses a neural network subsystem configured as a text line recognizer. The script applies transformations to the images, such as resizing them and converting them to grayscale, then extracts the text and saves it to an Excel file for further processing and cleaning. The dataset used for the experiment is an "Email" dataset containing 291 images. However, the script also works with other datasets, such as invoices, license plates, and other images containing text.

#### Installations

In [17]:
# clear_output is used to wipe the notebook cell output once a cell has run
from IPython.display import clear_output

# Uncomment the lines below to install the required packages.

#!pip install opencv-python
#!pip install numpy
#!pip install pytesseract
#!pip install pandas

# Clear whatever output the installation commands printed
clear_output()

#### Imports

In [18]:
# Import the necessary libraries.

import os
import cv2
import numpy as np
import pytesseract
import pandas as pd

# Tesseract OCR is a separate program and must be installed on its own;
# pytesseract only wraps it. Change the location below to wherever
# Tesseract is installed on your machine.
# More information on how to install Tesseract can be found here:
# https://tesseract-ocr.github.io/tessdoc/Installation.html

# Tesseract location (path to the tesseract executable)
pytesseract.pytesseract.tesseract_cmd = 'C:/Program Files/Tesseract-OCR/tesseract.exe'

# Clear the output
clear_output()

#### Email text recognition through Tesseract OCR 

The following code applies a series of transformations to each image,
then uses Tesseract to extract the text from the image
and stores it so that it can be saved to an Excel file for further processing and cleaning.

In [21]:
# Open Email Dataset folder
dr = 'D:/OCR_Datasets/RealWorldDocumentsCollections/email'

# File extensions treated as images (matched case-insensitively,
# so that e.g. "scan.JPG" is processed too)
image_extensions = ('.jpg', '.jpeg', '.png')

# Structuring element for erosion/dilation; it never changes, so build it once.
# NOTE: with a 1x1 kernel, erosion and dilation leave the pixels unchanged;
# use a larger kernel (e.g. 2x2) if real noise removal is wanted.
k = np.ones((1, 1), np.uint8)

# Collect one record per image and build the dataframe once after the loop.
# (DataFrame.append was removed in pandas 2.0 and copied the whole frame
# on every call.)
rows = []

# Loop through all files in the directory
for file in os.listdir(dr):
    # Skip if file is not an image
    if not file.lower().endswith(image_extensions):
        continue

    # Get image; cv2.imread returns None (it does not raise) when the file
    # cannot be read or decoded, so check before processing
    image = cv2.imread(os.path.join(dr, file))
    if image is None:
        print('Skipping unreadable image:', file)
        continue

    # Resize the image, through cubic interpolation for higher quality
    image = cv2.resize(image, None, fx=1.2, fy=1.2, interpolation=cv2.INTER_CUBIC)

    # Convert image to GrayScale
    image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    # Apply erosion and dilation to remove noise
    image = cv2.erode(image, k, iterations=1)
    image = cv2.dilate(image, k, iterations=1)

    # Median blur followed by adaptive threshold, for areas with uneven lighting.
    # The result must be assigned: adaptiveThreshold returns a new image and
    # does not modify its input.
    image = cv2.adaptiveThreshold(
        cv2.medianBlur(image, 3),
        255,
        cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
        cv2.THRESH_BINARY,
        31,
        2,
    )

    # Read text from image through Tesseract
    text = pytesseract.image_to_string(image)

    # Record the result for this file
    rows.append({'FileName': file, 'Text': text})

# Create the dataframe; passing the columns explicitly keeps the schema
# even when no image was found
df = pd.DataFrame(rows, columns=['FileName', 'Text'])

#### Save for further processing and data cleaning.

In [22]:
# Write the collected OCR results to an Excel workbook.
# The output location and the sheet name can be changed as preferred.
output_path = "D:/Email_Output.xlsx"
output_sheet = 'Email_Dataset'
df.to_excel(output_path, sheet_name=output_sheet)