In [1]:
### 1. Mount Google Drive ###

from google.colab import drive

# Mount point under which the Drive contents become visible to later cells.
GDRIVE_MOUNT_POINT = '/content/gdrive'

drive.mount(GDRIVE_MOUNT_POINT)

Mounted at /content/gdrive


In [9]:
### 2. Prepare data ###

!scp '/content/gdrive/My Drive/text_detection/text_data.zip' '/content/text_data.zip'

!unzip '/content/text_data.zip' -d '/content/text_data'

Archive:  /content/text_data.zip
   creating: /content/text_data/text_data/
  inflating: /content/text_data/text_data/advice.jpg  
  inflating: /content/text_data/text_data/aloha.jpg  
  inflating: /content/text_data/text_data/anxiety.jpg  
  inflating: /content/text_data/text_data/be_brilliant.jpg  
  inflating: /content/text_data/text_data/everyday_is_a_fresh_start.jpg  
  inflating: /content/text_data/text_data/fight_for_your_right.jpg  
  inflating: /content/text_data/text_data/frequently_asked_questions.jpg  
  inflating: /content/text_data/text_data/girls.jpg  
  inflating: /content/text_data/text_data/goal.jpg  
  inflating: /content/text_data/text_data/happy.jpg  
  inflating: /content/text_data/text_data/hello_there.jpg  
  inflating: /content/text_data/text_data/here_to_help.jpg  
  inflating: /content/text_data/text_data/leadership.jpg  
  inflating: /content/text_data/text_data/learn.jpg  
  inflating: /content/text_data/text_data/little_by_little.jpg  
  inflating: /conten

In [5]:
### 3. Install Dependencies ###

!apt install tesseract-ocr
!apt install libtesseract-dev

!pip install pytesseract
!pip install Pillow
!pip install easyocr
!pip install boto3

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
tesseract-ocr is already the newest version (4.1.1-2.1build1).
0 upgraded, 0 newly installed, 0 to remove and 2 not upgraded.
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  libarchive-dev libleptonica-dev
The following NEW packages will be installed:
  libarchive-dev libleptonica-dev libtesseract-dev
0 upgraded, 3 newly installed, 0 to remove and 2 not upgraded.
Need to get 3,743 kB of archives.
After this operation, 16.0 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 libarchive-dev amd64 3.6.0-1ubuntu1.5 [581 kB]
Get:2 http://archive.ubuntu.com/ubuntu jammy/universe amd64 libleptonica-dev amd64 1.82.0-3build1 [1,562 kB]
Get:3 http://archive.ubuntu.com/ubuntu jammy/universe amd64 libtesseract-dev amd64 4.1.1-2.1build1 [1,600 kB]


In [12]:
### 4. Text_detection ###

import os

import boto3
import pytesseract
from easyocr import Reader
from PIL import Image

# EasyOCR reader for English, built once here and reused by read_text_easyocr.
reader = Reader(['en'])

# AWS credentials are read from the environment instead of being hardcoded in
# the notebook (secrets must never be committed or shared with the file).
# If a variable is unset the value is None, and boto3 then falls back to its
# default credential lookup.
access_key = os.environ.get('AWS_ACCESS_KEY_ID')
secret_access_key = os.environ.get('AWS_SECRET_ACCESS_KEY')

# Note: the keyword is `aws_secret_access_key`; `aws_secret_key_id` is not a
# boto3 parameter and makes the call fail.
textract_client = boto3.client('textract',
                               aws_access_key_id=access_key,
                               aws_secret_access_key=secret_access_key,
                               region_name='us-east-1')

def read_tesseract(image_path):
  """Run Tesseract OCR on an image file and return the recognised text.

  Args:
    image_path: Path of the image file to read.

  Returns:
    The string returned by `pytesseract.image_to_string` for the English
    ('eng') language, passed through unchanged (no stripping or cleanup).
  """
  # Open through a context manager so the file handle is released as soon as
  # OCR finishes instead of lingering until garbage collection.
  with Image.open(image_path) as image:
    return pytesseract.image_to_string(image, lang='eng')

def read_text_easyocr(image_path):
  """Run EasyOCR on an image file and return the detected text as one string.

  Args:
    image_path: Path of the image file to read.

  Returns:
    The text of every detection returned by the module-level `reader`,
    joined with single spaces in the order EasyOCR reports them. An empty
    string when nothing is detected.
  """
  # Context manager closes the image file once recognition is done.
  with Image.open(image_path) as image:
    results = reader.readtext(image)

  # Index 1 of each result holds the recognised string; joining avoids the
  # "append a space, then slice it off" dance.
  return ' '.join(result[1] for result in results)

def read_text_textract(image_path):
  """Run Amazon Textract on an image file and return the detected text.

  The raw file bytes are sent to `detect_document_text` through the
  module-level `textract_client`.

  Args:
    image_path: Path of the image file to read.

  Returns:
    The 'Text' of every block whose 'BlockType' is 'LINE', joined with single
    spaces in response order. An empty string when there are no LINE blocks.
  """
  with open(image_path, 'rb') as im:
    response = textract_client.detect_document_text(Document={'Bytes': im.read()})

  # Only LINE blocks are used; other block types in the response are skipped.
  lines = [item['Text'] for item in response['Blocks'] if item['BlockType'] == 'LINE']

  # Join with a real space. The previous version appended '' and then sliced
  # off the final character, which glued lines together and dropped the last
  # letter of the text.
  return ' '.join(lines)


 



In [1]:
### 5. Compare performance ###

import os

def jaccard_similarity(sentence1, sentence2):
  """Return the Jaccard similarity between the word sets of two sentences.

  Both sentences are lower-cased and split on whitespace; the score is
  |intersection| / |union| of the resulting word sets.

  Args:
    sentence1: First sentence.
    sentence2: Second sentence.

  Returns:
    A float between 0.0 and 1.0. Returns 0.0 when both sentences contain no
    words, which avoids dividing by zero.
  """
  # Tokenize sentences into sets of words
  set1 = set(sentence1.lower().split())
  set2 = set(sentence2.lower().split())

  union_size = len(set1 | set2)
  if union_size == 0:
    return 0.0

  return len(set1 & set2) / union_size

# Directory the images end up in: step 2 extracts the archive with
# `-d /content/text_data`, and the archive itself contains a `text_data/` folder.
DATA_DIR = '/content/text_data/text_data'
IMAGE_EXTENSIONS = ('.jpg', '.jpeg', '.png')


def _normalize_ocr_text(text):
  """Lower-case OCR output and drop the punctuation the file names lack.

  Newlines are left in place: jaccard_similarity splits on any whitespace, so
  words on separate lines stay separate tokens instead of being glued together.
  """
  for char in ('!', '?', '.'):
    text = text.replace(char, '')
  return text.lower()


score_tesseract = 0
score_easyocr = 0
score_textract = 0

# Sorted for a deterministic processing order; non-image entries are skipped.
image_names = sorted(
    name for name in os.listdir(DATA_DIR)
    if name.lower().endswith(IMAGE_EXTENSIONS)
)

for image_name in image_names:
  image_path = os.path.join(DATA_DIR, image_name)

  # Ground truth is encoded in the file name only (e.g. 'hello_there.jpg' ->
  # 'hello there'); the directory part must not leak into it.
  gt = os.path.splitext(image_name)[0].replace('_', ' ').lower()

  # The Tesseract wrapper is defined as `read_tesseract`.
  score_tesseract += jaccard_similarity(gt, _normalize_ocr_text(read_tesseract(image_path)))
  score_easyocr += jaccard_similarity(gt, _normalize_ocr_text(read_text_easyocr(image_path)))
  score_textract += jaccard_similarity(gt, _normalize_ocr_text(read_text_textract(image_path)))

# Average over the images actually scored rather than a hardcoded count;
# max(..., 1) keeps an empty directory from raising ZeroDivisionError.
num_images = max(len(image_names), 1)

print('score tesseract:', score_tesseract / num_images)
print('score easyocr:', score_easyocr / num_images)
print('score textract:', score_textract / num_images)
