In [2]:
# # Install pytorch
# #https://pytorch.org/get-started/locally/
!pip3 install torch torchvision torchaudio

# # Install easyocr 
!pip install easyocr

In [5]:
# Import libraries 
import easyocr
import cv2
from matplotlib import pyplot as plt
import numpy as np

import re
from datetime import datetime
from collections import defaultdict

### 1. Read image

In [6]:
# Set image path
# Path of the input image; it is handed to reader.readtext() in the OCR step.
IMAGE_PATH = 'image.png'

### 2. OCR the text

In [7]:
# Extract text from the image using EasyOCR Package 
# Build a reader for English ('en'). The recorded output of this cell shows it
# falling back to CPU when CUDA is not available.
reader = easyocr.Reader(['en'])
# With detail=0 and paragraph=False the recorded result is a flat list of plain
# strings, one per detected text line (no extra per-line metadata appears in it).
result = reader.readtext(IMAGE_PATH, detail=0, paragraph=False)
# Bare expression: displays the list as the cell output.
result

CUDA not available - defaulting to CPU. Note: This module is much faster with a GPU.


['From: Al Amri, Salim <salim amri@gmail.com>',
 'Sent: 25 August 2021 17.20',
 'To: Al Harthi, Mohammed <mohd4 king@rihal.om>',
 'Cc: Al hajri; Malik <hajri990@ocaa.co.om>; Omar, Naif <nnnn49@apple.com>',
 'Subject: Conference Rooms Booking Details',
 'Dear Mohammed,',
 'As per our last discussion these are the available conference rooms available for booking along',
 'with their rates for full day:',
 'Room: Luban_',
 'available on 26/09/2021',
 'Rate: S4540',
 'Room: Mazoon, available on 04/12/2021 and 13/02/2022. Rate: S3000',
 'Room: Dhofar: Available on 11/11/2021. Rate: S2500',
 'Room: Nizwa. Available on 13/12/2022. Rate: S1200',
 'Please let me know which ones vou are interested So we go through more details',
 'Best regards,',
 'Salim Al Amri']

### 3. Extract Outputs

In [53]:
# Calendar month names in order; the 1-based position is the month number.
_MONTH_NAMES = (
    'January', 'February', 'March', 'April', 'May', 'June', 'July',
    'August', 'September', 'October', 'November', 'December',
)

# Both the full name and its three-letter abbreviation map to the month number,
# so "25 Aug 2021" is handled as well as "25 August 2021".
_MONTH_NUMBERS = {
    label: number
    for number, name in enumerate(_MONTH_NAMES, start=1)
    for label in (name, name[:3])
}

# DD<sep>MM<sep>YYYY where <sep> is "/" or a space, MM is two digits or a month
# name, and the year has exactly four or two digits (the lookahead rejects a
# year that is followed by more digits instead of silently truncating it).
_DATE_PATTERN = re.compile(
    r'(\d{2})[/ ](\d{2}|'
    + '|'.join(sorted(_MONTH_NUMBERS, key=len, reverse=True))
    + r')[/ ](\d{4}|\d{2})(?!\d)'
)

# Room name: everything after "Room:" up to the first . , _ : or ;
_ROOM_PATTERN = re.compile(r'Room:\s*([^.,_:;]+)')

# Rate: an optional single currency character (OCR reads "$" as "S" in the
# sample image) followed by the amount; only the amount is captured.
_RATE_PATTERN = re.compile(r'Rate:\s*[^\d\s]?\s*(\d+(?:[.,]\d+)*)')

# Email: spaces are tolerated in the local part because OCR may insert a space
# where the address has a separator character.
_EMAIL_PATTERN = re.compile(r'[\w. -]+@[\w.-]+')

# Header prefixes that introduce a list of "Lastname, Firstname <email>" entries.
_HEADER_PATTERN = re.compile(r'From:|To:|Cc:')

# Display name preceding each "<email>" (closing ">" optional in case OCR drops it).
_PERSON_PATTERN = re.compile(r'([^<>]*)<[^<>]*>?')


def _extract_dates(line):
    """Return every date found in ``line`` as a 'YYYY-MM-DD' string.

    Impossible dates (e.g. 31/02/2021) are skipped instead of raising.
    """
    dates = []
    for day, month, year in _DATE_PATTERN.findall(line):
        month_number = _MONTH_NUMBERS.get(month) or int(month)
        year_code = '%y' if len(year) == 2 else '%Y'
        try:
            parsed = datetime.strptime(f'{day}/{month_number}/{year}', '%d/%m/' + year_code)
        except ValueError:
            continue
        dates.append(str(parsed.date()))
    return dates


def _extract_rooms(line):
    """Return the room name following the first "Room:" label, as a 0/1-item list."""
    found = _ROOM_PATTERN.search(line)
    room = found.group(1).strip() if found else ''
    return [room] if room else []


def _extract_rates(line):
    """Return the amount following the first "Rate:" label, prefixed with "$"."""
    found = _RATE_PATTERN.search(line)
    return ['$' + found.group(1)] if found else []


def _extract_emails(line):
    """Return the email addresses in ``line`` with inner spaces replaced by dots.

    Known limitation: because spaces are allowed in the local part, plain words
    directly in front of an address (not separated by punctuation) are absorbed.
    """
    emails = []
    for raw in _EMAIL_PATTERN.findall(line):
        # Trim stray leading/trailing separators so " a@b.com." does not become ".a@b.com.".
        trimmed = raw.strip(' .-')
        emails.append(re.sub(r' +', '.', trimmed))
    return emails


def _extract_names(line):
    """Return the people listed on a From:/To:/Cc: line as "Firstname Lastname"."""
    header = _HEADER_PATTERN.search(line)
    if header is None:
        return []
    names = []
    for raw in _PERSON_PATTERN.findall(line[header.end():]):
        # Drop the ";" that separates this entry from the previous one.
        raw = raw.strip(' ;,')
        if not raw:
            continue
        # "Lastname, Firstname" (OCR sometimes reads the comma as ";").
        # A name without a separator is kept unchanged.
        parts = [part.strip() for part in re.split(r'[,;]', raw, maxsplit=1)]
        names.append(' '.join(reversed(parts)))
    return names


def _extract_entities(lines):
    """Run every extractor over each OCR line and group the hits by category.

    Returns a ``defaultdict(list)`` whose keys ('Dates', 'Rooms', 'Rates',
    'Emails', 'Names') appear in the order in which each category is first
    found, which is also the order in which the results are displayed.
    """
    extractors = (
        ('Dates', _extract_dates),    # i.   all dates, standardized to YYYY-MM-DD
        ('Rooms', _extract_rooms),    # ii.  room names
        ('Rates', _extract_rates),    # iii. room rates
        ('Emails', _extract_emails),  # iv.  all emails
        ('Names', _extract_names),    # v.   individual names, "Firstname Lastname"
    )
    found = defaultdict(list)
    for line in lines:
        for category, extractor in extractors:
            # Append item by item so a category key is only created once it has a value.
            for value in extractor(line):
                found[category].append(value)
    return found


text = result
data = _extract_entities(text)


                
# Show each category as a "<name>:" heading, one extracted value per line,
# and a blank line after the group (the trailing '' produces that blank line).
for category, entries in data.items():
    print(category + ':', *entries, '', sep='\n')

Emails:
salim.amri@gmail.com
mohd4.king@rihal.om
hajri990@ocaa.co.om
nnnn49@apple.com

Names:
Salim Al Amri
Mohammed Al Harthi
Malik Al hajri
Naif Omar

Dates:
2021-08-25
2021-09-26
2021-12-04
2022-02-13
2021-11-11
2022-12-13

Rooms:
Luban
Mazoon
Dhofar
Nizwa

Rates:
$4540
$3000
$2500
$1200

