### Convert PDF to Image using pdf2image for better image recognition

In [1]:
# install and import module to convert pdf to image
pip install pdf2image
from pdf2image import convert_from_path

In [2]:
# Convert the patient prescription PDF into a list of page images.
# NOTE(review): both paths below are Windows, machine-specific locations — adjust per environment.
prescription_pdf = r'resources\prescription\pre_1.pdf'
poppler_bin = r'c:\users\spinc\appdata\local\programs\python\poppler-23.11.0\Library\bin'
pages = convert_from_path(prescription_pdf, poppler_path=poppler_bin)

In [3]:
# Inspect the list of page images returned by convert_from_path
pages

[<PIL.PpmImagePlugin.PpmImageFile image mode=RGB size=1700x2200 at 0x22836BC3D30>]

In [4]:
# Number of page images produced from the PDF
len(pages)

1

In [6]:
# Open the first converted page in the default image viewer
first_page = pages[0]
first_page.show()

### Extract text from the image

In [7]:
# install and import module to extract text on image
pip install pytesseract
import pytesseract

In [8]:
# Point pytesseract at the local Tesseract executable, then run OCR
# (English language data) on the first page of the prescription.
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
raw_page = pages[0]
text = pytesseract.image_to_string(raw_page, lang='eng')

In [9]:
# Text inside the dark region of the page is not extracted (see the output below);
# next step -> preprocess the image to improve the contrast between text and background
print(text)

Dr John Smith, M.D
2 Non-Important Street,
New York, Phone (000)-111-2222

Name: Maria Sharapova Date: 5/11/2022

Address: 9 tennis court, new Russia, DC

—moemenmannenneneneunmnnnnennieesisiyoinnitniahadaaanniihsnseneneneeeernnttnnneenrenen:

Prednisone 20 mg
Lialda 2.4 gram

3 days,

or 1 month


### Preprocess Image: Resolve dark background issue

In [10]:
# Import OpenCV (thresholding / resizing) and Pillow (image display).
# If OpenCV is not installed yet, run once: %pip install opencv-python
import cv2
from PIL import Image

Clear Image Example: Use Simple Thresholding to enhance contrast between text and background

In [13]:
# Read the clear sample image as a single-channel grayscale array
clear_image_path = r'resources\sample_images\clear_image.jpg'
img = cv2.imread(clear_image_path, flags=cv2.IMREAD_GRAYSCALE)
# cv2.imread does not raise on a missing/unreadable file, it returns None;
# fail here with a clear message instead of a confusing error in a later cell.
if img is None:
    raise FileNotFoundError(f'Could not read image: {clear_image_path}')
img

array([[206, 206, 206, ..., 185, 185, 185],
       [206, 206, 206, ..., 185, 185, 185],
       [206, 206, 206, ..., 186, 185, 185],
       ...,
       [207, 207, 207, ..., 191, 190, 190],
       [207, 207, 207, ..., 191, 190, 190],
       [207, 207, 207, ..., 191, 191, 190]], dtype=uint8)

In [14]:
# Wrap the grayscale array in a PIL image and open it in the default viewer
clear_preview = Image.fromarray(img)
clear_preview.show()

In [15]:
# Simple (global) thresholding on the clear image: one cut-off value (150)
# is applied to every pixel, with 255 as the maximum output value.
threshold_result = cv2.threshold(img, 150, 255, cv2.THRESH_BINARY)
_, new_img = threshold_result

In [16]:
# Preview the thresholded image: higher contrast makes text extraction easier
binary_preview = Image.fromarray(new_img)
binary_preview.show()

Image with Shadow Example: Use Adaptive Thresholding to enhance contrast and avoid shadow

In [17]:
# Read the shadowed sample image as a single-channel grayscale array
dark_image_path = r'resources\sample_images\dark_image.jpg'
imgd = cv2.imread(dark_image_path, flags=cv2.IMREAD_GRAYSCALE)
# cv2.imread does not raise on a missing/unreadable file, it returns None;
# fail here with a clear message instead of a confusing error in a later cell.
if imgd is None:
    raise FileNotFoundError(f'Could not read image: {dark_image_path}')
imgd

array([[206, 206, 206, ..., 187, 186, 184],
       [206, 206, 206, ..., 186, 185, 185],
       [206, 206, 206, ..., 186, 185, 185],
       ...,
       [162, 157, 152, ..., 184, 184, 184],
       [160, 156, 152, ..., 184, 184, 184],
       [156, 158, 158, ..., 184, 184, 184]], dtype=uint8)

In [18]:
# Open the shadowed image in the default viewer
shadow_preview = Image.fromarray(imgd)
shadow_preview.show()

In [19]:
# Apply the same simple (global) threshold of 150 to the shadowed image
shadow_threshold_result = cv2.threshold(imgd, 150, 255, cv2.THRESH_BINARY)
_, new_imgd = shadow_threshold_result

In [20]:
# Preview the result: the shadowed area turns black, which is bad for text extraction
shadow_binary_preview = Image.fromarray(new_imgd)
shadow_binary_preview.show()

In [27]:
# Adaptive thresholding on the shadowed image: the cut-off is computed per
# neighbourhood (Gaussian-weighted) rather than once for the whole image.
# Positional arguments after the method/type flags are the block size (61) and constant C (11).
new_img2 = cv2.adaptiveThreshold(imgd, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 61, 11)

In [28]:
# Preview the adaptively thresholded image: the shadow is gone, ready for text extraction
adaptive_preview = Image.fromarray(new_img2)
adaptive_preview.show()

### Define a function to automatically preprocess the raw images

In [29]:
import numpy as np

In [30]:
# Define a function called preprocess_image
def preprocess_image(img, scale=1.5, block_size=61, c=11):
    """Prepare a page image for OCR: grayscale -> upscale -> adaptive threshold.

    Parameters
    ----------
    img : PIL.Image.Image or array-like
        Image to process. Colour data is interpreted in RGB(A) channel order,
        which is what ``np.array`` yields for the RGB PIL images this notebook
        passes in. Colour arrays loaded with ``cv2.imread`` are BGR and must be
        converted by the caller first. A 2-D (already grayscale) input is used as is.
    scale : float, default 1.5
        Factor applied to both width and height before thresholding.
    block_size : int, default 61
        ``blockSize`` argument forwarded to ``cv2.adaptiveThreshold``.
    c : int or float, default 11
        ``C`` argument forwarded to ``cv2.adaptiveThreshold``.

    Returns
    -------
    numpy.ndarray
        The single-channel image returned by ``cv2.adaptiveThreshold``.
    """
    pixels = np.array(img)

    # 1. Convert colour images to grayscale. The data is in RGB order (PIL), so
    #    use the RGB conversion codes; a BGR code would swap the red/blue weights.
    if pixels.ndim == 2:
        gray = pixels  # already single-channel, nothing to convert
    elif pixels.shape[2] == 4:
        gray = cv2.cvtColor(pixels, cv2.COLOR_RGBA2GRAY)
    else:
        gray = cv2.cvtColor(pixels, cv2.COLOR_RGB2GRAY)

    # 2. Enlarge the image (linear interpolation fills in the new pixels)
    resized = cv2.resize(gray, None, fx=scale, fy=scale, interpolation=cv2.INTER_LINEAR)

    # 3. Apply adaptive thresholding to enhance contrast between text and background
    processed_image = cv2.adaptiveThreshold(
        resized, 255,
        cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
        cv2.THRESH_BINARY,
        block_size,
        c,
    )
    return processed_image

In [31]:
# Try the new helper on the first prescription page and preview the result:
# the image is clear and ready for text extraction
first_prescription_page = pages[0]
img_ = preprocess_image(first_prescription_page)
processed_preview = Image.fromarray(img_)
processed_preview.show()

### Use the preprocessed Image for text extraction

In [33]:
# Text extracted from the preprocessed image is much closer to the actual text
# than the text extracted before preprocessing
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
ocr_options = {'lang': 'eng'}
text1 = pytesseract.image_to_string(img_, **ocr_options)
print(text1)

Dr John Smith, M.D
2 Non-Important Street,
New York, Phone (000)-111-2222

Name: Marta Sharapova Date: 5/11/2022

Address: 9 tennis court, new Russia, DC

K

Prednisone 20 mg
Lialda 2.4 gram

Directions:

Prednisone, Taper 5 mig every 3 days,
Finish in 2.5 weeks a
Lialda - take 2 pill everyday for 1 month

Refill: 2 times


### Extract Text from another image

In [22]:
#1 Convert the second prescription PDF to images: this page has a shadow on it
# NOTE(review): both paths below are Windows, machine-specific locations — adjust per environment.
second_prescription_pdf = r'resources\prescription\pre_2.pdf'
poppler_dir = r'c:\users\spinc\appdata\local\programs\python\poppler-23.11.0\Library\bin'
pages2 = convert_from_path(second_prescription_pdf, poppler_path=poppler_dir)
second_first_page = pages2[0]
second_first_page.show()

In [35]:
#2 Extract text without preprocessing: very little is recovered because of the dark area
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
unprocessed_page = pages2[0]
text2 = pytesseract.image_to_string(unprocessed_page, lang='eng')
print(text2)

Refill: 3 times


In [36]:
#3 Preprocess image 2: improve contrast and remove the shadow
shadowed_page = pages2[0]
img_2 = preprocess_image(shadowed_page)
shadow_free_preview = Image.fromarray(img_2)
shadow_free_preview.show()

In [37]:
#4 Extract text after preprocessing: most of the information is correctly extracted
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
ocr_kwargs = {'lang': 'eng'}
text_pre = pytesseract.image_to_string(img_2, **ocr_kwargs)
print(text_pre)

Dr John >mith, M.D

2 Non-Important street,
New York, Phone (900)-323- ~2222

Name:  Virat Kohli Date: 2/05/2022

Address: 2 cricket blvd, New Delhi

| Omeprazole 40 mg

Directions: Use two tablets daily for three months

Refill: 3 times


### Extract Information for specific fields using Regular Expression (Regex)

In [38]:
# import python module for regular expression
import re 

In [39]:
# Extract all the numbers from the text
t = "Patient's phone is 7321119999. Bill amount is 120$"

# Raw string literal: "\d" in a normal string is an invalid escape sequence
# (Python warns about it); r'...' hands the backslash to the regex engine untouched.
pattern = r'\d+' # Test Regex in regex101 before scripting is a good idea

match = re.findall(pattern, t)
match

['7321119999', '120']

In [40]:
# Extract only the phone numbers from the text (runs of exactly ten digits)
# Raw string literal so that "\d" is not treated as an (invalid) string escape.
pattern2 = r'\d{10}'

match2 = re.findall(pattern2, t)
match2

['7321119999']

In [41]:
# Extract different forms of phone numbers: "(ddd)-ddd-dddd" or ten plain digits
t2 = "Patient's phone is (732)-111-9999. spouse phone number 7326664444. Bill amount is 120$"
# Raw string literal so that "\(" and "\d" are not treated as (invalid) string escapes.
pattern3 = r'\(\d{3}\)-\d{3}-\d{4}|\d{10}'
match3 = re.findall(pattern3, t2)
match3

['(732)-111-9999', '7326664444']

In [42]:
# Extract phone number and bill amount separately using two capture groups:
# group 1 = ten digits, then any non-digits, group 2 = digits right before "$"
# Raw string literal so that "\d", "\D" and "\$" are not treated as (invalid) string escapes.
pattern4 = r'(\d{10})\D+(\d+)\$'
match4 = re.search(pattern4, t)
match4

<re.Match object; span=(19, 50), match='7321119999. Bill amount is 120$'>

In [43]:
# Store the captured values: group 1 is the phone number, group 2 the bill amount
phone_number = match4.group(1)
bill_amount = match4.group(2)

In [44]:
# Display the captured phone number (a string)
phone_number

'7321119999'

In [45]:
# Display the captured bill amount (a string)
bill_amount

'120'

### Extract information from the patient files

In [46]:
#1. Extract patient name: everything between "Name:" and "Date:" on the same line
pattern_n = 'Name:(.*)Date:'
match_n = re.findall(pattern_n, text1) # text1 holds the OCR text of the first prescription
# Remove leading and trailing whitespace from the first hit
patient_name = match_n[0].strip()
patient_name

'Marta Sharapova'

In [47]:
#2. Extract patient address: the rest of the line that follows "Address:"
pattern_a = 'Address:(.*)\n'
match_a = re.findall(pattern_a, text1)
# Remove leading and trailing whitespace from the first hit
patient_address = match_a[0].strip()
patient_address

'9 tennis court, new Russia, DC'

In [48]:
#3. Extract prescriptions: everything after the line containing "K" up to "Directions"
# NOTE(review): the "K" anchor matches a stray character in the OCR output of text1;
# other scans may not contain it — confirm before reusing this pattern.
pattern_p = 'K[^\n]*(.*)Directions'
match_p = re.findall(pattern_p, text1, flags=re.DOTALL) # DOTALL: "." also matches newlines
# Remove leading and trailing whitespace from the first hit
prescriptions = match_p[0].strip()
print(prescriptions)

Prednisone 20 mg
Lialda 2.4 gram


In [49]:
#4. Extract directions: everything after the "Directions" line up to "Refill"
pattern_d = 'Directions[^\n]*(.*)Refill'
match_d = re.findall(pattern_d, text1, flags=re.DOTALL) # DOTALL: "." also matches newlines
# Remove leading and trailing whitespace from the first hit
directions = match_d[0].strip()
print(directions)

Prednisone, Taper 5 mig every 3 days,
Finish in 2.5 weeks a
Lialda - take 2 pill everyday for 1 month


In [50]:
#5. Extract refill times: the text between "Refill:" and "times"
pattern_r = 'Refill:(.*)times'
match_r = re.findall(pattern_r, text1)
# Remove leading and trailing whitespace from the first hit
refill_times = match_r[0].strip()
print(refill_times)

2
