In [9]:
from pdf2image import convert_from_path

### Converting pdf into pages

In [10]:
# Render the prescription PDF to images: convert_from_path gives back a list
# with one PIL image per page. Both paths are specific to this Windows machine
# (relative PDF location and local Poppler install).
pages = convert_from_path(
    r'docs\prescription\pre_1.pdf',
    poppler_path=r'C:\poppler-23.07.0\Library\bin',
)
pages

[<PIL.PpmImagePlugin.PpmImageFile image mode=RGB size=1700x2200 at 0x22A525942E0>]

#### PIL is a popular Python module for image processing; each page above is returned as a PIL image

In [11]:
# Number of page images produced from the PDF.
len(pages)

1

### Opening the particular page

In [12]:
# Display the first (index 0) page image.
pages[0].show()

#### pytesseract is the python module to extract the text from an image

In [13]:
import pytesseract

### Straight extract of text from an image using pytesseract

In [14]:
# Point pytesseract at the Tesseract executable (Windows install path; machine-specific).
pytesseract.pytesseract.tesseract_cmd=r'C:\Program Files\Tesseract-OCR\tesseract.exe'
# OCR the unprocessed first page (lang='eng') and keep the result in `text`.
text = pytesseract.image_to_string(pages[0], lang='eng')
print(text)

Dr John Smith, M.D
2 Non-Important Street,
New York, Phone (000)-111-2222

Name: Maria Sharapova Date: 5/11/2022

Address: 9 tennis court, new Russia, DC

—moemenmannenneneneunmnnnnennieesisiyoinnitniahadaaanniihsnseneneneeeernnttnnneenrenen:

Prednisone 20 mg
Lialda 2.4 gram

3 days,

or 1 month


###### The image has now been successfully converted into text, but the text is not perfect. OCR is never going to be perfect: it will output a lot of garbage, and we have to deal with that. Here, the text in the darker regions of the image was not picked up at all. This means we cannot apply Tesseract directly to this image, because the image quality is poor. So we need to do some preprocessing on it first, using computer vision. OpenCV is a popular framework for computer vision, so we will use that: we will process the image and then run the same OCR code again on the processed image.

### Applying Adaptive thresholding technique.

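#### Adaptive thresholding is a computer-vision technique that chooses a separate threshold for each small region of the image, which helps when parts of the image are unclear or unevenly shaded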

In [15]:
import cv2     # Importing OpenCV library to pre-process image
from PIL import Image
import numpy as np
def preprocess_image(img, scale=1.5, block_size=61, c=11):
    """Prepare a page image for OCR: grayscale, upscale, adaptive threshold.

    Parameters
    ----------
    img : PIL.Image.Image or array-like
        Page image. Colour input is assumed to be in RGB(A) channel order,
        which is what PIL images converted with ``np.array`` provide. An
        array in OpenCV's BGR order (e.g. from ``cv2.imread``) should be
        converted to RGB before being passed in. A 2-D (already grayscale)
        image is accepted as-is.
    scale : float, optional
        Factor applied to both width and height before thresholding.
    block_size : int, optional
        Side length of the pixel neighbourhood used by the adaptive
        threshold; must be an odd integer >= 3.
    c : float, optional
        Constant subtracted from the weighted neighbourhood mean.

    Returns
    -------
    numpy.ndarray
        Single-channel binary image (values 0 or 255).

    Raises
    ------
    ValueError
        If ``block_size`` is even or smaller than 3.
    """
    if block_size < 3 or block_size % 2 == 0:
        raise ValueError("block_size must be an odd integer >= 3")

    pixels = np.array(img)
    if pixels.ndim == 2:
        # Already single-channel: nothing to convert.
        gray = pixels
    elif pixels.shape[2] == 4:
        gray = cv2.cvtColor(pixels, cv2.COLOR_RGBA2GRAY)
    else:
        # PIL arrays are RGB, not BGR; using the BGR code would swap the
        # red and blue luminance weights.
        gray = cv2.cvtColor(pixels, cv2.COLOR_RGB2GRAY)

    # Enlarge the page so characters cover more pixels before binarising.
    resized = cv2.resize(gray, None, fx=scale, fy=scale)

    processed_image = cv2.adaptiveThreshold(
        resized,
        255,
        cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
        cv2.THRESH_BINARY,
        block_size,
        c,
    )
    return processed_image

# The function performs 3 steps:
# 1. Convert the colour image to grayscale.
# 2. Resize it: scale it up by 1.5x so the text is larger and easier to read.
# 3. Apply adaptive thresholding to get a clean black-and-white image.

#### Calling the function and displaying an image

In [16]:
# Preprocess the first page; `img` is the array returned by preprocess_image.
img = preprocess_image(pages[0])
# Wrap the array in a PIL image so it can be displayed.
Image.fromarray(img).show()

#### Now we have the pre-processed image, and it looks much cleaner than the original.

### Applying Pytesseract on the processed image for better text

In [17]:
# Point pytesseract at the Tesseract executable (Windows install path; machine-specific).
pytesseract.pytesseract.tesseract_cmd=r'C:\Program Files\Tesseract-OCR\tesseract.exe'
# OCR the preprocessed image (`img`) rather than the raw page; overwrites `text`.
text = pytesseract.image_to_string(img, lang='eng')
print(text)

Dr John Smith, M.D
2 Non-Important Street,
New York, Phone (000)-111-2222

Name: Marta Sharapova Date: 5/11/2022

Address: 9 tennis court, new Russia, DC

K

Prednisone 20 mg
Lialda 2.4 gram

Directions:

Prednisone, Taper 5 mig every 3 days,
Finish in 2.5 weeks a
Lialda - take 2 pill everyday for 1 month

Refill: 2 times


### Extract required information from Prescription

In [18]:
# Hard-coded copy of the OCR output printed for the preprocessed page, with the
# stray "K" line left out. Presumably pasted in so the regex cells that follow
# run against a fixed input -- confirm before replacing it with live OCR output.
text = '''
Dr John Smith, M.D
2 Non-Important Street,
New York, Phone (000)-111-2222

Name: Marta Sharapova Date: 5/11/2022

Address: 9 tennis court, new Russia, DC



Prednisone 20 mg
Lialda 2.4 gram

Directions:

Prednisone, Taper 5 mig every 3 days,
Finish in 2.5 weeks a
Lialda - take 2 pill everyday for 1 month

Refill: 2 times
'''

### Extracting Name 

In [19]:
import re

# Capture whatever sits between "Name:" and "Date". Without re.DOTALL, '.' does
# not match a newline, so both markers must be on the same line.
pattern = "Name:(.*)Date"

# findall returns a list of the captured groups, one entry per match.
match = re.findall(pattern, text)
match

[' Marta Sharapova ']

In [20]:
# Capture whatever sits between "Name:" and "Date" on one line.
pattern = "Name:(.*)Date"

match = re.findall(pattern, text)
# First match, with the surrounding spaces removed.
match[0].strip()

'Marta Sharapova'

### Extracting Address

In [21]:
# Capture the remainder of the "Address:" line, up to the newline.
pattern = "Address:(.*)\n"
match = re.findall(pattern, text)
match[0].strip()

'9 tennis court, new Russia, DC'

### Extracting Medicines

In [22]:
# Skip the rest of the "Address:" line ([^\n]*), then capture everything up to
# "Directions".
pattern = "Address:[^\n]*(.*)Directions"
# re.DOTALL lets '.' match newlines, so the capture can span several lines.
match = re.findall(pattern, text, flags = re.DOTALL)
match[0].strip()

'Prednisone 20 mg\nLialda 2.4 gram'

In [23]:
# Skip the rest of the "Address:" line, then capture everything up to
# "Directions"; re.DOTALL lets the capture span several lines.
pattern = "Address:[^\n]*(.*)Directions"
match = re.findall(pattern, text, flags = re.DOTALL)
# print() renders the embedded newlines instead of showing them as \n.
print(match[0].strip())

Prednisone 20 mg
Lialda 2.4 gram


### Extracting Directions

In [24]:
# Capture everything between "Directions:" and "Refill".
pattern = "Directions:(.*)Refill"

# re.DOTALL lets '.' match newlines, so the multi-line directions are captured.
match = re.findall(pattern, text, flags = re.DOTALL)
print(match[0].strip())

Prednisone, Taper 5 mig every 3 days,
Finish in 2.5 weeks a
Lialda - take 2 pill everyday for 1 month


### Extracting Refill

In [25]:
# Capture the refill count. The pattern anchors on the digits after "Refill:"
# instead of on the literal word "times", so a singular "Refill: 1 time"
# (or a count with no unit) is matched as well.
pattern = r"Refill:\s*(\d+)"

match = re.findall(pattern, text)
print(match[0].strip())

2
