In [103]:
from pdf2image import convert_from_path

### Converting pdf into pages

In [104]:
# Render each page of the PDF as a PIL image; poppler_path tells pdf2image where the Poppler binaries are.
# NOTE(review): both paths are hard-coded, Windows-style paths — adjust them for the machine running this notebook.
pages= convert_from_path(r'docs\patient_details\pd_1.pdf', poppler_path = r'C:\poppler-23.07.0\Library\bin')
pages

[<PIL.PpmImagePlugin.PpmImageFile image mode=RGB size=1867x2000 at 0x26904F572E0>,
 <PIL.PpmImagePlugin.PpmImageFile image mode=RGB size=2000x1398 at 0x26904F570D0>]

#### PIL (Pillow) is a popular Python library for image processing; each converted page is a PIL image object

In [105]:
# Number of page images produced from the PDF
len(pages)

2

### Opening a particular page

In [106]:
# Display the first page using PIL's image viewer
pages[0].show()

#### pytesseract is a Python wrapper around the Tesseract OCR engine, used to extract text from an image

In [107]:
import pytesseract

### Straight extract of text from an image using pytesseract

In [108]:
# Point pytesseract at the Tesseract executable (hard-coded Windows install path).
pytesseract.pytesseract.tesseract_cmd=r'C:\Program Files\Tesseract-OCR\tesseract.exe'
# OCR the first, unprocessed page with the English language data.
text = pytesseract.image_to_string(pages[0], lang='eng')
print(text)

47/12/2020

Patient Medical Record

Patient Information Birth Date
Kathy Crawford May 6 1972
(737) 988-0851 Weight
9264 Ash Dr 95
New York City, 10005 .
United States Height:
190
In Case of Emergency
m _ a _
Simeone Crawford 9266 Ash Dr
New York City, New York, 10005
Home phone United States
(990) 375-4621
Work phone
Genera! Medical History
. : a ee

Chicken Pox (Varicella):

IMMUNE

Have you had the Hepatitis B vaccination?

No

List any Medical Problems (asthma, seizures, headaches):

Migraine


###### The image was successfully converted to text, but the result is not perfect. OCR is never perfect: it emits a fair amount of garbage that we have to deal with. In particular, the text inside the dark (shaded) regions of the image was not picked up at all. This means we cannot apply Tesseract directly to this image, because the image quality is too poor. We first need to do some preprocessing using computer vision. OpenCV is a popular framework for computer vision, so we will use it to clean up the image and then run the same OCR code on the processed image.

### Applying Adaptive thresholding technique.

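#### Adaptive thresholding is a computer-vision technique that chooses a threshold per neighbourhood of pixels instead of one threshold for the whole image, which helps with unevenly lit or shaded images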

In [109]:
import cv2
from PIL import Image
import numpy as np  
def preprocess_image(img, scale=1.5, block_size=61, c=11):
    """Prepare a page image for OCR: grayscale, upscale, then adaptive threshold.

    Parameters
    ----------
    img : PIL.Image.Image or array-like
        Colour page image with channels in R, G, B order (the pages returned
        by ``convert_from_path`` are PIL images in RGB mode).
    scale : float, optional
        Factor applied to both width and height before thresholding.
    block_size : int, optional
        Size of the pixel neighbourhood used to compute each local threshold;
        OpenCV requires an odd value greater than 1.
    c : int or float, optional
        Constant subtracted from the Gaussian-weighted neighbourhood mean.

    Returns
    -------
    numpy.ndarray
        Single-channel image whose pixels are either 0 or 255.
    """
    # np.array() on a PIL image keeps the R, G, B channel order, so the RGB
    # conversion code is the correct one. COLOR_BGR2GRAY would apply the red
    # weight to the blue channel and vice versa, distorting the gray levels.
    rgb = np.array(img)
    gray = cv2.cvtColor(rgb, cv2.COLOR_RGB2GRAY)

    # Enlarge the page so small print covers more pixels for the OCR engine.
    resized = cv2.resize(gray, None, fx=scale, fy=scale)

    # Threshold against a local Gaussian-weighted mean rather than one global
    # cut-off, so text on shaded areas of the form is still separated.
    processed_image = cv2.adaptiveThreshold(
        resized,
        255,
        cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
        cv2.THRESH_BINARY,
        block_size,
        c,
    )
    return processed_image

#### Calling the function and displaying an image

In [51]:
# Threshold the first page, then wrap the resulting array in a PIL image to display it
img = preprocess_image(pages[0])
Image.fromarray(img).show()

#### Now we have the preprocessed image, and it looks much cleaner than the original.

### Applying Pytesseract on the processed image for better text

In [110]:
# Same Tesseract executable path as in the earlier OCR cell.
pytesseract.pytesseract.tesseract_cmd=r'C:\Program Files\Tesseract-OCR\tesseract.exe'
# OCR the thresholded array `img` produced by preprocess_image (English language data).
text = pytesseract.image_to_string(img, lang='eng')
print(text)

17/12/2020

Patient Medical Record

Patient Information Birth Date

Kathy Crawford May 6 1972

(737) 988-0851 Weight’

9264 Ash Dr 95

New York City, 10005 '

United States Height:
190

In Case of Emergency
ee J
Simeone Crawford 9266 Ash Dr
New York City, New York, 10005
Home phone United States
(990) 375-4621
Work phone
Genera! Medical History
nn i
Chicken Pox (Varicella): Measies:
IMMUNE

IMMUNE
Have you had the Hepatitis B vaccination?

No

List any Medical Problems (asthma, seizures, headaches}:

Migraine


### Extract required information from Patient Details

In [111]:
# Hard-coded sample of OCR-style text for this page. It overwrites the `text`
# returned by pytesseract above and is what the regex cells below are run against.
text = '''
    Patient Medical Record . : :

    Patient Information


    Birth Date
    Kathy Crawford May 6 1972
    (737) 988-0851 Weight:
    9264 Ash Dr 95
    New York City, 10005 a
    United States Height:
    190
    In Case of Emergency
    ee oe
    Simeone Crawford 9266 Ash Dr
    New York City, New York, 10005
    Home phone United States
    (990) 375-4621
    Work phone
    Genera! Medical History
    I i
    Chicken Pox (Varicella): Measies:
    IMMUNE IMMUNE

    Have you had the Hepatitis B vaccination?

    No

    List any Medical Problems (asthma, seizures, headaches):

    Migraine
    '''

### Extract Name

In [112]:
import re
# `.*?` is a lazy match: it captures as little text as possible, so the capture
# stops at the first "(ddd)" area code that follows "Patient Information".
# The pattern is a raw string so `\(` and `\d` reach the regex engine untouched
# instead of being treated as (invalid) Python string escapes.
pattern = r"Patient Information(.*?)\(\d{3}\)"
# DOTALL lets `.` match newlines, since the name sits on a later line than the heading.
matches = re.findall(pattern, text, flags = re.DOTALL)
matches

['\n\n\n    Birth Date\n    Kathy Crawford May 6 1972\n    ']

In [113]:
# strip() removes leading and trailing whitespace (including newlines)
matches[0].strip()

'Birth Date\n    Kathy Crawford May 6 1972'

In [114]:
# Drop the "Birth Date" label; what remains is the name followed by the date of birth
match = matches[0].replace("Birth Date", "").strip()
match

'Kathy Crawford May 6 1972'

In [115]:
# A month name (abbreviated or spelled out) followed by at least one digit,
# e.g. "May 6 1972" or "January 5, 1980". Requiring a digit stops a bare word
# such as the first name "May" from being mistaken for a date.
# Outer group = the whole date, inner group = the month, so findall() returns
# (date, month) tuples. Raw string keeps `\d` intact for the regex engine.
date_pattern = (
    r"((Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|June?|July?"
    r"|Aug(?:ust)?|Sep(?:t(?:ember)?)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?)"
    r"\.?[ ,]*\d[ \d,]*)"
)
date_matches = re.findall(date_pattern, match)
date_matches

[('May 6 1972', 'May')]

In [116]:
# Each findall() result is a tuple; index 0 is the outer capture group (the full date string)
date = date_matches[0][0]
date

'May 6 1972'

In [117]:
# Remove the date from the string, leaving only the patient's name
match.replace(date, "").strip()

'Kathy Crawford'

In [118]:
def remove_noise_from_name(name):
    """Strip the "Birth Date" label and a trailing date of birth from a name.

    The OCR text places the patient's name on the same line as the birth
    date, under a "Birth Date" column heading, so the raw capture looks like
    ``' Birth Date\\n\\nKathy Crawford May 6 1972\\n\\n'``.

    Parameters
    ----------
    name : str
        Raw text captured between "Patient Information" and the phone number.

    Returns
    -------
    str
        The text with the "Birth Date" label and the first date found
        removed, and surrounding whitespace stripped. If no date is found,
        only the label and surrounding whitespace are removed.
    """
    name = name.replace("Birth Date", "").strip()

    # Month name (abbreviated or spelled out, optional trailing ".") followed
    # by at least one digit. Requiring a digit keeps first names such as
    # "May" or "April" from being treated as a date and deleted.
    date_pattern = (
        r"(?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|June?|July?"
        r"|Aug(?:ust)?|Sep(?:t(?:ember)?)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?)"
        r"\.?[ ,]*\d[ \d,]*"
    )
    date_match = re.search(date_pattern, name)

    if date_match:
        name = name.replace(date_match.group(0), "").strip()
    return name

In [119]:
# Check the helper against a raw capture that still contains the label and the birth date
name = ' Birth Date\n\nKathy Crawford May 6 1972\n\n'
name = remove_noise_from_name(name)
name

'Kathy Crawford'

### Extract Phone

In [120]:
# Group 1: everything between the heading and the phone number (lazy match).
# Group 2: the first phone number of the form "(ddd) ddd-dddd".
# Raw string so `\(`, `\)` and `\d` are passed to the regex engine unchanged
# rather than being parsed as (invalid) Python string escapes.
pattern = r"Patient Information(.*?)(\(\d{3}\) \d{3}-\d{4})"
matches = re.findall(pattern, text, flags = re.DOTALL)
matches

[('\n\n\n    Birth Date\n    Kathy Crawford May 6 1972\n    ',
  '(737) 988-0851')]

In [121]:
# Index 1 of the tuple is the second capture group: the phone number
matches[0][1]

'(737) 988-0851'

### Extract Vaccine

In [122]:
# Capture the first whole-word Yes/No that follows the question.
# - `.*?` is lazy, so the answer nearest the question wins; a greedy `.*` with
#   DOTALL would run to the end of the document and return the LAST match.
# - `\b` keeps "No" from matching inside longer words such as "None".
# - IGNORECASE accepts "Yes"/"yes"/"NO" etc.; the captured text keeps its
#   original casing.
# - Raw string so `\?` and `\b` reach the regex engine unchanged.
pattern = r"Have you had the Hepatitis B vaccination\?.*?\b(Yes|No)\b"
matches = re.findall(pattern, text, flags = re.DOTALL | re.IGNORECASE)
matches

['No']

In [123]:
# The captured Yes/No answer, with any surrounding whitespace removed
matches[0].strip()

'No'

### Extract Medical Problems

In [124]:
# Capture everything after the "List any Medical Problems (...):" prompt.
# `[)}]` tolerates the OCR misreading the closing parenthesis as "}" (the
# pytesseract output for the processed page reads "headaches}:").
# Raw string so `\(` reaches the regex engine unchanged; DOTALL lets `.` span
# the newlines between the prompt and the answer.
pattern = r"List any Medical Problems \(asthma, seizures, headaches[)}]:(.*)"
matches = re.findall(pattern, text, flags = re.DOTALL)
matches

['\n\n    Migraine\n    ']

In [125]:
# Trim the surrounding newlines and indentation from the captured answer
matches[0].strip()

'Migraine'