## Trying the Code for One Single PDF File

In [35]:
from pdf2image import convert_from_path
import pandas as pd
import fastapi
import pytesseract
import numpy as np

In [2]:
# Render the prescription PDF with pdf2image; 'pages' holds one image per PDF page
pages = convert_from_path(
    r'/Users/Prashanth/Desktop/Medical Project/Resources/prescription/pre_1.pdf',
    poppler_path=r'/usr/local/bin/',
)

In [18]:
# Write the first rendered page to disk as a JPEG file
pages[0].save('clear_image.jpg', format='JPEG')

In [9]:
# OCR the first (unprocessed) page with Tesseract and print the recognised text.
# tesseract_cmd tells pytesseract where the Tesseract executable is installed.
pytesseract.pytesseract.tesseract_cmd = '/usr/local/bin/tesseract'

text = pytesseract.image_to_string(
    pages[0],
    lang='eng',
)
print(text)

Dr John Smith, M.D
2 Non-Important Street,
New York, Phone (000)-111-2222

Name: Maria Sharapova Date: 5/11/2022

Address: 9 tennis court, new Russia, DC

—moemenmannenneneneunmnnnnennieesisiyoinnitniahadaaanniihsnseneneneeeernnttnnneenrenen:

Prednisone 20 mg
Lialda 2.4 gram

3 days,

or 1 month



The extracted text shows some discrepancies due to the blurriness/gray shadow in the image.
### Preprocessing Using Thresholding OpenCV

In [10]:
import cv2
from PIL import Image

In [32]:
# Read the saved JPEG back through OpenCV as a grayscale pixel array
NewImg = cv2.imread("clear_image.jpg", cv2.IMREAD_GRAYSCALE)
NewImg

array([[208, 207, 207, ..., 185, 185, 185],
       [205, 205, 205, ..., 183, 183, 182],
       [203, 204, 204, ..., 185, 185, 186],
       ...,
       [155, 156, 158, ..., 176, 176, 176],
       [161, 162, 161, ..., 183, 183, 184],
       [152, 154, 156, ..., 178, 179, 179]], dtype=uint8)

In [33]:
# Global binary threshold: pixels brighter than 150 become 255 (white), all others 0 (black)
_, NewImgThreshold = cv2.threshold(
    NewImg,             # Input grayscale image
    150,                # Threshold value
    255,                # Max pixel value (white)
    cv2.THRESH_BINARY,  # Binary threshold (0 or 255)
)

# Open the thresholded image in the default image viewer
Image.fromarray(NewImgThreshold).show()

In [34]:
# Adaptive thresholding with OpenCV's Gaussian method to get a clearer image.
# Arguments, in order:
#   NewImg                          input grayscale image
#   255                             max pixel value (white)
#   cv2.ADAPTIVE_THRESH_GAUSSIAN_C  use the Gaussian method
#   cv2.THRESH_BINARY               binary threshold (0 or 255)
#   61                              block size: pixel neighbourhood size (must be odd)
#   11                              constant subtracted from the threshold (fine-tunes result)
thresh_img = cv2.adaptiveThreshold(
    NewImg, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 61, 11
)

# Open the thresholded image in the default image viewer
Image.fromarray(thresh_img).show()

### Defining the Image Processing Function

In [42]:
# Function to process an image (grayscale, resize, and apply adaptive thresholding)
def ProcessingImage(img):
    """Prepare a page image for OCR: grayscale, upscale 1.5x, adaptive threshold.

    Parameters
    ----------
    img : PIL.Image.Image or array-like
        Page image, e.g. one element of the list returned by ``convert_from_path``.
        Colour input is interpreted in RGB(A) channel order, which is what
        ``np.array`` yields for a PIL image (OpenCV's own loaders use BGR instead).

    Returns
    -------
    numpy.ndarray
        Single-channel thresholded image (pixel values 0 or 255), scaled to 1.5x
        the width and height of the input.
    """
    pixels = np.array(img)

    # Convert image to grayscale. PIL arrays are RGB-ordered, so use the RGB
    # conversion codes; COLOR_BGR2GRAY would swap the red and blue weights.
    if pixels.ndim == 2:
        # Already single-channel: cvtColor would reject it, so use it as-is
        gray = pixels
    elif pixels.shape[2] == 4:
        gray = cv2.cvtColor(pixels, cv2.COLOR_RGBA2GRAY)
    else:
        gray = cv2.cvtColor(pixels, cv2.COLOR_RGB2GRAY)

    # Resize image (increase size by 1.5x)
    resizedimage = cv2.resize(gray, None, fx=1.5, fy=1.5, interpolation=cv2.INTER_LINEAR)

    # Apply adaptive thresholding (Gaussian)
    processedimage = cv2.adaptiveThreshold(resizedimage,  # Input grayscale image
                                           255,           # Max pixel value (white)
                                           cv2.ADAPTIVE_THRESH_GAUSSIAN_C,  # Gaussian method
                                           cv2.THRESH_BINARY,   # Binary thresholding
                                           61,            # Block size (neighborhood size)
                                           11)            # Constant to subtract

    # Return processed image
    return processedimage

In [47]:
# Run the preprocessing function on the first page image produced by pdf2image
newimage = ProcessingImage(pages[0])

# Open the processed page in the default image viewer
Image.fromarray(newimage).show()

In [46]:
# OCR the preprocessed page with Tesseract and print the recognised text.
# tesseract_cmd tells pytesseract where the Tesseract executable is installed.
pytesseract.pytesseract.tesseract_cmd = '/usr/local/bin/tesseract'

text = pytesseract.image_to_string(
    newimage,
    lang='eng',
)
print(text)

Dr John Smith, M.D
2 Non-Important Street,
New York, Phone (000)-121-2222

Name: Marta Sharapova Date: 2/11/2022

Address: 9 tennis court, new Russia, DC

K

Prednisone 20 mg
Lialda 2.4 gram

Directions:
Prednisone, Taper 5 mg every 3 days,

Finish in 2.5 weeks -
Lialda - take 2 pill everyday for 1 month

Refill: 2 times



## Converting all the PDF files in the folder

In [48]:
import os
import cv2
import numpy as np
from pdf2image import convert_from_path
import pytesseract
from PIL import Image

# Function to process an image (grayscale, resize, and apply adaptive thresholding)
def ProcessingImage(img):
    """Prepare a page image for OCR: grayscale, upscale 1.5x, adaptive threshold.

    Parameters
    ----------
    img : PIL.Image.Image or array-like
        Page image, e.g. one element of the list returned by ``convert_from_path``.
        Colour input is interpreted in RGB(A) channel order, which is what
        ``np.array`` yields for a PIL image (OpenCV's own loaders use BGR instead).

    Returns
    -------
    numpy.ndarray
        Single-channel thresholded image (pixel values 0 or 255), scaled to 1.5x
        the width and height of the input.
    """
    pixels = np.array(img)

    # PIL arrays are RGB-ordered, so use the RGB conversion codes;
    # COLOR_BGR2GRAY would swap the red and blue weights.
    if pixels.ndim == 2:
        gray = pixels  # Already single-channel: cvtColor would reject it
    elif pixels.shape[2] == 4:
        gray = cv2.cvtColor(pixels, cv2.COLOR_RGBA2GRAY)  # Convert image to grayscale
    else:
        gray = cv2.cvtColor(pixels, cv2.COLOR_RGB2GRAY)  # Convert image to grayscale

    resizedimage = cv2.resize(gray, None, fx=1.5, fy=1.5, interpolation=cv2.INTER_LINEAR)  # Resize image
    processedimage = cv2.adaptiveThreshold(resizedimage,  # Apply adaptive thresholding
                                           255,           # Max pixel value (white)
                                           cv2.ADAPTIVE_THRESH_GAUSSIAN_C,  # Gaussian method
                                           cv2.THRESH_BINARY,   # Binary thresholding
                                           61,            # Block size (neighborhood size)
                                           11)            # Constant to subtract
    return processedimage

# Set the path for the directory containing PDF files
pdf_directory = '/Users/Prashanth/Desktop/Medical Project/Resources/prescription/'
output_directory = '/Users/Prashanth/Desktop/Medical Project/extracted_text/'

# Point pytesseract at the Tesseract executable in this cell too, so the batch run
# does not depend on an earlier cell having been executed first
pytesseract.pytesseract.tesseract_cmd = '/usr/local/bin/tesseract'

# Ensure output directory exists
os.makedirs(output_directory, exist_ok=True)

# Iterate through all PDF files in the directory.
# sorted() gives a deterministic processing order (os.listdir order is arbitrary).
for filename in sorted(os.listdir(pdf_directory)):
    # Compare the extension case-insensitively so files such as 'scan.PDF' are not skipped
    if not filename.lower().endswith('.pdf'):
        continue

    pdf_path = os.path.join(pdf_directory, filename)  # Get full path of the PDF file
    pages = convert_from_path(pdf_path, poppler_path='/usr/local/bin/')  # Convert PDF to images

    for i, page in enumerate(pages):
        newimage = ProcessingImage(page)  # Process the image
        text = pytesseract.image_to_string(newimage, lang='eng')  # Extract text

        # Save the extracted text to a .txt file named <pdf name>_page_<n>.txt (pages are 1-based)
        text_filename = f"{os.path.splitext(filename)[0]}_page_{i + 1}.txt"  # Create text file name
        text_path = os.path.join(output_directory, text_filename)

        # Write as UTF-8 explicitly: OCR output can contain non-ASCII characters
        # (e.g. long dashes), which cannot be encoded under some locale default encodings
        with open(text_path, 'w', encoding='utf-8') as text_file:
            text_file.write(text)  # Write the text to the file

print("Text extraction completed and saved to text files.")

Text extraction completed and saved to text files.
