In [76]:
import boto3
import base64
import json
from pathlib import Path
from langchain_aws import ChatBedrock
from langchain_core.messages import HumanMessage, SystemMessage, AIMessage
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder, SystemMessagePromptTemplate

In [97]:
# Textract client: used below for plain text detection and for table analysis
textract = boto3.client(service_name='textract')

# Bedrock-hosted chat model that later converts the extracted table text into JSON
llm = ChatBedrock(model="us.meta.llama3-2-90b-instruct-v1:0")

In [None]:
# Load the prescription image from disk as raw bytes
image_bytes = Path('myopd-sample-rx-eng.png').read_bytes()

# Run Textract's plain text detection on the image
response = textract.detect_document_text(Document={'Bytes': image_bytes})

# Pretty-print the complete response so the block structure can be inspected
print(json.dumps(response, indent=4))

{
    "DocumentMetadata": {
        "Pages": 1
    },
    "Blocks": [
        {
            "BlockType": "PAGE",
            "Geometry": {
                "BoundingBox": {
                    "Width": 1.0,
                    "Height": 1.0,
                    "Left": 0.0,
                    "Top": 0.0
                },
                "Polygon": [
                    {
                        "X": 5.674030489899451e-07,
                        "Y": 0.0
                    },
                    {
                        "X": 1.0,
                        "Y": 2.09909671866626e-06
                    },
                    {
                        "X": 1.0,
                        "Y": 1.0
                    },
                    {
                        "X": 0.0,
                        "Y": 1.0
                    }
                ]
            },
            "Id": "856ea651-0f63-4ebf-96bd-0cb61f8343bb",
            "Relationships": [
                {
                    "Type

In [55]:
# Keep only the LINE blocks and stitch their text together, one line of text per block
line_texts = [
    block['Text']
    for block in response['Blocks']
    if block['BlockType'] == 'LINE'
]
extracted_text = ''.join(text + '\n' for text in line_texts)
print(extracted_text)

Dr. Onkar Bhave
Care Clinic
M.B.B.S., M.D., M.S. I Reg. No: 270988 I
Near Axis Bank, Kothrud, Pune -
Mob. No: 8983390126
411038
LOREM IPSUM
Ph: 094233 80390, Timing: 09:00
AM - 02:00 PM I Closed: Thursday
Date: 27-Apr-2020, 04:37 PM
ID: 266 - DEMO PATIENT (M)
Address: PUNE
Temp (deg): 36, BP: 120/80 mmHg
R
Medicine Name
Dosage
Duration
1) TAB. DEMO MEDICINE 1
1 Morning, 1 Night
10 Days
(Before Food)
(Tot:20 Tab)
2) CAP. DEMO MEDICINE 2
1 Morning, 1 Night
10 Days
(Before Food)
(Tot:20 Cap)
3) TAB. DEMO MEDICINE 3
1 Morning, 1 Aft, 1 Eve, 1 Night
10 Days
(After Food)
(Tot:40 Tab)
4) TAB. DEMO MEDICINE 4
1/2 Morning, 1/2 Night
10 Days
(After Food)
(Tot:10 Tab)
Advice Given:
* AVOID OILY AND SPICY FOOD
Follow Up: 12-05-2020
Charts
40
17
140
120
129
120
35
120
30
100
90
as
.
80
20
60
10
40
20
0
0
23-02- 09-03- 22-01- 05-04- 19-04- 03-05- 17-05-
23-02- 08-03- 22-03- 05-44 19.44 03-45 17-05
2020 2020 2020 2020 2020 2020 2020
2820 2520 2020 2020 2020 2020 2020
Date
Date
Signature
Dr. Onkar Bha

In [69]:
# Second pass over the same image bytes, this time requesting table structure.
# Note: this rebinds `response`, replacing the earlier detect_document_text result.
response = textract.analyze_document(
    FeatureTypes=['TABLES'],
    Document={'Bytes': image_bytes},
)

In [71]:
def get_table_data(response):
    """Extract every table in a Textract response as nested lists of cell text.

    Args:
        response: Dict with a ``'Blocks'`` list (as returned by Textract's
            ``analyze_document``). Every block must carry ``'Id'`` and
            ``'BlockType'``; CELL blocks must carry ``'RowIndex'`` and
            ``'ColumnIndex'``; WORD blocks must carry ``'Text'``.

    Returns:
        A list with one entry per TABLE block, in the order the TABLE blocks
        appear in ``response['Blocks']``. Each entry is a list of rows sorted
        by ``RowIndex``; each row is a list of cell strings sorted by
        ``ColumnIndex``. A cell's string is its WORD children joined by single
        spaces (``''`` for a cell without words). Returns ``[]`` when there
        are no TABLE blocks.

    Raises:
        KeyError: If a relationship references a block id that is not present
            in ``response['Blocks']``.
    """
    blocks = response['Blocks']

    # Index blocks by id so CHILD relationships can be resolved directly
    blocks_map = {block['Id']: block for block in blocks}
    table_blocks = [block for block in blocks if block['BlockType'] == "TABLE"]

    all_tables = []
    for table in table_blocks:
        # rows[row_index][col_index] -> cell text
        rows = {}

        # A block with no children may lack the 'Relationships' key entirely,
        # so tables get the same guard that cells need.
        for relationship in table.get('Relationships') or []:
            if relationship['Type'] != 'CHILD':
                continue
            for child_id in relationship['Ids']:
                cell = blocks_map[child_id]
                if cell['BlockType'] != 'CELL':
                    continue

                # Only WORD children contribute text; other child block types are skipped
                words = [
                    blocks_map[word_id]['Text']
                    for rel in cell.get('Relationships') or []
                    if rel['Type'] == 'CHILD'
                    for word_id in rel['Ids']
                    if blocks_map[word_id]['BlockType'] == 'WORD'
                ]
                row = rows.setdefault(cell['RowIndex'], {})
                row[cell['ColumnIndex']] = ' '.join(words).strip()

        # Flatten the index-keyed dicts into row-major lists
        table_data = [
            [rows[row_index][col_index] for col_index in sorted(rows[row_index])]
            for row_index in sorted(rows)
        ]
        all_tables.append(table_data)

    return all_tables

# Pull every table out of the Textract response
tables = get_table_data(response)

# Print each table under a numbered heading, one comma-joined line per row
for table_number, table in enumerate(tables, start=1):
    print(f"Table {table_number}:")
    for cells in table:
        print(",".join(cells))


Table 1:
Medicine Name,Dosage,Duration
1) TAB. DEMO MEDICINE 1,1 Morning, 1 Night (Before Food),10 Days (Tot:20 Tab)
2) CAP. DEMO MEDICINE 2,1 Morning, 1 Night (Before Food),10 Days (Tot:20 Cap)
3) TAB. DEMO MEDICINE 3,1 Morning, 1 Aft, 1 Eve, 1 Night (After Food),10 Days (Tot:40 Tab)
4) TAB. DEMO MEDICINE 4,1/2 Morning, 1/2 Night (After Food),10 Days (Tot:10 Tab)


In [73]:
def get_table_data_as_string(response):
    """Render every table in a Textract response as one CSV-like string.

    Args:
        response: Dict with a ``'Blocks'`` list (as returned by Textract's
            ``analyze_document``). Every block must carry ``'Id'`` and
            ``'BlockType'``; CELL blocks must carry ``'RowIndex'`` and
            ``'ColumnIndex'``; WORD blocks must carry ``'Text'``.

    Returns:
        ``"No tables found"`` when there are no TABLE blocks. Otherwise, for
        each table (numbered from 1 in block order), a ``"Table N:"`` header
        line followed by one line per row; tables are separated by a blank
        line. Rows are ordered by ``RowIndex`` and cells by ``ColumnIndex``.
        Every cell is followed by a comma, so each row line ends with a
        trailing comma.

        NOTE: cell text is emitted verbatim. Commas inside a cell are not
        quoted or escaped, so the result is not strictly parseable CSV.

    Raises:
        KeyError: If a relationship references a block id that is not present
            in ``response['Blocks']``.
    """
    blocks = response['Blocks']

    # Index blocks by id so CHILD relationships can be resolved directly
    blocks_map = {block['Id']: block for block in blocks}
    table_blocks = [block for block in blocks if block['BlockType'] == "TABLE"]

    if not table_blocks:
        return "No tables found"

    sections = []
    for table_number, table in enumerate(table_blocks, start=1):
        # rows[row_index][col_index] -> cell text
        rows = {}

        # A block with no children may lack the 'Relationships' key entirely,
        # so tables get the same guard that cells need.
        for relationship in table.get('Relationships') or []:
            if relationship['Type'] != 'CHILD':
                continue
            for child_id in relationship['Ids']:
                cell = blocks_map[child_id]
                if cell['BlockType'] != 'CELL':
                    continue

                # Only WORD children contribute text; other child block types are skipped
                words = [
                    blocks_map[word_id]['Text']
                    for rel in cell.get('Relationships') or []
                    if rel['Type'] == 'CHILD'
                    for word_id in rel['Ids']
                    if blocks_map[word_id]['BlockType'] == 'WORD'
                ]
                row = rows.setdefault(cell['RowIndex'], {})
                row[cell['ColumnIndex']] = ' '.join(words).strip()

        # Header line, then one comma-terminated line per row
        lines = [f"Table {table_number}:\n"]
        for row_index in sorted(rows):
            columns = rows[row_index]
            csv_row = "".join(f"{columns[col_index]}," for col_index in sorted(columns))
            lines.append(csv_row + "\n")
        sections.append("".join(lines))

    # Each section already ends with a newline, so joining on "\n" yields
    # exactly one blank line between consecutive tables and none after the last.
    return "\n".join(sections)

# Render the tables in the current Textract `response` as a single CSV-like string
table_string = get_table_data_as_string(response)

# Show the rendered text; the same string is later sent to the LLM as the human message
print(table_string)


Table 1:
Medicine Name,Dosage,Duration,
1) TAB. DEMO MEDICINE 1,1 Morning, 1 Night (Before Food),10 Days (Tot:20 Tab),
2) CAP. DEMO MEDICINE 2,1 Morning, 1 Night (Before Food),10 Days (Tot:20 Cap),
3) TAB. DEMO MEDICINE 3,1 Morning, 1 Aft, 1 Eve, 1 Night (After Food),10 Days (Tot:40 Tab),
4) TAB. DEMO MEDICINE 4,1/2 Morning, 1/2 Night (After Food),10 Days (Tot:10 Tab),



In [74]:
# Bare expression: shows the string's repr so the embedded "\n" separators and trailing commas are visible
table_string

'Table 1:\nMedicine Name,Dosage,Duration,\n1) TAB. DEMO MEDICINE 1,1 Morning, 1 Night (Before Food),10 Days (Tot:20 Tab),\n2) CAP. DEMO MEDICINE 2,1 Morning, 1 Night (Before Food),10 Days (Tot:20 Cap),\n3) TAB. DEMO MEDICINE 3,1 Morning, 1 Aft, 1 Eve, 1 Night (After Food),10 Days (Tot:40 Tab),\n4) TAB. DEMO MEDICINE 4,1/2 Morning, 1/2 Night (After Food),10 Days (Tot:10 Tab),\n'

In [98]:
# System prompt for the extraction step: instructs the model to turn the text
# extracted from a prescription (raw or table-formatted) into a JSON array of
# objects with "medicine", "frequency", "duration" and "special_instructions",
# and to output nothing except that JSON.
# NOTE(review): the examples only show three-slot frequency codes ("1-0-1"),
# while the guidance also names four times of day (Morning, Aft, Eve, Night).
# The number and order of slots is never pinned down, and the recorded model
# output mixes 3-slot ("1-0-1") and 4-slot ("1-1-1-1") codes — consider
# specifying the slot layout explicitly.
sys_prompt = """
You are a medical prescription assistant. You have been given a text extracted from a prescription.
It might be in a raw format or in a table format. Your task is to extract the information and give it in JSON.
GIVE THE OUTPUT STRICTLY IN A JSON FORMAT. DO NOT INCLUDE ANY OTHER TEXT OR EXPLANATION.
Output format:
                [
                    {
                        "medicine": "Medication A",
                        "frequency": "1-0-1",
                        "duration": "7 days",
                        "special_instructions": "Before food"
                    },
                    {
                        "medicine": "Medication B",
                        "frequency": "1-0-0",
                        "duration": "7 days",
                        "special_instructions": "After Food"
                    }
                ]
                So frequency is the time of day when the medicine should be taken, it may be mentioned as Morning, Night, Aft, Eve etc. or may be mentioned as 1-0-1, 0-1-0 etc. If it is mentioned as day and night, you should put it as 1-0-1. Just night means 0-0-1.
                Duration is the number of days for which the medicine should be taken. It may be mentioned as 7 days, 10 days etc.
                Special instructions are any special instructions given by the doctor. If there are no special instructions, please mention it as None.
"""

In [99]:
# Conversation sent to the model: extraction instructions first, then the table text
prompt_messages = [
    SystemMessage(content=sys_prompt),
    HumanMessage(content=table_string),
]
bot_response = llm.invoke(prompt_messages)

# The reply body is expected to be the JSON array described in the system prompt
print(bot_response.content)

[
    {
        "medicine": "TAB. DEMO MEDICINE 1",
        "frequency": "1-0-1",
        "duration": "10 days",
        "special_instructions": "Before Food"
    },
    {
        "medicine": "CAP. DEMO MEDICINE 2",
        "frequency": "1-0-1",
        "duration": "10 days",
        "special_instructions": "Before Food"
    },
    {
        "medicine": "TAB. DEMO MEDICINE 3",
        "frequency": "1-1-1-1",
        "duration": "10 days",
        "special_instructions": "After Food"
    },
    {
        "medicine": "TAB. DEMO MEDICINE 4",
        "frequency": "0.5-0-0-0.5",
        "duration": "10 days",
        "special_instructions": "After Food"
    }
]
