In [6]:
# Install the fuzzywuzzy package
!pip install fuzzywuzzy[speedup]

import pandas as pd
from fuzzywuzzy import fuzz

# Load the data from the provided file paths
ocr_result_path = '/content/OCR Result.xlsx'
physician_data_path = '/content/Extracted_Physician_Data.xlsx'
institute_data_path = '/content/Extracted_Institute_Data.xlsx'

ocr_result_df = pd.read_excel(ocr_result_path)
physician_data_df = pd.read_excel(physician_data_path)
institute_data_df = pd.read_excel(institute_data_path)

# Function to extract PHY_NM and PHY_ID using fuzzy matching
def get_physician_info(text, physician_data):
    best_match = None
    best_score = 0
    for phy_nm in physician_data['PHY_NM']:
        score = fuzz.partial_ratio(phy_nm, text)
        if score > best_score:
            best_score = score
            best_match = phy_nm
    if best_score > 80:  # Threshold for fuzzy matching
        phy_id = physician_data.loc[physician_data['PHY_NM'] == best_match, 'PHY_ID'].values[0]
        return best_match, phy_id
    return None, None

# Function to extract Institution_NM and Institution_ID using fuzzy matching
def get_institution_info(text, institute_data):
    best_match = None
    best_score = 0
    for inst_nm in institute_data['INS_NM1']:
        score = fuzz.partial_ratio(inst_nm, text)
        if score > best_score:
            best_score = score
            best_match = inst_nm
    if best_score > 80:  # Threshold for fuzzy matching
        inst_id = institute_data.loc[institute_data['INS_NM1'] == best_match, 'INSTCD'].values[0]
        return best_match, inst_id
    return None, None

# Function to determine the type (Slip or Prescription)
def determine_type(text, physician_data):
    for phy_nm in physician_data['PHY_NM']:
        if phy_nm in text:
            return "Prescription"
    return "Slip"

# Function to extract date from text (simplified)
def extract_date(text):
    import re
    date_pattern = r'\d{4}-\d{2}-\d{2}'  # Example pattern: YYYY-MM-DD
    match = re.search(date_pattern, text)
    return match.group(0) if match else None

# Reduce the number of entries for demonstration purposes
reduced_ocr_result_df = ocr_result_df.head(20)  # Select only the first 20 entries for demonstration

# Initialize an empty list to store the results
new_data_demo = []

for index, row in reduced_ocr_result_df.iterrows():
    image_name = row['image_name']
    extracted_text = row['extracted_text']
    phy_nm, phy_id = get_physician_info(extracted_text, physician_data_df)
    inst_nm, inst_id = get_institution_info(extracted_text, institute_data_df)
    type_info = determine_type(extracted_text, physician_data_df)
    date_info = extract_date(extracted_text)

    new_data_demo.append([
        image_name,
        extracted_text,
        phy_nm,
        phy_id,
        inst_nm,
        inst_id,
        type_info,
        date_info
    ])

# Convert the list to a DataFrame
new_df_demo = pd.DataFrame(new_data_demo, columns=[
    "Image name",
    "Extracted text from OCR",
    "PHY_NM from OCR",
    "PHY_ID",
    "Institution_NM",
    "Institution_ID",
    "Type",
    "Date"
])

# Display the new DataFrame
new_df_demo

# Save the new DataFrame to an Excel file
output_path_demo = '/content/Processed_OCR_Data_Demo.xlsx'
new_df_demo.to_excel(output_path_demo, index=False)

# Provide a link to download the file
from google.colab import files
files.download(output_path_demo)




<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Install the fuzzywuzzy package
!pip install fuzzywuzzy[speedup]

In [8]:


import pandas as pd
from fuzzywuzzy import fuzz

# Load the data from the provided file paths
ocr_result_path = '/content/OCR Result.xlsx'
physician_data_path = '/content/Extracted_Physician_Data.xlsx'
institute_data_path = '/content/Extracted_Institute_Data.xlsx'

ocr_result_df = pd.read_excel(ocr_result_path)
physician_data_df = pd.read_excel(physician_data_path)
institute_data_df = pd.read_excel(institute_data_path)

# Function to extract PHY_NM and PHY_ID using fuzzy matching
def get_physician_info(text, physician_data):
    best_match = None
    best_score = 0
    for phy_nm in physician_data['PHY_NM']:
        score = fuzz.partial_ratio(phy_nm, text)
        if score > best_score:
            best_score = score
            best_match = phy_nm
    if best_score > 80:  # Threshold for fuzzy matching
        phy_id = physician_data.loc[physician_data['PHY_NM'] == best_match, 'PHY_ID'].values[0]
        return best_match, phy_id
    return None, None

# Function to extract Institution_NM and Institution_ID using fuzzy matching
def get_institution_info(text, institute_data):
    best_match = None
    best_score = 0
    for inst_nm in institute_data['INS_NM1']:
        score = fuzz.partial_ratio(inst_nm, text)
        if score > best_score:
            best_score = score
            best_match = inst_nm
    if best_score > 80:  # Threshold for fuzzy matching
        inst_id = institute_data.loc[institute_data['INS_NM1'] == best_match, 'INSTCD'].values[0]
        return best_match, inst_id
    return None, None

# Function to determine the type (Slip or Prescription)
def determine_type(text, physician_data):
    for phy_nm in physician_data['PHY_NM']:
        if phy_nm in text:
            return "Prescription"
    return "Slip"

# Function to extract date from text (simplified)
def extract_date(text):
    import re
    date_pattern = r'\d{4}-\d{2}-\d{2}'  # Example pattern: YYYY-MM-DD
    match = re.search(date_pattern, text)
    return match.group(0) if match else None

# Reduce the number of entries for demonstration purposes
reduced_ocr_result_df = ocr_result_df.head(20)  # Select only the first 20 entries for demonstration

# Initialize an empty list to store the results
new_data_demo = []

for index, row in reduced_ocr_result_df.iterrows():
    image_name = row['image_name']
    extracted_text = row['extracted_text']
    phy_nm, phy_id = get_physician_info(extracted_text, physician_data_df)
    inst_nm, inst_id = get_institution_info(extracted_text, institute_data_df)
    type_info = "Prescription" if phy_nm or any(phy in extracted_text for phy in physician_data_df['PHY_NM']) else "Slip"
    date_info = extract_date(extracted_text)

    new_data_demo.append([
        image_name,
        extracted_text,
        phy_nm,
        phy_id,
        inst_nm,
        inst_id,
        type_info,
        date_info
    ])

# Convert the list to a DataFrame
new_df_demo = pd.DataFrame(new_data_demo, columns=[
    "Image name",
    "Extracted text from OCR",
    "PHY_NM from OCR",
    "PHY_ID",
    "Institution_NM",
    "Institution_ID",
    "Type",
    "Date"
])

# Display the new DataFrame
print(new_df_demo.head())

# Save the new DataFrame to an Excel file
output_path_demo = '/content/Processed_OCR_Data_Demo.xlsx'
new_df_demo.to_excel(output_path_demo, index=False)

# Provide a link to download the file
from google.colab import files
files.download(output_path_demo)


             Image name                            Extracted text from OCR  \
0  PRS207C1011672-2.jpg  **মুন্সীগঞ্জ সদর হাসপাতাল**\n**ঔষধ বিভাগ**\n\n...   
1    PRS208C6008963.jpg  ৫০ শয্যা বিশিষ্ট উপজেলা স্বাস্থ্য কমপ্লেক্স\nফ...   
2    PRS208C6014044.jpg  Apollo Hospitals Dhaka Limited\nA concern of A...   
3    PRS208C6020458.jpg  **Dr. Md. Noor Kutubul Alam**\nMBBS, BCS (Heal...   
4    PRS208C6009142.jpg  ৫০ শয্যা বিশিষ্ট উপজেলা স্বাস্থ্য কমপ্লেক্স\nফ...   

  PHY_NM from OCR PHY_ID Institution_NM Institution_ID  Type  Date  
0            None   None           None           None  Slip  None  
1            None   None           None           None  Slip  None  
2            None   None           None           None  Slip  None  
3            None   None           None           None  Slip  None  
4            None   None           None           None  Slip  None  


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [10]:

!pip install googletrans==4.0.0-rc1




In [15]:
import pandas as pd
from fuzzywuzzy import fuzz
from googletrans import Translator

# Load the data from the provided file paths
ocr_result_path = '/content/OCR Result.xlsx'
physician_data_path = '/content/Extracted_Physician_Data.xlsx'
institute_data_path = '/content/Extracted_Institute_Data.xlsx'

ocr_result_df = pd.read_excel(ocr_result_path)
physician_data_df = pd.read_excel(physician_data_path)
institute_data_df = pd.read_excel(institute_data_path)

translator = Translator()

# Function to translate Bengali text to English
def translate_text(text):
    try:
        translated = translator.translate(text, src='bn', dest='en')
        return translated.text
    except Exception as e:
        return ""

# Function to extract PHY_NM and PHY_ID using fuzzy matching
def get_physician_info(text, physician_data):
    best_match = None
    best_score = 0
    for phy_nm in physician_data['PHY_NM']:
        score = fuzz.partial_ratio(phy_nm, text)
        if score > best_score:
            best_score = score
            best_match = phy_nm
    if best_score > 80:  # Threshold for fuzzy matching
        phy_id = physician_data.loc[physician_data['PHY_NM'] == best_match, 'PHY_ID'].values[0]
        return best_match, phy_id
    return None, None

# Function to extract Institution_NM and Institution_ID using fuzzy matching
def get_institution_info(text, institute_data):
    best_match = None
    best_score = 0
    for inst_nm in institute_data['INS_NM1']:
        score = fuzz.partial_ratio(inst_nm, text)
        if score > best_score:
            best_score = score
            best_match = inst_nm
    if best_score > 80:  # Threshold for fuzzy matching
        inst_id = institute_data.loc[institute_data['INS_NM1'] == best_match, 'INSTCD'].values[0]
        return best_match, inst_id
    return None, None

# Function to determine the type (Slip or Prescription)
def determine_type(text, physician_data):
    for phy_nm in physician_data['PHY_NM']:
        if phy_nm in text:
            return "Prescription"
    return "Slip"

# Function to extract date from text (simplified)
def extract_date(text):
    import re
    date_pattern = r'\d{4}-\d{2}-\d{2}'  # Example pattern: YYYY-MM-DD
    match = re.search(date_pattern, text)
    return match.group(0) if match else None

# Reduce the number of entries for demonstration purposes
reduced_ocr_result_df = ocr_result_df.head(20)  # Select only the first 20 entries for demonstration

# Initialize an empty list to store the results
new_data_demo = []

for index, row in reduced_ocr_result_df.iterrows():
    image_name = row['image_name']
    extracted_text = row['extracted_text']
    translated_text = translate_text(extracted_text)
    merged_text = extracted_text + " " + translated_text

    print(translated_text)
    phy_nm, phy_id = get_physician_info(merged_text, physician_data_df)
    inst_nm, inst_id = get_institution_info(merged_text, institute_data_df)
    type_info = "Prescription" if phy_nm or any(phy in merged_text for phy in physician_data_df['PHY_NM']) else "Slip"
    date_info = extract_date(extracted_text)

    new_data_demo.append([
        image_name,
        extracted_text,
        phy_nm,
        phy_id,
        inst_nm,
        inst_id,
        type_info,
        date_info
    ])

# Convert the list to a DataFrame
new_df_demo = pd.DataFrame(new_data_demo, columns=[
    "Image name",
    "Extracted text from OCR",
    "PHY_NM from OCR",
    "PHY_ID",
    "Institution_NM",
    "Institution_ID",
    "Type",
    "Date"
])

# Display the new DataFrame
print(new_df_demo.head())

# Save the new DataFrame to an Excel file
output_path_demo = '/content/Processed_OCR_Data_Demo.xlsx'
new_df_demo.to_excel(output_path_demo, index=False)

# Provide a link to download the file
from google.colab import files
files.download(output_path_demo)

** Munshiganj Sadar Hospital **
** Drug Department **

** Patient Name: ** Md. A. Jalil
** Age: ** 3 years
** Sex: ** male
** Address: ** Munshiganj Sadar

** Name of drugs: **
1.Tablets: Lamicil (20 mg)
2.Tablet: CiftryAxone (1 mg)
1.Tablets: Metronidazole (1 mg)
1.Injection: SafetyAxone (1 gram)

** morning: **
1.Tablets: Lamicil (20 mg)
2.Tablet: CiftryAxone (1 mg)
1.Tablets: Metronidazole (1 mg)

** in the afternoon: **
1.Tablets: Lamicil (20 mg)
2.Tablet: CiftryAxone (1 mg)
1.Tablets: Metronidazole (1 mg)

** night: **
1.Injection: SafetyAxone (1 gram)

** 3 days **
3 beds Upazila Health Complex
Phulbaria, Magura.
登記 番号 8632
Logarithm Tickets in Bangladesh
Office: Fulbaria Upazila Health Complex
Name: Maya Queen
Age: 5 years
Male/Women: Children
Disease
Medicine
1. CEFIXIME 200mg-0-1-0
2. Napa-one-0-1-0
3. Tab.Voniza 20 mg-0-1
4. Tab.Bilastin-10-0-0-1
5. Tab.Rapasa Bar-0-0-1-0
Apollo hospitals dhaka limited
A concern of apollo hospitals enterprise limited
Level-6, Block-B, Bashund

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [16]:
# Install necessary packages
!pip install fuzzywuzzy[speedup]
!pip install googletrans==4.0.0-rc1

import pandas as pd
from fuzzywuzzy import fuzz
from googletrans import Translator
import re

# Load the data from the provided file paths
ocr_result_path = '/content/OCR Result.xlsx'
physician_data_path = '/content/Extracted_Physician_Data.xlsx'
institute_data_path = '/content/Extracted_Institute_Data.xlsx'

ocr_result_df = pd.read_excel(ocr_result_path)
physician_data_df = pd.read_excel(physician_data_path)
institute_data_df = pd.read_excel(institute_data_path)

translator = Translator()

# Function to translate Bengali text to English
def translate_text(text):
    try:
        translated = translator.translate(text, src='bn', dest='en')
        return translated.text
    except Exception as e:
        return ""

# Function to extract PHY_NM and PHY_ID using fuzzy matching
def get_physician_info(text, physician_data):
    best_match = None
    best_score = 0
    for phy_nm in physician_data['PHY_NM']:
        score = fuzz.partial_ratio(phy_nm, text)
        if score > best_score:
            best_score = score
            best_match = phy_nm
    if best_score > 80:  # Threshold for fuzzy matching
        phy_id = physician_data.loc[physician_data['PHY_NM'] == best_match, 'PHY_ID'].values[0]
        return best_match, phy_id
    return None, None

# Function to extract Institution_NM and Institution_ID using fuzzy matching
def get_institution_info(text, institute_data):
    best_match = None
    best_score = 0
    for inst_nm in institute_data['INS_NM1']:
        score = fuzz.partial_ratio(inst_nm, text)
        if score > best_score:
            best_score = score
            best_match = inst_nm
    if best_score > 80:  # Threshold for fuzzy matching
        inst_id = institute_data.loc[institute_data['INS_NM1'] == best_match, 'INSTCD'].values[0]
        return best_match, inst_id
    return None, None

# Function to determine the type (Slip or Prescription)
def determine_type(text, physician_data):
    for phy_nm in physician_data['PHY_NM']:
        if phy_nm in text:
            return "Prescription"
    return "Slip"

# Function to extract date from text (both Bengali and English)
def extract_date(text):
    # English date patterns
    date_patterns = [
        r'\d{4}-\d{2}-\d{2}', r'\d{2}-\d{2}-\d{4}', r'\d{2}/\d{2}/\d{4}', r'\d{4}/\d{2}/\d{2}',
        r'\d{4}\.\d{2}\.\d{2}', r'\d{2}\.\d{2}\.\d{4}'
    ]
    # Bengali date patterns (translated to their Unicode equivalents)
    bangla_digits = '০১২৩৪৫৬৭৮৯'
    date_patterns += [
        r'[{}]{{4}}-[{}]{{2}}-[{}]{{2}}'.format(bangla_digits, bangla_digits, bangla_digits),
        r'[{}]{{2}}-[{}]{{2}}-[{}]{{4}}'.format(bangla_digits, bangla_digits, bangla_digits),
        r'[{}]{{2}}/[{}]{{2}}/[{}]{{4}}'.format(bangla_digits, bangla_digits, bangla_digits),
        r'[{}]{{4}}/[{}]{{2}}/[{}]{{2}}'.format(bangla_digits, bangla_digits, bangla_digits),
        r'[{}]{{4}}\.[{}]{{2}}\.[{}]{{2}}'.format(bangla_digits, bangla_digits, bangla_digits),
        r'[{}]{{2}}\.[{}]{{2}}\.[{}]{{4}}'.format(bangla_digits, bangla_digits, bangla_digits)
    ]

    for pattern in date_patterns:
        match = re.search(pattern, text)
        if match:
            return match.group(0)
    return None

# Reduce the number of entries for demonstration purposes
reduced_ocr_result_df = ocr_result_df.head(20)  # Select only the first 20 entries for demonstration

# Initialize an empty list to store the results
new_data_demo = []

for index, row in reduced_ocr_result_df.iterrows():
    image_name = row['image_name']
    extracted_text = row['extracted_text']
    translated_text = translate_text(extracted_text)
    merged_text = extracted_text + " " + translated_text
    phy_nm, phy_id = get_physician_info(merged_text, physician_data_df)
    inst_nm, inst_id = get_institution_info(merged_text, institute_data_df)
    type_info = "Prescription" if phy_nm or any(phy in merged_text for phy in physician_data_df['PHY_NM']) else "Slip"
    date_info = extract_date(merged_text)

    new_data_demo.append([
        image_name,
        extracted_text,
        phy_nm,
        phy_id,
        inst_nm,
        inst_id,
        type_info,
        date_info
    ])

# Convert the list to a DataFrame
new_df_demo = pd.DataFrame(new_data_demo, columns=[
    "Image name",
    "Extracted text from OCR",
    "PHY_NM from OCR",
    "PHY_ID",
    "Institution_NM",
    "Institution_ID",
    "Type",
    "Date"
])

# Display the new DataFrame
print(new_df_demo.head())

# Save the new DataFrame to an Excel file
output_path_demo = '/content/Processed_OCR_Data_Demo.xlsx'
new_df_demo.to_excel(output_path_demo, index=False)

# Provide a link to download the file
from google.colab import files
files.download(output_path_demo)


             Image name                            Extracted text from OCR  \
0  PRS207C1011672-2.jpg  **মুন্সীগঞ্জ সদর হাসপাতাল**\n**ঔষধ বিভাগ**\n\n...   
1    PRS208C6008963.jpg  ৫০ শয্যা বিশিষ্ট উপজেলা স্বাস্থ্য কমপ্লেক্স\nফ...   
2    PRS208C6014044.jpg  Apollo Hospitals Dhaka Limited\nA concern of A...   
3    PRS208C6020458.jpg  **Dr. Md. Noor Kutubul Alam**\nMBBS, BCS (Heal...   
4    PRS208C6009142.jpg  ৫০ শয্যা বিশিষ্ট উপজেলা স্বাস্থ্য কমপ্লেক্স\nফ...   

  PHY_NM from OCR PHY_ID Institution_NM Institution_ID  Type        Date  
0            None   None           None           None  Slip        None  
1            None   None           None           None  Slip        None  
2            None   None           None           None  Slip        None  
3            None   None           None           None  Slip  08/04/2024  
4            None   None           None           None  Slip        None  


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:

import pandas as pd
from fuzzywuzzy import fuzz
from googletrans import Translator
import re

# Load the data from the provided file paths
ocr_result_path = '/content/OCR Result.xlsx'
physician_data_path = '/content/Extracted_Physician_Data.xlsx'
institute_data_path = '/content/Extracted_Institute_Data.xlsx'

ocr_result_df = pd.read_excel(ocr_result_path)
physician_data_df = pd.read_excel(physician_data_path)
institute_data_df = pd.read_excel(institute_data_path)

translator = Translator()

# Function to translate Bengali text to English
def translate_text(text):
    try:
        translated = translator.translate(text, src='bn', dest='en')
        return translated.text
    except Exception as e:
        return ""

# Function to extract PHY_NM and PHY_ID using fuzzy matching
def get_physician_info(text, physician_data):
    best_match = None
    best_score = 0
    for phy_nm in physician_data['PHY_NM']:
        score = fuzz.partial_ratio(phy_nm, text)
        if score > best_score:
            best_score = score
            best_match = phy_nm
    if best_score > 80:  # Threshold for fuzzy matching
        phy_id = physician_data.loc[physician_data['PHY_NM'] == best_match, 'PHY_ID'].values[0]
        return best_match, phy_id
    return None, None

# Function to extract Institution_NM and Institution_ID using fuzzy matching
def get_institution_info(text, institute_data):
    best_match = None
    best_score = 0
    for inst_nm in institute_data['INS_NM1']:
        score = fuzz.partial_ratio(inst_nm, text)
        if score > best_score:
            best_score = score
            best_match = inst_nm
    if best_score > 80:  # Threshold for fuzzy matching
        inst_id = institute_data.loc[institute_data['INS_NM1'] == best_match, 'INSTCD'].values[0]
        return best_match, inst_id
    return None, None

# Function to determine the type (Slip or Prescription)
def determine_type(text, physician_data):
    for phy_nm in physician_data['PHY_NM']:
        if phy_nm in text:
            return "Prescription"
    return "Slip"

# Function to extract date from text (both Bengali and English)
def extract_date(text):
    # English date patterns
    date_patterns = [
        r'\d{4}-\d{2}-\d{2}', r'\d{2}-\d{2}-\d{4}', r'\d{2}/\d{2}/\d{4}', r'\d{4}/\d{2}/\d{2}',
        r'\d{4}\.\d{2}\.\d{2}', r'\d{2}\.\d{2}\.\d{4}'
    ]
    # Bengali date patterns (translated to their Unicode equivalents)
    bangla_digits = '০১২৩৪৫৬৭৮৯'
    date_patterns += [
        r'[{}]{{4}}-[{}]{{2}}-[{}]{{2}}'.format(bangla_digits, bangla_digits, bangla_digits),
        r'[{}]{{2}}-[{}]{{2}}-[{}]{{4}}'.format(bangla_digits, bangla_digits, bangla_digits),
        r'[{}]{{2}}/[{}]{{2}}/[{}]{{4}}'.format(bangla_digits, bangla_digits, bangla_digits),
        r'[{}]{{4}}/[{}]{{2}}/[{}]{{2}}'.format(bangla_digits, bangla_digits, bangla_digits),
        r'[{}]{{4}}\.[{}]{{2}}\.[{}]{{2}}'.format(bangla_digits, bangla_digits, bangla_digits),
        r'[{}]{{2}}\.[{}]{{2}}\.[{}]{{4}}'.format(bangla_digits, bangla_digits, bangla_digits)
    ]

    for pattern in date_patterns:
        match = re.search(pattern, text)
        if match:
            return match.group(0)
    return None

# Reduce the number of entries for demonstration purposes
reduced_ocr_result_df = ocr_result_df.head(20)  # Select only the first 20 entries for demonstration

# Initialize an empty list to store the results
new_data_demo = []

for index, row in reduced_ocr_result_df.iterrows():
    image_name = row['image_name']
    extracted_text = row['extracted_text']
    translated_text = translate_text(extracted_text)
    merged_text = extracted_text + " " + translated_text
    phy_nm, phy_id = get_physician_info(merged_text, physician_data_df)



    inst_nm, inst_id = get_institution_info(merged_text, institute_data_df)
    type_info = "Prescription" if phy_nm or any(phy in merged_text for phy in physician_data_df['PHY_NM']) else "Slip"
    date_info = extract_date(merged_text)

    new_data_demo.append([
        image_name,
        extracted_text,
        phy_nm,
        phy_id,
        inst_nm,
        inst_id,
        type_info,
        date_info
    ])

# Convert the list to a DataFrame
new_df_demo = pd.DataFrame(new_data_demo, columns=[
    "Image name",
    "Extracted text from OCR",
    "PHY_NM from OCR",
    "PHY_ID",
    "Institution_NM",
    "Institution_ID",
    "Type",
    "Date"
])

# Display the new DataFrame
print(new_df_demo.head())

# Save the new DataFrame to an Excel file
output_path_demo = '/content/Processed_OCR_Data_Demo.xlsx'
new_df_demo.to_excel(output_path_demo, index=False)

# Provide a link to download the file
from google.colab import files
files.download(output_path_demo)


In [20]:
# # Install necessary packages
# !pip install fuzzywuzzy[speedup]
# !pip install googletrans==4.0.0-rc1

import pandas as pd
from fuzzywuzzy import fuzz
from googletrans import Translator
import re

# Load the data from the provided file paths
ocr_result_path = '/content/OCR Result.xlsx'
physician_data_path = '/content/Extracted_Physician_Data.xlsx'
institute_data_path = '/content/Extracted_Institute_Data.xlsx'

ocr_result_df = pd.read_excel(ocr_result_path)
physician_data_df = pd.read_excel(physician_data_path)
institute_data_df = pd.read_excel(institute_data_path)

translator = Translator()

# Function to translate Bengali text to English
def translate_text(text):
    try:
        translated = translator.translate(text, src='bn', dest='en')
        return translated.text
    except Exception as e:
        return ""

# Preprocess text for matching
def preprocess_text(text):
    text = text.replace('DR.', 'DR_')  # Temporarily replace 'DR.' to retain it
    text = re.sub(r'\.', '', text)  # Remove all other periods
    text = re.sub(r'[^A-Za-z\s]', '', text)  # Remove non-alphabetic characters
    text = text.replace('DR_', 'DR.')  # Restore 'DR.'
    text = re.sub(r'\s+', ' ', text)  # Normalize whitespace
    return text.strip()

# Preprocess text for exact matching
def preprocess_text_exact(text):
    text = re.sub(r'\.', '', text)  # Remove all periods
    text = re.sub(r'\s+', '', text)  # Remove all spaces
    text = re.sub(r'[^A-Za-z]', ''       , text)  # Remove non-alphabetic characters
    return text.strip()

# Function to process an image using fuzzy matching
def process_text(merged_text, phy_df):
    preprocessed_merged_text = preprocess_text(merged_text)

    # Print the preprocessed merged text
    print(f"Preprocessed Merged Text: {preprocessed_merged_text}")

    # Use fuzzy matching to find the best match for each physician name
    matched_docs = []
    for index, row in phy_df.iterrows():
        phy_nm = preprocess_text(row['PHY_NM'].upper())
        phy_id = row['PHY_ID']
        match_score = fuzz.partial_ratio(phy_nm, preprocessed_merged_text)
        if match_score > 80:  # Using a high threshold to ensure accuracy
            matched_docs.append((row['PHY_NM'], phy_id, match_score))

    if matched_docs:
        best_match = max(matched_docs, key=lambda x: x[2])
        return best_match[0], best_match[1]
    else:
        return None, None

# Function to determine the type (Slip or Prescription)
def determine_type(merged_text, phy_df):
    for phy_nm in phy_df['PHY_NM']:
        if phy_nm in merged_text:
            return "Prescription"
    return "Slip"

# Function to extract date from text (both Bengali and English)
def extract_date(text):
    # English date patterns
    date_patterns = [
        r'\d{4}-\d{2}-\d{2}', r'\d{2}-\d{2}-\d{4}', r'\d{2}/\d{2}/\d{4}', r'\d{4}/\d{2}/\d{2}',
        r'\d{4}\.\d{2}\.\d{2}', r'\d{2}\.\d{2}\.\d{4}'
    ]
    # Bengali date patterns (translated to their Unicode equivalents)
    bangla_digits = '০১২৩৪৫৬৭৮৯'
    date_patterns += [
        r'[{}]{{4}}-[{}]{{2}}-[{}]{{2}}'.format(bangla_digits, bangla_digits, bangla_digits),
        r'[{}]{{2}}-[{}]{{2}}-[{}]{{4}}'.format(bangla_digits, bangla_digits, bangla_digits),
        r'[{}]{{2}}/[{}]{{2}}/[{}]{{4}}'.format(bangla_digits, bangla_digits, bangla_digits),
        r'[{}]{{4}}/[{}]{{2}}/[{}]{{2}}'.format(bangla_digits, bangla_digits, bangla_digits),
        r'[{}]{{4}}\.[{}]{{2}}\.[{}]{{2}}'.format(bangla_digits, bangla_digits, bangla_digits),
        r'[{}]{{2}}\.[{}]{{2}}\.[{}]{{4}}'.format(bangla_digits, bangla_digits, bangla_digits)
    ]

    for pattern in date_patterns:
        match = re.search(pattern, text)
        if match:
            return match.group(0)
    return None

# Reduce the number of entries for demonstration purposes
reduced_ocr_result_df = ocr_result_df.head(20)  # Select only the first 20 entries for demonstration

# Initialize an empty list to store the results
new_data_demo = []

for index, row in reduced_ocr_result_df.iterrows():
    image_name = row['image_name']
    extracted_text = row['extracted_text']
    translated_text = translate_text(extracted_text)
    merged_text = extracted_text + " " + translated_text
    phy_nm, phy_id = process_text(merged_text, physician_data_df)
    inst_nm, inst_id = get_institution_info(merged_text, institute_data_df)
    type_info = "Prescription" if phy_nm else "Slip"
    date_info = extract_date(merged_text)

    new_data_demo.append([
        image_name,
        extracted_text,
        phy_nm,
        phy_id,
        inst_nm,
        inst_id,
        type_info,
        date_info
    ])

# Convert the list to a DataFrame
new_df_demo = pd.DataFrame(new_data_demo, columns=[
    "Image name",
    "Extracted text from OCR",
    "PHY_NM from OCR",
    "PHY_ID",
    "Institution_NM",
    "Institution_ID",
    "Type",
    "Date"
])

# Display the new DataFrame
print(new_df_demo.head())

# Save the new DataFrame to an Excel file
output_path_demo = '/content/Processed_OCR_Data_Demo.xlsx'
new_df_demo.to_excel(output_path_demo, index=False)

# Provide a link to download the file
# from google.colab import files
# files.download(output_path_demo)


Preprocessed Merged Text: Munshiganj Sadar Hospital Drug Department Patient Name Md A Jalil Age years Sex male Address Munshiganj Sadar Name of drugs Tablets Lamicil mg Tablet CiftryAxone mg Tablets Metronidazole mg Injection SafetyAxone gram morning Tablets Lamicil mg Tablet CiftryAxone mg Tablets Metronidazole mg in the afternoon Tablets Lamicil mg Tablet CiftryAxone mg Tablets Metronidazole mg night Injection SafetyAxone gram days
Preprocessed Merged Text: Cefixime mg Napaone Tab Voniza mg Tab Bilastin Tab Rapasa Bar beds Upazila Health Complex Phulbaria Magura Logarithm Tickets in Bangladesh Office Fulbaria Upazila Health Complex Name Maya Queen Age years MaleWomen Children Disease Medicine CEFIXIME mg Napaone TabVoniza mg TabBilastin TabRapasa Bar
Preprocessed Merged Text: Apollo Hospitals Dhaka Limited A concern of Apollo Hospitals Enterprise Limited Level BlockB Bashundhara City Shopping Complex Panthapath Dhaka Phone Fax Email hasanromapollohealthcom Phone Email drhasancihbdorg

In [23]:
# # Install necessary packages
# !pip install fuzzywuzzy[speedup]
# !pip install googletrans==4.0.0-rc1

import pandas as pd
from fuzzywuzzy import fuzz
from googletrans import Translator
import re

# Load the data from the provided file paths
ocr_result_path = '/content/Ocr500.xlsx'
physician_data_path = '/content/Extracted_Physician_Data.xlsx'
institute_data_path = '/content/Extracted_Institute_Data.xlsx'

ocr_result_df = pd.read_excel(ocr_result_path)
physician_data_df = pd.read_excel(physician_data_path)
institute_data_df = pd.read_excel(institute_data_path)

translator = Translator()

# Function to translate Bengali text to English
def translate_text(text):
    try:
        translated = translator.translate(text, src='bn', dest='en')
        return translated.text
    except Exception as e:
        return ""

# Preprocess text for matching
def preprocess_text(text):
    text = text.replace('DR.', 'DR_')  # Temporarily replace 'DR.' to retain it
    text = re.sub(r'\.', '', text)  # Remove all other periods
    text = re.sub(r'[^A-Za-z\s]', '', text)  # Remove non-alphabetic characters
    text = text.replace('DR_', 'DR.')  # Restore 'DR.'
    text = re.sub(r'\s+', ' ', text)  # Normalize whitespace
    return text.strip()

# Preprocess text for exact matching
def preprocess_text_exact(text):
    text = re.sub(r'\.', '', text)  # Remove all periods
    text = re.sub(r'\s+', '', text)  # Remove all spaces
    text = re.sub(r'[^A-Za-z]', '', text)  # Remove non-alphabetic characters
    return text.strip()

# Function to process an image using Code1
def process_image_code1(merged_string, phy_df, image_name, results):
    preprocessed_merged_string = preprocess_text(merged_string)
    print("preprocessed_merged_string (Code1) =", preprocessed_merged_string)

    # Use fuzzy matching to find the best match for each physician name
    matched_docs = []
    for index, row in phy_df.iterrows():
        phy_nm = preprocess_text(row['PHY_NM'].upper())
        phy_id = row['PHY_ID']
        match_score = fuzz.partial_ratio(phy_nm, preprocessed_merged_string)
        if match_score > 80:  # Using a high threshold to ensure accuracy
            matched_docs.append((row['PHY_NM'], phy_id, match_score))

    if matched_docs:
        for doc in matched_docs:
            print(f"Matched Doctor Name (Code1): {doc[0]}, ID: {doc[1]}, Score: {doc[2]}\n")
            results.append([len(results) + 1, image_name, doc[0], doc[1]])
        return True
    else:
        return False

# Function to process an image using Code2
def process_image_code2(merged_string, phy_df, image_name, results):
    preprocessed_merged_string = preprocess_text_exact(merged_string)
    print("preprocessed_merged_string (Code2) =", preprocessed_merged_string)

    # Check for exact matches in the physician names
    matched_docs = []
    for index, row in phy_df.iterrows():
        phy_nm = preprocess_text_exact(row['PHY_NM'].upper())
        phy_id = row['PHY_ID']
        if phy_nm in preprocessed_merged_string:
            matched_docs.append((row['PHY_NM'], phy_id))

    if matched_docs:
        for doc in matched_docs:
            print(f"Matched Doctor Name (Code2): {doc[0]}, ID: {doc[1]}\n")
            results.append([len(results) + 1, image_name, doc[0], doc[1]])
        return True
    else:
        return False

# Function to extract Institution_NM and Institution_ID using fuzzy matching
def get_institution_info(text, institute_data):
    best_match = None
    best_score = 0
    for inst_nm in institute_data['INS_NM1']:
        score = fuzz.partial_ratio(inst_nm, text)
        if score > best_score:
            best_score = score
            best_match = inst_nm
    if best_score > 80:  # Threshold for fuzzy matching
        inst_id = institute_data.loc[institute_data['INS_NM1'] == best_match, 'INSTCD'].values[0]
        return best_match, inst_id
    return None, None

# Function to determine the type (Slip or Prescription)
def determine_type(merged_text, phy_df):
    for phy_nm in phy_df['PHY_NM']:
        if phy_nm in merged_text:
            return "Prescription"
    return "Slip"

# Function to extract date from text (both Bengali and English)
def extract_date(text):
    # English date patterns
    date_patterns = [
        r'\d{4}-\d{2}-\d{2}', r'\d{2}-\d{2}-\d{4}', r'\d{2}/\d{2}/\d{4}', r'\d{4}/\d{2}/\d{2}',
        r'\d{4}\.\d{2}\.\d{2}', r'\d{2}\.\d{2}\.\d{4}'
    ]
    # Bengali date patterns (translated to their Unicode equivalents)
    bangla_digits = '০১২৩৪৫৬৭৮৯'
    date_patterns += [
        r'[{}]{{4}}-[{}]{{2}}-[{}]{{2}}'.format(bangla_digits, bangla_digits, bangla_digits),
        r'[{}]{{2}}-[{}]{{2}}-[{}]{{4}}'.format(bangla_digits, bangla_digits, bangla_digits),
        r'[{}]{{2}}/[{}]{{2}}/[{}]{{4}}'.format(bangla_digits, bangla_digits, bangla_digits),
        r'[{}]{{4}}/[{}]{{2}}/[{}]{{2}}'.format(bangla_digits, bangla_digits, bangla_digits),
        r'[{}]{{4}}\.[{}]{{2}}\.[{}]{{2}}'.format(bangla_digits, bangla_digits, bangla_digits),
        r'[{}]{{2}}\.[{}]{{2}}\.[{}]{{4}}'.format(bangla_digits, bangla_digits, bangla_digits)
    ]

    for pattern in date_patterns:
        match = re.search(pattern, text)
        if match:
            return match.group(0)
    return None

# Reduce the number of entries for demonstration purposes
reduced_ocr_result_df = ocr_result_df.head(20)  # Select only the first 20 entries for demonstration

# Initialize an empty list to store the results
new_data_demo = []

# Initialize the results list
results = []

for index, row in reduced_ocr_result_df.iterrows():
    image_name = row['Image Name']
    extracted_text = row['Extracted Text']
    translated_text = translate_text(extracted_text)
    merged_text = extracted_text + " " + translated_text

    # Print the preprocessed merged text
    print(f"Preprocessed Merged Text: {merged_text}")

    # Attempt matching using Code2 first
    match_found = process_image_code2(merged_text, physician_data_df, image_name, results)

    if not match_found:
        # If no match found using Code2, attempt matching using Code1
        match_found = process_image_code1(merged_text, physician_data_df, image_name, results)

        if not match_found:
            print("Both codes show No match found\n")

    inst_nm, inst_id = get_institution_info(merged_text, institute_data_df)
    type_info = "Prescription" if any([res[2] for res in results if res[1] == image_name]) else "Slip"
    date_info = extract_date(merged_text)

    new_data_demo.append([
        image_name,
        extracted_text,
        [res[2] for res in results if res[1] == image_name],
        [res[3] for res in results if res[1] == image_name],
        inst_nm,
        inst_id,
        type_info,
        date_info
    ])

# Convert the list to a DataFrame
new_df_demo = pd.DataFrame(new_data_demo, columns=[
    "Image name",
    "Extracted text from OCR",
    "PHY_NM from OCR",
    "PHY_ID",
    "Institution_NM",
    "Institution_ID",
    "Type",
    "Date"
])

# Display the new DataFrame
print(new_df_demo.head())

# Save the new DataFrame to an Excel file
output_path_demo = '/content/Processed_OCR_Data_Demo.xlsx'
new_df_demo.to_excel(output_path_demo, index=False)

# Provide a link to download the file
from google.colab import files
files.download(output_path_demo)


Preprocessed Merged Text: শীতল ক্যান্টনমেন্ট ডায়াগনস্টিক সেন্টার, রংপুর
মেজর (ডাঃ) মোঃ আব্দুল্লাহিস্ সাফি
এমবিবিএস, এফসিপিএস, এমসিপিএস, ডি
থাইরয়েড নেক সার্জারী কানের মাইক্রোসার্জারীতে
উচ্চতর প্রশিক্ষণ প্রাপ্ত (সম্মিলিত সামরিক হাসপাতাল, ঢাকা)
নাক, কনা, গলা, মাথা-ঘাড় রোগ বিশেষজ্ঞজ্ঞ
সহকারী অধ্যাপক, নাক, কান, গলা বিভাগ
এন্ড সার্জন
আমি মেডিকেল কলেজ, রংপুর ও সিএমএইচ রংপুর
মোবাইল: ০১৭৩৭-৩২৮২৮১
CGH
Major (Dr) Md. Abdullahis Safi
MBBS, FCPS, MCPS, DLO
Higher training on Thyroid, Neck & Micro Ear Surgery
(Combined Military Hospital, Dhaka)
Nose, Ear, Throat, Head and Neck Specialist and Surgeon
Assistant Professor, Department of Nose, Ear, Throat
Army Medical College, Rangpur and CMH Rangpur.
Mobilel 01737-328281
Name:
Md. Mubarak Hossain
Age: 17 Y
Sex: Male
Date: 28/03/2024
ID: 28032405
On Examinations
• Nasal airway is
clear
Rx,
1. Cap. Orcef 200 mg
১+০+১ টি
• nasal suction and
clearance was
done
⚫ Nasal splint is
removed today
Diagnosis
• DNS with HIT (OP)
2. Tab. Alcet 5 mg
০ + ০ + ১ টি


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [25]:
# Install necessary packages
!pip install fuzzywuzzy[speedup]
!pip install googletrans==4.0.0-rc1

import pandas as pd
from fuzzywuzzy import fuzz
from googletrans import Translator
import re

# Load the data from the provided file paths
new_ocr_result_path = '/content/Ocr500.xlsx'
physician_data_path = '/content/Extracted_Physician_Data.xlsx'
institute_data_path = '/content/Extracted_Institute_Data.xlsx'

new_ocr_result_df = pd.read_excel(new_ocr_result_path)
physician_data_df = pd.read_excel(physician_data_path)
institute_data_df = pd.read_excel(institute_data_path)

translator = Translator()

# Function to translate Bengali text to English
def translate_text(text):
    try:
        translated = translator.translate(text, src='bn', dest='en')
        return translated.text
    except Exception as e:
        return ""

# Preprocess text for matching
def preprocess_text(text):
    text = text.replace('DR.', 'DR_')  # Temporarily replace 'DR.' to retain it
    text = re.sub(r'\.', '', text)  # Remove all other periods
    text = re.sub(r'[^A-Za-z\s]', '', text)  # Remove non-alphabetic characters
    text = text.replace('DR_', 'DR.')  # Restore 'DR.'
    text = re.sub(r'\s+', ' ', text)  # Normalize whitespace
    return text.strip().upper()  # Convert to uppercase

# Preprocess text for exact matching
def preprocess_text_exact(text):
    text = re.sub(r'\.', '', text)  # Remove all periods
    text = re.sub(r'\s+', '', text)  # Remove all spaces
    text = re.sub(r'[^A-Za-z]', '', text)  # Remove non-alphabetic characters
    return text.strip().upper()  # Convert to uppercase

# New function to process an image using Code3
def process_image_code3(extracted_text, phy_df, image_name, results):
    # Separate Bengali and English text
    bangla_text = ''.join([char for char in extracted_text if '\u0980' <= char <= '\u09FF'])
    english_text = ''.join([char for char in extracted_text if char not in bangla_text])

    # Translate Bengali text to English
    translated_text = translate_text(bangla_text)

    # Remove special characters and spaces
    english_text = re.sub(r'[^A-Za-z\s]', '', english_text).replace(' ', '')
    translated_text = re.sub(r'[^A-Za-z\s]', '', translated_text).replace(' ', '')

    # Merge texts
    merged_text = (english_text + translated_text).upper()
    print(f"Preprocessed Merged Text (Code3): {merged_text}")

    # Use fuzzy matching to find the best match for each physician name
    matched_docs = []
    for index, row in phy_df.iterrows():
        phy_nm = preprocess_text_exact(row['PHY_NM'].upper())
        phy_id = row['PHY_ID']
        match_score = fuzz.partial_ratio(phy_nm, merged_text)
        if match_score > 80:  # Using a high threshold to ensure accuracy
            matched_docs.append((row['PHY_NM'], phy_id, match_score))

    if matched_docs:
        for doc in matched_docs:
            print(f"Matched Doctor Name (Code3): {doc[0]}, ID: {doc[1]}, Score: {doc[2]}\n")
            results.append([len(results) + 1, image_name, doc[0], doc[1]])
        return True
    else:
        return False

# Function to process an image using Code1
def process_image_code1(merged_string, phy_df, image_name, results):
    preprocessed_merged_string = preprocess_text(merged_string)
    print("preprocessed_merged_string (Code1) =", preprocessed_merged_string)

    # Use fuzzy matching to find the best match for each physician name
    matched_docs = []
    for index, row in phy_df.iterrows():
        phy_nm = preprocess_text(row['PHY_NM'].upper())
        phy_id = row['PHY_ID']
        match_score = fuzz.partial_ratio(phy_nm, preprocessed_merged_string)
        if match_score > 80:  # Using a high threshold to ensure accuracy
            matched_docs.append((row['PHY_NM'], phy_id, match_score))

    if matched_docs:
        for doc in matched_docs:
            print(f"Matched Doctor Name (Code1): {doc[0]}, ID: {doc[1]}, Score: {doc[2]}\n")
            results.append([len(results) + 1, image_name, doc[0], doc[1]])
        return True
    else:
        return False

# Function to process an image using Code2
def process_image_code2(merged_string, phy_df, image_name, results):
    preprocessed_merged_string = preprocess_text_exact(merged_string)
    print("preprocessed_merged_string (Code2) =", preprocessed_merged_string)

    # Check for exact matches in the physician names
    matched_docs = []
    for index, row in phy_df.iterrows():
        phy_nm = preprocess_text_exact(row['PHY_NM'].upper())
        phy_id = row['PHY_ID']
        if phy_nm in preprocessed_merged_string:
            matched_docs.append((row['PHY_NM'], phy_id))

    if matched_docs:
        for doc in matched_docs:
            print(f"Matched Doctor Name (Code2): {doc[0]}, ID: {doc[1]}\n")
            results.append([len(results) + 1, image_name, doc[0], doc[1]])
        return True
    else:
        return False

# Function to extract Institution_NM and Institution_ID using fuzzy matching
def get_institution_info(text, institute_data):
    best_match = None
    best_score = 0
    for inst_nm in institute_data['INS_NM1']:
        score = fuzz.partial_ratio(inst_nm.upper(), text)
        if score > best_score:
            best_score = score
            best_match = inst_nm
    if best_score > 80:  # Threshold for fuzzy matching
        inst_id = institute_data.loc[institute_data['INS_NM1'] == best_match, 'INSTCD'].values[0]
        return best_match, inst_id
    return None, None

# Function to extract date from text (both Bengali and English)
def extract_date(text):
    # English date patterns
    date_patterns = [
        r'\d{4}-\d{2}-\d{2}', r'\d{2}-\d{2}-\d{4}', r'\d{2}/\d{2}/\d{4}', r'\d{4}/\d{2}/\d{2}',
        r'\d{4}\.\d{2}\.\d{2}', r'\d{2}\.\d{2}\.\d{4}'
    ]
    # Bengali date patterns (translated to their Unicode equivalents)
    bangla_digits = '০১২৩৪৫৬৭৮৯'
    date_patterns += [
        r'[{}]{{4}}-[{}]{{2}}-[{}]{{2}}'.format(bangla_digits, bangla_digits, bangla_digits),
        r'[{}]{{2}}-[{}]{{2}}-[{}]{{4}}'.format(bangla_digits, bangla_digits, bangla_digits),
        r'[{}]{{2}}/[{}]{{2}}/[{}]{{4}}'.format(bangla_digits, bangla_digits, bangla_digits),
        r'[{}]{{4}}/[{}]{{2}}/[{}]{{2}}'.format(bangla_digits, bangla_digits, bangla_digits),
        r'[{}]{{4}}\.[{}]{{2}}\.[{}]{{2}}'.format(bangla_digits, bangla_digits, bangla_digits),
        r'[{}]{{2}}\.[{}]{{2}}\.[{}]{{4}}'.format(bangla_digits, bangla_digits, bangla_digits)
    ]

    for pattern in date_patterns:
        match = re.search(pattern, text)
        if match:
            return match.group(0)
    return None

# Reduce the number of entries for demonstration purposes
reduced_ocr_result_df = new_ocr_result_df.head(20)  # Select only the first 20 entries for demonstration

# Initialize an empty list to store the results
new_data_demo = []

# Initialize the results list
results = []

for index, row in reduced_ocr_result_df.iterrows():
    image_name = row['Image Name']
    extracted_text = row['Extracted Text']
    type_info = row['Type']
    translated_text = translate_text(extracted_text)
    merged_text = extracted_text.upper() + " " + translated_text.upper()

    # Print the preprocessed merged text
    print(f"Preprocessed Merged Text: {merged_text}")

    # Attempt matching using Code3 first
    match_found = process_image_code3(extracted_text, physician_data_df, image_name, results)

    if not match_found:
        # If no match found using Code3, attempt matching using Code2
        match_found = process_image_code2(merged_text, physician_data_df, image_name, results)

        if not match_found:
            # If no match found using Code2, attempt matching using Code1
            match_found = process_image_code1(merged_text, physician_data_df, image_name, results)

            if not match_found:
                print("All codes show No match found\n")

    inst_nm, inst_id = get_institution_info(merged_text, institute_data_df)
    date_info = extract_date(merged_text)

    new_data_demo.append([
        image_name,
        extracted_text,
        [res[2] for res in results if res[1] == image_name],
        [res[3] for res in results if res[1] == image_name],
        inst_nm,
        inst_id,
        type_info,
        date_info
    ])

# Convert the list to a DataFrame
new_df_demo = pd.DataFrame(new_data_demo, columns=[
    "Image Name",
    "Extracted Text",
    "PHY_NM from OCR",
    "PHY_ID",
    "Institution_NM",
    "Institution_ID",
    "Type",
    "Date"
])

# Display the new DataFrame
print(new_df_demo.head())

# Save the new DataFrame to an Excel file
output_path_demo = '/content/Processed_OCR_Data_Demo.xlsx'
new_df_demo.to_excel(output_path_demo, index=False)

# Provide a link to download the file
from google.colab import files
files.download(output_path_demo)


Preprocessed Merged Text: শীতল ক্যান্টনমেন্ট ডায়াগনস্টিক সেন্টার, রংপুর
মেজর (ডাঃ) মোঃ আব্দুল্লাহিস্ সাফি
এমবিবিএস, এফসিপিএস, এমসিপিএস, ডি
থাইরয়েড নেক সার্জারী কানের মাইক্রোসার্জারীতে
উচ্চতর প্রশিক্ষণ প্রাপ্ত (সম্মিলিত সামরিক হাসপাতাল, ঢাকা)
নাক, কনা, গলা, মাথা-ঘাড় রোগ বিশেষজ্ঞজ্ঞ
সহকারী অধ্যাপক, নাক, কান, গলা বিভাগ
এন্ড সার্জন
আমি মেডিকেল কলেজ, রংপুর ও সিএমএইচ রংপুর
মোবাইল: ০১৭৩৭-৩২৮২৮১
CGH
MAJOR (DR) MD. ABDULLAHIS SAFI
MBBS, FCPS, MCPS, DLO
HIGHER TRAINING ON THYROID, NECK & MICRO EAR SURGERY
(COMBINED MILITARY HOSPITAL, DHAKA)
NOSE, EAR, THROAT, HEAD AND NECK SPECIALIST AND SURGEON
ASSISTANT PROFESSOR, DEPARTMENT OF NOSE, EAR, THROAT
ARMY MEDICAL COLLEGE, RANGPUR AND CMH RANGPUR.
MOBILEL 01737-328281
NAME:
MD. MUBARAK HOSSAIN
AGE: 17 Y
SEX: MALE
DATE: 28/03/2024
ID: 28032405
ON EXAMINATIONS
• NASAL AIRWAY IS
CLEAR
RX,
1. CAP. ORCEF 200 MG
১+০+১ টি
• NASAL SUCTION AND
CLEARANCE WAS
DONE
⚫ NASAL SPLINT IS
REMOVED TODAY
DIAGNOSIS
• DNS WITH HIT (OP)
2. TAB. ALCET 5 MG
০ + ০ + ১ টি


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [29]:
# Install necessary packages
!pip install fuzzywuzzy[speedup]
!pip install googletrans==4.0.0-rc1

import pandas as pd
from fuzzywuzzy import fuzz
from googletrans import Translator
import re

# Load the data from the provided file paths
new_ocr_result_path = '/content/Ocr500.xlsx'
physician_data_path = '/content/Extracted_Physician_Data.xlsx'
institute_data_path = '/content/Extracted_Institute_Data.xlsx'

new_ocr_result_df = pd.read_excel(new_ocr_result_path)
physician_data_df = pd.read_excel(physician_data_path)
institute_data_df = pd.read_excel(institute_data_path)

translator = Translator()

# Function to translate Bengali text to English
def translate_text(text):
    try:
        translated = translator.translate(text, src='bn', dest='en')
        return translated.text
    except Exception as e:
        return ""

# Preprocess text for matching
def preprocess_text(text):
    text = text.replace('DR.', 'DR_')  # Temporarily replace 'DR.' to retain it
    text = re.sub(r'\.', '', text)  # Remove all other periods
    text = re.sub(r'[^A-Za-z\s]', '', text)  # Remove non-alphabetic characters
    text = text.replace('DR_', 'DR.')  # Restore 'DR.'
    text = re.sub(r'\s+', ' ', text)  # Normalize whitespace
    return text.strip().upper()  # Convert to uppercase

# Preprocess text for exact matching
def preprocess_text_exact(text):
    text = re.sub(r'\.', '', text)  # Remove all periods
    text = re.sub(r'\s+', '', text)  # Remove all spaces
    text = re.sub(r'[^A-Za-z]', '', text)  # Remove non-alphabetic characters
    return text.strip().upper()  # Convert to uppercase

# New function to process an image using Code3
def process_image_code3(extracted_text, phy_df):
    # Separate Bengali and English text
    bangla_text = ''.join([char for char in extracted_text if '\u0980' <= char <= '\u09FF'])
    english_text = ''.join([char for char in extracted_text if char not in bangla_text])

    # Translate Bengali text to English
    translated_text = translate_text(bangla_text)

    # Remove special characters and spaces
    english_text = re.sub(r'[^A-Za-z\s]', '', english_text).replace(' ', '')
    translated_text = re.sub(r'[^A-Za-z\s]', '', translated_text).replace(' ', '')

    # Merge texts
    merged_text = (english_text + translated_text).upper()
    print(f"Preprocessed Merged Text (Code3): {merged_text}")

    # Use fuzzy matching to find the best match for each physician name
    best_match = None
    best_score = 0
    for index, row in phy_df.iterrows():
        phy_nm = preprocess_text_exact(row['PHY_NM'].upper())
        phy_id = row['PHY_ID']
        match_score = fuzz.partial_ratio(phy_nm, merged_text)
        if match_score > best_score:  # Find the best match
            best_score = match_score
            best_match = (row['PHY_NM'], phy_id)

    if best_score > 80:  # Threshold for fuzzy matching
        print(f"Matched Doctor Name (Code3): {best_match[0]}, ID: {best_match[1]}, Score: {best_score}\n")
        return best_match[0], best_match[1]
    else:
        return None, None

# Function to process an image using Code1
def process_image_code1(merged_string, phy_df):
    preprocessed_merged_string = preprocess_text(merged_string)
    print("preprocessed_merged_string (Code1) =", preprocessed_merged_string)

    # Use fuzzy matching to find the best match for each physician name
    best_match = None
    best_score = 0
    for index, row in phy_df.iterrows():
        phy_nm = preprocess_text(row['PHY_NM'].upper())
        phy_id = row['PHY_ID']
        match_score = fuzz.partial_ratio(phy_nm, preprocessed_merged_string)
        if match_score > best_score:  # Find the best match
            best_score = match_score
            best_match = (row['PHY_NM'], phy_id)

    if best_score > 80:  # Threshold for fuzzy matching
        print(f"Matched Doctor Name (Code1): {best_match[0]}, ID: {best_match[1]}, Score: {best_score}\n")
        return best_match[0], best_match[1]
    else:
        return None, None

# Function to process an image using Code2
def process_image_code2(merged_string, phy_df):
    preprocessed_merged_string = preprocess_text_exact(merged_string)
    print("preprocessed_merged_string (Code2) =", preprocessed_merged_string)

    # Check for exact matches in the physician names
    best_match = None
    best_score = 0
    for index, row in phy_df.iterrows():
        phy_nm = preprocess_text_exact(row['PHY_NM'].upper())
        phy_id = row['PHY_ID']
        if phy_nm in preprocessed_merged_string:
            match_score = fuzz.partial_ratio(phy_nm, preprocessed_merged_string)
            if match_score > best_score:  # Find the best match
                best_score = match_score
                best_match = (row['PHY_NM'], phy_id)

    if best_score > 80:  # Threshold for fuzzy matching
        print(f"Matched Doctor Name (Code2): {best_match[0]}, ID: {best_match[1]}, Score: {best_score}\n")
        return best_match[0], best_match[1]
    else:
        return None, None

# Function to extract Institution_NM and Institution_ID using fuzzy matching
def get_institution_info(text, institute_data):
    best_match = None
    best_score = 0
    for inst_nm in institute_data['INS_NM1']:
        score = fuzz.partial_ratio(inst_nm.upper(), text)
        if score > best_score:
            best_score = score
            best_match = inst_nm
    if best_score > 80:  # Threshold for fuzzy matching
        inst_id = institute_data.loc[institute_data['INS_NM1'] == best_match, 'INSTCD'].values[0]
        return best_match, inst_id
    return None, None

# Function to extract date from text (both Bengali and English)
def extract_date(text):
    # English date patterns
    date_patterns = [
        r'\d{4}-\d{2}-\d{2}', r'\d{2}-\d{2}-\d{4}', r'\d{2}/\d{2}/\d{4}', r'\d{4}/\d{2}/\d{2}',
        r'\d{4}\.\d{2}\.\d{2}', r'\d{2}\.\d{2}\.\d{4}'
    ]
    # Bengali date patterns (translated to their Unicode equivalents)
    bangla_digits = '০১২৩৪৫৬৭৮৯'
    date_patterns += [
        r'[{}]{{4}}-[{}]{{2}}-[{}]{{2}}'.format(bangla_digits, bangla_digits, bangla_digits),
        r'[{}]{{2}}-[{}]{{2}}-[{}]{{4}}'.format(bangla_digits, bangla_digits, bangla_digits),
        r'[{}]{{2}}/[{}]{{2}}/[{}]{{4}}'.format(bangla_digits, bangla_digits, bangla_digits),
        r'[{}]{{4}}/[{}]{{2}}/[{}]{{2}}'.format(bangla_digits, bangla_digits, bangla_digits),
        r'[{}]{{4}}\.[{}]{{2}}\.[{}]{{2}}'.format(bangla_digits, bangla_digits, bangla_digits),
        r'[{}]{{2}}\.[{}]{{2}}\.[{}]{{4}}'.format(bangla_digits, bangla_digits, bangla_digits)
    ]

    for pattern in date_patterns:
        match = re.search(pattern, text)
        if match:
            return match.group(0)
    return None

# Reduce the number of entries for demonstration purposes
reduced_ocr_result_df = new_ocr_result_df.head(50)  # Select only the first 20 entries for demonstration

# Initialize an empty list to store the results
new_data_demo = []

# Initialize the results list
results = []

for index, row in reduced_ocr_result_df.iterrows():
    image_name = row['Image Name']
    extracted_text = row['Extracted Text']
    type_info = row['Type']
    translated_text = translate_text(extracted_text)
    merged_text = extracted_text.upper() + " " + translated_text.upper()

    # # Print the preprocessed merged text
    # print(f"Preprocessed Merged Text: {merged_text}")

    # Attempt matching using Code3 first
    phy_nm, phy_id = process_image_code3(extracted_text, physician_data_df)

    if not phy_nm:
        # If no match found using Code3, attempt matching using Code2
     if not phy_nm:
        # If no match found using Code3, attempt matching using Code2
        phy_nm, phy_id = process_image_code2(merged_text, physician_data_df)

        if not phy_nm:
            # If no match found using Code2, attempt matching using Code1
            phy_nm, phy_id = process_image_code1(merged_text, physician_data_df)

            if not phy_nm:
                print("All codes show No match found\n")

    inst_nm, inst_id = get_institution_info(merged_text, institute_data_df)
    date_info = extract_date(merged_text)

    new_data_demo.append([
        image_name,
        extracted_text,
        phy_nm,
        phy_id,
        inst_nm,
        inst_id,
        type_info,
        date_info
    ])

# Convert the list to a DataFrame
new_df_demo = pd.DataFrame(new_data_demo, columns=[
    "Image Name",
    "Extracted Text",
    "PHY_NM from OCR",
    "PHY_ID",
    "Institution_NM",
    "Institution_ID",
    "Type",
    "Date"
])

# Display the new DataFrame
print(new_df_demo.head())

# Save the new DataFrame to an Excel file
output_path_demo = '/content/Processed_OCR_Data_Demo.xlsx'
new_df_demo.to_excel(output_path_demo, index=False)

# Provide a link to download the file
from google.colab import files
files.download(output_path_demo)


Preprocessed Merged Text (Code3): 









CGH
MAJORDRMDABDULLAHISSAFI
MBBSFCPSMCPSDLO
HIGHERTRAININGONTHYROIDNECKMICROEARSURGERY
COMBINEDMILITARYHOSPITALDHAKA
NOSEEARTHROATHEADANDNECKSPECIALISTANDSURGEON
ASSISTANTPROFESSORDEPARTMENTOFNOSEEARTHROAT
ARMYMEDICALCOLLEGERANGPURANDCMHRANGPUR
MOBILEL
NAME
MDMUBARAKHOSSAIN
AGEY
SEXMALE
DATE
ID
ONEXAMINATIONS
NASALAIRWAYIS
CLEAR
RX
CAPORCEFMG

NASALSUCTIONAND
CLEARANCEWAS
DONE
NASALSPLINTIS
REMOVEDTODAY
DIAGNOSIS
DNSWITHHITOP
TABALCETMG

TABNAPAEXTENDMG

TABSANBURMG

TABCEEVITMG






HOS
MENTHOL




ANTAZOLDROP
X


NEBANOLOINTMENT



ADVICES

CGH


FOLLOWUP

MDABDULLAHISSAFI
AW
POWEREDBYDOCTOR










SHITALPURACMACHARCHANGPURAMOBILEDAYSDAYSDAYSDAYSDAYSDAYSDAYSDAYSDAYSDAYSTODAYSELIMINATINGTHEDAYTOTIMESINTHEDAYTIMEANDTHENEXTDAYOFTHEDAYTHENEXTDAYMOBILMOBILEMOBILEPMONAM
Matched Doctor Name (Code3): DR. MD ABDULLAHIS SAFI, ID: RNG25655, Score: 100

Preprocessed Merged Text (Code3): IBNSINA
IBNSINAMEDICALCOLLEGEHOSPITAL
BKALYANPURDHAKAADMINISTR

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [30]:
# Install necessary packages
!pip install fuzzywuzzy[speedup]
!pip install googletrans==4.0.0-rc1

import pandas as pd
from fuzzywuzzy import fuzz
from googletrans import Translator
import re

# Load the data from the provided file paths
new_ocr_result_path = '/content/Ocr500.xlsx'
physician_data_path = '/content/Extracted_Physician_Data.xlsx'
institute_data_path = '/content/Extracted_Institute_Data.xlsx'

new_ocr_result_df = pd.read_excel(new_ocr_result_path)
physician_data_df = pd.read_excel(physician_data_path)
institute_data_df = pd.read_excel(institute_data_path)

translator = Translator()

# Function to translate Bengali text to English
def translate_text(text):
    try:
        translated = translator.translate(text, src='bn', dest='en')
        return translated.text
    except Exception as e:
        return ""

# Preprocess text for matching
def preprocess_text(text):
    text = text.replace('DR.', 'DR_')  # Temporarily replace 'DR.' to retain it
    text = re.sub(r'\.', '', text)  # Remove all other periods
    text = re.sub(r'[^A-Za-z\s]', '', text)  # Remove non-alphabetic characters
    text = text.replace('DR_', 'DR.')  # Restore 'DR.'
    text = re.sub(r'\s+', ' ', text)  # Normalize whitespace
    return text.strip().upper()  # Convert to uppercase

# Preprocess text for exact matching
def preprocess_text_exact(text):
    text = re.sub(r'\.', '', text)  # Remove all periods
    text = re.sub(r'\s+', '', text)  # Remove all spaces
    text = re.sub(r'[^A-Za-z]', '', text)  # Remove non-alphabetic characters
    return text.strip().upper()  # Convert to uppercase

# New function to process an image using Code3
def process_image_code3(extracted_text, phy_df):
    # Separate Bengali and English text
    bangla_text = ''.join([char for char in extracted_text if '\u0980' <= char <= '\u09FF'])
    english_text = ''.join([char for char in extracted_text if char not in bangla_text])

    # Translate Bengali text to English
    translated_text = translate_text(bangla_text)

    # Remove special characters and spaces
    english_text = re.sub(r'[^A-Za-z\s]', '', english_text).replace(' ', '')
    translated_text = re.sub(r'[^A-Za-z\s]', '', translated_text).replace(' ', '')

    # Merge texts
    merged_text = (english_text + translated_text).upper()
    print(f"Preprocessed Merged Text (Code3): {merged_text}")

    # Use fuzzy matching to find the best match for each physician name
    best_match = None
    best_score = 0
    for index, row in phy_df.iterrows():
        phy_nm = preprocess_text_exact(row['PHY_NM'].upper())
        phy_id = row['PHY_ID']
        match_score = fuzz.partial_ratio(phy_nm, merged_text)
        if match_score > best_score:  # Find the best match
            best_score = match_score
            best_match = (row['PHY_NM'], phy_id)

    if best_score > 80:  # Threshold for fuzzy matching
        print(f"Matched Doctor Name (Code3): {best_match[0]}, ID: {best_match[1]}, Score: {best_score}\n")
        return best_match[0], best_match[1]
    else:
        return None, None

# Function to process an image using Code1
def process_image_code1(merged_string, phy_df):
    preprocessed_merged_string = preprocess_text(merged_string)
    print("preprocessed_merged_string (Code1) =", preprocessed_merged_string)

    # Use fuzzy matching to find the best match for each physician name
    best_match = None
    best_score = 0
    for index, row in phy_df.iterrows():
        phy_nm = preprocess_text(row['PHY_NM'].upper())
        phy_id = row['PHY_ID']
        match_score = fuzz.partial_ratio(phy_nm, preprocessed_merged_string)
        if match_score > best_score:  # Find the best match
            best_score = match_score
            best_match = (row['PHY_NM'], phy_id)

    if best_score > 80:  # Threshold for fuzzy matching
        print(f"Matched Doctor Name (Code1): {best_match[0]}, ID: {best_match[1]}, Score: {best_score}\n")
        return best_match[0], best_match[1]
    else:
        return None, None

# Function to process an image using Code2
def process_image_code2(merged_string, phy_df):
    preprocessed_merged_string = preprocess_text_exact(merged_string)
    print("preprocessed_merged_string (Code2) =", preprocessed_merged_string)

    # Check for exact matches in the physician names
    best_match = None
    best_score = 0
    for index, row in phy_df.iterrows():
        phy_nm = preprocess_text_exact(row['PHY_NM'].upper())
        phy_id = row['PHY_ID']
        if phy_nm in preprocessed_merged_string:
            match_score = fuzz.partial_ratio(phy_nm, preprocessed_merged_string)
            if match_score > best_score:  # Find the best match
                best_score = match_score
                best_match = (row['PHY_NM'], phy_id)

    if best_score > 80:  # Threshold for fuzzy matching
        print(f"Matched Doctor Name (Code2): {best_match[0]}, ID: {best_match[1]}, Score: {best_score}\n")
        return best_match[0], best_match[1]
    else:
        return None, None

# Function to extract Institution_NM and Institution_ID using fuzzy matching
def get_institution_info(text, institute_data):
    best_match = None
    best_score = 0
    for inst_nm in institute_data['INS_NM1']:
        score = fuzz.partial_ratio(inst_nm.upper(), text)
        if score > best_score:
            best_score = score
            best_match = inst_nm
    if best_score > 80:  # Threshold for fuzzy matching
        inst_id = institute_data.loc[institute_data['INS_NM1'] == best_match, 'INSTCD'].values[0]
        return best_match, inst_id
    return None, None

# Function to extract date from text (both Bengali and English)
def extract_date(text):
    # English date patterns
    date_patterns = [
        r'\d{4}-\d{2}-\d{2}', r'\d{2}-\d{2}-\d{4}', r'\d{2}/\d{2}/\d{4}', r'\d{4}/\d{2}/\d{2}',
        r'\d{4}\.\d{2}\.\d{2}', r'\d{2}\.\d{2}\.\d{4}'
    ]
    # Bengali date patterns (translated to their Unicode equivalents)
    bangla_digits = '০১২৩৪৫৬৭৮৯'
    date_patterns += [
        r'[{}]{{4}}-[{}]{{2}}-[{}]{{2}}'.format(bangla_digits, bangla_digits, bangla_digits),
        r'[{}]{{2}}-[{}]{{2}}-[{}]{{4}}'.format(bangla_digits, bangla_digits, bangla_digits),
        r'[{}]{{2}}/[{}]{{2}}/[{}]{{4}}'.format(bangla_digits, bangla_digits, bangla_digits),
        r'[{}]{{4}}/[{}]{{2}}/[{}]{{2}}'.format(bangla_digits, bangla_digits, bangla_digits),
        r'[{}]{{4}}\.[{}]{{2}}\.[{}]{{2}}'.format(bangla_digits, bangla_digits, bangla_digits),
        r'[{}]{{2}}\.[{}]{{2}}\.[{}]{{4}}'.format(bangla_digits, bangla_digits, bangla_digits)
    ]

    for pattern in date_patterns:
        match = re.search(pattern, text)
        if match:
            return match.group(0)
    return None

# Reduce the number of entries for demonstration purposes
reduced_ocr_result_df = new_ocr_result_df.head(20)  # Select only the first 20 entries for demonstration

# Initialize an empty list to store the results
new_data_demo = []

# Initialize the results list
results = []

for index, row in reduced_ocr_result_df.iterrows():
    image_name = row['Image Name']
    extracted_text = row['Extracted Text']
    type_info = row['Type']
    translated_text = translate_text(extracted_text)
    merged_text = extracted_text.upper() + " " + translated_text.upper()

    # Print the preprocessed merged text
    print(f"Preprocessed Merged Text: {merged_text}")

    # Attempt matching using Code3 first
    phy_nm, phy_id = process_image_code3(extracted_text, physician_data_df)

    if not phy_nm:
        # If no match found using Code3, attempt
     if not phy_nm:
        # If no match found using Code3, attempt matching using Code2
        phy_nm, phy_id = process_image_code2(merged_text, physician_data_df)

        if not phy_nm:
            # If no match found using Code2, attempt matching using Code1
            phy_nm, phy_id = process_image_code1(merged_text, physician_data_df)

            if not phy_nm:
                print("All codes show No match found\n")

    inst_nm, inst_id = get_institution_info(merged_text, institute_data_df)
    date_info = extract_date(extracted_text)  # Search for dates directly in the main "Extracted Text"

    new_data_demo.append([
        image_name,
        extracted_text,
        phy_nm,
        phy_id,
        inst_nm,
        inst_id,
        type_info,
        date_info
    ])

# Convert the list to a DataFrame
new_df_demo = pd.DataFrame(new_data_demo, columns=[
    "Image Name",
    "Extracted Text",
    "PHY_NM from OCR",
    "PHY_ID",
    "Institution_NM",
    "Institution_ID",
    "Type",
    "Date"
])

# Display the new DataFrame
print(new_df_demo.head())

# Save the new DataFrame to an Excel file
output_path_demo = '/content/Processed_OCR_Data_Demo.xlsx'
new_df_demo.to_excel(output_path_demo, index=False)

# Provide a link to download the file
from google.colab import files
files.download(output_path_demo)


Preprocessed Merged Text: শীতল ক্যান্টনমেন্ট ডায়াগনস্টিক সেন্টার, রংপুর
মেজর (ডাঃ) মোঃ আব্দুল্লাহিস্ সাফি
এমবিবিএস, এফসিপিএস, এমসিপিএস, ডি
থাইরয়েড নেক সার্জারী কানের মাইক্রোসার্জারীতে
উচ্চতর প্রশিক্ষণ প্রাপ্ত (সম্মিলিত সামরিক হাসপাতাল, ঢাকা)
নাক, কনা, গলা, মাথা-ঘাড় রোগ বিশেষজ্ঞজ্ঞ
সহকারী অধ্যাপক, নাক, কান, গলা বিভাগ
এন্ড সার্জন
আমি মেডিকেল কলেজ, রংপুর ও সিএমএইচ রংপুর
মোবাইল: ০১৭৩৭-৩২৮২৮১
CGH
MAJOR (DR) MD. ABDULLAHIS SAFI
MBBS, FCPS, MCPS, DLO
HIGHER TRAINING ON THYROID, NECK & MICRO EAR SURGERY
(COMBINED MILITARY HOSPITAL, DHAKA)
NOSE, EAR, THROAT, HEAD AND NECK SPECIALIST AND SURGEON
ASSISTANT PROFESSOR, DEPARTMENT OF NOSE, EAR, THROAT
ARMY MEDICAL COLLEGE, RANGPUR AND CMH RANGPUR.
MOBILEL 01737-328281
NAME:
MD. MUBARAK HOSSAIN
AGE: 17 Y
SEX: MALE
DATE: 28/03/2024
ID: 28032405
ON EXAMINATIONS
• NASAL AIRWAY IS
CLEAR
RX,
1. CAP. ORCEF 200 MG
১+০+১ টি
• NASAL SUCTION AND
CLEARANCE WAS
DONE
⚫ NASAL SPLINT IS
REMOVED TODAY
DIAGNOSIS
• DNS WITH HIT (OP)
2. TAB. ALCET 5 MG
০ + ০ + ১ টি


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [33]:
# Install necessary packages
!pip install fuzzywuzzy[speedup]
!pip install googletrans==4.0.0-rc1

import pandas as pd
from fuzzywuzzy import fuzz
from googletrans import Translator
import re

# Load the data from the provided file paths
new_ocr_result_path = '/content/Ocr500.xlsx'
physician_data_path = '/content/Extracted_Physician_Data.xlsx'
institute_data_path = '/content/Extracted_Institute_Data.xlsx'

new_ocr_result_df = pd.read_excel(new_ocr_result_path)
physician_data_df = pd.read_excel(physician_data_path)
institute_data_df = pd.read_excel(institute_data_path)

translator = Translator()

# Function to translate Bengali text to English
def translate_text(text):
    try:
        translated = translator.translate(text, src='bn', dest='en')
        return translated.text
    except Exception as e:
        return ""

# Preprocess text for matching
def preprocess_text(text):
    text = text.replace('DR.', 'DR_')  # Temporarily replace 'DR.' to retain it
    text = re.sub(r'\.', '', text)  # Remove all other periods
    text = re.sub(r'[^A-Za-z\s]', '', text)  # Remove non-alphabetic characters
    text = text.replace('DR_', 'DR.')  # Restore 'DR.'
    text = re.sub(r'\s+', ' ', text)  # Normalize whitespace
    return text.strip().upper()  # Convert to uppercase

# Preprocess text for exact matching
def preprocess_text_exact(text):
    text = re.sub(r'\.', '', text)  # Remove all periods
    text = re.sub(r'\s+', '', text)  # Remove all spaces
    text = re.sub(r'[^A-Za-z]', '', text)  # Remove non-alphabetic characters
    return text.strip().upper()  # Convert to uppercase

# New function to process an image using Code3
def process_image_code3(extracted_text, phy_df):
    # Separate Bengali and English text
    bangla_text = ''.join([char for char in extracted_text if '\u0980' <= char <= '\u09FF'])
    english_text = ''.join([char for char in extracted_text if char not in bangla_text])

    # Translate Bengali text to English
    translated_text = translate_text(bangla_text)

    # Remove special characters and spaces
    english_text = re.sub(r'[^A-Za-z\s]', '', english_text).replace(' ', '')
    translated_text = re.sub(r'[^A-Za-z\s]', '', translated_text).replace(' ', '')

    # Merge texts
    merged_text = (english_text + translated_text).upper()
    print(f"Preprocessed Merged Text (Code3): {merged_text}")

    # Use fuzzy matching to find the best match for each physician name
    best_match = None
    best_score = 0
    for index, row in phy_df.iterrows():
        phy_nm = preprocess_text_exact(row['PHY_NM'].upper())
        phy_id = row['PHY_ID']
        match_score = fuzz.partial_ratio(phy_nm, merged_text)
        if match_score > best_score:  # Find the best match
            best_score = match_score
            best_match = (row['PHY_NM'], phy_id)

    if best_score > 80:  # Threshold for fuzzy matching
        print(f"Matched Doctor Name (Code3): {best_match[0]}, ID: {best_match[1]}, Score: {best_score}\n")
        return best_match[0], best_match[1]
    else:
        return None, None

# Function to process an image using Code1
def process_image_code1(merged_string, phy_df):
    preprocessed_merged_string = preprocess_text(merged_string)
    print("preprocessed_merged_string (Code1) =", preprocessed_merged_string)

    # Use fuzzy matching to find the best match for each physician name
    best_match = None
    best_score = 0
    for index, row in phy_df.iterrows():
        phy_nm = preprocess_text(row['PHY_NM'].upper())
        phy_id = row['PHY_ID']
        match_score = fuzz.partial_ratio(phy_nm, preprocessed_merged_string)
        if match_score > best_score:  # Find the best match
            best_score = match_score
            best_match = (row['PHY_NM'], phy_id)

    if best_score > 80:  # Threshold for fuzzy matching
        print(f"Matched Doctor Name (Code1): {best_match[0]}, ID: {best_match[1]}, Score: {best_score}\n")
        return best_match[0], best_match[1]
    else:
        return None, None

# Function to process an image using Code2
def process_image_code2(merged_string, phy_df):
    preprocessed_merged_string = preprocess_text_exact(merged_string)
    print("preprocessed_merged_string (Code2) =", preprocessed_merged_string)

    # Check for exact matches in the physician names
    best_match = None
    best_score = 0
    for index, row in phy_df.iterrows():
        phy_nm = preprocess_text_exact(row['PHY_NM'].upper())
        phy_id = row['PHY_ID']
        if phy_nm in preprocessed_merged_string:
            match_score = fuzz.partial_ratio(phy_nm, preprocessed_merged_string)
            if match_score > best_score:  # Find the best match
                best_score = match_score
                best_match = (row['PHY_NM'], phy_id)

    if best_score > 80:  # Threshold for fuzzy matching
        print(f"Matched Doctor Name (Code2): {best_match[0]}, ID: {best_match[1]}, Score: {best_score}\n")
        return best_match[0], best_match[1]
    else:
        return None, None

# Function to extract Institution_NM and Institution_ID using fuzzy matching
def get_institution_info(text, institute_data):
    best_match = None
    best_score = 0
    for inst_nm in institute_data['INS_NM1']:
        score = fuzz.partial_ratio(inst_nm.upper(), text)
        if score > best_score:
            best_score = score
            best_match = inst_nm
    if best_score > 80:  # Threshold for fuzzy matching
        inst_id = institute_data.loc[institute_data['INS_NM1'] == best_match, 'INSTCD'].values[0]
        return best_match, inst_id
    return None, None

# Function to extract date from text (both Bengali and English)
def extract_date(text):
    # English date patterns
    date_patterns = [
        r'\d{4}-\d{2}-\d{2}', r'\d{2}-\d{2}-\d{4}', r'\d{2}/\d{2}/\d{4}', r'\d{4}/\d{2}/\d{2}',
        r'\d{4}\.\d{2}\.\d{2}', r'\d{2}\.\d{2}\.\d{4}'
    ]
    # Bengali date patterns (translated to their Unicode equivalents)
    bangla_digits = '০১২৩৪৫৬৭৮৯'
    date_patterns += [
        r'[{}]{{4}}-[{}]{{2}}-[{}]{{2}}'.format(bangla_digits, bangla_digits, bangla_digits),
        r'[{}]{{2}}-[{}]{{2}}-[{}]{{4}}'.format(bangla_digits, bangla_digits, bangla_digits),
        r'[{}]{{2}}/[{}]{{2}}/[{}]{{4}}'.format(bangla_digits, bangla_digits, bangla_digits),
        r'[{}]{{4}}/[{}]{{2}}/[{}]{{2}}'.format(bangla_digits, bangla_digits, bangla_digits),
        r'[{}]{{4}}\.[{}]{{2}}\.[{}]{{2}}'.format(bangla_digits, bangla_digits, bangla_digits),
        r'[{}]{{2}}\.[{}]{{2}}\.[{}]{{4}}'.format(bangla_digits, bangla_digits, bangla_digits)
    ]

    for pattern in date_patterns:
        match = re.search(pattern, text)
        if match:
            return match.group(0)
    return None

# Reduce the number of entries for demonstration purposes
reduced_ocr_result_df = new_ocr_result_df.head(522)  # Select only the first 20 entries for demonstration

# Initialize an empty list to store the results
new_data_demo = []

# Initialize the results list
results = []

for index, row in reduced_ocr_result_df.iterrows():
    image_name = row['Image Name']
    extracted_text = row['Extracted Text']
    type_info = row['Type']
    translated_text = translate_text(extracted_text)
    merged_text = extracted_text.upper() + " " + translated_text.upper()

    # Print the preprocessed merged text
    print(f"Preprocessed Merged Text: {merged_text}")

    # Attempt matching using Code3 first
    phy_nm, phy_id = process_image_code3(extracted_text, physician_data_df)

    if not phy_nm:
        # If no
     if not phy_nm:
        # If no match found using Code3, attempt matching using Code2
        phy_nm, phy_id = process_image_code2(merged_text, physician_data_df)

        if not phy_nm:
            # If no match found using Code2, attempt matching using Code1
            phy_nm, phy_id = process_image_code1(merged_text, physician_data_df)

            if not phy_nm:
                print("All codes show No match found\n")

    inst_nm, inst_id = get_institution_info(merged_text, institute_data_df)
    date_info = extract_date(extracted_text)  # Search for dates directly in the main "Extracted Text"

    new_data_demo.append([
        image_name,
        extracted_text,
        phy_nm,
        phy_id,
        inst_nm,
        inst_id,
        type_info,
        date_info
    ])

# Convert the list to a DataFrame
new_df_demo = pd.DataFrame(new_data_demo, columns=[
    "Image Name",
    "Extracted Text",
    "PHY_NM from OCR",
    "PHY_ID",
    "Institution_NM",
    "Institution_ID",
    "Type",
    "Date"
])

# Display the new DataFrame
print(new_df_demo.head())

# Save the new DataFrame to an Excel file
output_path_demo = '/content/Processed_OCR_Data_Demo.xlsx'
new_df_demo.to_excel(output_path_demo, index=False)

# Provide a link to download the file
from google.colab import files
files.download(output_path_demo)


[1;30;43mStreaming output truncated to the last 5000 lines.[0m






RX

TABESONIAMG
J

TABFERADINMG

TABCOSTAB




BANGLADESHFORMANGBEDSYEAROLDDISEASEACENTENARY
preprocessed_merged_string (Code2) = RXTABESONIAMGJTABFERADINMGTABCOSTABBANGLADESHFORMNOBEDOUTPATIENTMONEYHOSPITALCENTERREGNUMBERDATEACNEAGEMALEFEMALEDATERXTREATMENTTABESONIAMGJDAYSTABFERADINMGDAYSTABCOSTABDAYSNOBABACOVETEATINGFTANGYINGBNONMDRSCROREPRINTDESLENO
preprocessed_merged_string (Code1) = RX TAB ESONIA MG J TAB FERADIN MG TAB COSTA B BANGLADESH FORM NO BED OUTPATIENT MONEY HOSPITALCENTER REG NUMBER DATE ACNE AGE MALEFEMALE DATE RX TREATMENT TAB ESONIA MG J DAYS TAB FERADIN MG DAYS TABCOSTA B DAYS NO BABACOVETEATINGF TANG YING B NONMD RS CRORE PRINT DESLE NO
All codes show No match found

Preprocessed Merged Text: হাসপাতাল/ কো
CEFEN WWW
माघ
B
HAND
PAIN
বহির্বিভাগীয় রোগীর টিকিট
17893618/28
01724077
P
FOFARANT
16. DICLOFEN
WWW/F
22
(ENG)
50
CUP. FINIX 20
পদিন
दित
YOU, OSTOCALES
DICLOGEL
দৈনিক ৩
৪ ৰাৱ
-83/VB-8080, 35-


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>