In [3]:
import shutil
import pathlib
import zipfile
import os

def extract_images_from_excel(path, output_folder_name='extracted_images'):
    """Extracts images from an Excel file and stores them in a single folder.

    An ``.xlsx`` file is a zip archive; this function copies every archive
    member found under ``xl/media/`` into
    ``<workbook folder>/<output_folder_name>/``. The members are streamed
    directly out of the archive, so no scratch directory is created or removed
    next to the workbook.

    Each copy is named ``<workbook stem>-<n><ext>``, where ``n`` counts from 1
    in sorted archive-name order and ``ext`` is the member's own extension
    (``.png`` is used only when the member has none).

    Args:
        path (pathlib.Path or str): Excel file path.
        output_folder_name (str): Name of the folder to store the extracted
            images. Defaults to 'extracted_images'.

    Returns:
        new_paths (list[pathlib.Path]): List of paths to the extracted images.
            Empty when the archive has no ``xl/media/`` members.

    Raises:
        ValueError: If ``path`` does not end in ``.xlsx`` (case-insensitive).
    """
    path = pathlib.Path(path)

    # Compare case-insensitively so "REPORT.XLSX" is accepted as well.
    if path.suffix.lower() != '.xlsx':
        raise ValueError('Path must be an xlsx file')

    name = path.stem  # filename without the extension

    # Create the destination folder next to the workbook.
    output_folder = path.parent / output_folder_name
    output_folder.mkdir(exist_ok=True)

    media_prefix = 'xl/media/'
    new_paths = []

    with zipfile.ZipFile(path, 'r') as zip_ref:
        # Sort so the numbering does not depend on archive/filesystem order;
        # names ending in "/" are directory entries, not files.
        media_members = sorted(
            member for member in zip_ref.namelist()
            if member.startswith(media_prefix) and not member.endswith('/')
        )

        for image_index, member in enumerate(media_members, start=1):
            # Keep the real extension: embedded media is not always PNG.
            suffix = pathlib.PurePosixPath(member).suffix or '.png'
            new_path = output_folder / f'{name}-{image_index}{suffix}'
            with zip_ref.open(member) as source, open(new_path, 'wb') as target:
                shutil.copyfileobj(source, target)
            new_paths.append(new_path)

    return new_paths

In [4]:
# Workbook to pull embedded images from (absolute path on the author's
# machine; adjust when running elsewhere).
excel_file_path = "C:/Users/Shreshtha/Downloads/Project UHC/Trial 2.xlsx"
# Copy the workbook's images into the default 'extracted_images' folder next
# to the workbook and keep the list of resulting image paths.
extracted_image_paths = extract_images_from_excel(excel_file_path)

In [7]:
import cv2
import pytesseract
from PIL import Image
import os

# Set the path to the Tesseract executable that pytesseract should invoke
# (a Windows install location; adjust when running on another machine).
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

# Path to the directory containing the images to OCR.
# NOTE(review): this looks like the default output folder that
# extract_images_from_excel creates beside "Project UHC/Trial 2.xlsx" —
# confirm the two stay in sync if either path changes.
images_directory = "C:/Users/Shreshtha/Downloads/Project UHC/extracted_images"

# Function to OCR an image after OpenCV binarisation
def extract_tabular_data(image_path, threshold=128):
    """Run OCR on an image after converting it to an inverted binary image.

    The image is loaded with OpenCV, converted to grayscale, binarised with an
    inverted fixed threshold (pixels brighter than ``threshold`` become 0, all
    others become 255) and handed to Tesseract through pytesseract.

    Args:
        image_path (str or os.PathLike): Path of the image file to read.
        threshold (int): Grayscale cut-off used for the binarisation.
            Defaults to 128.

    Returns:
        str: The recognised text with leading/trailing whitespace removed.

    Raises:
        FileNotFoundError: If ``image_path`` is not an existing file.
        ValueError: If the file exists but OpenCV could not load it.
    """
    image_path = os.fspath(image_path)

    # cv2.imread reports failure by returning None rather than raising, which
    # would otherwise surface as an opaque error inside cvtColor.
    img = cv2.imread(image_path)
    if img is None:
        if not os.path.isfile(image_path):
            raise FileNotFoundError(f"Image file not found: {image_path}")
        raise ValueError(f"OpenCV could not read image: {image_path}")

    # Convert the image to grayscale
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # Fixed-level inverted threshold to obtain a binary image
    _, binary = cv2.threshold(gray, threshold, 255, cv2.THRESH_BINARY_INV)

    # Use pytesseract to extract text from the binary image
    text_data = pytesseract.image_to_string(Image.fromarray(binary))

    return text_data.strip()

# Dictionary to store text with filenames as keys
text_dict = {}

# File extensions treated as images (compared case-insensitively below)
IMAGE_EXTENSIONS = ('.png', '.jpg', '.jpeg')

# NOTE(review): extract_text_from_image is not defined in any cell shown in
# this notebook; it must already exist in the kernel (or be defined in a cell
# above) for this loop to run after Restart & Run All — confirm.

# Iterate through each image in the directory. os.listdir returns names in
# arbitrary order, so sort them to make the output order reproducible.
for filename in sorted(os.listdir(images_directory)):
    # Lower-case the name so files such as "SCAN.PNG" are not skipped
    if not filename.lower().endswith(IMAGE_EXTENSIONS):
        continue

    image_path = os.path.join(images_directory, filename)

    # Extract regular text
    text_data_regular = extract_text_from_image(image_path)

    # Extract tabular data
    text_data_tabular = extract_tabular_data(image_path)

    # Combine the extracted text
    combined_text = f"Regular Text:\n{text_data_regular}\n\nTabular Data:\n{text_data_tabular}"

    text_dict[filename] = combined_text

# Print or further process the organized text data
for filename, text_data in text_dict.items():
    print(f"Text extracted from {filename}:\n{text_data}\n")


Text extracted from Trial 2-1.png:
Regular Text:
Group Heme: Aurora Products
Group Number (or TBD): 1264561
Request Date: 7214/2020
Effective Date of NSB: 31/2020

Please Note: All tems isied on this form are subject to review, Please highlight the NSB's in yellow,

Benefits Currently in Place Requested Benefits

Current Plan 4: Current Plan 2:
0100001 BUY UP 0100002 CORE

Base (Similar standard Plan) Tracking ID # (REQUIRED)
+ Reach out to your Underwriter

+ 4 14
# of Employees Enrolled: 39 2 25 % 26 15
Group State (NY, NJ, CT) cr cr cr. cr cr cr
Market (Large or Small Group) Large Large Large Large Large Large
Product (e.9: HHO, POS, Classic, Access, Direct, OxUSA, Value Option
or EPO) HMO Huo HSA Huo HMO HSA
Access (Gated or Non-Gated) Non-Gated Gated Non-Gated Non-Gated Gates Non-Gated
Network (Freedom, Liberty, or Choice Pius) Freedom Freedom Freedom Freedom Freedom Freedom
Current Carrier (Oxford or Competitor) Polaris Polaris Polaris Polaris

Nore INE
PCPISpecialist OV Copay 30

In [6]:
import pandas as pd
from openpyxl import load_workbook

# Path to the Excel workbook
# NOTE(review): unlike the path used for image extraction earlier in this
# notebook, there is no "/" between "Project UHC" and "Trial 2.xlsx" — confirm
# this points at the intended workbook.
excel_file_path = "C:/Users/Shreshtha/Downloads/Project UHCTrial 2.xlsx"

# Open the workbook only to list its sheets. A read-only openpyxl workbook
# keeps the underlying file open until close() is called, so release it as
# soon as the names are captured (sheet names remain available afterwards,
# but cell data can no longer be read from `workbook`).
workbook = load_workbook(excel_file_path, read_only=True, data_only=True)
try:
    sheet_names = list(workbook.sheetnames)
finally:
    workbook.close()

# Dictionary of data frames with sheet names as keys. Passing the list of
# names makes pandas parse the file once and return {sheet_name: DataFrame},
# instead of re-opening the workbook for every sheet.
data_frames_dict = pd.read_excel(excel_file_path, sheet_name=sheet_names, header=None)

# Access and print or further process the organized data
for sheet_name, df in data_frames_dict.items():
    print(f"Data from {sheet_name}:\n{df}\n")


Data from Sheet1:
                                                    0  \
0                                         Group Name:   
1                            2 Group Number (or TBD):   
2                                     3 Request Date:   
3                               Effective Date of NSB   
4   Please Note: All tersisted on this form are su...   
5                                                 NaN   
6                                                 NaN   
7                             $ General exformatiqued   
8                                      CSP (Optional)   
9   Base (Similar standard Plan) Tracking ID# (REQ...   
10                            # of Employees Enrolled   
11                           Group State (NY, NJ, CT)   
12                      Market (Large or Small Group)   
13  Product (eg HMO, POS, Classic Access, Drect, O...   
14                        Access (Gated or Non-Gated)   
15          Network Freedom, Liberty, or Choice Plus)   
16           