### 1. Extract the embedded images from the Excel workbook and store them in a folder.

In [2]:
import shutil # Used to copy the images to new path and to remove the temp directory
import pathlib # Helps in making the paths consistent and helps to join different directories or folders for transferring data
import zipfile # Works when excel files are zipped
import os # Helps to interact with the operating system

def extract_images_from_excel(path, output_folder_name='extracted_images_again'):
    """
    Extracts the embedded images from an Excel file into a single folder.

    An .xlsx file is a zip archive whose embedded pictures live under
    'xl/media/'. Those members are streamed straight out of the archive, so
    no scratch directory is created (or deleted) next to the workbook.

    Args:
        path (pathlib.Path or str): Excel file path.
        output_folder_name (str): Name of the folder, created next to the
            workbook, that receives the extracted images.
            Defaults to 'extracted_images_again'.

    Returns:
        new_paths (list[pathlib.Path]): Paths to the extracted images, named
            '<workbook stem>-<index>.png' with the index starting at 1.

    Raises:
        ValueError: If the path does not have an '.xlsx' extension.
        zipfile.BadZipFile: If the file is not a valid zip archive.
    """
    path = pathlib.Path(path)

    # Accept '.XLSX' as well as '.xlsx'
    if path.suffix.lower() != '.xlsx':
        raise ValueError('Path must be an xlsx file')

    # Filename without the extension, used as the prefix of every image name
    name = path.stem

    # Create the folder for the extracted images if it doesn't exist
    output_folder = path.parent / output_folder_name
    output_folder.mkdir(parents=True, exist_ok=True)

    media_prefix = 'xl/media/'
    new_paths = []  # Paths of the extracted images, in extraction order

    with zipfile.ZipFile(path, 'r') as zip_ref:
        # Sort the member names so the numbering is the same on every run
        media_members = sorted(
            info.filename
            for info in zip_ref.infolist()
            if info.filename.startswith(media_prefix) and not info.is_dir()
        )

        for image_index, member in enumerate(media_members, start=1):
            # NOTE: every image is saved with a '.png' suffix regardless of its
            # original format; this naming is kept so existing file names and
            # downstream extension filters keep working.
            new_path = output_folder / f'{name}-{image_index}.png'

            # Stream the image bytes from the archive to the destination file
            with zip_ref.open(member) as source, open(new_path, 'wb') as target:
                shutil.copyfileobj(source, target)

            new_paths.append(new_path)

    # Return the list of paths to the extracted images
    return new_paths

In [3]:
# Workbook to pull the embedded images from (absolute path on the author's machine).
excel_file_path = "C:/Users/Shreshtha/Downloads/Project UHC/Trial 2.xlsx"
# No output_folder_name is passed, so the images go to the function's default
# 'extracted_images_again' folder next to the workbook.
extracted_image_paths = extract_images_from_excel(excel_file_path)

In [5]:
# Path to the directory containing the extracted images (the default output
# folder of extract_images_from_excel, next to the workbook).
# NOTE(review): not referenced again in the visible cells; the clustering cell
# defines its own input_folder_path with the same value.
images_directory = "C:/Users/Shreshtha/Downloads/Project UHC/extracted_images_again"

### 2. Use K-Means clustering to group similar-looking images and move each group into its own subfolder.

In [6]:
import cv2
import os
import shutil
import numpy as np
from sklearn.cluster import KMeans

def resize_image(image, target_size=(300, 300)):
    """
    Resize an image to ``target_size`` with ``cv2.resize``.

    Args:
        image: Image array as returned by ``cv2.imread``, or None.
        target_size (tuple): Size passed to ``cv2.resize``.
            Defaults to (300, 300).

    Returns:
        The resized image. A None or empty (size 0) input is handed back
        unchanged, and None is returned if resizing raises.
    """
    try:
        # Nothing to resize: pass None / empty arrays straight through
        if image is None or image.size == 0:
            return image
        return cv2.resize(image, target_size)
    except Exception as e:
        print(f"Error resizing image: {e}")
        return None

def extract_features(image_path, target_size=(300, 300)):
    """
    Build a flat grayscale feature vector for one image file.

    The image is read with ``cv2.imread``, resized through ``resize_image``,
    converted from BGR to grayscale and flattened to one dimension.

    Args:
        image_path (str): Path of the image to read.
        target_size (tuple): Size forwarded to ``resize_image``.
            Defaults to (300, 300).

    Returns:
        The flattened grayscale array, or None when the image could not be
        loaded/resized or any step raised.
    """
    try:
        resized = resize_image(cv2.imread(image_path), target_size)

        # resize_image hands back None when there was no image to work with
        if resized is None:
            return None

        grayscale = cv2.cvtColor(resized, cv2.COLOR_BGR2GRAY)
        # Collapse the 2D grayscale image into a 1D vector
        return grayscale.flatten()
    except Exception as e:
        print(f"Error extracting features: {e}")
        return None

def group_similar_images_kmeans(input_folder, output_folder, num_clusters=5):
    """
    Cluster the images in ``input_folder`` with k-means and move each one
    into a 'Cluster_<n>' subfolder of ``output_folder``.

    Args:
        input_folder (str): Folder holding the .jpg/.jpeg/.png images.
        output_folder (str): Folder that receives the 'Cluster_1' ..
            'Cluster_<num_clusters>' subfolders.
        num_clusters (int): Number of k-means clusters. Defaults to 5.

    Returns:
        None. Images are moved (not copied) out of ``input_folder``. Any
        exception is caught and reported with a printed message.
    """
    try:
        # Sorted so the processing order does not depend on the filesystem;
        # the extension check is case-insensitive so '.PNG' is picked up too.
        image_files = sorted(
            f for f in os.listdir(input_folder)
            if f.lower().endswith(('.jpg', '.png', '.jpeg'))
        )

        # Keep file names and feature vectors in lockstep: an image whose
        # features could not be extracted must not shift the labels of the
        # images that follow it.
        clustered_files = []
        feature_vectors = []

        for img_file in image_files:
            features = extract_features(os.path.join(input_folder, img_file))
            if features is not None:
                clustered_files.append(img_file)
                feature_vectors.append(features)
            else:
                print(f"Skipping unreadable image: {img_file}")

        if feature_vectors:
            feature_matrix = np.array(feature_vectors)

            # Apply k-means clustering. n_init is pinned to 10 (the value the
            # library was defaulting to) so results stay the same across
            # scikit-learn versions and the default-change warning goes away.
            kmeans = KMeans(n_clusters=num_clusters, random_state=42, n_init=10)
            cluster_labels = kmeans.fit_predict(feature_matrix)

            # Create output folders for each cluster
            for i in range(num_clusters):
                cluster_folder = os.path.join(output_folder, f"Cluster_{i + 1}")
                os.makedirs(cluster_folder, exist_ok=True)

            # Move images to their respective cluster folders
            for img_file, label in zip(clustered_files, cluster_labels):
                cluster_folder = os.path.join(output_folder, f"Cluster_{label + 1}")
                shutil.move(os.path.join(input_folder, img_file), cluster_folder)

    except Exception as e:
        print(f"Error grouping similar images: {e}")

# Specify your input and output folder paths.
# The input folder is the one the image-extraction step wrote to; the output
# folder receives one 'Cluster_<n>' subfolder per cluster.
input_folder_path = "C:/Users/Shreshtha/Downloads/Project UHC/extracted_images_again"
output_folder_path = "C:/Users/Shreshtha/Downloads/Project UHC/Output Folder 2"

# Call the function with a specified number of clusters.
# Images are moved (not copied), so they leave the input folder after this runs.
group_similar_images_kmeans(input_folder_path, output_folder_path, num_clusters=5)

  super()._check_params_vs_input(X, default_n_init=10)


### 3. Use Tesseract OCR to extract text from the clustered images, copy the data from the original Excel sheets, and store everything in a new workbook.

In [7]:
from openpyxl import Workbook, load_workbook
from openpyxl.utils.dataframe import dataframe_to_rows
from openpyxl.utils import get_column_letter
from PIL import Image
import pytesseract
import cv2
import os
import pandas as pd

# Path to the directory containing the groups of images
groups_directory = "C:/Users/Shreshtha/Downloads/Project UHC/Output Folder 2"

# Path to the existing Excel workbook
existing_excel_file_path = "C:/Users/Shreshtha/Downloads/Project UHC/Trial 2.xlsx"

# Path to the new Excel workbook
new_excel_file_path = "C:/Users/Shreshtha/Downloads/Project UHC/Combined Trial Final.xlsx"

# Open the existing workbook only to list its sheet names. A read-only
# workbook keeps the file open until close() is called, so release it as soon
# as the names are known.
existing_workbook = load_workbook(existing_excel_file_path, read_only=True)
try:
    existing_sheet_names = list(existing_workbook.sheetnames)
finally:
    existing_workbook.close()

# Sheet name -> DataFrame holding that sheet's raw cells
existing_excel_data_dict = {}

# Read each sheet without a header row so every cell is kept as data
for sheet_name in existing_sheet_names:
    existing_excel_data_df = pd.read_excel(existing_excel_file_path, sheet_name=sheet_name, header=None)
    existing_excel_data_dict[sheet_name] = existing_excel_data_df

# Create a new Excel workbook
new_workbook = Workbook()

# Function to extract text from an image using OCR
def extract_text_from_image(image_path):
    """
    Run Tesseract OCR on one image file.

    Args:
        image_path (str): Path of the image to read.

    Returns:
        str or None: The recognised text with leading/trailing whitespace
        stripped, or None if opening the image or running OCR raised (the
        error is printed).
    """
    try:
        # Open through a context manager so the file handle is released as
        # soon as OCR is done, instead of whenever the object is collected.
        with Image.open(image_path) as image:
            text = pytesseract.image_to_string(image)
        return text.strip()
    except Exception as e:
        print(f"Error extracting text from image {image_path}: {e}")
        return None

# Function to process a group of images and extract text for each image
def process_image_group(group_folder):
    """
    OCR every image in one group folder.

    Args:
        group_folder (str): Folder whose .png/.jpg/.jpeg files are processed.

    Returns:
        dict: Maps each image filename to the value returned by
        ``extract_text_from_image`` for that file.
    """
    image_suffixes = ('.png', '.jpg', '.jpeg')

    # One entry per image file; other files in the folder are ignored
    return {
        entry: extract_text_from_image(os.path.join(group_folder, entry))
        for entry in os.listdir(group_folder)
        if entry.endswith(image_suffixes)
    }

# Iterate through each group folder in the groups directory.
# Sorted so the sheets are created in the same order on every run.
for group_folder_name in sorted(os.listdir(groups_directory)):
    group_folder = os.path.join(groups_directory, group_folder_name)

    # Skip stray files; only folders hold image groups
    if not os.path.isdir(group_folder):
        continue

    # Process each group of images
    group_text_dict = process_image_group(group_folder)

    # No images in this folder: don't create an empty sheet for it
    if not group_text_dict:
        continue

    # Create a new sheet for each group's text data
    group_sheet = new_workbook.create_sheet(title=f"Group_{group_folder_name}")

    # Write the extracted text to the sheet
    for filename, text_data in group_text_dict.items():
        # extract_text_from_image returns None when OCR fails; treat that as
        # empty text so the image is still listed instead of crashing on
        # None.split().
        lines = (text_data or '').split('\n')

        # Write each line to a separate row in the sheet
        for line in lines:
            group_sheet.append([filename, line])

        # Add a blank row to create a gap between text of different images
        group_sheet.append([])

# Include existing Excel data in the new workbook
for sheet_name, df in existing_excel_data_dict.items():
    sheet = new_workbook.create_sheet(title=f"Existing_{sheet_name}")

    # Write the existing Excel data to the sheet
    for row in dataframe_to_rows(df, index=False, header=False):
        sheet.append(row)

# Workbook() starts with one empty default sheet named "Sheet". Drop it so the
# saved file does not open on a blank tab, but only when it is still empty and
# at least one other sheet exists (a workbook must keep one sheet to be saved).
default_sheet = new_workbook.worksheets[0]
if (
    len(new_workbook.worksheets) > 1
    and default_sheet.title == "Sheet"
    and default_sheet.max_row == 1
    and default_sheet.max_column == 1
    and default_sheet["A1"].value is None
):
    new_workbook.remove(default_sheet)

# Save the new workbook
new_workbook.save(new_excel_file_path)
print(f"Texts extracted and stored in {new_excel_file_path}")

Texts extracted and stored in C:/Users/Shreshtha/Downloads/Project UHC/Combined Trial Final.xlsx
