In [1]:
import os
import spacy
import re
from dateutil.parser import *
import textract
import json

In [2]:
# Load the spaCy pipeline package 'en_core_web_md'.
# NOTE(review): `nlp` is not read by any of the cells visible in this part of
# the notebook — confirm it is still needed before keeping this (slow) load.
nlp = spacy.load('en_core_web_md')

In [3]:
# Folder that holds the deal-sheet documents.
folder_path = r'C:\Users\karth\Desktop\Capstone\NER\Data\DealSheets'
all_files = os.listdir(folder_path)

# Despite its name, `all_pdfs` keeps .pdf *and* .docx entries.
all_pdfs = [name for name in all_files if name.endswith(('.pdf', '.docx'))]

print("PDF files in the folder:")
for pdf in all_pdfs:
    print(pdf)

PDF files in the folder:
RED BULL CHERRY FLAVOR LAUNCH.docx


In [4]:
import queue
import os
from spire.doc import *
from spire.doc.common import *

def extract_images_from_docx(input_file, output_path):
    """Extract every embedded picture of a Word document into ``output_path``.

    The document tree is walked breadth-first. Each ``DocPicture`` that is
    found has its raw ``ImageBytes`` written to ``Image-<n>.png``, where
    ``n`` counts the pictures in the order they are encountered (from 0).

    Args:
        input_file: Path of the Word document to load.
        output_path: Directory that receives the image files; it is created
            when it does not exist yet.

    Note:
        File names depend only on the picture index, so a second call with
        the same ``output_path`` overwrites files written by an earlier
        call. The bytes are written unchanged: the ``.png`` suffix is a
        naming choice, not a format conversion.
    """
    # Create a Document instance
    document = Document()
    try:
        # Load the input Word document
        document.LoadFromFile(input_file)

        # Create the output directory if it doesn't exist
        os.makedirs(output_path, exist_ok=True)

        # Raw bytes of every picture, in discovery order
        images = []

        # FIFO queue of composite nodes still to be visited (breadth-first)
        nodes = queue.Queue()
        nodes.put(document)

        while not nodes.empty():
            node = nodes.get()
            for i in range(node.ChildObjects.Count):
                obj = node.ChildObjects[i]
                if isinstance(obj, DocPicture):
                    images.append(obj.ImageBytes)
                elif isinstance(obj, ICompositeObject):
                    # Container element: visit its children later
                    nodes.put(obj)

        # Save the image data to image files
        for i, image_data in enumerate(images):
            file_name = f"Image-{i}.png"
            with open(os.path.join(output_path, file_name), 'wb') as image_file:
                image_file.write(image_data)
    finally:
        # Release the document even when loading, traversal or writing fails.
        document.Close()

## Packaging it all together

In [5]:
for file in all_pdfs:
    # `all_pdfs` also lists .pdf files, but `extract_images_from_docx` is a
    # Word-document extractor, so anything that is not a .docx is skipped
    # instead of being passed to it.
    if not file.endswith('.docx'):
        print("Skipping non-DOCX file :", file)
        continue

    # Build the path with os.path.join rather than a hand-written "\\".
    file_path = os.path.join(folder_path, file)
    print("Processing File :", file_path)
    # NOTE(review): every document writes Image-<n>.png into this one folder,
    # so the images of a later document overwrite those of an earlier one.
    extract_images_from_docx(file_path, './Data/DocumentImages')

Processing File : C:\Users\karth\Desktop\Capstone\NER\Data\DealSheets\RED BULL CHERRY FLAVOR LAUNCH.docx


In [6]:
# import os
# from PIL import Image

# def image_to_hex(image_path):
#     img = Image.open(image_path)
#     img = img.resize((100, 100))  # Resize the image if needed
#     img = img.convert('L')  # Convert to grayscale if needed
#     hex_representation = img.tobytes().hex()
#     return hex_representation

# def process_images(folder_path, output_file):
#     with open(output_file, 'w') as f:
#         for filename in os.listdir(folder_path):
#             if filename.endswith(('.png', '.jpg', '.jpeg')):
#                 image_path = os.path.join(folder_path, filename)
#                 hex_value = image_to_hex(image_path)
#                 f.write(f"{filename}: {hex_value}\n")

# folder_path = './Data/DocumentImages'
# output_file = './Data/hex_values.txt'
# process_images(folder_path, output_file)
# print("Hex values saved to:", output_file)


## Converting the images to hash values (hex strings)

In [7]:
from PIL import Image
import imagehash
import os

def calculate_image_hash1(image_path):
    """Return the average hash of the image at ``image_path`` as a string.

    Args:
        image_path: Path of the image file to open with Pillow.

    Returns:
        ``str()`` of the value produced by ``imagehash.average_hash``.
    """
    # Image.open keeps the underlying file open; the context manager closes
    # it as soon as the hash has been computed instead of leaking the handle.
    with Image.open(image_path) as img:
        return str(imagehash.average_hash(img))

def process_images1(folder_path, output_file):
    """Write one ``<file name>: <hash>`` line per image in ``folder_path``.

    Only entries whose name ends in ``.png``, ``.jpg`` or ``.jpeg``
    (case-sensitive) are hashed. ``output_file`` is opened in ``'w'`` mode,
    so any previous content is replaced.

    Args:
        folder_path: Directory whose entries are scanned (not recursive).
        output_file: Path of the text report to write.
    """
    image_suffixes = ('.png', '.jpg', '.jpeg')
    with open(output_file, 'w') as report:
        for entry in os.listdir(folder_path):
            if not entry.endswith(image_suffixes):
                continue
            digest = calculate_image_hash1(os.path.join(folder_path, entry))
            report.write(f"{entry}: {digest}\n")

# NOTE: this rebinds the notebook-level `folder_path` (until now the
# deal-sheet folder) to the folder the extraction loop wrote its images to.
# Re-running the extraction loop after this cell therefore needs the cell
# that lists the deal sheets to be re-run first.
folder_path = './Data/DocumentImages'
# Text report that receives one "<file name>: <hash>" line per image.
output_file = './Data/hex_values.txt'
process_images1(folder_path, output_file)
print("Image hashes saved to:", output_file)


Image hashes saved to: ./Data/hex_values.txt


In [8]:
import os
from PIL import Image
import redis

def image_to_hex(image_path):
    """Return the average hash of the image at ``image_path`` as a string.

    Args:
        image_path: Path of the image file to open with Pillow.

    Returns:
        ``str()`` of the value produced by ``imagehash.average_hash``.
    """
    # Image.open keeps the underlying file open; the context manager closes
    # it as soon as the hash has been computed instead of leaking the handle.
    with Image.open(image_path) as img:
        return str(imagehash.average_hash(img))

def process_images(folder_path, redis_host, redis_port, redis_db):
    """Store the hash of every image in ``folder_path`` in Redis.

    Entries are written as ``hash -> file name`` with a plain ``SET``, i.e.
    without checking whether the hash is already present.

    Args:
        folder_path: Directory whose ``.png``/``.jpg``/``.jpeg`` entries
            (case-sensitive match, not recursive) are hashed.
        redis_host: Host name of the Redis server.
        redis_port: Port of the Redis server.
        redis_db: Index of the Redis database to write to.
    """
    client = redis.Redis(host=redis_host, port=redis_port, db=redis_db)
    image_suffixes = ('.png', '.jpg', '.jpeg')
    for entry in os.listdir(folder_path):
        if not entry.endswith(image_suffixes):
            continue
        digest = image_to_hex(os.path.join(folder_path, entry))
        # The hash is the Redis key; the file name is the stored value.
        client.set(digest, entry)

# Example usage
# Folder the extraction loop wrote its images to.
folder_path = './Data/DocumentImages'
# The redis_* names assigned here are reused by the later cells that list
# and delete the keys.
redis_host = 'localhost'  # Replace with your Redis server host
redis_port = 6379  # Replace with your Redis server port
redis_db = 0  # Replace with your Redis database index
process_images(folder_path, redis_host, redis_port, redis_db)
print("Hex values stored in Redis.")


Hex values stored in Redis.


In [9]:
def display_all_keys(redis_host, redis_port, redis_db):
    """Print every key of the selected Redis database with its value.

    Keys and values are decoded as UTF-8 before printing.

    Args:
        redis_host: Host name of the Redis server.
        redis_port: Port of the Redis server.
        redis_db: Index of the Redis database to read.
    """
    client = redis.Redis(host=redis_host, port=redis_port, db=redis_db)
    for raw_key in client.keys():
        # Values come back as bytes, so decode before formatting.
        stored = client.get(raw_key).decode('utf-8')
        name = raw_key.decode('utf-8')
        print(f"Key: {name}, Value: {stored}")
        
# Uses the redis_* settings assigned in the earlier Redis cell.
display_all_keys(redis_host, redis_port, redis_db)

Key: ffa7e7e3c3c7c7ff, Value: Image-0.png
Key: 7cfce21282e0f8fe, Value: Image-1.png


## Deleting all the keys from Redis

In [23]:
def delete_all_keys(redis_host, redis_port, redis_db):
    """Flush the selected Redis database and print a confirmation.

    Args:
        redis_host: Host name of the Redis server.
        redis_port: Port of the Redis server.
        redis_db: Index of the Redis database to flush.
    """
    # flushdb() is called on the database selected by `redis_db`.
    redis.Redis(host=redis_host, port=redis_port, db=redis_db).flushdb()
    print("All keys deleted from the Redis database.")
    
# NOTE(review): when the notebook is run top to bottom, this flush happens
# before the duplicate check in the next cell, so the hashes stored above are
# gone by then — confirm that this ordering is intended.
delete_all_keys(redis_host, redis_port, redis_db)

All keys deleted from the Redis database.


## Checking if the image is duplicated

In [10]:
import os
from PIL import Image
import redis

def image_to_hex(image_path):
    """Return the average hash of the image at ``image_path`` as a string.

    This re-defines the helper of the same name from the earlier Redis cell.

    Args:
        image_path: Path of the image file to open with Pillow.

    Returns:
        ``str()`` of the value produced by ``imagehash.average_hash``.
    """
    # Image.open keeps the underlying file open; the context manager closes
    # it as soon as the hash has been computed instead of leaking the handle.
    with Image.open(image_path) as img:
        return str(imagehash.average_hash(img))

def process_images(folder_path, redis_host, redis_port, redis_db):
    """Register image hashes in Redis and report hashes that already exist.

    For every ``.png``/``.jpg``/``.jpeg`` entry of ``folder_path``
    (case-sensitive match, not recursive) the hash is stored as
    ``hash -> file name`` unless that hash is already a key, in which case
    ``Image <file name> is duplicated.`` is printed and the stored value is
    left untouched.

    Args:
        folder_path: Directory whose image files are hashed.
        redis_host: Host name of the Redis server.
        redis_port: Port of the Redis server.
        redis_db: Index of the Redis database to use.

    Note:
        A hash stored by an earlier run over the same folder counts as
        "already present" too, so re-processing a folder reports its own
        images as duplicated.
    """
    r = redis.Redis(host=redis_host, port=redis_port, db=redis_db)
    for filename in os.listdir(folder_path):
        if filename.endswith(('.png', '.jpg', '.jpeg')):
            image_path = os.path.join(folder_path, filename)
            hex_value = image_to_hex(image_path)
            # SET ... NX writes hash -> file name only when the hash is not a
            # key yet and returns a falsy result when it already was. One
            # atomic command replaces the separate EXISTS and SET calls.
            if not r.set(hex_value, filename, nx=True):
                print(f"Image {filename} is duplicated.")

# Example usage
# These assignments repeat the values used by the earlier Redis cell; the call
# below resolves to the duplicate-checking `process_images` defined in this cell.
folder_path = './Data/DocumentImages'
redis_host = 'localhost'  # Replace with your Redis server host
redis_port = 6379  # Replace with your Redis server port
redis_db = 0  # Replace with your Redis database index
process_images(folder_path, redis_host, redis_port, redis_db)


Image Image-0.png is duplicated.
Image Image-1.png is duplicated.
