In [1]:
import os
import spacy
import re
from dateutil.parser import *
import textract
import json

In [2]:
# Load the `en_core_web_md` spaCy pipeline once; Get_Deal_Dates calls `nlp(text)`
# and reads the DATE entities it produces.
nlp = spacy.load('en_core_web_md')

In [3]:
# Folder that holds the deal sheets to process.
folder_path = r'C:\Users\karth\Desktop\Capstone\NER\Data\DealSheets'
all_files = os.listdir(folder_path)
# Despite its name, `all_pdfs` keeps every supported document: .pdf and .docx.
all_pdfs = [entry for entry in all_files if entry.endswith(('.pdf', '.docx'))]
print("PDF files in the folder:")
for pdf in all_pdfs:
    print(pdf)

PDF files in the folder:
RED BULL CHERRY FLAVOR LAUNCH.docx


#### Text Cleaning

In [4]:
import PyPDF2
import docx
import pdfplumber

def extract_text_from_pdf(file):
    """Return the concatenated text of every page of a PDF, read with PyPDF2.

    Uses the ``PdfReader`` / ``pages`` / ``extract_text`` API. The names used
    previously (``PdfFileReader``, ``numPages``, ``getPage``, ``extractText``)
    were deprecated and then removed in PyPDF2 3.0.

    Note: a later cell in this notebook defines another function with this
    same name (based on ``fitz``); once that cell runs, it replaces this one.

    Args:
        file: Path of the PDF file to read.

    Returns:
        The text of all pages joined together, in page order.
    """
    with open(file, 'rb') as f:
        reader = PyPDF2.PdfReader(f)
        # `or ""` keeps the join safe should a page yield no text at all.
        return "".join(page.extract_text() or "" for page in reader.pages)

def extract_text_from_docx(file):
    """Collect the text of a .docx file: body paragraphs first, then tables.

    Each paragraph is followed by a newline. Each table cell is followed by a
    tab, and each table row is terminated by a newline.

    Args:
        file: Path of the .docx file to read.

    Returns:
        The assembled text as a single string.
    """
    document = docx.Document(file)
    pieces = [para.text + "\n" for para in document.paragraphs]
    for tbl in document.tables:
        for tbl_row in tbl.rows:
            pieces.extend(tbl_cell.text + "\t" for tbl_cell in tbl_row.cells)
            pieces.append("\n")
    return "".join(pieces)

def clean_text(text):
    """Flatten Windows line breaks and squeeze runs of spaces.

    Every carriage-return/line-feed pair becomes one space, after which each
    run of space characters is collapsed to a single space. Lone line-feed
    characters are left untouched.

    Args:
        text: The raw text to normalise.

    Returns:
        The normalised text.
    """
    without_crlf = " ".join(text.split("\r\n"))
    return re.sub(" +", " ", without_crlf)

def Clean_Text(file):
    """Read a .pdf or .docx file and return its text after `clean_text`.

    Args:
        file: Path of the document; the extension selects the extractor.

    Returns:
        The cleaned text of the document.

    Raises:
        ValueError: If the path ends in neither '.pdf' nor '.docx'.
    """
    if file.endswith('.pdf'):
        return clean_text(extract_text_from_pdf(file))
    if file.endswith('.docx'):
        return clean_text(extract_text_from_docx(file))
    raise ValueError("Unsupported file format")

In [5]:
import re
import fitz
import docx

def extract_text_from_pdf(file):
    """Return the concatenated text of every page of a PDF, read via `fitz`.

    Args:
        file: Path of the PDF file to open.

    Returns:
        The page texts joined together in page order.
    """
    with fitz.open(file) as pdf_file:
        page_texts = [pdf_file[index].get_text() for index in range(len(pdf_file))]
    return "".join(page_texts)

def extract_text_from_docx(file):
    """Collect the text of a .docx file: body paragraphs first, then tables.

    An earlier cell defines a function with this same name that also reads
    table cells. Because this definition replaces that one when the cell runs,
    it must produce the same output; otherwise Clean_Text silently loses all
    table text. The format therefore matches the earlier definition: each
    paragraph is followed by a newline, each table cell by a tab, and each
    table row is terminated by a newline.

    Args:
        file: Path of the .docx file to read.

    Returns:
        The assembled text as a single string.
    """
    doc = docx.Document(file)
    parts = [paragraph.text + "\n" for paragraph in doc.paragraphs]
    for table in doc.tables:
        for row in table.rows:
            parts.extend(cell.text + "\t" for cell in row.cells)
            parts.append("\n")
    return "".join(parts)

def Get_Invoice_Number(file):
    """Extract an invoice (or lading) number from a .pdf or .docx file.

    The document text is normalised, then searched keyword by keyword with
    ``Extract_Tag_Data(keyword, text, 'reverse')``. The first non-empty result
    wins.

    NOTE(review): ``Extract_Tag_Data`` is not defined in the visible part of
    this notebook. It must be defined before this function is called.

    Args:
        file: Path of the document; the extension selects the extractor.

    Returns:
        The first match, upper-cased (cut at the first '/' when longer than
        20 characters), or "No Invoice Number found" when nothing matched.

    Raises:
        ValueError: If the path ends in neither '.pdf' nor '.docx'.
    """
    if file.endswith('.pdf'):
        text = extract_text_from_pdf(file)
    elif file.endswith('.docx'):
        text = extract_text_from_docx(file)
    else:
        raise ValueError("Unsupported file format")

    # Preprocess the text: CRLF becomes a "|" separator, space runs collapse,
    # and fragments of one character or less are dropped.
    text = text.replace("\r\n", "|")
    text = re.sub(" +", " ", text)
    tokens = [fragment for fragment in text.split('|') if len(fragment) > 1]
    text = '|'.join(tokens).lower()
    # Raw string: "\." in a normal literal is an invalid escape sequence and
    # triggers a warning on recent Python versions. The pattern is unchanged.
    text = re.sub(r" number:| no\.| n\.", " number", text)

    # Extract the Invoice Number; keywords are tried from most to least specific.
    keywords = ["invoice number", "lading number", "invoice", "lading", "number"]
    valid_matches = []
    for keyword in keywords:
        token = Extract_Tag_Data(keyword, text, 'reverse')
        if len(token) > 0:
            valid_matches.append(token.upper())

    if not valid_matches:
        return "No Invoice Number found"

    invoice_number = valid_matches[0]
    if len(invoice_number) > 20:
        invoice_number = invoice_number.split('/')[0]
    return invoice_number

In [6]:
def Get_Deal_Dates(text):
    """Find the earliest and latest dates mentioned in `text`.

    The text is run through the module-level `nlp` pipeline, every entity
    labelled "DATE" is handed to `parse`, and the minimum and maximum of the
    successfully parsed values are reported.

    Args:
        text: The document text to scan.

    Returns:
        A ``(start, end)`` tuple of strings formatted as "%B %d, %Y" when at
        least one date could be parsed, otherwise the string
        "No Start and End Dates Found".
    """
    # Extract entities labeled as "DATE"
    doc = nlp(text)
    possible_dates = [ent.text for ent in doc.ents if ent.label_ == "DATE"]

    # Extract start and end dates from possible dates
    start_date = None
    end_date = None

    for date in possible_dates:
        try:
            parsed_date = parse(date)
            if start_date is None or parsed_date < start_date:
                start_date = parsed_date
            if end_date is None or parsed_date > end_date:
                end_date = parsed_date
        except (ValueError, OverflowError, TypeError):
            # Unparseable entity text, or values that cannot be compared.
            # Only these are skipped, so KeyboardInterrupt and other
            # unexpected errors are no longer swallowed by a bare except.
            continue

    if start_date and end_date:
        return start_date.strftime("%B %d, %Y"), end_date.strftime("%B %d, %Y")
    else:
        return "No Start and End Dates Found"

In [7]:
def extract_collaborators(text):
    """Map each collaborator name in `text` to the address on the next line.

    A collaborator is any line containing "Name:"; the name is whatever
    follows that label on the same line. The following line is taken as the
    address. A leading "Address:" label on that line is removed, so callers
    that print their own "Address:" prefix do not show it twice.

    A "Name:" entry on the very last line has no following line and is
    skipped, as are entries whose name is empty.

    Args:
        text: Document text with newline-separated lines.

    Returns:
        dict mapping name -> address (both stripped of surrounding whitespace).
    """
    name_label = "Name:"
    address_label = "Address:"
    collaborators = {}
    lines = text.split('\n')

    for line_index, line in enumerate(lines):
        label_pos = line.find(name_label)
        if label_pos == -1:
            continue
        name = line[label_pos + len(name_label):].strip()
        if not name or line_index >= len(lines) - 1:
            continue
        address = lines[line_index + 1].strip()
        if address.startswith(address_label):
            address = address[len(address_label):].strip()
        collaborators[name] = address

    return collaborators

In [8]:
# def extract_integrations(text):
#     integrations = []

#     # Find all occurrences of the keyword "Integrations" (case-insensitive)
#     integration_matches = re.finditer(r'(\w+\s+Integrations):\s*\n', text, re.IGNORECASE)

#     for match in integration_matches:
#         integration_type = match.group(1)
#         # integration_section = re.search(rf'{integration_type}\s*:(.*?)\n[ivx]{1,3}\.', text, re.IGNORECASE | re.DOTALL)
#         integrations.append(integration_type)
#         # if integration_section:
#         #     integration_elements = re.findall(r'Element\s+(.*?)\n', integration_section.group(1), re.IGNORECASE)
#         #     integrations[integration_type] = [element.strip() for element in integration_elements]

#     return integrations

In [9]:
import re

def extract_integrations(file_text, integration_types=None):
    """Slice `file_text` into sections, one per integration heading.

    For every heading found in the text, the section runs from the first
    occurrence of that heading up to the nearest following occurrence of any
    other heading, or to the end of the text when none follows.

    Args:
        file_text: The document text to search.
        integration_types: Optional sequence of heading strings to look for.
            Defaults to the three headings this notebook was written for.

    Returns:
        dict mapping each heading that was found to its stripped section text
        (the heading itself is included at the start of the section).
    """
    if integration_types is None:
        integration_types = ["In-Store Integrations", "Car Display Integrations", "Online Integrations"]

    integrations = {}
    for integration_type in integration_types:
        start_index = file_text.find(integration_type)
        if start_index == -1:
            continue
        search_from = start_index + len(integration_type)
        # Positions where any *other* heading starts after this one.
        later_starts = [
            position
            for other_type in integration_types
            if other_type != integration_type
            for position in (file_text.find(other_type, search_from),)
            if position != -1
        ]
        end_index = min(later_starts, default=len(file_text))
        integrations[integration_type] = file_text[start_index:end_index].strip()

    return integrations


In [10]:
import queue
import os
from spire.doc import *
from spire.doc.common import *

def extract_images_from_docx(input_file, output_path):
    """Write every picture embedded in a Word document to `output_path`.

    The document tree is walked breadth-first. The bytes of each `DocPicture`
    are collected and then written as ``Image-<index>.png``, numbered in
    discovery order. Files are always given a ``.png`` name and are opened in
    'wb' mode, so a later call with the same `output_path` overwrites files
    of the same index.

    The document is closed in a ``finally`` block so it is released even when
    loading, traversal or writing raises.

    Args:
        input_file: Path of the Word document to load.
        output_path: Directory for the image files; created if missing.
    """
    # Create a Document instance
    document = Document()
    try:
        # Load the input Word document
        document.LoadFromFile(input_file)

        # Create the output directory if it doesn't exist
        os.makedirs(output_path, exist_ok=True)

        # Collected image payloads, in discovery order
        images = []

        # Queue of composite nodes still to be expanded
        nodes = queue.Queue()
        nodes.put(document)

        # Traverse through the document elements
        while not nodes.empty():
            node = nodes.get()
            for i in range(node.ChildObjects.Count):
                obj = node.ChildObjects[i]
                if isinstance(obj, DocPicture):
                    images.append(obj.ImageBytes)
                elif isinstance(obj, ICompositeObject):
                    # Containers may hold pictures deeper down; visit later.
                    nodes.put(obj)

        # Save the image data to image files
        for i, image_data in enumerate(images):
            file_name = f"Image-{i}.png"
            with open(os.path.join(output_path, file_name), 'wb') as image_file:
                image_file.write(image_data)
    finally:
        document.Close()

## Packaging it all together

In [11]:
# Run the whole extraction pipeline over every deal sheet listed in `all_pdfs`.
# This cell reads `folder_path`, which later cells rebind to the image folder,
# so it has to run before them.
for file in all_pdfs:

    # Join portably instead of concatenating with a hard-coded "\\".
    file = os.path.join(folder_path, file)
    print("Processing File :", file)

    file_text = Clean_Text(file)
    Date = Get_Deal_Dates(file_text)
    collaborators = extract_collaborators(file_text)

    print("The collaborators")
    for i, (name, address) in enumerate(collaborators.items(), start=1):
        print(str(i)+"->", name)
        print("    Address:", address)
    print("\n")

    # Get_Deal_Dates returns a (start, end) tuple on success but a plain
    # message string when no date could be parsed. Indexing that string would
    # print its first two characters as the "dates".
    if isinstance(Date, tuple):
        print("Start Date :", Date[0])
        print("End Date :", Date[1])
    else:
        print(Date)
    print("\n")

    integrations = extract_integrations(file_text)

    print("Integrations:")
    for x in integrations:
        print(x)

    # extract_images_from_docx loads the file as a Word document, so PDFs
    # (which `all_pdfs` may also contain) are skipped here.
    if file.endswith('.docx'):
        extract_images_from_docx(file,'./Data/DocumentImages')

Processing File : C:\Users\karth\Desktop\Capstone\NER\Data\DealSheets\RED BULL CHERRY FLAVOR LAUNCH.docx
The collaborators
1-> Red Bull GmbH
    Address: Address: Am Brunnen 1, 5330 Fuschl am See, Austria
2-> Costco Wholesale Corporation
    Address: Address: 999 Lake Drive, Issaquah, WA 98027, USA


Start Date : January 01, 2024
End Date : March 01, 2024


Integrations:
In-Store Integrations
Car Display Integrations
Online Integrations


In [12]:
# import os
# from PIL import Image

# def image_to_hex(image_path):
#     img = Image.open(image_path)
#     img = img.resize((100, 100))  # Resize the image if needed
#     img = img.convert('L')  # Convert to grayscale if needed
#     hex_representation = img.tobytes().hex()
#     return hex_representation

# def process_images(folder_path, output_file):
#     with open(output_file, 'w') as f:
#         for filename in os.listdir(folder_path):
#             if filename.endswith(('.png', '.jpg', '.jpeg')):
#                 image_path = os.path.join(folder_path, filename)
#                 hex_value = image_to_hex(image_path)
#                 f.write(f"{filename}: {hex_value}\n")

# folder_path = './Data/DocumentImages'
# output_file = './Data/hex_values.txt'
# process_images(folder_path, output_file)
# print("Hex values saved to:", output_file)


## Converting the images to perceptual-hash (hex) values

In [13]:
from PIL import Image
import imagehash
import os

def calculate_image_hash1(image_path):
    """Return the `imagehash.average_hash` of an image file as a string.

    The image is opened in a ``with`` block so it is closed as soon as the
    hash has been computed, rather than being left open.

    Args:
        image_path: Path of the image file to hash.

    Returns:
        ``str()`` of the average hash.
    """
    with Image.open(image_path) as img:
        return str(imagehash.average_hash(img))

def process_images1(folder_path, output_file):
    """Hash every image in a folder and write the results to a text file.

    Files whose names end in '.png', '.jpg' or '.jpeg' are hashed with
    `calculate_image_hash1`; one "<filename>: <hash>" line is written per
    image, in the order `os.listdir` returns them. The output file is
    overwritten.

    Args:
        folder_path: Directory to scan (not recursive).
        output_file: Path of the text file to write.
    """
    image_suffixes = ('.png', '.jpg', '.jpeg')
    with open(output_file, 'w') as report:
        for entry in os.listdir(folder_path):
            if not entry.endswith(image_suffixes):
                continue
            digest = calculate_image_hash1(os.path.join(folder_path, entry))
            report.write(f"{entry}: {digest}\n")

# NOTE: this rebinds the module-level `folder_path` that the deal-sheet cells
# above set to the DealSheets directory; re-running those cells afterwards
# would read from the image folder instead.
folder_path = './Data/DocumentImages'
# Text file that receives one "<filename>: <hash>" line per image.
output_file = './Data/hex_values.txt'
process_images1(folder_path, output_file)
print("Image hashes saved to:", output_file)


Image hashes saved to: ./Data/hex_values.txt


In [14]:
import os
from PIL import Image
import redis

def image_to_hex(image_path):
    """Return the `imagehash.average_hash` of an image file as a string.

    The image is opened in a ``with`` block so it is closed as soon as the
    hash has been computed. `imagehash` is not imported in this cell; it
    comes from the import in an earlier cell.

    Args:
        image_path: Path of the image file to hash.

    Returns:
        ``str()`` of the average hash.
    """
    with Image.open(image_path) as img:
        return str(imagehash.average_hash(img))

def process_images(folder_path, redis_host, redis_port, redis_db):
    """Hash every image in a folder and record the hashes in Redis.

    For each file whose name ends in '.png', '.jpg' or '.jpeg', the value
    returned by `image_to_hex` is used as the Redis key and the file name as
    the value. An existing key is overwritten.

    Args:
        folder_path: Directory to scan (not recursive).
        redis_host: Redis server host.
        redis_port: Redis server port.
        redis_db: Redis database index.
    """
    client = redis.Redis(host=redis_host, port=redis_port, db=redis_db)
    image_suffixes = ('.png', '.jpg', '.jpeg')
    for entry in os.listdir(folder_path):
        if not entry.endswith(image_suffixes):
            continue
        digest = image_to_hex(os.path.join(folder_path, entry))
        # Key is the image hash, value is the file name.
        client.set(digest, entry)

# Example usage
# Example usage
# These module-level connection settings are also read by the
# display_all_keys call in the next cell.
folder_path = './Data/DocumentImages'
redis_host = 'localhost'  # Replace with your Redis server host
redis_port = 6379  # Replace with your Redis server port
redis_db = 0  # Replace with your Redis database index
process_images(folder_path, redis_host, redis_port, redis_db)
print("Hex values stored in Redis.")


Hex values stored in Redis.


In [15]:
def display_all_keys(redis_host, redis_port, redis_db):
    """Print every key in a Redis database together with its value.

    Each entry is printed as "Key: <key>, Value: <value>", with both decoded
    from UTF-8 bytes. A key that no longer has a value by the time it is
    fetched (``get`` returns ``None``) is skipped instead of crashing on
    ``None.decode``.

    Args:
        redis_host: Redis server host.
        redis_port: Redis server port.
        redis_db: Redis database index.
    """
    r = redis.Redis(host=redis_host, port=redis_port, db=redis_db)
    for key in r.keys():
        raw_value = r.get(key)
        if raw_value is None:
            # The key vanished between keys() and get(); nothing to show.
            continue
        value = raw_value.decode('utf-8')  # Decode bytes to string
        print(f"Key: {key.decode('utf-8')}, Value: {value}")
        
# Dump the database using the connection settings defined in the previous cell.
display_all_keys(redis_host, redis_port, redis_db)

Key: 7cfce21282e0f8fe, Value: Image-1.png
Key: ffa7e7e3c3c7c7ff, Value: Image-0.png


## Deleting all keys from Redis

In [23]:
# def delete_all_keys(redis_host, redis_port, redis_db):
#     r = redis.Redis(host=redis_host, port=redis_port, db=redis_db)
#     r.flushdb()
#     print("All keys deleted from the Redis database.")
    
# delete_all_keys(redis_host, redis_port, redis_db)

All keys deleted from the Redis database.


## Checking if the image is duplicated

In [16]:
import os
from PIL import Image
import redis

def image_to_hex(image_path):
    """Return the `imagehash.average_hash` of an image file as a string.

    This repeats the definition from an earlier cell. The image is opened in
    a ``with`` block so it is closed as soon as the hash has been computed.

    Args:
        image_path: Path of the image file to hash.

    Returns:
        ``str()`` of the average hash.
    """
    with Image.open(image_path) as img:
        return str(imagehash.average_hash(img))

def process_images(folder_path, redis_host, redis_port, redis_db):
    """Hash every image in a folder and report hashes already in Redis.

    For each file whose name ends in '.png', '.jpg' or '.jpeg', the value
    returned by `image_to_hex` is used as the Redis key and the file name as
    the value. The key is written with ``SET ... NX``, which stores it only
    if it does not exist yet. This replaces the separate ``exists`` + ``set``
    calls with one atomic round-trip, so two writers cannot both see "not
    present" and overwrite each other. When the key already exists, the
    stored value is left untouched and the file is reported as duplicated.

    NOTE(review): any hash that is already stored counts as a duplicate,
    including one stored for this very file by a previous run. Confirm that
    this is the intended meaning of "duplicated".

    Args:
        folder_path: Directory to scan (not recursive).
        redis_host: Redis server host.
        redis_port: Redis server port.
        redis_db: Redis database index.
    """
    r = redis.Redis(host=redis_host, port=redis_port, db=redis_db)
    for filename in os.listdir(folder_path):
        if not filename.endswith(('.png', '.jpg', '.jpeg')):
            continue
        image_path = os.path.join(folder_path, filename)
        hex_value = image_to_hex(image_path)
        # Key is the image hash, value is the file name. With nx=True the
        # call returns a falsy result when the key was already present.
        if not r.set(hex_value, filename, nx=True):
            print(f"Image {filename} is duplicated.")

# Example usage
# Example usage
# Scans the same image folder again; hashes already present in Redis are
# reported by process_images as duplicated.
folder_path = './Data/DocumentImages'
redis_host = 'localhost'  # Replace with your Redis server host
redis_port = 6379  # Replace with your Redis server port
redis_db = 0  # Replace with your Redis database index
process_images(folder_path, redis_host, redis_port, redis_db)


Image Image-0.png is duplicated.
Image Image-1.png is duplicated.
