In [26]:
# Import necessary libraries
import os
import PyPDF2
import re
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from skimage.metrics import structural_similarity as ssim
from PIL import Image


In [27]:
def extract_text_from_pdf(pdf_path):
    """
    Read a PDF and return the text of all its pages as one string.

    The per-page texts are concatenated in page order with no separator
    inserted between pages.

    Args:
        pdf_path (str): Location of the PDF file on disk.

    Returns:
        str: The concatenated text of every page (empty string for a PDF
            without pages).
    """
    with open(pdf_path, 'rb') as pdf_stream:
        pdf_reader = PyPDF2.PdfReader(pdf_stream)
        # Extraction happens while the file is still open, because the reader
        # pulls page data from the underlying stream.
        page_texts = [pdf_page.extract_text() for pdf_page in pdf_reader.pages]
    return ''.join(page_texts)

def extract_features(text):
    """
    Build the feature dictionary for one invoice from its raw text.

    Three labelled fields are looked up with fixed patterns (first match
    wins); a field that is not found is stored as None. In addition, every
    word-like token of the text is collected into a set.

    Args:
        text (str): Full text of the invoice.

    Returns:
        dict: Keys 'invoice_number', 'date' and 'amount' (captured string or
            None) plus 'keywords' (set of all ``\\w+`` tokens, case preserved).
    """
    field_patterns = (
        ('invoice_number', r'Invoice Number: (\d+)'),
        ('date', r'Date: (\d{2}/\d{2}/\d{4})'),
        ('amount', r'Total Amount: (\d+\.\d{2})'),
    )

    features = {}
    for field_name, pattern in field_patterns:
        found = re.search(pattern, text)
        if found:
            features[field_name] = found.group(1)
        else:
            features[field_name] = None

    # Every alphanumeric/underscore run counts as a keyword; duplicates collapse.
    features['keywords'] = set(re.findall(r'\b\w+\b', text))
    return features


In [28]:
def calculate_cosine_similarity(features1, features2):
    """
    Calculates the TF-IDF cosine similarity between two invoices' keyword sets.

    Each keyword set is joined into a single space-separated document, both
    documents are vectorised together with a fresh TfidfVectorizer, and the
    cosine of the two resulting vectors is returned.

    Args:
        features1 (dict): Features of the first invoice; must contain a
            'keywords' collection of strings.
        features2 (dict): Features of the second invoice; same requirement.

    Returns:
        float: Cosine similarity score between 0 and 1. 0.0 is returned when
            either invoice has no keywords, or when no keyword survives the
            vectoriser's tokenisation, instead of raising.
    """
    keywords1 = features1['keywords']
    keywords2 = features2['keywords']

    # An invoice without keywords (e.g. a scanned PDF with no text layer) has
    # nothing in common with anything; this mirrors the Jaccard helper, which
    # also yields 0 for empty input.
    if not keywords1 or not keywords2:
        return 0.0

    documents = [' '.join(keywords1), ' '.join(keywords2)]
    try:
        tfidf_matrix = TfidfVectorizer().fit_transform(documents)
    except ValueError:
        # scikit-learn raises ValueError ("empty vocabulary") when no token
        # survives tokenisation; the default pattern drops one-character tokens.
        return 0.0

    # Row 0 is the first invoice, row 1 the second; the result is a 1x1 matrix.
    return float(cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])[0][0])

def calculate_jaccard_similarity(features1, features2):
    """
    Jaccard index of the keyword sets of two invoices.

    The score is the number of shared keywords divided by the number of
    distinct keywords across both invoices.

    Args:
        features1 (dict): Features of the first invoice ('keywords' is a set).
        features2 (dict): Features of the second invoice ('keywords' is a set).

    Returns:
        float: Value between 0 and 1; 0 when neither invoice has keywords.
    """
    first_keywords = features1['keywords']
    second_keywords = features2['keywords']

    all_keywords = first_keywords.union(second_keywords)
    if len(all_keywords) == 0:
        # Nothing to compare: avoid dividing by zero.
        return 0

    shared_keywords = first_keywords.intersection(second_keywords)
    return len(shared_keywords) / len(all_keywords)

def compare_images(image1_path, image2_path):
    """
    Compares two images and calculates their structural similarity (SSIM).

    Both images are converted to 8-bit grayscale. If their dimensions differ,
    the second image is resized to the first image's dimensions, because SSIM
    is only defined for arrays of identical shape.

    Args:
        image1_path (str): Path to the first image (its size is the reference).
        image2_path (str): Path to the second image.

    Returns:
        float: Mean structural similarity score. 1 means identical images;
            the value is usually between 0 and 1 but can be negative for
            strongly dissimilar images.

    Raises:
        OSError: If either file cannot be opened or decoded as an image.
    """
    # Context managers make sure both image files are closed again.
    with Image.open(image1_path) as source1, Image.open(image2_path) as source2:
        gray1 = source1.convert('L')
        gray2 = source2.convert('L')

        # Bring the second image to the first one's (width, height) so that
        # the pixel arrays line up; this may change its aspect ratio.
        if gray2.size != gray1.size:
            gray2 = gray2.resize(gray1.size)

        pixels1 = np.array(gray1)
        pixels2 = np.array(gray2)

    # Only the mean score is needed, so the full similarity map is not requested.
    return ssim(pixels1, pixels2)


In [29]:
# Folder that holds the training invoices (PDF files)
pdf_directory = r'F:\Downloads\document similarity test and train files\document similarity\train'

# File names of the training invoices: invoice_1.pdf through invoice_5.pdf
training_pdfs = [f'invoice_{number}.pdf' for number in range(1, 6)]

# Read the text of every training PDF, then derive the features of each text
training_texts = list(map(
    extract_text_from_pdf,
    (os.path.join(pdf_directory, file_name) for file_name in training_pdfs),
))
training_features = list(map(extract_features, training_texts))


In [30]:
# Full paths of the invoices that will be matched against the training set
test_pdfs = [
    r'F:\Downloads\document similarity test and train files\document similarity\train\invoice_77098.pdf',
    r'F:\Downloads\document similarity test and train files\document similarity\train\invoice_102857.pdf',
]

# Read the text of every test PDF, then derive the features of each text
test_texts = list(map(extract_text_from_pdf, test_pdfs))
test_features = list(map(extract_features, test_texts))


In [31]:
# In-memory "database": one (file name, features) pair per training invoice
invoice_db = [
    (file_name, file_features)
    for file_name, file_features in zip(training_pdfs, training_features)
]


In [32]:
def find_most_similar_invoice(test_features, test_image_path=None):
    """
    Finds the most similar invoice from the database to the given test invoice.

    Every entry of the module-level ``invoice_db`` is scored with the larger of
    the cosine and Jaccard keyword similarities. When an image of the test
    invoice is available, the structural similarity to the database invoice's
    image is also considered and the larger of the text and image scores is
    kept. The database image is looked up in ``pdf_directory`` under the PDF's
    name with a ``.png`` extension.

    Args:
        test_features (dict): Features of the test invoice.
        test_image_path (str, optional): Path to the test invoice's image for
            structural comparison. If omitted, or if the file does not exist,
            only the text-based similarities are used.

    Returns:
        tuple: ``(filename, score)`` of the best match, with the score as a
            percentage. ``(None, 0)`` is returned when the database is empty or
            no invoice scores above 0.
    """
    # Loop-invariant: an image comparison is only possible when the test image
    # is actually on disk. Checking once here keeps a missing test image from
    # crashing the image comparison inside the loop.
    can_compare_images = bool(test_image_path) and os.path.exists(test_image_path)

    highest_similarity = 0
    most_similar_invoice = None

    for pdf_name, features in invoice_db:
        # Calculate text-based similarities
        cosine_sim = calculate_cosine_similarity(test_features, features)
        jaccard_sim = calculate_jaccard_similarity(test_features, features)

        # Initialize similarity score with text-based similarity
        similarity = max(cosine_sim, jaccard_sim) * 100  # Convert to percentage

        if can_compare_images:
            # Swap only the real extension; a plain str.replace would leave a
            # name without a lowercase '.pdf' unchanged and point at the PDF.
            image_name = os.path.splitext(pdf_name)[0] + '.png'
            db_image_path = os.path.join(pdf_directory, image_name)
            if os.path.exists(db_image_path):
                image_sim = compare_images(test_image_path, db_image_path) * 100  # Convert to percentage
                similarity = max(similarity, image_sim)

        # Strict comparison: on ties the earlier database entry wins.
        if similarity > highest_similarity:
            highest_similarity = similarity
            most_similar_invoice = pdf_name

    return most_similar_invoice, highest_similarity


In [33]:
# Report the best training match for every test invoice
for test_pdf_path, test_feature in zip(test_pdfs, test_features):
    # The invoice image is assumed to sit next to the PDF under the same name,
    # with a .png extension
    test_image_path = test_pdf_path.replace('.pdf', '.png')
    most_similar_invoice, similarity_score = find_most_similar_invoice(test_feature, test_image_path)
    test_pdf_name = os.path.basename(test_pdf_path)
    print(f"Most similar invoice to '{test_pdf_name}': {most_similar_invoice} with similarity score {similarity_score:.4f}%")


Most similar invoice to 'invoice_77098.pdf': invoice_4.pdf with similarity score 79.8780%
Most similar invoice to 'invoice_102857.pdf': invoice_5.pdf with similarity score 64.1704%
