In [56]:
import pytesseract
from PIL import Image
import os
from sentence_transformers import SentenceTransformer, util
import torch
import torchvision.transforms as transforms
from torchvision.models import resnet50
from torch.nn.functional import normalize
# Location of the Tesseract binary used by pytesseract. The default is the
# standard Windows install path; set the TESSERACT_CMD environment variable to
# point at a different binary without editing the notebook.
pytesseract.pytesseract.tesseract_cmd = os.environ.get(
    'TESSERACT_CMD',
    r'C:\Program Files\Tesseract-OCR\tesseract.exe',
)

In [57]:
def extract_text_from_image(image_path):
    """Run Tesseract OCR on one image file and return the recognised text.

    Args:
        image_path: Path of the image file to read.

    Returns:
        The value returned by ``pytesseract.image_to_string`` for the image.
    """
    # Image.open keeps the underlying file open; the context manager releases
    # it as soon as OCR is done instead of waiting for garbage collection.
    with Image.open(image_path) as image:
        return pytesseract.image_to_string(image)

In [58]:
def extract_texts_from_folder(folder_path):
    """OCR every image found directly inside a folder.

    Only directory entries whose names end in ``.png``, ``.jpg`` or ``.jpeg``
    (case-sensitive match) are processed; everything else is skipped.

    Args:
        folder_path: Directory whose entries are scanned.

    Returns:
        dict mapping each matching file name to the text returned by
        ``extract_text_from_image`` for that file.
    """
    image_suffixes = ('.png', '.jpg', '.jpeg')
    return {
        entry: extract_text_from_image(os.path.join(folder_path, entry))
        for entry in os.listdir(folder_path)
        if entry.endswith(image_suffixes)
    }

In [59]:
def extract_visual_features(image_path, model, transform):
    """Compute an L2-normalised feature vector for one image.

    Args:
        image_path: Path of the image file to read.
        model: Callable applied to the batched image tensor (for example a
            ``torch.nn.Module`` already switched to eval mode).
        transform: Callable turning an RGB PIL image into a tensor.

    Returns:
        The model output with all size-1 dimensions squeezed out, scaled to
        unit L2 norm along dimension 0.
    """
    # convert() returns a fully loaded copy, so the file handle opened by
    # Image.open can be released right away rather than leaking until GC.
    with Image.open(image_path) as source_image:
        rgb_image = source_image.convert('RGB')
    batch = transform(rgb_image).unsqueeze(0)  # add a leading batch dimension
    with torch.no_grad():  # inference only; no autograd graph is needed
        features = model(batch).squeeze()
    return normalize(features, dim=0)

In [60]:
def extract_visual_features_from_folder(folder_path, model, transform):
    """Compute visual features for every image found directly inside a folder.

    Only directory entries whose names end in ``.png``, ``.jpg`` or ``.jpeg``
    (case-sensitive match) are processed.

    Args:
        folder_path: Directory whose entries are scanned.
        model: Passed through unchanged to ``extract_visual_features``.
        transform: Passed through unchanged to ``extract_visual_features``.

    Returns:
        dict mapping each matching file name to the value returned by
        ``extract_visual_features`` for that file.
    """
    image_suffixes = ('.png', '.jpg', '.jpeg')
    return {
        entry: extract_visual_features(os.path.join(folder_path, entry), model, transform)
        for entry in os.listdir(folder_path)
        if entry.endswith(image_suffixes)
    }

In [61]:
def extract_text_embeddings(texts, model=None):
    """Embed the text of each document with a sentence-transformer model.

    Args:
        texts: dict mapping file name -> extracted text.
        model: Optional object exposing
            ``encode(sentences, convert_to_tensor=...)``, e.g. an already
            loaded ``SentenceTransformer``. Passing one avoids reloading the
            model on every call. When omitted, 'paraphrase-MiniLM-L6-v2' is
            loaded for this call, as before.

    Returns:
        Tuple ``(embeddings, filenames)``: the result of ``model.encode`` on
        the text values, and the file names in the same order.
    """
    if model is None:
        model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
    filenames = list(texts.keys())
    # Build the sentence list from the same key order so row i of the
    # embeddings always belongs to filenames[i].
    sentences = [texts[name] for name in filenames]
    embeddings = model.encode(sentences, convert_to_tensor=True)
    return embeddings, filenames

In [62]:
def compare_document(uploaded_text, uploaded_visual, folder_text_embeddings,
                     folder_visual_features, folder_filenames,
                     text_weight=0.5, visual_weight=0.5, model=None):
    """Score an uploaded document against each folder document.

    The score for a folder document is a weighted sum of the cosine similarity
    between text embeddings and the dot product between visual feature vectors.

    Args:
        uploaded_text: Text of the uploaded document.
        uploaded_visual: 1-D visual feature tensor of the uploaded document.
        folder_text_embeddings: Text embeddings of the folder documents, one
            row per entry of ``folder_filenames`` and in the same order.
        folder_visual_features: dict mapping file name -> 1-D visual feature
            tensor; must contain every name in ``folder_filenames``.
        folder_filenames: File names of the folder documents.
        text_weight: Weight of the text similarity (default 0.5).
        visual_weight: Weight of the visual similarity (default 0.5).
        model: Optional object exposing
            ``encode(sentences, convert_to_tensor=...)``; when omitted,
            'paraphrase-MiniLM-L6-v2' is loaded for this call, as before.

    Returns:
        dict mapping each file name to its combined score as a Python float;
        an empty dict when ``folder_filenames`` is empty.

    Raises:
        KeyError: If a file name has no entry in ``folder_visual_features``.
    """
    filenames = list(folder_filenames)
    if not filenames:
        # Nothing to compare against; skip the model load entirely.
        return {}

    if model is None:
        model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
    uploaded_text_embedding = model.encode([uploaded_text], convert_to_tensor=True)
    text_scores = util.pytorch_cos_sim(uploaded_text_embedding, folder_text_embeddings)[0]

    # One matrix-vector product replaces a Python loop of per-file dot products.
    folder_matrix = torch.stack([folder_visual_features[name] for name in filenames])
    visual_scores = folder_matrix @ uploaded_visual

    # The sentence model may place its output on a GPU while the visual
    # features live on the CPU; align devices before mixing the two scores.
    text_scores = text_scores.to(visual_scores.device)

    combined_scores = text_weight * text_scores + visual_weight * visual_scores
    return dict(zip(filenames, combined_scores.tolist()))

In [63]:
def process_upload_and_compare(uploaded_file_path, folder_path):
    """Score an uploaded document against every image in a folder.

    Relies on the notebook-level ``resnet_model`` and ``transform`` objects,
    which must exist by the time this function is called.

    NOTE(review): a later cell defines ``process_upload_and_compare`` again
    (returning only the three best matches); once that cell has run, this
    version is shadowed.

    Args:
        uploaded_file_path: Path of the document image to look up.
        folder_path: Directory holding the reference document images.

    Returns:
        The result of ``compare_document`` for the collected inputs, i.e. a
        dict mapping folder file name to combined score.
    """
    # Query document: OCR text plus visual feature vector.
    query_text = extract_text_from_image(uploaded_file_path)
    query_visual = extract_visual_features(uploaded_file_path, resnet_model, transform)

    # Reference folder: text embeddings together with the file-name order they follow.
    text_embeddings, filenames = extract_text_embeddings(
        extract_texts_from_folder(folder_path)
    )

    # Reference folder: visual features keyed by file name.
    visual_by_file = extract_visual_features_from_folder(folder_path, resnet_model, transform)

    return compare_document(query_text, query_visual, text_embeddings, visual_by_file, filenames)

In [64]:
def process_upload_and_compare(uploaded_file_path, folder_path, top_k=3):
    """Rank the images in a folder by similarity to an uploaded document.

    Relies on the notebook-level ``resnet_model`` and ``transform`` objects,
    which must exist by the time this function is called.

    Args:
        uploaded_file_path: Path of the document image to look up.
        folder_path: Directory holding the reference document images.
        top_k: Number of best matches to return (default 3, the previous
            fixed value). ``None`` returns every scored document.

    Returns:
        List of ``(filename, score)`` tuples sorted by descending score,
        truncated to ``top_k`` entries unless ``top_k`` is ``None``.

    Raises:
        ValueError: If ``top_k`` is negative.
    """
    # Validate first so a bad argument fails before the expensive OCR and
    # feature-extraction passes.
    if top_k is not None and top_k < 0:
        raise ValueError("top_k must be non-negative or None")

    # Extract text and visual features from the uploaded document
    uploaded_text = extract_text_from_image(uploaded_file_path)
    uploaded_visual = extract_visual_features(uploaded_file_path, resnet_model, transform)

    # Extract texts and embeddings from the folder
    folder_texts = extract_texts_from_folder(folder_path)
    folder_text_embeddings, folder_filenames = extract_text_embeddings(folder_texts)

    # Extract visual features from the folder
    folder_visual_features = extract_visual_features_from_folder(folder_path, resnet_model, transform)

    # Compare the uploaded document against the folder documents
    comparison_scores = compare_document(
        uploaded_text, uploaded_visual, folder_text_embeddings,
        folder_visual_features, folder_filenames,
    )

    # Highest score first; sorted() is stable, so ties keep their input order.
    ranked = sorted(comparison_scores.items(), key=lambda item: item[1], reverse=True)
    return ranked if top_k is None else ranked[:top_k]

In [68]:
# Folder holding the reference documents to compare against.
folder_path = "C:/Users/Vivenns/OneDrive - Victorian Institute of Technology/Desktop/compare/document"
# Query document; it is one of the images stored inside the reference folder.
uploaded_file_path = f"{folder_path}/pass (2).jpeg"



In [69]:
# Initialize ResNet model and transform
# `pretrained=True` is deprecated since torchvision 0.13 in favour of the
# `weights` argument; IMAGENET1K_V1 is the weight set `pretrained=True` maps
# to. Older torchvision releases lack the weights enum, hence the fallback.
try:
    from torchvision.models import ResNet50_Weights
except ImportError:
    resnet_model = resnet50(pretrained=True)
else:
    resnet_model = resnet50(weights=ResNet50_Weights.IMAGENET1K_V1)
# Inference mode: batch-norm layers use their stored running statistics.
resnet_model.eval()
# NOTE(review): the unmodified resnet50 ends in its classification layer, so
# the "features" used downstream are class scores rather than pooled
# embeddings. Confirm that this is intended.
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    # ImageNet channel mean/std expected by torchvision's pretrained models.
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

In [71]:
# Rank the folder documents against the uploaded one, then print each returned
# (filename, score) pair with the score formatted to four decimal places.
comparison_scores = process_upload_and_compare(uploaded_file_path, folder_path)
for filename, score in comparison_scores:
    print(f"{filename}: {score:.4f}")

non_add (1).jpeg: 1.0000
pass (2).jpeg: 1.0000
bank (190).jpg: 0.7143
