<a href="https://colab.research.google.com/github/Sidhtang/a-screen-pet/blob/main/assignment_of_deep_logic.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [11]:
!pip install --upgrade PyPDF2 # Update PyPDF2 to the latest version
import PyPDF2


def extract_text_from_pdf(file_path):
    pdf_file_obj = open(file_path, 'rb')
    # Use PdfReader instead of PdfFileReader
    pdf_reader = PyPDF2.PdfReader(pdf_file_obj)
    num_pages = pdf_reader.numPages
    text = ''
    for page in range(num_pages):
        page_obj = pdf_reader.getPage(page)
        text += page_obj.extractText()
    pdf_file_obj.close()
    return text




In [5]:
import re
import nltk
from nltk.tokenize import word_tokenize

def extract_features(text):
    keywords = re.findall(r'\b\w+\b', text)
    invoice_number = re.search(r'Invoice Number: (\d+)', text)
    date = re.search(r'Date: (\d{2}/\d{2}/\d{4})', text)
    amount = re.search(r'Amount: (\d{1,3}(?:,\d{3})*(?:\.\d+)?)', text)
    features = {
        'keywords': keywords,
        'invoice_number': invoice_number.group(1) if invoice_number else None,
        'date': date.group(1) if date else None,
        'amount': amount.group(1) if amount else None
    }
    return features

In [6]:
import numpy as np

def calculate_similarity(features1, features2):
    vector1 = np.array([features1['keywords'], features1['invoice_number'], features1['date'], features1['amount']])
    vector2 = np.array([features2['keywords'], features2['invoice_number'], features2['date'], features2['amount']])
    dot_product = np.dot(vector1, vector2)
    magnitude1 = np.linalg.norm(vector1)
    magnitude2 = np.linalg.norm(vector2)
    similarity = dot_product / (magnitude1 * magnitude2)
    return similarity

In [9]:
database = []

def add_invoice_to_database(file_path):
    text = extract_text_from_pdf(file_path)
    features = extract_features(text)
    database.append(features)

def find_most_similar_invoice(file_path):
    text = extract_text_from_pdf(file_path)
    features = extract_features(text)
    max_similarity = 0
    most_similar_invoice = None
    for invoice in database:
        similarity = calculate_similarity(features, invoice)
        if similarity > max_similarity:
            max_similarity = similarity
            most_similar_invoice = invoice
    return most_similar_invoice, max_similarity

In [14]:
import PyPDF2
import re
import nltk
from nltk.tokenize import word_tokenize
import numpy as np

def extract_text_from_pdf(file_path):
    pdf_file_obj = open(file_path, 'rb')
    pdf_reader = PyPDF2.PdfFileReader(pdf_file_obj)
    num_pages = len(pdf_reader.pages)
    text = ''
    for page in range(num_pages):
        page_obj = pdf_reader.getPage(page)
        text += page_obj.extractText()
    pdf_file_obj.close()
    return text

def extract_features(text):
    keywords = re.findall(r'\b\w+\b', text)
    invoice_number = re.search(r'Invoice Number: (\d+)', text)
    date = re.search(r'Date: (\d{2}/\d{2}/\d{4})', text)
    amount = re.search(r'Amount: (\d{1,3}(?:,\d{3})*(?:\.\d+)?)', text)
    features = {
        'keywords': keywords,
        'invoice_number': invoice_number.group(1) if invoice_number else None,
        'date': date.group(1) if date else None,
        'amount': amount.group(1) if amount else None
    }
    return features

def calculate_similarity(features1, features2):
    vector1 = np.array([len(features1['keywords']), features1['invoice_number'], features1['date'], features1['amount']])
    vector2 = np.array([len(features2['keywords']), features2['invoice_number'], features2['date'], features2['amount']])
    dot_product = np.dot(vector1, vector2)
    magnitude1 = np.linalg.norm(vector1)
    magnitude2 = np.linalg.norm(vector2)
    similarity = dot_product / (magnitude1 * magnitude2)
    return similarity

database = []

def add_invoice_to_database(file_path):
    text = extract_text_from_pdf(file_path)
    features = extract_features(text)
    database.append((file_path, features))

def find_most_similar_invoice(file_path):
    text = extract_text_from_pdf(file_path)
    features = extract_features(text)
    max_similarity = 0
    most_similar_invoice = None
    for invoice in database:
        similarity = calculate_similarity(features, invoice[1])
        if similarity > max_similarity and invoice[0] != file_path:
            max_similarity = similarity
            most_similar_invoice = invoice
    return most_similar_invoice, max_similarity

def main():
    # Add invoices to database
    add_invoice_to_database('/content/invoice_102856.pdf')
    add_invoice_to_database('/content/invoice_77073.pdf')


    # Find most similar invoice
    file_path = '/content/invoice_102856.pdf'
    most_similar_invoice, similarity = find_most_similar_invoice(file_path)
    print('Most similar invoice:', most_similar_invoice[0])
    print('Similarity score:', similarity)

if __name__ == '__main__':
    main()

DeprecationError: PdfFileReader is deprecated and was removed in PyPDF2 3.0.0. Use PdfReader instead.

In [17]:
!pip install pdfplumber
import pdfplumber
import re
import nltk
from nltk.tokenize import word_tokenize
import numpy as np

def extract_text_from_pdf(file_path):
    text = ''
    with pdfplumber.open(file_path) as pdf:
        for page in pdf.pages:
            text += page.extract_text()
    return text

def extract_features(text):
    keywords = re.findall(r'\b\w+\b', text)
    invoice_number = re.search(r'Invoice Number: (\d+)', text)
    date = re.search(r'Date: (\d{2}/\d{2}/\d{4})', text)
    amount = re.search(r'Amount: (\d{1,3}(?:,\d{3})*(?:\.\d+)?)', text)
    features = {
        'keywords': keywords,
        'invoice_number': invoice_number.group(1) if invoice_number else None,
        'date': date.group(1) if date else None,
        'amount': amount.group(1) if amount else None
    }
    return features


def calculate_similarity(features1, features2):
    # Convert None values to 0 for numerical operations
    vector1 = np.array([len(features1['keywords']),
                        features1['invoice_number'] if features1['invoice_number'] is not None else 0,
                        features1['date'] if features1['date'] is not None else 0,
                        features1['amount'] if features1['amount'] is not None else 0])
    vector2 = np.array([len(features2['keywords']),
                        features2['invoice_number'] if features2['invoice_number'] is not None else 0,
                        features2['date'] if features2['date'] is not None else 0,
                        features2['amount'] if features2['amount'] is not None else 0])
    dot_product = np.dot(vector1, vector2)
    magnitude1 = np.linalg.norm(vector1)
    magnitude2 = np.linalg.norm(vector2)
    # Handle potential division by zero
    if magnitude1 == 0 or magnitude2 == 0:
        return 0  # or another appropriate value indicating no similarity
    similarity = dot_product / (magnitude1 * magnitude2)
    return similarity
database = []

def add_invoice_to_database(file_path):
    text = extract_text_from_pdf(file_path)
    features = extract_features(text)
    database.append((file_path, features))

def find_most_similar_invoice(file_path):
    text = extract_text_from_pdf(file_path)
    features = extract_features(text)
    max_similarity = 0
    most_similar_invoice = None
    for invoice in database:
        similarity = calculate_similarity(features, invoice[1])
        if similarity > max_similarity and invoice[0] != file_path:
            max_similarity = similarity
            most_similar_invoice = invoice
    return most_similar_invoice[0], max_similarity

def main():
    # Add invoices to database
    add_invoice_to_database('/content/invoice_102856.pdf')
    add_invoice_to_database('/content/invoice_77073.pdf')
    add_invoice_to_database('/content/invoice_102857.pdf')

    # Find most similar invoice
    file_path = '/content/invoice_102856.pdf'
    most_similar_invoice, similarity = find_most_similar_invoice(file_path)
    print('Most similar invoice:', most_similar_invoice)
    print('Similarity score:', similarity)

if __name__ == '__main__':
    main()



Most similar invoice: /content/invoice_77073.pdf
Similarity score: 1.0
