# The code connects to a database, receives a new file, compares the new file with all the files in the database, identifies the document with the highest similarity and returns it.

## The code uses sqlite3, TfidfVectorizer and cosine_similarity

In [1]:
# Install necessary libraries
#!pip install python-docx scikit-learn

# Import required libraries
import sqlite3
from docx import Document
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from IPython.display import FileLink, display
from pathlib import Path
import os



In [51]:
# Open the SQLite database file that holds the documents table and keep a
# shared connection/cursor pair for the cells below.
db_path = '/Users/sedastepanyan/new_documents.db'
conn = sqlite3.connect(db_path)
# Return TEXT columns as Python str values.
conn.text_factory = str
cursor = conn.cursor()


In [52]:
# Schema: one row per document, with an auto-assigned id, the file name and
# the extracted text. IF NOT EXISTS makes the cell safe to re-run.
create_table_sql = '''CREATE TABLE IF NOT EXISTS documents
                  (id INTEGER PRIMARY KEY AUTOINCREMENT,
                   name TEXT,
                   content TEXT)'''
cursor.execute(create_table_sql)


<sqlite3.Cursor at 0x122d315c0>

In [53]:
def add_document(name, content):
    """Insert one document into the ``documents`` table.

    Uses the module-level ``conn`` and ``cursor``. The insert runs inside the
    connection's context manager, so it is committed when it succeeds and
    rolled back when it fails; a failed insert therefore no longer leaves an
    open transaction behind on the shared connection.

    Parameters
    ----------
    name : str
        Name stored in the ``name`` column (the callers pass the file name).
    content : str
        Text stored in the ``content`` column.

    Returns
    -------
    None
        The outcome is reported with a printed message; errors are not raised.
    """
    try:
        # `with conn` commits on success and rolls back if the block raises.
        with conn:
            cursor.execute('INSERT INTO documents (name, content) VALUES (?, ?)', (name, content))
        print("Document added successfully.")
    except Exception as e:
        print(f"Error adding document: {e}")

In [54]:
def get_all_documents():
    """Return every row of the ``documents`` table.

    Returns
    -------
    list of tuple
        One ``(id, name, content)`` tuple per stored document, fetched through
        the module-level ``cursor``.
    """
    query = 'SELECT id, name, content FROM documents'
    rows = cursor.execute(query).fetchall()
    return rows

In [55]:
def compare_documents(new_content):
    """Find the stored document that is most similar to ``new_content``.

    The stored documents are turned into TF-IDF vectors, the new text is
    projected onto the same vocabulary, and the stored document with the
    highest cosine similarity is reported.

    Parameters
    ----------
    new_content : str
        Text of the new document. ``None`` is treated as empty text.

    Returns
    -------
    tuple or None
        ``(id, name)`` of the closest stored document, or ``None`` when there
        is nothing meaningful to report: the database is empty, no vocabulary
        can be built from the stored texts, or the new text has zero
        similarity to every stored document.
    """
    # Fetch existing documents from the database
    existing_documents = get_all_documents()

    if not existing_documents:
        print("No existing documents in the database.")
        return None

    # A NULL content column comes back as None, which the vectorizer cannot
    # tokenize; treat it as empty text instead of crashing.
    corpus = [doc[2] or "" for doc in existing_documents]

    # Compute TF-IDF vectors for documents
    vectorizer = TfidfVectorizer()
    try:
        vectors = vectorizer.fit_transform(corpus)
    except ValueError as e:
        # Raised by scikit-learn when the corpus yields an empty vocabulary
        # (e.g. every stored document is empty).
        print(f"Could not build a vocabulary from the stored documents: {e}")
        return None

    # Calculate cosine similarity between the new document and existing documents.
    # The result has shape (1, n_documents); keep the single row.
    similarity_scores = cosine_similarity(vectorizer.transform([new_content or ""]), vectors)[0]

    # Identify the document with the highest similarity
    best_match_index = int(similarity_scores.argmax())

    # With all scores at zero, argmax would silently pick the first row and
    # present an unrelated document as the "best match".
    if similarity_scores[best_match_index] <= 0:
        print("No stored document shares any terms with the new document.")
        return None

    best_match_id = existing_documents[best_match_index][0]
    best_match_name = existing_documents[best_match_index][1]

    return best_match_id, best_match_name



In [61]:
# Example usage: look up the closest stored document for one new file, then
# store that file as well.
new_document_name = "copy.docx"
new_document_path = "/Users/sedastepanyan/Downloads/attachments/copy.docx"

# Read the Word file and flatten its paragraphs into one space-separated string.
new_doc = Document(new_document_path)
new_content = " ".join(paragraph.text for paragraph in new_doc.paragraphs)

# Search BEFORE inserting, so the new document cannot match itself.
best_match_info = compare_documents(new_content)

if best_match_info:
    best_match_id = best_match_info[0]
    best_match_name = best_match_info[1]
    print(f"The best match for the new document is: Document ID {best_match_id}, Name: {best_match_name}")

# Store the new document so later comparisons include it.
add_document(new_document_name, new_content)

The best match for the new document is: Document ID 3, Name: DISQO - Walmart Draft 20231114.docx
Document added successfully.


In [None]:
# Bulk-load every Word document in a folder into the database.

# Specify the folder containing all Word documents
folder_path = "/path/to/your/word/documents"

# os.listdir returns names in arbitrary order; sorting makes the insertion
# order (and therefore the AUTOINCREMENT ids) the same on every run.
for filename in sorted(os.listdir(folder_path)):
    # Word keeps a "~$<name>.docx" owner/lock file next to any open document.
    # It is not a real .docx package, so opening it would abort the import.
    if filename.startswith("~$"):
        continue

    # Accept the extension regardless of case (".docx", ".DOCX", ...).
    if not filename.lower().endswith(".docx"):
        continue

    # Construct the full path to the Word document
    document_path = os.path.join(folder_path, filename)

    # Extract text content from the Word document
    new_doc = Document(document_path)
    new_content = " ".join(paragraph.text for paragraph in new_doc.paragraphs)

    # Add the document to the database
    add_document(filename, new_content)

In [None]:
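# For testing purposes, you can manually set the path to a Word document and run the upload_and_add_document function.
# NOTE: upload_and_add_document is not defined in any of the cells above; define it (or reuse the example-usage cell) before running the example.
# Replace './path/to/your/uploaded/document.docx' with the actual path to your uploaded Word document.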

# Example:
# upload_and_add_document('./path/to/your/uploaded/document.docx')
