In [1]:
!pip install fuzzywuzzy

Collecting fuzzywuzzy
  Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl.metadata (4.9 kB)
Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl (18 kB)
Installing collected packages: fuzzywuzzy
Successfully installed fuzzywuzzy-0.18.0


In [2]:
from docx import Document
import pandas as pd
from difflib import Differ

In [3]:
from docx import Document
import pandas as pd
from difflib import Differ

# Function to read text from a Word document
def read_docx(file_path):
    """Return the non-blank paragraphs of a .docx file.

    Args:
        file_path: Location of the Word document, passed straight to
            ``Document``.

    Returns:
        A list with the ``strip()``-ed text of every paragraph in
        ``doc.paragraphs`` whose stripped text is not empty, in document order.
    """
    kept_paragraphs = []
    for paragraph in Document(file_path).paragraphs:
        cleaned = paragraph.text.strip()
        # Paragraphs that are empty or whitespace-only are left out entirely.
        if cleaned:
            kept_paragraphs.append(cleaned)
    return kept_paragraphs

# Function to compare texts from two documents and filter for differences
def compare_texts_for_differences(original, updated):
    """Diff two lists of strings and tabulate only the changed entries.

    Args:
        original: Sequence of strings from the original document.
        updated: Sequence of strings from the updated document.

    Returns:
        A DataFrame with the columns "Original Content" and "Updated Content".
        Each entry the diff marks with '- ' becomes a row with the text in
        "Original Content" and "" in "Updated Content"; each '+ ' entry becomes
        a row with "" in "Original Content" and the text in "Updated Content".
        Every other diff entry (unchanged lines, '? ' hint lines) is skipped.
    """
    changed_pairs = []
    for entry in Differ().compare(original, updated):
        marker, text = entry[:2], entry[2:]
        if marker == '- ':
            # Present in the original only.
            changed_pairs.append((text, ""))
        elif marker == '+ ':
            # Present in the updated version only.
            changed_pairs.append(("", text))

    table = pd.DataFrame({
        "Original Content": [before for before, _ in changed_pairs],
        "Updated Content": [after for _, after in changed_pairs],
    })
    # Blank cells are empty strings rather than NaN, so this dropna does not
    # remove any row built above; it is kept as a guard.
    return table.dropna(how='all')

# Main execution
def generate_differences_result(original_file, updated_file, output_file):
    """Write an Excel report of the differences between two .docx files.

    Args:
        original_file: Path of the original Word document.
        updated_file: Path of the updated Word document.
        output_file: Path of the Excel file to write.

    Returns:
        None. The report is written to ``output_file`` and a confirmation
        line is printed.
    """
    # Read both documents (original first) and diff their paragraphs.
    report = compare_texts_for_differences(
        read_docx(original_file),
        read_docx(updated_file),
    )

    # Persist without the DataFrame index column.
    report.to_excel(output_file, index=False)
    print(f"Differences result saved to: {output_file}")

import os

# File paths, resolved under the current user's home directory so the cell is
# not tied to one specific account name.
original_file = os.path.expanduser("~/Downloads/old_doc.docx")
updated_file = os.path.expanduser("~/Downloads/new_doc.docx")
output_file = os.path.expanduser("~/Downloads/differences_result.xlsx")

# Generate the differences result
generate_differences_result(original_file, updated_file, output_file)


Differences result saved to: /Users/hoangminh/Downloads/differences_result.xlsx


In [5]:
import pandas as pd
from fuzzywuzzy import fuzz

def adjust_fuzzy_matching(input_file, output_file, threshold=80):
    """Pair removed rows with the following added row when they are similar.

    Reads the differences sheet from ``input_file`` and adds a
    "Fuzzy Match Probability" column initialised to None. For every row that
    has "Original Content" but no "Updated Content", the "Updated Content" of
    the immediately following row (if present and non-null) is scored against
    it with ``fuzz.ratio``. The score is recorded on the current row; when it
    is at least ``threshold`` the following row's "Updated Content" is moved
    up onto the current row and cleared on the following row.

    Args:
        input_file: Path of the Excel file to read.
        output_file: Path of the Excel file to write.
        threshold: Minimum ``fuzz.ratio`` score required to move the next
            row's content up. Defaults to 80.

    Returns:
        None. The adjusted table is written to ``output_file`` (without the
        index) and a confirmation line is printed.
    """
    original_col = "Original Content"
    updated_col = "Updated Content"
    score_col = "Fuzzy Match Probability"

    df = pd.read_excel(input_file)
    df[score_col] = None
    total_rows = len(df)

    # NOTE(review): `row` is the Series yielded by iterrows, whereas the next
    # row's value is read from `df` directly; `index + 1` presumes the default
    # integer index produced by read_excel - confirm if the input changes.
    for index, row in df.iterrows():
        # Only rows with original text and no updated text are candidates.
        if pd.isnull(row[original_col]) or pd.notnull(row[updated_col]):
            continue

        next_index = index + 1
        if next_index >= total_rows:
            # Last row: nothing below to compare with, score stays None.
            continue

        candidate = df.at[next_index, updated_col]
        if pd.isnull(candidate):
            # The next row has no updated text, score stays None.
            continue

        similarity = fuzz.ratio(row[original_col], candidate)
        # The score is recorded whether or not the content gets moved.
        df.at[index, score_col] = similarity

        if similarity >= threshold:
            # Pull the next row's updated text up beside its original.
            df.at[index, updated_col] = candidate
            df.at[next_index, updated_col] = None

    df.to_excel(output_file, index=False)
    print(f"Adjusted differences saved to: {output_file}")

import os

# File paths, resolved under the current user's home directory so the cell is
# not tied to one specific account name.
input_file = os.path.expanduser("~/Downloads/differences_result.xlsx")
output_file = os.path.expanduser("~/Downloads/differences_result_new_1.xlsx")

# Adjust rows with fuzzy matching
adjust_fuzzy_matching(input_file, output_file)

Adjusted differences saved to: /Users/hoangminh/Downloads/differences_result_new_1.xlsx
