In [1]:
import glob, os
from colorama import Fore, Style, init


> The project is to compare multiple documents with an original document, identify the number of differences (marked_count), and return the documents along with their IDs when the count exceeds a certain threshold.
---

In [2]:
# first step we're need to see all not same lines
# 1 step - identified the lines that are not the same between the original and document to compare.
def parse_documents():
    for infile in glob.glob(os.path.join( 'data/*.txt')):
        print("Current File Being Processed is: " + infile)
        with open(infile, encoding='utf-8') as f:
            lines = [line.strip() for line in f.readlines()]
            lines = [line for line in lines if line]
            yield lines

doc_to_compare, original = [*parse_documents()]
original, doc_to_compare

Current File Being Processed is: data\document.txt
Current File Being Processed is: data\original.txt


(['Hello, [ClientName].',
  'How are you!',
  'this is order of [Number] to our deal.',
  'Reragd your [EmpName].'],
 ['Hi, [Oleg].',
  'How are you!',
  'this is my order of [№1122] to our deal.',
  'Reragd your [Distributor-Mike].'])

In [3]:
# second step need to compare each line n mark using color green to
# 2 step - mark the differences between the original n other documents lines using color (e.g. green)
def color_diff(original_line, compared_line):
    original_words = original_line.split()
    compared_words = compared_line.split()

    for o, c in zip(original_words, compared_words):
        if o == c:
            yield f'{Fore.WHITE}{c}'
        else:
            yield f'{Fore.GREEN}{c}{Style.RESET_ALL}'



doc_to_compare, original = [*parse_documents()]
original, doc_to_compare
for line, line2 in zip(original, doc_to_compare):
    if line != line2:
        colored_diff = ' '.join(color_diff(line, line2))
        print(line, colored_diff)

Current File Being Processed is: data\document.txt
Current File Being Processed is: data\original.txt
Hello, [ClientName]. [32mHi,[0m [32m[Oleg].[0m
this is order of [Number] to our deal. [37mthis [37mis [32mmy[0m [32morder[0m [32mof[0m [32m[№1122][0m [32mto[0m [32mour[0m
Reragd your [EmpName]. [37mReragd [37myour [32m[Distributor-Mike].[0m


In [9]:

# 3 step need to skip not colored_diff if word in the compared_line framed to [word]
# 3 step - skip applying color to words framed within square bracketes [word] in the compared line
marked_count = 0
def color_diff(original_line, compared_line):
    global marked_count
    original_words = original_line.split()
    compared_words = compared_line.split()


    for o, c in zip(original_words, compared_words):
        if o == c:
            yield f'{Fore.WHITE}{c}'
        else:
            if c.startswith("[") or c.startswith("]"):
                yield f'{Fore.WHITE}{c}'
            else:
                marked_count += 1
                yield f'{Fore.GREEN}{c}{Style.RESET_ALL}'

    return marked_count


doc_to_compare, original = [*parse_documents()]
original, doc_to_compare
for line, line2 in zip(original, doc_to_compare):
    if line != line2:
        colored_diff = ' '.join(color_diff(line, line2))
        print(line, colored_diff)

marked_count


Current File Being Processed is: data\document.txt
Current File Being Processed is: data\original.txt
Hello, [ClientName]. [32mHi,[0m [37m[Oleg].
this is order of [Number] to our deal. [37mthis [37mis [32mmy[0m [32morder[0m [32mof[0m [37m[№1122] [32mto[0m [32mour[0m
Reragd your [EmpName]. [37mReragd [37myour [37m[Distributor-Mike].


6

In [10]:

# 4 step need to count how much words was marked to Green color
# 4 step - to count how  many words were marked in green color.
# 5 step need to count only at write color till green not each colored word only colored sentence with green till not change color to white
# 5 step - to count only continuous sequences of green-colored words until a change to white occurs.

marked_count = 0
def color_diff(original_line, compared_line):
    global marked_count
    original_words = original_line.split()
    compared_words = compared_line.split()
    in_green_sequence = False

    for o, c in zip(original_words, compared_words):
        if o == c:
            in_green_sequence = False
            yield f'{Fore.WHITE}{c}'
        else:
            if c.startswith("[") or c.startswith("]"):
                in_green_sequence = False
                yield f'{Fore.WHITE}{c}'
            else:
                if not in_green_sequence:
                    in_green_sequence = True
                    marked_count += 1
                yield f'{Fore.GREEN}{c}{Style.RESET_ALL}'

    return marked_count


doc_to_compare, original = [*parse_documents()]
original, doc_to_compare
for line, line2 in zip(original, doc_to_compare):
    if line != line2:
        colored_diff = ' '.join(color_diff(line, line2))
        print(line, colored_diff)

marked_count


Current File Being Processed is: data\document.txt
Current File Being Processed is: data\original.txt
Hello, [ClientName]. [32mHi,[0m [37m[Oleg].
this is order of [Number] to our deal. [37mthis [37mis [32mmy[0m [32morder[0m [32mof[0m [37m[№1122] [32mto[0m [32mour[0m
Reragd your [EmpName]. [37mReragd [37myour [37m[Distributor-Mike].


3

In [None]:
# not just if marked_count > 3: 
    ...
    # we're got majority document

In [None]:
import glob
import os
from colorama import Fore, Style

def parse_documents_lazy():
    for infile in glob.glob(os.path.join('data/*.txt')):
        print("Current File Being Processed is: " + infile)
        with open(infile, encoding='utf-8') as f:
            for line in f:
                if line.strip():  # Skip empty lines
                    yield line.strip(), infile  # Return both line and document ID

marked_count_threshold = 3 # TODO: modified to find_majority(): formula return coefficient of mistakes 
documents_with_errors = []

def color_diff_lazy(original_line, compared_line):
    global marked_count
    original_words = original_line.split()
    compared_words = compared_line.split()
    in_green_sequence = False

    for o, c in zip(original_words, compared_words):
        if o == c:
            in_green_sequence = False
            yield f'{Fore.WHITE}{c}'
        else:
            if c.startswith("[") or c.startswith("]"):
                in_green_sequence = False
                yield f'{Fore.WHITE}{c}'
            else:
                if not in_green_sequence:
                    in_green_sequence = True
                    marked_count += 1
                yield f'{Fore.GREEN}{c}{Style.RESET_ALL}'

    return marked_count

doc_to_compare_generator = parse_documents_lazy()
original_generator = parse_documents_lazy()

for line, (line2, doc_id) in zip(original_generator, doc_to_compare_generator):
    marked_count = 0  # Reset marked_count for each document comparison
    if line != line2:
        colored_diff = ' '.join(color_diff_lazy(line, line2))
        print(line, colored_diff)
        if marked_count > marked_count_threshold:
            documents_with_errors.append((doc_id, marked_count))

print(f"\nDocuments with more than {marked_count_threshold} errors:")
for doc_id, errors in documents_with_errors:
    print(f"Document ID: {doc_id}, Errors: {errors}")


---

<img src="img/compareDocuments.png">

In [None]:
# n step - 
# other libraries to compare diff documents with etalon
import difflib

def compare_documents(etalon, documents):
    similarity_scores = []

    for doc in documents:
        seq_matcher = difflib.SequenceMatcher(None, etalon, doc)
        similarity_scores.append(seq_matcher.ratio())

    return similarity_scores

etalon_document = "The etalon document content."
other_documents = [
    "Similar content in document one.",
    "Different content in document two.",
    "Content in document three that matches the etalon.",
]

# Compare documents and get similarity scores
similarity_scores = compare_documents(etalon_document, other_documents)

# Identify the document with the highest similarity (majority)
max_similarity_score = max(similarity_scores)
index_of_majority = similarity_scores.index(max_similarity_score)
majority_document = other_documents[index_of_majority]

print(f"The document with the majority of content similar to the etalon is:\n{majority_document}")
