In [None]:
import re
import heapq

def preprocess_text(text):
    """Normalize a document and split it into sentences.

    The text is lowercased, every character that is not a word character,
    whitespace, or a sentence terminator (``.``, ``!``, ``?``) is removed,
    and the result is split on runs of terminators.

    Args:
        text: The raw document as a single string.

    Returns:
        A list of non-empty sentences with surrounding whitespace stripped.
        Whitespace inside a sentence is left as-is.
    """
    text = text.lower()
    # Keep '!' and '?' alongside '.' so they can act as sentence boundaries;
    # deleting them would silently glue neighbouring sentences together.
    text = re.sub(r'[^\w\s.!?]', '', text)
    stripped_pieces = (piece.strip() for piece in re.split(r'[.!?]+', text))
    # Consecutive or trailing terminators produce empty pieces; drop them.
    return [piece for piece in stripped_pieces if piece]

def levenshtein_distance(s1, s2):
    """Return the edit distance between two sequences.

    The distance is the minimum number of single-element insertions,
    deletions, and substitutions needed to turn ``s1`` into ``s2``.

    Args:
        s1: First sequence (typically a string).
        s2: Second sequence (typically a string).

    Returns:
        The edit distance as a non-negative integer.
    """
    # Row 0 of the DP table: turning an empty prefix of s1 into each prefix
    # of s2 takes exactly that many insertions.
    previous_row = list(range(len(s2) + 1))

    for row_number, item_a in enumerate(s1, start=1):
        # Column 0: turning the first row_number items of s1 into an empty
        # prefix of s2 takes row_number deletions.
        current_row = [row_number]
        for col_number, item_b in enumerate(s2, start=1):
            if item_a == item_b:
                # Matching items cost nothing; carry the diagonal value.
                current_row.append(previous_row[col_number - 1])
            else:
                deletion = previous_row[col_number] + 1
                insertion = current_row[col_number - 1] + 1
                substitution = previous_row[col_number - 1] + 1
                current_row.append(min(deletion, insertion, substitution))
        previous_row = current_row

    return previous_row[-1]

class State:
    """A node on a search path through the two-document alignment grid.

    Each state records a position in both documents, the cost accumulated
    to reach it, and a link to the state it was expanded from.
    """

    def __init__(self, idx1, idx2, cost, parent=None):
        """Create a state.

        Args:
            idx1: Index into the first document's sentence list.
            idx2: Index into the second document's sentence list.
            cost: Accumulated path cost up to this state.
            parent: The predecessor state, or None for the start of a path.
        """
        self.idx1, self.idx2 = idx1, idx2
        self.cost = cost
        self.parent = parent

    def __lt__(self, other):
        """Compare states by accumulated cost.

        Heap entries are ``(priority, state)`` tuples, so this is what the
        heap falls back to when two priorities are equal.
        """
        return self.cost < other.cost

def a_star_search(doc1, doc2):
    """Find a minimum-cost sentence alignment between two documents.

    The search walks a grid of ``(idx1, idx2)`` positions from ``(0, 0)`` to
    ``(len(doc1), len(doc2))``. From each position it can:

    * align ``doc1[idx1]`` with ``doc2[idx2]`` at the cost of their
      Levenshtein distance,
    * skip ``doc1[idx1]`` at the cost of its length, or
    * skip ``doc2[idx2]`` at the cost of its length.

    Skipping a sentence is priced as deleting/inserting all of its
    characters so that gap costs are on the same character-level scale as
    the alignment costs. With a flat gap cost of 1, skipping both sentences
    (cost 2) would beat aligning any pair whose distance exceeds 2, and
    similar-but-not-identical sentences would never be paired.

    No heuristic is added to the priority (it is the accumulated cost only),
    so states are expanded in order of path cost.

    Args:
        doc1: List of sentences of the first document.
        doc2: List of sentences of the second document.

    Returns:
        The goal ``State``; following ``parent`` links from it yields the
        path back to the start state. Returns None if the open list is
        exhausted without reaching the goal.
    """
    len1, len2 = len(doc1), len(doc2)
    start_state = State(0, 0, 0)
    open_list = []
    heapq.heappush(open_list, (0, start_state))
    closed_set = set()

    while open_list:
        _, current_state = heapq.heappop(open_list)
        idx1, idx2 = current_state.idx1, current_state.idx2

        if idx1 == len1 and idx2 == len2:
            return current_state

        position = (idx1, idx2)
        if position in closed_set:
            # A cheaper entry for this position was already expanded; this
            # one is a stale duplicate that was queued before that happened.
            continue
        closed_set.add(position)

        # Collect (next_idx1, next_idx2, step_cost) for every legal move.
        moves = []
        if idx1 < len1 and idx2 < len2:
            step_cost = levenshtein_distance(doc1[idx1], doc2[idx2])
            moves.append((idx1 + 1, idx2 + 1, step_cost))
        if idx1 < len1:
            moves.append((idx1 + 1, idx2, len(doc1[idx1])))
        if idx2 < len2:
            moves.append((idx1, idx2 + 1, len(doc2[idx2])))

        for next_idx1, next_idx2, step_cost in moves:
            if (next_idx1, next_idx2) in closed_set:
                continue
            next_state = State(
                next_idx1, next_idx2, current_state.cost + step_cost, current_state
            )
            heapq.heappush(open_list, (next_state.cost, next_state))

    return None

def backtrack_alignment(goal_state, doc1, doc2):
    """Reconstruct the sentence alignment encoded by a search path.

    Walks the ``parent`` links from ``goal_state`` back to the start and
    classifies every step by which index advanced relative to the parent:

    * both indices advanced -> the two sentences were aligned,
    * only ``idx1`` advanced -> a ``doc1`` sentence was skipped,
    * only ``idx2`` advanced -> a ``doc2`` sentence was skipped.

    The step must be derived from the parent rather than from the state's
    absolute indices; otherwise a skip taken at a position where both
    indices are positive would be misreported as an aligned pair.

    Args:
        goal_state: Final state of the path (any object exposing ``idx1``,
            ``idx2`` and ``parent``), or None.
        doc1: List of sentences of the first document.
        doc2: List of sentences of the second document.

    Returns:
        A list of ``(sentence1, sentence2)`` tuples in document order, where
        a skipped side is represented by the string ``"-"``. The list is
        empty when ``goal_state`` is None or has no parent.
    """
    alignment = []
    current_state = goal_state
    # The parentless start state represents no move, so the walk stops there.
    while current_state is not None and current_state.parent is not None:
        parent = current_state.parent
        advanced_doc1 = current_state.idx1 > parent.idx1
        advanced_doc2 = current_state.idx2 > parent.idx2

        if advanced_doc1 and advanced_doc2:
            alignment.append(
                (doc1[current_state.idx1 - 1], doc2[current_state.idx2 - 1])
            )
        elif advanced_doc1:
            alignment.append((doc1[current_state.idx1 - 1], "-"))
        elif advanced_doc2:
            alignment.append(("-", doc2[current_state.idx2 - 1]))

        current_state = parent

    # Steps were collected goal-to-start; flip them into document order.
    alignment.reverse()
    return alignment

def detect_plagiarism(doc1, doc2, threshold=0.5):
    """Report sentence pairs from two documents that look plagiarized.

    Both documents are split into sentences, aligned with
    ``a_star_search``, and every aligned pair is compared by edit distance.
    A pair is reported when its distance is strictly below ``threshold``
    times the length of the sentence from ``doc1``. Each comparison is
    printed as it is made.

    Args:
        doc1: First document as a raw string.
        doc2: Second document as a raw string.
        threshold: Maximum edit distance, expressed as a fraction of the
            length of the ``doc1`` sentence, for a pair to count as
            plagiarized. Defaults to 0.5.

    Returns:
        A list of ``(sentence1, sentence2)`` tuples of preprocessed
        sentences judged similar. Empty if nothing qualifies or if no
        alignment was found.
    """
    preprocessed_doc1 = preprocess_text(doc1)
    preprocessed_doc2 = preprocess_text(doc2)
    goal_state = a_star_search(preprocessed_doc1, preprocessed_doc2)

    if goal_state is None:
        print("No alignment found.")
        return []

    alignment = backtrack_alignment(goal_state, preprocessed_doc1, preprocessed_doc2)

    plagiarism_detected = []
    for sentence1, sentence2 in alignment:
        # "-" is the gap marker emitted by backtrack_alignment for a skipped
        # sentence; such entries have nothing to compare against.
        if sentence1 == "-" or sentence2 == "-":
            continue
        dist = levenshtein_distance(sentence1, sentence2)
        print(f"Comparing: '{sentence1}' with '{sentence2}', Edit Distance: {dist}")
        if dist < len(sentence1) * threshold:
            plagiarism_detected.append((sentence1, sentence2))

    return plagiarism_detected

# Demo runs: sample documents and the scenarios they are used in.
doc1 = "This is a sample document. It contains several sentences. This is an example for testing."
doc2 = "This is a sample text. It includes multiple sentences. This is a test case example."
doc3 = "Another completely different document. Nothing similar here."
doc4 = "This is a sample document. Another random sentence. This is an example for testing."

# Scenario 1 compares a document against itself.
doc1_identical = doc1
doc2_identical = doc1

# Each entry is (heading to print, first document, second document).
demo_cases = [
    # Expected: all sentences should align with zero edit distance.
    ("Test Case 1: Identical Documents", doc1_identical, doc2_identical),
    # Expected: low edit distance for most sentences.
    ("\nTest Case 2: Slightly Modified Document", doc1, doc2),
    # Expected: high edit distance, indicating no plagiarism.
    ("\nTest Case 3: Completely Different Documents", doc1, doc3),
    # Expected: some overlapping content with low edit distance.
    ("\nTest Case 4: Partial Overlap", doc1, doc4),
]

for heading, first_document, second_document in demo_cases:
    print(heading)
    print(detect_plagiarism(first_document, second_document))


Test Case 1: Identical Documents
Comparing: 'this is a sample document' with 'this is a sample document', Edit Distance: 0
Comparing: 'it contains several sentences' with 'it contains several sentences', Edit Distance: 0
Comparing: 'this is an example for testing' with 'this is an example for testing', Edit Distance: 0
[('this is a sample document', 'this is a sample document'), ('it contains several sentences', 'it contains several sentences'), ('this is an example for testing', 'this is an example for testing')]

Test Case 2: Slightly Modified Document
Comparing: 'it contains several sentences' with 'this is a sample text', Edit Distance: 21
Comparing: 'it contains several sentences' with 'it includes multiple sentences', Edit Distance: 14
Comparing: 'this is an example for testing' with 'it includes multiple sentences', Edit Distance: 22
Comparing: 'this is an example for testing' with 'this is a test case example', Edit Distance: 19
[('it contains several sentences', 'it includes m