In [32]:
import re
from typing import List, Tuple

def tokenize(text: str) -> List[str]:
    # Tokenize while keeping whitespace, especially newlines
    tokens = re.split(r'(\s+)', text)
    tokens = [t for t in tokens if t.strip() != '' or t == '\n' or t.isspace()]
    return tokens

def diff_words(old_words: List[str], new_words: List[str]) -> List[Tuple[str, str]]:
    print(old_words)
    print(new_words)
    # Initialize the DP table
    dp = [[0] * (len(new_words) + 1) for _ in range(len(old_words) + 1)]

    # Fill the DP table for LCS
    for i in range(1, len(old_words) + 1):
        for j in range(1, len(new_words) + 1):
            if old_words[i - 1] == new_words[j - 1]:
                dp[i][j] = dp[i - 1][j - 1] + 1
            else:
                dp[i][j] = max(dp[i - 1][j], dp[i][j - 1])

    # Backtrack to get the LCS
    lcs = []
    i, j = len(old_words), len(new_words)
    while i > 0 and j > 0:
        if old_words[i - 1] == new_words[j - 1]:
            lcs.insert(0, (old_words[i - 1], i - 1, j - 1))
            i -= 1
            j -= 1
        elif dp[i - 1][j] > dp[i][j - 1]:
            i -= 1
        else:
            j -= 1

    # Build the result with highlights
    result = []
    old_index = new_index = lcs_index = 0

    while old_index < len(old_words) or new_index < len(new_words):
        if (lcs_index < len(lcs) and 
            old_index == lcs[lcs_index][1] and 
            new_index == lcs[lcs_index][2]):
            result.append(('common', lcs[lcs_index][0]))
            old_index += 1
            new_index += 1
            lcs_index += 1
        else:
            if old_index < len(old_words) and (lcs_index >= len(lcs) or old_index < lcs[lcs_index][1]):
                result.append(('deleted', old_words[old_index]))
                old_index += 1
            if new_index < len(new_words) and (lcs_index >= len(lcs) or new_index < lcs[lcs_index][2]):
                result.append(('added', new_words[new_index]))
                new_index += 1

    return result

In [36]:
old_answer = "Before cataract surgery, you’ll usually be asked not to eat or drink anything for at least 6 hours beforehand to reduce the risk of complications from anesthesia. If your surgery is in the morning, you’ll likely need to skip breakfast. Some clinics allow small sips of water or clear fluids up to 2 hours before. Always follow the exact instructions your doctor or surgical team gives you—they’ll tell you what’s safe for your specific case. If you’re unsure, call the clinic to confirm. Additionally, if you have any medical conditions or take medications, discuss these with your doctor as they may have specific guidelines for you."

new_answer = """
- You’ll usually be asked not to eat or drink anything for at least 6 hours before cataract surgery to reduce the risk of complications from anesthesia.
- If your surgery is in the morning, you’ll likely need to skip breakfast.
- Some clinics allow small sips of water or clear fluids up to 2 hours before.
- Always follow the exact instructions your doctor or surgical team gives you—they’ll tell you what’s safe for your specific case.
- If you’re unsure, call the clinic to confirm.
- If you have any medical conditions or take medications, discuss these with your doctor as they may have specific guidelines for you.
"""

In [37]:
results = diff_words(tokenize(old_answer), tokenize(new_answer))

['Before', ' ', 'cataract', ' ', 'surgery,', ' ', 'you’ll', ' ', 'usually', ' ', 'be', ' ', 'asked', ' ', 'not', ' ', 'to', ' ', 'eat', ' ', 'or', ' ', 'drink', ' ', 'anything', ' ', 'for', ' ', 'at', ' ', 'least', ' ', '6', ' ', 'hours', ' ', 'beforehand', ' ', 'to', ' ', 'reduce', ' ', 'the', ' ', 'risk', ' ', 'of', ' ', 'complications', ' ', 'from', ' ', 'anesthesia.', ' ', 'If', ' ', 'your', ' ', 'surgery', ' ', 'is', ' ', 'in', ' ', 'the', ' ', 'morning,', ' ', 'you’ll', ' ', 'likely', ' ', 'need', ' ', 'to', ' ', 'skip', ' ', 'breakfast.', ' ', 'Some', ' ', 'clinics', ' ', 'allow', ' ', 'small', ' ', 'sips', ' ', 'of', ' ', 'water', ' ', 'or', ' ', 'clear', ' ', 'fluids', ' ', 'up', ' ', 'to', ' ', '2', ' ', 'hours', ' ', 'before.', ' ', 'Always', ' ', 'follow', ' ', 'the', ' ', 'exact', ' ', 'instructions', ' ', 'your', ' ', 'doctor', ' ', 'or', ' ', 'surgical', ' ', 'team', ' ', 'gives', ' ', 'you—they’ll', ' ', 'tell', ' ', 'you', ' ', 'what’s', ' ', 'safe', ' ', 'for', ' ', '

In [35]:
results

[('common', 'After'),
 ('common', ' '),
 ('common', 'cataract'),
 ('deleted', ' '),
 ('deleted', 'surgery,'),
 ('common', ' '),
 ('deleted', 'do'),
 ('added', 'surgery:'),
 ('added', '\n\n'),
 ('added', 'Do:'),
 ('added', '\n'),
 ('added', '-'),
 ('common', ' '),
 ('deleted', 'rest'),
 ('added', 'Rest'),
 ('common', ' '),
 ('common', 'your'),
 ('common', ' '),
 ('deleted', 'eyes,'),
 ('added', 'eyes'),
 ('added', '\n'),
 ('added', '-'),
 ('common', ' '),
 ('deleted', 'use'),
 ('added', 'Use'),
 ('common', ' '),
 ('common', 'prescribed'),
 ('common', ' '),
 ('common', 'eye'),
 ('deleted', ' '),
 ('deleted', 'drops,'),
 ('common', ' '),
 ('deleted', 'and'),
 ('added', 'drops'),
 ('added', '\n'),
 ('added', '-'),
 ('common', ' '),
 ('deleted', 'wear'),
 ('added', 'Wear'),
 ('common', ' '),
 ('common', 'the'),
 ('common', ' '),
 ('common', 'protective'),
 ('common', ' '),
 ('common', 'shield'),
 ('common', ' '),
 ('common', 'as'),
 ('common', ' '),
 ('deleted', 'advised.'),
 ('added', 'adv