In [1]:
from typing import List, Tuple
from enum import Enum, auto

In [2]:
class Action(Enum):
    ADD = auto()
    DELETE = auto()
    CHANGE = auto()
    NO_CHANGE = auto()

In [3]:
class Edit_table_cell:
    def __init__(self, i, j, parent = None):
        self.cost = i if j == 0 else j if i == 0 else None
        self.parent = parent
        self.action = \
            Action.DELETE if (j == 0 and i > 0) \
            else Action.ADD if (i == 0 and j > 0)\
            else Action.NO_CHANGE

        self.i = i
        self.j = j

    def __repr__(self):
        return str(self.cost)

    def __str__(self):
        return str(self.i) + ", " + str(self.j) + " -> " + str(self.cost) + " " + str(self.action)\
               + "\n" + str(self.parent)

def edit_distance(x, y, delta = lambda x, y : 0 if x == y else 1) -> Tuple[List[List[Edit_table_cell]], int]:
    edit_table = [[Edit_table_cell(i, j) for j in range(len(y) + 1)] for i in range(len(x) + 1)]


    for i in range (1, len(x) + 1):
        edit_table[i][0].parent = edit_table[i-1][0]
        x_i = x[i-1]
        for j in range(1, len(y) + 1):
            if i == 0:
                edit_table[0][j].parent = edit_table[0][j-1]

            y_j = y[j-1]

            cost = min(
                edit_table[i][j - 1].cost + 1,
                edit_table[i - 1][j].cost + 1,
                edit_table[i-1][j-1].cost + delta(x_i, y_j)
            )

            if cost == edit_table[i][j-1].cost + 1:
                cell = edit_table[i][j-1]
                edit_table[i][j].action = Action.ADD

            elif cost == edit_table[i-1][j].cost + 1:
                cell = edit_table[i-1][j]
                edit_table[i][j].action = Action.DELETE
            else:
                cell = edit_table[i-1][j-1]
                edit_table[i][j].action = Action.NO_CHANGE if delta(x_i, y_j) == 0 else Action.CHANGE

            edit_table[i][j].cost = cost
            edit_table[i][j].parent = cell

    return edit_table, edit_table[len(x)][len(y)].cost

def get_edit_sequence(x, y,  delta = lambda x, y : 0 if x == y else 1):
    edit_table, distance = edit_distance(x,y, delta)

    cell = edit_table[len(x)][len(y)]

    sequence = []

    while cell is not None:
        i, j = cell.i, cell.j
        if i == 0 and j == 0:
            break
        sequence.append((i, j, x[i-1], y[j-1], cell.action))
        cell = cell.parent
    # pprint(sequence[::-1])
    return sequence[::-1], distance

def visualize(x, y, delta = lambda x, y : 0 if x == y else 1):
    text = ""
    sequence, distance = get_edit_sequence(x, y, delta)
    print("\n" + x , "to", y, "distance:", distance, "\n")

    for i, j, lx, ly, action in sequence:
        # print(i, j, lx, ly, action)
        if action == Action.NO_CHANGE:
            text += lx

        if action == Action.CHANGE:
            print(text + " {"+ lx + "-->" + ly + "} " + x[j:])
            text += ly

        if action == Action.DELETE:
            print(text + " {--", lx+"} " + x[j+1:])

        if action == Action.ADD:
            print(text + " {++", ly+"} " + x[i:])
            text += ly

In [4]:
def lcs(x, y):
    return (len(x) + len(y) - edit_distance(x, y, lambda x, y: 0 if x==y else 2)[1])/2

In [5]:
test_cases = [('los', 'kloc'),
              ('Łódź', 'Lodz'),
              ('kwintesencja', 'quintessence'),
              ('ATGAATCTTACCGCCTCG', 'ATGAGGCTCTGGCCCCTG') ]

In [6]:
for f, t in test_cases:
    visualize(f, t, lambda x, y: 0 if x==y else 1)
    print("\nLCS",lcs(f, t),"\n")


los to kloc distance: 2 

 {++ k} los
klo {s-->c} 

LCS 2.0 


Łódź to Lodz distance: 3 

 {Ł-->L} ódź
L {ó-->o} dź
Lod {ź-->z} 

LCS 1.0 


kwintesencja to quintessence distance: 5 

 {k-->q} wintesencja
q {w-->u} intesencja
quintes {++ s} encja
quintessenc {j-->e} 
quintessence {-- a} 

LCS 8.0 


ATGAATCTTACCGCCTCG to ATGAGGCTCTGGCCCCTG distance: 7 

ATGA {A-->G} TCTTACCGCCTCG
ATGAG {T-->G} CTTACCGCCTCG
ATGAGGCT {++ C} TACCGCCTCG
ATGAGGCTCT {A-->G} CGCCTCG
ATGAGGCTCTG {++ G} CCGCCTCG
ATGAGGCTCTGGCC {-- G} TCG
ATGAGGCTCTGGCCCCT {-- C} 

LCS 13.0 



In [7]:
from spacy.language import Language
from spacy.tokenizer import Tokenizer
from spacy.vocab import Vocab
from random import random

def remove_tokens(tokens):
    return [t for t in tokens if random() > 0.03 or t.text.isspace()]

with open('romeo-i-julia-700.txt') as file:
    text = file.read()
    vocab = Language(Vocab()).vocab
    tokenizer = Tokenizer(vocab)
    tokens = tokenizer(text)
    text1 = remove_tokens(tokens)
    text2 = remove_tokens(tokens)
    with open('text1.txt', 'w') as new_file:
        for token in text1:
            new_file.write(token.text_with_ws)
    with open('text2.txt', 'w') as new_file:
        for token in text2:
            new_file.write(token.text_with_ws)

In [8]:
def diff(x, y):
    L = [[0 if j == 0 or i == 0 else None for i in range(len(y) + 1)] for j in range(len(x) + 1)]

    for i in range(1, len(x) + 1):
        for j in range(1, len(y) + 1):
            if x[i - 1] == y[j - 1]:
                L[i][j] = L[i - 1][j - 1] + 1
            else:
                L[i][j] = max(L[i - 1][j], L[i][j - 1])

    def print_diff(i, j):
        if i >= 0 and j >= 0 and x[i] == y[j]:
            print_diff(i-1, j-1)

        elif j >= 0 and (i == 0 or L[i][j-1] >= L[i-1][j]):
            print_diff(i, j-1)
            print("x> ["+ str(j)+"]", y[j])

        elif i >= 0 and (j == 0 or L[i][j-1] < L[i-1][j]):
            print_diff(i-1, j)
            print("y< ["+ str(i)+"]", x[i])

    print_diff(len(x) - 1, len(y) - 1)

In [9]:
text1 = open("text1.txt").readlines()
text2 = open("text2.txt").readlines()

diff(text1, text2)

y< [3] tłum. Józef Paszkowski

x> [3] tłum. Paszkowski

y< [10]  * ESKALUS — panujący w Weronie

x> [10]  * ESKALUS — książę panujący w Weronie

y< [13]  * STARZEC — stryjeczny Kapuleta

x> [13]  * STARZEC — stryjeczny brat Kapuleta

y< [20]  * BALTAZAR — służący Romea

x> [20]  * — Romea

y< [28]  * PANI MONTEKI małżonka Montekiego

x> [28]  * PANI MONTEKI — małżonka Montekiego

y< [32]  * Obywatele weroneńscy, różne osoby płci obojej, liczący się do przyjaciół obu domów, maski, straż wojskowa i inne osoby.

x> [32]  * Obywatele weroneńscy, różne osoby płci liczący się do przyjaciół obu domów, maski, straż wojskowa i inne osoby.

y< [37] Rzecz odbywa się przez większą część sztuki w przez część piątego aktu w Mantui.

x> [37] Rzecz odbywa się przez większą część sztuki w Weronie, przez część piątego aktu w Mantui.

y< [43] Przełożył Jan Kasprowicz

x> [43] Przełożył Kasprowicz

y< [46] Tam, gdzie się rzecz rozgrywa, w Weronie,

x> [46] Tam, gdzie się rzecz ta w Weronie,

y< [47] Do no