In [1]:
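# Compatibility shim so that pymorphy can run on Python 3.10+ interpreters
# (it restores `inspect.getargspec` when the interpreter no longer provides it).
# Alternatively, Python can be downgraded to a version earlier than 3.10.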
import inspect
from collections import namedtuple

if not hasattr(inspect, "getargspec"):
    # Only patch interpreters that lack the legacy API; an existing
    # `inspect.getargspec` is left untouched.
    FullArgSpec = inspect.getfullargspec
    ArgSpec = namedtuple('ArgSpec', ['args', 'varargs', 'keywords', 'defaults'])

    def getargspec(func):
        """Return a legacy-style four-field ``ArgSpec`` for ``func``.

        The values come from ``inspect.getfullargspec``; its ``varkw`` entry
        is exposed under the legacy field name ``keywords``. The remaining
        fields of the full spec are not carried over.
        """
        full = FullArgSpec(func)
        return ArgSpec(
            args=full.args,
            varargs=full.varargs,
            keywords=full.varkw,
            defaults=full.defaults,
        )

    # Publish the replacement on the module so third-party code that calls
    # `inspect.getargspec` finds it.
    inspect.getargspec = getargspec

In [None]:
import json
from sentence_transformers import SentenceTransformer
from pathlib import Path
from typing import List, Dict
import re
import numpy as np
from rapidfuzz import fuzz
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer
from pymorphy2 import MorphAnalyzer

In [None]:
# Fetch the NLTK "punkt" resource without printing download progress.
# NOTE(review): none of the visible cells calls an NLTK tokenizer — confirm
# that this download is still required.
nltk.download("punkt", quiet = True)

In [4]:
# Shared text-normalisation helpers used by `tokens`:
# a Snowball stemmer configured for Russian ("stem" mode) ...
stemmer = SnowballStemmer("russian")
# ... and a pymorphy2 analyzer whose first parse supplies the normal form ("lemma" mode).
morph = MorphAnalyzer()

In [5]:
# Sentence-embedding model used by `compare_bert_embedding`.
# NOTE(review): "all-MiniLM-L6-v2" is presumably trained mostly on English text,
# while the compared fields have Russian names — verify that it separates
# Russian inputs well, or consider a multilingual checkpoint.
bert = SentenceTransformer("all-MiniLM-L6-v2")

In [6]:
# Keys whose values are compared through sentence embeddings
# (`compare_bert_embedding`); `split_rest` removes them before the
# text-based comparisons.
emb_keys = ["Наименование продукта", "Описание контракта"]

In [7]:
def crop_json(obj: Dict):
    """Return a copy of ``obj`` cut off right after the 'Фамилии' key.

    Entries are kept in their original order up to and including the first
    'Фамилии' key. When that key is absent, every entry is kept.
    """
    pairs = list(obj.items())
    cutoff = len(pairs)
    for position, (key, _value) in enumerate(pairs):
        if key == 'Фамилии':
            # Include the marker key itself, drop everything after it.
            cutoff = position + 1
            break
    return dict(pairs[:cutoff])

In [8]:
def normalize_arrays(obj):
    """Recursively turn every list into an order-independent canonical form.

    * A list becomes a sorted list of the distinct JSON serialisations of its
      elements (duplicates collapse, original order is discarded).
    * A dict is rebuilt with each value normalised.
    * Any other value is returned unchanged.

    Each list element is normalised *before* it is serialised, so lists nested
    inside list elements (e.g. ``[{"x": [2, 1]}]``) are canonicalised as well.
    Without that step, two inputs that differ only in the order of such an
    inner list would produce different results.
    """
    if isinstance(obj, list):
        return sorted({
            json.dumps(normalize_arrays(x), sort_keys=True, ensure_ascii=False)
            for x in obj
        })
    if isinstance(obj, dict):
        return {k: normalize_arrays(v) for k, v in obj.items()}
    return obj

In [19]:
def load_json(fp: Path) -> dict:
    """Read a UTF-8 JSON file and prepare it for comparison.

    The parsed object is passed through ``crop_json`` and then
    ``normalize_arrays``; the result of the latter is returned.
    """
    # Use a context manager so the file handle is closed deterministically
    # instead of being left to the garbage collector.
    with open(fp, encoding="utf-8") as fh:
        obj = json.load(fh)
    obj = crop_json(obj)
    obj = normalize_arrays(obj)
    return obj

In [20]:
def split_rest(j1, j2):
    """Serialise both mappings with the embedding fields removed.

    Keys listed in ``emb_keys`` are dropped from each mapping; what remains is
    dumped to JSON text with sorted keys and non-ASCII characters kept as-is.

    Returns a ``(text_for_j1, text_for_j2)`` tuple.
    """
    def _without_emb(mapping):
        # Keep only the entries that are not handled by the embedding comparison.
        remainder = {}
        for key, value in mapping.items():
            if key in emb_keys:
                continue
            remainder[key] = value
        return remainder

    first = _without_emb(j1)
    second = _without_emb(j2)
    first_text = json.dumps(first, sort_keys=True, ensure_ascii=False)
    second_text = json.dumps(second, sort_keys=True, ensure_ascii=False)
    return first_text, second_text

In [21]:
# Token pattern: runs of two or more Latin letters, Cyrillic letters
# (including Ё/ё) or digits. Single characters and punctuation never match.
_TOKEN_RE = re.compile(r"[A-Za-zА-Яа-яЁё0-9]{2,}")

In [22]:
def tokens(text: str, mode: str = "stem") -> List[str]:
    """Split ``text`` into normalised word tokens.

    The text is lower-cased and matched against ``_TOKEN_RE``; each match is
    then reduced according to ``mode``:

    * ``"stem"``  — stemmed with the module-level ``stemmer``;
    * ``"lemma"`` — replaced by the normal form of the first ``morph`` parse.

    Raises:
        ValueError: if ``mode`` is neither ``"stem"`` nor ``"lemma"``.
    """
    words = _TOKEN_RE.findall(text.lower())
    if mode not in ("stem", "lemma"):
        raise ValueError("Mode must be 'stem' or 'lemma'")
    if mode == "stem":
        return list(map(stemmer.stem, words))
    return [morph.parse(word)[0].normal_form for word in words]

In [35]:
def compare_bow_similarity(js1, js2, mode="stem"):
    """Cosine similarity of TF-IDF bag-of-words vectors of two records.

    Both records are first reduced by ``split_rest`` (embedding keys removed,
    remainder dumped to JSON text). The two texts are tokenised with
    ``tokens(text, mode)`` and vectorised with a TF-IDF model fitted on just
    these two texts.

    Args:
        js1, js2: mappings to compare.
        mode: token normalisation passed to ``tokens`` ("stem" or "lemma").

    Returns:
        The cosine similarity of the two TF-IDF rows as a ``float``.
    """
    s1, s2 = split_rest(js1, js2)
    vectorizer = TfidfVectorizer(
        tokenizer=lambda s: tokens(s, mode),
        # A custom tokenizer makes the default `token_pattern` unused;
        # passing None avoids scikit-learn's "token_pattern will not be used"
        # warning on every call.
        token_pattern=None,
        # `tokens` already lower-cases its input.
        lowercase=False,
    )
    tfidf = vectorizer.fit_transform([s1, s2])
    return float(cosine_similarity(tfidf[0], tfidf[1])[0, 0])

In [29]:
def compare_fuzzy(js1, js2):
    """Fuzzy similarity of the two records' non-embedding parts.

    The records are serialised by ``split_rest`` and scored with
    ``fuzz.token_sort_ratio``; the score is divided by 100 before it is
    returned.
    """
    left, right = split_rest(js1, js2)
    percent = fuzz.token_sort_ratio(left, right)
    return percent / 100.0

In [37]:
def compare_lcs_len(a, b):
    """Length of the longest common subsequence of sequences ``a`` and ``b``.

    Uses a single rolling row of the dynamic-programming table, so extra
    memory is proportional to ``len(b)``.
    """
    rows, cols = len(a), len(b)
    best = [0] * (cols + 1)
    for r in range(rows):
        # `diagonal` holds the table value for (previous row, previous column).
        diagonal = 0
        for c in range(cols):
            above = best[c + 1]
            if a[r] == b[c]:
                best[c + 1] = diagonal + 1
            else:
                best[c + 1] = max(above, best[c])
            diagonal = above
    return best[cols]

def compare_lcs(js1, js2):
    """LCS-based similarity of the two records' non-embedding parts.

    The records are serialised by ``split_rest``; the character-level LCS
    length of the two texts is divided by the length of the longer text
    (with a floor of 1 so the denominator is never zero).
    """
    left, right = split_rest(js1, js2)
    common = compare_lcs_len(left, right)
    denominator = max(len(left), len(right), 1)
    return common / denominator

In [52]:
def compare_bert_embedding(js1: Dict, js2: Dict):
    """Embedding similarity of the ``emb_keys`` fields of two records.

    For each record the values under ``emb_keys`` (missing keys count as an
    empty string) are stringified and joined with spaces. Both texts are
    encoded by ``bert`` with normalised embeddings, and their dot product,
    clipped to [-1.0, 1.0], is returned as a ``float``.
    """
    def _emb_text(record):
        return " ".join([str(record.get(key, "")) for key in emb_keys])

    texts = [_emb_text(js1), _emb_text(js2)]
    vectors = bert.encode(texts, normalize_embeddings=True, convert_to_numpy=True)
    cosine = np.dot(vectors[0], vectors[1])
    # Clip to keep the result inside the valid cosine range.
    return float(np.clip(cosine, -1.0, 1.0))

In [53]:
# Registry of comparison strategies. Each entry takes the paths of the
# expected and the produced JSON file, loads both through `load_json`, and
# returns a similarity score from the corresponding `compare_*` function.
methods = {
    "embedding" : lambda e,o: compare_bert_embedding(load_json(e), load_json(o)),
    "bow_stem"  : lambda e,o: compare_bow_similarity(load_json(e), load_json(o), "stem"),
    "bow_lemma" : lambda e,o: compare_bow_similarity(load_json(e), load_json(o), "lemma"),
    "fuzzy"     : lambda e,o: compare_fuzzy(load_json(e), load_json(o)),
    "lcs"       : lambda e,o: compare_lcs(load_json(e), load_json(o)),
}

def compare_jsons(exp: Path, out: Path, method: str) -> float:
    """Score two JSON files with the strategy registered under ``method``.

    Args:
        exp: path of the expected JSON file.
        out: path of the produced JSON file.
        method: a key of the module-level ``methods`` registry.

    Raises:
        ValueError: if ``method`` is not a key of ``methods``.
    """
    if method in methods:
        scorer = methods[method]
        return scorer(exp, out)
    raise ValueError(f"Unknown method {method}")

In [58]:
# Sample pair of files: the expected result and the produced output.
exp = Path("expected_4.json")
out = Path("output_4.json")

# Score the pair with every registered method and print one aligned line each.
for method in methods:
    score = compare_jsons(exp, out, method)
    print(method.ljust(10), "=", score)

embedding  = 1.0
bow_stem   = 0.7989489812902651
bow_lemma  = 0.7989489812902651
fuzzy      = 0.7713310580204777
lcs        = 0.7021617293835068
