<a href="https://colab.research.google.com/github/aknip/Coding-Cheatsheets/blob/main/Python-Compare-Text-or-JSON.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Compare Texts, Compare JSONs

Use case: compare and benchmark LLM results.

In [108]:
!pip install numpy python-benedict icecream levenshtein --quiet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m169.4/169.4 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.4/3.4 MB[0m [31m21.2 MB/s[0m eta [36m0:00:00[0m
[?25h

In [29]:
import json
import textwrap
from icecream import ic

# Compare Texts

## 1. Levenshtein Distance

- https://towardsdatascience.com/text-similarity-w-levenshtein-distance-in-python-2f7478986e75#
- Notebook: https://github.com/vatsal220/medium_articles/blob/main/levenshtein_distance/lev_dist.ipynb
- GitHub: the Levenshtein Python C extension module contains functions for fast computation of Levenshtein distance and string similarity: https://github.com/rapidfuzz/Levenshtein

### Two implementations, same results

In [33]:
# First implementation, using numpy

import numpy as np

def levenshtein(seq1, seq2):
    """Return the Levenshtein (edit) distance between two sequences.

    Fills a (len(seq1) + 1) x (len(seq2) + 1) dynamic-programming table in
    which cell [i, j] holds the distance between the first i items of ``seq1``
    and the first j items of ``seq2``. Insertion, deletion and substitution
    each cost 1.

    Args:
        seq1: First sequence, e.g. a string.
        seq2: Second sequence, e.g. a string.

    Returns:
        The distance as a NumPy float (the table is a float array), e.g. ``1.0``.
    """
    # source: https://github.com/vatsal220/medium_articles/blob/main/levenshtein_distance/lev_dist.ipynb
    n_rows = len(seq1) + 1
    n_cols = len(seq2) + 1
    table = np.zeros((n_rows, n_cols))

    # Border cells: reducing a prefix to the empty sequence costs its length.
    table[:, 0] = np.arange(n_rows)
    table[0, :] = np.arange(n_cols)

    for i, item1 in enumerate(seq1, start=1):
        for j, item2 in enumerate(seq2, start=1):
            # Equal items extend the diagonal for free; otherwise it is a substitution.
            substitution = 0 if item1 == item2 else 1
            table[i, j] = min(
                table[i - 1, j] + 1,                 # deletion
                table[i - 1, j - 1] + substitution,  # match / substitution
                table[i, j - 1] + 1,                 # insertion
            )
    return table[-1, -1]

In [None]:
# Second implementation, using the Levenshtein Python C extension module: https://github.com/rapidfuzz/Levenshtein

from Levenshtein import distance

In [127]:
# Run both implementations on the same inputs; each pair of ic() lines should report the same distance.

# Test 1: a single word with one substituted letter
string1 = 'stamp'
string2 = 'stomp'
ic(levenshtein(string1, string2))
ic(distance(string1, string2))

# Test 2: full sentences - differences in punctuation, letter case and whitespace all count as edits
string1 = 'Hello world, a longer example. With two sentences. Punctuation and upper-/lower case count. Spaces, too!'
string2 = 'Hello world, a longer example. With two sentences, punctuation and upper-/lower case count.    Spaces, too!'
ic(levenshtein(string1, string2))
ic(distance(string1, string2))

# Test 3: two dicts serialized to JSON strings; sort_keys=True puts the keys of both
# strings in the same order, so only the changed value (2 vs. 9) adds to the distance
string1 = json.dumps({'dict1': {'foo': 1, 'bar': 2}, 'dict2': {'baz': 'lorem ipsum', 'quux': 3}}, sort_keys = True)
string2 = json.dumps({'dict1': {'bar': 9, 'foo': 1}, 'dict2': {'baz': 'lorem ipsum', 'quux': 3}}, sort_keys = True)
ic(levenshtein(string1, string2))
ic(distance(string1, string2))



ic| levenshtein(string1, string2): 1.0
ic| distance(string1, string2): 1
ic| levenshtein(string1, string2): 5.0
ic| distance(string1, string2): 5
ic| levenshtein(string1, string2): 1.0
ic| distance(string1, string2): 1


1

In [113]:
# Expected ("target") record and the result to compare against it. The result differs in
# key order, in Versicherungssumme ("5000000" vs. "5500000"), in Versicherungsnehmer_Name
# (missing " mbH"), and its first testA item carries an extra key3 entry.
json_target = {"Versicherungssumme": "5000000", "Versicherungsnehmer_Name": "Zorrsen Beteiligungsgesellschaft mbH", "test": {"testA": [{"key1": "value1", "key2": "value2"}, {"key3": "value3", "key4": "value4"}], "testB": 34}}
json_result = {"Versicherungsnehmer_Name": "Zorrsen Beteiligungsgesellschaft", "test": {'testB': 34, "testA": [{"key3": "value3", "key1": "value1", "key2": "value2"}, {"key4": "value4", "key3": "value3"}]}, "Versicherungssumme": "5500000"}

# Serialize with sorted keys so that key order alone does not add to the distance
json_target_str = json.dumps(json_target, sort_keys = True)
json_result_str = json.dumps(json_result, sort_keys = True)

# Both implementations are expected to report the same distance
ic(levenshtein(json_target_str,json_result_str))
ic(distance(json_target_str,json_result_str))

ic| levenshtein(json_target_str,json_result_str): 23.0
ic| distance(json_target_str,json_result_str): 23


23

# Idea: Create a flat text list from JSON

Flatten the JSON into a list of `key: value` strings, sort the list, then compare the results.

In [98]:
from benedict import benedict

# source: https://stackoverflow.com/a/58611501

def flatten_dict (my_dict):
  """Flatten a nested dict into a list of 'key: value' strings.

  The structure is walked with benedict's ``traverse``; every value whose type
  is exactly ``str``, ``int`` or ``float`` produces one entry. Only the key
  that ``traverse`` passes to the callback is used, so the hierarchy (parent
  keys) is dropped. Other values, such as nested dicts and lists themselves,
  produce no entry of their own.

  Args:
    my_dict: The (possibly nested) dict to flatten.

  Returns:
    A new list of strings formatted as 'key: value', in traversal order.
  """
  # Closure-local accumulator: no module-level global is needed because the
  # callback only mutates the list (append) and never rebinds the name.
  leaves = []

  def traverse_item(dct, key, value):
    # Exact type match on purpose: bool is a subclass of int but is skipped
    if type(value) in (str, int, float):
      leaves.append('{}: {}'.format(key, value)) # string template for result

  benedict(my_dict).traverse(traverse_item)
  return leaves

def array_to_sorted_string (string_array='', string_truncation=9999):
  """Sort strings, truncate each one, and join them into a single string.

  Args:
    string_array: Iterable of strings to combine. The default (an empty
      string) yields an empty result.
    string_truncation: Maximum number of characters kept from each string.
      Truncation is applied after sorting, so the order is based on the
      full strings.

  Returns:
    The sorted, truncated strings joined by ', ' (no trailing separator);
    an empty string if there is nothing to join.
  """
  # str.join replaces the manual "append ', ' then cut off the last two characters" loop
  return ', '.join(txt[:string_truncation] for txt in sorted(string_array))

In [106]:
# Demo on a small nested dict: print the flat list, then the sorted one-line string -
# first with the default truncation, then with every entry cut to 20 characters
d = {'dict1': {'foo': 1, 'bar': 2}, 'dict2': {'baz': 'hello very long text here which can be truncated by function', 'quux': 4}}
d_flat = flatten_dict(d)
print(json.dumps(d_flat, indent=2))
print(array_to_sorted_string(string_array=d_flat))
print(array_to_sorted_string(string_array=d_flat, string_truncation=20))

[
  "foo: 1",
  "bar: 2",
  "baz: hello very long text here which can be truncated by function",
  "quux: 4"
]
bar: 2, baz: hello very long text here which can be truncated by function, foo: 1, quux: 4
bar: 2, baz: hello very long, foo: 1, quux: 4


In [107]:
# Same demo on json_result (defined in an earlier cell), which also contains a list of dicts;
# the last line cuts every entry to 50 characters
d_flat = flatten_dict(json_result)
print(json.dumps(d_flat, indent=2))
print(array_to_sorted_string(string_array=d_flat))
print(array_to_sorted_string(string_array=d_flat, string_truncation=50))

[
  "Versicherungsnehmer_Name: Zorrsen Beteiligungsgesellschaft",
  "testB: 34",
  "key3: value3",
  "key1: value1",
  "key2: value2",
  "key4: value4",
  "key3: value3",
  "Versicherungssumme: 5500000"
]
Versicherungsnehmer_Name: Zorrsen Beteiligungsgesellschaft, Versicherungssumme: 5500000, key1: value1, key2: value2, key3: value3, key3: value3, key4: value4, testB: 34
Versicherungsnehmer_Name: Zorrsen Beteiligungsgese, Versicherungssumme: 5500000, key1: value1, key2: value2, key3: value3, key3: value3, key4: value4, testB: 34
