In [1]:
import polars as pl

In [2]:
import re


def find_ngrams(text: str, number: int=3) -> set:
    """
    Collect the distinct character n-grams of every word in ``text``.

    The text is lower-cased and split on runs of non-word characters. Each
    resulting word is padded with two leading spaces and one trailing space
    before being sliced into windows of ``number`` characters.

    :param text: the string to extract n-grams from; a falsy value yields an empty set
    :param number: window length of each n-gram. defaults to 3 (trigrams)
    :return: set of n-gram strings
    """
    if not text:
        return set()

    # Splitting on \W+ can leave empty fragments at the edges; drop them.
    tokens = (tok for tok in re.split(r'\W+', text.lower()) if tok.strip())
    padded_words = [f'  {tok} ' for tok in tokens]

    # Slide a window of `number` characters across each padded word.
    return {
        padded[start:start + number]
        for padded in padded_words
        for start in range(len(padded) - number + 1)
    }


def similarity(text1: str, text2: str, number: int=3) -> float:
    """
    Finds the similarity between 2 strings using ngrams.

    The score is the number of ngrams the two strings share divided by the
    number of distinct ngrams across both, so it ranges from 0 (nothing in
    common) to 1 (identical ngram sets).

    :param text1: first string to compare
    :param text2: second string to compare
    :param number: the length the ngrams should be. defaults to 3 (trigrams)
    :return: similarity score in [0, 1]; 0.0 when neither string yields any
        ngrams (e.g. both are empty or contain only punctuation)
    """

    ngrams1 = find_ngrams(text1, number)
    ngrams2 = find_ngrams(text2, number)

    num_unique = len(ngrams1 | ngrams2)
    if num_unique == 0:
        # Nothing to compare: avoid dividing by zero and report "no similarity".
        return 0.0

    num_equal = len(ngrams1 & ngrams2)

    return num_equal / num_unique


def only_similarity(text1: set, text2: set, number: int=3) -> float:
    """
    Finds the similarity between 2 already-computed ngram sets.

    Unlike ``similarity``, this does not extract ngrams itself: both
    arguments must already be sets (they are combined with ``|`` and ``&``).
    The score is the size of the intersection divided by the size of the
    union, 0 being disjoint sets and 1 being equal sets.

    :param text1: first set of ngrams
    :param text2: second set of ngrams
    :param number: unused; kept so the signature stays parallel to ``similarity``
    :return: similarity score in [0, 1]; 0.0 when both sets are empty
    """

    num_unique = len(text1 | text2)
    if num_unique == 0:
        # Two empty sets: avoid dividing by zero and report "no similarity".
        return 0.0

    num_equal = len(text1 & text2)

    return num_equal / num_unique

In [5]:
# Load the duplicate-candidate pairs and order them by ascending 'sim' score.
df = (
    pl.read_csv('08andmore_duplicates.csv')
    .sort('sim')
)
df

id,idn,sim
i64,i64,f64
12687,9915,0.6
125950,98134,0.6
165158,9632,0.6
406459,217535,0.6
1190667,612441,0.6
…,…,…
1955500,1884303,0.795918
2011395,1353392,0.7962963
1451928,797702,0.797386
2258124,1988484,0.7979798


In [7]:
# Flatten the first two columns of every row into one space-separated string.
# Each pair is emitted as ' <row[0]> <row[1]>', so the result keeps its leading
# space. A single join avoids re-copying the growing string on every row.
final = ''.join(f' {row[0]} {row[1]}' for row in df.iter_rows())
final

' 12687 9915 125950 98134 165158 9632 406459 217535 1190667 612441 1784836 1665653 2478569 615957 1772793 1683171 2126809 1568387 2417845 801913 2292204 2113806 1270969 1267297 9624 6563 1695583 1482709 2056624 463937 125834 12814 9741 5308 1906171 1772266 587543 238898 9637 5138 12656 106 1938403 1820120 2078330 1820120 1470739 1362467 112898 104196 297871 288634 1131946 1355 1859013 1686689 2507024 2405867 2466914 1510427 2511700 2448055 321749 317599 516840 395298 619789 100281 1195562 287942 1197412 906910 866515 648348 225570 96119 225570 187703 285139 73 2210569 1765989 1729703 1694630 965479 681271 628011 77119 1587047 1510427 9741 6616 1695693 406286 2488440 2109320 2466914 2409732 6563 5376 587543 385850 965479 443032 787116 377134 29478 9731 1240231 994730 1382404 129974 806530 504999 1648603 1466729 1597199 1084422 615368 12850 673591 402496 1197513 583050 591403 372027 48722 12420 1470409 9995 1863113 1496641 276051 77119 681271 443032 1951362 1920340 2266630 2214979 203852

In [8]:
# Write the frame held in `df` out to a new CSV file.
df.write_csv('06_08similarity.csv')
