In [1]:
from typing import Iterable
from pprint import pprint
import numpy as np
from collections import Counter
from math import inf

In [2]:
def lc_substring(x, y):
    """Return the length of the longest common contiguous run of x and y.

    Both arguments are sequences (strings or lists) whose items are compared
    with ``==``. The result is 0 when nothing matches or either input is empty.
    """
    width = len(y) + 1
    best = 0
    # prev_row[j] = length of the common run ending at the previous item of x
    # and at y[j-1]; only the previous row is needed to extend a run.
    prev_row = [0] * width

    for x_item in x:
        row = [0] * width
        for j, y_item in enumerate(y, start=1):
            if x_item == y_item:
                row[j] = prev_row[j - 1] + 1
                if row[j] > best:
                    best = row[j]
        prev_row = row

    return best

In [3]:
def n_gram(tokens, n):
    """Count every window of ``n`` consecutive items in ``tokens``.

    The key used for a window depends on the input:
      * ``tokens`` is a ``str``            -> the substring itself;
      * the window's first item is a ``str`` -> the items joined with a space;
      * anything else                      -> a tuple of the items.

    Returns a ``Counter`` mapping each key to its number of occurrences
    (empty when ``tokens`` has fewer than ``n`` items).
    """
    counts = Counter()
    is_text = type(tokens) is str

    for start in range(len(tokens) - n + 1):
        window = tokens[start:start + n]
        if is_text:
            key = window
        elif type(tokens[start]) is str:
            key = ' '.join(window)
        else:
            key = tuple(window)
        counts[key] += 1

    return counts

# Character-level bigrams: a plain string is sliced directly, so each key is a 2-character substring.
n_gram('AGCTTCGA', 2)

Counter({'AG': 1, 'GC': 1, 'CT': 1, 'TT': 1, 'TC': 1, 'CG': 1, 'GA': 1})

In [4]:
# Word-level bigrams: a list of string tokens yields keys made of two tokens joined by a space.
n_gram("Mężny bądź, chroń pułk twój i sześć flag".split(), 2)

Counter({'Mężny bądź,': 1,
         'bądź, chroń': 1,
         'chroń pułk': 1,
         'pułk twój': 1,
         'twój i': 1,
         'i sześć': 1,
         'sześć flag': 1})

In [5]:

def dice_metric(x, y, n=2):
    """Dice distance between the sets of distinct n-grams of ``x`` and ``y``.

    Computed as ``1 - 2 * |A & B| / (|A| + |B|)`` where A and B are the
    distinct n-grams of each input (occurrence counts are ignored).
    Returns ``inf`` when neither input produces any n-gram.
    """
    grams_x = n_gram(x, n)
    grams_y = n_gram(y, n)

    total = len(grams_x) + len(grams_y)
    if total == 0:
        return inf

    shared = grams_x.keys() & grams_y.keys()
    return 1 - 2 * len(shared) / total

def cosine_metric(x, y, n=2):
    """Cosine distance between the n-gram count vectors of ``x`` and ``y``.

    Each input is turned into a vector of n-gram occurrence counts; the result
    is ``1 - (a . b) / (|a| * |b|)``, i.e. 0 for vectors pointing in the same
    direction and 1 for inputs sharing no n-gram.

    Returns ``inf`` when either input produces no n-grams (zero-length vector),
    since the cosine is undefined there.
    """
    x_n_gram = n_gram(x, n)
    y_n_gram = n_gram(y, n)

    # Only n-grams present in both inputs contribute to the dot product.
    dot = sum(x_n_gram[g] * y_n_gram[g] for g in x_n_gram.keys() & y_n_gram.keys())

    def _norm(ngram):
        # Euclidean length: square root of the sum of SQUARED counts.
        return sum(v ** 2 for v in ngram.values()) ** 0.5

    denominator = _norm(x_n_gram) * _norm(y_n_gram)
    if denominator == 0:
        return inf

    # Counts are non-negative, so the exact value lies in [0, 1]; the clamp only
    # removes tiny negative results caused by floating-point rounding.
    return max(0.0, 1 - dot / denominator)

def euclidean_metric(x, y, n=2):
    """Euclidean distance between the n-gram count vectors of ``x`` and ``y``.

    An n-gram missing from one input counts as 0 there. Returns 0.0 when
    neither input produces any n-gram.
    """
    counts_x = n_gram(x, n)
    counts_y = n_gram(y, n)

    squared_distance = 0
    for gram in counts_x.keys() | counts_y.keys():
        difference = counts_x.get(gram, 0) - counts_y.get(gram, 0)
        squared_distance += difference ** 2

    return squared_distance ** 0.5

In [6]:
# Sanity check of every measure on two character sequences that share no bigram.
# Note: lc_substring reports a match length (larger = more alike), while the
# other three report a distance (larger = less alike).
x = list('AGCTTCGA')
y = list('ABCDEFGH')
for metric in (lc_substring, dice_metric, cosine_metric, euclidean_metric):
    print('{:>20} {:<5.2f}'.format(metric.__name__, metric(x, y)))



        lc_substring 1.00 
         dice_metric 1.00 
       cosine_metric 1.00 
    euclidean_metric 3.74 


In [7]:
def stop_list_from_counted(counted, freq=0.1):
    """Pick the words of ``counted`` that are frequent enough to be stop words.

    ``counted`` maps word -> occurrence count.
    ``freq`` below 1 is a relative threshold (share of all occurrences);
    ``freq`` of 1 or more is an absolute threshold on the raw count.
    Returns the set of words meeting the threshold.
    """
    if freq < 1:
        total = sum(counted.values())
        return {word for word, count in counted.items() if count / total >= freq}

    return {word for word, count in counted.items() if count >= freq}

def get_stop_list(text='', freq=0.1):
    """Build a stop list from raw text split on whitespace.

    ``freq`` is forwarded unchanged to ``stop_list_from_counted``.
    """
    return stop_list_from_counted(Counter(text.split()), freq=freq)

def get_stop_list_prepared(prepared, freq=0.1):
    """Build a stop list from already tokenised lines.

    ``prepared`` is an iterable of token lists; tokens are counted across all
    lines and ``freq`` is forwarded unchanged to ``stop_list_from_counted``.
    """
    word_counts = Counter(word for line in prepared for word in line)
    return stop_list_from_counted(word_counts, freq=freq)

In [8]:
from sklearn.cluster import DBSCAN

def get_text(clustering, texts=None):
    """Group texts by cluster label.

    Parameters
    ----------
    clustering : sequence of int
        One label per text; ``-1`` marks noise and is skipped.
    texts : sequence, optional
        Items to group, indexed in step with ``clustering``. Defaults to the
        module-level ``data`` so existing one-argument calls keep working.

    Returns
    -------
    list of lists
        ``result[label]`` holds the texts assigned to that label. The list is
        empty when ``clustering`` is empty or contains only noise.
    """
    if texts is None:
        texts = data

    # default=-1 keeps an empty labelling from raising in max().
    n_clusters = max(clustering, default=-1) + 1
    clusters = [[] for _ in range(n_clusters)]

    for i, label in enumerate(clustering):
        if label != -1:
            clusters[label].append(texts[i])
    return clusters

def print_clusters(clustering):
    """Print the members of the first five clusters of ``clustering``.

    Clusters are obtained from ``get_text``; each member is printed on its own
    ``print`` call and every cluster is followed by a starred separator.
    """
    separator = '\n*******************\n'
    shown = get_text(clustering)[:5]

    for members in shown:
        for member in members:
            print(member)
        print(separator)

from copy import deepcopy
# Module-level state shared between cells; both names are (re)assigned by prepare_data().
# `data` is read by get_text() and run_tests(), `data_prepared` by run_tests().
data = None
data_prepared = None
def prepare_data(n=None, stop_list=False, freq=0.0001):
    """Load ``lines.txt`` into the module-level ``data`` / ``data_prepared``.

    ``data`` receives the raw lines (line endings kept), limited to the first
    ``n`` when ``n`` is given. ``data_prepared`` receives the same lines split
    on whitespace. With ``stop_list=True`` a stop list is built from the
    tokens using ``freq`` (see ``stop_list_from_counted``), printed together
    with its size, and its words are removed from ``data_prepared``.
    """
    global data, data_prepared

    with open('lines.txt') as source:
        data = source.readlines()

    if n is not None:
        data = data[:n]

    data_prepared = [line.split() for line in data]

    if not stop_list:
        return

    prepared_stop_list = get_stop_list_prepared(data_prepared, freq=freq)
    print(len(prepared_stop_list), prepared_stop_list)
    data_prepared = [
        [word for word in tokens if word not in prepared_stop_list]
        for tokens in data_prepared
    ]


In [9]:
def run_tests(tests):
    """Cluster the loaded lines with DBSCAN once per metric and print the results.

    ``tests`` maps a metric function (taking two token lists) to the ``eps``
    value used for it. DBSCAN is fed a single column of row indices; the
    wrapper metric turns each pair of indices back into the matching entries
    of ``data_prepared``. After all runs, the clusters found by each metric
    are printed under a banner with the metric's name.
    """
    def on_indices(metric_func):
        # Adapt a metric on token lists to the index rows DBSCAN passes in.
        def metric(row_a, row_b):
            first, second = int(row_a[0]), int(row_b[0])
            return metric_func(data_prepared[first], data_prepared[second])
        return metric

    labels_by_metric = {}
    for metric_func, eps in tests.items():
        indices = np.arange(len(data)).reshape(-1, 1)
        model = DBSCAN(metric=on_indices(metric_func), eps=eps, min_samples=2)
        labels_by_metric[metric_func] = model.fit_predict(indices)

    for func, clustering in labels_by_metric.items():
        print("\n\n###################################\n\n")
        print(func.__name__, end = '\n\n#######################\n\n')
        print_clusters(clustering)

In [13]:
# Metric function -> DBSCAN `eps` used for that metric by run_tests().
tests = {
    euclidean_metric : 1.3,
    cosine_metric : 0.8,
    dice_metric : 0.65,
}
# Baseline run: first 300 lines of lines.txt, no stop-list filtering.
prepare_data(n=300, stop_list=False)
run_tests(tests)



###################################


euclidean_metric

#######################

"ELECTROGROUP" (OOO),190068.RUSSIA,SAINT-PETERSBURG,UL.BOLSHAYA PODYACHESKAYA,5,LIT.A,POM.4-N

"ELECTROGROUP" (OOO),190068.RUSSIA,SAINT-PETERSBURG,UL.BOLSHAYA  PODYACHESKAYA,5,LIT.A,POM.4-N


*******************

"EXPRESS CO. LTD." RUSSIA 155101 IVANOVSKAYA REGION, LEZHNEVSKIY RAION, D. KOROVIHA, CENTRALNAYA STR. 4ARUSSIA

"EXPRESS CO. LTD."  RUSSIA 155101 IVANOVSKAYA REGION, LEZHNEVSKIY RAION, D. KOROVIHA, CENTRALNAYA STR. 4ARUSSIA


*******************

"Goldens" LLC

1.AS CONSIGNEE.

1.MCT

1. MCT

1.MCT 2.VKASPB@GMAIL.COM

1 MCT

ACERINOX S.A.


*******************

"SEVROLL-SYSTEM" SP.Z O.O. PLAC CZERWCA 1976 ROKU NR 1B  02-495 WARSZAWA URSUS  TEL: (022) 312-31-39

"SEVROLL-SYSTEM" SP.Z O.O. PLAC CZERWCA 1976 ROKU NR 1B  02-495 WARSZAWA URSUS   TEL: (022) 312-31-39


*******************

1)AFTRANS LLC-512913 ZIP:111116,MOSCOW, ENERGETICHESKAYA STR., 6,INN 7729418215  TEL/FAX +7(495)742-53-82 SHULEVA

In [15]:
# Metric function -> DBSCAN `eps` used for that metric by run_tests().
tests = {
    euclidean_metric : 1.3,
    cosine_metric : 0.8,
    dice_metric : 0.65,
}
# Same 300 lines, but tokens whose share of all token occurrences is >= 0.003 are removed first.
prepare_data(n=300, stop_list=True, freq=0.003)
# Disabled debugging aid: prints each filtered line next to its original.
# for l_o ,l in zip(data, data_prepared):
#     print(' '.join(l))
#     print(l_o)
#     print()
run_tests(tests)

32 {'81-368', 'LTD', 'SHIPPING', 'RUSSIA', 'STREET,', '(POLSKA)', 'POLAND', '+7', 'WARSZAWA', 'A.HARTRODT', '812', 'LLC', 'STR.,', 'SAINT-PETERSBURG,', 'MOSCOW,', 'TEL.', 'UL.PULASKIEGO', 'TEL', 'O.O.', 'COMPANY', 'PETERSBURG,', '1)', 'INN', 'AS', 'LIMITED', 'OFFICE', '5', 'SP.', 'TEL:', 'SAINT', 'RUSSIA,', 'GDYNIA'}


###################################


euclidean_metric

#######################

"ELECTROGROUP" (OOO),190068.RUSSIA,SAINT-PETERSBURG,UL.BOLSHAYA PODYACHESKAYA,5,LIT.A,POM.4-N

"ELECTROGROUP" (OOO),190068.RUSSIA,SAINT-PETERSBURG,UL.BOLSHAYA  PODYACHESKAYA,5,LIT.A,POM.4-N


*******************

"EXPRESS CO. LTD." RUSSIA 155101 IVANOVSKAYA REGION, LEZHNEVSKIY RAION, D. KOROVIHA, CENTRALNAYA STR. 4ARUSSIA

"EXPRESS CO. LTD."  RUSSIA 155101 IVANOVSKAYA REGION, LEZHNEVSKIY RAION, D. KOROVIHA, CENTRALNAYA STR. 4ARUSSIA


*******************

"Goldens" LLC

"SAME AS CONSIGNEE"

1.AS CONSIGNEE.

1.MCT

1. MCT

1.MCT 2.VKASPB@GMAIL.COM

1.SAME AS CONSIGNEE

1)SAME AS CONSIGNEE

1 