In [43]:
import heapq
import numpy as np
from math import inf
from tqdm import trange
from scipy.sparse.linalg import norm as sparse_norm
from scipy.spatial import ConvexHull
from scipy.spatial.distance import cdist
from numpy.linalg import norm as dense_norm
from itertools import chain
from collections import Counter
from spacy.lang.en import English
from spacy.tokenizer import Tokenizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import AgglomerativeClustering, DBSCAN
from sklearn.metrics import pairwise_distances
from numba import jit, generated_jit
from time import time

# Ex. 1 using own functions

In [2]:
def to_ngram(text: str, n: int) -> dict:
    vec = {}
    for i in range(len(text)-n+1):
        pos = text[i:i+n]
        vec[pos] = vec.get(pos, 0) + 1
    return vec

In [3]:
t = 'abcdefab'
v = to_ngram(t, 2)
v

{'ab': 2, 'bc': 1, 'cd': 1, 'de': 1, 'ef': 1, 'fa': 1}

In [4]:
%%timeit
to_ngram(t, 2)

1.48 µs ± 36.4 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)


In [5]:
def euc_norm_np(a: dict) -> float:
    vec = np.fromiter(a.values(), dtype=float)
    return np.sqrt(np.sum(np.square(vec)))

In [6]:
%%timeit
euc_norm_np(v)

6.68 µs ± 180 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


In [7]:
def dot_product(a: dict, b: dict) -> float:
    prod = 0
    for index, value in a.items():
        if index in b:
            prod += value * b[index]
    return prod

In [8]:
def cosine_dist(a: dict, b: dict) -> float:
    return 1 - dot_product(a, b) / (euc_norm_np(a) * euc_norm_np(b))

In [9]:
a = to_ngram('abcdef', 2)
b = to_ngram('bcbcbc', 2)
dot_product(a, b)

3

In [10]:
%%timeit
cosine_dist(a, b)

15.2 µs ± 280 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


In [11]:
def normalize_vec(a: dict) -> dict:
    size = euc_norm_np(a)
    return {
        key: val / size
        for key, val in a.items() }

In [12]:
def norm_euc_dist(a: dict, b: dict) -> float:
    a = normalize_vec(a)
    b = normalize_vec(b)
    c = {}
    for key, val in a.items():
        c[key] = c.get(key, 0) + val
    for key, val in b.items():
        c[key] = c.get(key, 0) - val
    return euc_norm_np(c)

In [13]:
def dice_dist(a: dict, b: dict) -> float:
    top = 0
    bottom = len(a) + len(b)
    for item in a:
        if item in b:
            top += 2
    return 1 - top / bottom

In [14]:
norm_euc_dist(a, b)

1.1206210744336147

In [15]:
dice_dist(a, b)

0.7142857142857143

# Using scikit-learn

Using CountVectorizer will make clustering easier later on

In [2]:
def get_documents(file: str) -> list:
    with open(file, 'r') as f:
        data = f.read().split('\n')
    return list(filter(lambda x: len(x) > 0, data))

In [155]:
def vectorize(content: list, n: int, stop_words = None, unit='char'):
    if unit == 'char' and stop_words is not None:
        content = [' '.join(filter(lambda x: x not in stop_words, member.split(' '))) for member in content]
    vectorizer = CountVectorizer(input='content',
                                 lowercase=True,
                                 stop_words=stop_words,
                                 ngram_range=(n, n),
                                 analyzer=unit)
    return vectorizer.fit_transform(content)

In [154]:
def vectorize_file(file: str, n: int, stop_words = None, unit='char'):
    content = get_documents(file)
    return vectorize(content, n, stop_words, unit)

In [5]:
vec = vectorize_file('lines.txt', 1)

In [6]:
vec[1].todense()

matrix([[12,  0,  0,  0,  0,  0,  4,  0,  0,  0,  3,  2,  0,  4, 10,  8,
          0,  0,  5,  3,  2,  4,  4,  2,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  6,  2,  1,  1,  3,  0,  0,  1,  4,  0,  2,  3,  1,  4,  4,
          3,  0,  4,  5,  4,  2,  0,  1,  0,  2,  0,  0]])

In [7]:
def sk_cosine_dist(a, b):
    return 1 - ((a.dot(b.T)) / (sparse_norm(a) * sparse_norm(b))).todense().item()

In [8]:
@jit
def sk_dense_cosine_dist(a, b):
    return 1 - (a @ b.T) / (dense_norm(a) * dense_norm(b)).item()

In [9]:
sk_cosine_dist(vec[1], vec[2])

0.22656300044299682

In [10]:
def sk_euc_dist(a, b):
    a_norm = a / sparse_norm(a)
    b_norm = b / sparse_norm(b)
    return 1 - sparse_norm(a_norm - b_norm)

In [11]:
@jit
def sk_dense_euc_dist(a, b):
    a_norm = a / dense_norm(a)
    b_norm = b / dense_norm(b)
    return dense_norm(a_norm - b_norm)

In [12]:
sk_euc_dist(vec[1], vec[2])

0.32685365566914515

In [13]:
@jit
def sk_dense_dice_dist(a, b):
    a_items = a > 0
    b_items = b > 0
    return 1 - 2 * np.sum((a_items & b_items)) / (np.sum(a_items) + np.sum(b_items))

In [14]:
def sk_dice_dist(a, b):
    return sk_dense_dice_dist(a.todense(), b.todense())

In [15]:
sk_dice_dist(vec[1], vec[2])

0.15625

# Ex. 2 Clustering Quality metrics

In [16]:
def transform_pred(filename, pred):
    raw = get_documents(filename)
    clusters = []
    for i in range(np.max(pred)+1):
        clusters.append([raw[i] for i in np.where(pred == i)[0]])
    return clusters

In [17]:
def transform_to_pred(clusters: list, **vec_kwargs):
    content = sum(map(len, clusters))
    raw = [text for cluster in clusters for text in cluster]
    pred = np.zeros(len(raw))
    pos = 0
    for i, cluster in enumerate(clusters):
        pred[pos: pos + len(cluster)] = i
        pos += len(cluster)
    return vectorize(raw, **vec_kwargs), pred

In [18]:
def from_file(filename):
    clusters = [[]]
    with open(filename, 'r') as f:
        for line in f:
            line = line.replace('\n', '')
            if line == '##########':
                clusters.append([])
            elif len(line) > 0:
                clusters[-1].append(line)
        return clusters

In [19]:
lists = from_file('clusters.txt')
c_content, c_pred = transform_to_pred(lists, n=2)

In [20]:
c_pred

array([   0.,    0.,    0., ..., 3353., 3354., 3355.])

In [128]:
def intracluster_distance(cluster: np.ndarray, metric) -> float:
    if len(cluster.shape) != 2:
        cluster = cluster.reshape(-1, 1)
    return np.max(cdist(cluster, cluster, metric=metric))

In [127]:
def intercluster_distance(cluster_a, cluster_b, metric) -> float:
    if len(cluster_a.shape) != 2:
        cluster_a = cluster_a.reshape(-1, 1)
    if len(cluster_b.shape) != 2:
        cluster_b = cluster_b.reshape(-1, 1)
    return np.min(cdist(cluster_a, cluster_b, metric=metric))

In [23]:
def intracluster_table(items, classes, metric, clusters):
    out = np.zeros(clusters)
    for i in range(clusters):
        out[i] = intracluster_distance(items[classes == i], metric=metric)
    return out

In [24]:
def intercluster_table(items, classes, metric, clusters):
    out = np.zeros((clusters, clusters))
    for i in trange(clusters):
        for j in range(i):
            out[i, j] = intercluster_distance(items[classes == i], items[classes == j], metric)
    return out

In [25]:
def dunn_index(items, classes, metric) -> float:
    bottom = 0
    top = inf
    clusters = int(np.max(classes))
    intra = intracluster_table(items, classes, metric, clusters)
    inter = intercluster_table(items, classes, metric, clusters)
    for i in range(clusters):
        intr = intra[i]
        if intr > bottom:
            bottom = intr
        for j in range(i):
            intr = inter[i, j]
            if intr < top:
                top = intr
    return top / bottom

In [26]:
def db_index(items, classes, metric) -> float:
    total = 0
    clusters = int(np.max(classes))
    intra = intracluster_table(items, classes, metric, clusters)
    inter = intercluster_table(items, classes, metric, clusters)
    for i in range(clusters):
        best = 0
        for j in range(clusters):
            if i == j:
                continue
            score = (intra[i] + intra[j]) / inter[max(i, j), min(i, j)]
            if score > best:
                best = score
        total += best
    return total / clusters

# Ex. 3 stop words

In [27]:
def grab_counter(filename: str, drop_punct: bool = False) -> Counter:
    with open(filename, 'r') as f:
        content = f.read()
    tokenizer = English().tokenizer
    tokens = tokenizer(content)
    words = [
        token.text for token in tokens
        if (not drop_punct) or (not token.is_punct)    
            ]
    return Counter(words)

In [28]:
def get_popular_stopwords(filename: str, size: int, drop_punct: bool = False) -> list:
    counter = grab_counter(filename, drop_punct)
    return [x[0] for x in counter.most_common(size)]

In [29]:
get_popular_stopwords('lines.txt', 10, False)

[',', '-', '\n', '.', ':', ' ', 'LTD', 'CHINA', ')', '(']

In [30]:
get_popular_stopwords('lines.txt', 10, True)

['\n',
 ' ',
 'LTD',
 'CHINA',
 'ROAD',
 'POLAND',
 'LOGISTICS',
 'TEL',
 'OF',
 'CO.,LTD']

In [31]:
def get_least_popular_stopwords(filename: str, size: int, drop_punct: bool = False) -> list:
    counter = grab_counter(filename, drop_punct)
    return [x[0] for x in counter.most_common()[-size:]]

In [32]:
get_least_popular_stopwords('lines.txt', 10)

['750',
 '945',
 '15789',
 'jaromir.witas@zing.com.pl',
 'ZIPP',
 'SKUTERY',
 'SzklanychDomow',
 'Sliwice',
 'SZKLANYCHDOMOW',
 'SLIWICE']

# Ex. 4 & 5 Clustering

### 2 ngrams

In [147]:
vec = vectorize_file('lines.txt', 2)

In [148]:
dense = vec.toarray()
dense.shape

(6751, 2212)

We will use DBSCAN which allows for uknown number of clusters

In [150]:
model = DBSCAN(
    eps=0.2,
    min_samples=1,
    metric=sk_dense_cosine_dist,
    n_jobs=8
)

In [151]:
pred = model.fit_predict(dense)

In [152]:
dunn_index(dense, pred, sk_dense_cosine_dist)

100%|██████████| 3953/3953 [15:18<00:00,  4.31it/s]


0.31986720046479894

In [153]:
db_index(dense, pred, sk_dense_cosine_dist)

100%|██████████| 3953/3953 [16:47<00:00,  3.92it/s]


1.1388442140947126

In [43]:
model = DBSCAN(
    eps=0.2,
    min_samples=3,
    metric=sk_dense_cosine_dist,
    n_jobs=8
)

In [19]:
len(np.unique(pred))

475

In [46]:
def save_to_file(filename: str, content: list, pred: np.ndarray):
    clusters = np.max(pred)
    with open(filename, 'w') as f:
        for cluster in range(clusters):
            for line in np.where(pred == cluster)[0]:
                f.write(content[line] + '\n')
            f.write('########## \n \n')

In [47]:
save_to_file('sol.txt', get_documents('lines.txt'), pred)

## Dunn Index

In simple words, Dunn Index is ratio between smallest intercluster (between different clusters) distance to largest intracluster distance. So we want to maximize this metric (make clusters smaller and further apart)

Cosine dist, minimum samples = 3

In [48]:
dunn_index(dense, pred, sk_dense_cosine_dist)

Compilation is falling back to object mode WITH looplifting enabled because Function "sk_dense_cosine_dist" failed type inference due to: [1m[1mNo implementation of function Function(<built-in function matmul>) found for signature:
 
 >>> matmul(array(int64, 1d, C), array(int64, 1d, C))
 
There are 2 candidate implementations:
[1m  - Of which 2 did not match due to:
  Overload in function 'MatMul.generic': File: numba/core/typing/npydecl.py: Line 994.
    With argument(s): '(array(int64, 1d, C), array(int64, 1d, C))':[0m
[1m   Rejected as the implementation raised a specific error:
     TypingError: [1m'@' only supported on float and complex arrays[0m[0m
  raised from /home/thmtt/.local/lib/python3.9/site-packages/numba/core/typing/npydecl.py:942
[0m
[0m[1mDuring: typing of intrinsic-call at <ipython-input-22-c771f96a904c> (3)[0m
[1m
File "<ipython-input-22-c771f96a904c>", line 3:[0m
[1mdef sk_dense_cosine_dist(a, b):
[1m    return 1 - (a @ b.T) / (dense_norm(a) * dense

0.3206443972512965

Cosine dist, clusters.txt

In [50]:
dunn_index(c_content.todense(), c_pred, sk_dense_cosine_dist)

100%|██████████| 3355/3355 [13:51<00:00,  4.04it/s]


0.002554282587932777

## DB Index

Simply put, DB index is sum of maximum sum of 2 intracluster distances divided by intercluster distance between them. We want to minimize this value

Cosine dist, minimum samples = 3

In [51]:
db_index(dense, pred, sk_dense_cosine_dist)

100%|██████████| 473/473 [01:22<00:00,  5.72it/s]


1.6410049991735118

Cosine dist, clusters.txt

In [52]:
db_index(c_content.todense(), c_pred, sk_dense_cosine_dist)

100%|██████████| 3355/3355 [17:29<00:00,  3.20it/s]


3.215669407345323

## 3 ngrams

In [34]:
vec = vectorize_file('lines.txt', 3)
data = vec.todense()

In [35]:
model = DBSCAN(
    eps=0.2,
    min_samples=3,
    metric='cosine',
    n_jobs=8
)

In [None]:
pred = model.fit_predict(data)

Kernel died for all tries (total of 4, even with sklearn metric)

# Using stopwords

Used stopwords

In [95]:
get_popular_stopwords('lines.txt', 20)

[',',
 '-',
 '\n',
 '.',
 ':',
 ' ',
 'LTD',
 'CHINA',
 ')',
 '(',
 'ROAD',
 '/',
 'POLAND',
 '"',
 'LOGISTICS',
 'TEL',
 'OF',
 'CO.,LTD',
 'CO',
 'RUSSIA']

In [96]:
vec = vectorize_file(
    'lines.txt', 2,
    stop_words=get_popular_stopwords('lines.txt', 20))

In [97]:
dense_stop = vec.todense()
dense_stop.shape

(6751, 2212)

In [101]:
model = DBSCAN(
    eps=0.1,
    min_samples=1,
    metric=sk_dense_cosine_dist,
    n_jobs=8
)

In [102]:
pred = model.fit_predict(dense_stop)

In [103]:
dunn_index(dense_stop, pred, sk_dense_cosine_dist)

100%|██████████| 4859/4859 [22:03<00:00,  3.67it/s]


0.26661129997598526

In [104]:
db_index(dense_stop, pred, sk_dense_cosine_dist)

100%|██████████| 4859/4859 [21:07<00:00,  3.83it/s]


0.6524205814650448

In [166]:
lists = from_file('clusters.txt')
c_content, c_pred = transform_to_pred(lists, n=2, stop_words=get_popular_stopwords('lines.txt', 20))

# Clustering with Levenshtein distance

In [45]:
@jit
def iterative_lev(a, b):
    T = [[0 for _ in range(len(b) + 1)] for _ in range(len(a) + 1)]
    # Fill first iter
    for x in range(len(b) + 1):
        T[0][x] = x
    for x in range(len(a) + 1):
        T[x][0] = x
    # Do the rest
    for x in range(1, len(a) + 1):
        for y in range(1, len(b) + 1):
            T[x][y] = min(
                T[x-1][y] + 1,
                T[x][y-1] + 1,
                T[x-1][y-1] + int(a[x-1] != b[y-1])
            )
    return T, T[-1][-1]

In [74]:
def get_lev(text):
    def lev_wrapper(x, y, text):
        return iterative_lev(text[int(x[0])], text[int(y[0])])[1] / max(len(text[int(x[0])]), len(text[int(y[0])]))
    return lambda x, y: lev_wrapper(x, y, text)

For input >= 1K lines DBSCAN worked for over 30 minutes without solution.

In [63]:
with open('lines.txt', 'r') as f:
    raw_text = f.read()
text = [x for x in raw_text.split(' ') if len(x) > 0]
text = text[:500]

In [64]:
X = np.arange(len(text)).reshape(-1, 1)

In [88]:
model = DBSCAN(
    eps=0.1,
    min_samples=1,
    metric=get_lev(text),
    n_jobs=8
)

In [89]:
pred = model.fit_predict(X)

In [92]:
dunn_index(X, pred, metric=get_lev(text))

100%|██████████| 358/358 [00:04<00:00, 73.68it/s] 


1.375

In [93]:
db_index(X, pred, metric=get_lev(text))

100%|██████████| 358/358 [00:04<00:00, 73.69it/s] 


0.10787333502039155

In [109]:
model = DBSCAN(
    eps=0.1,
    min_samples=3,
    metric=get_lev(text),
    n_jobs=8
)

In [110]:
pred = model.fit_predict(X)

In [112]:
dunn_index(X, pred, metric=get_lev(text))

100%|██████████| 27/27 [00:00<00:00, 158.56it/s]


1.5714285714285714

In [113]:
db_index(X, pred, metric=get_lev(text))

100%|██████████| 27/27 [00:00<00:00, 168.72it/s]


0.12362258595918726

In [141]:
example = from_file('clusters.txt')

In [142]:
text_clusters = [item for member in example for item in member]

In [143]:
text_pred = np.array([i for i, member in enumerate(example) for _ in member])[:500].reshape(-1, 1)

In [144]:
X2 = np.arange(len(text_clusters)).reshape(-1, 1)[:500]

In [146]:
dunn_index(X2, text_pred, metric=get_lev(text_clusters))

100%|██████████| 113/113 [01:55<00:00,  1.02s/it]


0.05718954248366013

In [145]:
db_index(X2, text_pred, metric=get_lev(text_clusters))

100%|██████████| 113/113 [01:52<00:00,  1.00it/s]


4.486728231889996

### Stop words

In [160]:
stop = get_popular_stopwords('lines.txt', 20)
with open('lines.txt', 'r') as f:
    raw_text = f.read()
text = ["".join(filter(lambda y: y not in stop, x.split(" "))) for x in raw_text.split(' ')]
text = [x for x in text if len(x) > 0]
text = text[:500]

In [161]:
model = DBSCAN(
    eps=0.1,
    min_samples=3,
    metric=get_lev(text),
    n_jobs=8
)

In [162]:
X = np.arange(len(text)).reshape(-1, 1)

In [163]:
pred = model.fit_predict(X)

In [164]:
dunn_index(X, pred, metric=get_lev(text))

100%|██████████| 32/32 [00:00<00:00, 161.50it/s]


2.095238095238095

In [165]:
db_index(X, pred, metric=get_lev(text))

100%|██████████| 32/32 [00:00<00:00, 177.85it/s]


0.11837131159281895

## Euclidean distance

In [181]:
vec = vectorize_file(
    'lines.txt', 2)
dense_stop = vec.todense()

In [182]:
model = DBSCAN(
    eps=0.1,
    min_samples=3,
    metric=sk_dense_euc_dist,
    n_jobs=8
)

In [183]:
pred = model.fit_predict(dense_stop)

In [184]:
dunn_index(dense_stop, pred, sk_dense_euc_dist)

100%|██████████| 22/22 [00:00<00:00, 163.80it/s]


1.0066062474614956

In [185]:
db_index(dense_stop, pred, sk_dense_euc_dist)

100%|██████████| 22/22 [00:00<00:00, 207.25it/s]


0.151374657977153

In [172]:
vec = vectorize_file(
    'lines.txt', 2,
    stop_words=get_popular_stopwords('lines.txt', 20))
dense_stop = vec.todense()

In [171]:
model = DBSCAN(
    eps=0.1,
    min_samples=3,
    metric=sk_dense_euc_dist,
    n_jobs=8
)

In [173]:
pred = model.fit_predict(dense_stop)

In [174]:
dunn_index(dense_stop, pred, sk_dense_euc_dist)

Compilation is falling back to object mode WITH looplifting enabled because Function "sk_dense_euc_dist" failed type inference due to: [1m[1m[1mNo implementation of function Function(<function norm at 0x7f8078123af0>) found for signature:
 
 >>> norm(array(int64, 1d, C))
 
There are 2 candidate implementations:
[1m      - Of which 2 did not match due to:
      Overload in function 'norm_impl': File: numba/np/linalg.py: Line 2352.
        With argument(s): '(array(int64, 1d, C))':[0m
[1m       Rejected as the implementation raised a specific error:
         TypingError: [1mnp.linalg.norm() only supported on float and complex arrays.[0m[0m
  raised from /home/thmtt/.local/lib/python3.9/site-packages/numba/np/linalg.py:897
[0m
[0m[1mDuring: resolving callee type: Function(<function norm at 0x7f8078123af0>)[0m
[0m[1mDuring: typing of call at <ipython-input-11-9f70c32b6d07> (3)
[0m
[1m
File "<ipython-input-11-9f70c32b6d07>", line 3:[0m
[1mdef sk_dense_euc_dist(a, b):
[1m

1.089341235412819

In [175]:
db_index(dense_stop, pred, sk_dense_euc_dist)

100%|██████████| 18/18 [00:00<00:00, 208.29it/s]


0.1397089972674707

In [178]:
lists = from_file('clusters.txt')
c_content, c_pred = transform_to_pred(lists, n=2)

In [179]:
dunn_index(c_content.todense() , c_pred, sk_dense_euc_dist)

100%|██████████| 3355/3355 [18:49<00:00,  2.97it/s]  


0.05053991084215299

In [180]:
db_index(c_content.todense(), c_pred, sk_dense_euc_dist)

100%|██████████| 3355/3355 [19:17<00:00,  2.90it/s]  


1.7749570862371746

# Comparison of clustering predictions

Niektóre komórki z wynikami w tabeli nie są bezpośrednio dostępne w notatniku z powodu ponownego wykorzystywania.

<table>
    <tr>
        <th> Metric (used stopwords)</th>
        <th> Minimum cluster size </th>
        <th> Dunn index (higher = better) </th>
        <th> Dunn index for clusters.txt </th>
        <th> DB index (lower = better) </th>
        <th> DB index for clusters.txt </th>
    </tr>
    <tr>
    <tr>
        <td> Cosine (none) </td>
        <td> 3 </td>
        <td> 0.32064 </td>
        <td> 0.00255 </td>
        <td> 1.64100 </td>
        <td> 3.21566 </td>
    </tr>
    <tr>
        <td> Cosine (none) </td>
        <td> 1 </td>
        <td> 0.31986 </td>
        <td> 0.00255 </td>
        <td> 1.13884 </td>
        <td> 3.21566 </td>
    </tr>
    <tr>
        <td> Cosine (20 most common words) </td>
        <td> 3 </td>
        <td> 0.29035 </td>
        <td> 0.00255 </td>
        <td> 1.61483 </td>
        <td> 3.21566 </td>
    </tr>
    <tr>
        <td> Cosine (20 most common words) </td>
        <td> 1 </td>
        <td> 0.26661 </td>
        <td> 0.00255 </td>
        <td> 0.65242 </td>
        <td> 3.21566 </td>
    </tr>
    <tr>
        <td> Levenshtein (none) </td>
        <td> 3 </td>
        <td> 1.57142 </td>
        <td> 0.05718 </td>
        <td> 0.12362 </td>
        <td> 4.48672 </td>
    <tr>
        <td> Levenshtein (none) </td>
        <td> 1 </td>
        <td> 1.37500 </td>
        <td> 0.05718 </td>
        <td> 0.10787 </td>
        <td> 4.48672 </td>
    </tr>
    <tr>
        <td> Levenshtein (20 most common words) </td>
        <td> 3 </td>
        <td> 2.09523 </td>
        <td> 0.05718 </td>
        <td> 0.11837 </td>
        <td> 4.48672 </td>
    </tr>
    <tr>
        <td> Euclidean (none) </td>
        <td> 3 </td>
        <td> 1.00660 </td>
        <td> 0.05053 </td>
        <td> 0.15137 </td>
        <td> 1.77495 </td>
    </tr>
    <tr>
        <td> Euclidean (20 most common words) </td>
        <td> 3 </td>
        <td> 1.08934 </td>
        <td> 0.05053 </td>
        <td> 0.13970 </td>
        <td> 1.77495 </td>
    </tr>
</table>

Dla każdej klasteryzacji dla obu sposobów oceny klasteryzacji wyniki są lepsze od klasteryzacji zaproponowanej w pliku `clusters.txt`. Niektóre wyżej wymienione klasteryzacje nie klasyfikowały wszystkich adresów (w przypadku n_samples=3 odrzucane były klastry o rozmiarze 1 lub 2 traktowane jako szum)

# Ex 6

Dla podanego pliku teskstowe `lines.txt` poza stoplistą w celu polepszenia klasteryzacji można by było wypróbować:
- _Stemming_ tekstu
- Wykorzystanie _Inverse Document Frequency_ (IDF)
- Oczyszczanie tekstu (np. usunięcie wielokrotnych spacji)
- Usunięcie interpunkcji
- Sprawdzenie innych parametrów DBSCAN
- Sprawdzenie innych technik klasteryzacji