# **Vectorizing and Clustering**

imports

In [31]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sentence_transformers import SentenceTransformer
import sys
import os
import json
import hdbscan
import umap
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
# Go up one directory level so that the `app` package can be imported
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))
from app.utils.base_dir import BASE_DIR


## **1. Vectorizing**

Open the processed JSON file and load the label and text of each document

In [6]:


# Location of the preprocessed corpus, relative to the project's BASE_DIR.
path = BASE_DIR/'data'/'processed'/'processedData.json'
# Decode explicitly as UTF-8 so the result does not depend on the platform's default encoding.
with open(path, 'r', encoding='utf-8') as f:
    docs = json.load(f)

# Presumably every entry is a single-item {label: text} dict (TODO confirm);
# only the first key / first value of each entry is used.
label = [list(d.keys())[0] for d in docs]
text = [list(d.values())[0] for d in docs]


Check whether any document text is `None`, is not a string, or is empty

In [13]:
# Sanity-check the corpus: report every entry that is None, not a string, or blank.
for i, doc in enumerate(text):
    if isinstance(doc, str):
        # A string made only of whitespace counts as empty.
        if not doc.strip():
            print(f"Documento {i} está vacío")
    elif doc is None:
        print(f"Documento {i} es None")
    else:
        print(f"Documento {i} no es str: {type(doc)} -> {repr(doc)}")


### **1.1 Vectorizing the text with TF_IDF**

In [8]:

def TF_IDF_vectorizer(text_corpus: list, max_df: float = 0.95, min_df: float = 0.001):
    """Fit a TF-IDF model on ``text_corpus`` and return the document-term matrix.

    Parameters:
        text_corpus (list): raw document strings, one entry per document.
        max_df (float): forwarded to ``TfidfVectorizer``; terms that appear in more than
            this fraction of the documents are dropped (default 0.95, i.e. 95%).
        min_df (float): forwarded to ``TfidfVectorizer``; terms that appear in fewer than
            this fraction of the documents are dropped (default 0.001, i.e. 0.1%).

    Returns:
        The matrix produced by ``TfidfVectorizer.fit_transform`` (one row per document).
        The fitted vectorizer itself is not returned, so its vocabulary is not
        available to the caller.
    """
    vectorizer = TfidfVectorizer(
        # max_features=10000,  # optional cap on the vocabulary size of the whole corpus
        max_df=max_df,
        min_df=min_df,
    )
    return vectorizer.fit_transform(text_corpus)


TF_vector = TF_IDF_vectorizer(text)
print(TF_vector)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 560383 stored elements and shape (515, 52845)>
  Coords	Values
  (0, 30546)	0.5657180383997671
  (0, 5217)	0.15508698355357342
  (0, 44460)	0.0731910782856355
  (0, 29742)	0.056013167698216036
  (0, 48885)	0.10739734878561877
  (0, 42842)	0.010058323918841284
  (0, 52529)	0.004704168690331458
  (0, 21050)	0.018871928000092204
  (0, 41825)	0.040640853229877656
  (0, 40659)	0.014370483112177498
  (0, 37668)	0.05783576541411966
  (0, 5189)	0.04358256604399028
  (0, 40315)	0.035167035236809065
  (0, 46457)	0.011437977304705556
  (0, 11542)	0.10222941657828856
  (0, 17280)	0.19969086007933803
  (0, 5291)	0.06121310247623815
  (0, 2786)	0.02034873724421131
  (0, 37389)	0.011060324352179556
  (0, 44701)	0.011341476901808005
  (0, 37706)	0.016853572682855852
  (0, 34391)	0.014322489344269662
  (0, 37255)	0.01748412219632026
  (0, 34028)	0.015742075714384
  (0, 42500)	0.034175679562888896
  :	:
  (514, 63)	0.014010277275747756
  (514

### **1.2 Vectorizing the text with SciBERT**

In [15]:
def SciBERT_vectorizer(text_corpus: list):
    """Embed every document of ``text_corpus`` with the SciBERT checkpoint.

    The checkpoint is loaded through ``SentenceTransformer`` on each call. The shape of
    the resulting array is printed before the array is returned.
    """
    model = SentenceTransformer("allenai/scibert_scivocab_uncased")
    doc_vectors = model.encode(
        text_corpus,
        convert_to_numpy=True,
        show_progress_bar=True,
    )
    print(doc_vectors.shape)
    return doc_vectors


sciBERT_vector = SciBERT_vectorizer(text)

No sentence-transformers model found with name allenai/scibert_scivocab_uncased. Creating a new one with mean pooling.
Batches: 100%|██████████| 17/17 [06:36<00:00, 23.30s/it]

(515, 768)





Reduce the dimensionality with UMAP

In [18]:
# Project the SciBERT embeddings onto 50 UMAP components, with a fixed random_state.
reducer = umap.UMAP(
    random_state=42,
    n_components=50,
)
sciBERT_vector_reduced = reducer.fit_transform(sciBERT_vector)
print(sciBERT_vector_reduced.shape)

  warn(


(515, 50)


## **2. Clustering**

### **HDBSCAN**
Stricter than k-means:
- The documents need to be very close to each other

In [24]:
def cluster_hdbscan(TF_vector, min_cluster_size: int = 3):
    """Cluster document vectors with HDBSCAN and return one label per document.

    Parameters:
        TF_vector: feature matrix with one row per document. Despite the name, it does
            not have to be a TF-IDF matrix; this notebook passes the UMAP-reduced
            SciBERT embeddings.
        min_cluster_size (int): forwarded to ``hdbscan.HDBSCAN`` as the minimum size
            of a cluster (default 3).

    Returns:
        Array of cluster ids in document order, e.g. [0, 1, 5, 0, 1, -1]. The rest of
        the notebook treats -1 as "not assigned to any cluster" (noise).
    """
    clusterer = hdbscan.HDBSCAN(min_cluster_size=min_cluster_size)
    return clusterer.fit_predict(TF_vector)


cluster_hdbscan_result = cluster_hdbscan(sciBERT_vector_reduced)
print(cluster_hdbscan_result)


[24 -1 13 13 20 28 -1 20 41 -1 25 35 35 22 35 22 -1  4 14 14 37 10 14 -1
 -1  2 -1 37 20 11 28 22 -1 34 22 23 35 35 35 35  4 -1 27 41 31 26 40 -1
 -1 12 12 12 35 35 35 35 35 35 35 -1  4 -1 -1 -1 17 25 13 14 10 36 -1  2
 36  1  1  1  1  2  2  1 -1 11 11 -1 -1 37 20 -1 -1 -1 20 20 -1  0  6 -1
 39 39 39 39 14 30 39 30 12 12 40 18 18  4 18 18 18 18 -1 22 22 34 34 34
 -1 -1 34 -1 34 -1  7 -1 -1 11 16 -1 13  4 -1 11  5 -1 -1 22 22 22 22 35
 35 34 35 14 -1 -1 38 32 -1 -1 28 24 29 35  3 35  3 11  3  3  3  3 14 -1
 -1 -1  5 19 16 -1 41 -1 41 39 11 27 27 22 33 -1 35 -1 -1 16 -1 -1 -1  0
 -1 13 -1  6 -1  5  6 25 -1 -1 -1 -1 -1 13  8  8 -1 -1 33 33 33 34 33 21
 25 25 25 17 -1 16 -1 35 35 -1 34 34 -1 30 30 30 -1 29 37 30 12 20 -1  5
 37 20 -1 37  7 37 37 36 20 37 20 37 -1 26 20  4 -1 -1 34 -1 22 -1 22 35
 23 23 23 23 14  7 22 17 25 25 17 35 22  2  0  5 -1 10 -1 35 11  8 10  2
 11 -1 -1 13 11 -1 11 -1 -1 20 -1 20 34 34 35 19 -1  8 34 32 34 34 -1 14
 37 34 21 -1 -1  4 36 14 -1 26 14 29 -1 29 35 35 33



Quality of clustering

In [26]:
# Number of distinct cluster ids, leaving out the noise label (-1).
num_clusters = len(set(cluster_hdbscan_result) - {-1})
# Fraction of the documents that were labelled as noise.
noise_mask = cluster_hdbscan_result == -1
noise_ratio = np.sum(noise_mask) / len(cluster_hdbscan_result)

print(f"total clusters : {num_clusters}")
print(f"noise: {noise_ratio:.2%}")


total clusters : 42
noise: 26.21%


Cluster distribution

In [38]:
def cluster_distribution(cluster_result):
    """Print and return how the clustered documents are spread across clusters.

    Documents labelled -1 (noise) are left out before counting.

    Parameters:
        cluster_result (array-like): one cluster id per document.

    Returns:
        tuple: ``(conteo, proportions, desviacion)`` where ``conteo`` is a Series of
        document counts indexed by cluster id, ``proportions`` is each count divided
        by the number of non-noise documents, and ``desviacion`` is the standard
        deviation of those proportions (NaN when there are fewer than two clusters).
    """
    # Coerce first: on a plain list `cluster_result != -1` is a single bool, not a mask,
    # so the filter below would silently pick one element instead of dropping the noise.
    cluster_result = np.asarray(cluster_result)
    labels_validos = cluster_result[cluster_result != -1]

    conteo = pd.Series(labels_validos).value_counts().sort_index()
    print("Documents per cluster:")
    print(conteo)

    # Share of the non-noise documents that falls in each cluster
    proportions = conteo / conteo.sum()
    print("Clusters proportion:")
    print(proportions)

    # Standard deviation of the proportions (pandas default, i.e. sample std)
    desviacion = proportions.std()
    print(f"\nDesviación estándar de las proportions: {desviacion:.4f}")

    return conteo, proportions, desviacion
# cluster_distribution already prints its report; keep the returned values for later use
# instead of printing the returned tuple (which would show the same data a second time).
hdbscan_counts, hdbscan_proportions, hdbscan_std = cluster_distribution(cluster_hdbscan_result)


Documents per cluster:
0      3
1      6
2     11
3      6
4     11
5     26
6     11
7      6
8     10
9      5
10     9
11    12
12     7
13     8
14    16
15    10
16     7
17     5
18     6
19     3
20    14
21    10
22    16
23     5
24     5
25     8
26     4
27     3
28     8
29     5
30     8
31     5
32     3
33     6
34    20
35    32
36     5
37    11
38     5
39     6
40    15
41     8
Name: count, dtype: int64
Clusters proportion:
0     0.007895
1     0.015789
2     0.028947
3     0.015789
4     0.028947
5     0.068421
6     0.028947
7     0.015789
8     0.026316
9     0.013158
10    0.023684
11    0.031579
12    0.018421
13    0.021053
14    0.042105
15    0.026316
16    0.018421
17    0.013158
18    0.015789
19    0.007895
20    0.036842
21    0.026316
22    0.042105
23    0.013158
24    0.013158
25    0.021053
26    0.010526
27    0.007895
28    0.021053
29    0.013158
30    0.021053
31    0.013158
32    0.007895
33    0.015789
34    0.052632
35    0.084211
36    0.0131

### **K-MEANS**
Only for the documents that HDBSCAN left unclustered (label -1)

In [40]:
def recluster_noise(cluster_result, embeddings, k_new, random_state=42):
    """
    Regroup the documents labelled -1 (left unclustered) with K-Means.

    Parameters:
        cluster_result (array-like): labels from the previous clustering (HDBSCAN);
            -1 marks a noise document.
        embeddings (array-like): vectors of all documents, one row per entry of
            ``cluster_result``.
        k_new (int): number of K-Means clusters to build from the noise documents;
            capped at the number of noise documents.
        random_state: seed forwarded to ``KMeans``.

    Returns:
        labels_final (ndarray): copy of the labels in which every noise document
        carries a new cluster id, numbered right after the largest existing id.
        The input is never modified. When there is no noise, a copy of the input
        labels is returned unchanged.
    """
    cluster_result = np.array(cluster_result)
    # Coerce like the labels, so that list inputs accept the boolean mask below.
    embeddings = np.asarray(embeddings)
    mask_noise = cluster_result == -1
    emb_noise = embeddings[mask_noise]

    if len(emb_noise) == 0:
        print("No noise vectors")
        return cluster_result

    # K-Means only over the -1 docs; it cannot build more clusters than it has samples.
    n_clusters = min(k_new, len(emb_noise))
    kmeans_new = KMeans(n_clusters=n_clusters, random_state=random_state)
    labels_noise_new = kmeans_new.fit_predict(emb_noise)

    # Offset the new ids so that they start right after the largest existing cluster id.
    labels_validos = cluster_result[~mask_noise]
    max_label = int(labels_validos.max()) if labels_validos.size else -1
    labels_final = cluster_result.copy()
    labels_final[mask_noise] = labels_noise_new + max_label + 1

    return labels_final

# Re-cluster the HDBSCAN noise documents into 11 K-Means groups and show the merged labels.
kmeans_result = recluster_noise(
    cluster_hdbscan_result,
    sciBERT_vector_reduced,
    k_new=11,
    random_state=42,
)
print(kmeans_result)

[24 45 13 13 20 28 51 20 41 45 25 35 35 22 35 22 42  4 14 14 37 10 14 42
 42  2 42 37 20 11 28 22 48 34 22 23 35 35 35 35  4 48 27 41 31 26 40 50
 51 12 12 12 35 35 35 35 35 35 35 44  4 51 52 42 17 25 13 14 10 36 49  2
 36  1  1  1  1  2  2  1 50 11 11 52 52 37 20 52 52 42 20 20 52  0  6 47
 39 39 39 39 14 30 39 30 12 12 40 18 18  4 18 18 18 18 46 22 22 34 34 34
 43 48 34 43 34 50  7 44 44 11 16 49 13  4 42 11  5 47 47 22 22 22 22 35
 35 34 35 14 52 45 38 32 49 51 28 24 29 35  3 35  3 11  3  3  3  3 14 44
 44 52  5 19 16 49 41 50 41 39 11 27 27 22 33 48 35 43 48 16 45 42 42  0
 42 13 49  6 46  5  6 25 46 45 49 50 45 13  8  8 48 52 33 33 33 34 33 21
 25 25 25 17 47 16 43 35 35 52 34 34 51 30 30 30 51 29 37 30 12 20 52  5
 37 20 42 37  7 37 37 36 20 37 20 37 42 26 20  4 43 43 34 48 22 48 22 35
 23 23 23 23 14  7 22 17 25 25 17 35 22  2  0  5 44 10 46 35 11  8 10  2
 11 44 42 13 11 44 11 44 44 20 46 20 34 34 35 19 48  8 34 32 34 34 42 14
 37 34 21 42 52  4 36 14 42 26 14 29 42 29 35 35 33

## **Final results**

In [41]:
# cluster_distribution prints its own report; store the returned values rather than
# printing the returned tuple as well.
final_counts, final_proportions, final_std = cluster_distribution(kmeans_result)


Documents per cluster:
0      3
1      6
2     11
3      6
4     11
5     26
6     11
7      6
8     10
9      5
10     9
11    12
12     7
13     8
14    16
15    10
16     7
17     5
18     6
19     3
20    14
21    10
22    16
23     5
24     5
25     8
26     4
27     3
28     8
29     5
30     8
31     5
32     3
33     6
34    20
35    32
36     5
37    11
38     5
39     6
40    15
41     8
42    23
43    10
44    15
45     8
46     9
47    15
48    12
49    13
50     9
51     8
52    13
Name: count, dtype: int64
Clusters proportion:
0     0.005825
1     0.011650
2     0.021359
3     0.011650
4     0.021359
5     0.050485
6     0.021359
7     0.011650
8     0.019417
9     0.009709
10    0.017476
11    0.023301
12    0.013592
13    0.015534
14    0.031068
15    0.019417
16    0.013592
17    0.009709
18    0.011650
19    0.005825
20    0.027184
21    0.019417
22    0.031068
23    0.009709
24    0.009709
25    0.015534
26    0.007767
27    0.005825
28    0.015534
29    0.009709
30 