In [47]:
import pandas as pd
import spacy
from time import time
import string
import numpy as np
from collections import defaultdict

from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics

nlp = spacy.load('pt_core_news_md')

stop_words = spacy.lang.pt.stop_words.STOP_WORDS
punctuations = string.punctuation

In [43]:
df = pd.read_excel("..\..\Dataset\dataset.xlsx")
df = df[['Id','Descricao', 'Categoria']]
df = df.dropna()

In [46]:
df.head()

Unnamed: 0,Id,Descricao,Categoria
0,151660523,A três meses liguei para fazer o cancelamento ...,Provedores e serv. de internet
1,151658235,estou a uma semana tentando resolver meu probl...,Provedores e serv. de internet
2,151656879,No início de Agosto/2022 minha mãe solicitou u...,Não encontrei meu problema
3,151664737,"Todo mês estou entrando em contato com Tim, po...",Não encontrei meu problema
4,151662159,Entrei em contato com o SAC da Tim Live para r...,Não encontrei meu problema


In [44]:
labels = df.Categoria
unique_labels, category_sizes = np.unique(labels, return_counts=True)
true_k = unique_labels.shape[0]

print(f"{len(df.Categoria)} documents - {true_k} categories")

25199 documents - 11 categories


In [38]:
evaluations = []
evaluations_std = []

def fit_and_evaluate(km, X, name=None, n_runs=5):
    name = km.__class__.__name__ if name is None else name

    train_times = []
    scores = defaultdict(list)
    for seed in range(n_runs):
        km.set_params(random_state=seed)
        t0 = time()
        km.fit(X)
        train_times.append(time() - t0)
        scores["Homogeneity"].append(metrics.homogeneity_score(labels, km.labels_))
        scores["Completeness"].append(metrics.completeness_score(labels, km.labels_))
        scores["V-measure"].append(metrics.v_measure_score(labels, km.labels_))
        scores["Adjusted Rand-Index"].append(
            metrics.adjusted_rand_score(labels, km.labels_)
        )
        scores["Silhouette Coefficient"].append(
            metrics.silhouette_score(X, km.labels_, sample_size=2000)
        )
    train_times = np.asarray(train_times)

    print(f"clustering done in {train_times.mean():.2f} ± {train_times.std():.2f} s ")
    evaluation = {
        "estimator": name,
        "train_time": train_times.mean(),
    }
    evaluation_std = {
        "estimator": name,
        "train_time": train_times.std(),
    }
    for score_name, score_values in scores.items():
        mean_score, std_score = np.mean(score_values), np.std(score_values)
        print(f"{score_name}: {mean_score:.3f} ± {std_score:.3f}")
        evaluation[score_name] = mean_score
        evaluation_std[score_name] = std_score
    evaluations.append(evaluation)
    evaluations_std.append(evaluation_std)

In [40]:
vectorizer = TfidfVectorizer(
    max_df=0.5,
    min_df=5,
    stop_words="english",
)
t0 = time()
X_tfidf = vectorizer.fit_transform(df.Descricao)

print(f"vectorization done in {time() - t0:.3f} s")
print(f"n_samples: {X_tfidf.shape[0]}, n_features: {X_tfidf.shape[1]}")

vectorization done in 2.087 s
n_samples: 25199, n_features: 11432


In [45]:
for seed in range(5):
    kmeans = KMeans(
        n_clusters=true_k,
        max_iter=100,
        n_init=1,
        random_state=seed,
    ).fit(X_tfidf)
    cluster_ids, cluster_sizes = np.unique(kmeans.labels_, return_counts=True)
    print(f"Number of elements asigned to each cluster: {cluster_sizes}")
print()
print(
    "True number of documents in each category according to the class labels: "
    f"{category_sizes}"
)

Number of elements asigned to each cluster: [1623 4969 1639 2323 1078 1896 1154 4026 2237 1937 2317]
Number of elements asigned to each cluster: [ 897 1710 5439 2383 2664 1596 2134 1868 3765 1072 1671]
Number of elements asigned to each cluster: [2373 1991  871 3547 2204 1068 5436 1657 2671 1595 1786]
Number of elements asigned to each cluster: [ 650 2112 3469  907 2289 2557 2047 5641 1894 1078 2555]
Number of elements asigned to each cluster: [4324  511 1999 1019 3331 3239 2272 4210 1092 1007 2195]

True number of documents in each category according to the class labels: [    5     2     1     1  2336  1580   897 20278     1    27    71]
