In [1]:
from sklearn.datasets import fetch_20newsgroups_vectorized
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import Normalizer
from sklearn.cluster import KMeans
from sklearn import metrics
from sklearn.metrics import *
from plotnine import *
import pandas as pd
import numpy as np

#Les données sont déjà vectorisé (TF-IDF)
data = fetch_20newsgroups_vectorized(subset='all', remove=('headers', 'footers', 'quotes'))
X = data.data
Y = data.target

#print(data.DESCR)

target_names = data.target_names

In [2]:
svd = TruncatedSVD(n_components=2, random_state=42)
X_2d = svd.fit_transform(X)

# Visualisation
df = pd.DataFrame({
    "Dim1": X_2d[:, 0],
    "Dim2": X_2d[:, 1],
    "Category": [target_names[i] for i in Y]  # noms de catégories pour la couleur
})

plot = (
    ggplot(df.sample(5000))  # échantillonner 5000 points pour alléger le graphique
    + aes(x='Dim1', y='Dim2', color='Category')
    + geom_point(size=1.1, alpha=0.7)
    + theme_minimal()
    + ggtitle("Projection 2D du corpus 20 Newsgroups (TF-IDF + SVD)")
    + scale_color_brewer(type='qual', palette='Set3')
    + theme(figure_size=(10, 8))
)

plot.save("img/20newsgroups.png")



# Kmeans

In [3]:
#Réduction de dimension (car corpus trop volumineux) :
svd = TruncatedSVD(n_components=2, random_state=42)
X_2d = svd.fit_transform(X)

In [4]:
kmeans=KMeans(n_clusters=20, random_state=42) #1 cluster par classe, donc 20 clusters.
clusters = kmeans.fit_predict(X_2d)

In [5]:
#Visualisation avec plotnine :

df = pd.DataFrame({
    "Dim1": X_2d[:, 0],
    "Dim2": X_2d[:, 1],
    "Cluster": clusters.astype(str)  # en string pour une meilleure coloration
})

plot = (
    ggplot(df, aes(x='Dim1', y='Dim2', color='Cluster'))
    + geom_point(size=1.2, alpha=0.7)
    + theme_minimal()
    + ggtitle("Clustering du corpus 20 Newsgroups (K-Means + SVD)")
    + scale_color_brewer(type='qual', palette='Set3')
    + theme(figure_size=(10, 8))
)
centers = pd.DataFrame(kmeans.cluster_centers_, columns=['Dim1', 'Dim2'])
plot += geom_point(data=centers, mapping=aes(x='Dim1', y='Dim2'),
                   color='black', size=5, shape='x')

In [6]:
plot.save("img/kmeans.png")



In [7]:
labels_true = data.target
labels_pred = clusters

print(f"Homogeneity: {metrics.homogeneity_score(labels_true, labels_pred):.3f}")
print(f"Completeness: {metrics.completeness_score(labels_true, labels_pred):.3f}")
print(f"V-measure: {metrics.v_measure_score(labels_true, labels_pred):.3f}")
print(f"Adjusted Rand Index: {metrics.adjusted_rand_score(labels_true, labels_pred):.3f}")
print(f"Adjusted Mutual Information: {metrics.adjusted_mutual_info_score(labels_true, labels_pred):.3f}")

# Silhouette : ici on utilise X_100d normalisé pour que la distance soit plus stable
X_norm = Normalizer().fit_transform(X_2d)
if len(set(labels_pred)) > 1:
    sil = metrics.silhouette_score(X_norm, labels_pred)
    print(f"Silhouette Coefficient: {sil:.3f}")
else:
    print("Silhouette Coefficient: impossible (1 cluster ou tout bruit)")

Homogeneity: 0.030
Completeness: 0.031
V-measure: 0.030
Adjusted Rand Index: 0.009
Adjusted Mutual Information: 0.027
Silhouette Coefficient: -0.182


# DBSCAN

In [8]:
#Réduction de dimension (car corpus trop volumineux) :
svd = TruncatedSVD(n_components=90, random_state=42)
X_90d = svd.fit_transform(X)

In [9]:
from sklearn.cluster import DBSCAN

db = DBSCAN(eps=0.3, min_samples=10)
clusters = db.fit_predict(X_90d)  # labels prédits

n_clusters_ = len(set(clusters)) - (1 if -1 in clusters else 0)
n_noise_ = list(clusters).count(-1)

print("Estimated number of clusters: %d" % n_clusters_)
print("Estimated number of noise points: %d" % n_noise_)

Estimated number of clusters: 4
Estimated number of noise points: 17202


In [10]:
labels_true = data.target

# Filtrer les bruit (-1) pour silhouette
mask = clusters != -1
X_silhouette = X_90d[mask]
labels_silhouette = clusters[mask]

print(f"Homogeneity: {metrics.homogeneity_score(labels_true, clusters):.3f}")
print(f"Completeness: {metrics.completeness_score(labels_true, clusters):.3f}")
print(f"V-measure: {metrics.v_measure_score(labels_true, clusters):.3f}")
print(f"Adjusted Rand Index: {metrics.adjusted_rand_score(labels_true, clusters):.3f}")
print(
    "Adjusted Mutual Information:"
    f" {metrics.adjusted_mutual_info_score(labels_true, clusters):.3f}"
)

if len(set(labels_silhouette)) > 1:
    sil_score = metrics.silhouette_score(X_silhouette, labels_silhouette)
    print(f"Silhouette Coefficient: {sil_score:.3f}")
else:
    print("Silhouette Coefficient: impossible (1 cluster ou tout bruit)")

Homogeneity: 0.008
Completeness: 0.067
V-measure: 0.015
Adjusted Rand Index: 0.001
Adjusted Mutual Information: 0.013
Silhouette Coefficient: 0.484


In [11]:
#Visualisation avec plotnine :

df = pd.DataFrame({
    "Dim1": X_90d[:, 0],
    "Dim2": X_90d[:, 1],
    "Cluster": clusters.astype(str)  # en string pour une meilleure coloration
})

plot = (
    ggplot(df, aes(x='Dim1', y='Dim2', color='Cluster'))
    + geom_point(size=1.2, alpha=0.7)
    + theme_minimal()
    + ggtitle("Clustering du corpus 20 Newsgroups (DBSCAN + SVD)")
    + scale_color_brewer(type='qual', palette='Set3')
    + theme(figure_size=(10, 8))
)
centers = pd.DataFrame(kmeans.cluster_centers_, columns=['Dim1', 'Dim2'])
plot += geom_point(data=centers, mapping=aes(x='Dim1', y='Dim2'),
                   color='black', size=5, shape='x')
plot.save("img/dbscan.png")



In [12]:
n_clusters = len(set(clusters)) - (1 if -1 in clusters else 0)
n_noise = list(clusters).count(-1)
print(f"Clusters trouvés : {n_clusters}")
print(f"Bruit (points -1) : {n_noise}")

Clusters trouvés : 4
Bruit (points -1) : 17202


# HDBSCAN

In [13]:
#Réduction de dimension (car corpus trop volumineux) :
svd = TruncatedSVD(n_components=90, random_state=42)
X_90d = svd.fit_transform(X)

In [14]:
from sklearn.cluster import HDBSCAN

hdb = HDBSCAN(min_cluster_size=20, min_samples=10)
hdb.fit(X_90d)

0,1,2
,min_cluster_size,20
,min_samples,10
,cluster_selection_epsilon,0.0
,max_cluster_size,
,metric,'euclidean'
,metric_params,
,alpha,1.0
,algorithm,'auto'
,leaf_size,40
,n_jobs,


In [15]:
clusters = hdb.fit_predict(X_90d)
labels_hdb = clusters

In [16]:
#Visualisation avec plotnine :

df = pd.DataFrame({
    "Dim1": X_90d[:, 0],
    "Dim2": X_90d[:, 1],
    "Cluster": labels_hdb.astype(str)
})

plot = (
    ggplot(df, aes(x='Dim1', y='Dim2', color='Cluster'))
    + geom_point(size=1.2, alpha=0.7)
    + theme_minimal()
    + ggtitle("Clustering du corpus 20 Newsgroups (HDBSCAN + SVD)")
    + scale_color_brewer(type='qual', palette='Set3')
    + theme(figure_size=(10, 8))
)
centers = pd.DataFrame(kmeans.cluster_centers_, columns=['Dim1', 'Dim2'])
plot += geom_point(data=centers, mapping=aes(x='Dim1', y='Dim2'),
                   color='black', size=5, shape='x')
plot.save("img/hdbscan.png")



In [17]:
labels_true = data.target
labels_pred = labels_hdb

print(f"Homogeneity: {metrics.homogeneity_score(labels_true, labels_pred):.3f}")
print(f"Completeness: {metrics.completeness_score(labels_true, labels_pred):.3f}")
print(f"V-measure: {metrics.v_measure_score(labels_true, labels_pred):.3f}")
print(f"Adjusted Rand Index: {metrics.adjusted_rand_score(labels_true, labels_pred):.3f}")
print(f"Adjusted Mutual Information: {metrics.adjusted_mutual_info_score(labels_true, labels_pred):.3f}")

# Silhouette
X_norm = Normalizer().fit_transform(X_2d)
if len(set(labels_pred)) > 1:
    sil = metrics.silhouette_score(X_norm, labels_pred)
    print(f"Silhouette Coefficient: {sil:.3f}")
else:
    print("Silhouette Coefficient: impossible (1 cluster)")

Homogeneity: 0.007
Completeness: 0.060
V-measure: 0.012
Adjusted Rand Index: 0.000
Adjusted Mutual Information: 0.012
Silhouette Coefficient: -0.074


# BayesianGaussianMixture

In [18]:
#Réduction de dimension (car corpus trop volumineux) :
svd = TruncatedSVD(n_components=2, random_state=42)
X_2d = svd.fit_transform(X)

In [19]:
from sklearn.mixture import BayesianGaussianMixture

X_norm = Normalizer().fit_transform(X_2d)

bgm = BayesianGaussianMixture(
    n_components=22,       # un peu plus que le nombre réel de catégories
    covariance_type='full',
    init_params='kmeans',  # plus stable
    max_iter=1000,
    random_state=42
)
bgm.fit(X_norm)

0,1,2
,n_components,22
,covariance_type,'full'
,tol,0.001
,reg_covar,1e-06
,max_iter,1000
,n_init,1
,init_params,'kmeans'
,weight_concentration_prior_type,'dirichlet_process'
,weight_concentration_prior,
,mean_precision_prior,


In [20]:
# Moyennes des clusters
print("Means of clusters:")
print(bgm.means_)

# Prédiction des clusters
labels_bgm = bgm.predict(X_norm)
print(f"Clusters prédits : {len(set(labels_bgm))}")

Means of clusters:
[[ 9.06610199e-01 -9.33749347e-02]
 [ 9.96016028e-01  6.72071540e-02]
 [ 1.68183111e-03 -1.73184807e-04]
 [ 7.55456250e-01  6.43106553e-01]
 [ 9.52651858e-01 -2.97873121e-01]
 [ 9.91786894e-01 -1.14715619e-01]
 [ 9.06610150e-01 -9.33749211e-02]
 [ 9.06610137e-01 -9.33749175e-02]
 [ 4.88865492e-01 -8.63265410e-01]
 [ 8.87398725e-01  4.54763205e-01]
 [ 9.06610095e-01 -9.33749062e-02]
 [ 8.67668964e-01 -4.91804593e-01]
 [ 9.06610060e-01 -9.33748963e-02]
 [ 9.06610039e-01 -9.33748906e-02]
 [ 9.06610018e-01 -9.33748849e-02]
 [ 5.06390033e-01  6.63682170e-01]
 [ 9.06609977e-01 -9.33748735e-02]
 [ 9.62584031e-01  2.62563264e-01]
 [ 7.04835733e-01 -7.03543081e-01]
 [ 9.06506967e-01 -9.33466107e-02]
 [ 9.06506967e-01 -9.33466107e-02]
 [ 9.06506967e-01 -9.33466107e-02]]
Clusters prédits : 11


In [21]:
#Visualisation avec plotnine :

df = pd.DataFrame({
    "Dim1": X_2d[:, 0],
    "Dim2": X_2d[:, 1],
    "Cluster": labels_bgm.astype(str)  # en string pour une meilleure coloration
})

plot = (
    ggplot(df, aes(x='Dim1', y='Dim2', color='Cluster'))
    + geom_point(size=1.2, alpha=0.7)
    + theme_minimal()
    + ggtitle("Clustering du corpus 20 Newsgroups (BMG + SVD)")
    + scale_color_brewer(type='qual', palette='Set3')
    + theme(figure_size=(10, 8))
)
centers = pd.DataFrame(kmeans.cluster_centers_, columns=['Dim1', 'Dim2'])
plot += geom_point(data=centers, mapping=aes(x='Dim1', y='Dim2'),
                   color='black', size=5, shape='x')
plot.save("img/bmg.png")



In [22]:
labels_true = data.target
labels_pred = labels_bgm

print(f"Homogeneity: {metrics.homogeneity_score(labels_true, labels_pred):.3f}")
print(f"Completeness: {metrics.completeness_score(labels_true, labels_pred):.3f}")
print(f"V-measure: {metrics.v_measure_score(labels_true, labels_pred):.3f}")
print(f"Adjusted Rand Index: {metrics.adjusted_rand_score(labels_true, labels_pred):.3f}")
print(f"Adjusted Mutual Information: {metrics.adjusted_mutual_info_score(labels_true, labels_pred):.3f}")

# Silhouette
X_norm = Normalizer().fit_transform(X_2d)
if len(set(labels_pred)) > 1:
    sil = metrics.silhouette_score(X_norm, labels_pred)
    print(f"Silhouette Coefficient: {sil:.3f}")
else:
    print("Silhouette Coefficient: impossible (1 cluster)")

Homogeneity: 0.012
Completeness: 0.018
V-measure: 0.014
Adjusted Rand Index: 0.004
Adjusted Mutual Information: 0.012
Silhouette Coefficient: 0.524


# BIRCH

In [23]:
#Réduction de dimension (car corpus trop volumineux) :
svd = TruncatedSVD(n_components=500, random_state=42)
X_500d = svd.fit_transform(X)

In [24]:
from sklearn.cluster import Birch
brc = Birch(n_clusters=20)
brc.fit(X_500d)

0,1,2
,threshold,0.5
,branching_factor,50
,n_clusters,20
,compute_labels,True
,copy,'deprecated'


In [25]:
clusters = brc.predict(X_500d)
clusters

array([15, 17,  5, ...,  5, 10, 10], shape=(18846,))

In [26]:
#Visualisation avec plotnine :
df = pd.DataFrame({
    "Dim1": X_500d[:, 0],
    "Dim2": X_500d[:, 1],
    "Cluster": clusters.astype(str)  # en string pour une meilleure coloration
})

plot = (
    ggplot(df, aes(x='Dim1', y='Dim2', color='Cluster'))
    + geom_point(size=1.2, alpha=0.7)
    + theme_minimal()
    + ggtitle("Clustering du corpus 20 Newsgroups (BIRCH + SVD)")
    + scale_color_brewer(type='qual', palette='Set3')
    + theme(figure_size=(10, 8))
)
centers = pd.DataFrame(kmeans.cluster_centers_, columns=['Dim1', 'Dim2'])
plot += geom_point(data=centers, mapping=aes(x='Dim1', y='Dim2'),
                   color='black', size=5, shape='x')
plot.save("img/birch.png")



In [27]:
labels_true = data.target
labels_pred = clusters

print(f"Homogeneity: {metrics.homogeneity_score(labels_true, labels_pred):.3f}")
print(f"Completeness: {metrics.completeness_score(labels_true, labels_pred):.3f}")
print(f"V-measure: {metrics.v_measure_score(labels_true, labels_pred):.3f}")
print(f"Adjusted Rand Index: {metrics.adjusted_rand_score(labels_true, labels_pred):.3f}")
print(f"Adjusted Mutual Information: {metrics.adjusted_mutual_info_score(labels_true, labels_pred):.3f}")

# Silhouette : ici on utilise X_500d normalisé pour que la distance soit plus stable
X_norm = Normalizer().fit_transform(X_500d)
if len(set(labels_pred)) > 1:
    sil = metrics.silhouette_score(X_norm, labels_pred)
    print(f"Silhouette Coefficient: {sil:.3f}")
else:
    print("Silhouette Coefficient: impossible (1 cluster ou tout bruit)")

Homogeneity: 0.051
Completeness: 0.055
V-measure: 0.053
Adjusted Rand Index: 0.016
Adjusted Mutual Information: 0.050
Silhouette Coefficient: -0.026
