In [1]:
import numpy as np
# The make_blobs function is used to generate synthetic datasets for clustering.
from sklearn.datasets import make_blobs

In [2]:
# Inspect make_blobs (a function in sklearn.datasets) with IPython's "??" suffix.
# "??" prints the signature together with the source code; a single "?" would
# show only the signature and docstring.
make_blobs??



[31mSignature:[39m
make_blobs(
    n_samples=[32m100[39m,
    n_features=[32m2[39m,
    *,
    centers=[38;5;28;01mNone[39;00m,
    cluster_std=[32m1.0[39m,
    center_box=(-[32m10.0[39m, [32m10.0[39m),
    shuffle=[38;5;28;01mTrue[39;00m,
    random_state=[38;5;28;01mNone[39;00m,
    return_centers=[38;5;28;01mFalse[39;00m,
)
[31mSource:[39m   
@validate_params(
    {
        [33m"n_samples"[39m: [Interval(Integral, [32m1[39m, [38;5;28;01mNone[39;00m, closed=[33m"left"[39m), [33m"array-like"[39m],
        [33m"n_features"[39m: [Interval(Integral, [32m1[39m, [38;5;28;01mNone[39;00m, closed=[33m"left"[39m)],
        [33m"centers"[39m: [Interval(Integral, [32m1[39m, [38;5;28;01mNone[39;00m, closed=[33m"left"[39m), [33m"array-like"[39m, [38;5;28;01mNone[39;00m],
        [33m"cluster_std"[39m: [Interval(Real, [32m0[39m, [38;5;28;01mNone[39;00m, closed=[33m"left"[39m), [33m"array-like"[39m],
        [33m"center_box"[39m: [tuple

In [3]:
# Generate a synthetic clustering dataset: 100 samples drawn around 3 centers.
# random_state is fixed so that a fresh "Restart Kernel -> Run All" produces the
# same points and labels every time; without it each run draws a different dataset.
X_blobs, y_blobs_true = make_blobs(n_samples=100, centers=3, random_state=42)
# X_blobs has shape (n_samples, n_features). n_features defaults to 2,
# so the expected shape is (100, 2).
print(f"Shape of the dataset: {X_blobs.shape}")
# y_blobs_true holds the generating-center label of each sample,
# so its expected shape is (100,).
print(f"Shape of the true labels: {y_blobs_true.shape}")
# The first 5 samples of the dataset
print(X_blobs[:5])
# The first 5 true labels
print(y_blobs_true[:5])

Shape of the dataset: (100, 2)
Shape of the true labels: (100,)
[[-7.89103028  0.66614939]
 [ 0.48859637  6.50288819]
 [ 2.95643446  7.51274205]
 [-8.95904239  0.73214769]
 [ 5.93701467  5.39770259]]
[2 1 1 2 0]


In [4]:
import sklearn as sk
# Inspect the sklearn.cluster subpackage with IPython's "??", which prints the
# source of the module (its __init__.py) rather than just a docstring.
# NOTE(review): reaching sk.cluster after a bare `import sklearn` relies on the
# submodule being resolvable as an attribute — confirm on the scikit-learn
# version in use, or import sklearn.cluster explicitly.
sk.cluster??
# The printed source lists the clustering algorithms the subpackage exports
# (e.g. AffinityPropagation, AgglomerativeClustering, Birch).
# This notebook uses KMeans.

[31mType:[39m        module
[31mString form:[39m <module 'sklearn.cluster' from '/home/tushar/miniconda3/envs/horizon25/lib/python3.12/site-packages/sklearn/cluster/__init__.py'>
[31mFile:[39m        ~/miniconda3/envs/horizon25/lib/python3.12/site-packages/sklearn/cluster/__init__.py
[31mSource:[39m     
[33m"""Popular unsupervised clustering algorithms."""[39m

[38;5;66;03m# Authors: The scikit-learn developers[39;00m
[38;5;66;03m# SPDX-License-Identifier: BSD-3-Clause[39;00m

[38;5;28;01mfrom[39;00m ._affinity_propagation [38;5;28;01mimport[39;00m AffinityPropagation, affinity_propagation
[38;5;28;01mfrom[39;00m ._agglomerative [38;5;28;01mimport[39;00m (
    AgglomerativeClustering,
    FeatureAgglomeration,
    linkage_tree,
    ward_tree,
)
[38;5;28;01mfrom[39;00m ._bicluster [38;5;28;01mimport[39;00m SpectralBiclustering, SpectralCoclustering
[38;5;28;01mfrom[39;00m ._birch [38;5;28;01mimport[39;00m Birch
[38;5;28;01mfrom[39;00m ._bisect_k_means [

In [5]:
# Importing the KMeans class from sklearn.cluster.
from sklearn.cluster import KMeans

In [6]:
# Create a KMeans estimator that partitions the data into 3 clusters,
# matching the 3 centers used to generate the blobs.
# KMeans starts from randomly initialised centroids, so random_state is fixed
# to make the fitted centers and the cluster ids reproducible across runs.
km = KMeans(n_clusters=3, random_state=42)
# A bare last expression displays the estimator's repr as the cell output.
km

In [7]:
# Fit the KMeans model to the dataset. Only the features (X_blobs) are passed:
# the true labels y_blobs_true are not used during fitting.
km.fit(X_blobs)

In [8]:
# Examine the cluster centers found by the fitted model.
# cluster_centers_ holds one row per cluster and one column per feature,
# i.e. a 3 x 2 array for this dataset.
centers = km.cluster_centers_
print(f"Cluster centers:\n{centers}")

Cluster centers:
[[ 2.88144459  6.37390145]
 [-8.06832997 -0.51375547]
 [ 5.92303794  5.23173771]]


In [9]:
# The cluster id that the fitted model assigned to each sample.
labels = km.labels_
# There is one label per sample, so the expected shape is (100,).
# The integer ids are arbitrary: they need not match the numbering used in
# y_blobs_true, so the two arrays should not be compared element by element.
print(f"Labels shape: {labels.shape}")
# The first 5 labels assigned to the samples.
print(f"First 5 labels: {labels[:5]}")

Labels shape: (100,)
First 5 labels: [1 0 0 1 2]
