**Début de l'implémentation de l'algorithme de clustering K-Means from scratch**

In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np 

In [2]:
# Load the iris table from "iris.csv"; the relative path is resolved against
# the notebook's current working directory.
df = pd.read_csv("iris.csv")

In [4]:
df

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,virginica
146,6.3,2.5,5.0,1.9,virginica
147,6.5,3.0,5.2,2.0,virginica
148,6.2,3.4,5.4,2.3,virginica


Étant donné que le K-Means est un algorithme d'apprentissage non supervisé, on retire la colonne cible `species`, qui contient les noms des espèces de fleurs.

In [5]:
# K-Means is unsupervised, so the label column must not be part of the features.
# errors="ignore" keeps this cell idempotent: because it rebinds `df`, running it
# a second time would otherwise raise KeyError ('species' is already gone).
df = df.drop(columns=["species"], errors="ignore")

In [6]:
df

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2
...,...,...,...,...
145,6.7,3.0,5.2,2.3
146,6.3,2.5,5.0,1.9
147,6.5,3.0,5.2,2.0
148,6.2,3.4,5.4,2.3


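On standardise également les données (moyenne nulle et variance unitaire pour chaque variable) : le K-Means repose sur des distances euclidiennes, qui sont sensibles à l'échelle des variables.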

In [8]:
from sklearn.preprocessing import StandardScaler
# Build a scaler with its default options and fit it on the feature table.
# fit() is the last expression of the cell, so its return value (the fitted
# estimator, shown with its parameters) is what the notebook displays.
scaler = StandardScaler()
scaler.fit(df)

0,1,2
,"copy  copy: bool, default=True If False, try to avoid a copy and do inplace scaling instead. This is not guaranteed to always work inplace; e.g. if the data is not a NumPy array or scipy.sparse CSR matrix, a copy may still be returned.",True
,"with_mean  with_mean: bool, default=True If True, center the data before scaling. This does not work (and will raise an exception) when attempted on sparse matrices, because centering them entails building a dense matrix which in common use cases is likely to be too large to fit in memory.",True
,"with_std  with_std: bool, default=True If True, scale the data to unit variance (or equivalently, unit standard deviation).",True


In [11]:
# fit_transform() fits the scaler on df again and returns the standardised
# values as a NumPy array (one row per sample, one column per feature).
X_scaled = scaler.fit_transform(df)

In [13]:
X_scaled

array([[-9.00681170e-01,  1.03205722e+00, -1.34127240e+00,
        -1.31297673e+00],
       [-1.14301691e+00, -1.24957601e-01, -1.34127240e+00,
        -1.31297673e+00],
       [-1.38535265e+00,  3.37848329e-01, -1.39813811e+00,
        -1.31297673e+00],
       [-1.50652052e+00,  1.06445364e-01, -1.28440670e+00,
        -1.31297673e+00],
       [-1.02184904e+00,  1.26346019e+00, -1.34127240e+00,
        -1.31297673e+00],
       [-5.37177559e-01,  1.95766909e+00, -1.17067529e+00,
        -1.05003079e+00],
       [-1.50652052e+00,  8.00654259e-01, -1.34127240e+00,
        -1.18150376e+00],
       [-1.02184904e+00,  8.00654259e-01, -1.28440670e+00,
        -1.31297673e+00],
       [-1.74885626e+00, -3.56360566e-01, -1.34127240e+00,
        -1.31297673e+00],
       [-1.14301691e+00,  1.06445364e-01, -1.28440670e+00,
        -1.44444970e+00],
       [-5.37177559e-01,  1.49486315e+00, -1.28440670e+00,
        -1.31297673e+00],
       [-1.26418478e+00,  8.00654259e-01, -1.22754100e+00,
      

In [14]:
def distance_euclidienne(x, y):
    """Return the Euclidean distance between two vectors.

    Parameters
    ----------
    x, y : array-like of numbers
        Coordinates of the two points. Both must have the same shape and be
        at least one-dimensional.

    Returns
    -------
    numpy.float64
        ``sqrt(sum_k (x[k] - y[k]) ** 2)``. The sum runs along the first
        axis, so 1-D inputs give a scalar; two empty vectors give ``0.0``.

    Raises
    ------
    ValueError
        If ``x`` and ``y`` do not have the same shape. A per-index loop over
        ``len(x)`` would silently ignore the extra components of a longer
        ``y`` and return a wrong distance, so the mismatch is rejected.
    """
    a = np.asarray(x, dtype=float)
    b = np.asarray(y, dtype=float)

    # Explicit check: NumPy broadcasting would otherwise accept some
    # mismatched shapes (e.g. (1,) against (n,)) without complaining.
    if a.shape != b.shape:
        raise ValueError(
            f"x and y must have the same shape, got {a.shape} and {b.shape}"
        )

    return np.sqrt(np.sum((a - b) ** 2, axis=0))
        

In [19]:
class k_means:
    """K-Means clustering (Lloyd's algorithm) written with NumPy only.

    Centroids are initialised uniformly at random inside the bounding box of
    the training data, then the usual two steps are repeated until the
    centroids stop moving or ``max_iter`` is reached:

    1. assign every sample to its nearest centroid (Euclidean distance);
    2. move every centroid to the mean of the samples assigned to it.

    Parameters
    ----------
    k : int, default=3
        Number of clusters.
    max_iter : int, default=100
        Maximum number of assignment/update iterations.
    random_state : int or None, default=None
        Seed for the centroid initialisation. With ``None`` the global
        ``np.random`` state is used, so results still follow any
        ``np.random.seed`` call; with an integer, ``fit`` is reproducible
        regardless of the global state.

    Attributes
    ----------
    centroids : numpy.ndarray of shape (k, n_features) or None
        Cluster centres found by ``fit``; ``None`` until ``fit`` is called.
    """

    def __init__(self, k=3, max_iter=100, random_state=None):
        self.k = k
        self.max_iter = max_iter
        self.random_state = random_state
        self.centroids = None

    @staticmethod
    def _as_matrix(X):
        """Convert ``X`` to a 2-D float array of shape (n_samples, n_features)."""
        # np.asarray also accepts DataFrames and nested lists; iterating a
        # DataFrame directly would yield column labels instead of rows.
        X = np.asarray(X, dtype=float)
        if X.ndim != 2:
            raise ValueError(
                f"X must be 2-D (n_samples, n_features), got {X.ndim} dimension(s)"
            )
        return X

    @staticmethod
    def _squared_distances(X, centroids):
        """Return the (n_samples, k) matrix of squared Euclidean distances."""
        if X.shape[1] != centroids.shape[1]:
            raise ValueError(
                f"X has {X.shape[1]} feature(s) but the centroids have "
                f"{centroids.shape[1]}"
            )
        # Broadcasting: (n, 1, d) - (1, k, d) -> (n, k, d), summed over d.
        diff = X[:, np.newaxis, :] - centroids[np.newaxis, :, :]
        return np.sum(diff ** 2, axis=2)

    def _fitted_centroids(self):
        """Return the centroids as a float array, or fail if ``fit`` was not run."""
        if self.centroids is None:
            raise RuntimeError("This k_means instance is not fitted yet; call fit(X) first.")
        return np.asarray(self.centroids, dtype=float)

    def inertie(self, X):
        """Return the inertia of ``X``.

        The inertia is the sum, over all samples, of the squared distance to
        the nearest centroid. An empty ``X`` gives ``0``.

        Raises
        ------
        RuntimeError
            If the model has not been fitted.
        ValueError
            If ``X`` is not 2-D or its feature count differs from the centroids'.
        """
        centroids = self._fitted_centroids()
        X = np.asarray(X, dtype=float)
        if X.size == 0:
            return 0
        X = self._as_matrix(X)
        return np.sum(np.min(self._squared_distances(X, centroids), axis=1))

    def fit(self, X):
        """Compute the centroids from ``X`` and store them in ``self.centroids``.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training samples (NumPy array, DataFrame or nested list).

        Returns
        -------
        k_means
            The fitted instance itself, so calls can be chained.

        Raises
        ------
        ValueError
            If ``X`` is not 2-D, has no sample, or if ``k`` is smaller than 1.
        """
        X = self._as_matrix(X)
        if X.shape[0] == 0:
            raise ValueError("X must contain at least one sample")
        if self.k < 1:
            raise ValueError(f"k must be at least 1, got {self.k}")

        # Same sampling calls as the global np.random functions; a private
        # RandomState is used only when a seed was requested.
        if self.random_state is None:
            rng = np.random
        else:
            rng = np.random.RandomState(self.random_state)

        centroids = rng.uniform(
            low=X.min(axis=0),
            high=X.max(axis=0),
            size=(self.k, X.shape[1]),
        )

        for _ in range(self.max_iter):
            old_centroids = centroids.copy()

            # Assignment step: index of the nearest centroid for each sample.
            labels = np.argmin(self._squared_distances(X, centroids), axis=1)

            # Update step.
            for j in range(self.k):
                members = X[labels == j]
                if len(members) > 0:
                    centroids[j] = members.mean(axis=0)
                else:
                    # Empty cluster: restart this centroid on a random sample
                    # so that it can attract points in the next iteration.
                    centroids[j] = X[rng.randint(len(X))]

            if np.allclose(old_centroids, centroids):
                break

        self.centroids = centroids
        return self

    def predict(self, X):
        """Return, for each sample of ``X``, the index of its nearest centroid.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)

        Returns
        -------
        list
            One cluster index per sample, in the order of ``X``; an empty
            ``X`` gives an empty list.

        Raises
        ------
        RuntimeError
            If the model has not been fitted.
        ValueError
            If ``X`` is not 2-D or its feature count differs from the centroids'.
        """
        centroids = self._fitted_centroids()
        X = np.asarray(X, dtype=float)
        if X.size == 0:
            return []
        X = self._as_matrix(X)
        return list(np.argmin(self._squared_distances(X, centroids), axis=1))