# K-Means

### Importing Python modules

In [3]:
import numpy as np
import pandas as pd
import mlflow
import mlflow.sklearn
from sklearn.preprocessing import LabelEncoder
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabaz_score

### Importing data
After importing the data, the head of the dataset looks like this:

![Image of head of Mall Customers](https://raw.githubusercontent.com/StevanStanovic/mlflow/master/2%20-%20MLFlow%20Tracking%20pour%20un%20algorithme%20non%20supervisé/Images/Head_Mall_Customers.PNG)

In [5]:
def import_data(path):
    """Load a CSV file and return its feature matrix.

    The first column of the file is dropped, and the first remaining column
    is replaced by integer codes produced by a ``LabelEncoder``.
    (Presumably these are an ID column and a categorical column such as
    gender -- confirm against the dataset.)

    Args:
        path: Location of the CSV file, passed to ``pd.read_csv``.

    Returns:
        The array of remaining columns, with column 0 label-encoded.
    """
    encoder = LabelEncoder()

    # Everything except the first column of the file
    features = pd.read_csv(path).iloc[:, 1:].values

    # Replace the first kept column by its integer codes, in place
    first_column = features[:, 0]
    features[:, 0] = encoder.fit_transform(first_column)

    return features

### Starting an MLflow experiment

In [7]:
def mlflow_run(parameter, exp_id=None, run_name=None,
               data_path='/dbfs/FileStore/tables/Mall_Customers.csv'):
    """Fit a K-Means model inside an MLflow run and log its results.

    The run records one tag, the ``n_clusters`` parameter, four clustering
    metrics (inertia, silhouette, Davies-Bouldin, Calinski-Harabasz) and the
    fitted model itself.

    Args:
        parameter: Number of clusters passed to ``KMeans(n_clusters=...)``.
        exp_id: Experiment ID forwarded to ``mlflow.start_run``.
        run_name: Optional run name forwarded to ``mlflow.start_run``. It is
            also used as the prefix of the logged model's artifact path.
        data_path: CSV file read through ``import_data``. Defaults to the
            path that was previously hard-coded.

    Returns:
        The ID of the MLflow run that was created.
    """
    # `run_name` defaults to None, and None + "_Model" raises TypeError;
    # fall back to a plain artifact path when no name was given.
    if run_name is None:
        artifact_path = "Model"
    else:
        artifact_path = run_name + "_Model"

    with mlflow.start_run(run_name=run_name, experiment_id=exp_id) as run:

        # Recovering run ID
        run_id = run.info.run_uuid

        # Importing data
        X = import_data(data_path)

        # Creating cluster (fixed random_state keeps runs reproducible)
        cluster = KMeans(n_clusters=parameter, init='k-means++', random_state=0)

        # Logging tags and parameters
        mlflow.set_tag("Number of parameters", 1)
        mlflow.log_param("n_clusters", parameter)

        # Fitting the model; fit_predict returns the same labels that are
        # stored in cluster.labels_
        labels = cluster.fit_predict(X)

        # Storing cluster scores in variables
        inertia = cluster.inertia_
        silhouette = silhouette_score(X, labels)
        davies = davies_bouldin_score(X, labels)
        # NOTE(review): newer scikit-learn releases renamed this function to
        # `calinski_harabasz_score`; the file-level import must be updated
        # there -- confirm against the installed version.
        calinski = calinski_harabaz_score(X, labels)

        # Logging metrics
        mlflow.log_metric("inertia", inertia)
        mlflow.log_metric("silhouette_score", silhouette)
        mlflow.log_metric("davies_bouldin_score", davies)
        mlflow.log_metric("calinski_harabasz_score", calinski)

        # Logging model
        mlflow.sklearn.log_model(cluster, artifact_path)

        return run_id

### Running experiments

We execute nineteen runs (one for each number of clusters from 2 to 20) in order to compare them and determine the best number of clusters.

In [9]:
if __name__ == '__main__':
    # Setting experiment ID and run name.
    # Replace None with your own experiment ID. The original placeholder left
    # this assignment without a value, which is a syntax error. With None,
    # mlflow.start_run is expected to use the active/default experiment --
    # confirm for your MLflow version.
    exp_id = None
    run_name = "KMeans"

    # One run per candidate number of clusters: 2, 3, ..., 20
    params = range(2, 21)
    for i in params:
        run_id = mlflow_run(i, exp_id, run_name)
        print("Finish experiment !\n exp_id = {} et run_id = {}\n\n".format(exp_id, run_id))