# K-Means

### Importing Python modules

In [12]:
import numpy as np
import pandas as pd
import mlflow
import mlflow.sklearn
from sklearn.preprocessing import LabelEncoder
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score

### Setting the tracking URI to the current folder

In [13]:
# Configure MLflow with an empty tracking URI, then echo the configured value
# back as the cell output.
# NOTE(review): presumably the empty string makes MLflow fall back to a local
# ./mlruns store next to this notebook — confirm for the installed version.
tracking_uri = ''
mlflow.set_tracking_uri(tracking_uri)
mlflow.tracking.get_tracking_uri()

''

### Creating an experiment and verifying its existence

In [14]:
# Create the 'Results' experiment only when it is missing. mlflow.create_experiment
# raises if the name is already taken, so an unconditional call cannot be re-run,
# while skipping it entirely leaves a fresh tracking store without the experiment.
if mlflow.get_experiment_by_name('Results') is None:
    mlflow.create_experiment('Results')

# Display the experiment to verify that it exists.
mlflow.get_experiment_by_name('Results')

<Experiment: artifact_location='file:///C:/Users/stevan.stanovic/Desktop/MLFlow%20local/K-Means/mlruns/1', experiment_id='1', lifecycle_stage='active', name='Results', tags={}>

### Importing data
After importing the data, the head of the dataset looks like this:

![Image of head of Mall Customers](https://raw.githubusercontent.com/StevanStanovic/mlflow/master/2%20-%20MLFlow%20Tracking%20pour%20un%20algorithme%20non%20supervisé/Images/Head_Mall_Customers.PNG)

In [15]:
def import_data(path):
    """Load a CSV file and return its features as a NumPy array.

    The first column of the file is dropped. The first remaining column is
    then replaced by integer codes produced by a ``LabelEncoder``.
    NOTE(review): this presumably targets a categorical column of the
    Mall Customers file — confirm against the CSV.

    Parameters
    ----------
    path : str
        Location of the CSV file, passed to ``pd.read_csv``.

    Returns
    -------
    numpy.ndarray
        Every column except the first, with column 0 label-encoded.
    """
    frame = pd.read_csv(path)

    # Keep every column but the first one.
    features = frame.iloc[:, 1:].values

    # Overwrite the first kept column with its integer label codes.
    encoder = LabelEncoder()
    encoded_first_column = encoder.fit_transform(features[:, 0])
    features[:, 0] = encoded_first_column

    return features

### Starting an MLflow experiment

In [18]:
def mlflow_run(parameter, exp_id=None, run_name=None, data_path='Mall_Customers.csv'):
    """Fit a K-Means model inside an MLflow run and log its parameter, metrics and model.

    Parameters
    ----------
    parameter : int
        Number of clusters, passed to ``KMeans`` as ``n_clusters`` and logged
        as the ``n_clusters`` parameter.
    exp_id : optional
        Experiment ID forwarded to ``mlflow.start_run``.
    run_name : str, optional
        Run name forwarded to ``mlflow.start_run``. It also prefixes the model
        artifact path (``<run_name>_Model``); when ``None``, ``"KMeans"`` is
        used for that prefix.
    data_path : str, optional
        CSV file handed to ``import_data``. Defaults to ``'Mall_Customers.csv'``.

    Returns
    -------
    str
        ID of the MLflow run that was created.
    """
    # The artifact path is built by string concatenation, so the default
    # run_name=None needs a fallback to avoid a TypeError.
    model_path = (run_name if run_name is not None else "KMeans") + "_Model"

    with mlflow.start_run(run_name=run_name, experiment_id=exp_id) as run:

        # Recovering run ID (run_uuid is the deprecated alias of run_id)
        run_id = run.info.run_id

        # Importing data
        X = import_data(data_path)

        # Creating cluster; random_state is fixed so runs are reproducible
        cluster = KMeans(n_clusters=parameter, init='k-means++', random_state=0)

        # Logging tags and parameters
        mlflow.set_tag("Number of parameters", 1)
        mlflow.log_param("n_clusters", parameter)

        # Fitting the model and keeping the cluster label of every sample
        labels = cluster.fit_predict(X)

        # Storing cluster scores
        metrics = {
            "inertia": cluster.inertia_,
            "silhouette_score": silhouette_score(X, labels),
            "davies_bouldin_score": davies_bouldin_score(X, labels),
            "calinski_harabasz_score": calinski_harabasz_score(X, labels),
        }

        # Logging metrics
        for metric_name, metric_value in metrics.items():
            mlflow.log_metric(metric_name, metric_value)

        # Logging model
        mlflow.sklearn.log_model(cluster, model_path)

        return run_id

### Running experiments

We execute nineteen runs, one for each number of clusters from 2 to 20, and compare them to determine the best number of clusters.

In [19]:
if __name__=='__main__':
    # Resolve the experiment ID from the experiment name instead of a
    # hand-edited placeholder, creating the experiment if it does not exist yet.
    experiment = mlflow.get_experiment_by_name('Results')
    if experiment is None:
        exp_id = mlflow.create_experiment('Results')
    else:
        exp_id = experiment.experiment_id

    # Every run shares the same name; they differ by the n_clusters parameter.
    run_name = "KMeans"

    # One run per candidate number of clusters, from 2 to 20 inclusive.
    params = range(2, 21)
    for i in params:
        run_id = mlflow_run(i, exp_id, run_name)
        print("Experience finie !\n exp_id = {} et run_id = {}\n\n".format(exp_id, run_id))

Experience finie !
 exp_id = 1 et run_id = 96d0b081c5fb44809071015399999821


Experience finie !
 exp_id = 1 et run_id = 26b28284af364fa4989fa6bd0ac0f339


Experience finie !
 exp_id = 1 et run_id = b656f4ed99f64b68983721c748c72f6f


Experience finie !
 exp_id = 1 et run_id = 7bd0175612b547fc9ad9f21e86ab55c4


Experience finie !
 exp_id = 1 et run_id = fd1cf101b10a4555a82bdaec2cb9eec1


Experience finie !
 exp_id = 1 et run_id = 82edc413f96241648a2dc41824875ad4


Experience finie !
 exp_id = 1 et run_id = 29559d3f85e741a6867d0b7879d5e94c


Experience finie !
 exp_id = 1 et run_id = 54f0335358c14319aee1eb8fed0b9aad


Experience finie !
 exp_id = 1 et run_id = 8e6490f7e10a45df90aff535ff73a74e


Experience finie !
 exp_id = 1 et run_id = e37cc7011c044cd5963e5e4b07017683


Experience finie !
 exp_id = 1 et run_id = fb4dc70a585c452b8f1b745e19f4a38e


Experience finie !
 exp_id = 1 et run_id = 26e97143300d44a79810cf75c5835512


Experience finie !
 exp_id = 1 et run_id = 0f7519df6c9f45868146e