In [1]:
import mlflow
import logging

# Route log records of level DEBUG and above to a local file, tagging each line
# with the logger name and the level.
logging.basicConfig(
    filename='model_training_1.log',
    level=logging.DEBUG,
    format='%(name)s - %(levelname)s - %(message)s',
)

# Show which tracking URI MLflow is currently configured with.
tracking_uri = mlflow.get_tracking_uri()
print(f"Current tracking uri: {tracking_uri}")


Current tracking uri: file:///C:/Users/Jo%C3%A3oMonteiro/Documents/dev/formacao/Closer-Challenge/notebook/mlruns


In [2]:
# Single definition of the experiment name, shared by the two calls below.
EXPERIMENT_NAME = "Proof-of-concept KMeans"

# Select the experiment by name, then look up its record by the same name.
mlflow.set_experiment(experiment_name=EXPERIMENT_NAME)
KM_EXPERIMENT = mlflow.get_experiment_by_name(EXPERIMENT_NAME)

# The experiment ID is logged at INFO; the remaining fields are DEBUG detail.
logging.info(f'Experiment Set. Experiment ID: {KM_EXPERIMENT.experiment_id}')
logging.debug(f'Artifact Location: {KM_EXPERIMENT.artifact_location}')
logging.debug(f'Tags: {KM_EXPERIMENT.tags}')
logging.debug(f'Lifecycle_stage: {KM_EXPERIMENT.lifecycle_stage}')

## Library Imports

In [3]:
import pandas as pd
# https://scikit-learn.org/stable/modules/classes.html#module-sklearn.cluster
from sklearn.cluster import KMeans
# https://scikit-learn.org/stable/modules/classes.html#sklearn-metrics-metrics
from sklearn.metrics import silhouette_score, calinski_harabasz_score

## Parameters

In [11]:
# CSV file read by pd.read_csv in the "Load data" section. The path is relative,
# so it resolves against the kernel's working directory.
DATA_PATH = "../data/20220412-171427_dataset.csv"

In [17]:
# Columns used to fit the KMeans model and to compute its scores.
SELECTED_FEATURES = [
    'fe_amt_plob_motor_scale',
    'fe_amt_plob_life_scale',
    'fe_amt_plob_health_scale',
    'fe_amt_plob_wcomp_scale',
    'fe_amt_plob_household_scale',
]

In [13]:
# Keyword arguments unpacked into the KMeans constructor and logged to MLflow.
kmeans_params = dict(
    n_clusters=3,
    init='random',
    n_init=10,
    max_iter=20,
    random_state=42,
)

## Funções predefinidas

### Scores

In [20]:
def metrics_calculation(xData, labelData):
    """Score a clustering with two label-based metrics.

    Args:
        xData: Feature data handed unchanged to both scikit-learn scorers.
        labelData: Cluster labels handed unchanged to both scorers
            (presumably one label per row of ``xData`` -- confirm with caller).

    Returns:
        A ``(silhouette, calinski_harabasz)`` tuple: the silhouette score
        computed with the Euclidean metric, followed by the
        Calinski-Harabasz score.
    """
    # The silhouette score is evaluated first, then Calinski-Harabasz.
    return (
        silhouette_score(xData, labelData, metric='euclidean'),
        calinski_harabasz_score(xData, labelData),
    )

## Load data

In [14]:
# Load the full dataset from DATA_PATH; the next cell splits it on its `Set` column.
dfA = pd.read_csv(DATA_PATH)

In [15]:
# Partition the rows according to the value stored in the `Set` column.
is_train = dfA['Set'] == 'train'
is_test = dfA['Set'] == 'test'

dfTrain = dfA.loc[is_train]
dfTest = dfA.loc[is_test]

## MLFLOW

### OBRIGATÓRIO: START_RUN

In [21]:
# Open an MLflow run; it is closed by the mlflow.end_run() cell at the end of the notebook.
mlflow.start_run()

<ActiveRun: >

### Correr o modelo

In [22]:
# Build the estimator from the hyperparameter dict defined in the Parameters section.
model_kmeans = KMeans(**kmeans_params)

# Fit on the selected feature columns of the training split; the fit call stays
# the last expression so the cell still displays the estimator.
train_features = dfTrain[SELECTED_FEATURES]
model_kmeans.fit(train_features)

KMeans(init='random', max_iter=20, n_clusters=3, random_state=42)

#### Guardar o modelo e parâmetros

In [27]:
# Log the fitted estimator to MLflow under the artifact path 'poc_kmodel'.
mlflow.sklearn.log_model(model_kmeans, artifact_path = 'poc_kmodel')

# Also record the variables that were used: the feature list and the KMeans hyperparameters.
mlflow.log_param('features', SELECTED_FEATURES)
mlflow.log_params(kmeans_params)

### Scores 

#### Train

In [25]:
# Assign each training row to a cluster, then score that assignment.
train_features = dfTrain[SELECTED_FEATURES]
predict_labelData = model_kmeans.predict(train_features)
sil_train, ch_train = metrics_calculation(train_features, predict_labelData)

# Log both training scores to MLflow, silhouette first.
for metric_name, metric_value in {
    'Train - Silhouette': sil_train,
    'Train - Calinski Harabasz': ch_train,
}.items():
    mlflow.log_metric(metric_name, metric_value)

#### Test

In [26]:
# Assign each test row to a cluster with the already-fitted model, then score it.
test_features = dfTest[SELECTED_FEATURES]
predict_labelData = model_kmeans.predict(test_features)
sil_test, ch_test = metrics_calculation(test_features, predict_labelData)

# Log both test scores to MLflow, silhouette first.
for metric_name, metric_value in {
    'Test - Silhouette': sil_test,
    'Test - Calinski Harabasz': ch_test,
}.items():
    mlflow.log_metric(metric_name, metric_value)

### Visualização

In [None]:
# NOTE(review): `plt` is not imported in any cell shown in this notebook, and this
# cell has no execution count -- presumably `import matplotlib.pyplot as plt` is
# missing from the imports cell; confirm before running.
# Draws a fixed line through (0, 2) and (1, 3); the data is hard-coded, not model output.
fig, ax = plt.subplots()
ax.plot([0, 1], [2, 3])


# Log the figure to MLflow under the file name "figure.png".
mlflow.log_figure(fig, "figure.png")

### OBRIGATÓRIO: END_RUN

In [30]:
# Close the MLflow run opened by the mlflow.start_run() cell.
mlflow.end_run()