In [5]:
import mlflow
import logging

# Route every log record (DEBUG and above) to a local log file.
logging.basicConfig(
    filename='model_training_j.log',
    level=logging.DEBUG,
    format='%(name)s - %(levelname)s - %(message)s',
)

# Report the tracking URI that MLflow is currently using.
tracking_uri = mlflow.get_tracking_uri()
print("Current tracking uri: {}".format(tracking_uri))


Current tracking uri: file:///C:/Users/Jo%C3%A3oMonteiro/Documents/dev/formacao/Closer-Challenge/notebook/mlruns


In [6]:
# Select the MLflow experiment used by this notebook and write its metadata
# to the log file. The name is kept in one constant so the "set" and "get"
# calls cannot drift apart.
EXPERIMENT_NAME = "Segmentation of Prospect"

mlflow.set_experiment(experiment_name=EXPERIMENT_NAME)
KM_EXPERIMENT = mlflow.get_experiment_by_name(EXPERIMENT_NAME)

logging.info(f'Experiment Set. Experiment ID: {KM_EXPERIMENT.experiment_id}')
logging.debug(f'Artifact Location: {KM_EXPERIMENT.artifact_location}')
logging.debug(f'Tags: {KM_EXPERIMENT.tags}')
logging.debug(f'Lifecycle_stage: {KM_EXPERIMENT.lifecycle_stage}')

## Library Imports

In [20]:
import pandas as pd
import numpy as np
# https://scikit-learn.org/stable/modules/classes.html#module-sklearn.cluster
from sklearn.cluster import KMeans, DBSCAN
# https://scikit-learn.org/stable/modules/classes.html#sklearn-metrics-metrics
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score

## Parameters

In [8]:
# Location of the CSV file that is loaded into `dfA` below (relative path).
DATA_PATH = "../data/20220413-234639_dataset.csv"

In [9]:
# Keyword arguments shared by the KMeans models fitted in the selection loops;
# `n_clusters` is passed separately at each call site.
kmeans_kwargs = dict(
    init='k-means++',
    n_init=10,
    max_iter=100,
    random_state=42,
)



## Funções predefinidas

### Scores

In [10]:
def metrics_calculation(xData, labelData):
    """Score a clustering with two internal validation metrics.

    Parameters
    ----------
    xData
        Feature data the labels are evaluated against (passed straight to
        the scikit-learn metric functions).
    labelData
        Cluster label assigned to each row of ``xData``.

    Returns
    -------
    tuple
        ``(silhouette, calinski_harabasz)`` where the silhouette is computed
        with the euclidean metric.
    """
    return (
        silhouette_score(xData, labelData, metric='euclidean'),
        calinski_harabasz_score(xData, labelData),
    )

## Load data

In [11]:
# Read the CSV at DATA_PATH into the DataFrame used by every cell below.
dfA = pd.read_csv(DATA_PATH)

In [12]:
# Display the column labels of the loaded DataFrame.
dfA.columns

Index(['cod_cust_id', 'dt_fpy', 'atr_cust_age', 'dsc_edu_deg', 'amt_gms',
       'atr_gla', 'flg_children', 'amt_cmv', 'rt_cr', 'amt_plob_motor',
       'amt_plob_household', 'amt_plob_health', 'amt_plob_life',
       'amt_plob_wcomp', 'Set', 'amt_premium_total', 'outlier_candidate',
       'atr_edu_deg', 'atr_fpy_to_date', 'rt_plob_life', 'rt_plob_household',
       'rt_plob_motor', 'rt_plob_health', 'rt_plob_wcomp', 'amt_gys',
       'amt_claims_total', 'rt_premiums_year', 'rt_claims_year',
       'atr_credit_score_proxy', 'fe_bin_plob_motor',
       'fe_amt_plob_motor_scale', 'fe_bin_cmv_motor', 'fe_cmv_motor_scale',
       'log_amt_plob_motor', 'minmax_amt_plob_motor', 'norm_amt_plob_motor',
       'log_rt_plob_motor', 'fe_bin_plob_household',
       'fe_amt_plob_household_scale', 'fe_bin_cmv_household',
       'fe_cmv_household_scale', 'log_amt_plob_household',
       'minmax_amt_plob_household', 'norm_amt_plob_household',
       'log_rt_plob_household', 'fe_bin_plob_life', 'fe_am

In [15]:
#dfA = dfA.loc[dfA['DATA_MAIN_CUT'] == 'KEEP']

In [88]:
# Columns the cluster labels are scored against in the HEAC selection loop.
METRIC_FEATURES = [
    'atr_cust_age',
    'flg_children',
    'atr_edu_deg',
]
# Columns that must never be offered as candidate features.
ignore_columns = [
    'cod_cust_id',
]
#BAD_FEATURES = ['dt_fpy', 'minmax_atr_cust_age', 'norm_atr_cust_age', 'PCA1_3', 'PCA2_', 'PCA1_4', 'mt_claims_total', 'fe_cmv_cr_quadrant_Type2_X', 
#                'fe_cmv_cr_quadrant_Type2_Y', 'amt_cmv', 'atr_gla', 'rt_cr', 'dt_fpy', 'sqrt_amt_cmv', 'PCA1_5', 'log_amt_plob_health']

In [89]:
# Candidate features: every int64/float64 column that is listed neither in
# `ignore_columns` nor in `METRIC_FEATURES`.
_excluded = set(ignore_columns) | set(METRIC_FEATURES)
columns = [
    name
    for name in dfA.select_dtypes(include=['int64', 'float64']).columns
    if name not in _excluded
]


#select all columns of type int and float except id, cluster and test columns
# columns_no_pre_transform = [column for column in dfInsuranceA.select_dtypes(include=['int64', 'float64']) if column not in ['cod_cust_id', 'Clusters_1', 'test_feature', 'sqrt_amt_cmv', 'log_amt_plob_life', 'log_amt_plob_household',
#        'log_amt_plob_wcomp', 'minmax_amt_plob_health', 'minmax_amt_plob_motor',
#        'minmax_amt_gms', 'log_amt_premium_total',
#        'log_rt_plob_life', 'log_rt_plob_household', 'log_rt_plob_motor',
#        'log_rt_plob_health', 'log_rt_plob_wcomp', 'minmax_atr_cust_age',
#        'minmax_dt_fpy', 'minmax_fpy_to_date']]



In [90]:
# dfTrain = dfA.loc[dfA.Set == 'train']
# dfTest = dfA.loc[dfA.Set == 'test']

## MLFLOW

### OBRIGATORIO: START_RUN

In [91]:
#mlflow.start_run()

In [9]:
# HEAC configuration (same values as the configuration cells further up).
# Columns the cluster labels are scored against:
METRIC_FEATURES = [
    'atr_cust_age',
    'flg_children',
    'atr_edu_deg',
]
# Columns that must never be offered as candidate features:
ignore_columns = [
    'cod_cust_id',
]
# Candidate features: every int64/float64 column not excluded above.
_excluded = set(ignore_columns) | set(METRIC_FEATURES)
columns = [
    name
    for name in dfA.select_dtypes(include=['int64', 'float64']).columns
    if name not in _excluded
]

### Correr o modelo

## HEAC

In [99]:
# HEAC segmentation: greedy forward feature selection for KMeans, tracked in MLflow.
#
# For every cluster count k in [kmin, kmax] one parent run is opened. In each
# round every remaining candidate feature is added in turn to the features
# selected so far, a KMeans model is fitted on that subset (one nested run per
# candidate) and its labels are scored against dfA[METRIC_FEATURES]. One of the
# `top_n` best candidates by silhouette is then drawn at random and kept.
# Candidates scoring below `min_silhouette` are not offered again for this k.
#
# Depends on `dfA`, `columns`, `METRIC_FEATURES` and `kmeans_kwargs` from
# earlier cells.
maxvars = 5             # features to select for each k
kmin = 2                # smallest number of clusters tried
kmax = 4                # largest number of clusters tried (inclusive)
top_n = 3               # size of the pool of best candidates to draw from
min_silhouette = 0.05   # below this a candidate is dropped for the current k

cols = list(columns)
results = []
results_for_each_k = []
vars_for_each_k = {}
bad_features = []

# Seeded generator: re-running the cell reproduces the same random picks.
rng = np.random.default_rng(kmeans_kwargs.get("random_state"))
# The evaluation frame never changes inside the loops, so slice it once.
metric_data = dfA[METRIC_FEATURES]

for k in range(kmin, kmax + 1):
    bad_features = []
    selected_variables = []
    best_of_last_round = np.nan
    with mlflow.start_run(run_name=f'{k} Clusters Run') as parent_run:
        mlflow.log_param('num_clusters', k)
        while len(selected_variables) < maxvars:
            cols = [column for column in columns
                    if (column not in bad_features) and (column not in selected_variables)]
            if not cols:
                # Nothing left to try: picking from an empty score list would raise.
                logging.warning(f'HEAC k={k}: no candidate features left after selecting '
                                f'{len(selected_variables)} of {maxvars}')
                break

            results = []
            for col in cols:
                with mlflow.start_run(run_name='HEAC', nested=True) as child_run:
                    scols = selected_variables + [col]
                    kmeans = KMeans(n_clusters=k, **kmeans_kwargs)
                    kmeans.fit(dfA[scols])
                    # Predict once and reuse the labels for every metric.
                    labels = kmeans.predict(dfA[scols])
                    sil_result = silhouette_score(metric_data, labels)
                    results.append(sil_result)

                    mlflow.log_metric('silhouette_score', sil_result)
                    mlflow.log_metric('ch_score', calinski_harabasz_score(metric_data, labels))
                    mlflow.log_metric('db_score', davies_bouldin_score(metric_data, labels))

                    # NOTE(review): the other sections use '<RUN_NAME>_model'; 'head_model'
                    # may be a typo for 'HEAC_model'. Left unchanged so existing
                    # artifact lookups keep working; confirm.
                    mlflow.sklearn.log_model(kmeans, artifact_path='head_model')
                    mlflow.log_params(kmeans_kwargs)
                    mlflow.log_param('features', scols)
                    mlflow.log_param('num_clusters', k)
                    if sil_result < min_silhouette:
                        bad_features.append(col)

            # Indices of the (up to) top_n best scores; the slice copes with
            # fewer than top_n candidates.
            # NOTE(review): a candidate just flagged as bad can still be drawn
            # here if it is among the top_n; confirm this is intended.
            top_idx = np.argsort(results)[-top_n:]
            selected_var = cols[int(rng.choice(top_idx))]
            selected_variables.append(selected_var)
            best_of_last_round = max(results)

        # Best silhouette of the last completed round (NaN if no round completed).
        results_for_each_k.append(best_of_last_round)
        vars_for_each_k[k] = selected_variables
   


  kmeans.fit(dfA[scols])
  kmeans.fit(dfA[scols])
  kmeans.fit(dfA[scols])
  kmeans.fit(dfA[scols])
  kmeans.fit(dfA[scols])
  kmeans.fit(dfA[scols])
  kmeans.fit(dfA[scols])
  kmeans.fit(dfA[scols])
  kmeans.fit(dfA[scols])


## LOVES

### KMeans

In [17]:
# LOVES configuration.
# Columns the cluster labels are scored against:
METRIC_FEATURES = [
    'amt_plob_motor',
    'amt_plob_household',
    'amt_plob_life',
    'amt_plob_health',
    'amt_plob_wcomp',
    'amt_cmv',
]
# Columns that must never be offered as candidate features:
ignore_columns = ['cod_cust_id', 'amt_gys']
# Candidate features: int64/float64 columns that are not excluded above and
# whose name does not contain '_gys'.
_excluded = set(ignore_columns) | set(METRIC_FEATURES)
columns = [
    name
    for name in dfA.select_dtypes(include=['int64', 'float64']).columns
    if name not in _excluded and '_gys' not in name
]

In [18]:
# LOVES segmentation: greedy forward feature selection for KMeans, tracked in MLflow.
#
# For every cluster count k in [kmin, kmax] one parent run is opened. In each
# round every remaining candidate feature is added in turn to the features
# selected so far, a KMeans model is fitted on that subset (one nested run per
# candidate) and its labels are scored against dfA[METRIC_FEATURES]. One of the
# `top_n` best candidates by silhouette is then drawn at random and kept.
# Candidates scoring below `min_silhouette` are not offered again for this k.
#
# Depends on `dfA`, `columns`, `METRIC_FEATURES` and `kmeans_kwargs` from
# earlier cells.
maxvars = 6             # features to select for each k
kmin = 3                # smallest number of clusters tried
kmax = 4                # largest number of clusters tried (inclusive)
top_n = 3               # size of the pool of best candidates to draw from
min_silhouette = 0.05   # below this a candidate is dropped for the current k

cols = list(columns)
results = []
results_for_each_k = []
vars_for_each_k = {}
bad_features = []

# Seeded generator: re-running the cell reproduces the same random picks.
rng = np.random.default_rng(kmeans_kwargs.get("random_state"))
# The evaluation frame never changes inside the loops, so slice it once.
metric_data = dfA[METRIC_FEATURES]

for k in range(kmin, kmax + 1):
    bad_features = []
    selected_variables = []
    best_of_last_round = np.nan
    with mlflow.start_run(run_name=f'{k} Clusters Run') as parent_run:
        mlflow.log_param('num_clusters', k)
        while len(selected_variables) < maxvars:
            cols = [column for column in columns
                    if (column not in bad_features) and (column not in selected_variables)]
            if not cols:
                # Nothing left to try: picking from an empty score list would raise.
                logging.warning(f'LOVES k={k}: no candidate features left after selecting '
                                f'{len(selected_variables)} of {maxvars}')
                break

            results = []
            for col in cols:
                with mlflow.start_run(run_name='LOVES', nested=True) as child_run:
                    scols = selected_variables + [col]
                    # KMeans, not DBSCAN: DBSCAN accepts neither `n_clusters` nor the
                    # entries of kmeans_kwargs and has no `predict` method.
                    kmeans = KMeans(n_clusters=k, **kmeans_kwargs)
                    kmeans.fit(dfA[scols])
                    # Predict once and reuse the labels for every metric.
                    labels = kmeans.predict(dfA[scols])
                    sil_result = silhouette_score(metric_data, labels)
                    results.append(sil_result)

                    mlflow.log_metric('silhouette_score', sil_result)
                    mlflow.log_metric('ch_score', calinski_harabasz_score(metric_data, labels))
                    mlflow.log_metric('db_score', davies_bouldin_score(metric_data, labels))

                    mlflow.sklearn.log_model(kmeans, artifact_path='LOVES_model')
                    mlflow.log_params(kmeans_kwargs)
                    mlflow.log_param('features', scols)
                    mlflow.log_param('num_clusters', k)
                    if sil_result < min_silhouette:
                        bad_features.append(col)

            # Indices of the (up to) top_n best scores; the slice copes with
            # fewer than top_n candidates.
            # NOTE(review): a candidate just flagged as bad can still be drawn
            # here if it is among the top_n; confirm this is intended.
            top_idx = np.argsort(results)[-top_n:]
            selected_var = cols[int(rng.choice(top_idx))]
            selected_variables.append(selected_var)
            best_of_last_round = max(results)

        # Best silhouette of the last completed round (NaN if no round completed).
        results_for_each_k.append(best_of_last_round)
        vars_for_each_k[k] = selected_variables
   


  kmeans.fit(dfA[scols])
  kmeans.fit(dfA[scols])
  kmeans.fit(dfA[scols])
  kmeans.fit(dfA[scols])
  kmeans.fit(dfA[scols])
  kmeans.fit(dfA[scols])
  kmeans.fit(dfA[scols])
  kmeans.fit(dfA[scols])
  kmeans.fit(dfA[scols])
  kmeans.fit(dfA[scols])
  kmeans.fit(dfA[scols])


## SCORER

### KMeans

In [13]:
# SCORER configuration.
# Columns the cluster labels are scored against:
METRIC_FEATURES = [
    'atr_cust_age',
    'amt_gms',
    'amt_cmv',
    'amt_premium_total',
]
# Columns that must never be offered as candidate features:
ignore_columns = ['cod_cust_id', 'amt_gys']
# Candidate features: int64/float64 columns that are not excluded above and
# whose name contains neither '_gys' nor '_gms'.
_excluded = set(ignore_columns) | set(METRIC_FEATURES)
columns = [
    name
    for name in dfA.select_dtypes(include=['int64', 'float64']).columns
    if name not in _excluded and '_gys' not in name and '_gms' not in name
]

In [14]:
# SCORER segmentation: greedy forward feature selection for KMeans, tracked in MLflow.
#
# For every cluster count k in [kmin, kmax] one parent run is opened. In each
# round every remaining candidate feature is added in turn to the features
# selected so far, a KMeans model is fitted on that subset (one nested run per
# candidate) and its labels are scored against dfA[METRIC_FEATURES]. One of the
# `top_n` best candidates by silhouette is then drawn at random and kept.
# Candidates scoring below `min_silhouette` are not offered again for this k.
#
# Depends on `dfA`, `columns`, `METRIC_FEATURES` and `kmeans_kwargs` from
# earlier cells.
maxvars = 6             # features to select for each k
kmin = 2                # smallest number of clusters tried
kmax = 4                # largest number of clusters tried (inclusive)
top_n = 3               # size of the pool of best candidates to draw from
min_silhouette = 0.05   # below this a candidate is dropped for the current k

cols = list(columns)
results = []
results_for_each_k = []
vars_for_each_k = {}
bad_features = []

# Seeded generator: re-running the cell reproduces the same random picks.
rng = np.random.default_rng(kmeans_kwargs.get("random_state"))
# The evaluation frame never changes inside the loops, so slice it once.
metric_data = dfA[METRIC_FEATURES]

for k in range(kmin, kmax + 1):
    bad_features = []
    selected_variables = []
    best_of_last_round = np.nan
    with mlflow.start_run(run_name=f'{k} Clusters Run') as parent_run:
        mlflow.log_param('num_clusters', k)
        while len(selected_variables) < maxvars:
            cols = [column for column in columns
                    if (column not in bad_features) and (column not in selected_variables)]
            if not cols:
                # Nothing left to try: picking from an empty score list would raise.
                logging.warning(f'SCORER k={k}: no candidate features left after selecting '
                                f'{len(selected_variables)} of {maxvars}')
                break

            results = []
            for col in cols:
                with mlflow.start_run(run_name='SCORER', nested=True) as child_run:
                    scols = selected_variables + [col]
                    kmeans = KMeans(n_clusters=k, **kmeans_kwargs)
                    kmeans.fit(dfA[scols])
                    # Predict once and reuse the labels for every metric.
                    labels = kmeans.predict(dfA[scols])
                    sil_result = silhouette_score(metric_data, labels)
                    results.append(sil_result)

                    mlflow.log_metric('silhouette_score', sil_result)
                    mlflow.log_metric('ch_score', calinski_harabasz_score(metric_data, labels))
                    mlflow.log_metric('db_score', davies_bouldin_score(metric_data, labels))

                    mlflow.sklearn.log_model(kmeans, artifact_path='SCORER_model')
                    mlflow.log_params(kmeans_kwargs)
                    mlflow.log_param('features', scols)
                    mlflow.log_param('num_clusters', k)
                    if sil_result < min_silhouette:
                        bad_features.append(col)

            # Indices of the (up to) top_n best scores; the slice copes with
            # fewer than top_n candidates.
            # NOTE(review): a candidate just flagged as bad can still be drawn
            # here if it is among the top_n; confirm this is intended.
            top_idx = np.argsort(results)[-top_n:]
            selected_var = cols[int(rng.choice(top_idx))]
            selected_variables.append(selected_var)
            best_of_last_round = max(results)

        # Best silhouette of the last completed round (NaN if no round completed).
        results_for_each_k.append(best_of_last_round)
        vars_for_each_k[k] = selected_variables
   


  kmeans.fit(dfA[scols])
  kmeans.fit(dfA[scols])
  kmeans.fit(dfA[scols])
  kmeans.fit(dfA[scols])
  kmeans.fit(dfA[scols])
  kmeans.fit(dfA[scols])
  kmeans.fit(dfA[scols])
  kmeans.fit(dfA[scols])
  kmeans.fit(dfA[scols])
  kmeans.fit(dfA[scols])
  kmeans.fit(dfA[scols])


### DBSCAN

In [43]:
# Display the column labels of `dfA` again before choosing the DBSCAN candidates.
dfA.columns

Index(['cod_cust_id', 'dt_fpy', 'atr_cust_age', 'dsc_edu_deg', 'amt_gms',
       'atr_gla', 'flg_children', 'amt_cmv', 'rt_cr', 'amt_plob_motor',
       'amt_plob_household', 'amt_plob_health', 'amt_plob_life',
       'amt_plob_wcomp', 'Set', 'amt_premium_total', 'outlier_candidate',
       'atr_edu_deg', 'atr_fpy_to_date', 'rt_plob_life', 'rt_plob_household',
       'rt_plob_motor', 'rt_plob_health', 'rt_plob_wcomp', 'amt_gys',
       'amt_claims_total', 'rt_premiums_year', 'rt_claims_year',
       'atr_credit_score_proxy', 'fe_bin_plob_motor',
       'fe_amt_plob_motor_scale', 'fe_bin_cmv_motor', 'fe_cmv_motor_scale',
       'log_amt_plob_motor', 'minmax_amt_plob_motor', 'norm_amt_plob_motor',
       'log_rt_plob_motor', 'fe_bin_plob_household',
       'fe_amt_plob_household_scale', 'fe_bin_cmv_household',
       'fe_cmv_household_scale', 'log_amt_plob_household',
       'minmax_amt_plob_household', 'norm_amt_plob_household',
       'log_rt_plob_household', 'fe_bin_plob_life', 'fe_am

In [71]:
# Keyword arguments shared by every DBSCAN model fitted below; `eps` is
# supplied separately at the call site.
dbscan_kwargs = dict(
    metric="euclidean",
    algorithm='kd_tree',
    min_samples=17,
)

# Candidate features for DBSCAN: int64/float64 columns not listed in
# `ignore_columns` that are either explicitly named here or whose name
# matches one of the substring rules.
columns = []
for candidate in dfA.select_dtypes(include=['int64', 'float64']).columns:
    if candidate in ignore_columns:
        continue
    is_listed = candidate in ('flg_children', 'atr_edu_deg')
    has_scale_or_quadrant = ('_scale' in candidate) or ('_quadrant' in candidate)
    # 'rt_' is also a substring of 'sqrt_', hence the explicit 'sqrt' exclusion.
    matches_rt = ('rt_' in candidate) and ('sqrt' not in candidate)
    if is_listed or has_scale_or_quadrant or matches_rt:
        columns.append(candidate)

In [73]:
# SCORER segmentation with DBSCAN: greedy forward feature selection, tracked in MLflow.
#
# For every eps value one parent run is opened. In each round every remaining
# candidate feature is added in turn to the features selected so far and a
# DBSCAN model is fitted on that subset (one nested run per candidate).
# Candidates whose model yields a single label are tagged "REMOVE" and are not
# scored. The others are scored against dfA[METRIC_FEATURES] with the
# Calinski-Harabasz index, and one of the `top_n` best is drawn at random.
#
# Depends on `dfA`, `columns`, `dbscan_kwargs` and on `METRIC_FEATURES` as left
# by the SCORER configuration cell.
# NOTE(review): the scores are computed on `dbscan_model.labels_` as-is; if
# DBSCAN marks noise points with their own label they are scored like a
# cluster. Confirm this is intended.
maxvars = 6                              # features to select for each eps
eps_param = np.arange(0.1, 0.81, 0.1)    # eps values to explore
top_n = 3                                # size of the pool of best candidates to draw from

cols = list(columns)
results = []
results_for_each_k = []
vars_for_each_k = {}
bad_features = []

# Seeded generator: re-running the cell reproduces the same random picks.
rng = np.random.default_rng(42)
# The evaluation frame never changes inside the loops, so slice it once.
metric_data = dfA[METRIC_FEATURES]

for eps in eps_param:
    bad_features = []
    selected_variables = []
    best_of_last_round = np.nan
    # The loop variable is an eps value, not a cluster count, so the parent run
    # is named and parameterised accordingly.
    with mlflow.start_run(run_name=f'eps={eps:.2f} Run') as parent_run:
        mlflow.log_param('eps', eps)
        while len(selected_variables) < maxvars:
            cols = [column for column in columns
                    if (column not in bad_features) and (column not in selected_variables)]
            results = []
            scored_cols = []   # candidates that produced a score, aligned with `results`
            for col in cols:
                with mlflow.start_run(run_name='SCORER_DBSCAN', nested=True) as child_run:
                    scols = selected_variables + [col]
                    dbscan_model = DBSCAN(eps=eps, **dbscan_kwargs)
                    dbscan_model.fit(dfA[scols])
                    predictions = dbscan_model.labels_
                    if len(np.unique(predictions)) <= 1:
                        # A single label cannot be scored by the metrics below.
                        mlflow.set_tag("MODEL.QUALITY", "REMOVE")
                        continue

                    ch_result = calinski_harabasz_score(metric_data, predictions)
                    results.append(ch_result)
                    scored_cols.append(col)

                    mlflow.set_tag("MODEL.TYPE", "DBSCAN")
                    mlflow.set_tag("MODEL.NAME", "SCORER")
                    mlflow.log_metric('silhouette_score', silhouette_score(metric_data, predictions))
                    mlflow.log_metric('ch_score', ch_result)
                    mlflow.log_metric('db_score', davies_bouldin_score(metric_data, predictions))
                    mlflow.log_params(dbscan_kwargs)
                    mlflow.log_param('features', scols)
                    mlflow.log_param('eps', eps)

                    # Quality bands on the Calinski-Harabasz score.
                    if ch_result > 500:
                        quality = "HIGH"
                    elif ch_result > 100:
                        quality = "GOOD"
                    elif ch_result < 10:
                        quality = "BAD"
                    else:
                        quality = "OK"
                    mlflow.set_tag("MODEL.QUALITY", quality)

                    if ch_result > 100:
                        # Only models rated GOOD or better are stored as artifacts.
                        mlflow.sklearn.log_model(dbscan_model, artifact_path='SCORER_model')
                    if ch_result < 10:
                        bad_features.append(col)

            if not results:
                # No candidate could be scored: picking from an empty list would raise.
                logging.warning(f'SCORER_DBSCAN eps={eps}: no scorable candidate after selecting '
                                f'{len(selected_variables)} of {maxvars}')
                break

            # Index into `scored_cols`, not `cols`: skipped candidates have no
            # entry in `results`, so the two lists are not aligned.
            top_idx = np.argsort(results)[-top_n:]
            selected_var = scored_cols[int(rng.choice(top_idx))]
            selected_variables.append(selected_var)
            best_of_last_round = max(results)

        # Best score of the last completed round (NaN if no round completed).
        results_for_each_k.append(best_of_last_round)
        vars_for_each_k[eps] = selected_variables
   


In [23]:
 # End the currently active MLflow run.
 # NOTE(review): the selection loops open their runs in `with` blocks, so this
 # presumably only matters for a run left open by an interrupted cell; confirm.
 mlflow.end_run()

In [22]:
# Build a KMeans model from `kmeans_params` and fit it on the selected
# features of `dfTrain`.
# NOTE(review): `kmeans_params`, `dfTrain` and `SELECTED_FEATURES` are not
# assigned anywhere in the visible notebook (the `dfTrain`/`dfTest` split cell
# is commented out), so this cell presumably fails on a fresh kernel; confirm
# where these names come from.
model_kmeans = KMeans(**kmeans_params)
model_kmeans.fit(dfTrain[SELECTED_FEATURES])

KMeans(init='random', max_iter=20, n_clusters=3, random_state=42)

#### Train

In [25]:
# Assign clusters to the rows of `dfTrain`, score them with
# `metrics_calculation` (silhouette, Calinski-Harabasz) and log both values.
# NOTE(review): no `mlflow.start_run()` is visible for this part of the
# notebook; confirm which run these metrics are meant to be attached to.
predict_labelData = model_kmeans.predict(dfTrain[SELECTED_FEATURES])
sil_train, ch_train = metrics_calculation(dfTrain[SELECTED_FEATURES], predict_labelData)

mlflow.log_metric('Train - Silhouette', sil_train)
mlflow.log_metric('Train - Calinski Harabasz', ch_train)

#### Test

In [26]:
# Assign clusters to the rows of `dfTest` with the already fitted model, score
# them with `metrics_calculation` (silhouette, Calinski-Harabasz) and log both
# values. `predict_labelData` is reused and now holds the labels for `dfTest`.
# NOTE(review): `dfTest` is not assigned anywhere in the visible notebook
# (its defining cell is commented out); confirm where it comes from.
predict_labelData = model_kmeans.predict(dfTest[SELECTED_FEATURES])
sil_test, ch_test = metrics_calculation(dfTest[SELECTED_FEATURES], predict_labelData)

mlflow.log_metric('Test - Silhouette', sil_test)
mlflow.log_metric('Test - Calinski Harabasz', ch_test)