In [6]:
import gower
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.graph_objs as go
import seaborn as sns
from plotly.offline import iplot
from pyclustering.cluster.kmedoids import kmedoids
from scipy import stats
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import MinMaxScaler


def load_data(filepath):
    '''Read a CSV file into a pandas dataframe.

    filepath: location of the CSV file, given as a string.
    Returns the dataframe produced by pandas.read_csv with default options.
    '''
    return pd.read_csv(filepath)

# Portfolio 3 as written by the ETL step; the path is relative to the notebook.
filepath = '../data/portfolio3_ETL.csv'
portfolio3_ID = load_data(filepath)

# portfolio3_ID keeps the 'id' column; portfolio3 is the same frame without it.
portfolio3 = portfolio3_ID.drop('id', axis=1)
portfolio3.head()

Unnamed: 0,idade_empresa_anos,"de_faixa_faturamento_estimado_ATE R$ 81.000,00","de_faixa_faturamento_estimado_DE R$ 1.500.000,01 A R$ 4.800.000,00","de_faixa_faturamento_estimado_DE R$ 10.000.000,01 A R$ 30.000.000,00","de_faixa_faturamento_estimado_DE R$ 100.000.000,01 A R$ 300.000.000,00","de_faixa_faturamento_estimado_DE R$ 30.000.000,01 A R$ 100.000.000,00","de_faixa_faturamento_estimado_DE R$ 300.000.000,01 A R$ 500.000.000,00","de_faixa_faturamento_estimado_DE R$ 360.000,01 A R$ 1.500.000,00","de_faixa_faturamento_estimado_DE R$ 4.800.000,01 A R$ 10.000.000,00","de_faixa_faturamento_estimado_DE R$ 500.000.000,01 A 1 BILHAO DE REAIS",...,setor_INDUSTRIA,setor_SERVIÇO,setor_nan,sg_uf_AM,sg_uf_MA,sg_uf_PI,sg_uf_RN,sg_uf_RO,vl_total_veiculos_leves_grupo,vl_total_veiculos_pesados_grupo
0,21.81,0,0,0,1,0,0,0,0,0,...,0,1,0,0,1,0,0,0,1,1
1,16.39,0,0,1,0,0,0,0,0,0,...,0,1,0,0,0,1,0,0,1,1
2,40.38,0,0,0,0,1,0,0,0,0,...,0,1,0,1,0,0,0,0,0,0
3,43.85,0,0,1,0,0,0,0,0,0,...,0,1,0,0,0,1,0,0,1,1
4,43.87,0,0,0,0,1,0,0,0,0,...,0,1,0,0,0,0,1,0,0,0


In [7]:
# Non-boolean (continuous / count) columns, as listed for the market database.
non_bolean_cols = [
    'idade_empresa_anos',
    'idade_maxima_socios',
    'idade_media_socios',
    'idade_minima_socios',
    'qt_filiais',
    'qt_socios',
    'qt_socios_st_regular',
]

In [8]:
def min_max_col(df_transform, cols):
    '''
    Rescale continuous columns to the [0, 1] range (min-max normalization).

    Input
    df_transform: pandas dataframe holding the features. It is not modified.
    cols: iterable of column labels of continuous features to normalize.
          Each column is scaled on its own (its min maps to 0, its max to 1).

    Output
    New dataframe (a copy of df_transform) with the columns passed normalized.
    '''
    cols = list(cols)
    # Work on a copy so the caller's dataframe is never silently overwritten,
    # which is what the "new dataframe" contract promises.
    df_scaled = df_transform.copy()
    if not cols:
        # Nothing to scale; skip the scaler, which is not meant to be fitted
        # on an input without any feature column.
        return df_scaled
    # MinMaxScaler fits every feature (column) separately, so a single call over
    # all columns gives the same values as scaling them one at a time.
    df_scaled[cols] = MinMaxScaler().fit_transform(df_scaled[cols].to_numpy(dtype=float))
    return df_scaled



# Same rule as applied to the market data: normalization (min-max) rather than
# standardization in iteration 0.
portfolio3 = min_max_col(df_transform=portfolio3, cols=non_bolean_cols)
portfolio3.loc[:, non_bolean_cols]

Unnamed: 0,idade_empresa_anos,idade_maxima_socios,idade_media_socios,idade_minima_socios,qt_filiais,qt_socios,qt_socios_st_regular
0,0.412341,0.453488,0.441860,0.418605,0.011628,0.000000,0.000000
1,0.306170,0.767442,0.767442,0.767442,0.000000,0.007246,0.000000
2,0.776102,0.453488,0.441860,0.418605,0.000000,0.000000,0.000000
3,0.844074,0.453488,0.441860,0.418605,0.000000,0.000000,0.000000
4,0.844466,0.453488,0.441860,0.418605,0.005814,0.000000,0.000000
...,...,...,...,...,...,...,...
260,0.356905,0.558140,0.558140,0.558140,0.000000,0.007246,0.000000
261,0.425661,0.546512,0.372093,0.267442,0.017442,0.021739,0.018519
262,0.234868,0.523256,0.430233,0.337209,0.000000,0.007246,0.009259
263,0.502253,0.767442,0.639535,0.476744,0.029070,0.021739,0.027778


In [10]:
# Pairwise Gower dissimilarities between the rows of portfolio3; show the shape
# next to the values as a quick sanity check (square matrix expected).
dissimilarity_matrix = gower.gower_matrix(portfolio3)
(dissimilarity_matrix.shape, dissimilarity_matrix)

((265, 265),
 array([[0.        , 0.05949069, 0.07867876, ..., 0.07275528, 0.06309787,
         0.06945597],
        [0.05949069, 0.        , 0.09097382, ..., 0.08289523, 0.06874134,
         0.08725666],
        [0.07867876, 0.09097382, 0.        , ..., 0.02188547, 0.04660119,
         0.07754648],
        ...,
        [0.07275528, 0.08289523, 0.02188547, ..., 0.        , 0.06424994,
         0.06783611],
        [0.06309787, 0.06874134, 0.04660119, ..., 0.06424994, 0.        ,
         0.09141041],
        [0.06945597, 0.08725666, 0.07754648, ..., 0.06783611, 0.09141041,
         0.        ]], dtype=float32))

In [11]:
# Starting medoids: fixed, hand-picked row positions (not randomly drawn).
# One cluster is produced per starting medoid.
initial_medoids = [1, 10, 50, 100]

# Build K-Medoids on the precomputed dissimilarity matrix instead of raw points.
kmedoids_instance = kmedoids(
    dissimilarity_matrix,
    initial_medoids,
    data_type='distance_matrix',
)

# Run the cluster analysis, then read back the final medoids and the members
# of each cluster.
kmedoids_instance.process()
medoids = kmedoids_instance.get_medoids()
clusters = kmedoids_instance.get_clusters()

medoids

[66, 224, 16, 160]

In [78]:
# Check which encoding the clustering result uses, so that `clusters` is
# interpreted correctly when it is turned into per-row labels.
kmedoids_instance.get_cluster_encoding()

<type_encoding.CLUSTER_INDEX_LIST_SEPARATION: 1>

In [79]:
# Size of every cluster, in cluster order. Works for any number of clusters
# instead of assuming there are exactly four.
tuple(len(members) for members in clusters)

(33, 43, 53, 136)

In [80]:
# One label per portfolio row, addressed by position; every entry starts at 0.
labels = pd.Series(0, index=range(portfolio3.shape[0]))

# clusters[k] holds the row positions that belong to cluster k, so each group of
# positions is stamped with its cluster number in one assignment.
for cluster_id, members in enumerate(clusters):
    labels.iloc[members] = cluster_id

labels.value_counts()

3    136
2     53
1     43
0     33
dtype: int64

In [81]:
# Mean silhouette coefficient of the K-Medoids partition, computed directly on
# the Gower dissimilarity matrix (metric='precomputed').
# All samples are used (no sub-sampling), so the result is deterministic and no
# random seed is needed.
silhouette_score(dissimilarity_matrix, labels, metric='precomputed')

0.23027454