# Problema de clusterização

Este notebook aplica o algoritmo **K-means** à base de dados **wine.csv**.

## **Importação de bibliotecas e funções e carregamento do dataset**

In [2]:
import pandas as pd 
import numpy as np
from matplotlib import pyplot as plt

# importando função para separar dados de treino e teste
from sklearn.model_selection import train_test_split

# importando métricas de avaliação
from sklearn import metrics

In [3]:
# Read the HOME setting through decouple's config(); per the original note,
# the value comes from the environment variables.
from decouple import config
home_dir = config('HOME')

# Build the dataset location under the home directory and load it into a DataFrame
wine_csv_path = home_dir + '/Documentos/Code/ML-IGTI/datasets/wine.csv'
ds_wine = pd.read_csv(wine_csv_path)

## **Conhecendo o dataframe**

In [3]:
# Clustering dataset: preview the first ten rows
ds_wine.head(n=10)

Unnamed: 0,class,Alcohol,Malic_acid,Ash,Alcalinity_of_ash,Magnesium,Total_phenols,Flavanoids,Nonflavanoid_phenols,Proanthocyanins,Color_intensity,Hue,OD280%2FOD315_of_diluted_wines,Proline
0,1,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050
2,1,13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185
3,1,14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480
4,1,13.24,2.59,2.87,21.0,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735
5,1,14.2,1.76,2.45,15.2,112,3.27,3.39,0.34,1.97,6.75,1.05,2.85,1450
6,1,14.39,1.87,2.45,14.6,96,2.5,2.52,0.3,1.98,5.25,1.02,3.58,1290
7,1,14.06,2.15,2.61,17.6,121,2.6,2.51,0.31,1.25,5.05,1.06,3.58,1295
8,1,14.83,1.64,2.17,14.0,97,2.8,2.98,0.29,1.98,5.2,1.08,2.85,1045
9,1,13.86,1.35,2.27,16.0,98,2.98,3.15,0.22,1.85,7.22,1.01,3.55,1045


## **Quantidade de instâncias e features do dataset**

In [4]:
# Clustering dataset: print its dimensions as (rows, columns).
# Note: the column count still includes the 'class' label column, so the
# number of actual features is one less than the second value.
print('Amostras e Features', ds_wine.shape)

Amostras e Features (178, 14)


## **Verificação da existência de dados faltantes**

### *Dataset: wine.csv*

In [5]:
# Dataset structure: column names, non-null counts and dtypes
ds_wine.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 178 entries, 0 to 177
Data columns (total 14 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   class                           178 non-null    int64  
 1   Alcohol                         178 non-null    float64
 2   Malic_acid                      178 non-null    float64
 3   Ash                             178 non-null    float64
 4   Alcalinity_of_ash               178 non-null    float64
 5   Magnesium                       178 non-null    int64  
 6   Total_phenols                   178 non-null    float64
 7   Flavanoids                      178 non-null    float64
 8   Nonflavanoid_phenols            178 non-null    float64
 9   Proanthocyanins                 178 non-null    float64
 10  Color_intensity                 178 non-null    float64
 11  Hue                             178 non-null    float64
 12  OD280%2FOD315_of_diluted_wines  178 

In [6]:
# Summary statistics per column (count, mean, std, min, quartiles, max)
ds_wine.describe()

Unnamed: 0,class,Alcohol,Malic_acid,Ash,Alcalinity_of_ash,Magnesium,Total_phenols,Flavanoids,Nonflavanoid_phenols,Proanthocyanins,Color_intensity,Hue,OD280%2FOD315_of_diluted_wines,Proline
count,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0
mean,1.938202,13.000618,2.336348,2.366517,19.494944,99.741573,2.295112,2.02927,0.361854,1.590899,5.05809,0.957449,2.611685,746.893258
std,0.775035,0.811827,1.117146,0.274344,3.339564,14.282484,0.625851,0.998859,0.124453,0.572359,2.318286,0.228572,0.70999,314.907474
min,1.0,11.03,0.74,1.36,10.6,70.0,0.98,0.34,0.13,0.41,1.28,0.48,1.27,278.0
25%,1.0,12.3625,1.6025,2.21,17.2,88.0,1.7425,1.205,0.27,1.25,3.22,0.7825,1.9375,500.5
50%,2.0,13.05,1.865,2.36,19.5,98.0,2.355,2.135,0.34,1.555,4.69,0.965,2.78,673.5
75%,3.0,13.6775,3.0825,2.5575,21.5,107.0,2.8,2.875,0.4375,1.95,6.2,1.12,3.17,985.0
max,3.0,14.83,5.8,3.23,30.0,162.0,3.88,5.08,0.66,3.58,13.0,1.71,4.0,1680.0


In [7]:
# Number of missing values in each column
ds_wine.isna().sum()

class                             0
Alcohol                           0
Malic_acid                        0
Ash                               0
Alcalinity_of_ash                 0
Magnesium                         0
Total_phenols                     0
Flavanoids                        0
Nonflavanoid_phenols              0
Proanthocyanins                   0
Color_intensity                   0
Hue                               0
OD280%2FOD315_of_diluted_wines    0
Proline                           0
dtype: int64

## Clusterização

In [8]:
# Mapping from the original class labels (1, 2, 3) to zero-based codes (0, 1, 2)
classes = {1: 0, 2: 1, 3: 2}

# Series.map turns every value that is not a key of `classes` into NaN, so
# blindly re-running this cell corrupted the column (0 -> NaN, 1 -> 0, 2 -> 1).
# Encode into a temporary Series first and only commit a complete mapping.
class_encoded = ds_wine['class'].map(classes)
if class_encoded.notna().all():
    # Every value was an original label: replace the column with the codes
    ds_wine['class'] = class_encoded
elif not ds_wine['class'].isin(list(classes.values())).all():
    # Neither the original labels nor the already-encoded ones: fail loudly
    # instead of silently writing NaN into the label column.
    raise ValueError(
        "Unexpected values in 'class'; expected labels %s" % sorted(classes)
    )
# Otherwise the column is already encoded (cell re-run) and is left untouched.

# Check the result
ds_wine.head(10)

Unnamed: 0,class,Alcohol,Malic_acid,Ash,Alcalinity_of_ash,Magnesium,Total_phenols,Flavanoids,Nonflavanoid_phenols,Proanthocyanins,Color_intensity,Hue,OD280%2FOD315_of_diluted_wines,Proline
0,0,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,0,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050
2,0,13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185
3,0,14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480
4,0,13.24,2.59,2.87,21.0,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735
5,0,14.2,1.76,2.45,15.2,112,3.27,3.39,0.34,1.97,6.75,1.05,2.85,1450
6,0,14.39,1.87,2.45,14.6,96,2.5,2.52,0.3,1.98,5.25,1.02,3.58,1290
7,0,14.06,2.15,2.61,17.6,121,2.6,2.51,0.31,1.25,5.05,1.06,3.58,1295
8,0,14.83,1.64,2.17,14.0,97,2.8,2.98,0.29,1.98,5.2,1.08,2.85,1045
9,0,13.86,1.35,2.27,16.0,98,2.98,3.15,0.22,1.85,7.22,1.01,3.55,1045


In [9]:
# Separate labels from features, then build the train and test sets.
# The guard keeps this cell re-runnable: after the first execution `ds_wine`
# no longer has a 'class' column, so the unguarded version raised KeyError
# when the cell was executed a second time. On a re-run, `y` and the
# feature-only `ds_wine` from the first execution are reused.
if 'class' in ds_wine.columns:
    y = ds_wine['class'].values
    # From here on `ds_wine` holds only the feature columns
    ds_wine = ds_wine.drop('class', axis=1)
X = ds_wine.values

# 75% / 25% split; random_state fixed so the split is reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# First line reports the training split; the second line (despite its label)
# reports the test split.
print('Formato dos dados:', X_train.shape, y_train.shape)
print('Formato dos dados de normalizacao:', X_test.shape, y_test.shape)

Formato dos dados: (133, 13) (133,)
Formato dos dados de normalizacao: (45, 13) (45,)


In [10]:
# importando o modelo K-means
from sklearn.cluster import KMeans

# importando as métricas que serão usadas para avaliação
from sklearn.metrics import silhouette_score, davies_bouldin_score, mutual_info_score

In [11]:
# Configure K-means with 3 clusters (one per wine class in the dataset).
# n_init is pinned explicitly: its default changed across scikit-learn
# releases (10 -> 'auto'), so leaving it unset makes the number of restarts,
# and therefore the fitted clusters, depend on the installed version.
# 10 is the historical default; random_state keeps the runs reproducible.
clustering = KMeans(n_clusters=3, n_init=10, random_state=42)

# Fit the model on the training features only (labels are not used)
clustering.fit(X_train)

In [12]:
# Assign each test sample to a cluster of the fitted model.
# The returned values are cluster indices, not the encoded class labels, so
# they are not expected to match `y_test` value-for-value.
y_pred = clustering.predict(X_test)

In [13]:
# Side-by-side table of the true labels and the assigned clusters (test set)
p = pd.DataFrame(
    data={
        'Real': y_test,
        'Previsto': y_pred,
    }
)
p.head(n=10)

Unnamed: 0,Real,Previsto
0,0,2
1,0,0
2,2,1
3,0,0
4,1,1
5,0,0
6,1,1
7,2,2
8,1,2
9,2,2


In [14]:
# Evaluate the clustering on the test set.

# Metrics computed from the test features and the assigned clusters
Silhueta = silhouette_score(X_test, y_pred)
DaviesBouldin = davies_bouldin_score(X_test, y_pred)

# Metric computed from the true labels and the assigned clusters
MutualInfo = mutual_info_score(y_test, y_pred)

# Report the three scores in the same order and with the same labels
for metric_label, metric_value in (
    ("Coeficiente Silhueta:", Silhueta),
    ("Davies-Bouldin Score:", DaviesBouldin),
    ("Mutual Information:", MutualInfo),
):
    print(metric_label, metric_value)

Coeficiente Silhueta: 0.5519241838976597
Davies-Bouldin Score: 0.5415115100039742
