In [34]:
# OBJETIVO DO NOTEBOOK
# 1º Normalizar dados númericos e categóricos
# 2º Caslibrar o número de grupos para o cluster (Otimização)
# 3º Treinar o modelo de cluster 
# 4º Descrever os grupos de modelo de cluster (Centróides)
# 5º Definir a qual grupo uma nova instância pertence

###########################################
# TRABALHO DE NORMALIZAÇÃO
# Data Since: 07/04/2023
# @Autor: Santian

!python.exe -m pip install --upgrade pip
!pip install pandas==1.5.2
!pip install matplotlib
!pip install scikit-learn



In [35]:
# Imports
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from scipy.spatial.distance import cdist
from matplotlib import pyplot as plt
from sklearn.cluster import KMeans

ABRE O ARQUIVO E REALIZA A LEITURA DO MESMO

In [36]:
# LE O ARQUIVO
dataframe = pd.read_csv('codon_usage.csv\codon_usage.csv', sep=',')
# IMPRIME AS 5 PRIMEIRAS LINHAS
dataframe

  dataframe = pd.read_csv('codon_usage.csv\codon_usage.csv', sep=',')


Unnamed: 0,Kingdom,DNAtype,SpeciesID,Ncodons,SpeciesName,UUU,UUC,UUA,UUG,CUU,...,CGG,AGA,AGG,GAU,GAC,GAA,GAG,UAA,UAG,UGA
0,vrl,0,100217,1995,Epizootic haematopoietic necrosis virus,0.01654,0.01203,0.00050,0.00351,0.01203,...,0.00451,0.01303,0.03559,0.01003,0.04612,0.01203,0.04361,0.00251,0.00050,0.00000
1,vrl,0,100220,1474,Bohle iridovirus,0.02714,0.01357,0.00068,0.00678,0.00407,...,0.00136,0.01696,0.03596,0.01221,0.04545,0.01560,0.04410,0.00271,0.00068,0.00000
2,vrl,0,100755,4862,Sweet potato leaf curl virus,0.01974,0.0218,0.01357,0.01543,0.00782,...,0.00596,0.01974,0.02489,0.03126,0.02036,0.02242,0.02468,0.00391,0.00000,0.00144
3,vrl,0,100880,1915,Northern cereal mosaic virus,0.01775,0.02245,0.01619,0.00992,0.01567,...,0.00366,0.01410,0.01671,0.03760,0.01932,0.03029,0.03446,0.00261,0.00157,0.00000
4,vrl,0,100887,22831,Soil-borne cereal mosaic virus,0.02816,0.01371,0.00767,0.03679,0.01380,...,0.00604,0.01494,0.01734,0.04148,0.02483,0.03359,0.03679,0.00000,0.00044,0.00131
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13023,pri,0,9601,1097,Pongo pygmaeus abelii,0.02552,0.03555,0.00547,0.01367,0.01276,...,0.00820,0.01367,0.01094,0.01367,0.02279,0.02005,0.04102,0.00091,0.00091,0.00638
13024,pri,1,9601,2067,mitochondrion Pongo pygmaeus abelii,0.01258,0.03193,0.01984,0.00629,0.01451,...,0.00145,0.00000,0.00048,0.00194,0.01306,0.01838,0.00677,0.00242,0.00097,0.01887
13025,pri,1,9602,1686,mitochondrion Pongo pygmaeus pygmaeus,0.01423,0.03321,0.01661,0.00356,0.01127,...,0.00000,0.00000,0.00000,0.00178,0.01661,0.02788,0.00297,0.00356,0.00119,0.02017
13026,pri,0,9606,40662582,Homo sapiens,0.01757,0.02028,0.00767,0.01293,0.01319,...,0.01142,0.01217,0.01196,0.02178,0.02510,0.02896,0.03959,0.00099,0.00079,0.00156


In [37]:
# Verifica quais colunas são categóricas
colunas_categoricas = [coluna for coluna in dataframe.columns if dataframe[coluna].dtype == 'object']

colunas_categoricas

['Kingdom', 'SpeciesName', 'UUU', 'UUC']

In [38]:
# preenche valores faltantes com a média para colunas numéricas
df_num = dataframe.select_dtypes(include=['float64', 'int64'])
df_num = df_num.fillna(df_num.mean())

# preenche valores faltantes com a moda para colunas categóricas
df_cat = dataframe.select_dtypes(include=['object'])
df_cat = df_cat.fillna(df_cat.mode().iloc[0])

df_num

Unnamed: 0,DNAtype,SpeciesID,Ncodons,UUA,UUG,CUU,CUC,CUA,CUG,AUU,...,CGG,AGA,AGG,GAU,GAC,GAA,GAG,UAA,UAG,UGA
0,0,100217,1995,0.00050,0.00351,0.01203,0.03208,0.00100,0.04010,0.00551,...,0.00451,0.01303,0.03559,0.01003,0.04612,0.01203,0.04361,0.00251,0.00050,0.00000
1,0,100220,1474,0.00068,0.00678,0.00407,0.02849,0.00204,0.04410,0.01153,...,0.00136,0.01696,0.03596,0.01221,0.04545,0.01560,0.04410,0.00271,0.00068,0.00000
2,0,100755,4862,0.01357,0.01543,0.00782,0.01111,0.01028,0.01193,0.02283,...,0.00596,0.01974,0.02489,0.03126,0.02036,0.02242,0.02468,0.00391,0.00000,0.00144
3,0,100880,1915,0.01619,0.00992,0.01567,0.01358,0.00940,0.01723,0.02402,...,0.00366,0.01410,0.01671,0.03760,0.01932,0.03029,0.03446,0.00261,0.00157,0.00000
4,0,100887,22831,0.00767,0.03679,0.01380,0.00548,0.00473,0.02076,0.02716,...,0.00604,0.01494,0.01734,0.04148,0.02483,0.03359,0.03679,0.00000,0.00044,0.00131
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13023,0,9601,1097,0.00547,0.01367,0.01276,0.02097,0.00820,0.03555,0.01459,...,0.00820,0.01367,0.01094,0.01367,0.02279,0.02005,0.04102,0.00091,0.00091,0.00638
13024,1,9601,2067,0.01984,0.00629,0.01451,0.05322,0.07644,0.01258,0.03096,...,0.00145,0.00000,0.00048,0.00194,0.01306,0.01838,0.00677,0.00242,0.00097,0.01887
13025,1,9602,1686,0.01661,0.00356,0.01127,0.05042,0.09609,0.01068,0.02728,...,0.00000,0.00000,0.00000,0.00178,0.01661,0.02788,0.00297,0.00356,0.00119,0.02017
13026,0,9606,40662582,0.00767,0.01293,0.01319,0.01959,0.00715,0.03964,0.01600,...,0.01142,0.01217,0.01196,0.02178,0.02510,0.02896,0.03959,0.00099,0.00079,0.00156


In [39]:
# cria um objeto StandardScaler para normalizar os dados numéricos
scaler = StandardScaler()

# normaliza os dados numéricos
df_numerico = dataframe.select_dtypes(include=['float64','int64','float','int'])
df_numerico = pd.DataFrame(scaler.fit_transform(df_numerico), columns=df_numerico.columns)

# normaliza os dados categóricos
df_cat = dataframe[['Kingdom', 'SpeciesName']]
df_cat = pd.get_dummies(df_cat)

# concatena as duas partes do dataset
df_final = pd.concat([df_numerico, df_cat], axis=1)

# imprime o dataset final
df_final

Unnamed: 0,DNAtype,SpeciesID,Ncodons,UUA,UUG,CUU,CUC,CUA,CUG,AUU,...,SpeciesName_plastid Phaseolus vulgaris,SpeciesName_plastid Porphyra yezoensis,SpeciesName_plastid Toxoplasma gondii,SpeciesName_plastid Triticum aestivum,SpeciesName_plastid Wajira grahamiana,SpeciesName_plastid Wajira praecox,SpeciesName_plastid Zea mays,SpeciesName_secondary endosymbiont of Bemisia tabaci,SpeciesName_x Doritaenopsis sp.,SpeciesName_x Tritordeum sp.
0,-0.533192,-0.242295,-0.107842,-0.972411,-1.141708,-0.546975,0.946505,-0.744101,1.306003,-1.304748,...,0,0,0,0,0,0,0,0,0,0
1,-0.533192,-0.242271,-0.108566,-0.963719,-0.789316,-1.298923,0.700139,-0.701214,1.547300,-0.960875,...,0,0,0,0,0,0,0,0,0,0
2,-0.533192,-0.237983,-0.103858,-0.341257,0.142853,-0.944676,-0.492577,-0.361413,-0.393334,-0.315399,...,0,0,0,0,0,0,0,0,0,0
3,-0.533192,-0.236982,-0.107953,-0.214737,-0.450934,-0.203120,-0.323072,-0.397703,-0.073615,-0.247425,...,0,0,0,0,0,0,0,0,0,0
4,-0.533192,-0.236925,-0.078890,-0.626170,2.444716,-0.379771,-0.878940,-0.590284,0.139330,-0.068062,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13023,-0.533192,-0.968488,-0.109089,-0.732408,-0.046814,-0.478015,0.184073,-0.447188,1.031527,-0.786082,...,0,0,0,0,0,0,0,0,0,0
13024,0.918820,-0.968488,-0.107742,-0.038477,-0.842121,-0.312700,2.397254,2.366889,-0.354123,0.149000,...,0,0,0,0,0,0,0,0,0,0
13025,0.918820,-0.968480,-0.108271,-0.194455,-1.136320,-0.618769,2.205102,3.177214,-0.468739,-0.061208,...,0,0,0,0,0,0,0,0,0,0
13026,-0.533192,-0.968448,56.390828,-0.626170,-0.126560,-0.437395,0.089369,-0.490488,1.278254,-0.705541,...,0,0,0,0,0,0,0,0,0,0


In [40]:
distorcoes = []
K = range(1,101)
for k in K: 
  modelo = KMeans(n_clusters=k,random_state=42,n_init='auto').fit(df_final)
  distorcoes.append(sum(np.min(cdist(df_final, modelo.cluster_centers_, 'euclidean'), axis=1)) / df_final.shape[0])
  
plt.plot(distorcoes)

KeyboardInterrupt: 