In [1]:
!pip install plotly --upgrade

Collecting plotly
  Downloading plotly-5.3.1-py2.py3-none-any.whl (23.9 MB)
[K     |████████████████████████████████| 23.9 MB 13 kB/s 
Collecting tenacity>=6.2.0
  Downloading tenacity-8.0.1-py3-none-any.whl (24 kB)
Installing collected packages: tenacity, plotly
  Attempting uninstall: plotly
    Found existing installation: plotly 4.4.1
    Uninstalling plotly-4.4.1:
      Successfully uninstalled plotly-4.4.1
Successfully installed plotly-5.3.1 tenacity-8.0.1


In [2]:
import plotly.express as px
import plotly.graph_objects as go
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

# K-Means

## Base idade e salário

In [9]:
from sklearn.cluster import KMeans

In [10]:
x = [20, 27, 21, 37, 46, 53, 55, 47, 52, 32, 39, 41, 39, 48, 48]
y = [1000, 1200, 2900, 1850, 900, 950, 2000, 2100, 3000, 5900, 4800, 5100, 7000, 5000, 6500]

In [11]:
grafico = px.scatter(x = x, y = y)
grafico.show()

In [12]:
base_salario = np.array([[x[i] , y[i]] for i in range(len(x))])
base_salario

array([[  20, 1000],
       [  27, 1200],
       [  21, 2900],
       [  37, 1850],
       [  46,  900],
       [  53,  950],
       [  55, 2000],
       [  47, 2100],
       [  52, 3000],
       [  32, 5900],
       [  39, 4800],
       [  41, 5100],
       [  39, 7000],
       [  48, 5000],
       [  48, 6500]])

In [13]:
scaler_salario = StandardScaler()
base_salario = scaler_salario.fit_transform(base_salario)

In [14]:
kmeans_salario = KMeans(n_clusters= 3)
kmeans_salario.fit(base_salario)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
       n_clusters=3, n_init=10, n_jobs=None, precompute_distances='auto',
       random_state=None, tol=0.0001, verbose=0)

In [15]:
centroides = kmeans_salario.cluster_centers_
centroides

array([[-1.63312883, -0.78799405],
       [ 0.73953003, -0.74014016],
       [ 0.07703438,  1.13413719]])

In [16]:
scaler_salario.inverse_transform(kmeans_salario.cluster_centers_)

array([[  22.66666667, 1700.        ],
       [  48.33333333, 1800.        ],
       [  41.16666667, 5716.66666667]])

In [17]:
rotulos = kmeans_salario.labels_
rotulos

array([0, 0, 0, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2], dtype=int32)

In [18]:
grafico1 = px.scatter(x = base_salario[:, 0], y = base_salario[:, 1], color=rotulos)
grafico2 = px.scatter(x = centroides[:, 0], y = centroides[:, 1], size = [12, 12, 12])
grafico3 = go.Figure(data = grafico1.data + grafico2.data)
grafico3.show()

## Dados Randômicos

In [19]:
from sklearn.datasets import make_blobs

In [20]:
x_random, y_random = make_blobs(n_samples=2000, centers=5, random_state=0)

In [21]:
grafico = px.scatter(x = x_random[:, 0], y = x_random[: , 1])
grafico.show()

In [22]:
kmeans_blobs = KMeans(n_clusters=5)
kmeans_blobs.fit(x_random)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
       n_clusters=5, n_init=10, n_jobs=None, precompute_distances='auto',
       random_state=None, tol=0.0001, verbose=0)

In [23]:
rotulos = kmeans_blobs.predict(x_random)

In [24]:
centroides = kmeans_blobs.cluster_centers_

In [25]:
grafico1 = px.scatter(x = x_random[:, 0], y = x_random[:, 1], color = rotulos)
grafico2 = px.scatter(x = centroides[:, 0], y = centroides[:, 1], size = [5, 5 , 5, 5 ,5 ])
grafico3 = go.Figure(data = grafico1.data + grafico2.data)
grafico3.show()

## Base cartão de crédito

In [26]:
import pandas as pd
base_cartao = pd.read_csv("credit_card_clients.csv", header = 1)

In [27]:
base_cartao.columns

Index(['ID', 'LIMIT_BAL', 'SEX', 'EDUCATION', 'MARRIAGE', 'AGE', 'PAY_0',
       'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6', 'BILL_AMT1', 'BILL_AMT2',
       'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6', 'PAY_AMT1',
       'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6',
       'default payment next month'],
      dtype='object')

In [28]:
base_cartao["BILL_TOTAL"] = base_cartao["BILL_AMT1"] + base_cartao["BILL_AMT2"] + base_cartao["BILL_AMT3"] + base_cartao["BILL_AMT4"] +  base_cartao["BILL_AMT5"] + base_cartao["BILL_AMT6"]

In [29]:
x_cartao = base_cartao.iloc[:, [1,25]].values

In [30]:
scaler_cartao = StandardScaler()
x_cartao = scaler_cartao.fit_transform(x_cartao)

In [31]:
wcss = []
for i in range(1, 11):
  kmeans_cartao = KMeans(n_clusters=i, random_state=0)
  kmeans_cartao.fit(x_cartao)
  wcss.append(kmeans_cartao.inertia_)

In [32]:
grafico = px.line(x = range(1,11), y = wcss)
grafico.show()

In [33]:
kmeans_cartao = KMeans(n_clusters=4, random_state=0)
rotulos = kmeans_cartao.fit_predict(x_cartao)

In [34]:
grafico1 = px.scatter(x = x_cartao[:, 0], y = x_cartao[:, 1], color = rotulos)
grafico1.show()

In [35]:
lista_clientes = np.column_stack((base_cartao, rotulos))
lista_clientes = lista_clientes[lista_clientes[:, 26].argsort()]

## Base cartão com mais atributos

In [37]:
x_cartao_mais = base_cartao.iloc[:, [1,2,3,4,5,25]].values

In [38]:
scaler_cartao_mais = StandardScaler()
x_cartao_mais = scaler_cartao_mais.fit_transform(x_cartao_mais)

In [41]:
wcss = []
for i in range(1, 11):
  kmeans_cartao_mais = KMeans(n_clusters=i, random_state=0)
  kmeans_cartao_mais.fit(x_cartao_mais)
  wcss.append(kmeans_cartao_mais.inertia_)

In [43]:
grafico = px.line(x = range(1,11), y = wcss)
grafico.show()

In [44]:
kmeans_cartao_mais = KMeans(n_clusters=4, random_state=0)
rotulos = kmeans_cartao_mais.fit_predict(x_cartao_mais)

In [45]:
from sklearn.decomposition import PCA

In [46]:
pca = PCA(n_components=2)
x_cartao_mais_pca = pca.fit_transform(x_cartao_mais)

In [47]:
x_cartao_mais_pca.shape

(30000, 2)

In [48]:
grafico1 = px.scatter(x = x_cartao_mais_pca[:, 0], y = x_cartao_mais_pca[:, 1], color = rotulos)
grafico1.show()

In [49]:
lista_clientes = np.column_stack((base_cartao, rotulos))
lista_clientes = lista_clientes[lista_clientes[:, 26].argsort()]