<a href="https://colab.research.google.com/github/Pedro-Boechat/periodo1trainee/blob/main/Per%C3%ADodo2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

In [2]:
df = pd.read_csv('/content/healthcare-dataset-stroke-data.csv')

# Tratando nan

In [3]:
df.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [4]:
df.isna().sum()

id                     0
gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
work_type              0
Residence_type         0
avg_glucose_level      0
bmi                  201
smoking_status         0
stroke                 0
dtype: int64

In [5]:
# Preencher os 201 nan pela média do bmi

media = df['bmi'].mean()
df = df.fillna(media)

# One hot encoding 

Para fazer com que todos os dados sejam numéricos, usarei one hot encoding

In [6]:
df = df.drop(columns='id')
df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,28.893237,never smoked,1
2,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [7]:
df.value_counts('gender')

gender
Female    2994
Male      2115
Other        1
dtype: int64

In [8]:
df = df[df['gender'] != 'Other']
df.value_counts('gender')

# Como só há 1 other, vou retirar

gender
Female    2994
Male      2115
dtype: int64

In [9]:
# Onde os dados são binários, vou transformá-los da seguinte forma:

df['gender'] = df['gender'].replace({'Male':0,'Female':1}).astype(np.uint8)
df['ever_married'] = df['ever_married'].replace({'No':0,'Yes':1}).astype(np.uint8)
df['Residence_type'] = df['Residence_type'].replace({'Rural':0,'Urban':1}).astype(np.uint8)

In [10]:
# One hot encoding nas outras colunas que será necessário transformar os dados

dummies = pd.get_dummies(df.work_type)
df = pd.concat([df, dummies], axis=1)
df = df.drop(columns='work_type')

In [11]:
dummies = pd.get_dummies(df.smoking_status)
df = pd.concat([df, dummies], axis=1)
df = df.drop(columns='smoking_status')

In [12]:
# Normalizar os dados

df['age'] = df['age']/df['age'].max()
df['avg_glucose_level'] = df['avg_glucose_level']/df['avg_glucose_level'].max()
df['bmi'] = df['bmi']/df['bmi'].max()

In [13]:
df.head()


Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,Residence_type,avg_glucose_level,bmi,stroke,Govt_job,Never_worked,Private,Self-employed,children,Unknown,formerly smoked,never smoked,smokes
0,0,0.817073,0,1,1,1,0.841577,0.375,1,0,0,1,0,0,0,1,0,0
1,1,0.743902,0,0,1,0,0.74413,0.296037,1,0,0,0,1,0,0,0,1,0
2,0,0.97561,0,1,1,0,0.389784,0.332992,1,0,0,1,0,0,0,0,1,0
3,1,0.597561,0,0,1,1,0.630124,0.352459,1,0,0,1,0,0,0,0,0,1
4,1,0.963415,1,0,1,0,0.64076,0.245902,1,0,0,0,1,0,0,0,1,0


# Balancear os dados

In [14]:
# Perceber que os dados estão desbalanceados: há inúmeras amostras sem enfarto, e pouquíssimas enfartadas.
df['stroke'].value_counts()

0    4860
1     249
Name: stroke, dtype: int64

In [15]:
from sklearn.utils import resample
# Separar a maioria e minoria
df_majority = df[df['stroke']==0]
df_minority = df[df['stroke']==1]
# Downsample
df_majority_downsampled = resample(df_majority, 
                                 replace=False,    
                                 n_samples=249)
# Combinar a maioria downsampled com a minoria
df_balanc = pd.concat([df_majority_downsampled, df_minority])

In [16]:
df_balanc['stroke'].value_counts()

1    249
0    249
Name: stroke, dtype: int64

# KMeans Clustering

Silhueta refere-se a quanto uma amostra é semelhante às restantes de seu cluster

In [17]:
from sklearn.metrics import silhouette_samples, silhouette_score
k = 1
while k < 6:
  k += 1
  kmeans = KMeans(n_clusters=k)
  pred = kmeans.fit_predict(df_balanc)
  print(silhouette_score(df_balanc, kmeans.labels_))
# Procedimento realizado para verificar qual o melhor número de clusters para o dataset.

# Esse resultado sugere que um bom número de clusters é 4.

0.12640212053407288
0.15616743409119047
0.15823547815480993
0.19084936233482236
0.21840995012275474


In [18]:
kmeans = KMeans(n_clusters=4)
pred = kmeans.fit_predict(df_balanc)

In [19]:
# Verificar os centros de cada cluster
kmeans.cluster_centers_

array([[ 4.82142857e-01,  7.63501742e-01,  2.05357143e-01,
         1.42857143e-01,  9.10714286e-01,  4.91071429e-01,
         4.53850319e-01,  3.11576526e-01,  6.25000000e-01,
         1.16071429e-01,  3.46944695e-18,  6.16071429e-01,
         2.67857143e-01, -4.16333634e-17,  3.33066907e-16,
         1.00000000e+00,  1.66533454e-16, -1.11022302e-16],
       [ 5.00000000e-01,  5.17580645e-01,  5.64516129e-02,
         6.45161290e-02,  5.56451613e-01,  5.96774194e-01,
         3.92851801e-01,  2.67258684e-01,  3.79032258e-01,
         9.67741935e-02,  8.06451613e-03,  4.59677419e-01,
         1.45161290e-01,  2.90322581e-01,  1.00000000e+00,
         2.49800181e-16,  5.55111512e-17, -1.38777878e-16],
       [ 6.72222222e-01,  6.73915989e-01,  2.22222222e-01,
         7.77777778e-02,  7.27777778e-01,  5.16666667e-01,
         4.36050391e-01,  3.09634145e-01,  5.00000000e-01,
         1.33333333e-01,  5.55555556e-03,  6.11111111e-01,
         2.33333333e-01,  1.66666667e-02,  3.60822483e

Há outros métodos para clustering. Neste caso utilizei o KMeans, mas há também o Kmedoids.
A grande diferença é que o método KMeans é bastante afetado pela presença de outliers, enquanto a Kmedoids não. Entretanto, a Kmedoids tem um custo computacional maior.

# PCA

In [20]:
pca = PCA(.95)

# Definindo 95% de variância para nosso novo dataframe

In [21]:
pca.fit(df)

PCA(n_components=0.95)

In [22]:
pca.n_components_

# Houve redução de 18 colunas para 11 colunas

11

In [23]:
df = pca.transform(df)

In [24]:
dpca = pd.DataFrame(df)

# Transformando o array em dataframe para visualizarmos melhor

In [25]:
dpca

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,-0.544678,0.105910,-1.096520,-0.241259,0.471201,-0.522699,0.404622,-0.074248,0.239738,0.095207,1.229125
1,-0.554662,0.799884,0.811660,0.452444,-0.274260,-0.167084,-0.217475,0.532170,-0.053459,0.062539,0.489178
2,-0.821494,-0.255739,0.050655,0.655131,0.649316,-0.196342,-0.311269,0.015787,0.207551,0.140143,1.251209
3,-0.517941,-0.218162,-0.572670,-0.620920,-0.439699,0.668494,-0.180272,0.166137,0.085898,-0.129641,0.515558
4,-0.699096,0.927515,0.763793,0.485685,-0.258663,-0.212181,-0.289652,0.613094,0.875351,-0.014085,0.202668
...,...,...,...,...,...,...,...,...,...,...,...
5104,-0.934358,-0.205513,0.352190,-0.562457,-0.006164,-0.239672,-0.252059,0.028328,0.811510,-0.221320,-0.243347
5105,-0.571947,0.901192,0.707742,-0.505154,-0.050211,-0.202503,-0.231286,0.511498,-0.238813,0.039792,-0.001032
5106,-0.417716,0.690005,0.893987,0.436051,-0.269446,-0.093672,-0.139488,0.505364,-0.334137,-0.078001,-0.119793
5107,-0.408053,-0.130330,-0.863180,0.685846,0.207921,-0.399730,0.498011,-0.138280,-0.178833,-0.147095,-0.045458
