In [None]:
#importando livrarias para o tratamento dos dados
import pandas as pd
import pickle as pkl
import numpy as np
import math
from google.colab import  drive
drive.mount('/drive')

#livraria utilizada para balancear o dataframe (tornar a proporcao de stroke 1 e 0 praticamente 1:1)
from imblearn.over_sampling import SMOTE

#livraria para divisao dos dados em treino e teste (escolheremos 80:20)
from sklearn.model_selection import train_test_split

#importando livrarias para a rede neural em si 
#(tecnica escolhida = MLPClassifier documentacao em https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPClassifier.html#sklearn.neural_network.MLPClassifier)
from sklearn.neural_network import MLPClassifier
from sklearn.neural_network import MLPRegressor

Mounted at /drive


In [None]:
#@title Tratamento dos dados
df = pd.read_csv('/drive/MyDrive/RedeNeural/healthcare-dataset-stroke-data.csv')
print('Proporcao antes: %.4f%%' %( 100 * df['stroke'].value_counts()[1] / (df['stroke'].value_counts()[0] + df['stroke'].value_counts()[1]) )) 
#print no inicio e no final para checar se houve grande mudanca no ratio de 1 e 0 na coluna stroke antes e apos o excluimento de algumas linhas

#tirando a coluna "Ever married" (pelas pesquisas sobre motivos de avc nao parece ter uma influencia 
#muito direta, provavelmente atrapalharia a rede) alem de tirar coluna 'id' por ser inutil tendo em vista a presenca da coluna index
#tirado tambem a linha 'Other' por existir apenas um no dataframe
df.drop('ever_married', inplace=True, axis=1)
df.drop('id', inplace=True, axis=1)
df.drop(df[df['gender'] == 'Other'].index, inplace = True)


#tirar rows (bmi - NaN) ; (smoking_status - Unknown) -- pois estas informacoes apenas desregulariam a rede pois sao informacoes desconhecidas
df = df[df.smoking_status != "Unknown"]
df = df.dropna(axis=0)
print('Porcentagem depois: %.4f%%' %( 100 * df['stroke'].value_counts()[1] / (df['stroke'].value_counts()[0]+ df['stroke'].value_counts()[1]) ))

#reorganizando as linhas do dataframe para evitar qualquer tipo de logica que a rede possa perceber 
#que foi utilizada na formulação do dataset alem de nao "viciar" a rede em um resultado especifico
df = df.reindex(np.random.permutation(df.index))

#checando quantidades e tipos em cada coluna
#print(df['gender'].value_counts())
#print(df['work_type'].value_counts())
#print(df['Residence_type'].value_counts())
#print(df['smoking_status'].value_counts())

#mudando todos os valores de string para inteiros para a leitura da rede
df.gender.replace({'Male':0,'Female':1},inplace=True)
df.work_type.replace({'Never_worked':0,'children':1,'Self-employed':2,'Private':3,'Govt_job':4},inplace=True)
df.Residence_type.replace({'Rural':0,'Urban':1},inplace=True)
df.smoking_status.replace({'never smoked':0,'formerly smoked':1,'smokes':2},inplace=True)

#Como foi visto no comeco, a proporcao entre pessoas que tiveram AVC e que nao tiveram é absurdamente grande 
#(apenas pouco mais de 5% de todos os casos do dataframe houve AVC). Desta forma é necessario manipular os 
#dados para deixá-los mais balanceados
oversample = SMOTE(random_state = 25) #um valor qualquer foi setado como random_state para obtermos sempre o mesmo "shuffle" nas variaveis
x = df.drop(['stroke'],axis=1) # x = todas as colunas do df sem contar 'stroke'
y = df[['stroke']] #y = apenas a coluna stroke
x,y= oversample.fit_resample(x,y['stroke'].values.ravel()) #comando usado para de fato criar os novos casos de 'stroke' (x = todas as colunas sem ser stroke; y = coluna stroke)
y = pd.DataFrame({'stroke':y}) #criando um dataframe com apenas a coluna y
df.to_csv("/drive/MyDrive/RedeNeural/stroke-dataset-tratado_completo.csv")

#dataframe esta agora separado entre x e y, precisamos entao apenas concatená-lo para se tornar um so
df = pd.concat([x,y],axis = 1)

#separando teste e treino (proporcao 80:20) da base de dados separando entre os dados x e o resultado esperado y
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=25, test_size=0.2, shuffle=True)
y_train = np.ravel(y_train)

display(df)


Proporcao antes: 4.8728%


Unnamed: 0,gender,age,hypertension,heart_disease,work_type,Residence_type,avg_glucose_level,bmi,smoking_status
0,a,a,hipertensao,cardiaco,trabalho,residencia,glicose,imc,status


Porcentagem depois: 5.2555%


Unnamed: 0,gender,age,hypertension,heart_disease,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,1,18.000000,0,0,4,1,90.920000,16.000000,0,0
1,0,41.000000,0,0,4,1,106.980000,27.600000,0,0
2,0,51.000000,0,1,2,1,187.470000,34.200000,0,0
3,1,32.000000,0,0,3,0,80.280000,43.700000,0,0
4,1,27.000000,0,0,3,0,73.000000,20.000000,0,0
...,...,...,...,...,...,...,...,...,...,...
6485,1,67.025787,0,0,2,0,61.917737,25.360170,1,1
6486,1,77.821954,0,0,2,0,108.713306,30.835609,0,1
6487,0,73.906574,0,0,3,0,219.816695,33.527163,0,1
6488,0,73.005096,0,0,3,1,131.708178,27.595923,0,1


In [None]:
#@title Criação e treinamento da rede neural com a acurácia

#Utilizando o metodo Multi-layer Perceptron Classifier (MLPClassifier) para ativar e fazer os calculos da rede. 
#o MLPClass.. utiliza o metodo de gradient descent para achar com certa precisao o minimo na loss-function
#utilizamos para comeco de teste 3 hidden layers com cada uma tendo respectivamente 32, 16 e 16 layers
#utilizamos a curva sigmoide (logistic) para a ativação da função visto que facilitaria nossas contas em parte 
#por entregar um resultado de 0-1, sendo assim uma funcao otima devido à natureza probabilistica do nosso objetivo
#o solver escolhido foi o 'adam', por ser, segundo a pagina do sklearn um gradient descent otimizado
rede = MLPClassifier(hidden_layer_sizes=(64,64,64,64), activation='identity', solver = 'adam', batch_size=50, learning_rate_init=0.001, random_state=0, max_iter=2000, beta_1=0.5, beta_2=0.99999, n_iter_no_change=44).fit(x_train, y_train)
accuracy = rede.score(x_test, y_test)
#rede = MLPClassifier(hidden_layer_sizes=(64,64,64,64), activation='logistic', solver = 'adam', batch_size=50, learning_rate_init=0.001, random_state=0, max_iter=2000, beta_1=0.5, beta_2=0.99999, n_iter_no_change=44).fit(x_train, y_train)


print((accuracy*100))


#usar este comando para dizer a probabilidade de alguem ter ou nao o AVC
#print(rede.predict_proba(x_test.values[4:5]))

77.96610169491525


In [None]:
#88.13559322033898% - hidden_layer_sizes=(64,64,64,64), activation='logistic', solver = 'adam', batch_size=50, learning_rate_init=0.001, random_state=0, max_iter=2000, beta_1=0.5, beta_2=0.99999, n_iter_no_change=44

#88.6748844375963 - hidden_layer_sizes=(32,32,32), activation='logistic', solver = 'adam', batch_size=50, learning_rate_init=0.001, random_state=2, max_iter=1300, beta_1=0.5, beta_2=0.99999, n_iter_no_change=44
#/usr/local/lib/python3.7/dist-packages/sklearn/neural_network/_multilayer_perceptron.py:696: ConvergenceWarning: Stochastic Optimizer: Maximum iterations (1300) reached and the optimization hasn't converged yet.
#  ConvergenceWarning,
print(y_test.values[9:10])
#teste = rede.predict_proba(x_test.sample(n = 1))
print(rede.predict_proba(x_test.values[9:10]))
#print(teste[0][1]*100)

[[1]]
[[0.00584073 0.99415927]]


  "X does not have valid feature names, but"


In [None]:
#@title Salvando a rede neural no drive

pkl.dump(rede, open('/drive/MyDrive/RedeNeural/model.pkl', 'wb'))

In [None]:
#@title Carregando rede neural

rede_dwnld = pkl.load(open('/drive/MyDrive/RedeNeural/model.pkl', 'rb'))

In [None]:
accuracy = rede_dwnld.score(x_test, y_test)

print((accuracy*100))
display(x_test)
print(x_test.values[3:4])

print(rede_dwnld.predict_proba(x_test.values[3:4])[0][1])

95.83975346687211


Unnamed: 0,gender,age,hypertension,heart_disease,work_type,Residence_type,avg_glucose_level,bmi,smoking_status
5079,1,48.156375,0,0,3,0,74.037285,21.266240,0
252,0,71.000000,0,0,4,0,99.760000,33.400000,1
5681,1,81.202409,1,0,3,0,81.158671,29.052293,0
6272,0,73.000000,1,0,2,0,194.160513,33.432804,0
5583,0,68.839683,0,0,2,0,76.374810,23.471746,1
...,...,...,...,...,...,...,...,...,...
2425,0,28.000000,0,0,3,1,73.270000,25.400000,2
1758,0,17.000000,0,0,4,1,123.040000,29.600000,0
5922,0,58.052964,0,0,2,0,107.830422,38.435812,1
4781,0,62.972012,0,1,2,0,116.742757,34.532186,1


[[  0.          73.           1.           0.           2.
    0.         194.16051341  33.43280421   0.        ]]
0.9144740570338055




https://www.3blue1brown.com/lessons/neural-networks

https://www.3blue1brown.com/lessons/gradient-descent

https://www.3blue1brown.com/lessons/neural-network-analysis

https://www.3blue1brown.com/lessons/backpropagation

https://www.3blue1brown.com/lessons/backpropagation-calculus

https://www.youtube.com/watch?v=BR9h47Jtqyw 

https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPClassifier.html#sklearn.neural_network.MLPClassifier

https://scikit-learn.org/stable/modules/neural_networks_supervised.html

https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPRegressor.html#sklearn.neural_network.MLPRegressor 

https://imbalanced-learn.org/stable/over_sampling.html

https://www.kaggle.com/code/veronicaxiaohu/stroke-prediction

https://machinelearningmastery.com/implement-backpropagation-algorithm-scratch-python/

https://www.youtube.com/watch?v=M4R77LfyqHM

https://practicaldatascience.co.uk/machine-learning/how-to-save-and-load-machine-learning-models-using-pickle#:~:text=Save%20the%20model%20with%20Pickle,pkl%20.

https://towardsdatascience.com/why-do-we-set-a-random-state-in-machine-learning-models-bb2dc68d8431

https://scikit-learn.org/stable/model_persistence.html

https://medium.com/codex/properly-pickle-out-to-a-path-in-python-when-using-google-colab-741f0905e68b

https://www.geeksforgeeks.org/how-to-get-first-row-of-pandas-dataframe/#:~:text=We%20can%20get%20the%20first%20row%20by%20using%200%20indexes.

https://www.geeksforgeeks.org/how-to-do-train-test-split-using-sklearn-in-python/#:~:text=The%20train_test_split()%20method%20is,training%20and%20fitting%20the%20model.

https://analyticsindiamag.com/a-beginners-guide-to-scikit-learns-mlpclassifier/

https://www.javatpoint.com/numpy-ravel

https://www.geeksforgeeks.org/python-pandas-dataframe-sample/

