In [8]:
# imports required to run the code.
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

import pandas as pd
from sklearn.tree import plot_tree
import matplotlib.pyplot as plt
import sklearn
import autograd.numpy as np_
import numpy as np
from autograd import grad

In [9]:
# preparing dataset.

dfo = pd.read_csv('healthcare-dataset-stroke-data.csv')
dfo = dfo.fillna(0)

# ---------------------------------------- #

dfo.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,0.0,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [10]:
# preparing dataset.

dfo = pd.read_csv('healthcare-dataset-stroke-data.csv')
dfo = dfo.fillna(0)

dfo = dfo.sort_values(by=['bmi'], ascending=True)
dfo = dfo.reset_index(drop = True)

# ---------------------------------------- #

df = pd.get_dummies(dfo) # converte dados não numéricos.
df = df.drop(columns=['gender_Other', 'gender_Male']) # remoção de dados redundantes.
df.rename(columns={"gender_Female": "gender"}, inplace = True)

# ---------------------------------------- #
df.head()

Unnamed: 0,id,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke,gender,ever_married_No,ever_married_Yes,...,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,Residence_type_Rural,Residence_type_Urban,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
0,34248,50.0,1,0,81.96,0.0,0,0,1,0,...,0,1,0,0,0,1,0,1,0,0
1,29224,30.0,0,0,91.23,0.0,0,0,0,1,...,0,1,0,0,0,1,0,0,0,1
2,49894,78.0,1,1,206.53,0.0,0,1,0,1,...,0,1,0,0,1,0,0,0,1,0
3,37526,68.0,1,1,233.3,0.0,0,1,0,1,...,0,1,0,0,1,0,1,0,0,0
4,29095,71.0,1,0,93.6,0.0,0,0,0,1,...,0,0,1,0,1,0,0,0,1,0


In [11]:
# normalizing dataframe.

filtery = (df['stroke'] == 0)
dfo1 = df[filtery]

dfo1 = dfo1.iloc[:300]

filtery = (df['stroke'] == 1)
dfo2 = df[filtery]

dfo = dfo1.append(dfo2)

# ----------------- #

# excluding more redundant data.
dfa = dfo.drop(columns=['stroke','id','ever_married_No','Residence_type_Rural','smoking_status_formerly smoked']).astype(float)

# train prep
X_train, X_test, y_train, y_test = train_test_split(dfa, dfo['stroke'], train_size=0.5)
y_train = y_train.replace(to_replace=0,value=-1)
y_test = y_test.replace(to_replace=0,value=-1)
y_train.value_counts()


# ----------------- #

  dfo = dfo1.append(dfo2)


-1    154
 1    120
Name: stroke, dtype: int64

In [12]:
# To Numpy
X_train,X_test,y_train,y_test = X_train.to_numpy(),X_test.to_numpy(),y_train.to_numpy(),y_test.to_numpy()

In [13]:
# ---------------------------------------- #

def predict( w, b, pontos ): # Uses data obtained to extrapolate real value.
    est = w.T @ pontos + b
    return est


def accuracy(y_test, y_est): # Compares extrapolated data with real data.
    return np.mean(np.sign(y_test) == np.sign(y_est))

def loss( parametros ): # 
    w, b, pontos, val = parametros
    est = w.T @ pontos + b
    mse = np_.mean( (est - val)**2)
    return mse

# ---------------------------------------- #

In [14]:
# Generating data for accuracy test.
# Current values are going to take a while.

results = []
for _ in range(10):
    g = grad(loss)

    pontos = X_train.T
    alvos = (y_train).astype(float)

    w = np.random.randn( pontos.shape[0],1)
    b = 0.0
    alpha = 0.000001

    for n in range(100000):
        grad_ = g( (w, b, pontos, alvos) )
        w -= alpha*grad_[0]
        b -= alpha*grad_[1]

    y_pred = (predict( w, b, X_test.T )).reshape((y_test.shape))

    results.append(accuracy(y_test, y_pred))

In [15]:
# prints accuracy data.

print(np.array(results).mean())
print(np.array(results).std())

0.6159999999999999
0.09769847355786677


Com base nos parametros normalisados, podemos afirmar que nosso classificador tem um funcionamento teórico bom - acertando em media 60% das vezes.

In [16]:
# Generating prediction values.

g = grad(loss)

pontos = X_train.T
alvos = (y_train).astype(float)

w = np.random.randn( pontos.shape[0],1)
b = 0.0
alpha = 0.000001
print(w)
for n in range(100000):
    grad_ = g( (w, b, pontos, alvos) )
    w -= alpha*grad_[0]
    b -= alpha*grad_[1]

y_pred = (predict( w, b, X_test.T )).reshape((y_test.shape))

[[ 0.24785399]
 [ 0.08178801]
 [ 0.04249051]
 [ 1.39151283]
 [-1.28416604]
 [-1.05066268]
 [ 0.15598179]
 [-0.42760918]
 [ 0.59368556]
 [ 0.10079456]
 [ 1.07123987]
 [-0.61293936]
 [ 0.95309774]
 [ 0.06581073]
 [ 0.13137143]
 [-0.35574231]]


In [19]:
from collections import OrderedDict

columns = list(dfa.columns)
res = {columns[i]: float(w[i]) for i in range(len(w))}
sorted(res.items(), key=lambda t: t[1], reverse=True)

[('work_type_Self-employed', 1.041907432927806),
 ('Residence_type_Urban', 0.8905267999459772),
 ('work_type_Never_worked', 0.5936855570567383),
 ('ever_married_Yes', 0.1487704159036878),
 ('smoking_status_never smoked', 0.1367572483447336),
 ('work_type_Private', 0.10700079799414651),
 ('hypertension', 0.0709654372100363),
 ('heart_disease', 0.037670367103386004),
 ('bmi', 0.036853390628247176),
 ('smoking_status_Unknown', 0.03532927466292671),
 ('age', -0.0005915237221607413),
 ('avg_glucose_level', -0.004987872485936206),
 ('smoking_status_smokes', -0.35898581193108114),
 ('work_type_Govt_job', -0.41403522381982),
 ('work_type_children', -0.6276051334615018),
 ('gender', -1.0024944161663631)]

[('smoking_status_never smoked', 1.9662127728927499),
 ('work_type_Govt_job', 1.5288039518764849),
 ('heart_disease', 1.3447350243293723),
 ('work_type_Private', 0.7072463824185442),
 ('ever_married_Yes', 0.4894299210636069),
 ('hypertension', 0.26751445224545006),
 ('work_type_children', 0.11226938267801652),
 ('bmi', 0.028330032893819176),
 ('gender', 0.0008012721255334959),
 ('avg_glucose_level', -0.0048443433053298015),
 ('age', -0.007321216850998305),
 ('smoking_status_smokes', -0.6108328187336413),
 ('Residence_type_Urban', -0.6731651799177507),
 ('smoking_status_Unknown', -0.7852556759763394),
 ('work_type_Never_worked', -1.1298236983714267),
 ('work_type_Self-employed', -1.7866693388941148)]

In [20]:
from sklearn.tree import DecisionTreeClassifier

df_features = dfa
df_rotulo = df['stroke']
tree = DecisionTreeClassifier(criterion='entropy')

# Agora, vamos usar o método .fit() para ajustar os parâmetros da árvore:
tree.fit(df_features, df_rotulo)

# Podemos visualizar a árvore de decisão em uma figura!
from sklearn.tree import plot_tree
import matplotlib.pyplot as plt

plt.figure( figsize=(20,20) )
a = plot_tree(tree, feature_names=df_features.columns, fontsize=15, 
              node_ids=False, impurity=False, filled=True)


ValueError: Number of labels=5110 does not match number of samples=549