In [45]:
import pandas as pd
from joblib import load, dump

from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier

from sklearn.model_selection import train_test_split

from sklearn.metrics import confusion_matrix, precision_score, accuracy_score, f1_score, recall_score, matthews_corrcoef

In [46]:
def get_metrics(y_true=None, y_predict=None, name_model=""):
    row = [
        name_model,
        precision_score(y_pred=y_predict, y_true=y_true),
        accuracy_score(y_true=y_true, y_pred=y_predict),
        recall_score(y_true=y_true, y_pred=y_predict),
        f1_score(y_true=y_true, y_pred=y_predict),
        matthews_corrcoef(y_true=y_true, y_pred=y_predict),
    ]
    return row

In [47]:
df_train = pd.read_csv("../../results/4_train_dataset.csv")
df_test_mix = pd.read_csv("../../results/4_testing_mix.csv")
df_test_neg = pd.read_csv("../../results/4_testing_negatives.csv")

In [48]:
scaler = load("../../results/scaler_process.joblib")

In [49]:
response = df_train["stroke"].values
df_values = df_train.drop(columns=["stroke"]).values

In [50]:
X_train, X_val, y_train, y_val = train_test_split(df_values, response, test_size=.2, random_state=42)

In [51]:
knn_class = KNeighborsClassifier()
knn_class.fit(X_train, y_train)

In [52]:
dt_class = DecisionTreeClassifier()
dt_class.fit(X_train, y_train)

In [53]:
rf_class = RandomForestClassifier()
rf_class.fit(X_train, y_train)

In [54]:
svc_class = SVC()
svc_class.fit(X_train, y_train)

In [55]:
gnv_class = GaussianNB()
gnv_class.fit(X_train, y_train)

In [56]:
sgd_class = SGDClassifier()
sgd_class.fit(X_train, y_train)

In [57]:
knn_predict_val = knn_class.predict(X_val)
dt_predict_val = dt_class.predict(X_val)
svc_predict_val = svc_class.predict(X_val)
rf_predict_val = rf_class.predict(X_val)
gnv_predict_val = gnv_class.predict(X_val)
sgd_predict_val = sgd_class.predict(X_val)

In [58]:
matrix_data = [
    get_metrics(y_true=y_val, y_predict=knn_predict_val, name_model="KNN"),
    get_metrics(y_true=y_val, y_predict=dt_predict_val, name_model="DT"),
    get_metrics(y_true=y_val, y_predict=svc_predict_val, name_model="SVC"),
    get_metrics(y_true=y_val, y_predict=rf_predict_val, name_model="RF"),
    get_metrics(y_true=y_val, y_predict=gnv_predict_val, name_model="GNV"),
    get_metrics(y_true=y_val, y_predict=sgd_predict_val, name_model="SGD")
]

df_performance_val = pd.DataFrame(data=matrix_data, columns=["name_model", "precision_score", "accuracy_score", "recall_score", "f1_score", "matthews_corrcoef"])
df_performance_val

Unnamed: 0,name_model,precision_score,accuracy_score,recall_score,f1_score,matthews_corrcoef
0,KNN,0.6875,0.629213,0.647059,0.666667,0.250391
1,DT,0.738095,0.651685,0.607843,0.666667,0.315453
2,SVC,0.706897,0.696629,0.803922,0.752294,0.370174
3,RF,0.732143,0.719101,0.803922,0.766355,0.41903
4,GNV,0.681818,0.696629,0.882353,0.769231,0.372553
5,SGD,0.706897,0.696629,0.803922,0.752294,0.370174


In [59]:
response_test_mix = df_test_mix["stroke"]
df_test_mix_to_standardize = df_test_mix[["age", "avg_glucose_level", "bmi"]]
df_test_mix_no_standardize = df_test_mix.drop(columns=["stroke", "age", "avg_glucose_level", "bmi"])


In [61]:
scaler_x_mix = scaler.transform(df_test_mix_to_standardize)

df_scaler_mix = pd.DataFrame(data=scaler_x_mix, columns=df_test_mix_to_standardize.columns)
X_test_mix = pd.concat([df_scaler_mix, df_test_mix_no_standardize], axis=1)
X_test_mix.head(5)



Unnamed: 0,age,avg_glucose_level,bmi,hypertension,heart_disease,ever_married,Residence_type,Female,Male,formerly smoked,never smoked,smokes,Unknown,children,Govt_job,Never_worked,Private,Self-employed
0,0.719512,0.353868,0.954545,0,0,1,0,0,1,0,0,0,1,0,0,0,1,0
1,1.0,0.218297,0.718615,0,0,1,0,1,0,0,1,0,0,0,0,0,1,0
2,0.414634,0.317804,0.694805,0,0,1,0,1,0,0,0,1,0,0,1,0,0,0
3,0.634146,0.46493,0.75974,0,0,1,1,1,0,0,1,0,0,0,1,0,0,0
4,0.914634,0.385368,0.625395,0,0,1,1,0,1,0,0,0,1,0,0,0,1,0


In [63]:
knn_predict_test_mix = knn_class.predict(X_test_mix.values)
dt_predict_test_mix = dt_class.predict(X_test_mix.values)
svc_predict_test_mix = svc_class.predict(X_test_mix.values)
rf_predict_test_mix = rf_class.predict(X_test_mix.values)
gnv_predict_test_mix = gnv_class.predict(X_test_mix.values)
sgd_predict_test_mix = sgd_class.predict(X_test_mix.values)

In [64]:
matrix_data = [
    get_metrics(y_true=response_test_mix.values, y_predict=knn_predict_test_mix, name_model="KNN"),
    get_metrics(y_true=response_test_mix.values, y_predict=dt_predict_test_mix, name_model="DT"),
    get_metrics(y_true=response_test_mix.values, y_predict=svc_predict_test_mix, name_model="SVC"),
    get_metrics(y_true=response_test_mix.values, y_predict=rf_predict_test_mix, name_model="RF"),
    get_metrics(y_true=response_test_mix.values, y_predict=gnv_predict_test_mix, name_model="GNV"),
    get_metrics(y_true=response_test_mix.values, y_predict=sgd_predict_test_mix, name_model="SGD")
]

df_performance_test_mix = pd.DataFrame(data=matrix_data, columns=["name_model", "precision_score", "accuracy_score", "recall_score", "f1_score", "matthews_corrcoef"])
df_performance_test_mix

Unnamed: 0,name_model,precision_score,accuracy_score,recall_score,f1_score,matthews_corrcoef
0,KNN,0.565217,0.5,0.464286,0.509804,0.009701
1,DT,0.8,0.68,0.571429,0.666667,0.394771
2,SVC,0.777778,0.74,0.75,0.763636,0.475347
3,RF,0.846154,0.8,0.785714,0.814815,0.600012
4,GNV,0.740741,0.7,0.714286,0.727273,0.394506
5,SGD,0.821429,0.8,0.821429,0.821429,0.594156


In [65]:
response_negative = df_test_neg["stroke"]
df_negative_to_standardize = df_test_neg[["age", "avg_glucose_level", "bmi"]]
df_negative_no_standardize = df_test_neg.drop(columns=["stroke", "age", "avg_glucose_level", "bmi"])

scaler_x_neg = scaler.transform(df_negative_to_standardize)

df_scaler_neg = pd.DataFrame(data=scaler_x_neg, columns=df_negative_to_standardize.columns)
X_test_neg = pd.concat([df_scaler_neg, df_negative_no_standardize], axis=1)
X_test_neg.head(5)



Unnamed: 0,age,avg_glucose_level,bmi,hypertension,heart_disease,ever_married,Residence_type,Female,Male,formerly smoked,never smoked,smokes,Unknown,children,Govt_job,Never_worked,Private,Self-employed
0,0.987805,0.606352,0.746753,1,0,1,0,1,0,0,1,0,0,0,0,0,1,0
1,0.378049,0.219438,0.430736,0,0,0,0,1,0,0,1,0,0,0,0,0,1,0
2,0.756098,0.962943,0.930736,1,0,1,0,1,0,1,0,0,0,0,0,0,0,1
3,0.512195,0.407301,0.701299,0,0,1,0,0,1,1,0,0,0,0,0,0,1,0
4,0.670732,0.343564,0.614719,0,0,0,0,1,0,0,1,0,0,0,0,0,1,0


In [66]:
knn_predict_negative = knn_class.predict(X_test_neg.values)
dt_predict_negative = dt_class.predict(X_test_neg.values)
svc_predict_negative = svc_class.predict(X_test_neg.values)
rf_predict_negative = rf_class.predict(X_test_neg.values)
gnv_predict_negative = gnv_class.predict(X_test_neg.values)
sgd_predict_negative = sgd_class.predict(X_test_neg.values)

matrix_data = [
    get_metrics(y_true=response_negative.values, y_predict=knn_predict_negative, name_model="KNN"),
    get_metrics(y_true=response_negative.values, y_predict=dt_predict_negative, name_model="DT"),
    get_metrics(y_true=response_negative.values, y_predict=svc_predict_negative, name_model="SVC"),
    get_metrics(y_true=response_negative.values, y_predict=rf_predict_negative, name_model="RF"),
    get_metrics(y_true=response_negative.values, y_predict=gnv_predict_negative, name_model="GNV"),
    get_metrics(y_true=response_negative.values, y_predict=sgd_predict_negative, name_model="SGD")
]

df_performance_negative = pd.DataFrame(data=matrix_data, columns=["name_model", "precision_score", "accuracy_score", "recall_score", "f1_score", "matthews_corrcoef"])
df_performance_negative

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Unnamed: 0,name_model,precision_score,accuracy_score,recall_score,f1_score,matthews_corrcoef
0,KNN,0.0,0.659319,0.0,0.0,0.0
1,DT,0.0,0.703852,0.0,0.0,0.0
2,SVC,0.0,0.648408,0.0,0.0,0.0
3,RF,0.0,0.696281,0.0,0.0,0.0
4,GNV,0.0,0.597417,0.0,0.0,0.0
5,SGD,0.0,0.649076,0.0,0.0,0.0


In [68]:
confusion_matrix(response_negative.values, knn_predict_negative)

array([[2961, 1530],
       [   0,    0]])

In [70]:
df_performance_negative["status"] = "Testing negative"
df_performance_test_mix["status"] = "Testing mix"
df_performance_val["status"] = "Validacion"

df_performances = pd.concat([df_performance_val, df_performance_test_mix, df_performance_negative], axis=0)
df_performances

Unnamed: 0,name_model,precision_score,accuracy_score,recall_score,f1_score,matthews_corrcoef,status
0,KNN,0.6875,0.629213,0.647059,0.666667,0.250391,Validacion
1,DT,0.738095,0.651685,0.607843,0.666667,0.315453,Validacion
2,SVC,0.706897,0.696629,0.803922,0.752294,0.370174,Validacion
3,RF,0.732143,0.719101,0.803922,0.766355,0.41903,Validacion
4,GNV,0.681818,0.696629,0.882353,0.769231,0.372553,Validacion
5,SGD,0.706897,0.696629,0.803922,0.752294,0.370174,Validacion
0,KNN,0.565217,0.5,0.464286,0.509804,0.009701,Testing mix
1,DT,0.8,0.68,0.571429,0.666667,0.394771,Testing mix
2,SVC,0.777778,0.74,0.75,0.763636,0.475347,Testing mix
3,RF,0.846154,0.8,0.785714,0.814815,0.600012,Testing mix


In [71]:
df_performances.sort_values(by="matthews_corrcoef", ascending=False)

Unnamed: 0,name_model,precision_score,accuracy_score,recall_score,f1_score,matthews_corrcoef,status
3,RF,0.846154,0.8,0.785714,0.814815,0.600012,Testing mix
5,SGD,0.821429,0.8,0.821429,0.821429,0.594156,Testing mix
2,SVC,0.777778,0.74,0.75,0.763636,0.475347,Testing mix
3,RF,0.732143,0.719101,0.803922,0.766355,0.41903,Validacion
1,DT,0.8,0.68,0.571429,0.666667,0.394771,Testing mix
4,GNV,0.740741,0.7,0.714286,0.727273,0.394506,Testing mix
4,GNV,0.681818,0.696629,0.882353,0.769231,0.372553,Validacion
2,SVC,0.706897,0.696629,0.803922,0.752294,0.370174,Validacion
5,SGD,0.706897,0.696629,0.803922,0.752294,0.370174,Validacion
1,DT,0.738095,0.651685,0.607843,0.666667,0.315453,Validacion
