## Imports

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, recall_score, precision_score, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier

## Reading the data and analyzing it

In [2]:
df = pd.read_csv("healthcare-dataset-stroke-data.csv")
df

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
...,...,...,...,...,...,...,...,...,...,...,...,...
5105,18234,Female,80.0,1,0,Yes,Private,Urban,83.75,,never smoked,0
5106,44873,Female,81.0,0,0,Yes,Self-employed,Urban,125.20,40.0,never smoked,0
5107,19723,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,0
5108,37544,Male,51.0,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked,0


In [3]:
df.dtypes

id                     int64
gender                object
age                  float64
hypertension           int64
heart_disease          int64
ever_married          object
work_type             object
Residence_type        object
avg_glucose_level    float64
bmi                  float64
smoking_status        object
stroke                 int64
dtype: object

In [4]:
# Check the values in the object columns
for column in df.columns.tolist():
    if df[column].dtype not in ['int64', 'float64']:
        display(df[column].value_counts())

gender
Female    2994
Male      2115
Other        1
Name: count, dtype: int64

ever_married
Yes    3353
No     1757
Name: count, dtype: int64

work_type
Private          2925
Self-employed     819
children          687
Govt_job          657
Never_worked       22
Name: count, dtype: int64

Residence_type
Urban    2596
Rural    2514
Name: count, dtype: int64

smoking_status
never smoked       1892
Unknown            1544
formerly smoked     885
smokes              789
Name: count, dtype: int64

In [5]:
df.isnull().sum()

id                     0
gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
work_type              0
Residence_type         0
avg_glucose_level      0
bmi                  201
smoking_status         0
stroke                 0
dtype: int64

## Simple preprocessing

In [6]:
df = df.dropna(subset=['bmi'])
df

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
5,56669,Male,81.0,0,0,Yes,Private,Urban,186.21,29.0,formerly smoked,1
...,...,...,...,...,...,...,...,...,...,...,...,...
5104,14180,Female,13.0,0,0,No,children,Rural,103.08,18.6,Unknown,0
5106,44873,Female,81.0,0,0,Yes,Self-employed,Urban,125.20,40.0,never smoked,0
5107,19723,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,0
5108,37544,Male,51.0,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked,0


In [7]:
df.isnull().sum()

id                   0
gender               0
age                  0
hypertension         0
heart_disease        0
ever_married         0
work_type            0
Residence_type       0
avg_glucose_level    0
bmi                  0
smoking_status       0
stroke               0
dtype: int64

In [8]:
df = df[df['gender'] != 'Other']
df

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
5,56669,Male,81.0,0,0,Yes,Private,Urban,186.21,29.0,formerly smoked,1
...,...,...,...,...,...,...,...,...,...,...,...,...
5104,14180,Female,13.0,0,0,No,children,Rural,103.08,18.6,Unknown,0
5106,44873,Female,81.0,0,0,Yes,Self-employed,Urban,125.20,40.0,never smoked,0
5107,19723,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,0
5108,37544,Male,51.0,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked,0


In [9]:
for column in df.columns.tolist():
    if df[column].dtype not in ['int64', 'float64']:
        display(df[column].value_counts())

gender
Female    2897
Male      2011
Name: count, dtype: int64

ever_married
Yes    3204
No     1704
Name: count, dtype: int64

work_type
Private          2810
Self-employed     775
children          671
Govt_job          630
Never_worked       22
Name: count, dtype: int64

Residence_type
Urban    2490
Rural    2418
Name: count, dtype: int64

smoking_status
never smoked       1852
Unknown            1483
formerly smoked     836
smokes              737
Name: count, dtype: int64

In [10]:
label_encoder = LabelEncoder()

In [11]:
categorical_columns = ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']
numerical_columns = ['age', 'avg_glucose_level', 'bmi']

In [12]:
scaler = StandardScaler()

# Logistic Regression 

## First train

In [44]:
df1 = df.copy().drop('id', axis=1)
for column in df1.columns:
    if df1[column].dtype not in ['int64', 'float64']:
        df1.loc[:, column] = label_encoder.fit_transform(df1[column])
df1

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,1,67.0,0,1,1,2,1,228.69,36.6,1,1
2,1,80.0,0,1,1,2,0,105.92,32.5,2,1
3,0,49.0,0,0,1,2,1,171.23,34.4,3,1
4,0,79.0,1,0,1,3,0,174.12,24.0,2,1
5,1,81.0,0,0,1,2,1,186.21,29.0,1,1
...,...,...,...,...,...,...,...,...,...,...,...
5104,0,13.0,0,0,0,4,0,103.08,18.6,0,0
5106,0,81.0,0,0,1,3,1,125.20,40.0,2,0
5107,0,35.0,0,0,1,3,0,82.99,30.6,2,0
5108,1,51.0,0,0,1,2,0,166.29,25.6,1,0


In [45]:
y1 = df1['stroke']
X1 = df1.drop('stroke', axis=1)

In [46]:
y1.value_counts()

stroke
0    4699
1     209
Name: count, dtype: int64

In [47]:
X1

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status
0,1,67.0,0,1,1,2,1,228.69,36.6,1
2,1,80.0,0,1,1,2,0,105.92,32.5,2
3,0,49.0,0,0,1,2,1,171.23,34.4,3
4,0,79.0,1,0,1,3,0,174.12,24.0,2
5,1,81.0,0,0,1,2,1,186.21,29.0,1
...,...,...,...,...,...,...,...,...,...,...
5104,0,13.0,0,0,0,4,0,103.08,18.6,0
5106,0,81.0,0,0,1,3,1,125.20,40.0,2
5107,0,35.0,0,0,1,3,0,82.99,30.6,2
5108,1,51.0,0,0,1,2,0,166.29,25.6,1


In [48]:
X_scaled = scaler.fit_transform(X1)
df_standartized = pd.DataFrame(X_scaled, columns=['gender', 'age', 'hypertension', 'heart_disease', 'ever_married',
       'work_type', 'Residence_type', 'avg_glucose_level', 'bmi',
       'smoking_status'])

In [49]:
df_standartized

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status
0,1.200240,1.069938,-0.318102,4.381499,0.729270,-0.155713,0.985436,2.777797,0.981145,-0.351828
1,1.200240,1.646336,-0.318102,4.381499,0.729270,-0.155713,-1.014779,0.014016,0.459086,0.585108
2,-0.833166,0.271847,-0.318102,-0.228232,0.729270,-0.155713,0.985436,1.484266,0.701016,1.522044
3,-0.833166,1.601998,3.143642,-0.228232,0.729270,0.759543,-1.014779,1.549325,-0.623231,0.585108
4,1.200240,1.690675,-0.318102,-0.228232,0.729270,-0.155713,0.985436,1.821493,0.013426,-0.351828
...,...,...,...,...,...,...,...,...,...,...
4903,-0.833166,-1.324334,-0.318102,-0.228232,-1.371234,1.674800,-1.014779,-0.049918,-1.310821,-1.288764
4904,-0.833166,1.690675,-0.318102,-0.228232,0.729270,0.759543,0.985436,0.448045,1.414072,0.585108
4905,-0.833166,-0.348890,-0.318102,-0.228232,0.729270,0.759543,-1.014779,-0.502181,0.217156,0.585108
4906,1.200240,0.360524,-0.318102,-0.228232,0.729270,-0.155713,-1.014779,1.373057,-0.419501,-0.351828


In [50]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y1, test_size=0.2)

In [51]:
lgr = LogisticRegression()
lgr.fit(X_train, y_train)
y_pred = lgr.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
rc = recall_score(y_test, y_pred)
pc = precision_score(y_test, y_pred)
acs = accuracy_score(y_test, y_pred)
print('Confussion matrix:\n', cm)
print(f'Recall score: {rc}')
print(f'Precission score: {pc}')
print(f'Accuracy score: {pc}')

Confussion matrix:
 [[928   0]
 [ 54   0]]
Recall score: 0.0
Precission score: 0.0
Accuracy score: 0.0


  _warn_prf(average, modifier, msg_start, len(result))


In [53]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y1, test_size=0.2)
lgr = LogisticRegression(max_iter=1000)
lgr.fit(X_train, y_train)
y_pred = (lgr.predict_proba(X_test)[:, 1] >= 0.05).astype(int)
cm = confusion_matrix(y_test, y_pred)
rc = recall_score(y_test, y_pred)
pc = precision_score(y_test, y_pred)
acs = accuracy_score(y_test, y_pred)
print('Confussion matrix:\n', cm)
print(f'Recall score: {rc}')
print(f'Precission score: {pc}')
print(f'Accuracy score: {pc}')

Confussion matrix:
 [[703 238]
 [  7  34]]
Recall score: 0.8292682926829268
Precission score: 0.125
Accuracy score: 0.125


In [67]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y1, test_size=0.2)
lgr = LogisticRegression(max_iter=1000)
lgr.fit(X_train, y_train)
y_pred = (lgr.predict_proba(X_test)[:, 1] >= 0.3).astype(int)
cm = confusion_matrix(y_test, y_pred)
rc = recall_score(y_test, y_pred)
pc = precision_score(y_test, y_pred)
acs = accuracy_score(y_test, y_pred)
print('Confussion matrix:\n', cm)
print(f'Recall score: {rc}')
print(f'Precission score: {pc}')
print(f'Accuracy score: {pc}')

Confussion matrix:
 [[929   5]
 [ 42   6]]
Recall score: 0.125
Precission score: 0.5454545454545454
Accuracy score: 0.5454545454545454


## Trying a different aproach

In [68]:
df2 = pd.get_dummies(df1, columns=categorical_columns)

In [69]:
df2

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke,gender_0,gender_1,ever_married_0,ever_married_1,...,work_type_1,work_type_2,work_type_3,work_type_4,Residence_type_0,Residence_type_1,smoking_status_0,smoking_status_1,smoking_status_2,smoking_status_3
0,67.0,0,1,228.69,36.6,1,False,True,False,True,...,False,True,False,False,False,True,False,True,False,False
2,80.0,0,1,105.92,32.5,1,False,True,False,True,...,False,True,False,False,True,False,False,False,True,False
3,49.0,0,0,171.23,34.4,1,True,False,False,True,...,False,True,False,False,False,True,False,False,False,True
4,79.0,1,0,174.12,24.0,1,True,False,False,True,...,False,False,True,False,True,False,False,False,True,False
5,81.0,0,0,186.21,29.0,1,False,True,False,True,...,False,True,False,False,False,True,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5104,13.0,0,0,103.08,18.6,0,True,False,True,False,...,False,False,False,True,True,False,True,False,False,False
5106,81.0,0,0,125.20,40.0,0,True,False,False,True,...,False,False,True,False,False,True,False,False,True,False
5107,35.0,0,0,82.99,30.6,0,True,False,False,True,...,False,False,True,False,True,False,False,False,True,False
5108,51.0,0,0,166.29,25.6,0,False,True,False,True,...,False,True,False,False,True,False,False,True,False,False


In [70]:
for column in df2.columns.tolist():
    if column not in numerical_columns:
        df2[column] = df2[column].astype(int)

In [71]:
df2

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke,gender_0,gender_1,ever_married_0,ever_married_1,...,work_type_1,work_type_2,work_type_3,work_type_4,Residence_type_0,Residence_type_1,smoking_status_0,smoking_status_1,smoking_status_2,smoking_status_3
0,67.0,0,1,228.69,36.6,1,0,1,0,1,...,0,1,0,0,0,1,0,1,0,0
2,80.0,0,1,105.92,32.5,1,0,1,0,1,...,0,1,0,0,1,0,0,0,1,0
3,49.0,0,0,171.23,34.4,1,1,0,0,1,...,0,1,0,0,0,1,0,0,0,1
4,79.0,1,0,174.12,24.0,1,1,0,0,1,...,0,0,1,0,1,0,0,0,1,0
5,81.0,0,0,186.21,29.0,1,0,1,0,1,...,0,1,0,0,0,1,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5104,13.0,0,0,103.08,18.6,0,1,0,1,0,...,0,0,0,1,1,0,1,0,0,0
5106,81.0,0,0,125.20,40.0,0,1,0,0,1,...,0,0,1,0,0,1,0,0,1,0
5107,35.0,0,0,82.99,30.6,0,1,0,0,1,...,0,0,1,0,1,0,0,0,1,0
5108,51.0,0,0,166.29,25.6,0,0,1,0,1,...,0,1,0,0,1,0,0,1,0,0


In [72]:
for column in numerical_columns:
    df2[column] = scaler.fit_transform(df2[[column]].values.reshape(-1, 1))

In [73]:
X2 = df2.drop('stroke', axis=1)
y2 = df2['stroke']

In [74]:
X_train, X_test, y_train, y_test = train_test_split(X2, y2, test_size=0.2)
lgr = LogisticRegression(max_iter=1000)
lgr.fit(X_train, y_train)
y_pred = lgr.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
rc = recall_score(y_test, y_pred)
pc = precision_score(y_test, y_pred)
acs = accuracy_score(y_test, y_pred)
print('Confussion matrix:\n', cm)
print(f'Recall score: {rc}')
print(f'Precission score: {pc}')
print(f'Accuracy score: {pc}')

Confussion matrix:
 [[938   0]
 [ 44   0]]
Recall score: 0.0
Precission score: 0.0
Accuracy score: 0.0


  _warn_prf(average, modifier, msg_start, len(result))


In [75]:
X_train, X_test, y_train, y_test = train_test_split(X2, y2, test_size=0.2)
lgr = LogisticRegression(max_iter=1000)
lgr.fit(X_train, y_train)
y_pred = (lgr.predict_proba(X_test)[:, 1] >= 0.02).astype(int)
cm = confusion_matrix(y_test, y_pred)
rc = recall_score(y_test, y_pred)
pc = precision_score(y_test, y_pred)
acs = accuracy_score(y_test, y_pred)
print('Confussion matrix:\n', cm)
print(f'Recall score: {rc}')
print(f'Precission score: {pc}')
print(f'Accuracy score: {pc}')

Confussion matrix:
 [[538 406]
 [  1  37]]
Recall score: 0.9736842105263158
Precission score: 0.0835214446952596
Accuracy score: 0.0835214446952596


In [85]:
X_train, X_test, y_train, y_test = train_test_split(X2, y2, test_size=0.2)
lgr = LogisticRegression(max_iter=1000)
lgr.fit(X_train, y_train)
y_pred = (lgr.predict_proba(X_test)[:, 1] >= 0.35).astype(int)
cm = confusion_matrix(y_test, y_pred)
rc = recall_score(y_test, y_pred)
pc = precision_score(y_test, y_pred)
acs = accuracy_score(y_test, y_pred)
print('Confussion matrix:\n', cm)
print(f'Recall score: {rc}')
print(f'Precission score: {pc}')
print(f'Accuracy score: {pc}')

Confussion matrix:
 [[936   0]
 [ 44   2]]
Recall score: 0.043478260869565216
Precission score: 1.0
Accuracy score: 1.0


## Trying a third aproach (standartize only numerical columns)

In [33]:
df3 = df.copy().drop('id', axis=1)

In [34]:
for column in df3.columns:
    if df3[column].dtype not in ['int64', 'float64']:
        df3.loc[:, column] = label_encoder.fit_transform(df3[column])

In [35]:
for column in numerical_columns:
    df3[column] = scaler.fit_transform(df3[[column]].values.reshape(-1, 1))

In [36]:
X3 = df3.drop('stroke', axis=1)
y3 = df3['stroke']

In [37]:
X_train, X_test, y_train, y_test = train_test_split(X3, y1, test_size=0.2)
lgr = LogisticRegression(max_iter=1000)
lgr.fit(X_train, y_train)
y_pred = lgr.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
rc = recall_score(y_test, y_pred)
pc = precision_score(y_test, y_pred)
acs = accuracy_score(y_test, y_pred)
print('Confussion matrix:\n', cm)
print(f'Recall score: {rc}')
print(f'Precission score: {pc}')
print(f'Accuracy score: {pc}')

Confussion matrix:
 [[939   0]
 [ 42   1]]
Recall score: 0.023255813953488372
Precission score: 1.0
Accuracy score: 1.0


In [90]:
X_train, X_test, y_train, y_test = train_test_split(X2, y1, test_size=0.2)
lgr = LogisticRegression(max_iter=1000)
lgr.fit(X_train, y_train)
y_pred = (lgr.predict_proba(X_test)[:, 1] >= 0.02).astype(int)
cm = confusion_matrix(y_test, y_pred)
rc = recall_score(y_test, y_pred)
pc = precision_score(y_test, y_pred)
acs = accuracy_score(y_test, y_pred)
print('Confussion matrix:\n', cm)
print(f'Recall score: {rc}')
print(f'Precission score: {pc}')
print(f'Accuracy score: {pc}')

Confussion matrix:
 [[528 415]
 [  2  37]]
Recall score: 0.9487179487179487
Precission score: 0.08185840707964602
Accuracy score: 0.08185840707964602


In [88]:
X_train, X_test, y_train, y_test = train_test_split(X2, y1, test_size=0.2)
lgr = LogisticRegression(max_iter=1000)
lgr.fit(X_train, y_train)
y_pred = (lgr.predict_proba(X_test)[:, 1] >= 0.4).astype(int)
cm = confusion_matrix(y_test, y_pred)
rc = recall_score(y_test, y_pred)
pc = precision_score(y_test, y_pred)
acs = accuracy_score(y_test, y_pred)
print('Confussion matrix:\n', cm)
print(f'Recall score: {rc}')
print(f'Precission score: {pc}')
print(f'Accuracy score: {pc}')

Confussion matrix:
 [[942   0]
 [ 39   1]]
Recall score: 0.025
Precission score: 1.0
Accuracy score: 1.0


# Gausian Naive Bayes

In [91]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y1, test_size=0.2)
gnb = GaussianNB()
gnb.fit(X_train, y_train)
y_pred = gnb.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
rc = recall_score(y_test, y_pred)
pc = precision_score(y_test, y_pred)
acs = accuracy_score(y_test, y_pred)
print('Confussion matrix:\n', cm)
print(f'Recall score: {rc}')
print(f'Precission score: {pc}')
print(f'Accuracy score: {pc}')

Confussion matrix:
 [[839 103]
 [ 21  19]]
Recall score: 0.475
Precission score: 0.1557377049180328
Accuracy score: 0.1557377049180328


In [92]:
X_train, X_test, y_train, y_test = train_test_split(X2, y1, test_size=0.2)
gnb = GaussianNB()
gnb.fit(X_train, y_train)
y_pred = gnb.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
rc = recall_score(y_test, y_pred)
pc = precision_score(y_test, y_pred)
acs = accuracy_score(y_test, y_pred)
print('Confussion matrix:\n', cm)
print(f'Recall score: {rc}')
print(f'Precission score: {pc}')
print(f'Accuracy score: {pc}')

Confussion matrix:
 [[338 607]
 [  0  37]]
Recall score: 1.0
Precission score: 0.05745341614906832
Accuracy score: 0.05745341614906832


In [93]:
X_train, X_test, y_train, y_test = train_test_split(X3, y1, test_size=0.2)
gnb = GaussianNB()
gnb.fit(X_train, y_train)
y_pred = gnb.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
rc = recall_score(y_test, y_pred)
pc = precision_score(y_test, y_pred)
acs = accuracy_score(y_test, y_pred)
print('Confussion matrix:\n', cm)
print(f'Recall score: {rc}')
print(f'Precission score: {pc}')
print(f'Accuracy score: {pc}')

Confussion matrix:
 [[829 107]
 [ 24  22]]
Recall score: 0.4782608695652174
Precission score: 0.17054263565891473
Accuracy score: 0.17054263565891473


# Random Forest Regressor

### Fully stadartized dataset

In [94]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y1, test_size=0.2)
rfr = RandomForestRegressor(n_estimators = 100, random_state = 0)
rfr.fit(X_train, y_train)
y_pred = (rfr.predict(X_test) >= 0.5).astype(int)
cm = confusion_matrix(y_test, y_pred)
rc = recall_score(y_test, y_pred)
pc = precision_score(y_test, y_pred)
acs = accuracy_score(y_test, y_pred)
print('Confussion matrix:\n', cm)
print(f'Recall score: {rc}')
print(f'Precission score: {pc}')
print(f'Accuracy score: {pc}')

Confussion matrix:
 [[949   3]
 [ 29   1]]
Recall score: 0.03333333333333333
Precission score: 0.25
Accuracy score: 0.25


In [97]:
feature_imp=pd.Series(rfr.feature_importances_,
index=df.columns.tolist()[1:-1]).sort_values(ascending=False)
feature_imp

avg_glucose_level    0.309273
bmi                  0.259125
age                  0.200629
smoking_status       0.061803
work_type            0.043293
Residence_type       0.030713
gender               0.030526
hypertension         0.025959
heart_disease        0.024660
ever_married         0.014019
dtype: float64

In [110]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y1, test_size=0.2)
rfr = RandomForestRegressor(n_estimators = 100, random_state = 0)
rfr.fit(X_train, y_train)
y_pred = (rfr.predict(X_test) >= 0.05).astype(int)
cm = confusion_matrix(y_test, y_pred)
rc = recall_score(y_test, y_pred)
pc = precision_score(y_test, y_pred)
acs = accuracy_score(y_test, y_pred)
print('Confussion matrix:\n', cm)
print(f'Recall score: {rc}')
print(f'Precission score: {pc}')
print(f'Accuracy score: {pc}')

Confussion matrix:
 [[681 255]
 [ 11  35]]
Recall score: 0.7608695652173914
Precission score: 0.1206896551724138
Accuracy score: 0.1206896551724138


In [99]:
feature_imp=pd.Series(rfr.feature_importances_,
index=df.columns.tolist()[1:-1]).sort_values(ascending=False)
feature_imp

avg_glucose_level    0.323832
bmi                  0.247061
age                  0.198346
smoking_status       0.062311
work_type            0.043438
Residence_type       0.033705
gender               0.026507
hypertension         0.025425
heart_disease        0.021280
ever_married         0.018094
dtype: float64

In [119]:
columns_to_drop = ['gender', 'hypertension', 'heart_disease', 'ever_married',
       'work_type', 'Residence_type']

In [None]:
df_standartized = pd.DataFrame(X_scaled, columns=['gender', 'age', 'hypertension', 'heart_disease', 'ever_married',
       'work_type', 'Residence_type', 'avg_glucose_level', 'bmi',
       'smoking_status'])
df_standartized.drop(columns_to_drop, axis=1)

In [121]:
X_train, X_test, y_train, y_test = train_test_split(df_standartized, y1, test_size=0.2)
rfr = RandomForestRegressor(n_estimators = 100, random_state = 0)
rfr.fit(X_train, y_train)
y_pred = (rfr.predict(X_test) >= 0.5).astype(int)
cm = confusion_matrix(y_test, y_pred)
rc = recall_score(y_test, y_pred)
pc = precision_score(y_test, y_pred)
acs = accuracy_score(y_test, y_pred)
print('Confussion matrix:\n', cm)
print(f'Recall score: {rc}')
print(f'Precission score: {pc}')
print(f'Accuracy score: {pc}')

Confussion matrix:
 [[926   0]
 [ 54   2]]
Recall score: 0.03571428571428571
Precission score: 1.0
Accuracy score: 1.0


In [137]:
X_train, X_test, y_train, y_test = train_test_split(df_standartized, y1, test_size=0.2)
rfr = RandomForestRegressor(n_estimators = 100, random_state = 0)
rfr.fit(X_train, y_train)
y_pred = (rfr.predict(X_test) >= 0.05).astype(int)
cm = confusion_matrix(y_test, y_pred)
rc = recall_score(y_test, y_pred)
pc = precision_score(y_test, y_pred)
acs = accuracy_score(y_test, y_pred)
print('Confussion matrix:\n', cm)
print(f'Recall score: {rc}')
print(f'Precission score: {pc}')
print(f'Accuracy score: {pc}')

Confussion matrix:
 [[692 238]
 [ 12  40]]
Recall score: 0.7692307692307693
Precission score: 0.14388489208633093
Accuracy score: 0.14388489208633093


### Standartized numerical and get dummies

In [164]:
X_train, X_test, y_train, y_test = train_test_split(X2, y1, test_size=0.2)
rfr = RandomForestRegressor(n_estimators = 100, random_state = 0)
rfr.fit(X_train, y_train)
y_pred = (rfr.predict(X_test) >= 0.5).astype(int)
cm = confusion_matrix(y_test, y_pred)
rc = recall_score(y_test, y_pred)
pc = precision_score(y_test, y_pred)
acs = accuracy_score(y_test, y_pred)
print('Confussion matrix:\n', cm)
print(f'Recall score: {rc}')
print(f'Precission score: {pc}')
print(f'Accuracy score: {pc}')

Confussion matrix:
 [[945   0]
 [ 37   0]]
Recall score: 0.0
Precission score: 0.0
Accuracy score: 0.0


  _warn_prf(average, modifier, msg_start, len(result))


In [100]:
feature_imp=pd.Series(rfr.feature_importances_,
index=df.columns.tolist()[1:-1]).sort_values(ascending=False)
feature_imp

avg_glucose_level    0.323832
bmi                  0.247061
age                  0.198346
smoking_status       0.062311
work_type            0.043438
Residence_type       0.033705
gender               0.026507
hypertension         0.025425
heart_disease        0.021280
ever_married         0.018094
dtype: float64

In [171]:
X_train, X_test, y_train, y_test = train_test_split(X2, y1, test_size=0.2)
rfr = RandomForestRegressor(n_estimators = 100, random_state = 0)
rfr.fit(X_train, y_train)
y_pred = (rfr.predict(X_test) >= 0.05).astype(int)
cm = confusion_matrix(y_test, y_pred)
rc = recall_score(y_test, y_pred)
pc = precision_score(y_test, y_pred)
acs = accuracy_score(y_test, y_pred)
print('Confussion matrix:\n', cm)
print(f'Recall score: {rc}')
print(f'Precission score: {pc}')
print(f'Accuracy score: {pc}')

Confussion matrix:
 [[687 258]
 [  9  28]]
Recall score: 0.7567567567567568
Precission score: 0.0979020979020979
Accuracy score: 0.0979020979020979


In [101]:
feature_imp=pd.Series(rfr.feature_importances_,
index=df.columns.tolist()[1:-1]).sort_values(ascending=False)
feature_imp

avg_glucose_level    0.323832
bmi                  0.247061
age                  0.198346
smoking_status       0.062311
work_type            0.043438
Residence_type       0.033705
gender               0.026507
hypertension         0.025425
heart_disease        0.021280
ever_married         0.018094
dtype: float64

In [139]:
X2_1 = X1.drop(columns_to_drop, axis=1)

In [140]:
X_train, X_test, y_train, y_test = train_test_split(X2, y1, test_size=0.2)
rfr = RandomForestRegressor(n_estimators = 100, random_state = 0)
rfr.fit(X_train, y_train)
y_pred = (rfr.predict(X_test) >= 0.5).astype(int)
cm = confusion_matrix(y_test, y_pred)
rc = recall_score(y_test, y_pred)
pc = precision_score(y_test, y_pred)
acs = accuracy_score(y_test, y_pred)
print('Confussion matrix:\n', cm)
print(f'Recall score: {rc}')
print(f'Precission score: {pc}')
print(f'Accuracy score: {pc}')

Confussion matrix:
 [[934   6]
 [ 41   1]]
Recall score: 0.023809523809523808
Precission score: 0.14285714285714285
Accuracy score: 0.14285714285714285


In [None]:
X_train, X_test, y_train, y_test = train_test_split(X2, y1, test_size=0.2)
rfr = RandomForestRegressor(n_estimators = 100, random_state = 0)
rfr.fit(X_train, y_train)
y_pred = (rfr.predict(X_test) >= 0.05).astype(int)
cm = confusion_matrix(y_test, y_pred)
rc = recall_score(y_test, y_pred)
pc = precision_score(y_test, y_pred)
acs = accuracy_score(y_test, y_pred)
print('Confussion matrix:\n', cm)
print(f'Recall score: {rc}')
print(f'Precission score: {pc}')
print(f'Accuracy score: {pc}')

### Standartized numerical columns

In [172]:
X_train, X_test, y_train, y_test = train_test_split(X3, y1, test_size=0.2)
rfr = RandomForestRegressor(n_estimators = 20, random_state = 0)
rfr.fit(X_train, y_train)
y_pred = (rfr.predict(X_test) >= 0.5).astype(int)
cm = confusion_matrix(y_test, y_pred)
rc = recall_score(y_test, y_pred)
pc = precision_score(y_test, y_pred)
acs = accuracy_score(y_test, y_pred)
print('Confussion matrix:\n', cm)
print(f'Recall score: {rc}')
print(f'Precission score: {pc}')
print(f'Accuracy score: {pc}')

Confussion matrix:
 [[929   9]
 [ 44   0]]
Recall score: 0.0
Precission score: 0.0
Accuracy score: 0.0


In [102]:
feature_imp=pd.Series(rfr.feature_importances_,
index=df.columns.tolist()[1:-1]).sort_values(ascending=False)
feature_imp

avg_glucose_level    0.323832
bmi                  0.247061
age                  0.198346
smoking_status       0.062311
work_type            0.043438
Residence_type       0.033705
gender               0.026507
hypertension         0.025425
heart_disease        0.021280
ever_married         0.018094
dtype: float64

In [126]:
X_train, X_test, y_train, y_test = train_test_split(X3, y1, test_size=0.2)
rfr = RandomForestRegressor(n_estimators = 20, random_state = 0)
rfr.fit(X_train, y_train)
y_pred = (rfr.predict(X_test) >= 0.02).astype(int)
cm = confusion_matrix(y_test, y_pred)
rc = recall_score(y_test, y_pred)
pc = precision_score(y_test, y_pred)
acs = accuracy_score(y_test, y_pred)
print('Confussion matrix:\n', cm)
print(f'Recall score: {rc}')
print(f'Precission score: {pc}')
print(f'Accuracy score: {pc}')

Confussion matrix:
 [[657 273]
 [ 14  38]]
Recall score: 0.7307692307692307
Precission score: 0.12218649517684887
Accuracy score: 0.12218649517684887


In [125]:
feature_imp=pd.Series(rfr.feature_importances_,
index=df.columns.tolist()[1:-1]).sort_values(ascending=False)
feature_imp

avg_glucose_level    0.318007
bmi                  0.259940
age                  0.197909
smoking_status       0.060695
work_type            0.043665
Residence_type       0.032636
hypertension         0.024671
gender               0.023899
heart_disease        0.021863
ever_married         0.016715
dtype: float64

## apendix

In [23]:
pca = PCA(n_components=2)
pca.fit(X)
pca_X = pca.transform(X)

In [32]:
pca_X.shape

(4908, 2)