In [33]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import accuracy_score



Data collection and Analysis

In [34]:
# loading the stroke dataset to a pandas dataframe

stroke_dataset = pd.read_csv('/content/healthcare-dataset-stroke-data.csv')

In [35]:
#printing first five rows of the dataset
stroke_dataset.head()

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke
0,67.0,0,1,228.69,36.6,1
1,61.0,0,0,202.21,,1
2,80.0,0,1,105.92,32.5,1
3,49.0,0,0,171.23,34.4,1
4,79.0,1,0,174.12,24.0,1


In [36]:
from sklearn.metrics.cluster import davies_bouldin_score
# number of rows and columns in this dataset
stroke_dataset.shape

(5110, 6)

In [37]:
from scipy.sparse import dia
#getting the statistical measures of the data
stroke_dataset.describe()

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke
count,5110.0,5110.0,5110.0,5110.0,4909.0,5110.0
mean,43.226614,0.097456,0.054012,106.147677,28.893237,0.048728
std,22.612647,0.296607,0.226063,45.28356,7.854067,0.21532
min,0.08,0.0,0.0,55.12,10.3,0.0
25%,25.0,0.0,0.0,77.245,23.5,0.0
50%,45.0,0.0,0.0,91.885,28.1,0.0
75%,61.0,0.0,0.0,114.09,33.1,0.0
max,82.0,1.0,1.0,271.74,97.6,1.0


In [38]:
# Drop the column with missing BMI values
stroke_dataset = stroke_dataset.dropna(subset=['bmi'])


In [39]:
# checking number of rows and columns in this dataset
stroke_dataset.shape

(4909, 6)

In [40]:
stroke_dataset['stroke'].value_counts()

0    4700
1     209
Name: stroke, dtype: int64

0---> no stroke
1---> stroke

In [41]:
stroke_dataset.groupby('stroke').mean()

Unnamed: 0_level_0,age,hypertension,heart_disease,avg_glucose_level,bmi
stroke,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,41.760451,0.083191,0.043191,104.003736,28.823064
1,67.712919,0.287081,0.191388,134.571388,30.471292


In [42]:
# separating the data and labels

X = stroke_dataset.drop(columns = 'stroke', axis=1)
Y = stroke_dataset['stroke']

In [43]:
print(X)

       age  hypertension  heart_disease  avg_glucose_level   bmi
0     67.0             0              1             228.69  36.6
2     80.0             0              1             105.92  32.5
3     49.0             0              0             171.23  34.4
4     79.0             1              0             174.12  24.0
5     81.0             0              0             186.21  29.0
...    ...           ...            ...                ...   ...
5104  13.0             0              0             103.08  18.6
5106  81.0             0              0             125.20  40.0
5107  35.0             0              0              82.99  30.6
5108  51.0             0              0             166.29  25.6
5109  44.0             0              0              85.28  26.2

[4909 rows x 5 columns]


In [44]:
print(Y)

0       1
2       1
3       1
4       1
5       1
       ..
5104    0
5106    0
5107    0
5108    0
5109    0
Name: stroke, Length: 4909, dtype: int64


Data standardisation

In [45]:
# Standardize the numerical columns
scaler = StandardScaler()
scaler.fit(X)

In [46]:
standarized_data = scaler.transform(X)

In [47]:
print(standarized_data)

[[ 1.07013796 -0.31806673  4.38196829  2.77769839  0.98134488]
 [ 1.64656262 -0.31806673  4.38196829  0.0138418   0.45926914]
 [ 0.27201152 -0.31806673 -0.22820795  1.48413156  0.70120668]
 ...
 [-0.34875349 -0.31806673 -0.22820795 -0.50236926  0.21733161]
 [ 0.36069224 -0.31806673 -0.22820795  1.37291993 -0.41934612]
 [ 0.05030973 -0.31806673 -0.22820795 -0.45081569 -0.34294479]]


In [48]:
X = standarized_data
Y = stroke_dataset['stroke']

Train test split

In [49]:
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.2, stratify=Y, random_state=2)

In [50]:
print(X.shape, X_train.shape, X_test.shape)

(4909, 5) (3927, 5) (982, 5)


Training the model

In [51]:
Classifier = svm.SVC(kernel='linear')

In [52]:
# training the support vector machine classifier

Classifier.fit(X_train, Y_train)

Model Evaluation

Accuracy score

In [53]:
#accuracy score on the training data
X_train_prediction = Classifier.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction, Y_train)

In [54]:
print('Accuracy score of the training data : ', training_data_accuracy)

Accuracy score of the training data :  0.9574738986503692


In [55]:
#accuracy score on the test data
X_test_prediction = Classifier.predict(X_test)
test_data_accuracy = accuracy_score(X_test_prediction, Y_test)

In [56]:
print('Accuracy score of the test data : ', test_data_accuracy)

Accuracy score of the test data :  0.9572301425661914


In [57]:
# checking number of rows and columns in this dataset
stroke_dataset.shape

(4909, 6)

Making a input data

In [58]:
input_data = (0.72,0,0,62.13,16.8)

#change input data to numpy array
input_data_as_numpy_array = np.asarray(input_data)

# reshape the array as we are prediciting for one instance
input_data_reshaped = input_data_as_numpy_array.reshape(1,-1)

#standarizing the input data
std_data = scaler.transform(input_data_reshaped)
print(std_data)

predicition = Classifier.predict(std_data)
print(predicition)

if (predicition[0]==0):
  print('The person doesnt have chance of stroke')
else:
  print('The person have chance of stroke')

[[-1.86874096 -0.31806673 -0.22820795 -0.97197949 -1.53989892]]
[0]
The person doesnt have chance of stroke


