# **Import Libraries and import Dataset**

In [None]:
import os
import pandas as pd
import matplotlib.mlab as mlab
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy.optimize import minimize
from sklearn.model_selection import train_test_split
import joblib
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

In [None]:
# Colab-only step: files.upload() opens a file picker in the browser.
# `uploaded` keeps whatever files.upload() returns; the next cell reads the
# CSV from /content, so the dataset file is expected to be uploaded here.
from google.colab import files
uploaded =files.upload()


In [None]:
# Read the stroke dataset CSV from the Colab working directory and preview it.
DATA_PATH = '/content/healthcare-dataset-stroke-data.csv'
rawDataset = pd.read_csv(DATA_PATH)
rawDataset.head()

In [None]:
# Summary statistics of the raw dataset.
rawDataset.describe()

In [None]:
# Drop the id column.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps
# this cell safe to re-run: a second execution no longer raises KeyError
# because 'id' is already gone.
rawDataset = rawDataset.drop(columns=['id'], errors='ignore')

# **Checking Correlation**

In [None]:
import seaborn as sns
# Pearson correlation heat map.
# At this point the frame still contains text columns (they are only encoded
# in later cells), and pandas >= 2.0 raises when corr() meets non-numeric
# data, so the correlation is restricted to numeric columns explicitly.
plt.figure(figsize=(10,10))
cor = rawDataset.corr(numeric_only=True)
sns.heatmap(cor, annot=True, cmap=plt.cm.CMRmap_r)
plt.show()

In [None]:
# Check the distributions of the dataset's columns.
# One large figure is created and its current axes are handed to
# DataFrame.hist, which draws the histograms on that figure.
figures = plt.figure(figsize = (20,25))
axis = figures.gca()
rawDataset.hist(ax = axis)

In [None]:
# Class balance of the 'stroke' column in the raw dataset (before resampling).
sns.countplot(x='stroke',data=rawDataset)

In [None]:
# Column dtypes and non-null counts.
rawDataset.info()

# **Checking Null values**

In [None]:
# Number of missing values in each column.
rawDataset.isnull().sum()

# **Replace Null Values With Mean Value**

In [None]:
# Fill missing bmi values with the column mean, then re-check the null counts.
bmi_column = rawDataset['bmi']
mean_value_bmi = bmi_column.mean()
rawDataset['bmi'] = bmi_column.fillna(mean_value_bmi)
rawDataset.isnull().sum()


In [None]:
# Encode the two binary text columns as 1/0:
#   ever_married:   Yes -> 1, No -> 0
#   Residence_type: Urban -> 1, Rural -> 0
binary_encodings = {
    'ever_married': {'Yes': 1, 'No': 0},
    'Residence_type': {'Urban': 1, 'Rural': 0},
}
for column_name, encoding in binary_encodings.items():
    rawDataset[column_name] = rawDataset[column_name].map(encoding)

In [None]:
# Rename Residence_type to isUrban, matching its Urban -> 1 / Rural -> 0 encoding.
rawDataset = rawDataset.rename(columns={'Residence_type' : 'isUrban'})

# **Balancing the Dataset Using Oversampling**

In [None]:
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTENC

# SMOTENC needs the positions of the categorical features within the feature
# matrix (rawDataset without the 'stroke' target). The positions are derived
# from the column names instead of being hard-coded, so the sampler stays
# correct if the column order changes; a missing column raises KeyError here
# rather than silently mislabelling a numeric feature as categorical.
categorical_columns = ['gender', 'hypertension', 'heart_disease', 'ever_married',
                       'work_type', 'isUrban', 'smoking_status']
feature_columns = rawDataset.columns.drop('stroke')
categorical_indices = [feature_columns.get_loc(name) for name in categorical_columns]

smotenc = SMOTENC(random_state=42, categorical_features=categorical_indices)

In [None]:
# Separate the target column (stroke) from the feature columns.
Y = rawDataset['stroke']
X = rawDataset.drop(columns=['stroke'])

In [None]:
# Oversample with SMOTENC; X and Y are replaced by the resampled data.
# NOTE(review): this runs before the train/test split further down, so
# synthetic rows can end up in the test set — consider resampling only the
# training split.
X, Y=smotenc.fit_resample(X,Y)

In [None]:
# Wrap the resampled target in a DataFrame with a single 'stroke' column.
Y_df = pd.DataFrame(Y, columns=['stroke'])

In [None]:
# Wrap the resampled features in a DataFrame using rawDataset's column names minus 'stroke'.
X_df = pd.DataFrame(X, columns=rawDataset.columns.drop('stroke'))


In [None]:
# Join features and target column-wise into the resampled dataset `ds`.
ds = pd.concat([X_df, Y_df], axis=1)

In [None]:
# Class balance of the 'stroke' column after oversampling.
sns.countplot(x='stroke',data=ds)

# **Onehot Encoding**

In [None]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the multi-category columns in a single fit-and-transform step.
onehot_columns = ['gender', 'work_type', 'smoking_status']
enc = OneHotEncoder()
dataEncoded = enc.fit_transform(ds[onehot_columns])

In [None]:
# Convert the encoded matrix to a dense DataFrame labelled with the encoder's output feature names.
temp=pd.DataFrame(dataEncoded.toarray(),columns=enc.get_feature_names_out())

In [None]:
# Put the one-hot columns first, followed by the remaining features and the
# target; join='inner' keeps only row labels present in both frames.
passthrough_columns = ['avg_glucose_level', 'isUrban', 'age', 'ever_married',
                       'hypertension', 'heart_disease', 'bmi', 'stroke']
ds = pd.concat([temp, ds[passthrough_columns]], axis=1, join='inner')

In [None]:
# Display the encoded dataset.
ds

# **Checking Outliers**

In [None]:
# Box plot of the bmi column of rawDataset to inspect extreme values.
sns.boxplot(x=rawDataset['bmi'])

In [None]:
# Box plot of the avg_glucose_level column of rawDataset to inspect extreme values.
sns.boxplot(x=rawDataset['avg_glucose_level'])

In [None]:
from sklearn.ensemble import IsolationForest

# Label outliers with an Isolation Forest fitted on every column except the
# last one; the contamination value sets the share of rows flagged as
# outliers. fit_predict marks outliers with -1.
rng = np.random.RandomState(42)
feature_block = ds.iloc[:, :-1]
iso = IsolationForest(
    n_estimators=100,
    max_samples='auto',
    contamination=0.1,
    random_state=rng,
)
yhat = iso.fit_predict(feature_block)
yhat.shape

In [None]:
# Keep only the rows that were not labelled -1 (outlier) by the Isolation Forest.
mask = ~(yhat == -1)
ds = ds.iloc[mask]
ds.shape

# **Splitting the training and testing sets**

In [None]:
# Separate the target column (stroke) from the features of the cleaned dataset.
Y = ds['stroke']
X = ds.drop(columns=['stroke'])

In [None]:
# Hold out 25% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.25,
    random_state=42,
)

# **Training Different Models on the Dataset and Checking Accuracy**

# SVM

In [None]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# RBF-kernel support vector classifier, evaluated on the held-out test split.
model = SVC(kernel='rbf', C=1, gamma=0.001, random_state=1)
model.fit(X_train, y_train)
y_predict = model.predict(X_test)

# The accuracy is computed once and kept in `svm` for the model comparison table.
svm = accuracy_score(y_test, y_predict)
print("Accuracy:", svm)

# KNeighborsClassifier

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics

# Fit a k-nearest-neighbours classifier for every k in 1..14 and record its
# test accuracy, keyed by k in `scores` and in iteration order in `scores_list`.
Range_k = range(1, 15)
scores = {}
scores_list = []
for k in Range_k:
    classifier = KNeighborsClassifier(n_neighbors=k)
    classifier.fit(X_train, y_train)
    y_pred3 = classifier.predict(X_test)
    accuracy_for_k = metrics.accuracy_score(y_test, y_pred3)
    scores[k] = accuracy_for_k
    scores_list.append(accuracy_for_k)

# NOTE(review): y_pred3 holds the predictions of the last model in the loop
# (k=14), so that is the accuracy reported below — not the best one across k.
knn_accuracy = metrics.accuracy_score(y_test, y_pred3)
print("Accuracy:", knn_accuracy)

# NOTE(review): this rebinds the imported class name to a float. The name is
# kept because the model comparison table reads the accuracy from it.
KNeighborsClassifier = knn_accuracy

# Gaussian Naive Bayes model

In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn import metrics

# Gaussian Naive Bayes: fit on the training split, predict the test split.
gnb = GaussianNB()
gnb.fit(X_train, y_train)
y_pred2 = gnb.predict(X_test)

# Share of test rows classified correctly, kept for the model comparison table.
Gaussian_Naive_Bayes = metrics.accuracy_score(y_test, y_pred2)
print("Accuracy:", Gaussian_Naive_Bayes)

# RandomForestClassifier

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Random forest with 50 trees. The forest is built with randomness
# (bootstrapping and feature sub-sampling), so random_state is fixed to make
# the reported accuracy reproducible across runs, like the other seeded steps.
classifier = RandomForestClassifier(n_estimators=50, random_state=42)
classifier.fit(X_train, y_train)

y_pred4 = classifier.predict(X_test)

rf_accuracy = accuracy_score(y_test, y_pred4)
print("Accuracy:", rf_accuracy)

# NOTE(review): this rebinds the imported class name to a float. The name is
# kept because the model comparison table reads the accuracy from it; the
# tuning section imports the class again before using it.
RandomForestClassifier = rf_accuracy

In [None]:
# Side-by-side test accuracy of the four baseline models.
modeles = ['SVM','KNeighborsClassifier','Gaussian_Naive_Bayes','RandomForestClassifier']
accuracy_values = [svm, KNeighborsClassifier, Gaussian_Naive_Bayes, RandomForestClassifier]
Accuracies = pd.DataFrame(dict(Model=modeles, Accuracy=accuracy_values))
Accuracies

# **Hyperparameter Tuning**

In [None]:
from imblearn.over_sampling import BorderlineSMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.ensemble import RandomForestClassifier

# Base estimator for the hyperparameter search.
rf = RandomForestClassifier(random_state=42, n_jobs=-1, class_weight='balanced')
# Optional resampling steps, currently disabled:
#over=BorderlineSMOTE(sampling_strategy=0.3, k_neighbors=6)
#under=RandomUnderSampler(sampling_strategy=0.4)

# Number of trees in random forest
n_estimators = [200]
# Search space sampled by the randomized search (72 combinations in total).
random_grid = {'n_estimators': n_estimators,
               # An integer min_samples_split must be >= 2; the previous value
               # 1 is rejected by scikit-learn and made those candidates fail.
               'min_samples_split': [2, 5, 10],
               'min_samples_leaf': [1, 2],
               # 'auto' was an alias of 'sqrt' for classifiers and was removed
               # in scikit-learn 1.3, so it is no longer listed.
               'max_features': ['sqrt', 'log2'],
               'max_depth': [10, 20, 30],
               'bootstrap': [False, True]}

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# 7-fold stratified cross-validation, repeated twice with a fixed seed.
cv = RepeatedStratifiedKFold(n_splits=7, n_repeats=2, random_state=42)

# RandomizedSearchCV samples n_iter candidates from the grid instead of trying
# every combination, which is cheaper for a large search space.
# random_state pins which candidates are sampled; without it each run would
# explore a different subset and pick a different "best" model.
grid_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=random_grid,
    n_iter=50,
    scoring='precision',
    cv=cv,
    n_jobs=-1,
    random_state=42,
)

# Single-step pipeline wrapping the search; later cells call fit/predict on `pipe`.
pipe = Pipeline([('model', grid_search)])

In [None]:
# Run the randomized hyperparameter search on the training split.
pipe.fit(X_train,y_train)

In [None]:
# pipe.score() delegates to the search object, which scores with its
# configured `scoring` metric (precision) — not accuracy. Accuracy is therefore
# computed explicitly, the same way as for the test split.
train_accuracy = np.average(pipe.predict(X_train) == y_train)
print("Training Accuracy :", train_accuracy)

# **Model Accuracy Stats**

In [None]:
# Accuracy on the test split: share of predictions that match the true labels.
y_hat = pipe.predict(X_test)
prediction_is_correct = (y_hat == y_test)
acc = np.average(prediction_is_correct)
print('Accuracy:', acc)

# ROC AUC, computed from the predicted probability in column 1 of predict_proba.
y_scores = pipe.predict_proba(X_test)
positive_class_scores = y_scores[:, 1]
auc = roc_auc_score(y_test, positive_class_scores)
print('AUC:', auc)

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
import itertools

def plot_confusion_matrix(
    cm,
    classes,
    normalize=False,
    title='Confusion matrix',
    cmap=plt.cm.Blues,
    ):
    """Draw a confusion matrix as an annotated heat map on the current figure.

    Parameters
    ----------
    cm : array-like of shape (n_classes, n_classes)
        Confusion matrix; rows are drawn on the 'True label' axis and columns
        on the 'Predicted label' axis.
    classes : sequence
        Tick labels, used for both axes.
    normalize : bool, default False
        If True, every row is divided by its sum so each cell shows a
        fraction of that row and is printed with two decimals. Rows that
        sum to zero are shown as zeros. If False, the values are printed
        as integers.
    title : str
        Title of the plot.
    cmap : matplotlib colormap
        Colormap used for the heat map.
    """
    cm = np.asarray(cm)
    if normalize:
        # Previously the flag only switched the number format; the matrix
        # itself was never normalized.
        row_totals = cm.sum(axis=1, keepdims=True)
        cm = np.divide(
            cm,
            row_totals,
            out=np.zeros(cm.shape, dtype=float),
            where=row_totals != 0,
        )

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    fmt = '.2f' if normalize else 'd'

    # Cells above half the maximum get white text so the label stays
    # readable on the darker background.
    thresh = cm.max() / 2.

    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment='center',
                 color=('white' if cm[i, j] > thresh else 'black'))

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.tight_layout()


# Compute the confusion matrix of the test-set predictions.
cnf_matrix = confusion_matrix(y_test, y_hat)
np.set_printoptions(precision=2)

# The default figure size only applies to figures created afterwards, so it
# has to be set before plt.figure(); setting it after plotting had no effect
# on this figure.
plt.rcParams['figure.figsize'] = (10, 10)

# Plot confusion matrix
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=['0', '1'],
                      title='Confusion matrix')
