## 1. Problem Definition

Can we predict (classify) whether a patient is susceptible to stroke or not?

In [None]:
!pip3 install seaborn

In [None]:
# EDA and plotting libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns # seaborn gets shortened to sns

%matplotlib inline

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

# Model evaluators
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import plot_roc_curve

## Exploratory Data Analysis

In [None]:
# Load the data
df = pd.read_csv("../input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv")

In [None]:
df.info()

In [None]:
df["smoking_status"].unique()

In [None]:
df.head()

In [None]:
df.shape

In [None]:
# Let's see how many positives(1) and negatives(0) in our target
df["stroke"].value_counts()

In [None]:
# Normalized value counts
df["stroke"].value_counts(normalize=True)*100

We have an **unbalanced** target column: there are many more negative (0, no stroke) samples than positive (1, stroke) samples.

In [None]:
# Visualizing the value counts
df["stroke"].value_counts().plot(kind="bar", color=["skyblue", "lightgreen"]);

In [None]:
# Getting metrics on the columns
df.describe()

### `Stroke` with respect to `gender`

In [None]:
pd.crosstab(df.gender, df.stroke)

There are more **Female** patients with stroke than **Male** patients. There is also a single sample with gender **Other**; since it is an outlier, we'll drop that sample.

In [None]:
df = df[df["gender"] != "Other"]

In [None]:
pd.crosstab(df.gender, df.stroke).plot(kind="bar", color=["skyblue", "lightgreen"]);

### `Stroke` with respect to `ever_married`

In [None]:
pd.crosstab(df["ever_married"], df["stroke"])

**ever_married** people have stroke more than **never_married**. (Not sure what to say about this one 😜)

In [None]:
# Visualizing the crosstab
pd.crosstab(df["ever_married"], df["stroke"]).plot(kind="bar", color=["skyblue", "lightgreen"]);

### `residence_type` with respect to `stroke`

In [None]:
pd.crosstab(df["Residence_type"], df["stroke"])

**Urban** residents have a slight edge over **rural** residents in stroke possibility.

In [None]:
# Visualizing residence_type with target
pd.crosstab(df["Residence_type"], df["stroke"]).plot(kind="bar", color=["skyblue", "lightgreen"]);

### `smoking_status` with respect to `stroke`

In [None]:
pd.crosstab(df["smoking_status"], df["stroke"])

In [None]:
pd.crosstab(df["smoking_status"], df["stroke"]).plot(kind="bar");

In [None]:
df["smoking_status"].value_counts(normalize=True)

The **never smoked** category has more stroke cases than the other categories, but that category also makes up 37% of the samples.

### `work_type` with respect to `stroke`

In [None]:
pd.crosstab(df["work_type"], df["stroke"])

In [None]:
df["work_type"].value_counts(normalize=True)

In [None]:
pd.crosstab(df["work_type"], df["stroke"]).plot(kind="bar");

People in **private** work appear more susceptible to stroke compared to other categories, but the *private* category makes up 57% of the samples - possibly conveying that they have the option to visit the hospital or have a health check more often than other categories.

### `hypertension` with respect to `stroke`

In [None]:
pd.crosstab(df["hypertension"], df["stroke"])

**hypertension** people are more susceptible to stroke

In [None]:
pd.crosstab(df["hypertension"], df["stroke"]).plot(kind="bar", color=["skyblue", "lightgreen"]);

### `age` with respect to `stroke`

In [None]:
pd.crosstab(df["age"], df["stroke"]).plot(kind="line");

**Aged** people are more susceptible to stroke than young ones

### `bmi` with respect to `stroke`

In [None]:
pd.crosstab(df["bmi"], df["stroke"]).plot(kind="line");

### `age` and `bmi` impact on `stroke`

In [None]:
# Scatter of age (x) against BMI (y), coloured by stroke outcome.
fig, ax = plt.subplots(figsize=(10,6))

# Negative examples first (stroke == 0)
ax.scatter(df.age[df.stroke==0],
           df.bmi[df.stroke==0],
           c="lightblue")

# Positive examples (stroke == 1) drawn on the same axes so the two groups
# can be compared directly
ax.scatter(df.age[df.stroke==1],
           df.bmi[df.stroke==1],
           c="salmon")

# Title now names the variables that are actually plotted (age and BMI)
ax.set_title("Stroke in function of Age and BMI")
ax.set_xlabel("Age")
# Legend entries follow the order of the scatter calls above
ax.legend(["No Disease", "Disease"])
ax.set_ylabel("BMI");

Higher age and lower BMI appear to contribute to the possibility of stroke.

### `age` and `average_glucose_level` impact on `stroke`

In [None]:
# Scatter of age (x) against average glucose level (y), coloured by stroke outcome.
fig, ax = plt.subplots(figsize=(10,6))

# Negative examples first (stroke == 0)
ax.scatter(df.age[df.stroke==0],
           df.avg_glucose_level[df.stroke==0],
           c="lightblue")

# Positive examples (stroke == 1) drawn on the same axes so the two groups
# can be compared directly
ax.scatter(df.age[df.stroke==1],
           df.avg_glucose_level[df.stroke==1],
           c="salmon")

# Title now names the variables that are actually plotted (age and glucose level)
ax.set_title("Stroke in function of Age and Average Glucose level")
ax.set_xlabel("Age")
# Legend entries follow the order of the scatter calls above
ax.legend(["No Disease", "Disease"])
ax.set_ylabel("Average Glucose level");

Older people with low glucose level are more susceptible to stroke

In [None]:
df.isna().sum()

In [None]:
df_with_nan_bmi = df[df["bmi"].isnull()]
df_with_nan_bmi.to_csv("nan-bmi-samples.csv")

In [None]:
# Now we've saved the nan bmi samples to a scv let's drop them from the dataframe
df = df.dropna()

Let's convert the categorical features to numbers using pandas

In [None]:
df.head()

In [None]:
df["gender"].unique()

### Getting data ready

1. Let's convert `Male` to `0` and `Female` to `1` in `gender` feature

In [None]:
df['gender'] = np.where((df.gender == 'Male'),'0',df.gender)
df['gender'] = np.where((df.gender == 'Female'),'1',df.gender)

In [None]:
df["gender"] = df["gender"].astype('int64')

In [None]:
df.info()

In [None]:
df["ever_married"].unique()

2. Let's convert `Yes` to `0` and `No` to `1` in `ever_married` feature

In [None]:
# 'Yes' -> '0', 'No' -> '1' as string codes, then cast the column to integers
df["ever_married"] = (
    df["ever_married"]
    .replace({"Yes": "0", "No": "1"})
    .astype("int64")
)
df.head()

In [None]:
df["work_type"].unique()

3. Let's convert `work_type` feature using below mapping
* `Private`: `0`
* `Self-employed`: `1`
* `Govt_job`: `2`
* `children`: `3`
* `Never_worked`: `4`

In [None]:
# Apply the work_type mapping listed above in a single pass (string codes
# first, exactly as before), then cast the column to integers
df["work_type"] = (
    df["work_type"]
    .replace({"Private": "0",
              "Self-employed": "1",
              "Govt_job": "2",
              "children": "3",
              "Never_worked": "4"})
    .astype("int64")
)
df.head()

In [None]:
df["Residence_type"].unique()

4. Let's convert the `Residence_type` feature:
* `Urban`: `0`
* `Rural`: `1`

In [None]:
# 'Urban' -> '0', 'Rural' -> '1' as string codes, then cast the column to integers
df["Residence_type"] = (
    df["Residence_type"]
    .replace({"Urban": "0", "Rural": "1"})
    .astype("int64")
)
df.head()

In [None]:
df["smoking_status"].unique()

5. Let's convert the `smoking_status` feature:
* `formerly smoked`: `0`
* `never smoked`: `1`
* `smokes`: `2`
* `Unknown`: `3`

In [None]:
# Apply the smoking_status mapping listed above in a single pass (string
# codes first, exactly as before), then cast the column to integers
df["smoking_status"] = (
    df["smoking_status"]
    .replace({"formerly smoked": "0",
              "never smoked": "1",
              "smokes": "2",
              "Unknown": "3"})
    .astype("int64")
)

In [None]:
df.head()

In [None]:
df.dtypes

One Final EDA to check the relationship between independent variables using correlation matrix

In [None]:
corr_matrix = df.corr()
corr_matrix

In [None]:
# Let's plot the correlation matrix
plt.figure(figsize=(30,20))
sns.heatmap(corr_matrix,
           annot=True,
           linewidths=0.5,
           fmt=".2f",
           cmap="YlGnBu");

Much better. A higher positive value means a potential positive correlation (increase) and a higher negative value means a potential negative correlation (decrease).

**Now all the categorical features are converted to numerical** Let's model yeah😎

## Modelling

1. Let's split our features and target

In [None]:
# Features: every column of df except the target.
# NOTE(review): only "stroke" is dropped, so the `id` column stays in X and is
# fed to the models as a feature (no scaling is applied anywhere in this
# notebook, which matters for distance-based KNN). The last cells read
# X_test["id"], so removing it here would require carrying the ids separately.
X = df.drop("stroke", axis=1)
# Target labels as a plain NumPy array
Y = df.stroke.values

In [None]:
# Seed NumPy's global RNG so the shuffle below is reproducible
np.random.seed(42)

# Split the data into train and test sets (80% / 20%).
# The target is heavily imbalanced, so stratify on Y to keep the same
# stroke / no-stroke proportion in both parts; otherwise the small positive
# class can end up over- or under-represented in the test set by chance.
X_train, X_test, Y_train, Y_test = train_test_split(X,
                                                    Y,
                                                    test_size=0.2,
                                                    stratify=Y)

In [None]:
X_train.head()

In [None]:
len(X_train), len(Y_train), len(X_test), len(Y_test)

### Let's train the data with few models and see how it goes

In [None]:
# Put models in a dictionary
models = {"KNN": KNeighborsClassifier(),
          "Logistic Regression": LogisticRegression(), 
          "Random Forest": RandomForestClassifier()}

def fit_and_score(models, x_train, x_test, y_train, y_test):
    """
    Fit every model on the training data and score it on the test data.

    models - dictionary mapping a display name to an estimator object
             exposing fit(x, y) and score(x, y); each estimator is fitted
             in place
    x_train - training features
    x_test - test features
    y_train - training labels
    y_test - test labels

    Returns a dictionary mapping each model name to the value returned by
    that model's score(x_test, y_test), in the iteration order of `models`.
    """
    # Reset NumPy's global RNG so repeated calls start from the same state
    np.random.seed(42)

    def _fit_then_score(estimator):
        # Fit first, then evaluate on the held-out data
        estimator.fit(x_train, y_train)
        return estimator.score(x_test, y_test)

    return {label: _fit_then_score(estimator)
            for label, estimator in models.items()}

In [None]:
model_scores = fit_and_score(models=models,
                             x_train=X_train,
                             x_test=X_test,
                             y_train=Y_train,
                             y_test=Y_test)
model_scores

These high accuracy scores are misleading. Because the dataset is unbalanced, a model can minimize its error simply by predicting the negative (no stroke) class for most cases. Let's evaluate the models with other metrics such as the F1 score.

In [None]:
model_compare = pd.DataFrame(model_scores, index=['accuracy'])
model_compare.T.plot.bar();

## Hypertuning the three models

In [None]:
# Candidate n_neighbors values to sweep over: 1 through 20
neighbours = range(1, 21)

# Score of each candidate on the training split and on the test split
train_scores, test_scores = [], []

# A single estimator is reused; only n_neighbors changes between fits
knn = KNeighborsClassifier()

for i in neighbours:
    # Update the neighbour count and refit on the training data
    knn.set_params(n_neighbors=i).fit(X_train, Y_train)

    # Record how the refitted model scores on both splits
    train_scores += [knn.score(X_train, Y_train)]
    test_scores += [knn.score(X_test, Y_test)]

In [None]:
train_scores, test_scores

In [None]:
plt.plot(neighbours, train_scores, label="Train score")
plt.plot(neighbours, test_scores, label="Test score")
plt.xticks(np.arange(1, 21, 1))
plt.xlabel("Number of neighbors")
plt.ylabel("Model score")
plt.legend()

print(f"Maximum KNN score on the test data: {max(test_scores)*100:.2f}%")

There seems to be **no improvement** after tuning `n_neighbors` for KNN.

As mentioned above, the model scores very well on the training data, which looks like overfitting. Let's tune the hyperparameters of the other models, and check some more metrics such as the classification report and the ROC curve.

### Tuning model with RandomizedSearchCV

In [None]:
# Different LogisticRegression hyperparameters
log_reg_grid = {"C": np.logspace(-4, 4, 20),
                "solver": ["liblinear"]}

# Different RandomForestClassifier hyperparameters
rf_grid = {"n_estimators": np.arange(10, 1000, 50),
           "max_depth": [None, 3, 5, 10],
           "min_samples_split": np.arange(2, 20, 2),
           "min_samples_leaf": np.arange(1, 20, 2)}

In [None]:
# Setup random seed
np.random.seed(42)

# Setup Random hyper parameter tuning for Logistic Regression
rs_log_reg = RandomizedSearchCV(LogisticRegression(), param_distributions=log_reg_grid,
                               cv=5,
                               n_iter=20,
                               verbose=True)

rs_log_reg.fit(X_train, Y_train)

In [None]:
# Finding the best hyper parameters
rs_log_reg.best_params_

In [None]:
rs_log_score = rs_log_reg.score(X_test, Y_test)

In [None]:
model_scores["Logistic Regression"] - rs_log_score

There's a decrease of 0.0010183299389002753 in the logistic regression score after hyperparameter tuning.

In [None]:
# Tuning random forest classifier using randomsearchcv parameters
rs_rf = RandomizedSearchCV(RandomForestClassifier(),
                          param_distributions=rf_grid,
                          cv=5,
                          n_iter=20,
                          verbose=True)

rs_rf.fit(X_train, Y_train)

rs_rf.score(X_test, Y_test)

In [None]:
rs_rf.best_params_

In [None]:
rs_rf_score = rs_rf.score(X_test, Y_test)

In [None]:
model_scores["Random Forest"] - rs_rf_score

The tuned random forest classifier performs better than the untuned baseline model.

Let's find out more metrics regarding the RandomizedSearchCV random forest classifier model.

## Model evaluation beyond accuracy

We'll use the below metrics,

1. ROC and AUC curve
2. Confusion matrix
3. Classification report
4. Recall score
5. F1 score

To make comparisons and evaluations we'll need predictions, so let's get them.

In [None]:
Y_preds = rs_rf.predict(X_test)

In [None]:
# Import roc curve from metrics module
# NOTE(review): plot_roc_curve was deprecated in scikit-learn 1.0 and removed
# in 1.2; on newer versions the equivalent call is
# RocCurveDisplay.from_estimator(rs_rf, X_test, Y_test). Confirm the
# scikit-learn version this notebook is expected to run on.
from sklearn.metrics import plot_roc_curve

# Draw the ROC curve of the tuned random forest search on the test split
plot_roc_curve(rs_rf, X_test, Y_test)

RandomizedSearchCV random forest model does good with an auc of 0.84

Let's proceed with confusion matrix

In [None]:
print(confusion_matrix(Y_test, Y_preds))

Let's visualize the confusion matrix using sns heatmap

#### Confusion matrix

In [None]:
# Import seaborn
import seaborn as sns

def plot_conf_matrix(y_test, y_preds):
    """
    Plot the confusion matrix of true vs. predicted labels as a heatmap.

    y_test - true labels
    y_preds - predicted labels, in the same order as y_test

    Draws on a new 3x3 inch figure and returns nothing.
    """
    fig, ax = plt.subplots(figsize=(3,3))
    # fmt="d" prints the cell counts as plain integers instead of the default
    # general format, which switches to scientific notation for larger counts.
    sns.heatmap(confusion_matrix(y_test, y_preds),
                annot=True,
                fmt="d",
                cbar=False,
                ax=ax)

    # confusion_matrix puts the true labels on the rows and the predicted
    # labels on the columns, so the heatmap's x axis is the prediction and
    # its y axis is the truth.
    ax.set_xlabel("Predicted label")
    ax.set_ylabel("True label")

#plot_conf_matrix(Y_test, Y_preds)

#### Classification report

In [None]:
print(classification_report(Y_test, Y_preds))

### Inferences

Looking at the classification report and confusion matrix, the model is unable to predict true positives at all due to the class imbalance in the dataset.

We can see this more clearly in the classification report's f1-score: 0.97 for class 0 and 0 for class 1.

The macro average, which suffers when there is class imbalance, is 0.49, which is pretty poor.


Let's give a try if the model improves with best hyper parameters and calculate the `cross_val_score` for all the metrics and visualize them

In [None]:
from sklearn.model_selection import cross_val_score

In [None]:
rs_rf.best_params_

In [None]:
# Build the classifier from the hyperparameters the randomized search
# actually selected, rather than from values copied by hand out of an earlier
# run (the search result can differ between runs, which would leave
# hand-copied numbers stale).
clf = RandomForestClassifier(**rs_rf.best_params_)

In [None]:
# Cross-validated F1 score
cv_f1 = np.mean(cross_val_score(clf,
                                X,
                                Y,
                                cv=5, # 5-fold cross-validation
                                scoring="f1")) # f1 as scoring
cv_f1

In cross-validation the F1 score becomes zero, meaning our model performance is 😕😑

Cross-validation does not improve the result.

## Dataset based imbalance techniques - oversampling

In [None]:
!pip install -U imbalanced-learn

In [None]:
from imblearn.under_sampling import TomekLinks, RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler, SMOTE, ADASYN

In [None]:
# Put models in a dictionary
models = {"KNN": KNeighborsClassifier(),
          "LogisticRegression": LogisticRegression(), 
          "RandomForest": RandomForestClassifier()}

resamplers = {
    "ros": RandomOverSampler(sampling_strategy='minority'),
    "smote": SMOTE(sampling_strategy='minority'),
    "adasyn": ADASYN(sampling_strategy='minority'),
    "rus": RandomUnderSampler(sampling_strategy="majority"),
    "tomek": TomekLinks(sampling_strategy="majority")
}

def fit_resample_and_score(models, samplers, x, y):
    """
    Score every (sampler, model) combination on a common held-out test set.

    The data is split once into train and test parts. For each sampler the
    training part only is resampled, each model is fitted on that resampled
    training data, and the model is scored on the untouched test part.

    models - dictionary mapping a model name to an estimator with
             fit(x, y) and score(x, y); estimators are fitted in place
    samplers - dictionary mapping a sampler name to an object with
               fit_resample(x, y)
    x - features
    y - labels

    Returns a dictionary whose keys are sampler name + model name
    (concatenated without a separator) and whose values are the test scores.
    """
    np.random.seed(42)

    # Split before resampling. The previous version resampled the full data
    # set and then split and trained on the raw x, y, so the resampled data
    # was never used. Resampling only the training part also keeps generated
    # or duplicated rows out of the test set, and a single split gives every
    # combination the same test set to be compared on.
    x_train, x_test, y_train, y_test = train_test_split(x,
                                                        y,
                                                        test_size=0.2,
                                                        stratify=y)

    model_scores = {}
    for sname, sampler in samplers.items():
        # Rebalance the training data only
        x_train_res, y_train_res = sampler.fit_resample(x_train, y_train)

        for mname, model in models.items():
            model.fit(x_train_res, y_train_res)
            # Score against the original, un-resampled class distribution
            model_scores[sname+mname] = model.score(x_test, y_test)

    return model_scores

In [None]:
model_scores = fit_resample_and_score(models=models,
                      samplers=resamplers,
                      x=X,
                      y=Y)

model_scores

We've written a function to resample and model the data 😎.

In [None]:
model_compare = pd.DataFrame(model_scores, index=['accuracy'])
model_compare.T.plot(kind="bar", figsize=(20,10));

In [None]:
# Look the best entry up instead of hard-coding its name, so the message
# always matches the scores that were actually computed.
best_model_name = max(model_scores, key=model_scores.get)
print(f"The model with highest accuracy is {best_model_name} with {model_scores[best_model_name]}")

Let's see if we can improve the performance of this model

# Hypertuning once again

In [None]:
X_resampled, Y_resampled = RandomOverSampler(sampling_strategy='minority').fit_resample(X,Y)

In [None]:
X_train, X_test, Y_train, Y_test = train_test_split(X_resampled, Y_resampled)

In [None]:
# Scores of each candidate n_neighbors on the train and test splits
train_scores = []
test_scores = []
# Named `neighbours` (not `neigbors`) because the plotting cell that follows
# reads `neighbours`; defining it here keeps that cell tied to this sweep
# instead of to a variable left over from the earlier KNN section.
neighbours = range(1,21)
knn = KNeighborsClassifier()

for i in neighbours:

    # Refit the same estimator with the next neighbour count
    knn.set_params(n_neighbors = i)

    knn.fit(X_train, Y_train)

    train_scores.append(knn.score(X_train, Y_train))

    test_scores.append(knn.score(X_test, Y_test))

In [None]:
plt.plot(neighbours, train_scores, label="Train score")
plt.plot(neighbours, test_scores, label="Test score")
plt.xticks(np.arange(1, 21, 1))
plt.xlabel("Number of neighbors")
plt.ylabel("Model score")
plt.legend()

print(f"Maximum KNN score on the test data: {max(test_scores)*100:.2f}%")

The model improved a bit after tuning, with `n_neighbors=1` giving the best test score.

In [None]:
# Using the best hyperparameters: take the n_neighbors value with the highest
# test score from the sweep above instead of hard-coding it. np.argmax
# returns the first maximum, so ties resolve to the smallest neighbour count.
best_n_neighbors = neighbours[int(np.argmax(test_scores))]
clf_knn = KNeighborsClassifier(n_neighbors=best_n_neighbors)

clf_knn.fit(X_train, Y_train)

clf_knn.score(X_test, Y_test)

Let's find below metrics as a final destination
1. ROC curve
2. classificationreport
3. confusionmatrix

In [None]:
plot_roc_curve(clf_knn, X_test, Y_test);

In [None]:
Y_preds = clf_knn.predict(X_test)
print(classification_report(Y_test, Y_preds))

In [None]:
plot_conf_matrix(Y_test, Y_preds)

In [None]:
id_final = pd.Series(X_test["id"])
stroke_final = pd.Series(Y_test)

In [None]:
df_final = pd.DataFrame({"id": id_final, "stroke": stroke_final})

In [None]:
df_final.dropna(inplace=True)
df_final.isna().sum()
df_final.to_csv("submission.csv")

This is the final model KNN with RandomOverSampling