In [1]:
import pickle
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

In [2]:
data = pd.read_csv("stroke_data.csv")
data.head(3)

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke
0,3.0,0,0,95.12,18.0,0
1,58.0,1,0,87.96,39.2,0
2,8.0,0,0,110.89,17.6,0


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41938 entries, 0 to 41937
Data columns (total 6 columns):
age                  41938 non-null float64
hypertension         41938 non-null int64
heart_disease        41938 non-null int64
avg_glucose_level    41938 non-null float64
bmi                  41938 non-null float64
stroke               41938 non-null int64
dtypes: float64(3), int64(3)
memory usage: 1.9 MB


In [6]:
data.describe()

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke
count,41938.0,41938.0,41938.0,41938.0,41938.0,41938.0
mean,41.83929,0.08751,0.043111,103.632645,28.605038,0.015332
std,22.48366,0.282585,0.20311,42.229814,7.77002,0.122872
min,0.08,0.0,0.0,55.0,10.1,0.0
25%,24.0,0.0,0.0,77.37,23.2,0.0
50%,43.0,0.0,0.0,91.32,27.7,0.0
75%,59.0,0.0,0.0,111.48,32.9,0.0
max,82.0,1.0,1.0,291.05,97.6,1.0


In [7]:
data.stroke.value_counts()

0    41295
1      643
Name: stroke, dtype: int64

In [8]:
from sklearn.utils import resample

# Separate majority and minority classes
df_majority = data[data.stroke==0]
df_minority = data[data.stroke==1]

# Upsample minority class
df_minority_upsampled = resample(df_minority, 
                                 replace=True,     # sample with replacement
                                 n_samples=len(df_majority),    # to match majority class
                                 random_state=123) # reproducible results
 
# Combine majority class with upsampled minority class
df_upsampled = pd.concat([df_majority, df_minority_upsampled])

In [9]:
df_upsampled.stroke.value_counts()

1    41295
0    41295
Name: stroke, dtype: int64

In [10]:
df_upsampled.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 82590 entries, 0 to 39464
Data columns (total 6 columns):
age                  82590 non-null float64
hypertension         82590 non-null int64
heart_disease        82590 non-null int64
avg_glucose_level    82590 non-null float64
bmi                  82590 non-null float64
stroke               82590 non-null int64
dtypes: float64(3), int64(3)
memory usage: 4.4 MB


In [14]:
X = df_upsampled.drop(["stroke"], axis=1)
y = df_upsampled["stroke"]

In [15]:
from sklearn.model_selection import cross_val_score
print("randomForest_cv_score",cross_val_score(clf, X, y, cv=3) )

randomForest_cv_score [0.99644025 0.99665819 0.99720305]


In [16]:
# Split data to train and test set
# Fit and train the model
# Make Prediction on test data
# Print the roc_auc_score of the model on test data set

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=0)
np.random.seed(42)
clf = RandomForestClassifier(n_estimators=100)
clf.fit(X_train, y_train)
prediction = clf.predict(X_test)
print("Score", roc_auc_score(y_test, prediction))

Score 0.9966666666666666


### Save model to pickle format

In [17]:
#Serialize clf object into a file called clf.pkg on disk using pickle

with open('clf.pkl', 'wb') as handle:
    pickle.dump(clf, handle, pickle.HIGHEST_PROTOCOL)
    

# pickle.HIGHEST_PROTOCOL using the highest available protocol 
# (we used wb to open file as binary and use a higher pickling protocol)

In [18]:
# de-serialize clf.pickle file into an object called clf_model using pickle

with open('clf.pkl', 'rb') as handle:
    clf_model = pickle.load(handle)

In [19]:
# no we can call various methods over clf_model as as:
# Let X_test be the features for which we want to predict the output 
result = clf_model.predict(X_test)
print("Score", roc_auc_score(y_test, result))

Score 0.9966666666666666
