# Import packages

In [21]:
import pandas as pd
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, roc_auc_score, confusion_matrix
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

In [22]:
# Load the training table from processed_train.csv. The path is relative, so it
# resolves against the kernel's current working directory.
df = pd.read_csv("./Dataset/processed_train.csv")

In [23]:
# Display the loaded training frame for a quick visual check.
df

Unnamed: 0,hypertension,heart_disease,ever_married,stroke,gender_Female,gender_Male,gender_Other,blood_A,blood_AB,blood_B,...,Residence_type_Unknown,Residence_type_Urban,work_type_Govt_job,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,avg_glucose_level,bmi,age
0,0,0,0,0,1,0,0,0,0,0,...,1,0,0,0,0,0,1,-0.454391,-1.632301,0.000364
1,0,0,1,0,0,1,0,0,0,1,...,0,1,0,0,1,0,0,-0.377299,0.528038,0.535819
2,0,0,0,0,1,0,0,0,0,1,...,1,0,0,0,1,0,0,-0.162540,-1.649074,-0.047722
3,0,0,1,0,1,0,0,0,0,0,...,1,0,0,0,1,0,0,-0.325215,0.450906,-0.227273
4,0,0,0,0,1,0,0,0,0,1,...,0,1,0,0,1,0,0,-0.185025,-0.357567,-0.855701
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3904,0,0,1,0,1,0,0,0,0,0,...,1,0,0,0,1,0,0,-0.909377,0.528038,0.715369
3905,0,0,1,0,1,0,0,0,0,1,...,1,0,0,0,1,0,0,-0.833202,0.528038,0.401155
3906,0,0,0,0,0,1,0,0,0,1,...,1,0,0,0,1,0,0,-0.140513,-1.062025,-1.214803
3907,0,0,1,0,0,1,0,0,0,1,...,0,0,0,0,1,0,0,-0.695536,-0.558841,0.176717


In [24]:
# Separate the label column (`stroke`) from the feature matrix (every other
# column). `drop` returns a new frame, so `df` itself is left untouched.
y = df['stroke']
x = df.drop(columns='stroke')

In [25]:
smote = SMOTE()
x_smote, y_smote = smote.fit_resample(x, y)

In [26]:
x_train, x_test, y_train, y_test = train_test_split(x_smote, y_smote, test_size= 0.2, random_state= 42)

In [27]:
# Sanity check: print the row count of each split, one per line, in the order
# x_train, y_train, x_test, y_test (features and labels must pair up).
for split in (x_train, y_train, x_test, y_test):
    print(len(split))

6052
6052
1514
1514


## Training Model

In [28]:
# Correlation of each column with the `stroke` label (pandas' default method is
# Pearson), sorted from most positive to most negative. Computed on the
# original frame `df`, not on any resampled data.
df.corr()['stroke'].sort_values(ascending=False)

stroke                            1.000000
blood_O                           0.206690
age                               0.192354
Residence_type_Urban              0.116430
avg_glucose_level                 0.102858
heart_disease                     0.095742
ever_married                      0.087278
Residence_type_Rural              0.070189
hypertension                      0.064028
smoking_status_formerly smoked    0.051671
bmi                               0.037239
work_type_Self-employed           0.024138
work_type_Private                 0.022925
work_type_Govt_job                0.017460
gender_Male                       0.012944
smoking_status_smokes             0.006403
gender_Other                      0.001244
smoking_status_never smoked      -0.003192
work_type_Never_worked           -0.012062
gender_Female                    -0.013280
blood_A                          -0.037011
smoking_status_Unknown           -0.043779
blood_B                          -0.047782
work_type_c

## Random Forest

In [29]:
# Fit a random forest on the training split and predict the held-out split.
# random_state pins the forest's internal randomness (bootstrap samples and
# feature sub-sampling) so the metrics computed from these predictions are
# reproducible from run to run; 42 matches the seed used for train_test_split.
randomFor = RandomForestClassifier(random_state=42)
randomFor.fit(x_train, y_train)
y_randomFor_pred = randomFor.predict(x_test)
y_randomFor_pred

array([1, 0, 0, ..., 0, 1, 0], dtype=int64)

In [30]:
# Confusion matrix for the held-out split: rows are the true labels, columns
# are the predicted labels.
cm = confusion_matrix(y_true=y_test, y_pred=y_randomFor_pred)
cm

array([[762,   9],
       [  8, 735]], dtype=int64)

In [31]:
# Derive the headline metrics by hand from the 2x2 confusion matrix.
# For binary labels the matrix is [[tn, fp], [fn, tp]], so flattening it
# row by row yields the four counts in exactly that order.
tn, fp, fn, tp = cm.ravel()

accuracy = (tp + tn) / (tp + fp + tn + fn)
precision = tp / (tp + fp)
recall = tp / (tp + fn)
f1score = 2 * precision * recall / (precision + recall)

# Output order: F1 score first, accuracy second.
print(f1score)
print(accuracy)

0.988567585743107
0.9887714663143989


## Test Submission

In [32]:
# Load the submission/test table from processed_test.csv (path relative to the
# kernel's working directory) and display it.
test_df = pd.read_csv("./Dataset/processed_test.csv")
test_df

Unnamed: 0.1,Unnamed: 0,id,hypertension,heart_disease,ever_married,gender_Female,gender_Male,gender_Other,blood_A,blood_AB,...,Residence_type_Unknown,Residence_type_Urban,work_type_Govt_job,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,avg_glucose_level,bmi,age
0,0,67,0,0,0,1,0,0,0,1,...,0,1,0,0,1,0,0,-0.288097,-0.071492,-1.162120
1,1,210,0,0,1,0,1,0,1,0,...,0,0,0,0,0,1,0,-0.318603,0.286883,1.711148
2,2,242,0,0,0,0,1,0,0,0,...,0,1,0,0,0,0,1,-0.168848,-1.462595,-1.745753
3,3,711,0,0,1,0,1,0,0,1,...,0,0,0,0,1,0,0,-0.288311,-0.871045,1.711148
4,4,724,0,0,0,0,1,0,0,0,...,0,0,0,0,1,0,0,-0.527022,1.960843,-1.162120
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,495,72393,0,0,1,1,0,0,1,0,...,0,0,1,0,0,0,0,-0.459611,-0.619321,-0.892751
496,496,72491,0,0,1,0,1,0,0,0,...,0,1,0,0,1,0,0,-0.678697,0.009987,0.454093
497,497,72562,0,0,1,1,0,0,0,1,...,0,0,0,0,1,0,0,-0.898210,0.463089,0.633672
498,498,72792,0,0,1,1,0,0,0,1,...,0,0,0,0,1,0,0,-0.608726,0.221938,0.454093


In [33]:
# Remove the 'Unnamed: 0' column (presumably a leftover index written by an
# earlier to_csv — verify against the preprocessing step).
# Rebinding instead of mutating in place, together with errors='ignore', keeps
# this cell idempotent: re-running it no longer raises KeyError once the
# column is already gone.
test_df = test_df.drop(columns='Unnamed: 0', errors='ignore')

In [34]:
# Keep the identifiers aside for the submission file and use every remaining
# column as model input. `drop` returns a new frame, so `test_df` is unchanged.
# NOTE(review): this rebinds `x`, which earlier held the training features.
id_test = test_df['id']
x = test_df.drop(columns='id')

In [35]:
# Predict a stroke label for every row of the submission features with the
# fitted forest.
y_randomFor_test = randomFor.predict(x)

In [36]:
# Display the predicted labels for the submission rows.
y_randomFor_test

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [37]:
# Pair each test id with its predicted label and show the resulting
# two-column submission table.
data_test_submit = dict(id=id_test, stroke=y_randomFor_test)
df_test_submit = pd.DataFrame(data_test_submit)
df_test_submit

Unnamed: 0,id,stroke
0,67,0
1,210,0
2,242,0
3,711,0
4,724,0
...,...,...
495,72393,0
496,72491,1
497,72562,0
498,72792,0


In [38]:
# Export of the submission table is currently disabled; uncomment the line
# below to write it to CSV without the index column.
#df_test_submit.to_csv('./Dataset/Submission_randomForest.csv', index= False)

In [39]:
# Persist the fitted classifier through the project-local helper.
# NOTE(review): `load_model` is imported but not used in the cells shown here.
# The on-disk format, and whether './models' must already exist, are decided
# inside model_utils.save_model — verify there.
from model_utils import save_model, load_model
save_model(randomFor, './models/random_forest')