# Random Forest Classifier

### Talha Altaf 2024

#### Imports

In [58]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier


#### Dataset

In [59]:
health_data=pd.read_csv('/healthcare-dataset-stroke-data.csv')
health_data.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


### Preparing Dataset


In [60]:

df = pd.DataFrame(health_data)

#Converts categorical variable into dummy/indicator variables.
#Each variable is converted in as many 0/1 variables as there are different values.
df = pd.get_dummies(df, columns=['gender', 'work_type', 'Residence_type', 'smoking_status'], prefix='', prefix_sep='', dtype=int)

# handling Nan values
df = df.fillna(0)

# df['column_name'] = df['column_name'].astype(int) <- only used when you have True/False values
# converting "Yes" values to 1 and vice versa
df['ever_married'] = (df['ever_married'] == 'Yes').astype(int)

# Rearrange columns
# column_order = ['sunny', 'overcast', 'rainy', 'Temperature', 'Humidity', 'Wind', 'Play']
# df = df[column_order]

#dropping irrelevant featuers
df = df.drop('id', axis=1)

# Split features and target
X, y = df.drop('stroke', axis=1), df['stroke']
print(X)
print(y)


       age  hypertension  heart_disease  ever_married  avg_glucose_level  \
0     67.0             0              1             1             228.69   
1     61.0             0              0             1             202.21   
2     80.0             0              1             1             105.92   
3     49.0             0              0             1             171.23   
4     79.0             1              0             1             174.12   
...    ...           ...            ...           ...                ...   
5105  80.0             1              0             1              83.75   
5106  81.0             0              0             1             125.20   
5107  35.0             0              0             1              82.99   
5108  51.0             0              0             1             166.29   
5109  44.0             0              0             1              85.28   

       bmi  Female  Male  Other  Govt_job  Never_worked  Private  \
0     36.6       0 

In [61]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, shuffle=False)

# Train Random Forest
rf = RandomForestClassifier(n_estimators=100, max_features='sqrt', random_state=42)
rf.fit(X_train, y_train)

# Predict and evaluate
y_pred = rf.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")

Accuracy: 0.9954337899543378


##### Testing with new test sets

In [67]:
my_x_test = X_test.loc[[3577]]
my_x_test['age'] = 21
my_x_test['heart_disease'] = 0
my_x_test['hypertension'] = 0
my_x_test['ever_married'] = 0
my_x_test['avg_glucose_level'] = 85.0
my_x_test['bmi'] = 23.6
my_x_test['Female'] = 1
my_x_test['Male'] = 0

my_x_test['Other'] = 0
my_x_test['Govt_job'] = 0
my_x_test['Never_worked'] = 1
my_x_test['Private'] = 0
my_x_test['Self-employed'] = 0
my_x_test['children'] = 0

my_x_test['Urban'] = 1
my_x_test['Rural'] = 0
my_x_test['Unknown'] = 0
my_x_test['formerly smoked'] = 0
my_x_test['never smoked'] = 1
my_x_test['smokes'] = 0

my_y_test = y_test.loc[[3577]]
# my_y_test['stroke']
print(my_x_test)
print(my_y_test)

y_pred = rf.predict(my_x_test)
print(y_pred)
print(f"Accuracy: {accuracy_score(my_y_test, y_pred)}")

      age  hypertension  heart_disease  ever_married  avg_glucose_level   bmi  \
3577   21             0              0             0               85.0  23.6   

      Female  Male  Other  Govt_job  Never_worked  Private  Self-employed  \
3577       1     0      0         0             1        0              0   

      children  Rural  Urban  Unknown  formerly smoked  never smoked  smokes  
3577         0      0      1        0                0             1       0  
3577    0
Name: stroke, dtype: int64
[0]
Accuracy: 1.0
