In [1]:
# Imports
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import balanced_accuracy_score, confusion_matrix, classification_report, accuracy_score
from sklearn.ensemble import RandomForestClassifier

In [2]:
# Load csv into a DataFrame
prediction_df = pd.read_csv('full_data.csv')
prediction_df

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
2,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
3,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
4,Male,81.0,0,0,Yes,Private,Urban,186.21,29.0,formerly smoked,1
...,...,...,...,...,...,...,...,...,...,...,...
4976,Male,41.0,0,0,No,Private,Rural,70.15,29.8,formerly smoked,0
4977,Male,40.0,0,0,Yes,Private,Urban,191.15,31.1,smokes,0
4978,Female,45.0,1,0,Yes,Govt_job,Rural,95.02,31.8,smokes,0
4979,Male,40.0,0,0,Yes,Private,Rural,83.94,30.0,smokes,0


In [3]:
# Make a copy of the dataframe and drop unnecessary columns and target column
features_df = prediction_df.copy().drop(columns=['ever_married', 'work_type',
                                                 'Residence_type', 'stroke'])

# Convert categorical data to numerical
dummies = pd.get_dummies(features_df[['gender', 'smoking_status']])

# Combine dummies and features_df and drop the original columns that have been split
features_df = pd.concat([features_df, dummies], axis=1)
features_df = features_df.drop(columns=['gender', 'smoking_status'])
features_df.head()

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi,gender_Female,gender_Male,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
0,67.0,0,1,228.69,36.6,0,1,0,1,0,0
1,80.0,0,1,105.92,32.5,0,1,0,0,1,0
2,49.0,0,0,171.23,34.4,1,0,0,0,0,1
3,79.0,1,0,174.12,24.0,1,0,0,0,1,0
4,81.0,0,0,186.21,29.0,0,1,0,1,0,0


In [4]:
# Target Column
y = prediction_df['stroke']

# Feature Columns
X = features_df

In [5]:
# Split the data using train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [6]:
# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [7]:
# Create a Logistic Regression model
model = LogisticRegression(solver='lbfgs', max_iter=200)

# Fit the model using training data
model.fit(X_train_scaled, y_train)

In [8]:
# Make predictions using the testing data
predictions = model.predict(X_test_scaled)

In [9]:
# Print the balanced_accuracy score of the model
balanced_accuracy_score(y_test, predictions)

0.5081967213114754

In [10]:
# Print the classification report for the model
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.95      1.00      0.98      1185
           1       1.00      0.02      0.03        61

    accuracy                           0.95      1246
   macro avg       0.98      0.51      0.50      1246
weighted avg       0.95      0.95      0.93      1246



In [11]:
# Create the random forest classifier model
rfModel = RandomForestClassifier(n_estimators=500, random_state=42)

In [12]:
# Fit the model on the scaled training data
rfModel = rfModel.fit(X_train_scaled, y_train)

In [13]:
# Make a list of predictions using the scaled testing data
rfPredictions = rfModel.predict(X_test_scaled)
rfPredictions[:3]

array([0, 0, 0])

In [14]:
# Print the accuracy score
rfAccuracyScore = accuracy_score(y_test, rfPredictions)
print(f"Random Forest Accuracy Score: {rfAccuracyScore * 100:.2f}%")

Random Forest Accuracy Score: 94.70%
