## Data Cleaning for healthcare-data-stroke.csv
This notebook cleans the data in `healthcare-data-stroke.csv` by removing duplicates, `NaN` values, re-scaling values of the data by normalization, and convert non-numerical data to numerical data for the ease of model training in future.

### Import `pandas` library and read csv file. Also list out some basic informations about the dataset. Remove records that has `NaN` values.

In [85]:
import pandas as pd
df = pd.read_csv('/Users/wenghong/ML-Assignment/healthcare-dataset-stroke-data.csv')

num_nan_rows = df.isna().sum(axis=1).astype(bool).sum()
total_rows = df.shape[0]
print('Number of records in total: {}\nNumber of records that has NaN values: {}'.format(total_rows, num_nan_rows))

# remove NaN values
df = df.dropna()
print('Number of records after removing NaN values: {}'.format(df.shape[0]))

Number of records in total: 5110
Number of records that has NaN values: 201
Number of records after removing NaN values: 4909


### Give an overview of the dataset

In [86]:
df.head(20)

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
5,56669,Male,81.0,0,0,Yes,Private,Urban,186.21,29.0,formerly smoked,1
6,53882,Male,74.0,1,1,Yes,Private,Rural,70.09,27.4,never smoked,1
7,10434,Female,69.0,0,0,No,Private,Urban,94.39,22.8,never smoked,1
9,60491,Female,78.0,0,0,Yes,Private,Urban,58.57,24.2,Unknown,1
10,12109,Female,81.0,1,0,Yes,Private,Rural,80.43,29.7,never smoked,1
11,12095,Female,61.0,0,1,Yes,Govt_job,Rural,120.46,36.8,smokes,1


### List out all the columns

In [87]:
df.columns

Index(['id', 'gender', 'age', 'hypertension', 'heart_disease', 'ever_married',
       'work_type', 'Residence_type', 'avg_glucose_level', 'bmi',
       'smoking_status', 'stroke'],
      dtype='object')

### List out all the variety of values in each column

In [88]:
for col in df.columns:
    print("{}\n".format(col))
    print(df[col].unique())

id

[ 9046 31112 60182 ... 19723 37544 44679]
gender

['Male' 'Female' 'Other']
age

[6.70e+01 8.00e+01 4.90e+01 7.90e+01 8.10e+01 7.40e+01 6.90e+01 7.80e+01
 6.10e+01 5.40e+01 5.00e+01 6.40e+01 7.50e+01 6.00e+01 7.10e+01 5.20e+01
 8.20e+01 6.50e+01 5.70e+01 4.20e+01 4.80e+01 7.20e+01 5.80e+01 7.60e+01
 3.90e+01 7.70e+01 6.30e+01 7.30e+01 5.60e+01 4.50e+01 7.00e+01 5.90e+01
 6.60e+01 4.30e+01 6.80e+01 4.70e+01 5.30e+01 3.80e+01 5.50e+01 4.60e+01
 3.20e+01 5.10e+01 1.40e+01 3.00e+00 8.00e+00 3.70e+01 4.00e+01 3.50e+01
 2.00e+01 4.40e+01 2.50e+01 2.70e+01 2.30e+01 1.70e+01 1.30e+01 4.00e+00
 1.60e+01 2.20e+01 3.00e+01 2.90e+01 1.10e+01 2.10e+01 1.80e+01 3.30e+01
 2.40e+01 3.60e+01 6.40e-01 3.40e+01 4.10e+01 8.80e-01 5.00e+00 2.60e+01
 3.10e+01 7.00e+00 1.20e+01 6.20e+01 2.00e+00 9.00e+00 1.50e+01 2.80e+01
 1.00e+01 1.80e+00 3.20e-01 1.08e+00 1.90e+01 6.00e+00 1.16e+00 1.00e+00
 1.40e+00 1.72e+00 2.40e-01 1.64e+00 1.56e+00 7.20e-01 1.88e+00 1.24e+00
 8.00e-01 4.00e-01 8.00e-02 1.48e+00 5.

In [89]:
print('Before drop `Other` in gender: {}'.format(df.shape[0]))
other_rows = df[df['gender'] == 'Other']
num_other_rows = (df['gender'] == 'Other').sum()
# print(other_rows)
print(num_other_rows)

df.drop(df[df['gender'] == 'Other'].index, inplace=True)
print('After drop `Other` in gender: {}'.format(df.shape[0]))



Before drop `Other` in gender: 4909
1
After drop `Other` in gender: 4908


### Find the number of records that has 'Unknown' in `smoking_status` 

In [90]:
# drop records that has 'Unknown' in smoking_status
# smoking_unknown_rows = df[df['smoking_status'] == 'Unknown'].shape[0]
# print(smoking_unknown_rows)
# print('Number of records before dropping record that has `Unknown` values: {}'.format(df.shape[0]))
# df = df[~(df['smoking_status'] == 'Unknown')]
# print('Number of records after dropping record that has `Unknown` values: {}'.format(df.shape[0]))

### Convert non-numerical values to numerical

In [91]:

df['gender'] = df['gender'].map({'Male': 1, 'Female': 0})
df['ever_married'] = df['ever_married'].map({'Yes': 1, 'No': 0})
df['work_type'] = df['work_type'].map({'Never_worked': 0, 'Private': 1, 'Self-employed':2, 'Govt_job': 3, 'children': 4})
df['residence_type'] = df['Residence_type'].map({'Urban': 1, 'Rural': 0})
df['smoking_status'] = df['smoking_status'].map({'never smoked': 0, 'smokes': 1, 'formerly smoked': 2, 'Unknown': 3})
df = df.drop('Residence_type', axis=1)
df.head(20)

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,avg_glucose_level,bmi,smoking_status,stroke,residence_type
0,9046,1,67.0,0,1,1,1,228.69,36.6,2,1,1
2,31112,1,80.0,0,1,1,1,105.92,32.5,0,1,0
3,60182,0,49.0,0,0,1,1,171.23,34.4,1,1,1
4,1665,0,79.0,1,0,1,2,174.12,24.0,0,1,0
5,56669,1,81.0,0,0,1,1,186.21,29.0,2,1,1
6,53882,1,74.0,1,1,1,1,70.09,27.4,0,1,0
7,10434,0,69.0,0,0,0,1,94.39,22.8,0,1,1
9,60491,0,78.0,0,0,1,1,58.57,24.2,3,1,1
10,12109,0,81.0,1,0,1,1,80.43,29.7,0,1,0
11,12095,0,61.0,0,1,1,3,120.46,36.8,1,1,0


In [92]:
import numpy as np
from scipy import stats


z_scores = np.abs(stats.zscore(df))
threshold = 3
df1 = df[(z_scores < threshold).all(axis=1)]
df1

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,avg_glucose_level,bmi,smoking_status,stroke,residence_type
249,30669,1,3.0,0,0,0,4,95.12,18.0,3,0,0
251,16523,0,8.0,0,0,0,1,110.89,17.6,3,0,1
252,56543,0,70.0,0,0,1,1,69.04,35.9,2,0,0
253,46136,1,14.0,0,0,0,0,161.28,19.1,3,0,0
254,32257,0,47.0,0,0,1,1,210.95,50.1,3,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...
5104,14180,0,13.0,0,0,0,4,103.08,18.6,3,0,0
5106,44873,0,81.0,0,0,1,2,125.20,40.0,0,0,1
5107,19723,0,35.0,0,0,1,2,82.99,30.6,0,0,0
5108,37544,1,51.0,0,0,1,1,166.29,25.6,2,0,0


### Normalize data by converting the values to the scale of [0, 1]

In [93]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
id = df.pop('id') 
df_normalized = pd.DataFrame(scaler.fit_transform(df), columns=df.columns)
df_normalized = pd.concat([df_normalized, id.reset_index(drop=True)], axis=1)
df_normalized.head(20)



Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,avg_glucose_level,bmi,smoking_status,stroke,residence_type,id
0,1.0,0.816895,0.0,1.0,1.0,0.25,0.801265,0.30126,0.666667,1.0,1.0,9046
1,1.0,0.975586,0.0,1.0,1.0,0.25,0.234512,0.254296,0.0,1.0,0.0,31112
2,0.0,0.597168,0.0,0.0,1.0,0.25,0.536008,0.27606,0.333333,1.0,1.0,60182
3,0.0,0.963379,1.0,0.0,1.0,0.5,0.549349,0.15693,0.0,1.0,0.0,1665
4,1.0,0.987793,0.0,0.0,1.0,0.25,0.605161,0.214204,0.666667,1.0,1.0,56669
5,1.0,0.902344,1.0,1.0,1.0,0.25,0.069107,0.195876,0.0,1.0,0.0,53882
6,0.0,0.841309,0.0,0.0,0.0,0.25,0.181285,0.143184,0.0,1.0,1.0,10434
7,0.0,0.951172,0.0,0.0,1.0,0.25,0.015927,0.159221,1.0,1.0,1.0,60491
8,0.0,0.987793,1.0,0.0,1.0,0.25,0.116841,0.222222,0.0,1.0,0.0,12109
9,0.0,0.743652,0.0,1.0,1.0,0.75,0.301634,0.303551,0.333333,1.0,0.0,12095


In [94]:
# nan_rows = (df[df.isna().any(axis=1)])
# num_nan_rows = nan_rows.sum()
# print(nan_rows)
# print(num_nan_rows)

stroke_num_rows = (df['stroke'] == 1).sum()
non_stroke_rows = (df['stroke'] == 0).sum()
print(stroke_num_rows)
print(non_stroke_rows)

209
4699


### Write to new CSV File

In [95]:
# new_id = df_normalized.pop('id')
# print(new_id)
# df_normalized.insert(0, 'id', id)
# df_normalized.head(20)
print('\nNumber of records in dataset after cleaning and pre-processing: {}'.format(df.shape[0]))
df_normalized.to_csv('cleaned_data_stroke.csv', index=False)



Number of records in dataset after cleaning and pre-processing: 4908


In [98]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Load the data


# Preprocess the data
# ...

# Split the data
X_train, X_test, y_train, y_test = train_test_split(df_normalized.drop("stroke", axis=1), df_normalized["stroke"], test_size=0.2, random_state=42)

# Create a logistic regression model
lr_model = LogisticRegression()

# Fit the model
lr_model.fit(X_train, y_train)



print(X_train.head())
print("X_test: \n", X_test.head(20))
# print("y_train: \n", y_train.head(20))
# print("y_test: \n", y_test.head(20))
# Predict using the model
y_pred = lr_model.predict(X_test)

# Evaluate the performance
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 score:", f1)

max_depth=2, min_samples_split=2
Accuracy: 0.9460285132382892


max_depth=2, min_samples_split=4
Accuracy: 0.9460285132382892


max_depth=2, min_samples_split=6
Accuracy: 0.9460285132382892


max_depth=2, min_samples_split=8
Accuracy: 0.9460285132382892


max_depth=2, min_samples_split=10
Accuracy: 0.9460285132382892


max_depth=4, min_samples_split=2
Accuracy: 0.9419551934826884


max_depth=4, min_samples_split=4
Accuracy: 0.9419551934826884


max_depth=4, min_samples_split=6
Accuracy: 0.9419551934826884


max_depth=4, min_samples_split=8
Accuracy: 0.9419551934826884


max_depth=4, min_samples_split=10
Accuracy: 0.9419551934826884


max_depth=6, min_samples_split=2
Accuracy: 0.939918533604888


max_depth=6, min_samples_split=4
Accuracy: 0.9389002036659878


max_depth=6, min_samples_split=6
Accuracy: 0.939918533604888


max_depth=6, min_samples_split=8
Accuracy: 0.939918533604888


max_depth=6, min_samples_split=10
Accuracy: 0.939918533604888


max_depth=8, min_samples_split=2
Accuracy

In [96]:
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt

# Calculate the mean squared error and coefficient of determination
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error:", mse)
print("R-squared:", r2)

# Plot predicted vs. actual house prices
plt.scatter(y_test, y_pred)
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'k--', lw=4)
plt.xlabel('Actual')
plt.ylabel('Predicted')
plt.title('Predicted vs. Actual Getting Stroke')
plt.show()