In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [2]:
# Location of the raw CSV. NOTE: this is an absolute, machine-specific Windows
# path; it must be changed before the notebook can run on another machine.
file_path = r'C:\Users\vaish\Downloads\healthcare-heart-stroke-dataset1.csv'

# Read the whole file into a DataFrame with pandas' default parsing options.
df = pd.read_csv(filepath_or_buffer=file_path)

In [4]:
# Preview the first five rows of the freshly loaded data.
df.head(n=5)

Unnamed: 0,gender,age,hypertension,diseases,glucose,bmi,stroke
0,Male,58.0,1,0,170.34,50.0,1
1,Female,75.0,1,0,170.34,50.0,1
2,Male,78.0,1,0,170.34,50.0,1
3,Female,64.0,1,0,170.34,39.2,1
4,Male,58.0,1,0,170.34,50.0,1


In [5]:
# Print a summary of the frame: column names, non-null counts, dtypes and
# memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   gender        5110 non-null   object 
 1   age           5110 non-null   float64
 2   hypertension  5110 non-null   int64  
 3   diseases      5110 non-null   int64  
 4   glucose       5110 non-null   float64
 5   bmi           5110 non-null   float64
 6   stroke        5110 non-null   int64  
dtypes: float64(3), int64(3), object(1)
memory usage: 279.6+ KB


In [6]:
# Count the missing values in each column.
df.isna().sum()

gender          0
age             0
hypertension    0
diseases        0
glucose         0
bmi             0
stroke          0
dtype: int64

In [7]:
# Replace missing BMI values with the column mean. The null check earlier in
# this notebook reports zero missing values, so for this file the call leaves
# the column unchanged; it is kept as a safeguard.
df['bmi'] = df['bmi'].fillna(df['bmi'].mean())

# Encode gender as integer codes. The guard makes the cell safe to re-run:
# fitting again on the already-encoded column would replace le.classes_ with
# integers, and the encoder saved at the end of the notebook could no longer
# translate the original string labels.
if not pd.api.types.is_numeric_dtype(df['gender']):
    le = LabelEncoder()
    df['gender'] = le.fit_transform(df['gender'])

In [8]:

# Preview the first five rows again to confirm gender is now numeric.
df.head(n=5)

Unnamed: 0,gender,age,hypertension,diseases,glucose,bmi,stroke
0,1,58.0,1,0,170.34,50.0,1
1,0,75.0,1,0,170.34,50.0,1
2,1,78.0,1,0,170.34,50.0,1
3,0,64.0,1,0,170.34,39.2,1
4,1,58.0,1,0,170.34,50.0,1


In [9]:
# 'stroke' is the prediction target; every remaining column is a feature.
y = df['stroke']
X = df.drop(columns='stroke')

In [10]:
# Hold out 20% of the rows for evaluation. The seed is fixed so the split is
# reproducible. No stratify argument is passed, so class proportions are not
# forced to match between the two parts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [11]:
# Report the (rows, columns) shape of each feature split.
for caption, split in (("X_train shape:", X_train), ("X_test shape:", X_test)):
    print(caption, split.shape)

X_train shape: (4088, 6)
X_test shape: (1022, 6)


In [12]:
# Learn the scaling statistics from the training features only, then apply
# that same fitted transform to both splits so the test data is scaled with
# training-set statistics.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [13]:
# Fit a logistic regression on the scaled training data, using the
# class_weight='balanced' option, then predict labels for the scaled test set.
model = LogisticRegression(class_weight='balanced').fit(X_train_scaled, y_train)
y_pred = model.predict(X_test_scaled)

In [14]:
# Show the raw prediction array, then the same predictions as a one-column
# DataFrame (rendered as the cell's output).
print(y_pred)
pd.DataFrame(data=y_pred)

[0 0 0 ... 0 0 1]


Unnamed: 0,0
0,0
1,0
2,0
3,0
4,0
...,...
1017,0
1018,0
1019,0
1020,0


In [15]:
# Score the test-set predictions against the held-out labels.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
conf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)
class_report = classification_report(y_true=y_test, y_pred=y_pred)

# Display all three results together as the cell's output.
(accuracy, conf_matrix, class_report)

(0.9931506849315068,
 array([[306,   0],
        [  7, 709]]),
 '              precision    recall  f1-score   support\n\n           0       0.98      1.00      0.99       306\n           1       1.00      0.99      1.00       716\n\n    accuracy                           0.99      1022\n   macro avg       0.99      1.00      0.99      1022\nweighted avg       0.99      0.99      0.99      1022\n')

In [16]:
import os

import joblib

# joblib.dump does not create missing parent directories, so make sure the
# output folder exists before writing; exist_ok keeps re-runs from failing.
os.makedirs('./Models', exist_ok=True)

# Persist the fitted scaler, the trained model and the gender encoder so they
# can be reloaded together elsewhere.
joblib.dump(scaler, './Models/scaler.joblib')
joblib.dump(model, './Models/logistic_regression_model.joblib')
joblib.dump(le, './Models/label_encoder.joblib')

['./Models/label_encoder.joblib']